# Twitter/X archiving helpers, recovered from a whitespace-collapsed patch to
# tools/fetch.py (index 15aa520..adb8b1e). In that patch they are inserted
# right after reader_mode().

# Hostnames treated as Twitter/X. Compared against the lower-cased hostname.
_TWITTER_HOSTS = frozenset({
    "x.com",
    "www.x.com",
    "twitter.com",
    "www.twitter.com",
    "mobile.twitter.com",
})


def is_twitter_url(url: str) -> bool:
    """Return True when *url* points at a Twitter/X host.

    The check uses ``hostname`` rather than ``netloc`` so that an explicit
    port, userinfo, or upper-case letters in the host do not cause a
    Twitter/X link to be missed.
    """
    # hostname is None for URLs without a network location; None is simply
    # not a member of the set.
    return urlparse(url).hostname in _TWITTER_HOSTS


def clean_twitter_url(url: str) -> str:
    """Return *url* reduced to ``scheme://netloc/path``.

    Everything after the path (params, query string such as tracking
    parameters, and fragment) is dropped. The netloc is kept verbatim.
    """
    parsed = urlparse(url)
    return f"{parsed.scheme}://{parsed.netloc}{parsed.path}"


def _render_tweet_html(tweet: dict) -> str:
    """Render the ``tweet`` object of an fxtwitter response as escaped HTML."""
    # NOTE(review): the original markup was stripped when this patch was
    # extracted. The fields rendered here (author, text, counters, timestamp,
    # video placeholder, quoted tweet) are the ones the surviving code reads;
    # the tags themselves are a reconstruction -- confirm against the real
    # patch before relying on the exact layout.
    def esc(value) -> str:
        # Every value comes from a remote API, so escape all of it.
        return html.escape(str(value if value is not None else ""))

    author = tweet.get("author") or {}
    parts = [
        '<article class="tweet">\n',
        f"<p><strong>{esc(author.get('name'))}</strong> "
        f"(@{esc(author.get('screen_name'))})</p>\n",
        f"<p>{esc(tweet.get('text'))}</p>\n",
    ]

    # NOTE(review): assumes videos are listed under tweet["media"]["videos"]
    # as in the FxTwitter status schema -- TODO confirm.
    media = tweet.get("media") or {}
    if media.get("videos"):
        parts.append("<p>(Video - see original tweet)</p>\n")

    # NOTE(review): assumes a quoted tweet is nested under tweet["quote"]
    # with the same author/text shape -- TODO confirm.
    quote = tweet.get("quote") or {}
    if quote:
        q_author = quote.get("author") or {}
        parts.append(
            "<blockquote>\n"
            f"<p>{esc(q_author.get('name'))} "
            f"(@{esc(q_author.get('screen_name'))})</p>\n"
            f"<p>{esc(quote.get('text'))}</p>\n"
            "</blockquote>\n"
        )

    # Counters may be missing or null in the payload; fall back to 0.
    parts.append(
        '<p class="tweet-meta">'
        f"{esc(tweet.get('created_at'))} | "
        f"{esc(tweet.get('replies') or 0)} replies | "
        f"{esc(tweet.get('retweets') or 0)} retweets | "
        f"{esc(tweet.get('likes') or 0)} likes | "
        f"{esc(tweet.get('views') or 0)} views"
        "</p>\n"
        "</article>\n"
    )
    return "".join(parts)


def archive_twitter(url: str) -> str:
    """Archive a Twitter/X post through the fxtwitter API proxy.

    The path of *url* (``/user/status/123``) is appended to
    ``https://api.fxtwitter.com`` and the JSON response is rendered as HTML.

    Returns:
        An HTML fragment for the tweet, or a short escaped error fragment
        when the request or parsing fails. This function does not raise:
        archiving is best-effort and any failure is reported on stdout.
    """
    try:
        # Convert x.com/user/status/123 -> api.fxtwitter.com/user/status/123
        api_url = f"https://api.fxtwitter.com{urlparse(url).path}"
        headers = {"User-Agent": "Mozilla/5.0 (ArchiveBot/1.0)"}

        response = requests.get(api_url, timeout=30, headers=headers)
        response.raise_for_status()
        data = response.json()

        # "tweet" can be absent or null on API errors; render an empty shell.
        return _render_tweet_html(data.get("tweet") or {})

    except Exception as e:  # deliberate best-effort boundary
        print(f"⚠ Twitter API proxy failed ({e}), returning minimal content...")
        # NOTE(review): wrapper tag reconstructed; original markup was lost.
        return f"<p>Failed to fetch tweet content: {html.escape(str(e))}</p>"
" + def is_reddit_url(url: str) -> bool: """Check if URL is a Reddit link""" parsed = urlparse(url) @@ -427,8 +501,13 @@ def convert_ihsoyct_to_api_url(url: str) -> str: def archive(url: str, out_dir: pathlib.Path, force: bool): out_dir.mkdir(parents=True, exist_ok=True) + + # Clean Twitter URLs before slugging so filenames match the JS slug + if is_twitter_url(url): + url = clean_twitter_url(url) + fname = out_dir / slug.slug(url) - + # Check if this is a Reddit search tool and convert to API URL original_url = url if is_reddit_search_tool(url): @@ -450,7 +529,42 @@ def archive(url: str, out_dir: pathlib.Path, force: bool): try: archive_date = datetime.datetime.now(datetime.timezone.utc) - if is_arctic_shift_api(url): + if is_twitter_url(original_url): + content = archive_twitter(original_url) + archive_style = """ + + """ + final_content = ( + "\n" + + "