diff --git a/tools/fetch.py b/tools/fetch.py
index 87c35ea..583f543 100644
--- a/tools/fetch.py
+++ b/tools/fetch.py
@@ -135,28 +135,266 @@ def generate_archive_header(url: str, archive_date: datetime.datetime) -> str:
     '''
+def is_reddit_search_tool(url: str) -> bool:
+    """Check if URL is from the Reddit search tool (ihsoyct.github.io)"""
+    parsed = urlparse(url)
+    return 'ihsoyct.github.io' in parsed.netloc
+
+def archive_reddit_search_tool(url: str) -> str:
+    """Archive Reddit search tool results as minimal Markdown"""
+    import re  # local import; used to parse the user/score line below
+    try:
+        headers = {"User-Agent": "Mozilla/5.0 (ArchiveBot/1.0)"}
+        response = requests.get(url, timeout=30, headers=headers)
+        response.raise_for_status()
+
+        soup = BeautifulSoup(response.text, 'html.parser')
+
+        # Extract search parameters for a simple title
+        parsed_url = urlparse(url)
+        params = dict(param.split('=', 1) for param in parsed_url.query.split('&') if '=' in param)
+
+        # Simple title
+        if params.get('author'):
+            title = f"Comments by u/{params['author']}"
+        elif params.get('subreddit'):
+            title = f"r/{params['subreddit']} comments"
+        else:
+            title = "Reddit Comments"
+
+        if params.get('body'):
+            title += f" containing '{params['body']}'"
+
+        # Start with minimal content
+        md_content = f"# {title}\n\n"
+
+        # Find the submission div and all posts within it
+        submission_div = soup.find('div', id='submission')
+        if not submission_div:
+            md_content += "No submissions found.\n"
+            return md_content
+
+        posts = submission_div.find_all('div', class_='post')
+
+        if not posts:
+            md_content += "No posts found.\n"
+            return md_content
+
+        for post in posts:
+            # Extract comment path and create Reddit URL
+            title_elem = post.find('p', class_='comment_title')
+            if title_elem:
+                comment_path = title_elem.get_text().strip()
+                reddit_url = f"https://reddit.com{comment_path}"
+                md_content += f"**{comment_path}**\n"
+                md_content += f"{reddit_url}\n\n"
+
+            # Extract user and score info (simplified)
+            user_elem = post.find('p', class_='comment_user')
+            if user_elem:
+                user_text = user_elem.get_text().strip()
+                # Extract just the username, score and date
+                score_match = re.search(r'Score: (\d+)', user_text)
+                user_match = re.search(r'(u/\w+)', user_text)
+                date_match = re.search(r'at (.+)$', user_text)
+
+                if user_match and score_match:
+                    user_info = f"{user_match.group(1)} • {score_match.group(1)} points"
+                    if date_match:
+                        user_info += f" • {date_match.group(1)}"
+                    md_content += f"{user_info}\n\n"
+
+            # Extract the actual comment content: every <p> tag that is not the
+            # title or user-info paragraph and is not empty
+            content_paragraphs = []
+
+            # Get all p elements in the post
+            all_p_tags = post.find_all('p')
+
+            for p in all_p_tags:
+                # Skip the title and user info paragraphs
+                if p.get('class') and ('comment_title' in p.get('class') or 'comment_user' in p.get('class')):
+                    continue
+
+                # Get the text content
+                text = p.get_text().strip()
+
+                # Only add non-empty paragraphs
+                if text:
+                    content_paragraphs.append(text)
+
+            # Add the comment content
+            if content_paragraphs:
+                for para in content_paragraphs:
+                    md_content += f"{para}\n\n"
+
+            md_content += "---\n\n"
+
+        return md_content
+
+    except Exception as e:
+        print(f"⚠ Reddit search tool archiving failed ({e})")
+        # Fall back to regular reader mode
+        response = requests.get(url, timeout=30, headers={"User-Agent": "Mozilla/5.0 (ArchiveBot/1.0)"})
+        return reader_mode(response.text)
+
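+# NOTE (illustrative only, values made up): for each matched post, the Markdown
+# emitted by archive_reddit_search_tool() above has roughly this shape:
+#
+#   **/r/example/comments/abc123/example_post/def456/**
+#   https://reddit.com/r/example/comments/abc123/example_post/def456/
+#
+#   u/example_user • 42 points • 2024-01-01 12:00
+#
+#   Example comment text.
+#
+#   ---
+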
+def generate_markdown_archive_header(url: str, archive_date: datetime.datetime) -> str:
+    """Generate a minimal archive header in Markdown format"""
+    formatted_date = archive_date.strftime('%Y-%m-%d %H:%M UTC')
+    return f"*Archived {formatted_date} from {url}*\n\n"
+
+def is_arctic_shift_api(url: str) -> bool:
+    """Check if URL is from the Arctic Shift API"""
+    parsed = urlparse(url)
+    return 'arctic-shift.photon-reddit.com' in parsed.netloc and '/api/' in parsed.path
+
+def archive_arctic_shift_api(url: str) -> str:
+    """Archive Arctic Shift API results as minimal Markdown"""
+    try:
+        headers = {"User-Agent": "Mozilla/5.0 (ArchiveBot/1.0)"}
+        response = requests.get(url, timeout=30, headers=headers)
+        response.raise_for_status()
+
+        data = response.json()
+        comments = data.get('data', [])
+
+        if not comments:
+            return "# Reddit Comments\n\nNo comments found.\n"
+
+        # Extract search info from the URL for the title
+        parsed_url = urlparse(url)
+        query_params = {}
+        if parsed_url.query:
+            for param in parsed_url.query.split('&'):
+                if '=' in param:
+                    key, value = param.split('=', 1)
+                    query_params[key] = value
+
+        # Build title
+        title_parts = []
+        if query_params.get('author'):
+            title_parts.append(f"u/{query_params['author']}")
+        if query_params.get('subreddit'):
+            title_parts.append(f"r/{query_params['subreddit']}")
+        if query_params.get('body'):
+            title_parts.append(f"containing '{query_params['body']}'")
+
+        title = ("Comments by " + " • ".join(title_parts)) if title_parts else "Reddit Comments"
+
+        md_content = f"# {title}\n\n"
+
+        for comment in comments:
+            # Extract comment info
+            permalink = comment.get('permalink', '')
+            reddit_url = f"https://reddit.com{permalink}"
+            author = comment.get('author', 'unknown')
+            score = comment.get('score', 0)
+            subreddit = comment.get('subreddit', '')
+            body = comment.get('body', '')
+
+            # Convert timestamp to a readable date
+            created_utc = comment.get('created_utc')
+            date_str = ''
+            if created_utc:
+                date_obj = datetime.datetime.fromtimestamp(created_utc, tz=datetime.timezone.utc)
+                date_str = date_obj.strftime('%Y-%m-%d %H:%M UTC')
+
+            # Format the comment
+            md_content += f"**{permalink}**\n"
+            md_content += f"{reddit_url}\n\n"
+
+            # User info line
+            user_info = f"u/{author} • {score} points"
+            if date_str:
+                user_info += f" • {date_str}"
+            if subreddit:
+                user_info += f" • r/{subreddit}"
+            md_content += f"{user_info}\n\n"
+
+            # Comment body (convert any literal '\n' sequences, trim whitespace)
+            if body:
+                clean_body = body.replace('\\n', '\n').strip()
+                md_content += f"{clean_body}\n\n"
+
+            md_content += "---\n\n"
+
+        return md_content
+
+    except Exception as e:
+        print(f"⚠ Arctic Shift API archiving failed ({e})")
+        return f"# Error\n\nFailed to archive API response: {e}\n"
+
+def convert_ihsoyct_to_api_url(url: str) -> str:
+    """Convert an ihsoyct.github.io URL to an Arctic Shift API URL"""
+    try:
+        parsed = urlparse(url)
+
+        # Extract query parameters
+        params = {}
+        if parsed.query:
+            for param in parsed.query.split('&'):
+                if '=' in param:
+                    key, value = param.split('=', 1)
+                    params[key] = value
+
+        # Build API URL
+        api_base = "https://arctic-shift.photon-reddit.com/api"
+
+        # Determine endpoint based on mode
+        mode = params.get('mode', 'comments')
+        if mode == 'submissions':
+            endpoint = f"{api_base}/submissions/search"
+        else:
+            endpoint = f"{api_base}/comments/search"
+
+        # Build the query string for the API
+        api_params = []
+        for key, value in params.items():
+            if key in ['author', 'subreddit', 'body', 'title', 'selftext', 'limit', 'sort', 'after', 'before']:
+                api_params.append(f"{key}={value}")
+
+        api_url = f"{endpoint}?{'&'.join(api_params)}"
+        return api_url
+
+    except Exception as e:
+        print(f"⚠ Failed to convert URL: {e}")
+        return url
+
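+# Illustrative sanity check for convert_ihsoyct_to_api_url() (hypothetical URL
+# and parameter values): mode only selects the endpoint and, like any parameter
+# outside the whitelist above, is not forwarded in the query string.
+#
+#   >>> convert_ihsoyct_to_api_url(
+#   ...     "https://ihsoyct.github.io/index.html?mode=comments&author=example_user&limit=100")
+#   'https://arctic-shift.photon-reddit.com/api/comments/search?author=example_user&limit=100'
+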
 def archive(url: str, out_dir: pathlib.Path, force: bool):
     out_dir.mkdir(parents=True, exist_ok=True)
     fname = out_dir / slug.slug(url)
+
+    # Check if this is a Reddit search tool and convert to API URL
+    original_url = url
+    if is_reddit_search_tool(url):
+        print("🔄 Converting to Arctic Shift API URL...")
+        url = convert_ihsoyct_to_api_url(url)
+        print(f" API URL: {url}")
+
+    # Check for API URL and change extension to .md
+    is_api_url = is_arctic_shift_api(url)
+    if is_api_url or is_reddit_search_tool(original_url):
+        fname = fname.with_suffix('.md')
+
     if fname.exists() and not force:
-        print(f"✓ cached: {url}")
+        print(f"✓ cached: {original_url}")
         return
-    print(f"↓ fetching: {url}")
+    print(f"↓ fetching: {original_url}")
     try:
         archive_date = datetime.datetime.now(datetime.timezone.utc)
-        if is_reddit_url(url):
+        if is_arctic_shift_api(url):
+            content = archive_arctic_shift_api(url)
+            # For markdown, just add header and content
+            final_content = generate_markdown_archive_header(original_url, archive_date) + content
+        elif is_reddit_url(url):
             content = archive_reddit(url)
-        else:
-            html_response = requests.get(url, timeout=30, headers={
-                "User-Agent": "Mozilla/5.0 (ArchiveBot/1.0)"
-            }).text
-            content = reader_mode(html_response)
-
-        # Enhanced styling with archive header
-        archive_style = """
+            # Enhanced styling with archive header for HTML
+            archive_style = """ """
+            final_content = (
+                "\n" +
+                "\n" +
+                archive_style + "\n" +
+                generate_archive_header(url, archive_date) + "\n" +
+                content
+            )
+        else:
+            html_response = requests.get(url, timeout=30, headers={
+                "User-Agent": "Mozilla/5.0 (ArchiveBot/1.0)"
+            }).text
+            content = reader_mode(html_response)
+            # Enhanced styling with archive header for HTML
+            archive_style = """
+
+            """
+            final_content = (
+                "\n" +
+                "\n" +
+                archive_style + "\n" +
+                generate_archive_header(url, archive_date) + "\n" +
+                content
+            )
-        fname.write_text(
-            "\n" +
-            "\n" +
-            archive_style + "\n" +
-            generate_archive_header(url, archive_date) + "\n" +
-            content,
-            encoding="utf-8"
-        )
+        fname.write_text(final_content, encoding="utf-8")
         print(f"✓ saved : {fname.relative_to(out_dir.parent)}")
     except Exception as e:
-        print(f"✗ failed : {url} - {e}")
+        print(f"✗ failed : {original_url} - {e}")
 
 if __name__ == "__main__":
     ap = argparse.ArgumentParser()