diff --git a/tools/fetch.py b/tools/fetch.py
index 9c5c588..4aa8358 100644
--- a/tools/fetch.py
+++ b/tools/fetch.py
@@ -9,9 +9,70 @@ from bs4 import BeautifulSoup
from urllib.parse import urlparse
def reader_mode(html_content: str) -> str:
+ """Return a reduced, reader-style HTML string for a page.
+
+ The result is an optional title fragment (built from the escaped
+ ``doc.short_title()``) followed by the text-richest <main>,
+ [role="main"], or <article> element. When none of those yields a
+ candidate, the Readability ``doc.summary()`` output is used instead.
+ """
+ # NOTE(review): this prints the entire input document on every call;
+ # looks like leftover debugging output -- confirm before merging.
+ print("Getting reader mode for HTML", html_content)
+
doc = Document(html_content)
+ # short_title() may be falsy; normalise to "" so the title fragment is
+ # simply omitted in that case.
+ short_title = doc.short_title() or ""
+ # NOTE(review): the markup wrapped around the escaped title appears to
+ # have been lost in this copy of the patch -- confirm against the file.
+ title_html = f"
{html.escape(short_title)}
\n" if short_title else ""
+
+ soup = BeautifulSoup(html_content, "html.parser")
+
+ def pick_largest(elements):
+ """Return the element with the most stripped text, or None.
+
+ None is returned when ``elements`` is empty or no element has any
+ text; on equal lengths the earliest element is kept.
+ """
+ largest = None
+ max_len = 0
+ for el in elements:
+ text_len = len(el.get_text(separator=" ", strip=True))
+ if text_len > max_len:
+ largest = el
+ max_len = text_len
+ return largest
+
+ candidate = None
+
+ # Prefer <main>
+ mains = soup.find_all("main")
+ if mains:
+ candidate = pick_largest(mains)
+
+ # Then [role="main"]
+ if not candidate:
+ role_mains = soup.select('[role="main"]')
+ if role_mains:
+ candidate = pick_largest(role_mains)
+
+ # Then <article> (prefer itemprop="articleBody" if present)
+ if not candidate:
+ articles = soup.find_all("article")
+ if articles:
+ bodies = []
+ for art in articles:
+ bodies.extend(art.select('[itemprop="articleBody"]'))
+ # Fall back to the whole <article> elements when no articleBody
+ # descendant carries any text.
+ candidate = pick_largest(bodies) or pick_largest(articles)
+
+ if candidate:
+ # Remove non-content elements inside the candidate
+ for tag in candidate.find_all(["script", "style", "noscript", "svg", "canvas", "form"]):
+ tag.decompose()
+ # Also drop header/footer/nav/aside sections nested inside the candidate
+ for tag in candidate.find_all(["header", "footer", "nav", "aside"]):
+ tag.decompose()
+
+ # Normalize lazy-loaded images
+ for img in candidate.find_all("img"):
+ src = img.get("src")
+ # A missing or data: src is replaced by the first non-empty
+ # lazy-load attribute, checked in the order listed below.
+ if not src or src.startswith("data:"):
+ for attr in ["data-src", "data-original", "data-lazy-src"]:
+ val = img.get(attr)
+ if val:
+ img["src"] = val
+ break
+ # Copy data-srcset into srcset only when srcset is not already set.
+ if not img.get("srcset") and img.get("data-srcset"):
+ img["srcset"] = img.get("data-srcset")
+
+ return title_html + str(candidate)
+
+ # Fallback to Readability result
body = BeautifulSoup(doc.summary(), "html.parser")
- return f"{doc.short_title()}
\n{body}"
+ return title_html + str(body)
def is_reddit_url(url: str) -> bool:
"""Check if URL is a Reddit link"""