feat: add URL fetching and content extraction functionality
- app/fetch/fetcher.py +135 -1
- app/main.py +8 -0
- requirements.txt +2 -0
app/fetch/fetcher.py
CHANGED
@@ -1 +1,135 @@
-
+# app/fetch/fetcher.py
+from __future__ import annotations
+import re
+from typing import List, Optional
+from urllib.parse import urlparse
+
+import httpx
+from bs4 import BeautifulSoup
+from readability import Document
+import trafilatura
+
+USER_AGENT = (
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+    "AppleWebKit/537.36 (KHTML, like Gecko) "
+    "Chrome/120.0.0.0 Safari/537.36"
+)
+
+HEADERS = {
+    "User-Agent": USER_AGENT,
+    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+    "Accept-Language": "en;q=0.9",
+}
+
+TIMEOUT = httpx.Timeout(10.0, connect=5.0)
+BLOCKED_SCHEMES = {"javascript", "data"}
+BLOCKED_EXTS = {".pdf", ".jpg", ".jpeg", ".png", ".gif", ".webp", ".svg"}
+
+def _looks_blocked(url: str) -> bool:
+    try:
+        p = urlparse(url)
+        if p.scheme in BLOCKED_SCHEMES:
+            return True
+        for ext in BLOCKED_EXTS:
+            if p.path.lower().endswith(ext):
+                return True
+    except Exception:
+        return True
+    return False
+
+async def fetch_html(url: str) -> Optional[str]:
+    if _looks_blocked(url):
+        return None
+    async with httpx.AsyncClient(headers=HEADERS, timeout=TIMEOUT, follow_redirects=True) as client:
+        resp = await client.get(url)
+        ct = resp.headers.get("Content-Type", "")
+        if "text/html" not in ct and "application/xhtml+xml" not in ct:
+            return None
+        resp.raise_for_status()
+        return resp.text
+
+def _clean_text(txt: str) -> str:
+    txt = re.sub(r"\r\n|\r", "\n", txt)
+    txt = re.sub(r"[ \t]+", " ", txt)
+    txt = re.sub(r"\n{3,}", "\n\n", txt)
+    return txt.strip()
+
+def _bs4_text(html: str) -> str:
+    soup = BeautifulSoup(html, "lxml")
+    # Drop nav, footer, script, style
+    for tag in soup(["script", "style", "noscript", "header", "footer", "nav", "aside"]):
+        tag.decompose()
+    return _clean_text(soup.get_text("\n"))
+
+def extract_main_text(html: str, base_url: str | None = None) -> Optional[str]:
+    # Try trafilatura first
+    try:
+        txt = trafilatura.extract(html, url=base_url, include_comments=False, include_tables=False)
+        if txt and len(txt) >= 400:
+            return _clean_text(txt)
+    except Exception:
+        pass
+    # Fallback to readability-lxml
+    try:
+        doc = Document(html)
+        summary_html = doc.summary() or ""
+        txt = _bs4_text(summary_html)
+        if txt and len(txt) >= 300:
+            return _clean_text(txt)
+    except Exception:
+        pass
+    # Last resort: whole page text
+    try:
+        txt = _bs4_text(html)
+        if txt and len(txt) >= 200:
+            return _clean_text(txt)
+    except Exception:
+        pass
+    return None
+
+def _split_paragraphs(text: str) -> List[str]:
+    # Split on blank lines first
+    parts = [p.strip() for p in re.split(r"\n\s*\n", text) if p.strip()]
+    out: list[str] = []
+    for p in parts:
+        # Further split very long paragraphs by sentence groups
+        if len(p) > 1200:
+            chunks = re.split(r"(?<=[.!?])\s+(?=[A-Z0-9])", p)
+            buf = []
+            cur = ""
+            for s in chunks:
+                cur = (cur + " " + s).strip()
+                if len(cur) >= 400:
+                    buf.append(cur)
+                    cur = ""
+            if cur:
+                buf.append(cur)
+            out.extend(buf)
+        else:
+            out.append(p)
+    # Filter short or junky lines
+    out = [p for p in out if len(p) >= 160]
+    # Deduplicate
+    seen: set[str] = set()
+    deduped: list[str] = []
+    for p in out:
+        key = re.sub(r"\W+", " ", p).strip().casefold()
+        if key not in seen:
+            seen.add(key)
+            deduped.append(p)
+    return deduped[:12]  # cap
+
+async def get_paragraphs_for_url(url: str) -> List[str]:
+    html = await fetch_html(url)
+    if not html:
+        return []
+    text = extract_main_text(html, base_url=url)
+    if not text:
+        return []
+    return _split_paragraphs(text)
+
+async def get_paragraphs_with_fallback(url: str, snippet: str | None) -> List[str]:
+    paras = await get_paragraphs_for_url(url)
+    if paras:
+        return paras
+    return [snippet] if snippet else []
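The new module fetches a page with httpx, tries trafilatura first, falls back to readability-lxml, then to plain BeautifulSoup text, and finally splits, filters, and deduplicates paragraphs. A minimal sketch of exercising it from a standalone script; the URL and snippet below are placeholders, not part of this change:

# Minimal sketch: drive the fetcher outside the API (URL and snippet are placeholders).
import asyncio

from app.fetch.fetcher import get_paragraphs_for_url, get_paragraphs_with_fallback


async def main() -> None:
    url = "https://example.com/some-article"  # hypothetical target URL
    paras = await get_paragraphs_for_url(url)
    print(f"extracted {len(paras)} paragraphs")
    for p in paras[:3]:
        print("-", p[:120])

    # When extraction yields nothing, fall back to a search snippet.
    paras = await get_paragraphs_with_fallback(url, snippet="Fallback snippet text.")
    print(f"with fallback: {len(paras)} paragraphs")


if __name__ == "__main__":
    asyncio.run(main())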
app/main.py
CHANGED
@@ -28,6 +28,14 @@ async def _search(q: str = Query(..., min_length=3, max_length=200)):
     return {"count": len(results), "items": [r.model_dump() for r in results]}
 
 
+@app.get("/_fetch")
+async def _fetch(u: str = Query(..., min_length=10, max_length=2000)):
+    """Debug fetch endpoint for testing URL content extraction."""
+    from app.fetch.fetcher import get_paragraphs_for_url
+    paras = await get_paragraphs_for_url(u)
+    return {"count": len(paras), "samples": paras[:3]}
+
+
 # Root endpoint
 @app.get("/")
 async def root():
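Once the app is running, the debug route takes the target URL in the u query parameter and returns the paragraph count plus the first three samples. A minimal sketch of calling it with httpx; the base URL assumes a default local uvicorn address, and the target URL is a placeholder:

# Minimal sketch: call the /_fetch debug route (base URL and target are placeholders).
import httpx

resp = httpx.get(
    "http://127.0.0.1:8000/_fetch",
    params={"u": "https://example.com/some-article"},
    timeout=30.0,
)
resp.raise_for_status()
data = resp.json()
print(data["count"])
print(data["samples"][:1])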
requirements.txt
CHANGED
@@ -6,6 +6,8 @@ uvicorn[standard]==0.24.0
 httpx==0.25.2
 trafilatura==1.8.0
 readability-lxml==0.8.1
+beautifulsoup4==4.12.2
+lxml==4.9.3
 
 # ML and NLP
 transformers==4.35.2
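With the two new pins, reinstalling from the updated file (for example pip install -r requirements.txt in the app's environment) should be enough to pull in beautifulsoup4 and lxml for the BeautifulSoup/lxml parsing path in app/fetch/fetcher.py.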