AliHashir committed on
Commit 2d6a8ea · 1 Parent(s): 3721c3d

feat: add URL fetching and content extraction functionality

Files changed (3)
  1. app/fetch/fetcher.py +135 -1
  2. app/main.py +8 -0
  3. requirements.txt +2 -0
app/fetch/fetcher.py CHANGED
@@ -1 +1,135 @@
-"""Web content fetching and text extraction."""
+# app/fetch/fetcher.py
+from __future__ import annotations
+import re
+from typing import List, Optional
+from urllib.parse import urlparse
+
+import httpx
+from bs4 import BeautifulSoup
+from readability import Document
+import trafilatura
+
+USER_AGENT = (
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+    "AppleWebKit/537.36 (KHTML, like Gecko) "
+    "Chrome/120.0.0.0 Safari/537.36"
+)
+
+HEADERS = {
+    "User-Agent": USER_AGENT,
+    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+    "Accept-Language": "en;q=0.9",
+}
+
+TIMEOUT = httpx.Timeout(10.0, connect=5.0)
+BLOCKED_SCHEMES = {"javascript", "data"}
+BLOCKED_EXTS = {".pdf", ".jpg", ".jpeg", ".png", ".gif", ".webp", ".svg"}
+
+def _looks_blocked(url: str) -> bool:
+    try:
+        p = urlparse(url)
+        if p.scheme in BLOCKED_SCHEMES:
+            return True
+        for ext in BLOCKED_EXTS:
+            if p.path.lower().endswith(ext):
+                return True
+    except Exception:
+        return True
+    return False
+
+async def fetch_html(url: str) -> Optional[str]:
+    if _looks_blocked(url):
+        return None
+    async with httpx.AsyncClient(headers=HEADERS, timeout=TIMEOUT, follow_redirects=True) as client:
+        resp = await client.get(url)
+        ct = resp.headers.get("Content-Type", "")
+        if "text/html" not in ct and "application/xhtml+xml" not in ct:
+            return None
+        resp.raise_for_status()
+        return resp.text
+
+def _clean_text(txt: str) -> str:
+    txt = re.sub(r"\r\n|\r", "\n", txt)
+    txt = re.sub(r"[ \t]+", " ", txt)
+    txt = re.sub(r"\n{3,}", "\n\n", txt)
+    return txt.strip()
+
+def _bs4_text(html: str) -> str:
+    soup = BeautifulSoup(html, "lxml")
+    # Drop nav, footer, script, style
+    for tag in soup(["script", "style", "noscript", "header", "footer", "nav", "aside"]):
+        tag.decompose()
+    return _clean_text(soup.get_text("\n"))
+
+def extract_main_text(html: str, base_url: str | None = None) -> Optional[str]:
+    # Try trafilatura first
+    try:
+        txt = trafilatura.extract(html, url=base_url, include_comments=False, include_tables=False)
+        if txt and len(txt) >= 400:
+            return _clean_text(txt)
+    except Exception:
+        pass
+    # Fallback to readability-lxml
+    try:
+        doc = Document(html)
+        summary_html = doc.summary() or ""
+        txt = _bs4_text(summary_html)
+        if txt and len(txt) >= 300:
+            return _clean_text(txt)
+    except Exception:
+        pass
+    # Last resort: whole page text
+    try:
+        txt = _bs4_text(html)
+        if txt and len(txt) >= 200:
+            return _clean_text(txt)
+    except Exception:
+        pass
+    return None
+
+def _split_paragraphs(text: str) -> List[str]:
+    # Split on blank lines first
+    parts = [p.strip() for p in re.split(r"\n\s*\n", text) if p.strip()]
+    out: list[str] = []
+    for p in parts:
+        # Further split very long paragraphs by sentence groups
+        if len(p) > 1200:
+            chunks = re.split(r"(?<=[.!?])\s+(?=[A-Z0-9])", p)
+            buf = []
+            cur = ""
+            for s in chunks:
+                cur = (cur + " " + s).strip()
+                if len(cur) >= 400:
+                    buf.append(cur)
+                    cur = ""
+            if cur:
+                buf.append(cur)
+            out.extend(buf)
+        else:
+            out.append(p)
+    # Filter short or junky lines
+    out = [p for p in out if len(p) >= 160]
+    # Deduplicate
+    seen: set[str] = set()
+    deduped: list[str] = []
+    for p in out:
+        key = re.sub(r"\W+", " ", p).strip().casefold()
+        if key not in seen:
+            seen.add(key)
+            deduped.append(p)
+    return deduped[:12]  # cap
+
+async def get_paragraphs_for_url(url: str) -> List[str]:
+    html = await fetch_html(url)
+    if not html:
+        return []
+    text = extract_main_text(html, base_url=url)
+    if not text:
+        return []
+    return _split_paragraphs(text)
+
+async def get_paragraphs_with_fallback(url: str, snippet: str | None) -> List[str]:
+    paras = await get_paragraphs_for_url(url)
+    if paras:
+        return paras
+    return [snippet] if snippet else []
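
A minimal sketch of driving the new fetcher outside FastAPI; the target URL is a placeholder and the asyncio.run entry point is an assumption, not part of this commit:

# sketch: exercise the new fetcher module from a standalone script
# assumptions: run from the project root; the example.com URL is a placeholder
import asyncio

from app.fetch.fetcher import get_paragraphs_for_url, get_paragraphs_with_fallback


async def main() -> None:
    url = "https://example.com/article"  # hypothetical URL
    paras = await get_paragraphs_for_url(url)
    print(f"extracted {len(paras)} paragraphs")

    # falls back to the provided snippet when extraction yields nothing
    fallback = await get_paragraphs_with_fallback(url, snippet="search snippet")
    print(f"with fallback: {len(fallback)} item(s)")


if __name__ == "__main__":
    asyncio.run(main())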
app/main.py CHANGED
@@ -28,6 +28,14 @@ async def _search(q: str = Query(..., min_length=3, max_length=200)):
     return {"count": len(results), "items": [r.model_dump() for r in results]}
 
 
+@app.get("/_fetch")
+async def _fetch(u: str = Query(..., min_length=10, max_length=2000)):
+    """Debug fetch endpoint for testing URL content extraction."""
+    from app.fetch.fetcher import get_paragraphs_for_url
+    paras = await get_paragraphs_for_url(u)
+    return {"count": len(paras), "samples": paras[:3]}
+
+
 # Root endpoint
 @app.get("/")
 async def root():
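
A quick way to try the new debug endpoint once the app is running under uvicorn; the localhost:8000 base URL and the target URL are assumptions:

# sketch: call the /_fetch debug endpoint of a locally running instance
# assumptions: app served at http://localhost:8000; the "u" value is a placeholder
import httpx

resp = httpx.get(
    "http://localhost:8000/_fetch",
    params={"u": "https://example.com/article"},
    timeout=30.0,
)
resp.raise_for_status()
data = resp.json()
print(data["count"], data["samples"][:1])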
requirements.txt CHANGED
@@ -6,6 +6,8 @@ uvicorn[standard]==0.24.0
 httpx==0.25.2
 trafilatura==1.8.0
 readability-lxml==0.8.1
+beautifulsoup4==4.12.2
+lxml==4.9.3
 
 # ML and NLP
 transformers==4.35.2