# scraper.py
import logging

import requests
from bs4 import BeautifulSoup
from newspaper import Article
from readability import Document

from utils import canonicalize_url

logger = logging.getLogger(__name__)

HEADERS = {"User-Agent": "nlp-web-scraper/1.0 (+https://huggingface.co/)"}


def fetch(url, timeout=10, allow_redirects=True):
    """Fetch a URL and return (html, final_url), or (None, url) on failure."""
    try:
        r = requests.get(url, timeout=timeout, headers=HEADERS,
                         allow_redirects=allow_redirects)
        r.raise_for_status()
        return r.text, r.url
    except Exception as e:
        logger.warning("fetch failed %s: %s", url, e)
        return None, url


def extract(html, url):
    # Primary extraction: readability strips boilerplate and keeps the article body.
    doc = Document(html)
    title = doc.short_title() or ""
    summary_html = doc.summary()
    soup = BeautifulSoup(summary_html, "html.parser")
    main_text = soup.get_text(separator="\n").strip()

    # Fallback: newspaper parses metadata (authors, publish date) that
    # readability does not expose. Reuse the already-fetched HTML so we
    # don't hit the network twice.
    article = Article(url)
    authors = []
    publish_date = None
    try:
        article.download(input_html=html)
        article.parse()
        authors = article.authors or []
        publish_date = article.publish_date
    except Exception:
        logger.debug("newspaper metadata extraction failed for %s", url)

    # Naive "sentence" split: one entry per non-empty line of the
    # readability output (block elements were joined with "\n" above).
    sentences = [s.strip() for s in main_text.splitlines() if s.strip()]

    # Gather up to six images from the full page, resolving relative URLs
    # against the page URL; data-src covers lazy-loaded images.
    page_soup = BeautifulSoup(html, "html.parser")
    images = []
    for img in page_soup.find_all("img")[:6]:
        src = img.get("src") or img.get("data-src")
        if src:
            images.append(canonicalize_url(src, base=url))

    return {
        "url": url,
        "title": title,
        "authors": authors,
        "publish_date": publish_date,
        "text": main_text,
        "sentences": sentences,
        "images": images,
    }
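

# Usage sketch (illustrative, not part of the original module): fetch a page
# and run the extractor. The URL below is a placeholder; json.dumps uses
# default=str because publish_date may be a datetime.
if __name__ == "__main__":
    import json
    import sys

    logging.basicConfig(level=logging.INFO)
    target = sys.argv[1] if len(sys.argv) > 1 else "https://example.com/article"
    html, final_url = fetch(target)
    if html is None:
        sys.exit("fetch failed")
    record = extract(html, final_url)
    print(json.dumps(record, default=str, indent=2))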