# scraper.py
import logging

import requests
from bs4 import BeautifulSoup
from newspaper import Article
from readability import Document

from utils import canonicalize_url

logger = logging.getLogger(__name__)

HEADERS = {"User-Agent": "nlp-web-scraper/1.0 (+https://huggingface.co/)"}
def fetch(url, timeout=10, allow_redirects=True):
    """Fetch a URL, returning (html, final_url), or (None, url) on failure."""
    try:
        r = requests.get(url, timeout=timeout, headers=HEADERS, allow_redirects=allow_redirects)
        r.raise_for_status()
        return r.text, r.url
    except requests.RequestException as e:
        logger.warning("fetch failed %s: %s", url, e)
        return None, url
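The `extract` function below relies on `canonicalize_url` from the local `utils` module, which is not shown in this Space. Judging from the call site, `canonicalize_url(src, base=url)`, it resolves possibly-relative image URLs against the page URL. A minimal sketch under that assumption (the real `utils.py` may well do more, such as stripping tracking parameters):

# utils.py -- hypothetical sketch; the actual module is not shown here
from urllib.parse import urljoin, urldefrag

def canonicalize_url(url, base=None):
    """Resolve a possibly-relative URL against base and drop any #fragment."""
    if base:
        url = urljoin(base, url)  # e.g. "/img/a.png" -> "https://site.com/img/a.png"
    url, _fragment = urldefrag(url)
    return url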
# scraper.py (continued)
def extract(html, url):
    # Primary extraction: readability isolates the main article body.
    doc = Document(html)
    title = doc.short_title() or ""
    summary_html = doc.summary()
    soup = BeautifulSoup(summary_html, "html.parser")
    main_text = soup.get_text(separator="\n").strip()

    # Fallback: newspaper3k supplies metadata readability lacks (authors,
    # publish date). Passing input_html avoids a second network fetch.
    article = Article(url)
    try:
        article.download(input_html=html)
        article.parse()
    except Exception:
        pass  # metadata is best-effort; keep readability's text regardless

    authors = article.authors or []
    publish_date = article.publish_date

    # Naive "sentence" split: one entry per non-empty line of extracted text.
    sentences = [s.strip() for s in main_text.splitlines() if s.strip()]

    # Gather up to six images from the full page, resolving relative URLs.
    page_soup = BeautifulSoup(html, "html.parser")
    images = []
    for img in page_soup.find_all("img")[:6]:
        src = img.get("src") or img.get("data-src")
        if src:
            images.append(canonicalize_url(src, base=url))
    return {
        "url": url,
        "title": title,
        "authors": authors,
        "publish_date": publish_date,
        "text": main_text,
        "sentences": sentences,
        "images": images,
    }
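For reference, a minimal driver showing how the two functions chain together; the URL is a placeholder, and real callers should expect `fetch` to return `(None, url)` on network errors:

# demo.py -- minimal usage sketch (example URL is a placeholder)
from scraper import fetch, extract

html, final_url = fetch("https://example.com/some-article")
if html is not None:
    data = extract(html, final_url)
    print(data["title"], data["publish_date"])
    print(f"{len(data['sentences'])} sentences, {len(data['images'])} images")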