File size: 1,771 Bytes
c9ec5ea
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
# scraper.py
import requests
from readability import Document
from bs4 import BeautifulSoup
from newspaper import Article
from utils import canonicalize_url
from urllib.parse import urlparse
import logging

# Module-level logger namespaced to this module (standard logging convention).
logger = logging.getLogger(__name__)

# Identify the scraper politely; some sites block requests' default User-Agent.
HEADERS = {"User-Agent": "nlp-web-scraper/1.0 (+https://huggingface.co/)"}

def fetch(url, timeout=10, allow_redirects=True):
    """Fetch *url* and return ``(html_text, final_url)``.

    Best-effort: any network/HTTP failure is logged and ``(None, url)``
    is returned so callers can skip the page instead of crashing.

    Parameters
    ----------
    url : str
        Absolute URL to retrieve.
    timeout : int | float
        Per-request timeout in seconds.
    allow_redirects : bool
        Whether redirects are followed; the returned URL is the final
        (post-redirect) one, so callers can canonicalize against it.
    """
    try:
        r = requests.get(url, timeout=timeout, headers=HEADERS, allow_redirects=allow_redirects)
        r.raise_for_status()  # turn 4xx/5xx responses into HTTPError
        return r.text, r.url  # r.url reflects any redirects followed
    except requests.RequestException as e:
        # Narrowed from bare `Exception`: RequestException is the base of
        # all requests failures (connection, timeout, HTTPError). Genuine
        # programming errors now propagate instead of being swallowed.
        logger.warning("fetch failed %s: %s", url, e)
        return None, url

def extract(html, url):
    """Extract the main article content and metadata from raw *html*.

    Uses readability as the primary body/title extractor, then runs a
    best-effort newspaper pass for metadata (authors, publish date) that
    readability does not provide.

    Parameters
    ----------
    html : str
        Full page HTML, as returned by :func:`fetch`.
    url : str
        The page URL, used for newspaper and to resolve relative image links.

    Returns
    -------
    dict
        Keys: ``url``, ``title``, ``authors`` (list), ``publish_date``
        (may be None), ``text``, ``sentences`` (list of str), ``images``
        (list of canonicalized URLs, at most 6).
    """
    # Primary extraction: readability isolates the main article body.
    doc = Document(html)
    title = doc.short_title() or ""
    summary_html = doc.summary()
    soup = BeautifulSoup(summary_html, "html.parser")
    main_text = soup.get_text(separator="\n").strip()
    # Secondary pass: newspaper fills in authors/publish_date. Best-effort:
    # failures are logged (not silently swallowed) and extraction continues
    # with empty metadata.
    article = Article(url)
    try:
        article.download(input_html=html)  # reuse the fetched HTML; no new request
        article.parse()
    except Exception as e:
        logger.debug("newspaper metadata pass failed for %s: %s", url, e)
    authors = article.authors or []
    publish_date = article.publish_date  # None when newspaper found nothing
    # Very naive sentence split: one "sentence" per non-empty line of text.
    sentences = [s.strip() for s in main_text.splitlines() if s.strip()]
    # Collect up to 6 image URLs from the full page (not just the readability
    # summary), falling back to lazy-load `data-src` attributes.
    page_soup = BeautifulSoup(html, "html.parser")
    images = []
    for img in page_soup.find_all("img")[:6]:
        src = img.get("src") or img.get("data-src")
        if src:
            images.append(canonicalize_url(src, base=url))
    return {
        "url": url,
        "title": title,
        "authors": authors,
        "publish_date": publish_date,
        "text": main_text,
        "sentences": sentences,
        "images": images
    }