# nlp-web-scraper / app.py
# kuldeep0204's picture
# Create app.py
# 0910ffa verified
# app.py
import gradio as gr
from scraper import fetch, extract
from nlp_pipeline import process_document, embed_text
from vector_store import SimpleVectorStore
import numpy as np
import time
# Module-level vector store shared by crawl_and_index and semantic_search.
# DIM is the embedding width passed to the store's constructor. The original
# author notes 384 as the output size of all-MiniLM-L6-v2 -- presumably the
# model behind nlp_pipeline's embeddings (TODO confirm against nlp_pipeline).
DIM = 384 # all-MiniLM-L6-v2 => 384
store = SimpleVectorStore(dim=DIM)
def crawl_and_index(url):
    """Fetch a page, run the NLP pipeline on it, and add it to the shared store.

    Args:
        url: Address of the page to crawl, as typed into the UI textbox.
            Surrounding whitespace is ignored.

    Returns:
        A ``(status, metadata)`` pair:

        * ``("fetch failed", None)`` -- no URL was supplied, or ``fetch``
          returned no HTML.
        * ``("duplicate/updated - skipped", existing_meta)`` -- the first hit
          returned by the store scored above the duplicate threshold;
          ``existing_meta`` is the metadata stored with that hit.
        * ``("indexed", meta)`` -- the document was added; ``meta`` is the
          metadata dict that was stored alongside its embedding.
    """
    # The textbox can submit an empty or whitespace-only value; answer with
    # the existing failure status instead of handing a blank URL to the
    # fetcher.
    url = (url or "").strip()
    if not url:
        return "fetch failed", None

    html, final_url = fetch(url)
    if not html:
        return "fetch failed", None

    doc = extract(html, final_url)
    nlp = process_document(doc)
    qvec = nlp["embedding"]

    # Simple dedupe: compare the new embedding against what is already stored.
    # The score is treated as a cosine similarity (per the original note) and
    # hits are assumed to come back best-first -- TODO confirm in vector_store.
    duplicate_threshold = 0.90
    if store.index.ntotal > 0:
        hits = store.search(qvec, k=3)
        if hits and hits[0][0] > duplicate_threshold:
            return "duplicate/updated - skipped", hits[0][1]

    meta = {
        "url": doc["url"],
        "title": doc["title"],
        "summary": nlp["summary"],
        "entities": nlp["entities"],
        "provenance": nlp["provenance"],
        # str() keeps the value displayable even when it is None or a
        # date-like object.
        "publish_date": str(doc.get("publish_date")),
        "timestamp": time.time(),
    }
    store.add(qvec, meta)
    return "indexed", meta
def semantic_search(query, k=5):
    """Return the stored documents that best match a free-text query.

    Args:
        query: Free-text search string. A blank query yields no results.
        k: Maximum number of hits to return. Coerced to an integer of at
            least 1, because UI sliders may deliver the value as a float.

    Returns:
        A list of dicts with the keys ``score``, ``title``, ``summary``,
        ``url`` and ``publish_date``, in the order the store returned the
        hits. The list is empty when the query is blank or nothing has been
        indexed yet.
    """
    query = (query or "").strip()
    if not query:
        return []

    # Same guard crawl_and_index applies before searching: skip the lookup
    # entirely while the index is empty.
    if store.index.ntotal == 0:
        return []

    top_k = max(1, int(k))
    qvec = embed_text(query)
    hits = store.search(qvec, k=top_k)

    return [
        {
            # float() turns NumPy scalar scores into plain Python floats so
            # the result can be serialised.
            "score": round(float(score), 4),
            "title": meta["title"],
            "summary": meta["summary"],
            "url": meta["url"],
            "publish_date": meta["publish_date"],
        }
        for score, meta in hits
    ]
# Column order of the search results table; also the keys read from each
# result dict returned by semantic_search.
SEARCH_HEADERS = ["score", "title", "summary", "url", "publish_date"]


def _search_rows(query, k):
    """Run semantic_search and reshape its dicts into table rows.

    gr.Dataframe takes rows (a list of lists), not a list of dicts, so each
    result is flattened in SEARCH_HEADERS order.
    """
    return [
        [result.get(column) for column in SEARCH_HEADERS]
        for result in semantic_search(query, k)
    ]


with gr.Blocks() as demo:
    gr.Markdown("# NLP Web Scraper (HF Space demo)")

    # Crawl section: one URL in, status label plus stored metadata out.
    with gr.Row():
        url_input = gr.Textbox(label="Seed URL", placeholder="https://example.com/article")
        crawl_btn = gr.Button("Crawl & Index")
    status = gr.Label()
    result_box = gr.JSON(label="Indexed document metadata")
    crawl_btn.click(crawl_and_index, inputs=url_input, outputs=[status, result_box])

    # Search section: query text and top-k slider in, results table out.
    gr.Markdown("## Semantic Search")
    query = gr.Textbox(label="Query")
    k = gr.Slider(1, 10, value=5, step=1, label="Top K")
    search_btn = gr.Button("Search")
    search_results = gr.Dataframe(
        headers=SEARCH_HEADERS,
        # "json" is not a documented Dataframe datatype; declare one type per
        # column instead.
        datatype=["number", "str", "str", "str", "str"],
    )
    search_btn.click(_search_rows, inputs=[query, k], outputs=search_results)

if __name__ == "__main__":
    demo.launch()