# app.py
import time

import gradio as gr
import numpy as np

from scraper import fetch, extract
from nlp_pipeline import process_document, embed_text
from vector_store import SimpleVectorStore

# Initialise the store; the dimension must match the embedding model.
DIM = 384  # all-MiniLM-L6-v2 produces 384-dimensional embeddings
store = SimpleVectorStore(dim=DIM)


def crawl_and_index(url):
    html, final_url = fetch(url)
    if not html:
        return "fetch failed", None
    doc = extract(html, final_url)
    nlp = process_document(doc)

    # Simple dedupe: search the store and skip near-identical documents.
    qvec = nlp["embedding"]
    if store.index.ntotal > 0:
        hits = store.search(qvec, k=3)
        if hits and hits[0][0] > 0.90:  # very similar (cosine)
            return "duplicate/updated - skipped", hits[0][1]

    meta = {
        "url": doc["url"],
        "title": doc["title"],
        "summary": nlp["summary"],
        "entities": nlp["entities"],
        "provenance": nlp["provenance"],
        "publish_date": str(doc.get("publish_date")),
        "timestamp": time.time(),
    }
    store.add(qvec, meta)
    return "indexed", meta


def semantic_search(query, k=5):
    qvec = embed_text(query)
    hits = store.search(qvec, k=int(k))  # Gradio sliders pass floats
    # Return rows (list of lists) so gr.Dataframe renders them as a table.
    return [
        [round(score, 4), meta["title"], meta["summary"], meta["url"], meta["publish_date"]]
        for score, meta in hits
    ]


with gr.Blocks() as demo:
    gr.Markdown("# NLP Web Scraper (HF Space demo)")
    with gr.Row():
        url_input = gr.Textbox(label="Seed URL", placeholder="https://example.com/article")
        crawl_btn = gr.Button("Crawl & Index")
    status = gr.Label()
    result_box = gr.JSON(label="Indexed document metadata")
    crawl_btn.click(crawl_and_index, inputs=url_input, outputs=[status, result_box])

    gr.Markdown("## Semantic Search")
    query = gr.Textbox(label="Query")
    k = gr.Slider(1, 10, value=5, step=1, label="Top K")
    search_btn = gr.Button("Search")
    search_results = gr.Dataframe(
        headers=["score", "title", "summary", "url", "publish_date"],
        datatype=["number", "str", "str", "str", "str"],
    )
    search_btn.click(semantic_search, inputs=[query, k], outputs=search_results)

if __name__ == "__main__":
    demo.launch()
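

# For reference, a minimal sketch of two helpers app.py imports but does not
# show. This is an assumption about their shape, not the project's actual
# code: `SimpleVectorStore` is assumed to wrap a FAISS inner-product index
# over L2-normalised vectors (so scores behave like cosine similarity,
# matching the 0.90 dedupe threshold above), and `embed_text` is assumed to
# call sentence-transformers' all-MiniLM-L6-v2, which matches DIM = 384.

# vector_store.py (hypothetical sketch)
import faiss
import numpy as np


class SimpleVectorStore:
    def __init__(self, dim):
        self.index = faiss.IndexFlatIP(dim)  # inner product == cosine on unit vectors
        self.metas = []                      # metadata, kept in insertion order

    def add(self, vec, meta):
        v = np.asarray(vec, dtype="float32").reshape(1, -1)
        faiss.normalize_L2(v)                # normalise so IP scores are cosine
        self.index.add(v)
        self.metas.append(meta)

    def search(self, vec, k=5):
        if self.index.ntotal == 0:
            return []
        v = np.asarray(vec, dtype="float32").reshape(1, -1)
        faiss.normalize_L2(v)
        scores, ids = self.index.search(v, min(k, self.index.ntotal))
        return [(float(s), self.metas[i]) for s, i in zip(scores[0], ids[0]) if i != -1]


# nlp_pipeline.embed_text (hypothetical sketch)
from sentence_transformers import SentenceTransformer

_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")


def embed_text(text):
    # Returns a 384-dim float32 vector; normalisation happens in the store.
    return _model.encode(text)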