```python
# app.py
import gradio as gr
from scraper import fetch, extract
from nlp_pipeline import process_document, embed_text
from vector_store import SimpleVectorStore
import numpy as np
import time

# init store (embedding dim comes from the summary model)
DIM = 384  # all-MiniLM-L6-v2 => 384
store = SimpleVectorStore(dim=DIM)

def crawl_and_index(url):
    html, final_url = fetch(url)
    if not html:
        return "fetch failed", None
    doc = extract(html, final_url)
    nlp = process_document(doc)

    # simple dedupe: search against store and check similarity
    qvec = nlp["embedding"]
    if store.index.ntotal > 0:
        hits = store.search(qvec, k=3)
        if hits and hits[0][0] > 0.90:  # very similar (cosine)
            return "duplicate/updated - skipped", hits[0][1]

    meta = {
        "url": doc["url"],
        "title": doc["title"],
        "summary": nlp["summary"],
        "entities": nlp["entities"],
        "provenance": nlp["provenance"],
        "publish_date": str(doc.get("publish_date")),
        "timestamp": time.time(),
    }
    store.add(qvec, meta)
    return "indexed", meta

def semantic_search(query, k=5):
    qvec = embed_text(query)
    hits = store.search(qvec, k=int(k))
    # Return rows (list of lists) so the results render in the gr.Dataframe below.
    rows = []
    for score, meta in hits:
        rows.append([
            round(score, 4),
            meta["title"],
            meta["summary"],
            meta["url"],
            meta["publish_date"],
        ])
    return rows

with gr.Blocks() as demo:
    gr.Markdown("# NLP Web Scraper (HF Space demo)")
    with gr.Row():
        url_input = gr.Textbox(label="Seed URL", placeholder="https://example.com/article")
        crawl_btn = gr.Button("Crawl & Index")
    status = gr.Label()
    result_box = gr.JSON(label="Indexed document metadata")
    crawl_btn.click(crawl_and_index, inputs=url_input, outputs=[status, result_box])

    gr.Markdown("## Semantic Search")
    query = gr.Textbox(label="Query")
    k = gr.Slider(1, 10, value=5, step=1, label="Top K")
    search_btn = gr.Button("Search")
    search_results = gr.Dataframe(
        headers=["score", "title", "summary", "url", "publish_date"],
        datatype=["number", "str", "str", "str", "str"],
    )
    search_btn.click(semantic_search, inputs=[query, k], outputs=search_results)
| if __name__ == "__main__": | |
| demo.launch() | |
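The dedupe check in `crawl_and_index` treats the top hit's score as a cosine similarity in [0, 1], and `semantic_search` unpacks each hit as a `(score, meta)` pair. If `vector_store.py` is not already defined elsewhere, a minimal sketch that satisfies this contract (FAISS inner product over L2-normalized embeddings, with metadata kept in a parallel list) could look like the following; the class name and method signatures simply mirror how `app.py` calls them, so treat this as illustrative rather than the actual module:

```python
# vector_store.py -- illustrative sketch only; the real module may differ.
# app.py expects: store.index.ntotal, store.add(vec, meta),
# and store.search(vec, k) returning (cosine_similarity, metadata) pairs.
import faiss
import numpy as np


class SimpleVectorStore:
    def __init__(self, dim):
        # Inner product over L2-normalized vectors == cosine similarity.
        self.index = faiss.IndexFlatIP(dim)
        self.metadata = []

    @staticmethod
    def _normalize(vec):
        v = np.asarray(vec, dtype="float32").reshape(1, -1)
        norm = np.linalg.norm(v)
        return v / norm if norm > 0 else v

    def add(self, vec, meta):
        self.index.add(self._normalize(vec))
        self.metadata.append(meta)

    def search(self, vec, k=5):
        if self.index.ntotal == 0:
            return []
        k = min(int(k), self.index.ntotal)
        scores, ids = self.index.search(self._normalize(vec), k)
        return [(float(scores[0][i]), self.metadata[ids[0][i]]) for i in range(len(ids[0]))]
```

To run this as a Gradio Space, `app.py` is expected at the repository root alongside `scraper.py`, `nlp_pipeline.py`, `vector_store.py`, and a `requirements.txt` listing the dependencies (for example `gradio`, `faiss-cpu`, `numpy`, plus whatever the scraping and NLP modules need).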