```python
# nlp_pipeline.py
from transformers import pipeline
from sentence_transformers import SentenceTransformer

# Load lighter/CPU-friendly models for HF Space
SUMMARIZER = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", device=-1)
# NER model (token-classification); aggregation_strategy="simple" merges sub-tokens
NER = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english",
               aggregation_strategy="simple", device=-1)
EMBED_MODEL = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")  # small & fast


def summarize(text, max_length=120):
    # short texts fit in a single pass
    if len(text) < 800:
        s = SUMMARIZER(text, max_length=max_length, min_length=40, do_sample=False)
        return s[0]["summary_text"]
    # naive character-based chunking for longer texts
    parts = []
    chunk_size = 700
    for i in range(0, len(text), chunk_size):
        chunk = text[i:i + chunk_size]
        parts.append(SUMMARIZER(chunk, max_length=60, min_length=20)[0]["summary_text"])
    return " ".join(parts)


def extract_entities(text):
    # NER returns a list of {'entity_group', 'score', 'word', 'start', 'end'}
    grouped = {}
    for ent in NER(text):
        key = ent.get("entity_group") or ent.get("entity")
        grouped.setdefault(key, []).append({"text": ent["word"], "score": float(ent["score"])})
    return grouped


def embed_text(text):
    return EMBED_MODEL.encode(text, convert_to_numpy=True, normalize_embeddings=True)


def get_sentence_provenance(sentences, entities):
    # map each entity text to the sentences that contain it (case-insensitive)
    prov = {}
    for t in entities:
        prov[t] = [s for s in sentences if t.lower() in s.lower()]
    return prov


def process_document(doc):
    text = doc["text"]
    summary = summarize(text)
    entities_grouped = extract_entities(text)
    # flatten entity strings (unique)
    entity_texts = set()
    for items in entities_grouped.values():
        for item in items:
            entity_texts.add(item["text"])
    provenance = get_sentence_provenance(doc["sentences"], entity_texts)
    embedding = embed_text(summary)  # index the summary embedding for compactness
    tags = []  # optional: simple tags from the most frequent NER labels
    return {
        "summary": summary,
        "entities": entities_grouped,
        "entity_texts": list(entity_texts),
        "provenance": provenance,
        "embedding": embedding,
        "tags": tags,
    }
```
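`process_document` expects each document as a dict with the raw `text` plus pre-split `sentences`, which the provenance step matches entities against. Below is a minimal driver sketch showing that contract; the `demo.py` name, the sample text, and the regex sentence splitter are illustrative assumptions (a real app would use a proper tokenizer such as NLTK's `sent_tokenize`), not part of the Space itself.

```python
# demo.py -- illustrative driver for nlp_pipeline (regex splitter is a
# stand-in for a real sentence tokenizer)
import re

from nlp_pipeline import process_document


def naive_sentences(text):
    # split on sentence-ending punctuation followed by whitespace
    return [s.strip() for s in re.split(r"(?<=[.!?])\s+", text) if s.strip()]


raw = (
    "Hugging Face is based in New York City. "
    "The company maintains the transformers library, which is widely used for NLP."
)
doc = {"text": raw, "sentences": naive_sentences(raw)}

result = process_document(doc)
print(result["summary"])          # short inputs like this may trigger a length warning
print(result["entities"])         # e.g. {"ORG": [...], "LOC": [...]}
print(result["provenance"])       # entity text -> sentences containing it
print(result["embedding"].shape)  # (384,) for all-MiniLM-L6-v2
```

Embedding the summary rather than the full text keeps the index compact, at the cost of retrieval being only as good as the summary; indexing per-chunk embeddings of the original text would be the heavier alternative.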