feat: implement evidence selection logic and add debug endpoint
Files changed:
- app/logic/selector.py  +62 -1
- app/main.py            +10 -0
- app/nlp/embed.py       +26 -1
- requirements.txt       +3 -1
app/logic/selector.py
CHANGED
@@ -1 +1,62 @@
-
+# app/logic/selector.py
+from __future__ import annotations
+import asyncio
+from typing import List
+import numpy as np
+
+from app.schemas import Source
+from app.fetch.fetcher import get_paragraphs_with_fallback
+from app.nlp.embed import embed_text, embed_texts
+
+SIM_THRESHOLD = 0.25  # drop very weak matches
+
+async def select_evidence(
+    claim: str,
+    sources: List[Source],
+    per_source: int = 2,
+    max_total: int = 8,
+) -> List[Source]:
+    claim_vec = embed_text(claim)
+
+    # fetch paragraphs concurrently
+    tasks = [get_paragraphs_with_fallback(s.url, s.snippet) for s in sources]
+    all_paras = await asyncio.gather(*tasks)
+
+    selected_sources: list[Source] = []
+    for s, paras in zip(sources, all_paras):
+        if not paras:
+            selected_sources.append(s)
+            continue
+
+        para_vecs = embed_texts(paras)
+        sims = para_vecs @ claim_vec  # cosine similarity: vectors are L2-normalized
+        top_idx = np.argsort(-sims)[:per_source]
+
+        evidence: list[str] = []
+        for i in top_idx:
+            score = float(sims[i])
+            if score < SIM_THRESHOLD:
+                continue
+            text = paras[i].strip()
+            if len(text) > 500:
+                text = text[:497] + "..."
+            evidence.append(text)
+
+        selected_sources.append(
+            Source(title=s.title, url=s.url, snippet=s.snippet, evidence=evidence)
+        )
+
+    # cap total evidence across all sources
+    def total_evidence() -> int:
+        return sum(len(s.evidence) for s in selected_sources)
+
+    if total_evidence() > max_total:
+        # trim round-robin
+        while total_evidence() > max_total:
+            for s in selected_sources:
+                if s.evidence:
+                    s.evidence.pop()
+                if total_evidence() <= max_total:
+                    break
+
+    return selected_sources
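For context, a minimal sketch of how select_evidence might be driven from a script. This assumes Source accepts the same fields used in the constructor call above (title, url, snippet, evidence) and that no event loop is already running; the URL and claim are placeholders, and the real schema lives in app.schemas.

import asyncio
from app.schemas import Source
from app.logic.selector import select_evidence

async def demo() -> None:
    # Hypothetical inputs for illustration only
    sources = [
        Source(title="Example page", url="https://example.org/article",
               snippet="Placeholder snippet...", evidence=[]),
    ]
    picked = await select_evidence("Vitamin C shortens the common cold", sources)
    for s in picked:
        print(s.url, len(s.evidence))

asyncio.run(demo())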
app/main.py
CHANGED
@@ -36,6 +36,16 @@ async def _fetch(u: str = Query(..., min_length=10, max_length=2000)):
     return {"count": len(paras), "samples": paras[:3]}
 
 
+@app.get("/_select")
+async def _select(claim: str = Query(..., min_length=8, max_length=300)):
+    """Debug select endpoint for testing evidence selection."""
+    from app.logic.selector import select_evidence
+    search = get_search()
+    sources = await search(claim)
+    picked = await select_evidence(claim, sources, per_source=2, max_total=8)
+    return {"n_sources": len(picked), "items": [s.model_dump() for s in picked]}
+
+
 # Root endpoint
 @app.get("/")
 async def root():
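One way to exercise the new /_select endpoint without starting a server is FastAPI's TestClient. This is a sketch, assuming the FastAPI instance is exposed as app in app/main.py and that get_search() is already configured; the claim string is made up.

# Sketch: drive the debug endpoint in-process with FastAPI's TestClient.
from fastapi.testclient import TestClient
from app.main import app  # assumption: the FastAPI instance is named `app`

client = TestClient(app)
resp = client.get("/_select", params={"claim": "Coffee consumption lowers stroke risk"})
resp.raise_for_status()
body = resp.json()
print(body["n_sources"], [len(item["evidence"]) for item in body["items"]])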
app/nlp/embed.py
CHANGED
@@ -1 +1,26 @@
-
+# app/nlp/embed.py
+from __future__ import annotations
+from functools import lru_cache
+import numpy as np
+from sentence_transformers import SentenceTransformer
+
+MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
+
+@lru_cache(maxsize=1)
+def _load_model() -> SentenceTransformer:
+    # CPU is fine for this model
+    return SentenceTransformer(MODEL_NAME)
+
+def embed_texts(texts: list[str]) -> np.ndarray:
+    model = _load_model()
+    vecs = model.encode(
+        texts,
+        batch_size=32,
+        convert_to_numpy=True,
+        normalize_embeddings=True,
+        show_progress_bar=False,
+    )
+    return vecs.astype("float32")
+
+def embed_text(text: str) -> np.ndarray:
+    return embed_texts([text])[0]
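Because embed_texts passes normalize_embeddings=True, every row comes back L2-normalized, which is what lets the selector compute cosine similarity with a plain dot product (para_vecs @ claim_vec). A quick sanity check, with made-up sentences:

import numpy as np
from app.nlp.embed import embed_text, embed_texts

vecs = embed_texts(["The sky is blue.", "Stocks fell sharply today."])
query = embed_text("What color is the sky?")

# unit-length rows => dot product equals cosine similarity
assert np.allclose(np.linalg.norm(vecs, axis=1), 1.0, atol=1e-4)
sims = vecs @ query            # shape (2,), values in [-1, 1]
print(sims.argmax())           # expected: 0 (the sky sentence)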
requirements.txt
CHANGED
@@ -11,7 +11,7 @@ lxml==4.9.3
 
 # ML and NLP
 transformers==4.35.2
-sentence-transformers==2.
+sentence-transformers==2.7.0
 torch==2.1.1
 scikit-learn==1.3.2
 
@@ -20,3 +20,5 @@ python-dotenv==1.0.0
 
 # Optional web interface
 jinja2==3.1.2
+
+numpy==1.24.4