Spaces:
Running
Running
Bug fix (second attempt): "Request too large" — TPM limit 6000, requested 6841; reduce max_tokens for the judge and truncate oversized prompts.
Browse files- ragbench_eval/judge.py +109 -9
ragbench_eval/judge.py
CHANGED
|
@@ -1,22 +1,63 @@
|
|
| 1 |
import json
|
|
|
|
| 2 |
from pathlib import Path
|
| 3 |
from typing import Any, Dict, List, Tuple
|
| 4 |
|
| 5 |
from .llm import LLMClient
|
| 6 |
from .config import JUDGE_MODEL
|
| 7 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
def format_docs_with_keys(
|
| 10 |
documents_sentences: List[List[Tuple[str, str]]]
|
| 11 |
) -> str:
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
for doc in documents_sentences:
|
| 14 |
for key, sent in doc:
|
| 15 |
blocks.append(f"{key}: {sent}")
|
| 16 |
-
|
|
|
|
| 17 |
return "\n".join(blocks).strip()
|
| 18 |
|
| 19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
class RAGJudge:
|
| 21 |
def __init__(self, prompt_path: str = "prompts/ragbench_judge_prompt.txt"):
|
| 22 |
self.client = LLMClient(JUDGE_MODEL)
|
|
@@ -28,12 +69,18 @@ class RAGJudge:
|
|
| 28 |
answer: str,
|
| 29 |
docs_sentences: List[List[Tuple[str, str]]],
|
| 30 |
) -> Dict[str, Any]:
|
|
|
|
| 31 |
docs_block = format_docs_with_keys(docs_sentences)
|
|
|
|
|
|
|
|
|
|
| 32 |
prompt = self.prompt_template.format(
|
| 33 |
documents=docs_block,
|
| 34 |
question=question,
|
| 35 |
answer=answer,
|
| 36 |
)
|
|
|
|
|
|
|
| 37 |
messages = [
|
| 38 |
{
|
| 39 |
"role": "system",
|
|
@@ -41,21 +88,74 @@ class RAGJudge:
|
|
| 41 |
},
|
| 42 |
{"role": "user", "content": prompt},
|
| 43 |
]
|
| 44 |
-
#raw = self.client.chat(messages, max_tokens=2048)
|
| 45 |
-
raw = self.client.chat(messages, max_tokens=512)
|
| 46 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
try:
|
| 48 |
data = json.loads(raw)
|
| 49 |
-
except json.JSONDecodeError
|
| 50 |
-
|
| 51 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
"relevance_explanation",
|
| 53 |
"all_relevant_sentence_keys",
|
| 54 |
"overall_supported_explanation",
|
| 55 |
"overall_supported",
|
| 56 |
"sentence_support_information",
|
| 57 |
"all_utilized_sentence_keys",
|
| 58 |
-
]
|
|
|
|
|
|
|
| 59 |
if key not in data:
|
| 60 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
return data
|
|
|
|
| 1 |
import json
|
| 2 |
+
import logging
|
| 3 |
from pathlib import Path
|
| 4 |
from typing import Any, Dict, List, Tuple
|
| 5 |
|
| 6 |
from .llm import LLMClient
|
| 7 |
from .config import JUDGE_MODEL
|
| 8 |
|
| 9 |
+
logger = logging.getLogger(__name__)
|
| 10 |
+
|
| 11 |
+
# Hard limits to stay under Groq's token constraints
|
| 12 |
+
# Rough rule-of-thumb: 4 characters ≈ 1 token in English.
|
| 13 |
+
MAX_DOC_CHARS = 8000 # limit for the "documents" block
|
| 14 |
+
MAX_PROMPT_CHARS = 12000 # limit for the full judge prompt
|
| 15 |
+
|
| 16 |
|
| 17 |
def format_docs_with_keys(
    documents_sentences: List[List[Tuple[str, str]]]
) -> str:
    """
    Flatten a nested list of (sentence_key, sentence_text) pairs into the
    `<key>: <text>` lines expected by the judge prompt, with a blank line
    separating consecutive documents.
    """
    rendered: List[str] = []
    for sentences in documents_sentences:
        rendered.extend(f"{key}: {sent}" for key, sent in sentences)
        # Blank separator line after each document.
        rendered.append("")
    return "\n".join(rendered).strip()
|
| 31 |
|
| 32 |
|
| 33 |
+
def _truncate(text: str, limit: int) -> str:
|
| 34 |
+
"""
|
| 35 |
+
Truncate a long string to at most `limit` characters, appending a marker
|
| 36 |
+
so the judge knows context was cut.
|
| 37 |
+
"""
|
| 38 |
+
if len(text) <= limit:
|
| 39 |
+
return text
|
| 40 |
+
return text[:limit] + "\n[TRUNCATED]\n"
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def _default_annotation(reason: str) -> Dict[str, Any]:
|
| 44 |
+
"""
|
| 45 |
+
Safe fallback annotation used when the judge LLM fails
|
| 46 |
+
(size/rate limit, invalid JSON, etc.).
|
| 47 |
+
"""
|
| 48 |
+
return {
|
| 49 |
+
"relevance_explanation": f"Automatic fallback: {reason}",
|
| 50 |
+
"all_relevant_sentence_keys": [],
|
| 51 |
+
"overall_supported_explanation": (
|
| 52 |
+
"No reliable judgement could be produced because the judge LLM "
|
| 53 |
+
"call failed or the output was not valid JSON."
|
| 54 |
+
),
|
| 55 |
+
"overall_supported": False,
|
| 56 |
+
"sentence_support_information": [],
|
| 57 |
+
"all_utilized_sentence_keys": [],
|
| 58 |
+
}
|
| 59 |
+
|
| 60 |
+
|
| 61 |
class RAGJudge:
|
| 62 |
def __init__(self, prompt_path: str = "prompts/ragbench_judge_prompt.txt"):
|
| 63 |
self.client = LLMClient(JUDGE_MODEL)
|
|
|
|
| 69 |
answer: str,
|
| 70 |
docs_sentences: List[List[Tuple[str, str]]],
|
| 71 |
) -> Dict[str, Any]:
|
| 72 |
+
# 1) Format docs and truncate to stay under token limits
|
| 73 |
docs_block = format_docs_with_keys(docs_sentences)
|
| 74 |
+
docs_block = _truncate(docs_block, MAX_DOC_CHARS)
|
| 75 |
+
|
| 76 |
+
# 2) Build prompt and also apply a global char-limit
|
| 77 |
prompt = self.prompt_template.format(
|
| 78 |
documents=docs_block,
|
| 79 |
question=question,
|
| 80 |
answer=answer,
|
| 81 |
)
|
| 82 |
+
prompt = _truncate(prompt, MAX_PROMPT_CHARS)
|
| 83 |
+
|
| 84 |
messages = [
|
| 85 |
{
|
| 86 |
"role": "system",
|
|
|
|
| 88 |
},
|
| 89 |
{"role": "user", "content": prompt},
|
| 90 |
]
|
|
|
|
|
|
|
| 91 |
|
| 92 |
+
# 3) Call LLM with smaller max_tokens and catch Groq 413 / rate limit errors
|
| 93 |
+
try:
|
| 94 |
+
raw = self.client.chat(messages, max_tokens=512)
|
| 95 |
+
except Exception as e:
|
| 96 |
+
msg = str(e)
|
| 97 |
+
if (
|
| 98 |
+
"rate_limit_exceeded" in msg
|
| 99 |
+
or "Request too large" in msg
|
| 100 |
+
or "413" in msg
|
| 101 |
+
):
|
| 102 |
+
logger.warning("Judge LLM call failed due to size/limit: %s", msg)
|
| 103 |
+
return _default_annotation(
|
| 104 |
+
"judge LLM request was too large or hit a rate limit."
|
| 105 |
+
)
|
| 106 |
+
# Other errors should still surface
|
| 107 |
+
raise
|
| 108 |
+
|
| 109 |
+
if not isinstance(raw, str):
|
| 110 |
+
raw = str(raw)
|
| 111 |
+
|
| 112 |
+
# 4) Parse JSON robustly
|
| 113 |
try:
|
| 114 |
data = json.loads(raw)
|
| 115 |
+
except json.JSONDecodeError:
|
| 116 |
+
# Try to salvage JSON between first '{' and last '}'
|
| 117 |
+
start = raw.find("{")
|
| 118 |
+
end = raw.rfind("}")
|
| 119 |
+
if start != -1 and end != -1 and end > start:
|
| 120 |
+
candidate = raw[start : end + 1]
|
| 121 |
+
try:
|
| 122 |
+
data = json.loads(candidate)
|
| 123 |
+
except json.JSONDecodeError as e2:
|
| 124 |
+
logger.error("Judge JSON parse error after salvage: %s", e2)
|
| 125 |
+
logger.debug(
|
| 126 |
+
"Raw judge output (first 500 chars): %s", raw[:500]
|
| 127 |
+
)
|
| 128 |
+
return _default_annotation("could not parse judge JSON output.")
|
| 129 |
+
else:
|
| 130 |
+
logger.error(
|
| 131 |
+
"Judge JSON parse error: could not find JSON object in output."
|
| 132 |
+
)
|
| 133 |
+
logger.debug(
|
| 134 |
+
"Raw judge output (first 500 chars): %s", raw[:500]
|
| 135 |
+
)
|
| 136 |
+
return _default_annotation("could not parse judge JSON output.")
|
| 137 |
+
|
| 138 |
+
# 5) Ensure required keys exist; fill missing with safe defaults
|
| 139 |
+
required_keys = [
|
| 140 |
"relevance_explanation",
|
| 141 |
"all_relevant_sentence_keys",
|
| 142 |
"overall_supported_explanation",
|
| 143 |
"overall_supported",
|
| 144 |
"sentence_support_information",
|
| 145 |
"all_utilized_sentence_keys",
|
| 146 |
+
]
|
| 147 |
+
|
| 148 |
+
for key in required_keys:
|
| 149 |
if key not in data:
|
| 150 |
+
if key in ("relevance_explanation", "overall_supported_explanation"):
|
| 151 |
+
data[key] = ""
|
| 152 |
+
elif key in (
|
| 153 |
+
"all_relevant_sentence_keys",
|
| 154 |
+
"sentence_support_information",
|
| 155 |
+
"all_utilized_sentence_keys",
|
| 156 |
+
):
|
| 157 |
+
data[key] = []
|
| 158 |
+
elif key == "overall_supported":
|
| 159 |
+
data[key] = False
|
| 160 |
+
|
| 161 |
return data
|