import logging
from pathlib import Path
from typing import List, Tuple

from .llm import LLMClient
from .config import GEN_MODEL

logger = logging.getLogger(__name__)


def _truncate(text: str, max_chars: int) -> str:
    """
    Hard truncate long strings to keep the prompt under the LLM TPM limit.
    """
    if len(text) <= max_chars:
        return text
    # leave a small suffix to show that it was truncated
    return text[: max_chars - 32] + "... [TRUNCATED]"


def format_docs_with_keys(
    documents_sentences: List[List[Tuple[str, str]]]
) -> str:
    """
    Format retrieved documents for the generator prompt.

    documents_sentences: list over documents; each document is a list of
    (key, sentence) pairs.
    """
    blocks: List[str] = []
    for doc in documents_sentences:
        for key, sentence in doc:
            blocks.append(f"{key}: {sentence}")
        # blank line between documents
        blocks.append("")
    return "\n".join(blocks).strip()


class RAGGenerator:
    """
    RAG answer generator.

    - Builds a prompt using a template + retrieved documents
    - Truncates context so the request fits Groq's TPM limit
    - Calls the LLM via LLMClient
    - On any LLM error (including 413 / rate_limit_exceeded), returns a safe
      fallback answer string instead of raising.
    """

    def __init__(self, client: LLMClient | None = None) -> None:
        self.client = client or LLMClient(GEN_MODEL)

        # Try to load a custom generator prompt; fall back to a default
        project_root = Path(__file__).resolve().parents[1]
        prompt_path = project_root / "prompts" / "ragbench_generator_prompt.txt"
        if prompt_path.exists():
            self.prompt_template = prompt_path.read_text(encoding="utf-8")
        else:
            logger.warning(
                "Generator prompt file %s not found. Using default template.",
                prompt_path,
            )
            self.prompt_template = (
                "You are a helpful assistant in a retrieval-augmented "
                "question-answering system.\n\n"
                "You must answer ONLY using the information in the context "
                "below. If the context does not contain the answer, say you "
                "do not know.\n\n"
                "Context:\n{documents}\n\n"
                "Question: {question}\n\n"
                "Answer:"
            )

        # Hard character limits to stay well under the ~6000-token TPM limit.
        # 6000 chars ≈ 1500 tokens (roughly), which is safe for that limit.
        self.max_docs_chars = 6000
        self.max_prompt_chars = 9000
        self.max_answer_tokens = 512

    def build_prompt(
        self,
        question: str,
        documents_sentences: List[List[Tuple[str, str]]],
    ) -> str:
        """
        Build and truncate the full LLM prompt string.
        """
        docs_block = format_docs_with_keys(documents_sentences)
        docs_block = _truncate(docs_block, self.max_docs_chars)

        prompt = self.prompt_template.format(
            documents=docs_block,
            question=question,
        )
        prompt = _truncate(prompt, self.max_prompt_chars)
        return prompt

    def generate(
        self,
        question: str,
        documents_sentences: List[List[Tuple[str, str]]],
    ) -> str:
        """
        Generate an answer for a single question + retrieved documents.

        Returns a plain text answer string. Never raises LLM errors: instead,
        returns an explanatory fallback answer if the Groq call fails
        (e.g., 413 / TPM limit).
        """
        prompt = self.build_prompt(question, documents_sentences)

        messages = [
            {
                "role": "system",
                "content": (
                    "You are a retrieval-augmented QA assistant. "
                    "Use only the given context to answer."
                ),
            },
            {"role": "user", "content": prompt},
        ]

        try:
            return self.client.chat(
                messages,
                max_tokens=self.max_answer_tokens,
            )
        except Exception as e:
            # Log the full error for debugging, but return a safe text answer
            msg = str(e)
            logger.exception("Generator LLM call failed: %s", msg)

            # Handle Groq 413 / token limit errors more clearly
            if (
                "rate_limit_exceeded" in msg
                or "tokens per minute" in msg
                or "Request too large" in msg
            ):
                return (
                    "The generator could not answer this question because the "
                    "combined question and retrieved context are too large for "
                    "the current LLM token limit. Please reduce the domain "
                    "size or the number of retrieved documents and run again."
                )

            # Generic fallback for any other LLM failure
            return (
                "The generator failed due to an internal LLM error. "
                "Short description: " + msg[:400]
            )