from typing import Any, Dict, List, Tuple

import numpy as np
from sklearn.metrics import mean_squared_error, roc_auc_score


def _all_sentence_keys(
    docs_sentences: List[List[Tuple[str, str]]]
) -> List[str]:
    """Collect every sentence key across all retrieved documents."""
    keys: List[str] = []
    for doc in docs_sentences:
        for key, _ in doc:
            keys.append(key)
    return keys


def trace_from_attributes(
    attrs: Dict[str, Any],
    docs_sentences: List[List[Tuple[str, str]]],
) -> Dict[str, float]:
    """Derive TRACe-style metrics (relevance, utilization, completeness,
    adherence) from sentence-level annotations in ``attrs``."""
    all_keys = _all_sentence_keys(docs_sentences)
    total = len(all_keys)
    if total == 0:
        return {
            "relevance": 0.0,
            "utilization": 0.0,
            "completeness": 0.0,
            "adherence": 0.0,
        }

    # Intersect with the known keys so stray annotations cannot inflate scores.
    relevant = set(attrs.get("all_relevant_sentence_keys", [])) & set(all_keys)
    utilized = set(attrs.get("all_utilized_sentence_keys", [])) & set(all_keys)

    # total > 0 is guaranteed by the early return above, so no guard is needed.
    relevance = len(relevant) / total
    utilization = len(utilized) / total
    # Completeness: fraction of relevant sentences that were actually used.
    completeness = (
        len(relevant & utilized) / len(relevant) if relevant else 0.0
    )
    adherence = 1.0 if attrs.get("overall_supported", False) else 0.0

    return {
        "relevance": float(relevance),
        "utilization": float(utilization),
        "completeness": float(completeness),
        "adherence": float(adherence),
    }


def compute_rmse_auc(
    y_true_rel: List[float],
    y_pred_rel: List[float],
    y_true_util: List[float],
    y_pred_util: List[float],
    y_true_comp: List[float],
    y_pred_comp: List[float],
    y_true_adh: List[int],
    y_pred_adh: List[float],
) -> Dict[str, float]:
    """Compute RMSE for the continuous metrics and AUROC for adherence."""
    # np.sqrt(MSE) avoids mean_squared_error's `squared=False` keyword, which
    # was deprecated in scikit-learn 1.4 and removed in 1.6.
    metrics = {
        "rmse_relevance": float(
            np.sqrt(mean_squared_error(y_true_rel, y_pred_rel))
        ),
        "rmse_utilization": float(
            np.sqrt(mean_squared_error(y_true_util, y_pred_util))
        ),
        "rmse_completeness": float(
            np.sqrt(mean_squared_error(y_true_comp, y_pred_comp))
        ),
    }
    if len(set(y_true_adh)) > 1:
        metrics["auroc_adherence"] = float(
            roc_auc_score(y_true_adh, y_pred_adh)
        )
    else:
        # AUROC is undefined when only one class is present; report chance
        # level (0.5) rather than NaN so downstream aggregation stays numeric.
        metrics["auroc_adherence"] = 0.5
    return metrics
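

# Usage sketch. The documents, annotation dict, and prediction/label arrays
# below are illustrative assumptions, not part of this module's data; they
# only show the shapes that trace_from_attributes and compute_rmse_auc expect.
if __name__ == "__main__":
    # Each document is a list of (sentence_key, sentence_text) pairs.
    docs = [
        [("0a", "Paris is the capital of France."),
         ("0b", "France is in Europe.")],
        [("1a", "The Eiffel Tower is in Paris.")],
    ]
    attrs = {
        "all_relevant_sentence_keys": ["0a", "1a"],
        "all_utilized_sentence_keys": ["0a"],
        "overall_supported": True,
    }
    # relevance = 2/3, utilization = 1/3, completeness = 1/2, adherence = 1.0
    print(trace_from_attributes(attrs, docs))

    # Aggregate predictions against labels over a tiny hypothetical eval set.
    print(compute_rmse_auc(
        y_true_rel=[0.6, 0.4], y_pred_rel=[0.5, 0.5],
        y_true_util=[0.3, 0.2], y_pred_util=[0.25, 0.3],
        y_true_comp=[0.5, 1.0], y_pred_comp=[0.4, 0.9],
        y_true_adh=[1, 0], y_pred_adh=[0.9, 0.2],
    ))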