Renangi's picture
Request too large ... TPM 6000, Requested 6841 Reduce max_tokens for the judge
08d13a6
from typing import Any, Dict, List, Tuple
import numpy as np
from sklearn.metrics import mean_squared_error, roc_auc_score
def _all_sentence_keys(
docs_sentences: List[List[Tuple[str, str]]]
) -> List[str]:
keys: List[str] = []
for doc in docs_sentences:
for key, _ in doc:
keys.append(key)
return keys
def trace_from_attributes(
    attrs: Dict[str, Any],
    docs_sentences: List[List[Tuple[str, str]]],
) -> Dict[str, float]:
    """Compute relevance, utilization, completeness and adherence scores.

    Args:
        attrs: Attribute mapping. The keys read here are
            ``"all_relevant_sentence_keys"``, ``"all_utilized_sentence_keys"``
            (iterables of sentence keys) and ``"overall_supported"`` (used for
            its truthiness). A missing key or a ``None`` value is treated as
            "no keys" / "not supported".
        docs_sentences: One list per document of 2-tuples whose first element
            is the sentence key.

    Returns:
        A dict with four floats:

        * ``"relevance"``: relevant keys found in the documents, divided by
          the total number of document sentences.
        * ``"utilization"``: utilized keys found in the documents, divided by
          the total number of document sentences.
        * ``"completeness"``: share of the relevant keys that are also
          utilized; ``0.0`` when no key is relevant.
        * ``"adherence"``: ``1.0`` if ``attrs["overall_supported"]`` is
          truthy, else ``0.0``.

        All four are ``0.0`` when there are no document sentences.
    """
    all_keys = _all_sentence_keys(docs_sentences)
    total = len(all_keys)
    if total == 0:
        # Nothing to score against; also protects the divisions below.
        return {
            "relevance": 0.0,
            "utilization": 0.0,
            "completeness": 0.0,
            "adherence": 0.0,
        }

    # Build the lookup set once. Keys named in attrs that do not occur in the
    # documents are dropped by the intersection.
    known_keys = set(all_keys)

    # `or ()` covers both a missing key and an explicit None value; the
    # previous `.get(key, [])` crashed with TypeError on None.
    relevant = known_keys.intersection(
        attrs.get("all_relevant_sentence_keys") or ()
    )
    utilized = known_keys.intersection(
        attrs.get("all_utilized_sentence_keys") or ()
    )

    # NOTE(review): `total` counts every (key, sentence) pair, while the hit
    # sets are de-duplicated. The two only disagree if keys repeat across
    # documents -- confirm keys are unique.
    relevance = len(relevant) / total
    utilization = len(utilized) / total
    completeness = len(relevant & utilized) / len(relevant) if relevant else 0.0
    adherence = 1.0 if attrs.get("overall_supported", False) else 0.0

    return {
        "relevance": float(relevance),
        "utilization": float(utilization),
        "completeness": float(completeness),
        "adherence": float(adherence),
    }
def _rmse(y_true: List[float], y_pred: List[float]) -> float:
    """Return the root-mean-squared error between two flat sequences.

    Raises:
        ValueError: If the inputs differ in shape, are empty, or contain
            NaN/inf values.
    """
    truth = np.asarray(y_true, dtype=float)
    pred = np.asarray(y_pred, dtype=float)
    if truth.shape != pred.shape:
        raise ValueError(
            "y_true and y_pred must have the same shape, got "
            f"{truth.shape} and {pred.shape}"
        )
    if truth.size == 0:
        raise ValueError("cannot compute RMSE of empty inputs")
    if not (np.isfinite(truth).all() and np.isfinite(pred).all()):
        raise ValueError("RMSE inputs must not contain NaN or infinity")
    return float(np.sqrt(np.mean((truth - pred) ** 2)))


def compute_rmse_auc(
    y_true_rel: List[float],
    y_pred_rel: List[float],
    y_true_util: List[float],
    y_pred_util: List[float],
    y_true_comp: List[float],
    y_pred_comp: List[float],
    y_true_adh: List[int],
    y_pred_adh: List[float],
) -> Dict[str, float]:
    """Compute RMSE for the three regression scores and AUROC for adherence.

    RMSE is computed with NumPy rather than
    ``mean_squared_error(..., squared=False)``: the ``squared`` keyword was
    deprecated in scikit-learn 1.4 and removed in 1.6, so the old call raises
    ``TypeError`` on current releases.

    Args:
        y_true_rel, y_pred_rel: Reference and predicted relevance values.
        y_true_util, y_pred_util: Reference and predicted utilization values.
        y_true_comp, y_pred_comp: Reference and predicted completeness values.
        y_true_adh: Reference adherence labels.
        y_pred_adh: Predicted adherence scores, passed to ``roc_auc_score``.

    Returns:
        A dict with ``"rmse_relevance"``, ``"rmse_utilization"``,
        ``"rmse_completeness"`` and ``"auroc_adherence"``. The last is ``0.5``
        when ``y_true_adh`` holds fewer than two distinct labels.

    Raises:
        ValueError: If any true/pred pair for an RMSE metric differs in
            length, is empty, or contains non-finite values.
    """
    metrics = {
        "rmse_relevance": _rmse(y_true_rel, y_pred_rel),
        "rmse_utilization": _rmse(y_true_util, y_pred_util),
        "rmse_completeness": _rmse(y_true_comp, y_pred_comp),
    }

    if len(set(y_true_adh)) > 1:
        metrics["auroc_adherence"] = float(
            roc_auc_score(y_true_adh, y_pred_adh)
        )
    else:
        # The AUROC branch needs at least two distinct labels. Fall back to
        # 0.5; NaN is deliberately not used as the placeholder here.
        metrics["auroc_adherence"] = 0.5
    return metrics