Spaces:

Renangi
/

ragbench-rag-eval

Running

App Files Files Community

ragbench-rag-eval / ragbench_eval /metrics.py

Renangi

Request too large ... TPM 6000, Requested 6841 Reduce max_tokens for the judge

08d13a6 25 days ago

raw

history blame contribute delete

2.3 kB

	from typing import Any, Dict, List, Tuple
	import numpy as np
	from sklearn.metrics import mean_squared_error, roc_auc_score


	def _all_sentence_keys(
	    docs_sentences: List[List[Tuple[str, str]]]
	) -> List[str]:
	    """Flatten per-document sentence lists into one flat list of keys.

	    Each document is a list of ``(key, value)`` pairs; only the first
	    element of every pair is kept. Keys come back in document order, then
	    in sentence order within a document, and duplicates are preserved.
	    """
	    return [
	        sentence_key
	        for document in docs_sentences
	        for sentence_key, _ in document
	    ]


	def trace_from_attributes(
	    attrs: Dict[str, Any],
	    docs_sentences: List[List[Tuple[str, str]]],
	) -> Dict[str, float]:
	    """Derive relevance, utilization, completeness and adherence scores.

	    Args:
	        attrs: Attribute mapping. Reads ``all_relevant_sentence_keys`` and
	            ``all_utilized_sentence_keys`` (iterables of sentence keys; a
	            missing or ``None`` entry counts as empty) and
	            ``overall_supported`` (interpreted by truthiness).
	        docs_sentences: Per-document lists of ``(key, value)`` pairs whose
	            keys define which sentence keys are valid.

	    Returns:
	        A dict with float values under ``relevance``, ``utilization``,
	        ``completeness`` and ``adherence``. All four are ``0.0`` when
	        ``docs_sentences`` holds no sentences at all.
	    """
	    all_keys = _all_sentence_keys(docs_sentences)
	    total = len(all_keys)
	    if total == 0:
	        return {
	            "relevance": 0.0,
	            "utilization": 0.0,
	            "completeness": 0.0,
	            "adherence": 0.0,
	        }

	    # Build the lookup set once; keys that do not occur in the documents are
	    # dropped from both selections by the intersections below.
	    known_keys = set(all_keys)
	    # ``or ()`` makes an explicit ``None`` value behave like a missing key
	    # instead of raising TypeError inside ``set(...)``.
	    relevant = set(attrs.get("all_relevant_sentence_keys") or ()) & known_keys
	    utilized = set(attrs.get("all_utilized_sentence_keys") or ()) & known_keys

	    # ``total`` is non-zero past the guard above, so these divisions are safe.
	    relevance = len(relevant) / total
	    utilization = len(utilized) / total
	    # Share of relevant sentences that were also utilized; 0.0 when nothing
	    # is relevant, which also avoids dividing by zero.
	    completeness = len(relevant & utilized) / len(relevant) if relevant else 0.0
	    adherence = 1.0 if attrs.get("overall_supported", False) else 0.0

	    return {
	        "relevance": float(relevance),
	        "utilization": float(utilization),
	        "completeness": float(completeness),
	        "adherence": float(adherence),
	    }


	def _rmse(y_true: List[float], y_pred: List[float]) -> float:
	    """Return the root-mean-squared error of two equal-length sequences.

	    Raises:
	        ValueError: If the inputs differ in shape, are empty, or contain
	            NaN or infinite values.
	    """
	    truth = np.asarray(y_true, dtype=float)
	    pred = np.asarray(y_pred, dtype=float)
	    # Check shapes explicitly: NumPy would otherwise silently broadcast a
	    # length-1 input against a longer one.
	    if truth.shape != pred.shape:
	        raise ValueError(
	            f"y_true and y_pred differ in shape: {truth.shape} vs {pred.shape}"
	        )
	    if truth.size == 0:
	        raise ValueError("RMSE is undefined for empty input")
	    if not (np.isfinite(truth).all() and np.isfinite(pred).all()):
	        raise ValueError("RMSE inputs must not contain NaN or infinity")
	    return float(np.sqrt(np.mean((truth - pred) ** 2)))


	def compute_rmse_auc(
	    y_true_rel: List[float],
	    y_pred_rel: List[float],
	    y_true_util: List[float],
	    y_pred_util: List[float],
	    y_true_comp: List[float],
	    y_pred_comp: List[float],
	    y_true_adh: List[int],
	    y_pred_adh: List[float],
	) -> Dict[str, float]:
	    """Compute RMSE for the three continuous scores and AUROC for adherence.

	    RMSE is computed with NumPy rather than
	    ``mean_squared_error(..., squared=False)``: the ``squared`` keyword was
	    deprecated in scikit-learn 1.4 and removed in 1.6, so the old call fails
	    on current releases.

	    Args:
	        y_true_rel, y_pred_rel: Reference and predicted relevance values.
	        y_true_util, y_pred_util: Reference and predicted utilization values.
	        y_true_comp, y_pred_comp: Reference and predicted completeness values.
	        y_true_adh: Reference adherence labels.
	        y_pred_adh: Predicted adherence scores.

	    Returns:
	        A dict with ``rmse_relevance``, ``rmse_utilization``,
	        ``rmse_completeness`` and ``auroc_adherence`` as floats.

	    Raises:
	        ValueError: If any true/predicted RMSE pair differs in length, is
	            empty, or contains non-finite values.
	    """
	    metrics = {
	        "rmse_relevance": _rmse(y_true_rel, y_pred_rel),
	        "rmse_utilization": _rmse(y_true_util, y_pred_util),
	        "rmse_completeness": _rmse(y_true_comp, y_pred_comp),
	    }

	    if len(set(y_true_adh)) > 1:
	        metrics["auroc_adherence"] = float(
	            roc_auc_score(y_true_adh, y_pred_adh)
	        )
	    else:
	        # With fewer than two distinct labels the AUROC is not computed;
	        # report 0.5 so the value stays a finite float rather than NaN.
	        metrics["auroc_adherence"] = 0.5

	    return metrics