Source code for eval_framework.metrics.loglikelihood.dcs

from eval_framework.metrics.base import MetricResult
from eval_framework.metrics.loglikelihood.base import BaseLoglikelihoodMetric
from eval_framework.shared.types import Loglikelihood


class DistributionalCorrectnessScore(BaseLoglikelihoodMetric):
    """Based on Burns (2025) Measuring Language Model Hallucinations Through Distributional Correctness."""

    NAME = "Distributional Correctness Score"

    def __init__(
        self,
        *,
        lc: float = 1.0,  # Default reward weight for correct answers
        lw: float = 1.0,  # Default penalty weight for wrong answers
        len_normalised: bool = True,
    ) -> None:
        super().__init__(len_normalised=len_normalised)
        self._lc = float(lc)
        self._lw = float(lw)
        if not (self._lc >= 0 and self._lw >= 0 and self._lc >= self._lw):
            raise ValueError(
                f"Invalid DCS loadings: lc={self._lc}, lw={self._lw}. Require lc>=0, lw>=0, and lc>=lw."
            )
    def calculate(self, response: Loglikelihood) -> list[MetricResult]:
        if response.error is not None:
            return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=response.error)]

        loglikelihoods, probs = self._compute_probabilities(response.loglikelihoods)
        ground_truths = self._gather_ground_truths(response)
        idk_key = self._normalise_text(list(response.loglikelihoods.keys())[-1])  # assumes last key is "IDK" option

        # Probability mass assigned to the ground-truth (correct) options.
        p_c = sum(p for k, p in probs.items() if self._normalise_text(k) in ground_truths)
        # Probability mass assigned to the abstention ("IDK") option.
        p_idk = probs.get(idk_key, 0.0)
        # Probability mass assigned to all remaining (wrong) options.
        p_w = sum(
            p
            for k, p in probs.items()
            if (self._normalise_text(k) not in ground_truths and self._normalise_text(k) != idk_key)
        )

        # DCS rewards correct mass, penalises wrong mass, and discounts the score by the abstention mass.
        dcs = (self._lc * p_c - self._lw * p_w) * (1.0 - p_idk)
        return [MetricResult(metric_name=self.NAME, value=float(dcs), higher_is_better=True, error=response.error)]
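

# Standalone illustration (not part of the module above): a minimal sketch of the
# DCS formula (lc * p_c - lw * p_w) * (1 - p_idk) on a hypothetical multiple-choice
# item. The option names, probabilities, and the "IDK" label are assumptions made
# for this example only and are not drawn from the framework's own data types.
if __name__ == "__main__":
    probs = {"Paris": 0.6, "Lyon": 0.2, "Marseille": 0.1, "IDK": 0.1}  # hypothetical model probabilities
    ground_truths = {"Paris"}
    idk_key = "IDK"

    lc, lw = 1.0, 1.0  # default loadings, as in __init__ above

    p_c = sum(p for k, p in probs.items() if k in ground_truths)  # 0.6
    p_idk = probs.get(idk_key, 0.0)  # 0.1
    p_w = sum(p for k, p in probs.items() if k not in ground_truths and k != idk_key)  # 0.3

    dcs = (lc * p_c - lw * p_w) * (1.0 - p_idk)
    print(dcs)  # approximately (0.6 - 0.3) * 0.9 = 0.27, up to floating-point rounding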