# Source code for eval_framework.metrics.loglikelihood.bits_per_byte

import math

from eval_framework.metrics.base import BaseMetric, MetricResult
from eval_framework.shared.types import Error, Loglikelihood


class BitsPerByteLoglikelihood(BaseMetric[Loglikelihood]):
    """Bits-per-byte metric for loglikelihood responses.

    Follows the Paloma definition: the negative log-likelihood of the answer
    (assumed to be in nats — confirm upstream convention) divided by the
    number of UTF-8 bytes in the answer string, converted from nats to bits
    by dividing by ln(2). Lower is better.
    """

    NAME = "BitsPerByte"

    def calculate(self, response: Loglikelihood) -> list[MetricResult]:
        """Compute bits-per-byte for ``response``.

        Args:
            response: Loglikelihood response carrying ``ground_truth_list``
                and a ``loglikelihoods`` mapping from completion text to
                log-probability.

        Returns:
            A single-element list. ``value`` is the bits-per-byte score, or
            ``None`` with an attached ``Error`` when the response itself
            errored, no ground-truth string is present in the
            loglikelihoods, or the answer encodes to zero UTF-8 bytes.
        """
        if response.error:
            return [
                MetricResult(
                    metric_name=self.NAME,
                    value=None,
                    higher_is_better=False,
                    error=response.error,
                )
            ]

        # Find the first ground-truth string we have a loglikelihood for.
        log_p_x: float | None = None
        answer_text: str | None = None
        for gt in response.ground_truth_list:
            if gt is not None and gt in response.loglikelihoods:
                answer_text = gt
                log_p_x = float(response.loglikelihoods[gt])
                break

        # NOTE: response.error is known to be falsy here (early return above),
        # so the original `response.error or Error(...)` fallback was a dead
        # branch — we always report the constructed Error.
        if log_p_x is None or answer_text is None:
            return [
                self._error_result("No ground-truth answer found in loglikelihoods")
            ]

        num_bytes = len(answer_text.encode("utf-8"))
        if num_bytes == 0:
            # Guard against division by zero for an empty answer string.
            return [self._error_result("Ground-truth answer has zero UTF-8 bytes")]

        # nats -> bits: divide by ln(2); normalize per byte of the answer.
        bits_per_byte = -log_p_x / (num_bytes * math.log(2))
        return [
            MetricResult(
                metric_name=self.NAME,
                value=bits_per_byte,
                higher_is_better=False,
                error=None,  # success path: no error to propagate
            )
        ]

    def _error_result(self, message: str) -> MetricResult:
        """Build a ``value=None`` result carrying a ValueError-style Error."""
        return MetricResult(
            metric_name=self.NAME,
            value=None,
            higher_is_better=False,
            error=Error(error_class="ValueError", message=message, traceback=""),
        )