# Source code for eval_framework.metrics.loglikelihood.bits_per_byte
import math
from eval_framework.metrics.base import BaseMetric, MetricResult
from eval_framework.shared.types import Error, Loglikelihood
class BitsPerByteLoglikelihood(BaseMetric[Loglikelihood]):
    """Bits-per-byte metric for loglikelihood responses.

    Follows the Paloma definition: the negative log-likelihood of the
    answer (in nats) divided by the number of UTF-8 bytes in the answer
    string, converted to bits via division by ln(2). Lower is better.
    """

    NAME = "BitsPerByte"

    def calculate(self, response: Loglikelihood) -> list[MetricResult]:
        """Compute bits-per-byte for ``response``.

        Args:
            response: Loglikelihood response carrying an optional upstream
                error, a list of ground-truth answer strings, and a mapping
                of completion string -> loglikelihood.

        Returns:
            A single-element list: either the bits-per-byte value, or
            ``value=None`` with an error when the response is unusable.
        """
        # Propagate an upstream error without attempting any computation.
        if response.error:
            return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=False, error=response.error)]

        # Find the first ground-truth string we have a loglikelihood for.
        log_p_x: float | None = None
        answer_text: str | None = None
        for gt in response.ground_truth_list:
            if gt is not None and gt in response.loglikelihoods:
                answer_text = gt
                log_p_x = float(response.loglikelihoods[gt])
                break

        if log_p_x is None or answer_text is None:
            return self._error_result(response, "No ground-truth answer found in loglikelihoods")

        num_bytes = len(answer_text.encode("utf-8"))
        # Guard against division by zero for empty-string answers.
        if num_bytes == 0:
            return self._error_result(response, "Ground-truth answer has zero UTF-8 bytes")

        # log_p_x is in nats; dividing by ln(2) converts nats to bits,
        # and dividing by the byte count normalizes per UTF-8 byte.
        bits_per_byte = -log_p_x / (num_bytes * math.log(2))
        return [
            MetricResult(
                metric_name=self.NAME,
                value=bits_per_byte,
                higher_is_better=False,
                error=response.error,
            )
        ]

    def _error_result(self, response: Loglikelihood, message: str) -> list[MetricResult]:
        """Build the single-element ValueError result used by ``calculate``."""
        return [
            MetricResult(
                metric_name=self.NAME,
                value=None,
                higher_is_better=False,
                # response.error is falsy at every call site (calculate returns
                # early on a truthy error), so this resolves to the new Error.
                error=response.error
                or Error(
                    error_class="ValueError",
                    message=message,
                    traceback="",
                ),
            )
        ]