Source code for eval_framework.metrics.loglikelihood.probability_mass

import numpy as np

from eval_framework.metrics.base import BaseMetric, MetricResult
from eval_framework.shared.types import Loglikelihood


[docs] class ProbabilityMass(BaseMetric[Loglikelihood]): NAME = "Probability Mass"
[docs] def calculate(self, response: Loglikelihood) -> list[MetricResult]: if response.error is not None: return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=response.error)] assert isinstance(response.ground_truth, list) # https://docs.python.org/3.10/library/stdtypes.html?highlight=dictview#dictionary-view-objects in_ground_truths = [completion in response.ground_truth for completion in response.loglikelihoods] log_probs = list(response.loglikelihoods.values()) probs = np.exp(log_probs) / np.sum(np.exp(log_probs)) prob_mass = np.sum(probs[in_ground_truths]) return [ MetricResult(metric_name=self.NAME, value=float(prob_mass), higher_is_better=True, error=response.error) ]
[docs] class ProbabilityMassNorm(BaseMetric[Loglikelihood]): NAME = "Probability Mass Normalized"
[docs] def calculate(self, response: Loglikelihood) -> list[MetricResult]: if response.error is not None: return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=response.error)] assert isinstance(response.ground_truth, list) # len normalized output_len_normalized = {} for k, v in response.loglikelihoods.items(): completion_length = len(k) if completion_length != 0: output_len_normalized[k] = v / completion_length else: output_len_normalized[k] = v log_probs = list(output_len_normalized.values()) in_ground_truths = [completion in response.ground_truth for completion in response.loglikelihoods] log_probs = list(output_len_normalized.values()) probs = np.exp(log_probs) / np.sum(np.exp(log_probs)) prob_mass_norm = np.sum(probs[in_ground_truths]) return [MetricResult(metric_name=self.NAME, value=prob_mass_norm, higher_is_better=True, error=response.error)]