Source code for eval_framework.metrics.completion.aidanbench

import logging

from eval_framework.metrics.base import BaseMetric, MetricResult
from eval_framework.shared.types import Completion

logger = logging.getLogger(__name__)


class AidanBenchMetric(BaseMetric[Completion]):
    NAME = "AidanBench"

    def calculate(self, response: Completion) -> list[MetricResult]:
        # Subtract 2 to not count 1) the initial instruction and 2) the latest model
        # response, which caused the stop, i.e. was not (unique && coherent).
        num_unique_responses = len(response.messages) - 2 if response.messages is not None else 0
        if num_unique_responses < 0:
            logger.warning(
                "Number of unique responses calculated as negative, setting to 0. "
                "Likely something went wrong during answer generation."
            )
            num_unique_responses = 0
        return [
            MetricResult(
                metric_name=f"{self.NAME}/num_responses",
                value=num_unique_responses,
                higher_is_better=True,
            )
        ]
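For illustration, a minimal usage sketch follows. It assumes a Completion whose messages attribute holds the full conversation as a list; the Completion(messages=...) construction is hypothetical and the actual fields of eval_framework.shared.types.Completion may differ.

# Hypothetical usage sketch: the Completion(messages=...) call below is an
# assumption about the dataclass in eval_framework.shared.types, not its real API.
metric = AidanBenchMetric()
completion = Completion(messages=["instruction", "answer 1", "answer 2", "stopping answer"])
results = metric.calculate(completion)
# Four messages minus the initial instruction and the stopping answer -> 2.
assert results[0].metric_name == "AidanBench/num_responses"
assert results[0].value == 2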