Source code for eval_framework.metrics.base

import sys
import traceback
from abc import ABC, abstractmethod
from typing import Any

from pydantic import BaseModel, ConfigDict

from eval_framework.metrics.aggregators.aggregators import Aggregator
from eval_framework.shared.types import Error
from eval_framework.tasks.utils import raise_errors


# Result record produced by a metric: one named value plus optional
# diagnostic fields. Unknown fields are rejected (extra="forbid").
class MetricResult(BaseModel):
    model_config = ConfigDict(extra="forbid")

    # --- required fields -------------------------------------------------
    # Name of the metric this result belongs to.
    metric_name: str
    # Numeric score; the type permits None (used together with `error`
    # when a failure is recorded instead of a score).
    value: float | None
    # True when a larger `value` is the better outcome.
    higher_is_better: bool

    # --- optional diagnostics (all default to None) ----------------------
    llm_judge_prompt: str | None = None
    llm_judge_response: str | None = None
    code_execution_trace: str | None = None
    # Structured error details, when a failure was recorded for this result.
    error: Error | None = None
class classproperty:
    """Read-only descriptor that exposes a method as a computed class attribute.

    The wrapped callable is invoked with the owning class (never an instance),
    so the value is available both as ``Cls.attr`` and as ``Cls().attr``.
    """

    def __init__(self, method: Any) -> None:
        """Store the callable that computes the attribute value from a class."""
        self.method = method

    def __get__(self, instance: Any, cls: Any = None) -> Any:
        """Return ``method(owner)``.

        Args:
            instance: Object the attribute was accessed through, or ``None``
                for access on the class itself.
            cls: Owning class. Optional, as in the descriptor protocol; when
                omitted it is derived from ``instance``.
        """
        # Normal attribute access always supplies the owner, but a direct
        # ``descriptor.__get__(obj)`` call may omit it.
        if cls is None and instance is not None:
            cls = type(instance)
        return self.method(cls)
[docs] class BaseMetric[Response](ABC): NAME: str KEYS: list[str] | None = None # The aggregator determines how to aggregate the results of a metric for a single # sample over multiple runs (LLM calls). We default to averaging and thus making # macro averaging the overall computation default. AGGREGATORS: list[Aggregator] = [] # Set by the evaluation generator before calculate(); controls how infra failures are handled. fail_on_error: bool = True @classproperty def NAMES(cls) -> list[str]: if cls.KEYS is None: return [cls.NAME] return [f"{cls.NAME}/{k}" for k in cls.KEYS]
[docs] @abstractmethod def calculate(self, response: Response) -> list[MetricResult]: raise NotImplementedError
def _record_or_raise(self, exc: Exception) -> list[MetricResult]: """Infra failure (e.g. a Docker image-pull rate limit): abort when fail_on_error is set, otherwise record a per-sample error so the run continues.""" if raise_errors() or self.fail_on_error: raise exc return [ MetricResult( metric_name=self.NAME, value=None, higher_is_better=True, error=Error(error_class=exc.__class__.__name__, message=str(exc), traceback=traceback.format_exc()), ) ]