Source code for eval_framework.metrics.base
import traceback
from abc import ABC, abstractmethod
from typing import Any
from pydantic import BaseModel, ConfigDict
from eval_framework.metrics.aggregators.aggregators import Aggregator
from eval_framework.shared.types import Error
from eval_framework.tasks.utils import raise_errors
[docs]
class MetricResult(BaseModel):
    """A single metric outcome, the element type returned by ``BaseMetric.calculate``.

    A result carries either a numeric ``value`` or ``value=None``; the latter is
    what ``BaseMetric._record_or_raise`` emits together with a populated ``error``.
    """

    # Pydantic config: unknown fields are rejected at validation time rather than ignored.
    model_config = ConfigDict(extra="forbid")
    # Name the result is reported under.
    metric_name: str
    # The score; None is allowed (e.g. when an error was recorded instead of a value).
    value: float | None
    # Direction of the metric: True when a larger value is the better outcome.
    higher_is_better: bool
    # Optional diagnostics, all defaulting to None.
    # NOTE(review): presumably the prompt sent to / response received from an LLM
    # judge, and the trace of executed code — inferred from the field names only;
    # confirm against the metrics that populate them.
    llm_judge_prompt: str | None = None
    llm_judge_response: str | None = None
    code_execution_trace: str | None = None
    # Details of a failure recorded in place of a value; None when no error was recorded.
    error: Error | None = None
[docs]
class classproperty:
    """Read-only descriptor that evaluates a method against the owning class.

    Accessing the attribute, whether on the class or on an instance, calls the
    wrapped method with the class as its only argument and returns the result.
    Nothing is cached: the method runs on every access.
    """

    def __init__(self, method: Any) -> None:
        """Remember the callable to invoke on attribute access."""
        self.method = method

    def __get__(self, instance: Any, cls: Any) -> Any:
        """Return ``method(cls)``; ``instance`` is deliberately ignored."""
        getter = self.method
        return getter(cls)
[docs]
class BaseMetric[Response](ABC):
NAME: str
KEYS: list[str] | None = None
# The aggregator determines how to aggregate the results of a metric for a single
# sample over multiple runs (LLM calls). We default to averaging and thus making
# macro averaging the overall computation default.
AGGREGATORS: list[Aggregator] = []
# Set by the evaluation generator before calculate(); controls how infra failures are handled.
fail_on_error: bool = True
@classproperty
def NAMES(cls) -> list[str]:
if cls.KEYS is None:
return [cls.NAME]
return [f"{cls.NAME}/{k}" for k in cls.KEYS]
[docs]
@abstractmethod
def calculate(self, response: Response) -> list[MetricResult]:
raise NotImplementedError
def _record_or_raise(self, exc: Exception) -> list[MetricResult]:
"""Infra failure (e.g. a Docker image-pull rate limit): abort when fail_on_error is set,
otherwise record a per-sample error so the run continues."""
if raise_errors() or self.fail_on_error:
raise exc
return [
MetricResult(
metric_name=self.NAME,
value=None,
higher_is_better=True,
error=Error(error_class=exc.__class__.__name__, message=str(exc), traceback=traceback.format_exc()),
)
]