Source code for eval_framework.metrics.completion.ifeval

from typing import Any

from eval_framework.external.ifeval_impl.utils import process_results
from eval_framework.metrics.base import BaseMetric, MetricResult
from eval_framework.shared.types import BaseMetricContext, Completion, extract_context_metric


class IFEvalMetricContext(BaseMetricContext):
    key: int
    instruction_id_list: list[str]
    prompt: str
    additional_kwargs: list[dict[str, Any]]


class IFEvalMetric(BaseMetric[Completion]):
    NAME = "IFEval"

    def calculate(self, response: Completion) -> list[MetricResult]:
        context = extract_context_metric(response, IFEvalMetricContext)
        if response.error is not None:
            return [
                MetricResult(
                    metric_name=f"{self.NAME}/prompt_level_strict_acc",
                    value=None,
                    higher_is_better=True,
                    error=response.error,
                ),
                MetricResult(
                    metric_name=f"{self.NAME}/prompt_level_loose_acc",
                    value=None,
                    higher_is_better=True,
                    error=response.error,
                ),
            ]

        grading = process_results(context, [response.completion])

        results = [
            MetricResult(
                metric_name=f"{self.NAME}/prompt_level_strict_acc",
                value=float(grading["prompt_level_strict_acc"]),
                higher_is_better=True,
                error=response.error,
            ),
            MetricResult(
                metric_name=f"{self.NAME}/prompt_level_loose_acc",
                value=float(grading["prompt_level_loose_acc"]),
                higher_is_better=True,
                error=response.error,
            ),
        ]

        # This framework does not support a custom aggregation step (see agg_inst_level_acc()),
        # so we work around it by returning the result for each instruction as a separate MetricResult.
        results += [
            MetricResult(
                metric_name=f"{self.NAME}/inst_level_strict_acc",
                value=float(v),
                higher_is_better=True,
                error=response.error,
            )
            for v in grading["inst_level_strict_acc"]
        ]
        results += [
            MetricResult(
                metric_name=f"{self.NAME}/inst_level_loose_acc",
                value=float(v),
                higher_is_better=True,
                error=response.error,
            )
            for v in grading["inst_level_loose_acc"]
        ]
        return results
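
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of eval_framework): because the framework has
# no custom aggregation step, the per-instruction results returned by
# calculate() above are expected to be averaged downstream, mirroring what
# agg_inst_level_acc() does in the reference IFEval implementation. The helper
# below shows one way that averaging could look; it assumes MetricResult
# exposes `metric_name` and `value` as attributes, matching the keyword
# arguments used in calculate().
def _mean_for_metric(results: list[MetricResult], metric_name: str) -> float | None:
    """Average all non-error values reported under `metric_name`."""
    values = [r.value for r in results if r.metric_name == metric_name and r.value is not None]
    return sum(values) / len(values) if values else None

# Hypothetical usage:
#     _mean_for_metric(metric.calculate(response), "IFEval/inst_level_strict_acc")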