# Source code for eval_framework.metrics.completion.math_minerva_completion

"""
Minerva-style MATH completion metric: exact_match and exact_match_flex.
"""

from eval_framework.metrics.aggregators.aggregators import PassAtK
from eval_framework.metrics.base import BaseMetric, MetricResult
from eval_framework.metrics.completion.minerva_math_utils import (
    extract_answers,
    is_equiv_hendrycks,
    is_equiv_minerva,
)
from eval_framework.shared.types import Completion



# [docs] (documentation-viewer link marker left by HTML extraction; not code)
class MathMinervaCompletion(BaseMetric[Completion]):
    """
    Minerva MATH metric reporting two scores: Exact Match and Exact Match (Flex).

    Answer candidates are extracted from the raw completion (falling back to the
    processed completion when no raw text is present):

    * ``Exact`` scores only the first extracted candidate, compared to the
      ground truth with ``is_equiv_minerva``.
    * ``Exact Flex`` scores every extracted candidate and accepts a match under
      either ``is_equiv_minerva`` or ``is_equiv_hendrycks``.

    English Minerva extraction is the default. Subclasses select other
    final-answer styles by overriding ``COT_STYLE`` / ``RELAXED``.
    """

    NAME = "Math Minerva Completion"
    KEYS = ["Exact", "Exact Flex"]
    AGGREGATORS = [PassAtK()]

    # Class-level defaults. The constructor falls back to these whenever the
    # matching argument is left as None, so variants only override attributes.
    COT_STYLE: str = "minerva"
    RELAXED: bool = False

    def __init__(
        self,
        use_cot: bool = True,
        cot_style: str | None = None,
        relaxed: bool | None = None,
    ) -> None:
        """
        Store the extraction settings later forwarded to ``extract_answers``.

        Args:
            use_cot: Forwarded unchanged as ``extract_answers(use_cot=...)``.
            cot_style: Forwarded as ``cot_style``; ``None`` selects the class
                default ``COT_STYLE``.
            relaxed: Forwarded as ``relaxed``; ``None`` selects the class
                default ``RELAXED``. An explicit ``False`` is respected.
        """
        self.use_cot = use_cot
        self.cot_style = self.COT_STYLE if cot_style is None else cot_style
        self.relaxed = self.RELAXED if relaxed is None else relaxed

    def calculate(self, response: Completion) -> list[MetricResult]:
        """
        Score a single completion against its ground truth.

        Args:
            response: The completion to score. Reads ``error``,
                ``ground_truth``, ``raw_completion`` and ``completion``.

        Returns:
            One ``MetricResult`` per entry of ``self.NAMES``, in the order
            exact match, then exact match (flex). Values are ``1.0`` / ``0.0``.
            When the response carries an error, or no usable ground truth is
            present, every result has ``value=None`` and ``error`` set.
        """
        # NOTE(review): NAMES is not defined in this class; presumably BaseMetric
        # derives it from NAME and KEYS -- confirm it yields exactly two names.
        if response.error:
            return [
                MetricResult(
                    metric_name=name,
                    value=None,
                    higher_is_better=True,
                    error=response.error,
                )
                for name in self.NAMES
            ]

        # A list-valued ground truth is reduced to its first entry; an empty
        # list (like None or an empty string) counts as "no ground truth".
        gold = response.ground_truth
        if isinstance(gold, list):
            gold = gold[0] if gold else None
        if not gold:
            # NOTE(review): a plain string is passed as ``error`` here, whereas the
            # branch above forwards ``response.error`` -- confirm MetricResult accepts both.
            return [
                MetricResult(
                    metric_name=name,
                    value=None,
                    higher_is_better=True,
                    error="No ground truth available",
                )
                for name in self.NAMES
            ]

        candidates = extract_answers(
            response.raw_completion or response.completion,
            use_cot=self.use_cot,
            cot_style=self.cot_style,
            relaxed=self.relaxed,
        )

        # Strict score: only the first extracted candidate counts, Minerva equivalence only.
        primary_matches = bool(candidates) and is_equiv_minerva(candidates[0], gold)
        exact_match = 1.0 if primary_matches else 0.0

        # Flexible score: any candidate may match, under either equivalence check.
        exact_match_flex = float(
            any(is_equiv_minerva(cand, gold) or is_equiv_hendrycks(cand, gold) for cand in candidates)
        )

        scores = [exact_match, exact_match_flex]
        return [
            MetricResult(metric_name=name, value=score, higher_is_better=True)
            for name, score in zip(self.NAMES, scores)
        ]





# [docs] (documentation-viewer link marker left by HTML extraction; not code)
class MathMinervaCompletionRelaxed(MathMinervaCompletion):
    """
    Variant of MathMinervaCompletion whose ``relaxed`` setting defaults to True
    (flexible final-answer matching).

    Only the ``RELAXED`` class default and the reported ``NAME`` differ from the parent.
    """

    # Used by the inherited constructor whenever ``relaxed`` is left as None.
    RELAXED = True
    NAME = "Math Minerva Completion Relaxed"




# [docs] (documentation-viewer link marker left by HTML extraction; not code)
class MathMinervaCompletionDE(MathMinervaCompletion):
    """
    Variant of MathMinervaCompletion using German final-answer extraction
    (``Finale Antwort: …``).

    Only the ``COT_STYLE`` class default and the reported ``NAME`` differ from the parent.
    """

    # Used by the inherited constructor whenever ``cot_style`` is left as None.
    COT_STYLE = "minerva_de"
    NAME = "Math Minerva Completion DE"




# [docs] (documentation-viewer link marker left by HTML extraction; not code)
class MathMinervaCompletionRelaxedDE(MathMinervaCompletionDE):
    """
    Variant of MathMinervaCompletionDE whose ``relaxed`` setting defaults to True.

    Only the ``RELAXED`` class default and the reported ``NAME`` differ from the parent.
    """

    # Used by the inherited constructor whenever ``relaxed`` is left as None.
    RELAXED = True
    NAME = "Math Minerva Completion Relaxed DE"