# Source code for eval_framework.metrics.completion.drop_completion

"""DROP completion metrics: F1 and exact match."""

from eval_framework.external.drop_process_results import process_results
from eval_framework.metrics.base import BaseMetric, MetricResult
from eval_framework.shared.types import BaseMetricContext, Completion, extract_context_metric


class DropMetricContext(BaseMetricContext):
    """Metric context carrying the DROP gold answers.

    ``answer_tuples`` holds one inner list of answer strings per gold
    answer tuple for the sample being scored.
    """

    answer_tuples: list[list[str]]
class DropF1ExactMatch(BaseMetric[Completion]):
    """DROP F1 and exact match.

    Requires a ``DropMetricContext`` (with ``answer_tuples``) attached to the
    response so the gold answers can be recovered.
    """

    NAME = "DROP F1 / Exact Match"
    KEYS = ["f1", "exact_match"]

    def calculate(self, response: Completion) -> list[MetricResult]:
        """Score one completion against the DROP gold answers.

        Returns exactly two MetricResults — F1 then exact match. When the
        completion carries an error, both results have ``value=None`` and
        propagate that error.
        """
        if response.error is not None:
            # BUG FIX: previously the error path named the metrics
            # f"{self.NAME}/f1" / f"{self.NAME}/exact_match" while the
            # success path used "DROP F1" / "Exact Match", so errored and
            # successful samples were aggregated under different keys.
            # Use the success-path names consistently.
            return [
                MetricResult(metric_name="DROP F1", value=None, higher_is_better=True, error=response.error),
                MetricResult(
                    metric_name="Exact Match", value=None, higher_is_better=True, error=response.error
                ),
            ]

        context = extract_context_metric(response, DropMetricContext)
        # Gold answers: list of tuples, stored as list of lists.
        answer_tuples = [list(a) for a in context.answer_tuples]

        # Parse the completion as comma-separated answer spans; fall back to
        # the raw (possibly empty) string when splitting yields no spans, so
        # process_results always receives at least one prediction.
        raw = (response.completion or "").strip()
        pred_spans = [s.strip() for s in raw.split(",") if s.strip()]
        if not pred_spans:
            pred_spans = [raw]

        doc = {"answers": answer_tuples}
        results = [pred_spans]
        out = process_results(doc, results)
        return [
            MetricResult(metric_name="DROP F1", value=out["f1"], higher_is_better=True, error=response.error),
            MetricResult(
                metric_name="Exact Match", value=out["exact_match"], higher_is_better=True, error=response.error
            ),
        ]