Source code for eval_framework.metrics.completion.drop_completion
"""DROP completion metrics: F1 and exact match."""
from eval_framework.external.drop_process_results import process_results
from eval_framework.metrics.base import BaseMetric, MetricResult
from eval_framework.shared.types import BaseMetricContext, Completion, extract_context_metric
[docs]
class DropMetricContext(BaseMetricContext):
    """Metric context carrying the gold answers for DROP completion scoring.

    ``answer_tuples`` holds one entry per gold answer; each entry is itself a
    list of strings.
    """

    answer_tuples: list[list[str]]
[docs]
class DropF1ExactMatch(BaseMetric[Completion]):
    """DROP F1 and exact match. Requires DropMetricContext with answer_tuples.

    Every result is named ``f"{NAME}/{key}"`` for each key in ``KEYS``, on both
    the error path and the success path, so that failed and successful samples
    are reported under the same metric names.
    """

    NAME = "DROP F1 / Exact Match"
    KEYS = ["f1", "exact_match"]

    def calculate(self, response: Completion) -> list[MetricResult]:
        """Score one completion against the gold answers in its context.

        Args:
            response: The completion to score. Its context must be extractable
                as a ``DropMetricContext``.

        Returns:
            One ``MetricResult`` per entry in ``KEYS``, in that order. When
            ``response.error`` is set, each result has ``value=None`` and
            carries that error; otherwise the values are read from the dict
            returned by ``process_results`` under the same keys.
        """
        if response.error is not None:
            return [
                MetricResult(
                    metric_name=f"{self.NAME}/{key}",
                    value=None,
                    higher_is_better=True,
                    error=response.error,
                )
                for key in self.KEYS
            ]

        context = extract_context_metric(response, DropMetricContext)
        # Gold answers are stored as a list of lists; copy each into a plain list.
        answer_tuples = [list(answer) for answer in context.answer_tuples]

        # Parse the completion as comma-separated spans, dropping empty pieces.
        # If nothing survives (empty or comma-only text), fall back to the
        # stripped text itself as the single predicted span.
        # NOTE(review): a number written with a thousands separator ("1,000")
        # is split into two spans here - confirm this is intended.
        raw = (response.completion or "").strip()
        stripped_parts = (part.strip() for part in raw.split(","))
        pred_spans = [part for part in stripped_parts if part] or [raw]

        doc = {"answers": answer_tuples}
        out = process_results(doc, [pred_spans])

        # Use the same NAME/key naming as the error path above.
        return [
            MetricResult(
                metric_name=f"{self.NAME}/{key}",
                value=out[key],
                higher_is_better=True,
                error=response.error,
            )
            for key in self.KEYS
        ]