Source code for eval_framework.metrics.completion.f1

from collections import Counter
from typing import Any

from eval_framework.metrics.base import BaseMetric, MetricResult
from eval_framework.shared.types import Completion


class F1(BaseMetric[Completion]):
    NAME = "F1"

    def calculate(self, response: Completion) -> list[MetricResult]:
        if response.error is not None:
            return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=response.error)]

        ground_truths = [gt for gt in response.ground_truth_list if gt is not None]
        if not ground_truths:
            return [MetricResult(metric_name=self.NAME, value=0.0, higher_is_better=True, error=response.error)]

        hyp_tokens = response.completion.lower().split()
        f1_scores = [calculate_f1(gt.lower().split(), hyp_tokens) for gt in ground_truths]
        max_f1 = max(f1_scores)
        return [MetricResult(metric_name=self.NAME, value=max_f1, higher_is_better=True, error=response.error)]


def calculate_f1(ref_tokens: list[Any], hyp_tokens: list[Any]) -> float:
    """Calculate F1 score between two texts based on token overlap."""
    if not ref_tokens and not hyp_tokens:
        return 1.0
    if not ref_tokens or not hyp_tokens:
        return 0.0
    common = Counter(ref_tokens) & Counter(hyp_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(hyp_tokens)
    recall = num_same / len(ref_tokens)
    return 2 * precision * recall / (precision + recall)
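
For illustration, a minimal sketch of how calculate_f1 behaves on whitespace-tokenized, lowercased strings; the example sentences and the resulting score are illustrative, not taken from the framework's tests.

    # Hypothetical inputs: tokenize the same way F1.calculate does (lowercase + split on whitespace).
    ref = "the cat sat on the mat".lower().split()
    hyp = "the cat is on the mat".lower().split()

    # 5 tokens overlap ("the" twice, "cat", "on", "mat"), so precision = recall = 5/6.
    score = calculate_f1(ref, hyp)
    print(round(score, 3))  # 0.833

F1.calculate applies this per reference: each non-None ground truth is scored against the completion and the maximum F1 across references is reported.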