Source code for eval_framework.metrics.completion.bleu
import sacrebleu
from eval_framework.exceptions import LogicError
from eval_framework.metrics.base import BaseMetric, MetricResult
from eval_framework.shared.types import Completion


class BLEU(BaseMetric[Completion]):
    """The Bilingual Evaluation Understudy score, or BLEU for short, is a metric
    for evaluating a generated sentence against a reference sentence. It counts
    how many n-grams in the candidate translation match n-grams in the reference
    text, where a 1-gram (unigram) is a single token and a bigram comparison
    covers each word pair. The comparison is made regardless of word order.

    Source: https://machinelearningmastery.com/calculate-bleu-score-for-text-python/
    Paper: https://www.aclweb.org/anthology/P02-1040/
    """

    NAME = "BLEU"

    def calculate(self, response: Completion) -> list[MetricResult]:
        if response.error is not None:
            return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=response.error)]
        scores = []
        for ground_truth in response.ground_truth_list:
            if ground_truth == "" or ground_truth is None:
                raise LogicError("When calculating BLEU we need a ground truth.")
            sacre_formatted_completion = [response.completion]
            sacre_formatted_ground_truth = [[ground_truth]]
            scores.append(sacrebleu.corpus_bleu(sacre_formatted_completion, sacre_formatted_ground_truth).score)
        return [
            MetricResult(metric_name=self.NAME, value=float(max(scores)), higher_is_better=True, error=response.error)
        ]
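

# A minimal, illustrative sketch (not part of the framework's API): the helper
# name and example strings below are invented to show how BLEU.calculate uses
# sacrebleu, scoring the completion against each ground truth and keeping the
# maximum score (sacrebleu reports BLEU on a 0-100 scale).
def _bleu_usage_example() -> float:
    candidate = "the cat sat on the mat"  # stand-in for response.completion
    references = ["the cat is on the mat", "a cat sat on the mat"]  # stand-in for response.ground_truth_list
    return max(sacrebleu.corpus_bleu([candidate], [[reference]]).score for reference in references)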


class LINEWISE_BLEU(BaseMetric[Completion]):
    """Maximum line-level BLEU score: each non-empty line of the completion is
    scored against each ground truth and the best line-level score is reported."""

    NAME = "Linewise BLEU"

    def calculate(self, response: Completion) -> list[MetricResult]:
        if response.error is not None:
            return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=response.error)]
        scores = []
        for ground_truth in response.ground_truth_list:
            for sentence in response.completion.split("\n"):
                if sentence == "":
                    continue
                if ground_truth == "" or ground_truth is None:
                    raise LogicError("When calculating BLEU we need a ground truth.")
                sacre_formatted_completion = [sentence]
                sacre_formatted_ground_truth = [[ground_truth]]
                scores.append(sacrebleu.corpus_bleu(sacre_formatted_completion, sacre_formatted_ground_truth).score)
        return [
            MetricResult(
                metric_name=self.NAME, value=float(max(scores, default=0)), higher_is_better=True, error=response.error
            )
        ]
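

# A minimal, illustrative sketch (not part of the framework's API): the helper
# name and example strings are invented to show what LINEWISE_BLEU computes,
# scoring each non-empty line of the completion separately against the ground
# truth and keeping the best line-level score (0 if there is nothing to score).
def _linewise_bleu_usage_example() -> float:
    completion = "first generated line\nsecond generated line"
    ground_truth = "second generated line"
    scores = [
        sacrebleu.corpus_bleu([line], [[ground_truth]]).score
        for line in completion.split("\n")
        if line != ""
    ]
    return max(scores, default=0.0)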


class ResponseToOriginalBLEU(BaseMetric[Completion]):
    """BLEU score between the completion and the last user instruction, scaled to [0, 1]."""

    NAME = "Response to Original BLEU"

    def calculate(self, response: Completion) -> list[MetricResult]:
        if response.error is not None:
            return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=response.error)]
        score = sacrebleu.corpus_bleu([response.completion], [[response.last_user_instruction]]).score
        # scaled to [0, 1] to make aggregation easier
        return [MetricResult(metric_name=self.NAME, value=score / 100, higher_is_better=True, error=response.error)]
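

# A minimal, illustrative sketch (not part of the framework's API): the helper
# name and example strings are invented to show what ResponseToOriginalBLEU
# computes, i.e. the BLEU overlap between the completion and the last user
# instruction, rescaled from sacrebleu's 0-100 range to [0, 1].
def _response_to_original_bleu_usage_example() -> float:
    completion = "Paris is the capital of France."
    last_user_instruction = "Tell me the capital of France."
    return sacrebleu.corpus_bleu([completion], [[last_user_instruction]]).score / 100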