Source code for eval_framework.metrics.completion.comet

import torch
from comet import download_model, load_from_checkpoint

from eval_framework.exceptions import LogicError
from eval_framework.metrics.base import BaseMetric, MetricResult
from eval_framework.shared.types import Completion, UntemplatedPrompt
from eval_framework.utils.constants import ROOT_DIR

SAVING_DIR = ROOT_DIR / "comet_model"


class COMET(BaseMetric[Completion]):
    """COMET is a neural, multilingual framework for evaluating machine translation quality.
    It leverages cross-lingual pretrained language models to achieve state-of-the-art
    correlation with human judgments.

    Note: this requires a Hugging Face token with access to the model:
    https://huggingface.co/Unbabel/XCOMET-XL

    Source: https://github.com/Unbabel/COMET
    Paper: https://arxiv.org/abs/2009.09025
    """

    NAME = "COMET"

    def __init__(self) -> None:
        # Download the XCOMET-XL checkpoint into SAVING_DIR (cached on later runs) and load it.
        checkpoint_path = download_model("Unbabel/XCOMET-XL", saving_directory=SAVING_DIR)
        self.model = load_from_checkpoint(checkpoint_path)
        assert torch.cuda.is_available(), "COMET requires a GPU"

    def calculate(self, response: Completion) -> list[MetricResult]:
        if response.error is not None:
            return [
                MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=response.error)
            ]

        if (
            response.context is None
            or not isinstance(response.context, UntemplatedPrompt)
            or response.context.untemplated_prompt == ""
        ):
            raise LogicError("When calculating COMET we need an untemplated prompt.")

        # Score the completion against every reference translation and keep the best score.
        scores = []
        for ground_truth in response.ground_truth_list:
            if ground_truth is None or ground_truth == "":
                raise LogicError("When calculating COMET we need a ground truth.")

            data = [
                {
                    "src": response.context.untemplated_prompt.strip(),
                    "mt": response.completion.strip(),
                    "ref": ground_truth.strip(),
                },
            ]
            with torch.no_grad():
                model_output = self.model.predict(data, gpus=1)
            scores.append(model_output.system_score)

        return [
            MetricResult(metric_name=self.NAME, value=float(max(scores)), higher_is_better=True, error=response.error)
        ]
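

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the module). The keyword
# arguments passed to Completion and UntemplatedPrompt below are assumptions
# inferred from the attributes `calculate` reads; check the definitions in
# eval_framework.shared.types before relying on them. Running this requires a
# GPU and a Hugging Face token with access to Unbabel/XCOMET-XL. The __main__
# guard keeps the sketch from executing on import.
if __name__ == "__main__":
    metric = COMET()  # downloads the checkpoint into SAVING_DIR on first use

    response = Completion(  # assumed constructor signature
        completion="Der Hund rennt im Park.",  # hypothesis: the model's translation
        context=UntemplatedPrompt(untemplated_prompt="The dog runs in the park."),  # source text
        ground_truth_list=["Der Hund läuft im Park."],  # reference translation(s)
        error=None,
    )

    [result] = metric.calculate(response)
    print(result.metric_name, result.value)  # COMET score; higher is better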