Source code for eval_framework.metrics.completion.comet
import torch
from comet import download_model, load_from_checkpoint

from eval_framework.exceptions import LogicError
from eval_framework.metrics.base import BaseMetric, MetricResult
from eval_framework.shared.types import Completion, UntemplatedPrompt
from eval_framework.utils.constants import ROOT_DIR

SAVING_DIR = ROOT_DIR / "comet_model"
class COMET(BaseMetric[Completion]):
    """COMET is a neural, multilingual framework for evaluating machine translation quality. It leverages
    cross-lingual pretrained language models to achieve state-of-the-art correlation with human judgments.

    Note: this requires a Hugging Face token with access to the model: https://huggingface.co/Unbabel/XCOMET-XL
    Source: https://github.com/Unbabel/COMET
    Paper: https://arxiv.org/abs/2009.09025
    """

    NAME = "COMET"

    def __init__(self) -> None:
        checkpoint_path = download_model("Unbabel/XCOMET-XL", saving_directory=SAVING_DIR)
        self.model = load_from_checkpoint(checkpoint_path)
        assert torch.cuda.is_available(), "COMET requires a GPU"
    def calculate(self, response: Completion) -> list[MetricResult]:
        if response.error is not None:
            return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=response.error)]

        if (
            response.context is None
            or not isinstance(response.context, UntemplatedPrompt)
            or response.context.untemplated_prompt == ""
        ):
            raise LogicError("When calculating COMET we need an untemplated prompt.")

        scores = []
        for ground_truth in response.ground_truth_list:
            if ground_truth is None or ground_truth == "":
                raise LogicError("When calculating COMET we need a ground truth.")

            data = [
                {
                    "src": response.context.untemplated_prompt.strip(),
                    "mt": response.completion.strip(),
                    "ref": ground_truth.strip(),
                },
            ]

            with torch.no_grad():
                model_output = self.model.predict(data, gpus=1)
            scores.append(model_output.system_score)

        return [
            MetricResult(metric_name=self.NAME, value=float(max(scores)), higher_is_better=True, error=response.error)
        ]
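
For illustration, a minimal usage sketch follows (not part of the module). It assumes that Completion and UntemplatedPrompt can be constructed directly with the fields read by calculate above (completion, context, ground_truth_list, error, untemplated_prompt); the actual constructors in eval_framework.shared.types may require different or additional arguments.

# Hypothetical usage sketch: the constructor arguments below mirror the attributes accessed in calculate().
from eval_framework.metrics.completion.comet import COMET
from eval_framework.shared.types import Completion, UntemplatedPrompt

response = Completion(
    completion="Der Hund rennt durch den Park.",  # candidate translation to be scored
    context=UntemplatedPrompt(untemplated_prompt="The dog runs through the park."),  # source segment
    ground_truth_list=["Der Hund läuft durch den Park."],  # reference translation(s)
    error=None,
)

metric = COMET()  # downloads Unbabel/XCOMET-XL on first use; requires a GPU and a Hugging Face token with access
results = metric.calculate(response)
print(results[0].value)  # best system score across the references; higher is better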