Source code for eval_framework.metrics.llm.llm_judge_completion_accuracy
from eval_framework.llm.base import BaseLLM
from eval_framework.metrics.base import MetricResult
from eval_framework.metrics.llm.base import BaseLLMJudgeMetric
from eval_framework.metrics.llm.graders.language import Language
from eval_framework.metrics.llm.graders.long_context_grader import LongContextGrader
from eval_framework.shared.types import Completion


class LLMJudgeCompletionAccuracy(BaseLLMJudgeMetric):
    """LLM-as-a-judge metric that scores a completion against its ground truth."""

    NAME = "Judge Completion Accuracy"

    def __init__(self, llm_judge: BaseLLM):
        super().__init__(llm_judge)
        # Delegates prompt construction and verdict parsing to the long-context grader.
        self._grader = LongContextGrader(llm_judge)

    def calculate(self, response: Completion) -> list[MetricResult]:
        # Propagate upstream errors without invoking the judge.
        if response.error is not None:
            return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=response.error)]

        assert isinstance(response.ground_truth, str)

        language = Language(response.get_instruction_language())
        grading = self._grader.grade(
            expected_output=response.ground_truth,
            completion=response.sanitized_completion,
            language=language,
        )

        # Map the judge's boolean verdict to 1.0/0.0; None if no verdict was produced.
        return [
            MetricResult(
                metric_name=self.NAME,
                value=float(grading.answer_is_correct) if grading.answer_is_correct is not None else None,
                higher_is_better=True,
                llm_judge_prompt=grading.judge_prompt,
                llm_judge_response=grading.judge_response,
                error=response.error,
            )
        ]
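

# Minimal usage sketch (not part of the module): it assumes some concrete BaseLLM
# implementation is available to act as the judge, and that a Completion object is
# produced elsewhere in the evaluation pipeline with the fields this metric reads
# (`ground_truth`, `sanitized_completion`, `error`, and an instruction language).
# The placeholder assignments below are illustrative only.
#
#     from eval_framework.metrics.llm.llm_judge_completion_accuracy import LLMJudgeCompletionAccuracy
#
#     judge_llm = ...    # any BaseLLM implementation configured as the judge
#     completion = ...   # a Completion produced by running a task through the model under test
#
#     metric = LLMJudgeCompletionAccuracy(llm_judge=judge_llm)
#     results = metric.calculate(completion)
#     for result in results:
#         # value is 1.0 if the judge deemed the completion correct, 0.0 if not,
#         # and None when no verdict could be obtained or an upstream error occurred.
#         print(result.metric_name, result.value)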