Source code for eval_framework.metrics.llm.llm_judge_instruction

from eval_framework.llm.base import BaseLLM
from eval_framework.metrics.base import MetricResult
from eval_framework.metrics.llm.base import BaseLLMJudgeMetric
from eval_framework.metrics.llm.graders.instruction_grader import InstructionGrader
from eval_framework.metrics.llm.graders.language import Language
from eval_framework.shared.types import Completion


[docs]
class LLMJudgeInstruction(BaseLLMJudgeMetric):
    NAME = "Instruction Following"
    KEYS = [
        "quality",
        "is_following_instruction",
        "has_correct_grammar_and_spelling",
        "is_context_consistent",
        "is_not_repeating",
        "is_trustworthy",
        "is_safe",
    ]

    def __init__(self, llm_judge: BaseLLM):
        super().__init__(llm_judge)
        self._grader = InstructionGrader(llm_judge)
[docs]
    def calculate(self, response: Completion) -> list[MetricResult]:
        if response.error is not None:
            # Propagate the error to every key so each sub-metric reports the failure.
            return [
                MetricResult(
                    metric_name=f"{self.NAME}/{key}",
                    value=None,
                    higher_is_better=True,
                    error=response.error,
                )
                for key in self.KEYS
            ]

        language = Language(response.get_instruction_language())
        grading = self._grader.grade(
            instruction=response.system_user_instruction,
            completion=response.sanitized_completion,
            language=language,
        )

        results = []
        for key in self.KEYS:
            graded_value = getattr(grading, key)
            if key == "quality":
                # [0, 1] normalization required for visualizer
                value = (float(graded_value) - 1) / 4 if graded_value is not None else None
            else:
                value = float(graded_value) if graded_value is not None else None
            result = MetricResult(
                metric_name=f"{self.NAME}/{key}",
                value=value,
                higher_is_better=True,
                llm_judge_prompt=grading.judge_prompt,
                llm_judge_response=grading.judge_response,
                error=response.error,
            )
            results.append(result)
        return results
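

# Minimal usage sketch, assuming a concrete BaseLLM implementation and an existing
# Completion object; the names `MyJudgeModel` and `completion` are placeholders, as in
# practice both are supplied by the surrounding eval_framework pipeline.
#
#     judge = MyJudgeModel()                        # hypothetical BaseLLM subclass
#     metric = LLMJudgeInstruction(llm_judge=judge)
#     for result in metric.calculate(completion):   # `completion` is a Completion instance
#         print(result.metric_name, result.value)
#
# calculate() returns one MetricResult per entry in KEYS, with the "quality" score
# rescaled from its 1-5 grading range into [0, 1].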