Source code for eval_framework.metrics.completion.cwe_accuracy
import re
from eval_framework.metrics.base import BaseMetric, MetricResult
from eval_framework.shared.types import Completion, Error
[docs]
class CWEAccuracy(BaseMetric[Completion]):
"""Metric for Common Word Extraction tasks"""
NAME = "CWEAccuracy"
[docs]
def calculate(self, response: Completion) -> list[MetricResult]:
if response.error is not None:
return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=response.error)]
ground_truths = [gt for gt in response.ground_truth_list if gt is not None]
if not ground_truths:
return [MetricResult(metric_name=self.NAME, value=0.0, higher_is_better=True, error=response.error)]
try:
# Get model's answer
model_answer = response.completion
# Check if all words in the correct answer are present in the model's answer
is_correct = self._is_answer_correct(ground_truths, model_answer)
return [
MetricResult(
metric_name=self.NAME, value=1.0 if is_correct else 0.0, higher_is_better=True, error=response.error
)
]
except Exception as e:
error = Error(error_class=e.__class__.__name__, message=str(e), traceback="")
return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=error)]
def _is_answer_correct(self, correct_answer: list[str], model_answer: str) -> bool:
"""Check if all words in correct_answer are present in model_answer as whole words"""
model_answer = model_answer.strip().lower()
correct_answer = [correct.strip().lower() for correct in correct_answer]
# For each word in the correct answer, check if it exists as a whole word in the model answer
for word in correct_answer:
# Create a regex pattern that matches the word as a whole word
# \b represents a word boundary
pattern = r"\b" + re.escape(word) + r"\b"
if not re.search(pattern, model_answer):
return False
return True