Source code for eval_framework.metrics.completion.code_assertion
from eval_framework.metrics.base import BaseMetric, MetricResult
from eval_framework.shared.types import Completion, Error
from eval_framework.tasks.utils import run_python_code
class CodeCompletionAssertion(BaseMetric[Completion]):
    NAME = "Code Completion Accuracy"
    def calculate(self, response: Completion) -> list[MetricResult]:
        # Propagate an upstream error (e.g. a failed completion) without scoring.
        if response.error is not None:
            return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=response.error)]

        # Run the model's completion in a python:3.12-slim container and capture its output.
        code = response.completion
        output = run_python_code(code, image="python:3.12-slim")

        # Split the output on whitespace and drop empty strings; if the output is "" this yields an empty list.
        output_parts = [part for part in output.split() if part.strip()]
        if not output_parts:
            last_output = ""
        else:
            last_output = output_parts[-1]

        # The assertion harness is expected to print "True" as the final token on success.
        success = last_output == "True"
        error = (
            None
            if success
            else Error(
                error_class="CodeCompletionAssertionError",
                message=f"Expected 'True' but got '{last_output}'",
                traceback=output,
            )
        )
        return [
            MetricResult(
                metric_name=self.NAME,
                value=1.0 if success else 0.0,
                higher_is_better=True,
                error=error,
                code_execution_trace=output,
            )
        ]
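
# --- Usage sketch (illustrative, not part of the module) ---
# A minimal, hedged example of how this metric might be invoked. It assumes a
# Completion can be constructed from just the two fields that calculate() reads
# (`completion` and `error`); the real type in eval_framework.shared.types may
# require additional fields, so treat this as a sketch rather than a verified call.
#
# metric = CodeCompletionAssertion()
# response = Completion(completion="assert 1 + 1 == 2\nprint(True)", error=None)
# [result] = metric.calculate(response)
# result.value  # 1.0 when the last printed token is "True", otherwise 0.0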