Source code for eval_framework.metrics.completion.code_assertion

from llm_sandbox.exceptions import SandboxTimeoutError

from eval_framework.metrics.base import BaseMetric, MetricResult
from eval_framework.shared.types import Completion
from eval_framework.tasks.utils import run_python_code


class CodeCompletionAssertion(BaseMetric[Completion]):
    """Score a code completion by executing it and inspecting its output.

    The completion text is handed to ``run_python_code``. A sample passes
    only when the last whitespace-separated token of the returned output is
    the literal string ``True``.
    """

    NAME = "Code Completion Accuracy"

    def calculate(self, response: Completion) -> list[MetricResult]:
        """Execute the completion and turn the outcome into a metric result.

        Args:
            response: The completion to evaluate. If ``response.error`` is
                set, nothing is executed and the error is passed through.

        Returns:
            A single-element list. ``value`` is ``None`` when the response
            already carried an error, ``0.0`` when execution timed out or the
            final output token is not ``True``, and ``1.0`` otherwise.
            Non-timeout execution failures return whatever
            ``self._record_or_raise`` returns (that helper is defined
            outside this class body).
        """
        # An upstream failure means there is nothing meaningful to execute.
        if response.error is not None:
            return [MetricResult(metric_name=self.NAME, value=None, higher_is_better=True, error=response.error)]

        try:
            execution_output = run_python_code(response.completion, image="python:3.12-slim")
        except SandboxTimeoutError:
            # A timeout is attributed to the submitted code (e.g. an infinite
            # loop), so it counts as a failed sample rather than an infra error.
            import traceback

            timeout_result = MetricResult(
                metric_name=self.NAME,
                value=0.0,
                higher_is_better=True,
                code_execution_trace=traceback.format_exc(),
            )
            return [timeout_result]
        except Exception as exc:
            # Anything else (e.g. an image pull rate limit) is treated as an
            # infrastructure failure and handed to the shared handler.
            return self._record_or_raise(exc)

        # split() without arguments already discards empty and whitespace-only
        # pieces, so an output with no visible text yields an empty list.
        tokens = execution_output.split()
        final_token = tokens[-1] if tokens else ""
        score = 1.0 if final_token == "True" else 0.0

        return [
            MetricResult(
                metric_name=self.NAME,
                value=score,
                higher_is_better=True,
                error=None,
                code_execution_trace=execution_output,
            )
        ]