import logging
import random
import re
from typing import Any
from eval_framework.metrics.completion.code_execution_pass_at_one import (
CodeExecutionPassAtOne,
CodeExecutionPassAtOneContext,
CodeExecutionPassAtOneWithCodebench,
)
from eval_framework.tasks.base import (
RANDOM_SEED,
BaseTask,
Language,
ResponseType,
Sample,
SubjectType,
)
from eval_framework.tasks.utils import (
BIG_CODE_BENCH_PACKAGE_MAPPING,
CallableSerializer,
_parse_unittest_output,
unittest_merge_snippets,
)
logger = logging.getLogger(__name__)
# Instruction preamble prepended to every problem statement (see _get_instruction_text).
PROMPT_INSTRUCTION = (
"Please provide a self-contained Python script, without tests or example usage, that solves the following "
"problem in a markdown code block:\n"
) # from https://arxiv.org/pdf/2406.15877 - Figure 14
# Cue text that seeds the model's response; for the "calibrated" subject the function
# header (code_prompt) is appended to it as well (see _get_cue_text).
RESPONSE_PREFIX = (
"Below is a Python script with a self-contained function that solves the problem and passes "
"corresponding tests:\n"
) # from https://github.com/bigcode-project/bigcodebench/blob/main/bigcodebench/generate.py#L149
class BigCodeBench(BaseTask[str]):
    """BigCodeBench dataset: https://huggingface.co/datasets/bigcode/bigcodebench

    Code-generation task: the model is asked to complete a self-contained Python
    script, and the completion is graded by executing the dataset's unit tests
    (pass@1 via the code-execution metric).
    """

    NAME = "BigCodeBench"
    DATASET_PATH = "bigcode/bigcodebench"
    SAMPLE_SPLIT = "v0.1.4"
    FEWSHOT_SPLIT = "v0.1.4"  # (there is no dedicated split, few-shot is not expected for this dataset)
    RESPONSE_TYPE = ResponseType.COMPLETION
    METRICS = [CodeExecutionPassAtOne]
    SUBJECTS = ["original", "calibrated"]
    LANGUAGE = Language.ENG

    def __init__(self, num_fewshot: int = 0) -> None:
        """Initialize the task; few-shot is not supported on the base class and is coerced to 0."""
        if self.__class__ is BigCodeBench and num_fewshot != 0:
            logger.warning(
                "Fewshot is not supported for BigCodeBench (got num_fewshot=%d); "
                "setting to 0. Use BigCodeBench_OLMES for 3-shot.",
                num_fewshot,
            )
            num_fewshot = 0
        # NOTE: this serializer should be the same class as initialized in the metric,
        # so that callables encoded here can be decoded on the metric side.
        self.serializer = CallableSerializer()
        super().__init__(num_fewshot)

    def _load_dataset(self, subject: SubjectType) -> None:
        """Load the HF dataset, keeping only the sample/few-shot splits.

        The sample split is shuffled with a fixed seed for reproducibility.
        """
        hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH, name=None)
        self.dataset = {}
        self.rnd = random.Random(RANDOM_SEED)
        for split, data in hf_dataset.items():
            data_list = list(data)
            if split == self.SAMPLE_SPLIT:
                self.rnd.shuffle(data_list)
            if split in [self.SAMPLE_SPLIT, self.FEWSHOT_SPLIT]:
                self.dataset[split] = data_list

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return the instruction preamble followed by the code-completion prompt."""
        return PROMPT_INSTRUCTION + item["complete_prompt"]

    def _get_cue_text(self, item: dict[str, Any]) -> str:
        # For the "calibrated" subject the cue additionally seeds the function header,
        # matching the calibrated setting of the original benchmark.
        return RESPONSE_PREFIX + (item["code_prompt"] if item["subject"] == "calibrated" else "")

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        return item["canonical_solution"]  # Not needed for evaluation, as it is test based given the generated code

    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
        # Free-form generation task: there is no fixed answer set.
        return None

    def _get_context(self, item: dict[str, Any]) -> CodeExecutionPassAtOneContext:
        """Build the execution context consumed by the code-execution pass@1 metric."""
        return CodeExecutionPassAtOneContext(
            run_env="python:3.12",  # os.environ.get("DOCKER_CODE_EXECUTION"),
            code_prompt=item["code_prompt"],
            test_code=item["test"],
            snippet_merge_fn=self.serializer.encode(unittest_merge_snippets),
            output_parse_fn=self.serializer.encode(_parse_unittest_output),
            package_downloads=BIG_CODE_BENCH_PACKAGE_MAPPING,
        )

    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
        """Turn a raw model completion into executable code.

        For "calibrated" samples the prompt already cued the function header, so it is
        prepended back verbatim; otherwise the executable code is extracted from the
        (typically markdown-fenced) completion.
        """
        if sample is not None and sample.context is not None and sample.subject == "calibrated":
            assert isinstance(sample.context, CodeExecutionPassAtOneContext), "Expected CodeExecutionPassAtOneContext"
            # The guard above already ensures sample.context is not None; no re-check needed.
            return sample.context.code_prompt + completion_text
        return extract_executable_code(completion_text)
# Instruction and prompt format matching oe_eval bigcodebench:3shot::olmo3:v2 (complete variant).
# See oe_eval/tasks/oe_eval_tasks/codex_bigcodebench.py doc_to_text().
# Note: unlike PROMPT_INSTRUCTION, no trailing newline here — the newline and opening
# code fence are added by BigCodeBench_OLMES._get_instruction_text.
PROMPT_INSTRUCTION_OLMES = (
"Please provide a self-contained Python script that solves the following problem in a markdown code block:"
)
class BigCodeBench_OLMES(BigCodeBench):
    """
    BigCodeBench variant matching oe_eval `bigcodebench:3shot::olmo3:v2`.

    Recommended run settings for parity with oe_eval: temperature=0.6, top_p=0.6, repeats=5 (n=5),
    then compute pass@1 over the 5 samples per problem (post-process if needed).
    """

    NAME = "BigCodeBench_OLMES"
    SAMPLE_SPLIT = "v0.1.2"
    FEWSHOT_SPLIT = "v0.1.2"
    METRICS = [CodeExecutionPassAtOneWithCodebench]

    def __init__(self, num_fewshot: int = 3) -> None:
        """Initialize the fixed 3-shot OLMES variant."""
        # This variant is defined as 3-shot. Warn on a conflicting configuration value
        # (consistent with the base class's warn-then-coerce) instead of silently
        # discarding it.
        if num_fewshot != 3:
            logger.warning(
                "BigCodeBench_OLMES is a fixed 3-shot task (got num_fewshot=%d); using 3.",
                num_fewshot,
            )
        super().__init__(num_fewshot=3)
        # Stop sequences matching oe_eval's generation settings for this task.
        self.stop_sequences = [
            "<|endoftext|>",
            "<|endofmask|>",
            "</s>",
            "\nif __name__",
            "\ndef main(",
            "\nprint(",
            "\ndef ",
            "\nclass ",
            "\nimport ",
            "\nfrom ",
            "\nassert ",
            "\nPlease",
        ]

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        # Match oe_eval doc_to_text for prompt_variant "complete".
        return PROMPT_INSTRUCTION_OLMES + "\n```\n" + item["complete_prompt"].strip() + "\n"

    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
        """Few-shot target: the canonical solution followed by a closing code fence.

        Matches oe_eval doc_to_target for "complete": canonical_solution + "\\n```".
        Raises ValueError when the dataset row has no usable canonical solution.
        """
        target = item["canonical_solution"]
        if not isinstance(target, str):
            raise ValueError(f"Expected canonical_solution to be a non-None str, got {type(target)}")
        return target + "\n```"

    def _get_cue_text(self, item: dict[str, Any]) -> str:
        # Olmes does not use a cue text
        return ""

    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
        """Prepend the function header and strip markdown fences from the completion."""
        if sample is not None and sample.context is not None:
            assert isinstance(sample.context, CodeExecutionPassAtOneContext), "Expected CodeExecutionPassAtOneContext"
            # Remove the fenced-block markers; "```python" first so no "python"残 is
            # left behind, then any remaining bare "```" fences.
            stripped = completion_text.replace("```python", "").replace("```", "")
            # The guard above already ensures sample.context is not None; no re-check needed.
            return sample.context.code_prompt + stripped
        return extract_executable_code(completion_text)
class BigCodeBenchInstruct(BigCodeBench):
    """Instruction-style variant of BigCodeBench.

    Same dataset (https://huggingface.co/datasets/bigcode/bigcodebench), but the
    prompt uses the natural-language `instruct_prompt` field instead of the
    code-completion prompt.
    """

    NAME = "BigCodeBenchInstruct"

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return the shared instruction preamble followed by the instruct-style problem."""
        problem_statement = item["instruct_prompt"]
        return PROMPT_INSTRUCTION + problem_statement
class BigCodeBenchHard(BigCodeBench):
    """Hard subset of BigCodeBench.

    Dataset: https://huggingface.co/datasets/bigcode/bigcodebench-hard. Prompting is
    identical to the base task; only the dataset path differs.
    """

    NAME = "BigCodeBenchHard"
    DATASET_PATH = "bigcode/bigcodebench-hard"

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return the shared instruction preamble followed by the code-completion prompt."""
        completion_prompt = item["complete_prompt"]
        return PROMPT_INSTRUCTION + completion_prompt
class BigCodeBenchHardInstruct(BigCodeBenchHard):
    """Instruction-style variant of the hard subset.

    Dataset: https://huggingface.co/datasets/bigcode/bigcodebench-hard, prompted via
    the natural-language `instruct_prompt` field.
    """

    NAME = "BigCodeBenchHardInstruct"

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return the shared instruction preamble followed by the instruct-style problem."""
        problem_statement = item["instruct_prompt"]
        return PROMPT_INSTRUCTION + problem_statement