import logging
import random
import re
from typing import Any
from eval_framework.metrics.completion.code_execution_pass_at_one import (
CodeExecutionPassAtOne,
CodeExecutionPassAtOneContext,
CodeExecutionPassAtOneWithCodebench,
)
from eval_framework.tasks.base import (
RANDOM_SEED,
BaseTask,
Language,
ResponseType,
Sample,
SubjectType,
)
from eval_framework.tasks.utils import (
BIG_CODE_BENCH_PACKAGE_MAPPING,
CallableSerializer,
_parse_unittest_output,
unittest_merge_snippets,
)
logger = logging.getLogger(__name__)
# Instruction preamble prepended to every problem statement (see _get_instruction_text).
PROMPT_INSTRUCTION = (
"Please provide a self-contained Python script, without tests or example usage, that solves the following "
"problem in a markdown code block:\n"
) # from https://arxiv.org/pdf/2406.15877 - Figure 14
# Cue text that seeds the model's response; for the "calibrated" subject the function
# header (code_prompt) is appended to it as well (see _get_cue_text).
RESPONSE_PREFIX = (
"Below is a Python script with a self-contained function that solves the problem and passes "
"corresponding tests:\n"
) # from https://github.com/bigcode-project/bigcodebench/blob/main/bigcodebench/generate.py#L149
class BigCodeBench(BaseTask[str]):
    """BigCodeBench dataset: https://huggingface.co/datasets/bigcode/bigcodebench

    Code-generation task: the model is asked to complete a self-contained Python
    script, and the completion is graded by executing the dataset's unit tests
    (pass@1 via the code-execution metric).
    """

    NAME = "BigCodeBench"
    DATASET_PATH = "bigcode/bigcodebench"
    SAMPLE_SPLIT = "v0.1.4"
    FEWSHOT_SPLIT = "v0.1.4"  # (there is no dedicated split, few-shot is not expected for this dataset)
    RESPONSE_TYPE = ResponseType.COMPLETION
    METRICS = [CodeExecutionPassAtOne]
    SUBJECTS = ["original", "calibrated"]
    LANGUAGE = Language.ENG

    def __init__(self, num_fewshot: int = 0) -> None:
        """Initialize the task; few-shot is not supported on the base class and is coerced to 0."""
        if self.__class__ is BigCodeBench and num_fewshot != 0:
            logger.warning(
                "Fewshot is not supported for BigCodeBench (got num_fewshot=%d); "
                "setting to 0. Use BigCodeBench_OLMES for 3-shot.",
                num_fewshot,
            )
            num_fewshot = 0
        # NOTE: this serializer should be the same class as initialized in the metric,
        # so that callables encoded here can be decoded on the metric side.
        self.serializer = CallableSerializer()
        super().__init__(num_fewshot)

    def _load_dataset(self, subject: SubjectType) -> None:
        """Load the HF dataset, keeping only the sample/few-shot splits.

        The sample split is shuffled with a fixed seed for reproducibility.
        """
        hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH, name=None)
        self.dataset = {}
        self.rnd = random.Random(RANDOM_SEED)
        for split, data in hf_dataset.items():
            data_list = list(data)
            if split == self.SAMPLE_SPLIT:
                self.rnd.shuffle(data_list)
            if split in [self.SAMPLE_SPLIT, self.FEWSHOT_SPLIT]:
                self.dataset[split] = data_list

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return the instruction preamble followed by the code-completion prompt."""
        return PROMPT_INSTRUCTION + item["complete_prompt"]

    def _get_cue_text(self, item: dict[str, Any]) -> str:
        # For the "calibrated" subject the cue additionally seeds the function header,
        # matching the calibrated setting of the original benchmark.
        return RESPONSE_PREFIX + (item["code_prompt"] if item["subject"] == "calibrated" else "")

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        return item["canonical_solution"]  # Not needed for evaluation, as it is test based given the generated code

    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
        # Free-form generation task: there is no fixed answer set.
        return None

    def _get_context(self, item: dict[str, Any]) -> CodeExecutionPassAtOneContext:
        """Build the execution context consumed by the code-execution pass@1 metric."""
        return CodeExecutionPassAtOneContext(
            run_env="python:3.12",  # os.environ.get("DOCKER_CODE_EXECUTION"),
            code_prompt=item["code_prompt"],
            test_code=item["test"],
            snippet_merge_fn=self.serializer.encode(unittest_merge_snippets),
            output_parse_fn=self.serializer.encode(_parse_unittest_output),
            package_downloads=BIG_CODE_BENCH_PACKAGE_MAPPING,
        )

    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
        """Turn a raw model completion into executable code.

        For "calibrated" samples the prompt already cued the function header, so it is
        prepended back verbatim; otherwise the executable code is extracted from the
        (typically markdown-fenced) completion.
        """
        if sample is not None and sample.context is not None and sample.subject == "calibrated":
            assert isinstance(sample.context, CodeExecutionPassAtOneContext), "Expected CodeExecutionPassAtOneContext"
            # The guard above already ensures sample.context is not None; no re-check needed.
            return sample.context.code_prompt + completion_text
        return extract_executable_code(completion_text)
# Instruction and prompt format matching oe_eval bigcodebench:3shot::olmo3:v2 (complete variant).
# See oe_eval/tasks/oe_eval_tasks/codex_bigcodebench.py doc_to_text().
# Note: unlike PROMPT_INSTRUCTION, no trailing newline here — the newline and opening
# code fence are added by BigCodeBench_OLMES._get_instruction_text.
PROMPT_INSTRUCTION_OLMES = (
"Please provide a self-contained Python script that solves the following problem in a markdown code block:"
)
class BigCodeBench_OLMES(BigCodeBench):
    """
    BigCodeBench variant matching oe_eval `bigcodebench:3shot::olmo3:v2`.

    Recommended run settings for parity with oe_eval: temperature=0.6, top_p=0.6, repeats=5 (n=5),
    then compute pass@1 over the 5 samples per problem (post-process if needed).
    """

    NAME = "BigCodeBench_OLMES"
    SAMPLE_SPLIT = "v0.1.2"
    FEWSHOT_SPLIT = "v0.1.2"
    METRICS = [CodeExecutionPassAtOneWithCodebench]

    def __init__(self, num_fewshot: int = 3) -> None:
        """Initialize the fixed 3-shot OLMES variant."""
        # This variant is defined as 3-shot. Warn on a conflicting configuration value
        # (consistent with the base class's warn-then-coerce) instead of silently
        # discarding it.
        if num_fewshot != 3:
            logger.warning(
                "BigCodeBench_OLMES is a fixed 3-shot task (got num_fewshot=%d); using 3.",
                num_fewshot,
            )
        super().__init__(num_fewshot=3)
        # Stop sequences matching oe_eval's generation settings for this task.
        self.stop_sequences = [
            "<|endoftext|>",
            "<|endofmask|>",
            "</s>",
            "\nif __name__",
            "\ndef main(",
            "\nprint(",
            "\ndef ",
            "\nclass ",
            "\nimport ",
            "\nfrom ",
            "\nassert ",
            "\nPlease",
        ]

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        # Match oe_eval doc_to_text for prompt_variant "complete".
        return PROMPT_INSTRUCTION_OLMES + "\n```\n" + item["complete_prompt"].strip() + "\n"

    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
        """Few-shot target: the canonical solution followed by a closing code fence.

        Matches oe_eval doc_to_target for "complete": canonical_solution + "\\n```".
        Raises ValueError when the dataset row has no usable canonical solution.
        """
        target = item["canonical_solution"]
        if not isinstance(target, str):
            raise ValueError(f"Expected canonical_solution to be a non-None str, got {type(target)}")
        return target + "\n```"

    def _get_cue_text(self, item: dict[str, Any]) -> str:
        # Olmes does not use a cue text
        return ""

    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
        """Prepend the function header and strip markdown fences from the completion."""
        if sample is not None and sample.context is not None:
            assert isinstance(sample.context, CodeExecutionPassAtOneContext), "Expected CodeExecutionPassAtOneContext"
            # Remove the fenced-block markers; "```python" first so no "python"残 is
            # left behind, then any remaining bare "```" fences.
            stripped = completion_text.replace("```python", "").replace("```", "")
            # The guard above already ensures sample.context is not None; no re-check needed.
            return sample.context.code_prompt + stripped
        return extract_executable_code(completion_text)
class BigCodeBenchInstruct(BigCodeBench):
    """Instruction-style variant of BigCodeBench.

    Same dataset (https://huggingface.co/datasets/bigcode/bigcodebench), but the
    prompt uses the natural-language `instruct_prompt` field instead of the
    code-completion prompt.
    """

    NAME = "BigCodeBenchInstruct"

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return the shared instruction preamble followed by the instruct-style problem."""
        problem_statement = item["instruct_prompt"]
        return PROMPT_INSTRUCTION + problem_statement
class BigCodeBenchHard(BigCodeBench):
    """Hard subset of BigCodeBench.

    Dataset: https://huggingface.co/datasets/bigcode/bigcodebench-hard. Prompting is
    identical to the base task; only the dataset path differs.
    """

    NAME = "BigCodeBenchHard"
    DATASET_PATH = "bigcode/bigcodebench-hard"

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return the shared instruction preamble followed by the code-completion prompt."""
        completion_prompt = item["complete_prompt"]
        return PROMPT_INSTRUCTION + completion_prompt
class BigCodeBenchHardInstruct(BigCodeBenchHard):
    """Instruction-style variant of the hard subset.

    Dataset: https://huggingface.co/datasets/bigcode/bigcodebench-hard, prompted via
    the natural-language `instruct_prompt` field.
    """

    NAME = "BigCodeBenchHardInstruct"

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return the shared instruction preamble followed by the instruct-style problem."""
        problem_statement = item["instruct_prompt"]
        return PROMPT_INSTRUCTION + problem_statement