Source code for eval_framework.tasks.benchmarks.humaneval
from typing import Any
from eval_framework.metrics.completion.code_assertion import CodeCompletionAssertion
from eval_framework.shared.types import BaseMetricContext
from eval_framework.tasks.base import NO_SUBJECT, BaseTask, Language, ResponseType, Sample

CODE_TO_EXECUTE = """
{start_of_code}
{completion_text}
{test_code}
try:
    check({entry_point})
    print(True)
except Exception as e:
    print(e)
    print(False)
"""

class HumanEvalMetricContext(BaseMetricContext):
    test: str
    entry_point: str
    prompt: str

class HumanEval(BaseTask[str]):
    """HumanEval dataset: https://huggingface.co/datasets/openai/openai_humaneval/"""

    NAME = "Human Eval"
    DATASET_PATH = "openai/openai_humaneval"
    SAMPLE_SPLIT = "test"
    FEWSHOT_SPLIT = "test"  # there is no dedicated split; few-shot prompting is not expected for this dataset
    RESPONSE_TYPE = ResponseType.COMPLETION
    METRICS = [CodeCompletionAssertion]
    SUBJECTS = [NO_SUBJECT]
    LANGUAGE = Language.ENG

    def __init__(self, num_fewshot: int = 0) -> None:
        super().__init__(num_fewshot)
        self.stop_sequences: list[str] = ["```"]

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        return f"```python\n{item['prompt'].lstrip()}"

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        return "Success"

    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
        return item["canonical_solution"]

    def _get_context(self, item: dict[str, Any]) -> HumanEvalMetricContext:
        return HumanEvalMetricContext(
            test=item["test"],
            entry_point=item["entry_point"],
            prompt=item["prompt"],
        )

    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
        """Truncate the generated completion at the first stop sequence and splice it into
        the executable test harness defined by CODE_TO_EXECUTE."""
        assert sample is not None and sample.context is not None
        assert isinstance(sample.context, HumanEvalMetricContext), "Expected HumanEvalMetricContext"
        context = sample.context

        # Keep only the text generated before the first stop sequence (the closing "```").
        for stop_sequence in self.stop_sequences:
            if stop_sequence in completion_text:
                completion_text = completion_text.split(stop_sequence)[0]

        entry_point = context.entry_point
        test_code = context.test
        start_of_code = context.prompt

        formatted_code = CODE_TO_EXECUTE.format(
            start_of_code=start_of_code,
            completion_text=completion_text,
            test_code=test_code,
            entry_point=entry_point,
        )
        return formatted_code
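
# HumanEvalInstruct wraps the same items in an instruction-style prompt instead of the
# raw completion prompt used by HumanEval above: the problem statement asks for a
# solution that passes the tests, and the cue text re-opens the ```python block with
# the original function header so the model continues from the signature. Metrics and
# post-processing are inherited unchanged from HumanEval.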

class HumanEvalInstruct(HumanEval):
    # See https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/humaneval/humaneval_instruct.yaml
    NAME = "Human Eval Instruct"
    CUE_PREFIX = "Here is the completed function:\n```python\n"

    def __init__(self, num_fewshot: int = 0) -> None:
        assert num_fewshot == 0, "Fewshot is not supported for Human Eval Instruct"
        super().__init__(num_fewshot)

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        instruction_text = (
            "Write a solution to the following problem and make sure that "
            f"it passes the tests:\n```python\n{item['prompt'].lstrip()}"
        )
        return instruction_text

    def _get_cue_text(self, item: dict[str, Any]) -> str:
        return self.CUE_PREFIX + item["prompt"].lstrip()
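
# Minimal usage sketch, not part of the task definitions above: how a raw generation
# would be truncated at the "```" stop sequence and rendered into the executable
# harness. The toy problem is illustrative rather than a real HumanEval record, and
# the rendered harness is printed instead of being executed by the metric.
if __name__ == "__main__":
    toy_prompt = 'def add(a: int, b: int) -> int:\n    """Return the sum of a and b."""\n'
    toy_test = "def check(candidate):\n    assert candidate(1, 2) == 3\n    assert candidate(-1, 1) == 0\n"
    raw_generation = "    return a + b\n```\nSome trailing commentary from the model."

    # Mirror the stop-sequence handling in post_process_generated_completion.
    completion = raw_generation.split("```")[0]
    harness = CODE_TO_EXECUTE.format(
        start_of_code=toy_prompt,
        completion_text=completion,
        test_code=toy_test,
        entry_point="add",
    )
    print(harness)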