Source code for eval_framework.tasks.benchmarks.mbpp

import ast
import logging
import re
from typing import Any

from eval_framework.metrics.completion.code_assertion import (
    CodeCompletionAssertion,
)
from eval_framework.metrics.loglikelihood.bits_per_byte import BitsPerByteLoglikelihood
from eval_framework.shared.types import BaseMetricContext
from eval_framework.tasks.base import BaseTask, Language, ResponseType, Sample

logger = logging.getLogger(__name__)

BEGIN = "```python"
END = "```"


class MBPPMetricContext(BaseMetricContext):
    tests_code: str
class MBPP(BaseTask[str]):
    """
    MBPP provides both the problem statement and the test cases upfront. It says, "Here's the
    problem and here are the tests; write code that passes them." Note that LLMs can cheat and
    only write code that passes the tests without solving the given problem.

    MBPP_PROMPT_WITHOUT_TESTS, on the other hand, only gives the problem statement and function
    signature initially. It says, "Here's the problem and function signature; write code, then
    we'll run tests later."
    """

    NAME = "MBPP"
    DATASET_PATH = "google-research-datasets/mbpp"
    SAMPLE_SPLIT = "test"
    FEWSHOT_SPLIT = "train"
    RESPONSE_TYPE = ResponseType.COMPLETION
    METRICS = [CodeCompletionAssertion]
    SUBJECTS = ["full"]  # , "sanitized"]  # these are HF dataset SUBSETS!
    LANGUAGE = Language.ENG

    def __init__(self, num_fewshot: int = 0) -> None:
        super().__init__(num_fewshot)
        self.stop_sequences = [END]

    @staticmethod
    def _code_expander(code: str, gt_asserts: str) -> str:
        """
        ``code`` carries the LLM-generated code snippet; the asserts for code testing are
        appended here. If no valid code is found in the LLM output, this function is not called.

        Important: ``gt_asserts`` arrives as a stringified list of assert strings. We safely
        convert this string back into the list of individual assert statements (also strings)
        via ``ast.literal_eval``.
        """
        if not gt_asserts:
            # No ground-truth asserts are given; return the original code.
            return code
        gt_asserts = ast.literal_eval(gt_asserts)  # never use eval!
        if not isinstance(gt_asserts, list):
            logger.info("*** WARNING, we expect a list of ground truth asserts here! Sample cannot be finalized")
            return code
        stacked_asserts = ""
        for gt_assert in gt_asserts:
            stacked_asserts += " " + gt_assert + "\n"
        postfix = "try:\n" + stacked_asserts + " score = True\nexcept:\n score = False\nprint(score)"
        return code + postfix

    @staticmethod
    def _get_function_name(line: str) -> str:
        match = re.search(r"def\s+(\w+)\s*\(", line)
        function_name = ""
        if match:
            function_name = match.group(1)
        return function_name

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Builds the prompt from the selected task and its tests (zero- or few-shot setting)."""
        tests = "\n".join(item["test_list"])
        text = item["text"] if "text" in item else item["prompt"]
        instruction_text = f"You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n{tests}\n"  # noqa: E501
        return instruction_text

    def _get_cue_text(self, item: dict[str, Any]) -> str:
        return BEGIN

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        """The asserts are passed as ground truth, as expected by the CodeCompletionAssertion metric."""
        return f"{item['test_list']}"

    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
        target = item["code"]
        assert target is not None
        assert isinstance(target, str)
        return f"{BEGIN}\n" + target + f"\n{END}"

    def _sample_fewshot_examples(self, item: dict[str, Any]) -> list[dict]:
        fewshot_examples = self.rnd.sample(self.dataset[self.FEWSHOT_SPLIT], self.num_fewshot)
        return fewshot_examples

    def _get_context(self, item: dict[str, Any]) -> MBPPMetricContext:
        return MBPPMetricContext(tests_code="\n".join(item["test_list"]))
[docs] def post_process_generated_completion(self, completion_text: str, sample: Sample) -> str: # type: ignore[override] if BEGIN in completion_text: completion_text = completion_text.split(f"{BEGIN}\n")[1] if END in completion_text: completion_text = completion_text.split(END)[0] extracted_code = completion_text + "\n" mbpp_ground_truth = str(sample.ground_truth) code = self._code_expander(extracted_code, mbpp_ground_truth) return code
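# A minimal usage sketch (hypothetical sample, not part of the benchmark data): it shows the
# harness that ``MBPP._code_expander`` appends to an extracted completion. ``add`` and its
# assert are made up for illustration.
def _example_code_expander() -> None:
    extracted_code = "def add(a, b):\n    return a + b\n"
    gt_asserts = "['assert add(1, 2) == 3']"  # stringified list, as stored in ground_truth
    wrapped = MBPP._code_expander(extracted_code, gt_asserts)
    # ``wrapped`` is the original code followed by a try/except block that runs the asserts
    # and prints True on success, False on any failure:
    #
    #   def add(a, b):
    #       return a + b
    #   try:
    #    assert add(1, 2) == 3
    #    score = True
    #   except:
    #    score = False
    #   print(score)
    print(wrapped)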
class MBPPBPB(MBPP):
    """
    MBPP variant that scores loglikelihood of the gold reference code.
    Reports bits-per-byte on the reference solution.
    """

    NAME = "MBPP BPB"
    RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
    METRICS = [BitsPerByteLoglikelihood]

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        code = item.get("code")
        if not code:
            return None
        return " " + code

    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
        gt = self._get_ground_truth(item)
        return [gt] if gt else None
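# A small sketch (made-up item, not real dataset content) of the loglikelihood target built by
# ``MBPPBPB``: the reference code is prefixed with a single space, presumably so it scores as a
# natural continuation of the prompt. ``__new__`` skips ``__init__`` because these two methods
# need no instance state.
def _example_bpb_ground_truth() -> None:
    item = {"code": "def identity(x):\n    return x"}
    task = MBPPBPB.__new__(MBPPBPB)
    assert task._get_ground_truth(item) == " def identity(x):\n    return x"
    assert task._get_possible_completions(item) == [" def identity(x):\n    return x"]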
class MBPP_SANITIZED(MBPP):
    NAME = "MBPP_SANITIZED"
    SUBJECTS = ["sanitized"]
class MBPP_PROMPT_WITHOUT_TESTS(MBPP):
    """
    Unlike MBPP, which provides both the problem statement and the test cases upfront, this
    variant only gives the problem statement and function signature initially. It says, "Here's
    the problem and function signature; write code, then we'll run tests later."
    """

    NAME = "MBPP_PROMPT_WITHOUT_TESTS"

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Builds the prompt from the selected task only; the tests are withheld."""
        text = item["text"] if "text" in item else item["prompt"]
        instruction_text = f"You are an expert Python programmer, and here is your task: {text}\n\n"  # noqa: E501
        return instruction_text

    def _get_cue_text(self, item: dict[str, Any]) -> str:
        function_header = self._get_function_header(item["code"])
        return f"{BEGIN}\n{function_header}"

    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
        target = item["code"]
        assert target is not None
        assert isinstance(target, str)
        return f"{BEGIN}\n" + target + f"\n{END}"

    @staticmethod
    def _get_function_header(line: str) -> str:
        match = re.search(r"^\s*def\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*\(.*?\)\s*:", line, re.MULTILINE)
        postfix = ""
        if match is not None:
            # Extract up to the next closing parenthesis in the found substring.
            postfix = line[match.start() :]
            match = re.search(r"\)", postfix)
            if match is not None:
                end = match.start()
                postfix = postfix[: end + 1]
            else:
                postfix = ""
        if postfix == "":
            return postfix
        return f"{postfix.strip()}:"
    def post_process_generated_completion(self, completion_text: str, sample: Sample) -> str:  # type: ignore[override]
        if BEGIN in completion_text:
            completion_text = completion_text.split(BEGIN)[1]
        if END in completion_text:
            completion_text = completion_text.split(END)[0]
        extracted_code = completion_text + "\n"
        mbpp_ground_truth = str(sample.ground_truth)
        function_header = self._get_function_header(sample.messages[-1].content)
        code = self._code_expander(extracted_code, mbpp_ground_truth)
        return function_header + code
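# A brief sketch (made-up reference solution) of what ``_get_function_header`` recovers: the
# ``def`` line up to and including the closing parenthesis, re-terminated with a colon. This
# header becomes the cue text and is prepended to the model's continuation above.
def _example_function_header() -> None:
    reference_code = "import math\ndef circle_area(radius):\n    return math.pi * radius ** 2"
    header = MBPP_PROMPT_WITHOUT_TESTS._get_function_header(reference_code)
    assert header == "def circle_area(radius):"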
class MBPP_PROMPT_WITHOUT_TESTS_SANITIZED(MBPP_PROMPT_WITHOUT_TESTS):
    NAME = "MBPP_PROMPT_WITHOUT_TESTS_SANITIZED"
    SUBJECTS = ["sanitized"]
_OLMES_FEWSHOT_EXAMPLES: list[dict[str, Any]] = [
    {
        "text": "Write a function to find the similar elements from the given two tuple lists.",
        "code": (
            "def similar_elements(test_tup1, test_tup2):\n"
            "  res = tuple(set(test_tup1) & set(test_tup2))\n  return (res)"
        ),
        "test_list": [
            "assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)",
            "assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4)",
            "assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14)",
        ],
    },
    {
        "text": "Write a python function to identify non-prime numbers.",
        "code": (
            "import math\ndef is_not_prime(n):\n    result = False\n"
            "    for i in range(2,int(math.sqrt(n)) + 1):\n"
            "        if n % i == 0:\n            result = True\n    return result"
        ),
        "test_list": [
            "assert is_not_prime(2) == False",
            "assert is_not_prime(10) == True",
            "assert is_not_prime(35) == True",
        ],
    },
    {
        "text": (
            "Write a function to find the largest integers from a given list of numbers using heap queue algorithm."
        ),
        "code": (
            "import heapq as hq\ndef heap_queue_largest(nums,n):\n"
            "  largest_nums = hq.nlargest(n, nums)\n  return largest_nums"
        ),
        "test_list": [
            "assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] ",
            "assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] ",
            "assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35]",
        ],
    },
]
class MBPP_OLMES(MBPP):
    """
    MBPP OLMES variant replicating oe_eval's ``mbpp:3shot::olmo3:n32:v2``.

    Uses the EvalPlus prompt format with 3 hardcoded fewshot examples from the original MBPP
    "prompt" split (matching oe_eval's ordering). Each prompt shows one test case (the first)
    instead of all.

    Recommended EvalConfig settings for full replication::

        split: test
        num_fewshot: 3 (hardcoded, prompt split)
        metric: pass_at_1
        temperature: 0.6
        top_p: 0.6
        repeats: 32
    """

    NAME = "MBPP_OLMES"
    FEWSHOT_SPLIT = "test"

    def __init__(self, num_fewshot: int = 3) -> None:
        super().__init__(num_fewshot)
        assert num_fewshot == 3, "MBPP_OLMES requires exactly 3 fewshot examples"
        self.stop_sequences = ["```", '\n"""', "\nassert", "\n#"]

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        text = item["text"] if "text" in item else item["prompt"]
        test = item["test_list"][0]
        return (
            "Please provide a self-contained Python script that solves the following problem"
            f" in a markdown code block:\n```\n{text.strip()}\n{test}\n```\n"
        )

    def _get_cue_text(self, item: dict[str, Any]) -> str:
        return "Here is the completed function:\n\n```python\n"

    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
        return item["code"] + "\n"

    def _sample_fewshot_examples(self, item: dict[str, Any]) -> list[dict]:
        return list(_OLMES_FEWSHOT_EXAMPLES)
    def post_process_generated_completion(self, completion_text: str, sample: Sample) -> str:  # type: ignore[override]
        assert self.stop_sequences is not None
        for stop_seq in self.stop_sequences:
            if stop_seq in completion_text:
                completion_text = completion_text.split(stop_seq)[0]
        extracted_code = completion_text + "\n"
        mbpp_ground_truth = str(sample.ground_truth)
        code = self._code_expander(extracted_code, mbpp_ground_truth)
        return code
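# A short sketch (hypothetical completion text) of the OLMES post-processing: each configured
# stop sequence truncates the completion at its first occurrence, so trailing fences, test
# asserts, or comments emitted by the model are stripped before the assert harness is appended.
def _example_olmes_truncation() -> None:
    completion = "def double(x):\n    return 2 * x\n\nassert double(2) == 4\n```"
    for stop_seq in ["```", '\n"""', "\nassert", "\n#"]:
        if stop_seq in completion:
            completion = completion.split(stop_seq)[0]
    assert completion == "def double(x):\n    return 2 * x\n"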