# Source code for eval_framework.tasks.benchmarks.multipl_e

"""MultiPL-E: translations of HumanEval and MBPP into 6 programming languages.

Corresponds to the following OLMES task suites:

  multipl_e_humaneval:6lang::olmo3:n32:v2 (one task per language):
    multipl_e_humaneval:{cpp,java,js,php,rs,sh}::olmo3:n32:v2

  multipl_e_mbpp:6lang::olmo3:n32:v2 (one task per language):
    multipl_e_mbpp:{cpp,java,js,php,rs,sh}::olmo3:n32:v2

Recommended EvalConfig settings for full OLMES replication:
  repeats: 32
  llm_args: {sampling_params: {temperature: 0.6, top_p: 0.6}}
  max_tokens: 1024
  fewshot: 0

Paper: https://ieeexplore.ieee.org/abstract/document/10103177
"""

from typing import Any

from eval_framework.metrics.completion.multipl_e_assertion import MultiPLECodeAssertion, MultiPLEMetricContext
from eval_framework.tasks.base import NO_SUBJECT, BaseTask, Language, ResponseType, Sample

# Stop sequences per MultiPL-E language code. The task classes below look up
# their list by MULTIPL_E_LANGUAGE and use it to truncate raw model completions.
MULTIPL_E_STOP_TOKENS: dict[str, list[str]] = {
    "cpp": [
        "\n}",
        "}\n//",
    ],
    "java": [
        "\n    }\n",
        "}\n}",
        "}\n\n",
        "\n    public static void main",
        "\n    // Write",
    ],
    "js": [
        "\nfunction ",
        "\n/*",
        "\n//",
        "\nconsole.log",
    ],
    "php": [
        "\nfunction",
        "\n?>",
        "\n//",
        "\n#",
    ],
    "rs": [
        "\n}",
    ],
    "sh": [
        "}\n",
        "\n}",
    ],
}


class _BaseMPLE(BaseTask[str]):
    """Abstract base for all MultiPL-E OLMES per-language tasks.

    The dataset prompt is used verbatim as the instruction text, no ground-truth
    string is produced, and the prompt, tests and language of each item are handed
    to the metric (``MultiPLECodeAssertion``) through a ``MultiPLEMetricContext``.
    Raw completions are truncated at language-specific stop sequences.

    Subclasses must define:
      - NAME (str): human-readable task name
      - MULTIPL_E_LANGUAGE (str): language code used in the HF dataset config
        and stop-token lookup (e.g. "cpp", "java", "js", "php", "rs", "sh")
      - MULTIPL_E_DATASET_PREFIX (str): HF dataset config prefix,
        either "humaneval" or "mbpp"
    """

    DATASET_PATH = "nuprl/MultiPL-E"
    SAMPLE_SPLIT = "test"
    FEWSHOT_SPLIT = "test"  # no dedicated fewshot split; 0-shot is expected
    RESPONSE_TYPE = ResponseType.COMPLETION
    METRICS = [MultiPLECodeAssertion]
    SUBJECTS = [NO_SUBJECT]
    LANGUAGE = Language.ENG

    MULTIPL_E_LANGUAGE: str  # overridden by each language subclass
    MULTIPL_E_DATASET_PREFIX: str  # "humaneval" or "mbpp"

    def __init__(self, num_fewshot: int = 0) -> None:
        """Initialise the task with its per-language stop sequences and token budget.

        Args:
            num_fewshot: Must be 0; any other value fails the assertion below.
        """
        assert num_fewshot == 0, (
            "MultiPL-E does not support few-shot prompting (there are no gold examples for MultiPL-E)."
        )
        super().__init__(num_fewshot)
        # Copy the list so that mutating an instance's stop sequences cannot
        # alter the shared module-level MULTIPL_E_STOP_TOKENS table.
        self.stop_sequences: list[str] = list(MULTIPL_E_STOP_TOKENS[self.MULTIPL_E_LANGUAGE])
        self.max_tokens: int = 1024

    def _load_dataset(self, subject: str) -> None:
        """Load the ``<prefix>-<language>`` config of the dataset into ``self.dataset``.

        Args:
            subject: Unused; the config name is derived solely from
                MULTIPL_E_DATASET_PREFIX and MULTIPL_E_LANGUAGE.
        """
        hf_dataset = self._load_hf_dataset(
            path=self.DATASET_PATH,
            name=f"{self.MULTIPL_E_DATASET_PREFIX}-{self.MULTIPL_E_LANGUAGE}",
        )
        self.dataset = self._shuffle_splits(hf_dataset)

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return the item's ``prompt`` field unchanged."""
        # The prompt field already contains a complete function signature (and any leading
        # docstring / type annotations) in the target language. No additional formatting
        # is applied, matching the oe_eval behaviour (use_chat_format=False).
        return item["prompt"]

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        """Return ``None``: there is no reference completion for these tasks."""
        # Evaluation is entirely test-based; there is no single ground-truth string.
        return None

    def _get_context(self, item: dict[str, Any]) -> MultiPLEMetricContext:
        """Bundle the item's prompt, tests and language for the metric."""
        return MultiPLEMetricContext(
            prompt=item["prompt"],
            tests=item["tests"],
            language=item["language"],
        )

    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
        """Trim the model's raw continuation at the earliest stop sequence.

        The text is cut at the smallest index at which any of this task's stop
        sequences begins, so the result does not depend on the order in which
        the stop sequences are listed. If none occurs, the text is returned
        unchanged.

        Args:
            completion_text: Raw continuation produced by the model.
            sample: Unused.

        Returns:
            The portion of ``completion_text`` preceding the first stop sequence.
        """
        # Empty stop sequences are skipped: they would match at index 0 and
        # discard the entire completion.
        positions = (completion_text.find(stop_seq) for stop_seq in self.stop_sequences if stop_seq)
        cut = min((pos for pos in positions if pos != -1), default=len(completion_text))
        return completion_text[:cut]


class _BaseMPLEHumanEval(_BaseMPLE):
    """Shared base for the HumanEval-derived MultiPL-E tasks; fixes the dataset config prefix."""

    MULTIPL_E_DATASET_PREFIX = "humaneval"



class MultiPLEHumanEvalCpp(_BaseMPLEHumanEval):
    """MultiPL-E HumanEval in C++ — OLMES variant (nuprl/MultiPL-E, humaneval-cpp, test split).

    Corresponds to ``multipl_e_humaneval:cpp::olmo3:n32:v2`` in oe_eval.
    Recommended: 0-shot, temp=0.6, top_p=0.6, repeats=32.
    """

    NAME = "MultiPL-E HumanEval C++ OLMES"
    # Selects the "humaneval-cpp" dataset config and the "cpp" stop sequences.
    MULTIPL_E_LANGUAGE = "cpp"




class MultiPLEHumanEvalJava(_BaseMPLEHumanEval):
    """MultiPL-E HumanEval in Java — OLMES variant (nuprl/MultiPL-E, humaneval-java, test split).

    Corresponds to ``multipl_e_humaneval:java::olmo3:n32:v2`` in oe_eval.
    Recommended: 0-shot, temp=0.6, top_p=0.6, repeats=32.
    """

    NAME = "MultiPL-E HumanEval Java OLMES"
    # Selects the "humaneval-java" dataset config and the "java" stop sequences.
    MULTIPL_E_LANGUAGE = "java"




class MultiPLEHumanEvalJs(_BaseMPLEHumanEval):
    """MultiPL-E HumanEval in JavaScript — OLMES variant (nuprl/MultiPL-E, humaneval-js, test split).

    Corresponds to ``multipl_e_humaneval:js::olmo3:n32:v2`` in oe_eval.
    Recommended: 0-shot, temp=0.6, top_p=0.6, repeats=32.
    """

    NAME = "MultiPL-E HumanEval JS OLMES"
    # Selects the "humaneval-js" dataset config and the "js" stop sequences.
    MULTIPL_E_LANGUAGE = "js"




class MultiPLEHumanEvalPhp(_BaseMPLEHumanEval):
    """MultiPL-E HumanEval in PHP — OLMES variant (nuprl/MultiPL-E, humaneval-php, test split).

    Corresponds to ``multipl_e_humaneval:php::olmo3:n32:v2`` in oe_eval.
    Recommended: 0-shot, temp=0.6, top_p=0.6, repeats=32.
    """

    NAME = "MultiPL-E HumanEval PHP OLMES"
    # Selects the "humaneval-php" dataset config and the "php" stop sequences.
    MULTIPL_E_LANGUAGE = "php"




class MultiPLEHumanEvalRs(_BaseMPLEHumanEval):
    """MultiPL-E HumanEval in Rust — OLMES variant (nuprl/MultiPL-E, humaneval-rs, test split).

    Corresponds to ``multipl_e_humaneval:rs::olmo3:n32:v2`` in oe_eval.
    Recommended: 0-shot, temp=0.6, top_p=0.6, repeats=32.
    """

    NAME = "MultiPL-E HumanEval Rust OLMES"
    # Selects the "humaneval-rs" dataset config and the "rs" stop sequences.
    MULTIPL_E_LANGUAGE = "rs"




class MultiPLEHumanEvalSh(_BaseMPLEHumanEval):
    """MultiPL-E HumanEval in Bash — OLMES variant (nuprl/MultiPL-E, humaneval-sh, test split).

    Corresponds to ``multipl_e_humaneval:sh::olmo3:n32:v2`` in oe_eval.
    Recommended: 0-shot, temp=0.6, top_p=0.6, repeats=32.
    """

    NAME = "MultiPL-E HumanEval Bash OLMES"
    # Selects the "humaneval-sh" dataset config and the "sh" stop sequences.
    MULTIPL_E_LANGUAGE = "sh"



class _BaseMPLEMBPP(_BaseMPLE):
    """Shared base for the MBPP-derived MultiPL-E tasks; fixes the dataset config prefix."""

    MULTIPL_E_DATASET_PREFIX = "mbpp"



class MultiPLEMBPPCpp(_BaseMPLEMBPP):
    """MultiPL-E MBPP in C++ — OLMES variant (nuprl/MultiPL-E, mbpp-cpp, test split).

    Corresponds to ``multipl_e_mbpp:cpp::olmo3:n32:v2`` in oe_eval.
    Recommended: 0-shot, temp=0.6, top_p=0.6, repeats=32.
    """

    NAME = "MultiPL-E MBPP C++ OLMES"
    # Selects the "mbpp-cpp" dataset config and the "cpp" stop sequences.
    MULTIPL_E_LANGUAGE = "cpp"




class MultiPLEMBPPJava(_BaseMPLEMBPP):
    """MultiPL-E MBPP in Java — OLMES variant (nuprl/MultiPL-E, mbpp-java, test split).

    Corresponds to ``multipl_e_mbpp:java::olmo3:n32:v2`` in oe_eval.
    Recommended: 0-shot, temp=0.6, top_p=0.6, repeats=32.
    """

    NAME = "MultiPL-E MBPP Java OLMES"
    # Selects the "mbpp-java" dataset config and the "java" stop sequences.
    MULTIPL_E_LANGUAGE = "java"




class MultiPLEMBPPJs(_BaseMPLEMBPP):
    """MultiPL-E MBPP in JavaScript — OLMES variant (nuprl/MultiPL-E, mbpp-js, test split).

    Corresponds to ``multipl_e_mbpp:js::olmo3:n32:v2`` in oe_eval.
    Recommended: 0-shot, temp=0.6, top_p=0.6, repeats=32.
    """

    NAME = "MultiPL-E MBPP JS OLMES"
    # Selects the "mbpp-js" dataset config and the "js" stop sequences.
    MULTIPL_E_LANGUAGE = "js"




class MultiPLEMBPPPhp(_BaseMPLEMBPP):
    """MultiPL-E MBPP in PHP — OLMES variant (nuprl/MultiPL-E, mbpp-php, test split).

    Corresponds to ``multipl_e_mbpp:php::olmo3:n32:v2`` in oe_eval.
    Recommended: 0-shot, temp=0.6, top_p=0.6, repeats=32.
    """

    NAME = "MultiPL-E MBPP PHP OLMES"
    # Selects the "mbpp-php" dataset config and the "php" stop sequences.
    MULTIPL_E_LANGUAGE = "php"




class MultiPLEMBPPRs(_BaseMPLEMBPP):
    """MultiPL-E MBPP in Rust — OLMES variant (nuprl/MultiPL-E, mbpp-rs, test split).

    Corresponds to ``multipl_e_mbpp:rs::olmo3:n32:v2`` in oe_eval.
    Recommended: 0-shot, temp=0.6, top_p=0.6, repeats=32.
    """

    NAME = "MultiPL-E MBPP Rust OLMES"
    # Selects the "mbpp-rs" dataset config and the "rs" stop sequences.
    MULTIPL_E_LANGUAGE = "rs"




class MultiPLEMBPPSh(_BaseMPLEMBPP):
    """MultiPL-E MBPP in Bash — OLMES variant (nuprl/MultiPL-E, mbpp-sh, test split).

    Corresponds to ``multipl_e_mbpp:sh::olmo3:n32:v2`` in oe_eval.
    Recommended: 0-shot, temp=0.6, top_p=0.6, repeats=32.
    """

    NAME = "MultiPL-E MBPP Bash OLMES"
    # Selects the "mbpp-sh" dataset config and the "sh" stop sequences.
    MULTIPL_E_LANGUAGE = "sh"