# Source code for eval_framework.tasks.benchmarks.infinitebench

import os
import re
from abc import ABC
from pathlib import Path
from typing import Any

from datasets import DownloadConfig, Features, Sequence, Value, load_dataset

from eval_framework.metrics.completion.accuracy_completion import AccuracyCompletion
from eval_framework.metrics.loglikelihood.accuracy_loglikelihood import AccuracyLoglikelihood
from eval_framework.tasks.base import BaseTask, Language, ResponseType, Sample



[docs]
class InfiniteBench(BaseTask[str], ABC):
    """
    InfiniteBench: Extending Long Context Evaluation Beyond 100K Tokens
    https://github.com/OpenBMB/InfiniteBench

    Abstract base shared by every InfiniteBench task in this module. It pins the
    dataset location and language, refuses few-shot prompting, and loads the
    dataset with an explicit feature schema.
    """

    DATASET_PATH = "xinrongzhang2022/InfiniteBench"
    SUBJECTS = ["default"]
    LANGUAGE = Language.ENG
    PERTURBATION_UNMODIFIABLE_WORDS = None

    def __init__(self, num_fewshot: int = 0) -> None:
        """Create the task; only ``num_fewshot == 0`` is accepted (asserted)."""
        assert num_fewshot == 0, "Few-shots are not supported for long-context InfiniteBench tasks"
        super().__init__(num_fewshot)

    def _load_hf_dataset(self, **kwargs: Any) -> Any:
        """Load the dataset via ``load_dataset`` using a fixed column schema.

        The cache directory comes from ``HF_DATASET_CACHE_DIR`` when set, otherwise
        from ``~/.cache/huggingface/datasets``. If the first attempt raises any
        ``Exception``, a second attempt is made with the cache under
        ``~/.cache/eval-framework`` and without a download config.

        Args:
            **kwargs: Forwarded unchanged to ``load_dataset``.

        Returns:
            Whatever ``load_dataset`` returns.
        """
        primary_cache: str = os.environ.get("HF_DATASET_CACHE_DIR", f"{Path.home()}/.cache/huggingface/datasets")
        retrying_download = DownloadConfig(cache_dir=primary_cache, max_retries=5)
        # Explicit schema: "answer" and "options" are lists of strings for every split.
        schema = Features(
            {
                "id": Value("int64"),
                "context": Value("string"),
                "input": Value("string"),
                "answer": Sequence(Value("string")),
                "options": Sequence(Value("string")),
            }
        )
        try:
            dataset = load_dataset(
                **kwargs,
                cache_dir=primary_cache,
                download_config=retrying_download,
                features=schema,
            )
        except Exception:
            # Best-effort second attempt against the framework's own cache location.
            fallback_cache = f"{Path.home()}/.cache/eval-framework"
            dataset = load_dataset(**kwargs, cache_dir=fallback_cache, features=schema)
        return dataset




[docs]
class InfiniteBenchLoglikelihood(InfiniteBench, ABC):
    """Base class for loglikelihood tasks.

    Each answer option is scored as a candidate continuation of the prompt; the
    prompt is the long context followed by the question.
    """

    RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
    METRICS = [AccuracyLoglikelihood]

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return the context and the question separated by a blank line."""
        context, question = item["context"], item["input"]
        return f"{context}\n\n{question}"

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None | list[str]:
        """Return the first reference answer with a leading space.

        Asserts that this answer is one of the item's options.
        """
        gold = item["answer"][0]
        assert gold in item["options"], f"Ground truth {item['answer']} is not in {item['options']}"
        return f" {gold}"

    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
        """Return every option with a leading space, in dataset order."""
        return [f" {option}" for option in item["options"]]




[docs]
class InfiniteBenchCompletion(InfiniteBench, ABC):
    """Base class for completion tasks.

    The prompt is the long context followed by the question; the reference is the
    item's full list of accepted answers.
    """

    RESPONSE_TYPE = ResponseType.COMPLETION
    METRICS = [AccuracyCompletion]

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return the context and the question separated by a blank line."""
        context, question = item["context"], item["input"]
        return f"{context}\n\n{question}"

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None | list[str]:
        """Return the item's ``answer`` field unchanged."""
        references = item["answer"]
        return references




[docs]
class InfiniteBench_CodeDebug(InfiniteBenchLoglikelihood):
    """Finding which function in a code repo contains a crashing error (MC form).

    Scored as a loglikelihood task over the item's options.
    """

    NAME = "InfiniteBench_CodeDebug"
    # The few-shot split is the same split the samples are drawn from.
    SAMPLE_SPLIT = FEWSHOT_SPLIT = "code_debug"




[docs]
class InfiniteBench_EnMC(InfiniteBenchLoglikelihood):
    """Multiple choice questions derived from the fake book.

    Scored as a loglikelihood task over the item's options.
    """

    NAME = "InfiniteBench_EnMC"
    # The few-shot split is the same split the samples are drawn from.
    SAMPLE_SPLIT = FEWSHOT_SPLIT = "longbook_choice_eng"




[docs]
class InfiniteBench_CodeRun(InfiniteBenchCompletion):
    """Simulating execution of multiple simple, synthetic functions.

    The completion is expected to contain the phrase ``The return value is: <number>``;
    the number is extracted and used as the prediction.
    """

    NAME = "InfiniteBench_CodeRun"
    SAMPLE_SPLIT = "code_run"
    FEWSHOT_SPLIT = SAMPLE_SPLIT

    def __init__(self, num_fewshot: int = 0) -> None:
        """Configure generation to stop at the first newline with a small token budget."""
        super().__init__(num_fewshot)
        self.stop_sequences: list[str] = ["\n"]
        self.max_tokens = 30  # Avg Output Tokens: 1.3

    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
        """Extract the number following ``The return value is: ``.

        Args:
            completion_text: Raw model output.
            sample: Unused; accepted for interface compatibility.

        Returns:
            The extracted number as a string, or ``"[invalid]"`` when the phrase
            followed by a number is not present.
        """
        # Keep only the text before each stop sequence (a no-op when it is absent).
        for stop_sequence in self.stop_sequences:
            completion_text = completion_text.partition(stop_sequence)[0]

        # The captured run of digits / "." / "," must END on a digit. This keeps
        # "1,234" and "3.5" intact but drops sentence punctuation ("42." -> "42")
        # and rejects punctuation-only captures such as "." or ",".
        answer_re = re.compile(r"The return value is: (-?[0-9.,]*[0-9])")
        match = answer_re.search(completion_text)
        if match is None:
            return "[invalid]"
        return match.group(1)





[docs]
class InfiniteBench_EnDia(InfiniteBenchCompletion):
    """Identification of talkers in partially anonymized scripts.

    Both the reference answers and the model output are lower-cased before they
    are handed on for scoring.
    """

    NAME = "InfiniteBench_EnDia"
    SAMPLE_SPLIT = "longdialogue_qa_eng"
    FEWSHOT_SPLIT = SAMPLE_SPLIT

    def __init__(self, num_fewshot: int = 0) -> None:
        """Configure generation to stop at the first newline with a small token budget."""
        super().__init__(num_fewshot)

        self.stop_sequences: list[str] = ["\n"]
        self.max_tokens = 30  # Avg Output Tokens: 3.4

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None | list[str]:
        """Return every reference answer in lower case."""
        return [name.lower() for name in item["answer"]]

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return context, blank line, question, and a trailing newline."""
        context, question = item["context"], item["input"]
        return f"{context}\n\n{question}\n"

    def _get_cue_text(self, item: dict[str, Any]) -> str:
        """Return the fixed cue that the model is asked to continue."""
        return "The character which is $$MASK$$ is:"

    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
        """Cut the completion at the first stop sequence and lower-case it.

        Args:
            completion_text: Raw model output.
            sample: Unused; accepted for interface compatibility.

        Returns:
            The truncated, lower-cased completion.
        """
        truncated = completion_text
        for stop in self.stop_sequences:
            # partition() leaves the text untouched when the stop sequence is absent.
            truncated = truncated.partition(stop)[0]
        return truncated.lower()





[docs]
class InfiniteBench_EnQA(InfiniteBenchCompletion):
    """Free-form question answering based on the fake book.

    Reference answers have double quotes removed and are lower-cased; the model
    output is lower-cased.
    """

    NAME = "InfiniteBench_EnQA"
    SAMPLE_SPLIT = "longbook_qa_eng"
    FEWSHOT_SPLIT = SAMPLE_SPLIT

    def __init__(self, num_fewshot: int = 0) -> None:
        """Configure generation to stop at the first newline with a small token budget."""
        super().__init__(num_fewshot)

        self.stop_sequences: list[str] = ["\n"]
        self.max_tokens = 30  # Avg Output Tokens: 4.8

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return the context and the question separated by a single newline."""
        context, question = item["context"], item["input"]
        return f"{context}\n{question}"

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None | list[str]:
        """Return every reference answer without double quotes, in lower case."""
        return [reference.replace('"', "").lower() for reference in item["answer"]]

    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
        """Cut the completion at the first stop sequence and lower-case it.

        Args:
            completion_text: Raw model output.
            sample: Unused; accepted for interface compatibility.

        Returns:
            The truncated, lower-cased completion.
        """
        truncated = completion_text
        for stop in self.stop_sequences:
            # partition() leaves the text untouched when the stop sequence is absent.
            truncated = truncated.partition(stop)[0]
        return truncated.lower()





[docs]
class InfiniteBench_MathFind(InfiniteBenchCompletion):
    """Finding special integers in a lengthy list.

    The first number found in the completion is used as the prediction.
    """

    NAME = "InfiniteBench_MathFind"
    SAMPLE_SPLIT = "math_find"
    FEWSHOT_SPLIT = SAMPLE_SPLIT

    def __init__(self, num_fewshot: int = 0) -> None:
        """Configure generation to stop at the first newline with a small token budget."""
        super().__init__(num_fewshot)

        self.stop_sequences: list[str] = ["\n"]
        self.max_tokens = 30  # Avg Output Tokens: 1.3

    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
        """Extract the first number from the completion.

        Args:
            completion_text: Raw model output.
            sample: Unused; accepted for interface compatibility.

        Returns:
            The first number as a string (optional leading ``-``; may contain ``.``
            or ``,``), or ``"[invalid]"`` when the completion contains no digit.
        """
        # Keep only the text before each stop sequence (a no-op when it is absent).
        for stop_sequence in self.stop_sequences:
            completion_text = completion_text.partition(stop_sequence)[0]

        # The match must END on a digit. A pattern that accepts any run of
        # digits / "." / "," would return the first comma or full stop of ordinary
        # prose ("Sure, 5" -> ",") and would keep sentence punctuation ("5." ).
        answer_re = re.compile(r"-?[0-9.,]*[0-9]")
        match = answer_re.search(completion_text)
        if match is None:
            return "[invalid]"
        return match.group(0)





[docs]
class InfiniteBench_RetrieveKV2(InfiniteBenchCompletion):
    """Finding the corresponding value from a dictionary and a key.

    The prediction is the hexadecimal/dash value extracted from the completion.
    """

    NAME = "InfiniteBench_RetrieveKV2"
    SAMPLE_SPLIT = "kv_retrieval"
    FEWSHOT_SPLIT = SAMPLE_SPLIT

    def __init__(self, num_fewshot: int = 0) -> None:
        """Configure generation to stop at the first newline with a small token budget."""
        super().__init__(num_fewshot)
        self.stop_sequences: list[str] = ["\n"]
        self.max_tokens = 40  # Avg Output Tokens: 22.7 (all answers are 36 chars)

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return the context and the question separated by a single newline."""
        return f"{item['context']}\n{item['input']}"

    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
        """Extract the retrieved value from the completion.

        A standalone 8-4-4-4-12 lower-case hex string (36 characters, matching the
        answer length noted in ``__init__``) is preferred. If none is present, the
        first run of hex digits and dashes is returned instead.

        Args:
            completion_text: Raw model output.
            sample: Unused; accepted for interface compatibility.

        Returns:
            The extracted value, or ``"[invalid]"`` when the completion contains
            no hex digit or dash at all.
        """
        # Keep only the text before each stop sequence (a no-op when it is absent).
        for stop_sequence in self.stop_sequences:
            completion_text = completion_text.partition(stop_sequence)[0]

        # Look for a complete, standalone UUID-shaped value first. The loose pattern
        # alone would stop at the first hex letter of ordinary prose
        # (e.g. the "e" in "The value is ...") and never reach the actual value.
        uuid_re = re.compile(
            r"(?<![0-9a-f-])[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}(?![0-9a-f-])"
        )
        # Fallback for outputs without a complete value (e.g. a truncated one).
        loose_re = re.compile(r"([0-9a-f\-]+)")

        match = uuid_re.search(completion_text) or loose_re.search(completion_text)
        if match is None:
            return "[invalid]"
        return match.group(0)





[docs]
class InfiniteBench_RetrieveNumber(InfiniteBenchCompletion):
    """Locating repeated hidden numbers in a noisy long context.

    The first run of digits in the completion is used as the prediction.
    """

    NAME = "InfiniteBench_RetrieveNumber"
    SAMPLE_SPLIT = "number_string"
    FEWSHOT_SPLIT = SAMPLE_SPLIT

    def __init__(self, num_fewshot: int = 0) -> None:
        """Configure generation to stop at the first newline with a small token budget."""
        super().__init__(num_fewshot)

        self.stop_sequences: list[str] = ["\n"]
        self.max_tokens = 12  # Avg Output Tokens: 4.0 (all answers are 10 digits integers)

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return context, newline, question, and a trailing newline."""
        context, question = item["context"], item["input"]
        return f"{context}\n{question}\n"

    def _get_cue_text(self, item: dict[str, Any]) -> str:
        """Return the fixed cue that the model is asked to continue."""
        return "The sequence of digits is:"

    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
        """Extract the first run of digits from the completion.

        Args:
            completion_text: Raw model output.
            sample: Unused; accepted for interface compatibility.

        Returns:
            The digits as a string, or ``"[invalid]"`` when there are none.
        """
        truncated = completion_text
        for stop in self.stop_sequences:
            # partition() leaves the text untouched when the stop sequence is absent.
            truncated = truncated.partition(stop)[0]

        digits = re.search(r"([0-9]+)", truncated)
        # The group holds digits only, so no whitespace trimming is needed.
        return digits.group(1) if digits else "[invalid]"





[docs]
class InfiniteBench_RetrievePassKey1(InfiniteBenchCompletion):
    """Retrieving hidden keys in a noisy long context.

    The first run of digits in the completion is used as the prediction.
    """

    NAME = "InfiniteBench_RetrievePassKey1"
    SAMPLE_SPLIT = "passkey"
    FEWSHOT_SPLIT = SAMPLE_SPLIT

    def __init__(self, num_fewshot: int = 0) -> None:
        """Configure generation to stop at the first newline with a small token budget."""
        super().__init__(num_fewshot)

        self.stop_sequences: list[str] = ["\n"]
        self.max_tokens = 8  # Avg Output Tokens: 2.0 (all answers are 5 digits integers)

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return context, newline, question, and a trailing newline."""
        context, question = item["context"], item["input"]
        return f"{context}\n{question}\n"

    def _get_cue_text(self, item: dict[str, Any]) -> str:
        """Return the fixed cue that the model is asked to continue."""
        return "The pass key is:"

    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
        """Extract the first run of digits from the completion.

        Args:
            completion_text: Raw model output.
            sample: Unused; accepted for interface compatibility.

        Returns:
            The digits as a string, or ``"[invalid]"`` when there are none.
        """
        truncated = completion_text
        for stop in self.stop_sequences:
            # partition() leaves the text untouched when the stop sequence is absent.
            truncated = truncated.partition(stop)[0]

        digits = re.search(r"([0-9]+)", truncated)
        # The group holds digits only, so no whitespace trimming is needed.
        return digits.group(1) if digits else "[invalid]"