Source code for eval_framework.tasks.benchmarks.arc

from typing import Any

from eval_framework.metrics.loglikelihood.accuracy_loglikelihood import (
    AccuracyLoglikelihood,
    AccuracyNormLoglikelihood,
)
from eval_framework.metrics.loglikelihood.bits_per_byte import BitsPerByteLoglikelihood
from eval_framework.metrics.loglikelihood.confidence_weighted_accuracy import ConfidenceWeightedAccuracy
from eval_framework.metrics.loglikelihood.dcs import DistributionalCorrectnessScore
from eval_framework.metrics.loglikelihood.ternary import TernaryScore
from eval_framework.tasks.base import BaseTask, Language, ResponseType
from eval_framework.tasks.utils import get_n_letters



[docs]
class ARC(BaseTask[str]):
    """ARC dataset: https://huggingface.co/datasets/allenai/ai2_arc"""

    NAME = "ARC"
    DATASET_PATH = "allenai/ai2_arc"
    SAMPLE_SPLIT = "test"
    FEWSHOT_SPLIT = "train"
    RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
    METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood, BitsPerByteLoglikelihood]
    SUBJECTS = ["ARC-Easy", "ARC-Challenge"]
    PERTURBATION_UNMODIFIABLE_WORDS = ["Question"] + get_n_letters(5)
    LANGUAGE = Language.ENG

    def __init__(self, num_fewshot: int = 0) -> None:
        super().__init__(num_fewshot)
        self.keys = get_n_letters(5)  # needs to be 5 because there is one sample with 5 answer possibilities
        self.num_to_letter = {str(i): letter for i, letter in enumerate(self.keys, start=1)}

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        return f"Question: {item['question']}\n"

    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
        ground_truth = self._get_ground_truth(item)
        assert ground_truth is not None
        return f"{self._get_cue_text(item)}{ground_truth}"

    def _get_cue_text(self, item: dict[str, Any]) -> str:
        return "Answer:"

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        answer_key = self.num_to_letter.get(item["answerKey"], item["answerKey"])
        return f" {item['choices']['text'][self.keys.index(answer_key)]}"

    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
        return [f" {choice}" for choice in item["choices"]["text"]]




[docs]
class ARC_OLMES(ARC):
    """
    ARC with OLMES-style prompt: options shown with space-prefixed labels (" A.", " B.", ...);
    loglikelihood over " A"/" B"/ etc.
    """

    NAME = "ARC_OLMES"

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        question = item["question"]
        texts = item["choices"]["text"]
        labels = get_n_letters(len(texts))
        options = "\n".join(f" {label}. {t}" for label, t in zip(labels, texts))
        return f"Question: {question}\n{options}\n"

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        answer_key = self.num_to_letter.get(item["answerKey"], item["answerKey"])
        idx = self.keys.index(answer_key) if answer_key in self.keys else 0
        labels = get_n_letters(len(item["choices"]["text"]))
        return f" {labels[idx]}"

    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
        n = len(item["choices"]["text"])
        return [f" {label}" for label in get_n_letters(n)]




[docs]
class ARC_IDK(ARC):
    NAME = "ARC_IDK"
    METRICS = [
        AccuracyLoglikelihood,
        AccuracyNormLoglikelihood,
        ConfidenceWeightedAccuracy,
        DistributionalCorrectnessScore,
        TernaryScore,
    ]

    def _get_initial_prompt_text(self, item: dict[str, Any]) -> str:
        return (
            "Answer only if you are confident, since mistakes may be penalised, while correct answers receive points. "
            "It is acceptable to answer with 'I do not know' if you are unsure, and you will receive 0 points."
        )

    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
        completions = super()._get_possible_completions(item)
        return (completions or []) + [" I do not know."]