Source code for eval_framework.tasks.benchmarks.arc
from typing import Any
from eval_framework.metrics.loglikelihood.accuracy_loglikelihood import (
AccuracyLoglikelihood,
AccuracyNormLoglikelihood,
)
from eval_framework.metrics.loglikelihood.confidence_weighted_accuracy import ConfidenceWeightedAccuracy
from eval_framework.metrics.loglikelihood.dcs import DistributionalCorrectnessScore
from eval_framework.metrics.loglikelihood.ternary import TernaryScore
from eval_framework.tasks.base import BaseTask, Language, ResponseType
from eval_framework.tasks.utils import get_n_letters
[docs]
class ARC(BaseTask[str]):
"""ARC dataset: https://huggingface.co/datasets/allenai/ai2_arc"""
NAME = "ARC"
DATASET_PATH = "ai2_arc"
SAMPLE_SPLIT = "test"
FEWSHOT_SPLIT = "train"
RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood]
SUBJECTS = ["ARC-Easy", "ARC-Challenge"]
PERTURBATION_UNMODIFIABLE_WORDS = ["Question"] + get_n_letters(5)
LANGUAGE = Language.ENG
def __init__(self, num_fewshot: int = 0) -> None:
super().__init__(num_fewshot)
self.keys = get_n_letters(5) # needs to be 5 because there is one sample with 5 answer possibilities
self.num_to_letter = {str(i): letter for i, letter in enumerate(self.keys, start=1)}
def _get_instruction_text(self, item: dict[str, Any]) -> str:
return f"Question: {item['question']}\n"
def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
ground_truth = self._get_ground_truth(item)
assert ground_truth is not None
return f"{self._get_cue_text(item)}{ground_truth}"
def _get_cue_text(self, item: dict[str, Any]) -> str:
return "Answer:"
def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
answer_key = self.num_to_letter.get(item["answerKey"], item["answerKey"])
return f" {item['choices']['text'][self.keys.index(answer_key)]}"
def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
return [f" {choice}" for choice in item["choices"]["text"]]
[docs]
class ARC_IDK(ARC):
NAME = "ARC_IDK"
METRICS = [
AccuracyLoglikelihood,
AccuracyNormLoglikelihood,
ConfidenceWeightedAccuracy,
DistributionalCorrectnessScore,
TernaryScore,
]
def _get_initial_prompt_text(self, item: dict[str, Any]) -> str:
return (
"Answer only if you are confident, since mistakes may be penalised, while correct answers receive points. "
"It is acceptable to answer with 'I do not know' if you are unsure, and you will receive 0 points."
)
def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
completions = super()._get_possible_completions(item)
return (completions or []) + [" I do not know."]