# Source code for eval_framework.tasks.benchmarks.csqa

from typing import Any

from eval_framework.metrics.loglikelihood.accuracy_loglikelihood import (
    AccuracyLoglikelihood,
    AccuracyNormLoglikelihood,
)
from eval_framework.metrics.loglikelihood.bits_per_byte import BitsPerByteLoglikelihood
from eval_framework.tasks.base import NO_SUBJECT, BaseTask, Language, ResponseType
from eval_framework.tasks.utils import get_n_letters


class CommonsenseQACloze(BaseTask[str]):
    """Cloze-style CommonsenseQA task evaluated with loglikelihood responses.

    Dataset: https://huggingface.co/datasets/tau/commonsense_qa

    The prompt is the question followed by an ``Answer:`` cue. The candidate
    completions are the answer texts of the item, each prefixed with a space.
    """

    NAME = "CommonsenseQACloze"
    DATASET_PATH = "tau/commonsense_qa"
    SAMPLE_SPLIT = "validation"
    FEWSHOT_SPLIT = "validation"
    RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
    METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood]
    SUBJECTS = [NO_SUBJECT]
    PERTURBATION_UNMODIFIABLE_WORDS = ["Question"]
    LANGUAGE = Language.ENG

    def __init__(self, num_fewshot: int = 0) -> None:
        """Initialise the task and prepare the five answer labels.

        Args:
            num_fewshot: Number of few-shot examples, forwarded to the base class.
        """
        super().__init__(num_fewshot)
        self.keys = get_n_letters(5)

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return the question line of the prompt, terminated by a newline."""
        question = item["question"]
        return f"Question: {question}\n"

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        """Return the answer label of ``item`` prefixed with a single space.

        The label is looked up in ``self.keys`` first, so a label that is not
        one of the known keys makes the lookup fail instead of being returned.
        """
        # NOTE(review): this yields the label, whereas
        # _get_possible_completions yields answer texts - confirm that the
        # metrics are meant to compare these two.
        answer_key = item["answerKey"]
        position = self.keys.index(answer_key)
        return f" {self.keys[position]}"

    def _get_cue_text(self, item: dict[str, Any]) -> str:
        """Return the fixed cue that precedes the answer in the prompt."""
        return "Answer:"

    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
        """Return every answer text of ``item``, each prefixed with a space."""
        answer_texts = item["choices"]["text"]
        return [f" {text}" for text in answer_texts]

    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
        """Return the cue immediately followed by the ground truth."""
        answer = self._get_ground_truth(item)
        # Narrows the ``str | None`` return type for the concatenation below.
        assert answer is not None
        cue = self._get_cue_text(item)
        return f"{cue}{answer}"


class CommonsenseQAFullTextCloze(CommonsenseQACloze):
    """CommonsenseQA cloze variant whose ground truth is the full answer text.

    Unlike the parent task, the ground truth is the text of the correct choice
    rather than its letter. The metric list additionally contains
    bits-per-byte.
    """

    NAME = "CommonsenseQAFullTextCloze"
    METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood, BitsPerByteLoglikelihood]

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        """Return the text of the correct choice, prefixed with a space.

        The position of the correct choice is the position of the item's
        ``answerKey`` within ``self.keys``.
        """
        answer_key = item["answerKey"]
        position = self.keys.index(answer_key)
        answer_texts = item["choices"]["text"]
        return f" {answer_texts[position]}"


class CommonsenseQAMC(CommonsenseQACloze):
    """Multiple-choice CommonsenseQA: the model picks one of the letters A-E.

    The answer options are listed in the prompt, and both the ground truth and
    the candidate completions are letter labels.
    """

    NAME = "CommonsenseQAMC"

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return the question followed by one `` <key>. <text>`` line per option."""
        stem = item["question"]
        answer_texts = item["choices"]["text"]
        # Keys and texts are paired positionally; zip stops at the shorter one.
        option_lines = [
            f" {letter}. {text}" for letter, text in zip(self.keys, answer_texts)
        ]
        rendered_options = "\n".join(option_lines)
        return f"Question: {stem}\n{rendered_options}\n"

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        """Return the item's ``answerKey`` prefixed with a space."""
        answer_key = item["answerKey"]
        return f" {answer_key}"

    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
        """Return every choice label of ``item``, each prefixed with a space."""
        return [f" {choice_label}" for choice_label in item["choices"]["label"]]


class CommonsenseQAMC_OLMES(CommonsenseQAMC):
    """CommonsenseQA multiple choice using an OLMES-style prompt.

    Each option label in the prompt is preceded by a space
    (" A.", " B.", ...).
    """

    NAME = "CommonsenseQAMC_OLMES"
    # Use train split (largest) to best match OLMES, which evaluates all splits
    SAMPLE_SPLIT = "train"
    FEWSHOT_SPLIT = "train"

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return the question followed by the space-prefixed option lines."""
        # NOTE(review): this renders the same text as the parent's
        # implementation; presumably kept explicit so the OLMES layout stays
        # pinned if the parent prompt changes - confirm.
        stem = item["question"]
        answer_texts = item["choices"]["text"]
        option_lines = []
        for letter, text in zip(self.keys, answer_texts):
            option_lines.append(f" {letter}. {text}")
        rendered_options = "\n".join(option_lines)
        return f"Question: {stem}\n{rendered_options}\n"