Source code for eval_framework.tasks.benchmarks.csqa

from typing import Any

from eval_framework.metrics.loglikelihood.accuracy_loglikelihood import (
    AccuracyLoglikelihood,
    AccuracyNormLoglikelihood,
)
from eval_framework.metrics.loglikelihood.bits_per_byte import BitsPerByteLoglikelihood
from eval_framework.tasks.base import NO_SUBJECT, BaseTask, Language, ResponseType
from eval_framework.tasks.utils import get_n_letters



[docs]
class CommonsenseQACloze(BaseTask[str]):
    """CommonsenseQA dataset: https://huggingface.co/datasets/tau/commonsense_qa"""

    NAME = "CommonsenseQACloze"
    DATASET_PATH = "tau/commonsense_qa"
    SAMPLE_SPLIT = "validation"
    FEWSHOT_SPLIT = "validation"
    RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
    METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood, BitsPerByteLoglikelihood]
    SUBJECTS = [NO_SUBJECT]
    PERTURBATION_UNMODIFIABLE_WORDS = ["Question"]
    LANGUAGE = Language.ENG

    def __init__(self, num_fewshot: int = 0) -> None:
        super().__init__(num_fewshot)
        self.keys = get_n_letters(5)

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        return f"Question: {item['question']}\n"

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        correct_label = item["answerKey"]
        correct_index = self.keys.index(correct_label)
        return f" {self.keys[correct_index]}"

    def _get_cue_text(self, item: dict[str, Any]) -> str:
        return "Answer:"

    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
        return [f" {choice}" for choice in item["choices"]["text"]]

    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
        ground_truth = self._get_ground_truth(item)
        assert ground_truth is not None
        return f"{self._get_cue_text(item)}{ground_truth}"




[docs]
class CommonsenseQAFullTextCloze(CommonsenseQACloze):
    """
    CommonsenseQA cloze with full answer text as ground truth (not just the letter).
    Scores loglikelihood over the full correct choice text; includes bits-per-byte.
    """

    NAME = "CommonsenseQAFullTextCloze"
    METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood, BitsPerByteLoglikelihood]

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        correct_label = item["answerKey"]
        correct_index = self.keys.index(correct_label)
        return f" {item['choices']['text'][correct_index]}"




[docs]
class CommonsenseQAMC(CommonsenseQACloze):
    """Multiple-choice variant of CommonsenseQA where the model selects a letter (A-E)."""

    NAME = "CommonsenseQAMC"

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        question = item["question"]
        texts = item["choices"]["text"]
        options = "\n".join(f" {key}. {choice}" for key, choice in zip(self.keys, texts))
        return f"Question: {question}\n{options}\n"

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        correct_label = item["answerKey"]
        return f" {correct_label}"

    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
        labels = item["choices"]["label"]
        return [f" {label}" for label in labels]




[docs]
class CommonsenseQAMC_OLMES(CommonsenseQAMC):
    """
    CommonsenseQA MC with OLMES-style prompt: space before each label in the prompt (" A.", " B.", ...).
    """

    NAME = "CommonsenseQAMC_OLMES"
    SAMPLE_SPLIT = "train"  # Use train split (largest) to best match OLMES, which evaluates all splits
    FEWSHOT_SPLIT = "train"

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        question = item["question"]
        texts = item["choices"]["text"]
        options = "\n".join(f" {key}. {choice}" for key, choice in zip(self.keys, texts))
        return f"Question: {question}\n{options}\n"