Source code for eval_framework.tasks.benchmarks.winogrande

from typing import Any

from eval_framework.metrics.loglikelihood.accuracy_loglikelihood import (
    AccuracyLoglikelihood,
    AccuracyNormLoglikelihood,
)
from eval_framework.metrics.loglikelihood.confidence_weighted_accuracy import ConfidenceWeightedAccuracy
from eval_framework.metrics.loglikelihood.dcs import DistributionalCorrectnessScore
from eval_framework.metrics.loglikelihood.ternary import TernaryScore
from eval_framework.tasks.base import BaseTask, Language, ResponseType

ANSWER_STR_TO_NUM = {"1": 0, "2": 1}


[docs] class WINOGRANDE(BaseTask[str]): """WINOGRANDE dataset: https://huggingface.co/datasets/winogrande""" NAME = "Winogrande" DATASET_PATH = "winogrande" SAMPLE_SPLIT = "validation" FEWSHOT_SPLIT = "train" RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood] SUBJECTS = ["winogrande_xl"] PERTURBATION_UNMODIFIABLE_WORDS = ["1", "2"] LANGUAGE = Language.ENG def _extract_question(self, item: dict) -> str: question, _ = item["sentence"].split("_") question = question.replace(" ", " ") return question.strip() def _extract_choices(self, item: dict) -> list[str]: _, choice_suffix = item["sentence"].split("_") choice_suffix = choice_suffix.replace(" ", " ") choices = [choice + choice_suffix for choice in [item["option1"], item["option2"]]] return choices def _get_instruction_text(self, item: dict[str, Any]) -> str: return f"{self._extract_question(item)}" def _get_ground_truth(self, item: dict[str, Any]) -> str | None: choices = self._extract_choices(item) return f" {choices[ANSWER_STR_TO_NUM[item['answer']]]}" def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None: return [f" {choice}" for choice in self._extract_choices(item)]
[docs] class WINOGRANDE_IDK(WINOGRANDE): NAME = "Winogrande_IDK" METRICS = [ AccuracyLoglikelihood, AccuracyNormLoglikelihood, ConfidenceWeightedAccuracy, DistributionalCorrectnessScore, TernaryScore, ] def _get_initial_prompt_text(self, item: dict[str, Any]) -> str: return ( "Complete the sentence only if you are confident, since mistakes may be penalised, while correct " "answers receive points. It is acceptable to answer with 'I do not know' if you are unsure, and " "you will receive 0 points." ) def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None: completions = super()._get_possible_completions(item) return (completions or []) + [" I do not know."]