Source code for eval_framework.tasks.benchmarks.copa

from typing import Any

from eval_framework.metrics.loglikelihood.accuracy_loglikelihood import (
    AccuracyLoglikelihood,
    AccuracyNormLoglikelihood,
)
from eval_framework.metrics.loglikelihood.confidence_weighted_accuracy import ConfidenceWeightedAccuracy
from eval_framework.metrics.loglikelihood.dcs import DistributionalCorrectnessScore
from eval_framework.metrics.loglikelihood.ternary import TernaryScore
from eval_framework.tasks.base import BaseTask, Language, ResponseType
from eval_framework.tasks.utils import get_n_letters


class COPAEvalHarness(BaseTask[str]):
    """COPA task scored via loglikelihoods over the two candidate continuations.

    Dataset: https://huggingface.co/datasets/aps/super_glue

    Evaluation samples are drawn from the validation split (the same split
    lm-eval-harness uses); few-shot examples come from the test split.
    """

    NAME = "COPAEvalHarness"
    DATASET_PATH = "aps/super_glue"
    SAMPLE_SPLIT = "validation"  # 100 examples (same split as lm-eval)
    FEWSHOT_SPLIT = "test"  # 500 examples
    RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
    METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood]
    SUBJECTS = ["copa"]
    PERTURBATION_UNMODIFIABLE_WORDS = ["because", "therefore"]
    LANGUAGE = Language.ENG

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Build the prompt stem: the premise followed by a connector word.

        The connector is "because" for ``cause`` questions and "therefore"
        for ``effect`` questions; any other question value raises ``KeyError``.
        The returned text ends with a single space so that a completion can
        be appended directly.
        """
        connector_by_question = {"cause": "because", "effect": "therefore"}
        link_word = connector_by_question[item["question"]]
        # Drop the last character of the stripped premise (presumably its
        # closing period -- verify against the dataset) so the sentence can
        # continue with the connector.
        stem = item["premise"].strip()[:-1]
        return f"{stem} {link_word} "

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        """Return the converted text of the labelled choice.

        Label ``0`` selects ``choice1``; every other label selects ``choice2``.
        """
        if item["label"] == 0:
            answer = item["choice1"]
        else:
            answer = item["choice2"]
        return f"{self.convert_choice(answer)}"

    def convert_choice(self, choice: str) -> str:
        """Return ``choice`` with its first character lower-cased.

        Raises ``IndexError`` when ``choice`` is empty.
        """
        first, remainder = choice[0], choice[1:]
        return first.lower() + remainder

    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
        """Return both converted choices, ``choice1`` first and ``choice2`` second."""
        return [self.convert_choice(item[key]) for key in ("choice1", "choice2")]
class COPA_OLMES(COPAEvalHarness):
    """COPA as a lettered multiple-choice task (OLMES/oe_eval style).

    The prompt shows the premise plus connector, followed by one option per
    line with space-prefixed labels (" A.", " B."). Loglikelihoods are taken
    over the bare labels " A" / " B".
    """

    NAME = "COPA_OLMES"

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Build the prompt: premise + connector, then the lettered options.

        Layout (each line terminated by a newline)::

            <premise without its last character> <connector>
             A. <choice1>
             B. <choice2>
        """
        connector_by_question = {"cause": "because", "effect": "therefore"}
        link_word = connector_by_question[item["question"]]
        # Same premise trimming as the parent task: strip whitespace, then
        # drop the final character before appending the connector.
        stem = item["premise"].strip()[:-1]
        answers = [self.convert_choice(item[key]) for key in ("choice1", "choice2")]
        letters = get_n_letters(len(answers))
        option_lines = [f" {letter}. {answer}" for letter, answer in zip(letters, answers)]
        return f"{stem} {link_word}\n" + "\n".join(option_lines) + "\n"

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        """Return the space-prefixed letter at the index given by the item's label."""
        letter = get_n_letters(2)[item["label"]]
        return f" {letter}"

    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
        """Return the two space-prefixed answer letters."""
        return [f" {letter}" for letter in get_n_letters(2)]
class COPA(COPAEvalHarness):
    """COPA with the splits swapped relative to ``COPAEvalHarness``.

    Unlike the original COPA task, this version evaluates on the test split
    and takes few-shot examples from the validation split. The test split
    labels were previously unavailable in the original dataset; they are now
    accessible, which makes this configuration possible.
    """

    NAME = "COPA"
    SAMPLE_SPLIT = "test"  # 500 examples
    FEWSHOT_SPLIT = "validation"  # 100 examples
class COPA_IDKEvalHarness(COPAEvalHarness):
    """COPA variant that offers an explicit "I do not know" completion.

    Uses the parent's splits and prompt stem, supplies an initial prompt text
    describing the scoring rules, appends an abstention option to the
    candidate completions, and reports three additional metrics.
    """

    NAME = "COPA_IDKEvalHarness"
    METRICS = [
        AccuracyLoglikelihood,
        AccuracyNormLoglikelihood,
        ConfidenceWeightedAccuracy,
        DistributionalCorrectnessScore,
        TernaryScore,
    ]

    def _get_initial_prompt_text(self, item: dict[str, Any]) -> str:
        """Return the fixed instruction explaining that abstaining scores 0 points."""
        abstention_notice = (
            "Complete the sentence only if you are confident, since mistakes may be penalised, while correct "
            "answers receive points. It is acceptable to answer with 'I do not know' if you are unsure, and "
            "you will receive 0 points."
        )
        return abstention_notice

    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
        """Return the parent's completions followed by the abstention option."""
        candidates = super()._get_possible_completions(item)
        # The parent is typed as possibly returning None; treat any falsy
        # result as "no candidates" so the abstention option is still offered.
        if not candidates:
            candidates = []
        return candidates + [" I do not know."]
class COPA_IDK(COPA_IDKEvalHarness):
    """``COPA_IDKEvalHarness`` evaluated on the test split.

    Few-shot examples are taken from the validation split.
    """

    NAME = "COPA_IDK"
    SAMPLE_SPLIT = "test"  # 500 examples
    FEWSHOT_SPLIT = "validation"  # 100 examples