Source code for eval_framework.tasks.benchmarks.quality

import random
from typing import Any

from eval_framework.metrics.loglikelihood.accuracy_loglikelihood import (
    AccuracyLoglikelihood,
    AccuracyNormLoglikelihood,
)
from eval_framework.tasks.base import RANDOM_SEED, BaseTask, Language, ResponseType, SubjectType


class QUALITY(BaseTask[str]):
    NAME = "QuALITY"
    DATASET_PATH = "emozilla/quality"
    SAMPLE_SPLIT = "validation"
    FEWSHOT_SPLIT = "validation"
    RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
    METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood]
    SUBJECTS = ["hard", "easy"]
    PERTURBATION_UNMODIFIABLE_WORDS = ["Article", "Question", "Answer"]
    LANGUAGE = Language.ENG

    def __init__(self, num_fewshot: int = 0) -> None:
        assert num_fewshot == 0, "QuALITY only supports zero fewshot examples"
        super().__init__(num_fewshot)

    def _load_dataset(self, subject: SubjectType) -> None:
        hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH)
        self.dataset = {}
        self.rnd = random.Random(RANDOM_SEED)
        for split, data in hf_dataset.items():
            if split not in [self.SAMPLE_SPLIT, self.FEWSHOT_SPLIT]:
                continue
            # Keep only items matching the requested difficulty subject ("hard" or "easy").
            data_list = [item for item in data if item["hard"] == (subject == "hard")]
            if split == self.SAMPLE_SPLIT:
                # Shuffle evaluation samples deterministically with the framework seed.
                self.rnd.shuffle(data_list)
            self.dataset[split] = data_list

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        article = item["article"]
        question = item["question"]
        return f"Article: {article}\nQuestion: {question}\n"

    def _get_cue_text(self, item: dict[str, Any]) -> str:
        return "Answer:"

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        return f" {item['options'][item['answer']]}"

    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
        return [f" {option}" for option in item["options"]]
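

# --- Usage sketch (not part of the original module) --------------------------
# A minimal illustration of how a prompt and its candidate completions are
# assembled from a QuALITY item. It assumes that `_load_dataset` can be called
# directly (in practice the BaseTask machinery typically drives it) and that
# the Hugging Face dataset "emozilla/quality" is reachable.
if __name__ == "__main__":
    task = QUALITY(num_fewshot=0)
    task._load_dataset("hard")  # keep only items flagged as hard

    item = task.dataset[QUALITY.SAMPLE_SPLIT][0]  # first shuffled validation item
    prompt = task._get_instruction_text(item) + task._get_cue_text(item)
    completions = task._get_possible_completions(item)  # " <option>" strings scored via loglikelihoods
    target = task._get_ground_truth(item)  # the correct " <option>" string

    print(prompt[:200])
    print(f"{len(completions or [])} candidate answers; ground truth: {target!r}")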