Source code for eval_framework.tasks.benchmarks.include

from typing import Any

from eval_framework.metrics.loglikelihood.accuracy_loglikelihood import (
    AccuracyLoglikelihood,
    AccuracyNormLoglikelihood,
)
from eval_framework.tasks.base import BaseTask, Language, ResponseType
from eval_framework.tasks.utils import get_n_letters

INCLUDE_SUBJECTS = [
    "Albanian",
    "Arabic",
    "Armenian",
    "Azerbaijani",
    "Basque",
    "Belarusian",
    "Bengali",
    "Bulgarian",
    "Chinese",
    "Croatian",
    "Dutch",
    "Estonian",
    "Finnish",
    "French",
    "Georgian",
    "German",
    "Greek",
    "Hebrew",
    "Hindi",
    "Hungarian",
    "Indonesian",
    "Italian",
    "Japanese",
    "Kazakh",
    "Korean",
    "Lithuanian",
    "Malay",
    "Malayalam",
    "Nepali",
    "North Macedonian",
    "Persian",
    "Polish",
    "Portuguese",
    "Russian",
    "Serbian",
    "Spanish",
    "Tagalog",
    "Tamil",
    "Telugu",
    "Turkish",
    "Ukrainian",
    "Urdu",
    "Uzbek",
    "Vietnamese",
]


def subject_to_language(subject: str) -> Language:
    # Map INCLUDE subject names to Language members where the names differ;
    # all remaining subjects resolve directly via by-value lookup.
    if subject == "Greek":
        return Language.ELL  # type: ignore[attr-defined]
    elif subject == "Malay":
        return Language.MSA  # type: ignore[attr-defined]
    elif subject == "Nepali":
        return Language.NEP  # type: ignore[attr-defined]
    elif subject == "North Macedonian":
        return Language.MKD  # type: ignore[attr-defined]
    elif subject == "Croatian":
        return Language.HRV  # type: ignore[attr-defined]
    elif subject == "Serbian":
        return Language.SRP  # type: ignore[attr-defined]
    else:
        return Language(subject)


class INCLUDE(BaseTask[str]):
    """INCLUDE dataset: https://huggingface.co/datasets/CohereLabs/include-base-44"""

    NAME = "INCLUDE"
    DATASET_PATH = "CohereLabs/include-base-44"
    SAMPLE_SPLIT = "test"
    FEWSHOT_SPLIT = "validation"
    RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
    METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood]
    SUBJECTS = INCLUDE_SUBJECTS
    LANGUAGE = {lang: subject_to_language(lang) for lang in INCLUDE_SUBJECTS}

    def __init__(self, num_fewshot: int = 0) -> None:
        super().__init__(num_fewshot)
        self.keys = get_n_letters(4)

    def _get_initial_prompt_text(self, item: dict[str, Any]) -> str:
        return f"The following are multiple choice questions (with answers) in {item['language']}."  # noqa: E501

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        question = item["question"].strip()
        choices = "".join(
            [
                f"{key}. {choice}\n"
                for key, choice in zip(
                    self.keys,
                    [item["option_a"], item["option_b"], item["option_c"], item["option_d"]],
                )
            ]
        )
        return f"Question: {question}\n{choices}"

    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
        ground_truth = self._get_ground_truth(item)
        assert ground_truth is not None
        return f"{self._get_cue_text(item)}{ground_truth}"

    def _get_cue_text(self, item: dict[str, Any]) -> str:
        return "Answer:"

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        return f" {self.keys[item['answer']]}"

    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
        return [f" {key}" for key in self.keys]
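

# A minimal usage sketch (illustrative, not part of the module): it exercises only
# names defined above and assumes Language is a by-value enum, as the
# Language(subject) fallback in subject_to_language implies.
if __name__ == "__main__":
    # Explicit overrides map to dedicated Language members...
    assert subject_to_language("Greek") is Language.ELL  # type: ignore[attr-defined]
    # ...while every other subject resolves through by-value lookup.
    assert subject_to_language("German") == Language("German")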