Source code for eval_framework.tasks.benchmarks.belebele

from typing import Any

from eval_framework.metrics.loglikelihood.accuracy_loglikelihood import (
    AccuracyLoglikelihood,
    AccuracyNormLoglikelihood,
)
from eval_framework.tasks.base import BaseTask, Language, ResponseType
from eval_framework.tasks.utils import get_n_letters


class BELEBELE(BaseTask[str]):
    """BELEBELE dataset: https://huggingface.co/datasets/facebook/belebele"""

    NAME = "BELEBELE"
    DATASET_PATH = "facebook/belebele"
    SAMPLE_SPLIT = "test"
    FEWSHOT_SPLIT = "test"
    RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
    METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood]
    SUBJECTS = [
        "eng_Latn",
    ]
    PERTURBATION_UNMODIFIABLE_WORDS = ["Question", "Answer"] + get_n_letters(4)
    LANGUAGE = Language.ENG

    def __init__(self, num_fewshot: int = 0) -> None:
        super().__init__(num_fewshot)
        self.keys = get_n_letters(4)
        self.num_to_letter = {str(i): letter for i, letter in enumerate(self.keys, start=1)}

    def _get_initial_prompt_text(self, item: dict[str, Any]) -> str:
        return "The following are multiple choice questions (with answers)."

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        context = item["flores_passage"].strip()
        question = item["question"].strip()
        choices = "".join(
            [
                f"{key}. {choice}\n"
                for key, choice in zip(
                    self.keys,
                    [item["mc_answer1"], item["mc_answer2"], item["mc_answer3"], item["mc_answer4"]],
                )
            ]
        )
        return f"{context}\n\nQuestion: {question}\n{choices}"

    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
        ground_truth = self._get_ground_truth(item)
        assert ground_truth is not None
        return f"{self._get_cue_text(item)}{ground_truth}"

    def _get_cue_text(self, item: dict[str, Any]) -> str:
        return "Answer:"

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        return f" {self.keys[int(item['correct_answer_num']) - 1]}"

    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
        return [f" {key}" for key in self.keys]
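
For illustration, the following is a minimal standalone sketch of the prompt that _get_instruction_text assembles for one sample. The item dict below is hypothetical (real samples come from the facebook/belebele dataset but carry the same field names used above), and it assumes get_n_letters(4) returns the letters "A" through "D", as the num_to_letter mapping suggests. It reproduces the formatting logic directly rather than instantiating the task, which would require loading the dataset.

# Hypothetical sample; field names match those read by _get_instruction_text.
item = {
    "flores_passage": "The Nile is the longest river in Africa.",
    "question": "On which continent is the Nile located?",
    "mc_answer1": "Africa",
    "mc_answer2": "Asia",
    "mc_answer3": "Europe",
    "mc_answer4": "South America",
    "correct_answer_num": "1",
}

keys = ["A", "B", "C", "D"]  # assumed output of get_n_letters(4)

# Mirror of the choices/prompt construction in _get_instruction_text.
choices = "".join(
    f"{key}. {choice}\n"
    for key, choice in zip(keys, [item[f"mc_answer{i}"] for i in range(1, 5)])
)
prompt = f"{item['flores_passage'].strip()}\n\nQuestion: {item['question'].strip()}\n{choices}"
print(prompt)
# The Nile is the longest river in Africa.
#
# Question: On which continent is the Nile located?
# A. Africa
# B. Asia
# C. Europe
# D. South America

At scoring time, _get_cue_text appends "Answer:" and the model's loglikelihoods over the completions " A" through " D" (from _get_possible_completions) are compared against the ground truth " A".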