# Source code for eval_framework.tasks.benchmarks.flores200

import logging
import os
import random
from pathlib import Path
from typing import Any

import pycountry
from datasets import DatasetDict, DownloadConfig, load_dataset
from huggingface_hub import HfApi
from huggingface_hub.errors import RevisionNotFoundError

from eval_framework.metrics.completion.bleu import BLEU
from eval_framework.tasks.base import RANDOM_SEED, BaseTask, Language, ResponseType, Sample, SubjectType

# FLORES-200 language codes (ISO 639-3 language + script) used by this task.
# Note: there are many more languages in the dataset, but we only consider these for now.
FLORES_LANGUAGES = ["deu_Latn", "eng_Latn", "fin_Latn", "fra_Latn", "nld_Latn"]


class Flores200(BaseTask[str]):
    """FLORES-200 dataset: https://huggingface.co/datasets/facebook/flores

    Translation task: every subject is a ``"<source>-<target>"`` pair of distinct
    codes from ``FLORES_LANGUAGES``. The prompt shows the source sentence and asks
    for the target-language sentence; completions are scored with BLEU.
    """

    NAME = "FLoRes-200"
    DATASET_PATH = "facebook/flores"
    HF_REVISION = "fd7d8f42fccb9dbc35830053a8c705a2627124ce"
    SAMPLE_SPLIT = "devtest"
    FEWSHOT_SPLIT = "dev"
    RESPONSE_TYPE = ResponseType.COMPLETION
    METRICS = [BLEU]
    SUBJECTS = [f"{s}-{t}" for s in FLORES_LANGUAGES for t in FLORES_LANGUAGES if s != t]
    PERTURBATION_UNMODIFIABLE_WORDS = ["sentence"]
    LANGUAGE = {
        "deu_Latn": Language.DEU,
        "eng_Latn": Language.ENG,
        "fin_Latn": Language.FIN,
        "fra_Latn": Language.FRA,
        "nld_Latn": Language.NLD,
    }

    def __init__(self, num_fewshot: int = 0) -> None:
        """Create the task with ``num_fewshot`` in-context examples per prompt."""
        super().__init__(num_fewshot)
        # The prompt puts one sentence per line, so a newline is the stop sequence.
        self.stop_sequences = ["\n"]

    def _load_hf_dataset_for_subject(self, subject: SubjectType) -> DatasetDict:
        """Load the FLORES-200 parquet files for a specific language pair.

        The datasets library removed support for loading scripts, so the parquet
        files are loaded directly from ``https://huggingface.co/datasets/<repo>/resolve/<revision>/``
        URLs pinned to ``HF_REVISION`` (falling back to ``main`` when no revision is set).

        Args:
            subject: Language pair, used as the directory name inside the dataset repo.

        Returns:
            A ``DatasetDict`` keyed by ``FEWSHOT_SPLIT`` and ``SAMPLE_SPLIT``.

        Raises:
            RevisionNotFoundError: If ``HF_REVISION`` is set but does not exist in the repo.
        """
        # Check if the HF_REVISION is valid before loading the dataset
        if self.HF_REVISION:
            try:
                HfApi().dataset_info(repo_id=self.DATASET_PATH, revision=self.HF_REVISION, timeout=100.0)
            except RevisionNotFoundError:
                raise
            except Exception as err:
                # Best effort only: any other failure of this pre-check is not fatal,
                # the actual load below decides whether the data is available.
                logging.getLogger(__name__).warning(
                    "Could not verify revision %s of %s: %s", self.HF_REVISION, self.DATASET_PATH, err
                )

        cache_dir: str = os.environ.get("HF_DATASET_CACHE_DIR", f"{Path.home()}/.cache/huggingface/datasets")
        download_config = DownloadConfig(cache_dir=cache_dir, max_retries=5)

        # Reference for loading parquet files: https://huggingface.co/docs/datasets/en/loading#parquet
        revision = self.HF_REVISION or "main"
        base_uri = f"https://huggingface.co/datasets/{self.DATASET_PATH}/resolve/{revision}/{subject}"
        data_files = {
            self.FEWSHOT_SPLIT: f"{base_uri}/{self.FEWSHOT_SPLIT}.parquet",
            self.SAMPLE_SPLIT: f"{base_uri}/{self.SAMPLE_SPLIT}.parquet",
        }

        return load_dataset(
            "parquet",
            data_files=data_files,
            cache_dir=cache_dir,
            download_config=download_config,
        )

    def _load_dataset(self, subject: SubjectType) -> None:
        """Load both splits for ``subject`` into ``self.dataset``.

        Every row is tagged with the subject so the prompt-building methods can
        recover the source and target language from the item alone. The sample
        split is shuffled with a generator seeded by ``RANDOM_SEED``; the
        few-shot split keeps its original order.
        """
        # Store the subject (language pair) for use in other methods
        self.subject = subject

        hf_dataset = self._load_hf_dataset_for_subject(subject)
        self.dataset = {}
        self.rnd = random.Random(RANDOM_SEED)

        for split, data in hf_dataset.items():
            # Add the subject to each item so _get_instruction_text can use it
            rows = [{**row, "subject": subject} for row in data]

            if split == self.SAMPLE_SPLIT:
                self.rnd.shuffle(rows)
                self.dataset[split] = rows
            elif split == self.FEWSHOT_SPLIT:
                self.dataset[split] = rows

    @staticmethod
    def _language_name(flores_code: str) -> str:
        """Return the pycountry language name for a FLORES code such as ``deu_Latn``."""
        # The part before the underscore is the three-letter code passed to pycountry.
        return pycountry.languages.get(alpha_3=flores_code.split("_")[0]).name

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Build the prompt: the source sentence followed by the target-language cue."""
        pair = item["subject"].split("-")
        source_key, target_key = pair[0], pair[1]

        source = item[f"sentence_{source_key}"]
        instruction = f"{self._language_name(source_key)} sentence: {source}\n"
        return f"{instruction}{self._language_name(target_key)} sentence:"

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        """Return the reference sentence in the target language of the item's subject."""
        target_key = item["subject"].split("-")[1]
        return item[f"sentence_{target_key}"]

    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
        """Return the reference translation, prefixed with a space, for few-shot examples."""
        ground_truth = self._get_ground_truth(item)
        # Validate before formatting: an f-string would turn None into the text "None".
        assert ground_truth is not None
        assert isinstance(ground_truth, str)
        return f" {ground_truth}"

    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
        """Strip leading and trailing whitespace from the completion; ``sample`` is unused."""
        return completion_text.strip()