Source code for eval_framework.tasks.benchmarks.flores200
import os
import random
from pathlib import Path
from typing import Any
import pycountry
from datasets import DatasetDict, DownloadConfig, load_dataset
from huggingface_hub import HfApi
from huggingface_hub.errors import RevisionNotFoundError
from eval_framework.metrics.completion.bleu import BLEU
from eval_framework.tasks.base import RANDOM_SEED, BaseTask, Language, ResponseType, Sample, SubjectType
# FLORES language codes evaluated by this task.
# Note: there are many more languages in the dataset, but we only consider
# these for now.
FLORES_LANGUAGES: list[str] = ["deu_Latn", "eng_Latn", "fin_Latn", "fra_Latn", "nld_Latn"]
[docs]
class Flores200(BaseTask[str]):
    """FLORES-200 dataset: https://huggingface.co/datasets/facebook/flores

    Translation task over ordered pairs of the languages listed in
    ``FLORES_LANGUAGES``. Each subject is a ``"<source>-<target>"`` string of
    FLORES language codes (e.g. ``"deu_Latn-eng_Latn"``); the model is shown
    the source sentence and must complete the target sentence. Completions
    are scored with BLEU.
    """

    NAME = "FLoRes-200"
    DATASET_PATH = "facebook/flores"
    HF_REVISION = "fd7d8f42fccb9dbc35830053a8c705a2627124ce"
    SAMPLE_SPLIT = "devtest"
    FEWSHOT_SPLIT = "dev"
    RESPONSE_TYPE = ResponseType.COMPLETION
    METRICS = [BLEU]
    # Every ordered (source, target) pair of distinct languages.
    SUBJECTS = [f"{s}-{t}" for s in FLORES_LANGUAGES for t in FLORES_LANGUAGES if s != t]
    PERTURBATION_UNMODIFIABLE_WORDS = ["sentence"]
    # Maps each FLORES language code to the framework's Language enum member.
    LANGUAGE = {
        "deu_Latn": Language.DEU,
        "eng_Latn": Language.ENG,
        "fin_Latn": Language.FIN,
        "fra_Latn": Language.FRA,
        "nld_Latn": Language.NLD,
    }

    def __init__(self, num_fewshot: int = 0) -> None:
        """Initialise the task and stop generation at the first newline."""
        super().__init__(num_fewshot)
        # The prompt asks for a single sentence on one line.
        self.stop_sequences = ["\n"]

    def _load_hf_dataset_for_subject(self, subject: SubjectType) -> DatasetDict:
        """Load FLORES-200 parquet files for a specific language pair.

        The datasets library removed support for loading scripts, so we load
        the parquet files directly via ``https://huggingface.co/.../resolve/``
        URLs pinned to the specific revision.

        Args:
            subject: Language pair, used as the directory name of the parquet
                files inside the dataset repository.

        Returns:
            A ``DatasetDict`` holding the few-shot and the sample split.

        Raises:
            RevisionNotFoundError: If ``HF_REVISION`` does not exist in the
                dataset repository.
        """
        # Check if the HF_REVISION is valid before loading the dataset
        if self.HF_REVISION:
            try:
                HfApi().dataset_info(repo_id=self.DATASET_PATH, revision=self.HF_REVISION, timeout=100.0)
            except RevisionNotFoundError:
                raise
            except Exception:
                # Best effort: any other failure of this pre-check is ignored
                # and loading is still attempted below (presumably so that
                # transient Hub errors do not block cached runs - confirm).
                pass

        cache_dir: str = os.environ.get("HF_DATASET_CACHE_DIR", f"{Path.home()}/.cache/huggingface/datasets")
        download_config = DownloadConfig(cache_dir=cache_dir, max_retries=5)

        # Reference for loading parquet files: https://huggingface.co/docs/datasets/en/loading#parquet
        base_uri = f"https://huggingface.co/datasets/{self.DATASET_PATH}/resolve/{self.HF_REVISION}/{subject}"
        data_files = {
            self.FEWSHOT_SPLIT: f"{base_uri}/{self.FEWSHOT_SPLIT}.parquet",
            self.SAMPLE_SPLIT: f"{base_uri}/{self.SAMPLE_SPLIT}.parquet",
        }
        return load_dataset(
            "parquet",
            data_files=data_files,
            cache_dir=cache_dir,
            download_config=download_config,
        )

    def _load_dataset(self, subject: SubjectType) -> None:
        """Load both splits for ``subject`` into ``self.dataset``.

        Every row is tagged with the subject, and the sample split is shuffled
        with a generator seeded by ``RANDOM_SEED``; the few-shot split keeps
        its original order.
        """
        # Store the subject (language pair) for use in other methods
        self.subject = subject
        # Load parquet files for each subject
        hf_dataset = self._load_hf_dataset_for_subject(subject)
        self.dataset = {}
        self.rnd = random.Random(RANDOM_SEED)

        for split, data in hf_dataset.items():
            if split not in (self.SAMPLE_SPLIT, self.FEWSHOT_SPLIT):
                continue
            rows = list(data)
            # Add the subject to each item so _get_instruction_text can use it
            for row in rows:
                row["subject"] = subject
            if split == self.SAMPLE_SPLIT:
                self.rnd.shuffle(rows)
            self.dataset[split] = rows

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Build the translation prompt for ``item``.

        The prompt has the form
        ``"<source language> sentence: <source>\\n<target language> sentence:"``,
        where the language names are looked up in pycountry from the ISO 639-3
        prefix of the FLORES codes stored in ``item["subject"]``.
        """
        language_keys = item["subject"].split("-")
        source_key, target_key = language_keys[0], language_keys[1]

        source_language = pycountry.languages.get(alpha_3=source_key.split("_")[0]).name
        source = item[f"sentence_{source_key}"]
        instruction = f"{source_language} sentence: {source}\n"

        target_language = pycountry.languages.get(alpha_3=target_key.split("_")[0]).name
        return f"{instruction}{target_language} sentence:"

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        """Return the reference sentence in the target language of ``item``."""
        target_key = item["subject"].split("-")[1]
        return item[f"sentence_{target_key}"]

    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
        """Return the few-shot answer: the reference sentence with a leading space.

        Raises:
            AssertionError: If the item has no ground truth.
        """
        ground_truth = self._get_ground_truth(item)
        # Check the raw value *before* formatting: an f-string is never None,
        # so checking afterwards would let a missing reference become " None".
        assert ground_truth is not None
        return f" {ground_truth}"

    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
        """Strip leading and trailing whitespace from the generated completion."""
        return completion_text.strip()