# Source code for eval_framework.tasks.benchmarks.winox

import os
from pathlib import Path
from typing import Any

from datasets import DownloadConfig, load_dataset
from huggingface_hub import HfApi
from huggingface_hub.errors import RevisionNotFoundError

from eval_framework.tasks.base import Language
from eval_framework.tasks.benchmarks.winogrande import WINOGRANDE

# Maps the stringified answer label of an item ("1" or "2") to the 0-based index of the
# corresponding entry in the two-element choices list.
ANSWER_STR_TO_NUM: dict[str, int] = {"1": 0, "2": 1}



# [docs]
class WINOX(WINOGRANDE):
    """
    Wino-X is a parallel dataset of German, French, and Russian Winograd schemas, aligned with their English
    counterparts, used to examine whether neural machine translation models can perform coreference resolution that
    requires commonsense knowledge, and whether multilingual language models are capable of commonsense reasoning
    across multiple languages.

    This base class holds the shared loading and item-parsing logic; language-specific subclasses set
    ``LANGUAGE_SHORT_CODE`` (used to pick the ``context_<code>`` / ``option1_<code>`` / ``option2_<code>`` fields).

    Winogrande: https://arxiv.org/abs/1907.10641
    Wino-X: https://github.com/demelin/Wino-X
    Wino-X: https://huggingface.co/datasets/demelin/wino_x
    """

    DATASET_PATH = "demelin/wino_x"
    # Dataset revision passed to the Hub lookup and to ``load_dataset``.
    HF_REVISION = "7d82697fd52ac8b03e62aadfddc61077320f21e7"
    SAMPLE_SPLIT = "test"
    FEWSHOT_SPLIT = "test"
    # Empty on the base class; concrete language subclasses override it (e.g. "de", "fr").
    LANGUAGE_SHORT_CODE = ""

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        """Return the correct completion for ``item``, prefixed with a single space.

        Args:
            item: A dataset row containing ``answer`` plus the localized context and option fields.

        Returns:
            The choice selected by ``item["answer"]`` with a leading space.

        Raises:
            KeyError: If the answer label is not ``1`` or ``2`` (as int or str), or a field is missing.
            ValueError: If the localized context does not contain exactly one ``_`` placeholder.
        """
        choices = self._extract_choices(item)
        # in winogrande answer is a string but in wino_x it is an int
        answer_index = ANSWER_STR_TO_NUM[str(item["answer"])]
        return f" {choices[answer_index]}"

    def _split_context(self, item: dict) -> tuple[str, str]:
        """Split the localized context at its ``_`` placeholder.

        Both halves have double spaces collapsed to a single space (one non-overlapping pass, as
        ``str.replace`` does); neither half is stripped here.

        Raises:
            ValueError: If the context does not contain exactly one ``_``.
        """
        field = f"context_{self.LANGUAGE_SHORT_CODE}"
        context = item[field]
        parts = context.split("_")
        if len(parts) != 2:
            # Same exception type the tuple-unpacking used to raise, but with an actionable message.
            raise ValueError(f"Expected exactly one '_' placeholder in {field!r}, found {len(parts) - 1}: {context!r}")
        before, after = parts
        return before.replace("  ", " "), after.replace("  ", " ")

    def _extract_question(self, item: dict) -> str:
        """Return the text preceding the ``_`` placeholder, whitespace-stripped.

        Raises:
            ValueError: If the localized context does not contain exactly one ``_`` placeholder.
        """
        question, _ = self._split_context(item)
        return question.strip()

    def _extract_choices(self, item: dict) -> list[str]:
        """Return both candidate completions: each option followed by the text after the placeholder.

        Raises:
            ValueError: If the localized context does not contain exactly one ``_`` placeholder.
        """
        _, choice_suffix = self._split_context(item)
        option_keys = (f"option1_{self.LANGUAGE_SHORT_CODE}", f"option2_{self.LANGUAGE_SHORT_CODE}")
        return [item[key] + choice_suffix for key in option_keys]

    def _load_hf_dataset(self, **kwargs: Any) -> Any:
        """Load the dataset via ``datasets.load_dataset`` at ``HF_REVISION``.

        The revision is first checked against the Hub so that a non-existent revision fails early.
        Any other failure of that check is ignored and loading proceeds anyway.

        Args:
            **kwargs: Optional ``path`` (defaults to ``DATASET_PATH``), ``name`` and ``split``,
                forwarded to ``load_dataset``. Other keys are ignored.

        Returns:
            Whatever ``load_dataset`` returns for the requested configuration and split.

        Raises:
            RevisionNotFoundError: If ``HF_REVISION`` does not exist for the dataset repository.
        """
        # Resolve the repository once so the revision check and the actual load agree
        # (previously the check required kwargs["path"] while the load had a default).
        dataset_path = kwargs.get("path", self.DATASET_PATH)

        # Check if the HF_REVISION is valid before loading the dataset
        if self.HF_REVISION:
            try:
                HfApi().dataset_info(repo_id=dataset_path, revision=self.HF_REVISION, timeout=100.0)
            except RevisionNotFoundError:
                raise
            except Exception:
                # Deliberately best-effort: any other failure of the pre-check is not fatal here;
                # load_dataset below will surface a real problem if there is one.
                pass

        default_cache_dir = str(Path.home() / ".cache" / "huggingface" / "datasets")
        cache_dir: str = os.environ.get("HF_DATASET_CACHE_DIR", default_cache_dir)
        download_config = DownloadConfig(cache_dir=cache_dir, max_retries=5)

        return load_dataset(
            dataset_path,
            name=kwargs.get("name"),
            split=kwargs.get("split"),
            data_files=None,  # Let it auto-discover parquet files
            revision=self.HF_REVISION,
            cache_dir=cache_dir,
            download_config=download_config,
        )




# [docs]
class WINOX_DE(WINOX):
    """German Wino-X task, using the ``lm_en_de`` subject."""

    # Suffix of the item fields read for this language (context_de, option1_de, option2_de).
    LANGUAGE_SHORT_CODE = "de"
    LANGUAGE = Language.DEU
    SUBJECTS = ["lm_en_de"]
    NAME = "WINOX_DE"




# [docs]
class WINOX_FR(WINOX):
    """French Wino-X task, using the ``lm_en_fr`` subject."""

    # Suffix of the item fields read for this language (context_fr, option1_fr, option2_fr).
    LANGUAGE_SHORT_CODE = "fr"
    LANGUAGE = Language.FRA
    SUBJECTS = ["lm_en_fr"]
    NAME = "WINOX_FR"