# Source code for eval_framework.tasks.benchmarks.winox
from typing import Any
from eval_framework.tasks.base import Language
from eval_framework.tasks.benchmarks.winogrande import WINOGRANDE
# Maps the 1-based answer label (as a string) onto the 0-based index of the matching choice.
ANSWER_STR_TO_NUM: dict[str, int] = {label: index for index, label in enumerate(("1", "2"))}
[docs]
class WINOX(WINOGRANDE):
    """
    Wino-X is a parallel dataset of German, French, and Russian Winograd schemas, aligned with their English
    counterparts, used to examine whether neural machine translation models can perform coreference resolution that
    requires commonsense knowledge, and whether multilingual language models are capable of commonsense reasoning
    across multiple languages.

    Each item is read through three language-suffixed fields, ``context_<code>``, ``option1_<code>`` and
    ``option2_<code>``, plus an ``answer`` field holding 1 or 2. ``<code>`` is ``LANGUAGE_SHORT_CODE``, which
    concrete subclasses must set.

    Winogrande: https://arxiv.org/abs/1907.10641
    Wino-X: https://github.com/demelin/Wino-X
    Wino-X: https://huggingface.co/datasets/demelin/wino_x
    """

    DATASET_PATH = "demelin/wino_x"
    SAMPLE_SPLIT = "test"
    FEWSHOT_SPLIT = "test"
    # Suffix of the per-language item fields (e.g. "de" -> "context_de"); set by each subclass.
    LANGUAGE_SHORT_CODE = ""

    def _split_context(self, item: dict[str, Any]) -> tuple[str, str]:
        """Split the language-specific context sentence around its ``_`` blank.

        Args:
            item: Dataset row containing ``context_<LANGUAGE_SHORT_CODE>``.

        Returns:
            The text before the blank and the text after it, both unmodified.

        Raises:
            KeyError: If the context field is missing from ``item``.
            ValueError: If the context does not contain exactly one ``_``.
        """
        field = f"context_{self.LANGUAGE_SHORT_CODE}"
        context = item[field]
        # Stay strict about the number of blanks: anything else is a malformed row, and
        # the previous tuple-unpacking of ``split("_")`` raised ValueError in the same cases.
        if context.count("_") != 1:
            raise ValueError(f"Expected exactly one '_' blank in {field!r}, got: {context!r}")
        before, _, after = context.partition("_")
        return before, after

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        """Return the correct completion, prefixed with a single space.

        Raises:
            KeyError: If ``item['answer']`` is neither 1 nor 2 (as int or str).
        """
        choices = self._extract_choices(item)
        # in winogrande answer is a string but in wino_x it is an int
        answer_index = ANSWER_STR_TO_NUM[str(item["answer"])]
        return f" {choices[answer_index]}"

    def _extract_question(self, item: dict) -> str:
        """Return the context text that precedes the blank, stripped of outer whitespace."""
        question, _ = self._split_context(item)
        # Collapse doubled spaces; the previous replace(" ", " ") was a no-op.
        return question.replace("  ", " ").strip()

    def _extract_choices(self, item: dict) -> list[str]:
        """Return both options, each followed by the context text that comes after the blank."""
        _, choice_suffix = self._split_context(item)
        # Collapse doubled spaces but keep the single leading space that separates
        # the option from the rest of the sentence.
        choice_suffix = choice_suffix.replace("  ", " ")
        options = (
            item[f"option1_{self.LANGUAGE_SHORT_CODE}"],
            item[f"option2_{self.LANGUAGE_SHORT_CODE}"],
        )
        return [option + choice_suffix for option in options]
[docs]
class WINOX_DE(WINOX):
    """German Wino-X task: items are read through their ``*_de`` fields."""

    NAME = "WINOX_DE"
    LANGUAGE = Language.DEU
    # Selects the context_de / option1_de / option2_de fields of each item.
    LANGUAGE_SHORT_CODE = "de"
    # NOTE(review): presumably the dataset configuration to load - confirm against the base task.
    SUBJECTS = ["lm_en_de"]
[docs]
class WINOX_FR(WINOX):
    """French Wino-X task: items are read through their ``*_fr`` fields."""

    NAME = "WINOX_FR"
    LANGUAGE = Language.FRA
    # Selects the context_fr / option1_fr / option2_fr fields of each item.
    LANGUAGE_SHORT_CODE = "fr"
    # NOTE(review): presumably the dataset configuration to load - confirm against the base task.
    SUBJECTS = ["lm_en_fr"]