import random
from itertools import product
from typing import Any
from eval_framework.metrics.loglikelihood.accuracy_loglikelihood import (
AccuracyLoglikelihood,
AccuracyNormLoglikelihood,
)
from eval_framework.tasks.base import RANDOM_SEED, BaseTask, Language, ResponseType
from eval_framework.tasks.benchmarks.mmlu import MMLU_SUBJECTS
from eval_framework.tasks.utils import get_n_letters
# ISO 639-1 codes of the languages for which localized prompt templates exist below.
GLOBAL_MMLU_LANGUAGES = ["fr", "de", "es", "it", "pt", "ar"]
# Presumably the remaining Global-MMLU dataset languages without prompt templates here.
# NOTE(review): "ar", "es", "fr", "it" and "pt" appear in BOTH this list and
# GLOBAL_MMLU_LANGUAGES above — confirm whether they should be removed from this
# list now that they are supported.
GLOBAL_MMLU_LANGUAGES_UNSUPPORTED = [
    "am",
    "ar",
    "bn",
    "cs",
    "el",
    "en",
    "fil",
    "fr",
    "ha",
    "he",
    "hi",
    "ig",
    "id",
    "it",
    "ja",
    "ky",
    "ko",
    "lt",
    "mg",
    "ms",
    "ne",
    "nl",
    "ny",
    "fa",
    "pl",
    "pt",
    "ro",
    "ru",
    "si",
    "sn",
    "so",
    "es",
    "sr",
    "sw",
    "sv",
    "te",
    "tr",
    "uk",
    "vi",
    "yo",
    "zh",
]
# German display names for the canonical MMLU subject ids (used in prompt headers).
MMLU_SUBJECTS_DE = {
    "abstract_algebra": "Abstrakte Algebra",
    "anatomy": "Anatomie",
    "astronomy": "Astronomie",
    "business_ethics": "Wirtschaftsethik",
    "clinical_knowledge": "Klinisches Wissen",
    "college_biology": "Biologie (Universität)",
    "college_chemistry": "Chemie (Universität)",
    "college_computer_science": "Informatik (Universität)",
    "college_mathematics": "Mathematik (Universität)",
    "college_medicine": "Medizin (Universität)",
    "college_physics": "Physik (Universität)",
    "computer_security": "IT-Sicherheit",
    "conceptual_physics": "Konzeptuelle Physik",
    "econometrics": "Ökonometrie",
    "electrical_engineering": "Elektrotechnik",
    "elementary_mathematics": "Elementarmathematik",
    "formal_logic": "Formale Logik",
    "global_facts": "Weltwissen",
    "high_school_biology": "Biologie (Gymnasium)",
    "high_school_chemistry": "Chemie (Gymnasium)",
    "high_school_computer_science": "Informatik (Gymnasium)",
    "high_school_european_history": "Europäische Geschichte (Gymnasium)",
    "high_school_geography": "Geografie (Gymnasium)",
    "high_school_government_and_politics": "Politik und Regierung (Gymnasium)",
    "high_school_macroeconomics": "Makroökonomie (Gymnasium)",
    "high_school_mathematics": "Mathematik (Gymnasium)",
    "high_school_microeconomics": "Mikroökonomie (Gymnasium)",
    "high_school_physics": "Physik (Gymnasium)",
    "high_school_psychology": "Psychologie (Gymnasium)",
    "high_school_statistics": "Statistik (Gymnasium)",
    "high_school_us_history": "US-Geschichte (Gymnasium)",
    "high_school_world_history": "Weltgeschichte (Gymnasium)",
    "human_aging": "Altern des Menschen",
    "human_sexuality": "Menschliche Sexualität",
    "international_law": "Völkerrecht",
    "jurisprudence": "Rechtswissenschaft",
    "logical_fallacies": "Logische Fehlschlüsse",
    "machine_learning": "Maschinelles Lernen",
    "management": "Management",
    "marketing": "Marketing",
    "medical_genetics": "Medizinische Genetik",
    "miscellaneous": "Verschiedenes",
    "moral_disputes": "Moralische Streitfragen",
    "moral_scenarios": "Moralische Szenarien",
    "nutrition": "Ernährung",
    "philosophy": "Philosophie",
    "prehistory": "Urgeschichte",
    "professional_accounting": "Berufsbezogene Buchhaltung",
    "professional_law": "Berufsbezogenes Recht",
    "professional_medicine": "Berufsbezogene Medizin",
    "professional_psychology": "Berufsbezogene Psychologie",
    "public_relations": "Öffentlichkeitsarbeit",
    "security_studies": "Sicherheitsstudien",
    "sociology": "Soziologie",
    "us_foreign_policy": "US-Außenpolitik",
    "virology": "Virologie",
    "world_religions": "Weltreligionen",
}
# French display names for the canonical MMLU subject ids (used in prompt headers).
MMLU_SUBJECTS_FR = {
    "abstract_algebra": "Algèbre Abstraite",
    "anatomy": "Anatomie",
    "astronomy": "Astronomie",
    "business_ethics": "Éthique des Affaires",
    "clinical_knowledge": "Connaissances Cliniques",
    "college_biology": "Biologie Universitaire",
    "college_chemistry": "Chimie Universitaire",
    "college_computer_science": "Informatique Universitaire",
    "college_mathematics": "Mathématiques Universitaires",
    "college_medicine": "Médecine Universitaire",
    "college_physics": "Physique Universitaire",
    "computer_security": "Sécurité Informatique",
    "conceptual_physics": "Physique Conceptuelle",
    "econometrics": "Économétrie",
    "electrical_engineering": "Génie Électrique",
    "elementary_mathematics": "Mathématiques Élémentaires",
    "formal_logic": "Logique Formelle",
    "global_facts": "Faits Mondiaux",
    "high_school_biology": "Biologie au Lycée",
    "high_school_chemistry": "Chimie au Lycée",
    "high_school_computer_science": "Informatique au Lycée",
    "high_school_european_history": "Histoire Européenne au Lycée",
    "high_school_geography": "Géographie au Lycée",
    "high_school_government_and_politics": "Gouvernement et Politique au Lycée",
    "high_school_macroeconomics": "Macroéconomie au Lycée",
    "high_school_mathematics": "Mathématiques au Lycée",
    "high_school_microeconomics": "Microéconomie au Lycée",
    "high_school_physics": "Physique au Lycée",
    "high_school_psychology": "Psychologie au Lycée",
    "high_school_statistics": "Statistiques au Lycée",
    "high_school_us_history": "Histoire des États-Unis au Lycée",
    "high_school_world_history": "Histoire Mondiale au Lycée",
    "human_aging": "Vieillissement Humain",
    "human_sexuality": "Sexualité Humaine",
    "international_law": "Droit International",
    "jurisprudence": "Jurisprudence",
    "logical_fallacies": "Fautes de Logique",
    "machine_learning": "Apprentissage Automatique",
    "management": "Gestion",
    "marketing": "Marketing",
    "medical_genetics": "Génétique Médicale",
    "miscellaneous": "Divers",
    "moral_disputes": "Conflits Moraux",
    "moral_scenarios": "Scénarios Moraux",
    "nutrition": "Nutrition",
    "philosophy": "Philosophie",
    "prehistory": "Préhistoire",
    "professional_accounting": "Comptabilité Professionnelle",
    "professional_law": "Droit Professionnel",
    "professional_medicine": "Médecine Professionnelle",
    "professional_psychology": "Psychologie Professionnelle",
    "public_relations": "Relations Publiques",
    "security_studies": "Études de Sécurité",
    "sociology": "Sociologie",
    "us_foreign_policy": "Politique Étrangère des États-Unis",
    "virology": "Virologie",
    "world_religions": "Religions du Monde",
}
# Spanish display names for the canonical MMLU subject ids (used in prompt headers).
MMLU_SUBJECTS_ES = {
    "abstract_algebra": "Álgebra Abstracta",
    "anatomy": "Anatomía",
    "astronomy": "Astronomía",
    "business_ethics": "Ética Empresarial",
    "clinical_knowledge": "Conocimientos Clínicos",
    "college_biology": "Biología Universitaria",
    "college_chemistry": "Química Universitaria",
    "college_computer_science": "Informática Universitaria",
    "college_mathematics": "Matemáticas Universitarias",
    "college_medicine": "Medicina Universitaria",
    "college_physics": "Física Universitaria",
    "computer_security": "Seguridad Informática",
    "conceptual_physics": "Física Conceptual",
    "econometrics": "Econometría",
    "electrical_engineering": "Ingeniería Eléctrica",
    "elementary_mathematics": "Matemáticas Elementales",
    "formal_logic": "Lógica Formal",
    "global_facts": "Datos Globales",
    "high_school_biology": "Biología de Secundaria",
    "high_school_chemistry": "Química de Secundaria",
    "high_school_computer_science": "Informática de Secundaria",
    "high_school_european_history": "Historia Europea de Secundaria",
    "high_school_geography": "Geografía de Secundaria",
    "high_school_government_and_politics": "Gobierno y Política de Secundaria",
    "high_school_macroeconomics": "Macroeconomía de Secundaria",
    "high_school_mathematics": "Matemáticas de Secundaria",
    "high_school_microeconomics": "Microeconomía de Secundaria",
    "high_school_physics": "Física de Secundaria",
    "high_school_psychology": "Psicología de Secundaria",
    "high_school_statistics": "Estadística de Secundaria",
    "high_school_us_history": "Historia de EE. UU. de Secundaria",
    "high_school_world_history": "Historia Mundial de Secundaria",
    "human_aging": "Envejecimiento Humano",
    "human_sexuality": "Sexualidad Humana",
    "international_law": "Derecho Internacional",
    "jurisprudence": "Jurisprudencia",
    "logical_fallacies": "Falacias Lógicas",
    "machine_learning": "Aprendizaje Automático",
    "management": "Administración",
    "marketing": "Mercadotecnia",
    "medical_genetics": "Genética Médica",
    "miscellaneous": "Misceláneos",
    "moral_disputes": "Disputas Morales",
    "moral_scenarios": "Escenarios Morales",
    "nutrition": "Nutrición",
    "philosophy": "Filosofía",
    "prehistory": "Prehistoria",
    "professional_accounting": "Contabilidad Profesional",
    "professional_law": "Derecho Profesional",
    "professional_medicine": "Medicina Profesional",
    "professional_psychology": "Psicología Profesional",
    "public_relations": "Relaciones Públicas",
    "security_studies": "Estudios de Seguridad",
    "sociology": "Sociología",
    "us_foreign_policy": "Política Exterior de EE. UU.",
    "virology": "Virología",
    "world_religions": "Religiones del Mundo",
}
# Italian display names for the canonical MMLU subject ids (used in prompt headers).
MMLU_SUBJECTS_IT = {
    "abstract_algebra": "Algebra Astratta",
    "anatomy": "Anatomia",
    "astronomy": "Astronomia",
    "business_ethics": "Etica Aziendale",
    "clinical_knowledge": "Conoscenza Clinica",
    "college_biology": "Biologia Universitaria",
    "college_chemistry": "Chimica Universitaria",
    "college_computer_science": "Informatica Universitaria",
    "college_mathematics": "Matematica Universitaria",
    "college_medicine": "Medicina Universitaria",
    "college_physics": "Fisica Universitaria",
    "computer_security": "Sicurezza Informatica",
    "conceptual_physics": "Fisica Concettuale",
    "econometrics": "Econometria",
    "electrical_engineering": "Ingegneria Elettrica",
    "elementary_mathematics": "Matematica Elementare",
    "formal_logic": "Logica Formale",
    "global_facts": "Fatti Globali",
    "high_school_biology": "Biologia Liceale",
    "high_school_chemistry": "Chimica Liceale",
    "high_school_computer_science": "Informatica Liceale",
    "high_school_european_history": "Storia Europea Liceale",
    "high_school_geography": "Geografia Liceale",
    "high_school_government_and_politics": "Governo e Politica Liceale",
    "high_school_macroeconomics": "Macroeconomia Liceale",
    "high_school_mathematics": "Matematica Liceale",
    "high_school_microeconomics": "Microeconomia Liceale",
    "high_school_physics": "Fisica Liceale",
    "high_school_psychology": "Psicologia Liceale",
    "high_school_statistics": "Statistica Liceale",
    "high_school_us_history": "Storia Americana Liceale",
    "high_school_world_history": "Storia Mondiale Liceale",
    "human_aging": "Invecchiamento Umano",
    "human_sexuality": "Sessualità Umana",
    "international_law": "Diritto Internazionale",
    "jurisprudence": "Giurisprudenza",
    "logical_fallacies": "Fallacie Logiche",
    "machine_learning": "Apprendimento Automatico",
    "management": "Gestione",
    "marketing": "Marketing",
    "medical_genetics": "Genetica Medica",
    "miscellaneous": "Varie",
    "moral_disputes": "Controversie Morali",
    "moral_scenarios": "Scenari Morali",
    "nutrition": "Nutrizione",
    "philosophy": "Filosofia",
    "prehistory": "Preistoria",
    "professional_accounting": "Contabilità Professionale",
    "professional_law": "Diritto Professionale",
    "professional_medicine": "Medicina Professionale",
    "professional_psychology": "Psicologia Professionale",
    "public_relations": "Relazioni Pubbliche",
    "security_studies": "Studi sulla Sicurezza",
    "sociology": "Sociologia",
    "us_foreign_policy": "Politica Estera degli Stati Uniti",
    "virology": "Virologia",
    "world_religions": "Religioni del Mondo",
}
# Portuguese display names for the canonical MMLU subject ids (used in prompt headers).
MMLU_SUBJECTS_PT = {
    "abstract_algebra": "Álgebra Abstrata",
    "anatomy": "Anatomia",
    "astronomy": "Astronomia",
    "business_ethics": "Ética Empresarial",
    "clinical_knowledge": "Conhecimento Clínico",
    "college_biology": "Biologia Universitária",
    "college_chemistry": "Química Universitária",
    "college_computer_science": "Ciência da Computação Universitária",
    "college_mathematics": "Matemática Universitária",
    "college_medicine": "Medicina Universitária",
    "college_physics": "Física Universitária",
    "computer_security": "Segurança da Computação",
    "conceptual_physics": "Física Conceitual",
    "econometrics": "Econometria",
    "electrical_engineering": "Engenharia Elétrica",
    "elementary_mathematics": "Matemática Elementar",
    "formal_logic": "Lógica Formal",
    "global_facts": "Fatos Globais",
    "high_school_biology": "Biologia do Ensino Médio",
    "high_school_chemistry": "Química do Ensino Médio",
    "high_school_computer_science": "Ciência da Computação do Ensino Médio",
    "high_school_european_history": "História Europeia do Ensino Médio",
    "high_school_geography": "Geografia do Ensino Médio",
    "high_school_government_and_politics": "Governo e Política do Ensino Médio",
    "high_school_macroeconomics": "Macroeconomia do Ensino Médio",
    "high_school_mathematics": "Matemática do Ensino Médio",
    "high_school_microeconomics": "Microeconomia do Ensino Médio",
    "high_school_physics": "Física do Ensino Médio",
    "high_school_psychology": "Psicologia do Ensino Médio",
    "high_school_statistics": "Estatística do Ensino Médio",
    "high_school_us_history": "História dos EUA do Ensino Médio",
    "high_school_world_history": "História Mundial do Ensino Médio",
    "human_aging": "Envelhecimento Humano",
    "human_sexuality": "Sexualidade Humana",
    "international_law": "Direito Internacional",
    "jurisprudence": "Jurisprudência",
    "logical_fallacies": "Falácias Lógicas",
    "machine_learning": "Aprendizado de Máquina",
    "management": "Administração",
    "marketing": "Marketing",
    "medical_genetics": "Genética Médica",
    "miscellaneous": "Diversos",
    "moral_disputes": "Disputas Morais",
    "moral_scenarios": "Cenários Morais",
    "nutrition": "Nutrição",
    "philosophy": "Filosofia",
    "prehistory": "Pré-História",
    "professional_accounting": "Contabilidade Profissional",
    "professional_law": "Direito Profissional",
    "professional_medicine": "Medicina Profissional",
    "professional_psychology": "Psicologia Profissional",
    "public_relations": "Relações Públicas",
    "security_studies": "Estudos de Segurança",
    "sociology": "Sociologia",
    "us_foreign_policy": "Política Externa dos EUA",
    "virology": "Virologia",
    "world_religions": "Religiões Mundiais",
}
# Arabic display names for the canonical MMLU subject ids (used in prompt headers).
MMLU_SUBJECTS_AR = {
    "abstract_algebra": "الجبر المجرد",
    "anatomy": "علم التشريح",
    "astronomy": "علم الفلك",
    "business_ethics": "أخلاقيات الأعمال",
    "clinical_knowledge": "المعرفة السريرية",
    "college_biology": "أحياء جامعية",
    "college_chemistry": "كيمياء جامعية",
    "college_computer_science": "علوم الحاسوب الجامعية",
    "college_mathematics": "رياضيات جامعية",
    "college_medicine": "طب جامعي",
    "college_physics": "فيزياء جامعية",
    "computer_security": "أمن الحاسوب",
    "conceptual_physics": "الفيزياء المفاهيمية",
    "econometrics": "الاقتصاد القياسي",
    "electrical_engineering": "الهندسة الكهربائية",
    "elementary_mathematics": "الرياضيات الابتدائية",
    "formal_logic": "المنطق الصوري",
    "global_facts": "حقائق عالمية",
    "high_school_biology": "أحياء ثانوية",
    "high_school_chemistry": "كيمياء ثانوية",
    "high_school_computer_science": "علوم الحاسوب الثانوية",
    "high_school_european_history": "تاريخ أوروبا الثانوي",
    "high_school_geography": "جغرافيا ثانوية",
    "high_school_government_and_politics": "الحكومة والسياسة الثانوية",
    "high_school_macroeconomics": "الاقتصاد الكلي الثانوي",
    "high_school_mathematics": "رياضيات ثانوية",
    "high_school_microeconomics": "الاقتصاد الجزئي الثانوي",
    "high_school_physics": "فيزياء ثانوية",
    "high_school_psychology": "علم النفس الثانوي",
    "high_school_statistics": "الإحصاء الثانوي",
    "high_school_us_history": "تاريخ الولايات المتحدة الثانوي",
    "high_school_world_history": "تاريخ العالم الثانوي",
    "human_aging": "شيخوخة الإنسان",
    "human_sexuality": "الجنس البشري",
    "international_law": "القانون الدولي",
    "jurisprudence": "الفقه القانوني",
    "logical_fallacies": "المغالطات المنطقية",
    "machine_learning": "تعلم الآلة",
    "management": "الإدارة",
    "marketing": "التسويق",
    "medical_genetics": "الوراثة الطبية",
    "miscellaneous": "متفرقات",
    "moral_disputes": "الخلافات الأخلاقية",
    "moral_scenarios": "السيناريوهات الأخلاقية",
    "nutrition": "التغذية",
    "philosophy": "الفلسفة",
    "prehistory": "ما قبل التاريخ",
    "professional_accounting": "المحاسبة المهنية",
    "professional_law": "القانون المهني",
    "professional_medicine": "الطب المهني",
    "professional_psychology": "علم النفس المهني",
    "public_relations": "العلاقات العامة",
    "security_studies": "دراسات الأمن",
    "sociology": "علم الاجتماع",
    "us_foreign_policy": "السياسة الخارجية الأمريكية",
    "virology": "علم الفيروسات",
    "world_religions": "الديانات العالمية",
}
# Localized subject-name dict for each supported language code.
LANGUAGE_SUBJECTS_MAP = {
    "fr": MMLU_SUBJECTS_FR,
    "de": MMLU_SUBJECTS_DE,
    "es": MMLU_SUBJECTS_ES,
    "it": MMLU_SUBJECTS_IT,
    "pt": MMLU_SUBJECTS_PT,
    "ar": MMLU_SUBJECTS_AR,
}
# Localized "The following are multiple choice questions (with answers) about" prompt headers.
LANGUAGE_INITIAL_PROMPT_TEXT_MAP = {
    "fr": "Les questions suivantes sont des questions à choix multiples (avec réponses) sur",
    "de": "Die folgenden sind Multiple-Choice-Fragen (mit Antworten) über",
    "es": "Las siguientes son preguntas de opción múltiple (con respuestas) sobre",
    "it": "Le seguenti sono domande a scelta multipla (con risposte) su",
    "pt": "As seguintes são perguntas de múltipla escolha (com respostas) sobre",
    "ar": "فيما يلي أسئلة اختيار من متعدد (مع الإجابات) حول",
}
# Localized "Question" label used before each question.
LANGUAGE_QUESTION_TEXT_MAP = {
    "fr": "Question",
    "de": "Frage",
    "es": "Pregunta",
    "it": "Domanda",
    "pt": "Pergunta",
    "ar": "السؤال",
}
# Localized "Answer" cue used before the answer letter.
LANGUAGE_ANSWER_TEXT_MAP = {
    "fr": "Réponse",
    "de": "Antwort",
    "es": "Respuesta",
    "it": "Risposta",
    "pt": "Resposta",
    "ar": "الإجابة",
}
# Language enum member for each supported language code.
LANGUAGE_NAME_MAP = {
    "fr": Language.FRA,
    "de": Language.DEU,
    "es": Language.SPA,
    "it": Language.ITA,
    "pt": Language.POR,
    "ar": Language.ARB,
}
# (removed stray "[docs]" Sphinx "view source" scrape artifact — not valid code)
class GlobalMMLU(BaseTask[tuple[str, str]]):
    """Multilingual MMLU multiple-choice task, scored via answer-letter loglikelihoods.

    MMLU dataset: https://huggingface.co/datasets/CohereLabs/Global-MMLU
    Currently, we only support prompting in French, German, Spanish, Italian,
    Portuguese, and Arabic.
    TODO: Suggest we adjust prompting for languages individually, e.g., South-East Asian languages available here:
    https://github.com/aisingapore/SEA-HELM/blob/main/seahelm_tasks/knowledge/global_mmlu/abstract_algebra/config.yaml
    """

    NAME = "GlobalMMLU"
    DATASET_PATH = "CohereLabs/Global-MMLU"
    SAMPLE_SPLIT = "test"
    FEWSHOT_SPLIT = "dev"
    RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
    METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood]
    # One task subject per (language code, canonical MMLU subject id) combination.
    SUBJECTS = list(product(GLOBAL_MMLU_LANGUAGES, MMLU_SUBJECTS))
    PERTURBATION_UNMODIFIABLE_WORDS = ["Question", "Answer"] + get_n_letters(4)
    # Maps str((lang_code, subject)) — the stringified SUBJECTS entry — to the
    # prompt language's enum member. (The former `lang_code.split("_")[0]` was a
    # no-op: no key in LANGUAGE_SUBJECTS_MAP contains an underscore.)
    LANGUAGE = {
        str((lang_code, subject)): LANGUAGE_NAME_MAP[lang_code]
        for lang_code, subjects in LANGUAGE_SUBJECTS_MAP.items()
        for subject in subjects
    }
    # HF dataset column holding each answer option, keyed by answer letter.
    OPTION_KEYS = {"A": "option_a", "B": "option_b", "C": "option_c", "D": "option_d"}

    def __init__(self, num_fewshot: int = 0) -> None:
        super().__init__(num_fewshot)
        self.keys = get_n_letters(4)  # answer letters "A".."D"

    def _load_dataset(self, subject: tuple[str, str]) -> None:
        """Load the HF dataset for one (language, subject) pair into ``self.dataset``.

        Keeps only rows whose ``subject`` column matches, deterministically
        shuffles the sample split, and stores only the sample/fewshot splits.
        """
        lang, current_subject = subject
        hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH, name=lang)
        self.dataset = {}
        # Fixed seed so the sample-split shuffle is reproducible across runs.
        self.rnd = random.Random(RANDOM_SEED)
        for split, data in hf_dataset.items():
            data = data.filter(lambda x: x["subject"] == current_subject)
            data_list = list(data)
            if split == self.SAMPLE_SPLIT:
                self.rnd.shuffle(data_list)
            if split in [self.SAMPLE_SPLIT, self.FEWSHOT_SPLIT]:
                self.dataset[split] = data_list

    def _get_initial_prompt_text(self, item: dict[str, Any]) -> str:
        """Return the localized "questions about <subject>" prompt header."""
        # NOTE(review): assumes item["subject"] is the (lang, subject) task
        # tuple attached by the framework, not the raw HF string column — confirm.
        language_key = item["subject"][0]
        subject = LANGUAGE_SUBJECTS_MAP[language_key][item["subject"][1]]
        return f"{LANGUAGE_INITIAL_PROMPT_TEXT_MAP[language_key]} {subject}."

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return the localized question followed by the lettered answer options."""
        question = item["question"].strip()
        choices = "".join(f"{key}. {item[self.OPTION_KEYS[key]]}\n" for key in self.keys)
        language_key = item["subject"][0]
        return f"{LANGUAGE_QUESTION_TEXT_MAP[language_key]}: {question}\n{choices}"

    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
        """Return the cue plus the gold answer letter for a few-shot example."""
        ground_truth = self._get_ground_truth(item)
        assert ground_truth is not None
        return f"{self._get_cue_text(item)}{ground_truth}"

    def _get_cue_text(self, item: dict[str, Any]) -> str:
        """Return the localized "Answer:" cue."""
        language_key = item["subject"][0]
        return f"{LANGUAGE_ANSWER_TEXT_MAP[language_key]}:"

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        """Return the gold answer letter, space-prefixed to match completions."""
        return f" {item['answer']}"

    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
        """Return the loglikelihood completion candidates: " A" through " D"."""
        return [f" {key}" for key in self.keys]