"""Source code for eval_framework.tasks.benchmarks.naturalqs_open."""

from typing import Any

from eval_framework.metrics.completion.drop_completion import DropF1ExactMatch, DropMetricContext
from eval_framework.tasks.base import NO_SUBJECT, BaseTask, Language, ResponseType
from eval_framework.tasks.task_style import (
    BPBStyle,
    ClozeStyle,
    MCStyle,
    answer_key_to_index,
)


class NaturalQsOpen(BaseTask[str]):
    """Open-domain NaturalQuestions, evaluated as free-form completion.

    Samples come from the ``google-research-datasets/nq_open`` validation
    split; few-shot examples are drawn from the train split. Completions are
    scored with the DROP-style F1 / exact-match metric against the reference
    answer list.
    """

    NAME = "NaturalQsOpen"
    DATASET_PATH = "google-research-datasets/nq_open"
    SAMPLE_SPLIT = "validation"
    FEWSHOT_SPLIT = "train"
    RESPONSE_TYPE = ResponseType.COMPLETION
    METRICS = [DropF1ExactMatch]
    SUBJECTS = [NO_SUBJECT]
    PERTURBATION_UNMODIFIABLE_WORDS = ["Question", "Answer"]
    LANGUAGE = Language.ENG

    def __init__(self, num_fewshot: int = 0) -> None:
        super().__init__(num_fewshot)
        # Cut generation at the start of the next question or a blank line.
        self.stop_sequences = ["Question:", "Q:", "\n\n"]
        self.max_tokens = 50

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Render the question line shown to the model."""
        return f"Question: {item.get('question', '')}\n"

    def _get_cue_text(self, item: dict[str, Any]) -> str:
        """Cue that prompts the model to emit its answer."""
        return "Answer:"

    def _get_ground_truth(self, item: dict[str, Any]) -> list[str]:
        """All reference answers, each with a leading space to match the cue."""
        return [f" {a}" for a in item.get("answer", [])]

    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
        """Build the target text appended after the cue in few-shot examples."""
        references = self._get_ground_truth(item)
        assert references is not None
        # The ground truth here is a list of strings; following OLMES we join
        # the targets with a comma (no extra space — each target is already
        # space-prefixed).
        # TODO: Explore other ways (e.g. select a single target). The correct
        # approach depends on the question. For
        # "how many seasons of vampire diaries r there" -> ["eight", "8"]
        # picking one answer would be better, while for
        # "what are the three fifty shades of grey books" ->
        # ["Fifty Shades of Grey", "Fifty Shades Darker", "Fifty Shades Freed"]
        # joining is preferable.
        joined = ",".join(references)
        assert isinstance(joined, str)
        return f"{self._get_cue_text(item)}{joined}"

    def _get_context(self, item: dict[str, Any]) -> DropMetricContext | None:
        """Wrap the answers for the DROP metric, or None when there are none."""
        answers = item.get("answer", [])
        if not answers:
            return None
        # The DROP metric expects a list of answer tuples (lists of strings).
        return DropMetricContext(answer_tuples=[[a] for a in answers])
class _NaturalQsOpenChoice_Base(BaseTask[str]):
    """Shared configuration and accessors for the choice-based
    NaturalQsOpen variants (Cloze, MC, MC_OLMES, BPB)."""

    DATASET_PATH = "allenai/nq-gen2mc"
    SAMPLE_SPLIT = "validation"
    FEWSHOT_SPLIT = "validation"
    SUBJECTS = [NO_SUBJECT]
    PERTURBATION_UNMODIFIABLE_WORDS = ["Question", "Answer"]
    LANGUAGE = Language.ENG

    def _get_raw_question(self, item: dict[str, Any]) -> str:
        """Return the unformatted question text (empty string if missing)."""
        return item.get("question", "")

    def _get_choices(self, item: dict[str, Any]) -> list[str]:
        """Return the list of answer-choice strings for this item."""
        choices = item.get("choices", {})
        return choices.get("text", [])

    def _get_correct_index(self, item: dict[str, Any]) -> int:
        """Map the item's answer key (e.g. "A") to a zero-based choice index."""
        key = item.get("answerKey", "")
        return answer_key_to_index(key)
class NaturalQsOpenCloze(_NaturalQsOpenChoice_Base):
    """Cloze-style variant of the choice-based NaturalQsOpen task."""

    NAME = "NaturalQsOpenCloze"
    TASK_STYLER = ClozeStyle()
class NaturalQsOpenMC(_NaturalQsOpenChoice_Base):
    """Multiple-choice variant of the choice-based NaturalQsOpen task."""

    NAME = "NaturalQsOpenMC"
    TASK_STYLER = MCStyle()
class NaturalQsOpenMC_OLMES(_NaturalQsOpenChoice_Base):
    """NaturalQsOpenMC with the OLMES prompt convention: each choice label
    in the prompt is prefixed with a space (" A.", " B.", ...)."""

    NAME = "NaturalQsOpenMC_OLMES"
    TASK_STYLER = MCStyle(space_prefixed_labels=True)
class NaturalQsOpenBPB(_NaturalQsOpenChoice_Base):
    """Bits-per-byte-only variant of the choice-based NaturalQsOpen task."""

    NAME = "NaturalQsOpenBPB"
    TASK_STYLER = BPBStyle()