# Source code for eval_framework.tasks.benchmarks.naturalqs_open

from typing import Any

from eval_framework.metrics.completion.drop_completion import DropF1ExactMatch, DropMetricContext
from eval_framework.tasks.base import NO_SUBJECT, BaseTask, Language, ResponseType
from eval_framework.tasks.task_style import (
    BPBStyle,
    ClozeStyle,
    MCStyle,
    answer_key_to_index,
)



[docs]
class NaturalQsOpen(BaseTask[str]):
    """Open-ended NaturalQuestions completion task scored with ``DropF1ExactMatch``.

    Each prompt consists of a ``Question: ...`` line followed by the ``Answer:``
    cue. Samples are read from the ``validation`` split and few-shot examples
    from the ``train`` split of ``google-research-datasets/nq_open``.
    """

    NAME = "NaturalQsOpen"
    DATASET_PATH = "google-research-datasets/nq_open"
    SAMPLE_SPLIT = "validation"
    FEWSHOT_SPLIT = "train"
    RESPONSE_TYPE = ResponseType.COMPLETION
    METRICS = [DropF1ExactMatch]
    SUBJECTS = [NO_SUBJECT]
    PERTURBATION_UNMODIFIABLE_WORDS = ["Question", "Answer"]
    LANGUAGE = Language.ENG

    def __init__(self, num_fewshot: int = 0) -> None:
        """Set up the task with a 50-token budget and question/blank-line stop sequences."""
        super().__init__(num_fewshot)
        self.max_tokens = 50
        self.stop_sequences = ["Question:", "Q:", "\n\n"]

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return the ``Question: ...`` line; a missing question renders as empty text."""
        question = item.get("question", "")
        return f"Question: {question}\n"

    def _get_cue_text(self, item: dict[str, Any]) -> str:
        """Return the fixed cue that precedes the model's answer."""
        return "Answer:"

    def _get_ground_truth(self, item: dict[str, Any]) -> list[str]:
        """Return every reference answer, each prefixed with a single space."""
        references = item.get("answer", [])
        return [f" {reference}" for reference in references]

    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
        """Return the cue followed by all reference answers joined with commas."""
        references = self._get_ground_truth(item)
        assert references is not None

        # A sample may carry several reference answers. Following OLMES, all of
        # them are joined with a comma.
        # TODO: Explore other ways (e.g. select a single target). The correct approach depends on the question.
        # E.g. "how many seasons of vampire diaries r there" [ "eight", "8" ] should perhaps select just one,
        # but "what are the three fifty shades of grey books"
        # [ "Fifty Shades of Grey", "Fifty Shades Darker", "Fifty Shades Freed" ]
        # would be better with joining.
        # A bare comma is enough as separator: each reference already starts with a space.
        joined_references = ",".join(references)

        cue = self._get_cue_text(item)
        return f"{cue}{joined_references}"

    def _get_context(self, item: dict[str, Any]) -> DropMetricContext | None:
        """Wrap each reference answer in its own list for the DROP metric.

        Returns ``None`` when the item has no (or an empty) ``answer`` entry.
        """
        references = item.get("answer", [])
        if references:
            # The DROP metric expects a list of lists of strings.
            return DropMetricContext(answer_tuples=[[reference] for reference in references])
        return None



class _NaturalQsOpenChoice_Base(BaseTask[str]):
    """Common base of the choice-based NaturalQsOpen variants (Cloze, MC, MC_OLMES, BPB).

    Reads from ``allenai/nq-gen2mc``. Both evaluation samples and few-shot
    examples come from the ``validation`` split.
    """

    DATASET_PATH = "allenai/nq-gen2mc"
    SAMPLE_SPLIT = "validation"
    FEWSHOT_SPLIT = "validation"
    SUBJECTS = [NO_SUBJECT]
    PERTURBATION_UNMODIFIABLE_WORDS = ["Question", "Answer"]
    LANGUAGE = Language.ENG

    def _get_raw_question(self, item: dict[str, Any]) -> str:
        """Return the item's question text, or an empty string if it is absent."""
        question = item.get("question", "")
        return question

    def _get_choices(self, item: dict[str, Any]) -> list[str]:
        """Return the ``choices["text"]`` list, or an empty list if either key is absent."""
        choice_block = item.get("choices", {})
        return choice_block.get("text", [])

    def _get_correct_index(self, item: dict[str, Any]) -> int:
        """Return the result of ``answer_key_to_index`` applied to the item's ``answerKey``.

        A missing ``answerKey`` is passed on as an empty string.
        """
        answer_key = item.get("answerKey", "")
        return answer_key_to_index(answer_key)



[docs]
class NaturalQsOpenCloze(_NaturalQsOpenChoice_Base):
    """Choice-based NaturalQsOpen variant configured with the ``ClozeStyle`` task styler."""

    NAME: str = "NaturalQsOpenCloze"
    TASK_STYLER = ClozeStyle()




[docs]
class NaturalQsOpenMC(_NaturalQsOpenChoice_Base):
    """Choice-based NaturalQsOpen variant configured with the default ``MCStyle`` task styler."""

    NAME: str = "NaturalQsOpenMC"
    TASK_STYLER = MCStyle()




[docs]
class NaturalQsOpenMC_OLMES(_NaturalQsOpenChoice_Base):
    """OLMES-style multiple-choice variant of NaturalQsOpen.

    Uses ``MCStyle`` with space-prefixed labels, i.e. each label in the prompt
    is preceded by a space (" A.", " B.", ...).
    """

    NAME: str = "NaturalQsOpenMC_OLMES"
    TASK_STYLER = MCStyle(space_prefixed_labels=True)




[docs]
class NaturalQsOpenBPB(_NaturalQsOpenChoice_Base):
    """BPB-only variant of NaturalQsOpen, configured with the ``BPBStyle`` task styler."""

    NAME: str = "NaturalQsOpenBPB"
    TASK_STYLER = BPBStyle()