# Source code for eval_framework.tasks.benchmarks.gpqa

import hashlib
import logging
import random
import re
from typing import Any

from eval_framework.metrics.completion.accuracy_completion import AccuracyCompletion
from eval_framework.metrics.loglikelihood.accuracy_loglikelihood import (
    AccuracyLoglikelihood,
    AccuracyNormLoglikelihood,
)
from eval_framework.metrics.loglikelihood.confidence_weighted_accuracy import ConfidenceWeightedAccuracy
from eval_framework.metrics.loglikelihood.dcs import DistributionalCorrectnessScore
from eval_framework.metrics.loglikelihood.ternary import TernaryScore
from eval_framework.tasks.base import NO_SUBJECT, RANDOM_SEED, BaseTask, Language, ResponseType, Sample, SubjectType
from eval_framework.tasks.utils import get_n_letters

logger = logging.getLogger(__name__)



# [docs]
class GPQA(BaseTask[str]):
    """GPQA dataset: https://huggingface.co/datasets/Idavidrein/gpqa

    Multiple-choice task scored via loglikelihoods over the parenthesised option
    labels. For every item the three incorrect answers and the correct answer are
    arranged in an order that is derived from a hash of the answer texts, so the
    same item always yields the same option order.
    """

    NAME = "GPQA"
    DATASET_PATH = "Idavidrein/gpqa"
    SAMPLE_SPLIT = "train"
    FEWSHOT_SPLIT = "train"
    RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
    METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood]
    SUBJECTS = ["gpqa_extended"]  # ["gpqa_diamond", "gpqa_extended", "gpqa_main", "gpqa_experts"]
    PERTURBATION_UNMODIFIABLE_WORDS = ["Question"] + get_n_letters(4)
    LANGUAGE = Language.ENG

    def __init__(self, num_fewshot: int = 0) -> None:
        """Initialise stop sequences, option labels and the option-shuffling RNG."""
        super().__init__(num_fewshot)
        self.stop_sequences = ["Question:"]
        self.keys = get_n_letters(4)
        # lookup from the 1-based option number (as a string) to its label
        self.num_to_letter = {str(number): label for number, label in enumerate(self.keys, start=1)}
        self.rnd_choice_shuffle = random.Random(RANDOM_SEED)

    def _load_dataset(self, subject: SubjectType) -> None:
        """Load the Hugging Face dataset for ``subject`` into ``self.dataset``.

        Only the sample and few-shot splits are stored. The sample split is
        shuffled with an RNG seeded by ``RANDOM_SEED`` and one known question
        with an overly long prompt is removed.
        """
        # exclude in the GPQA dataset the one sample that has a too long prompt (DNA sequence)
        overlong_question = "Hello, you are embarking on a new project. You need to produce the HP1alpha protein in E. coli. Which of these plasmids will you choose?"  # noqa: E501
        config_name = None if subject == NO_SUBJECT else subject

        hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH, name=config_name)
        self.dataset = {}
        self.rnd = random.Random(RANDOM_SEED)
        stored_splits = (self.SAMPLE_SPLIT, self.FEWSHOT_SPLIT)

        for split, data in hf_dataset.items():
            rows = list(data)

            if split == self.SAMPLE_SPLIT:
                self.rnd.shuffle(rows)

            if split not in stored_splits:
                continue

            kept_rows = [row for row in rows if row["Question"] != overlong_question]
            num_removed = len(rows) - len(kept_rows)
            if num_removed > 0:
                logger.info(f"Excluded {num_removed} samples from {split} split.")
            assert num_removed < 2, "we expect to remove max one item"

            self.dataset[split] = kept_rows

    def _get_initial_prompt_text(self, item: dict[str, Any]) -> str:
        """Return the fixed system prompt; ``item`` is not used."""
        return (
            "Here are some example questions from experts. "
            "An explanation is given before the final answer. "
            "Answer the final question yourself, giving your reasoning beforehand."
        )

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return the question line followed by one labelled option per line."""
        marked_options, _ = self._get_possible_completions_marked(item)
        question = item["Question"].strip()
        options_block = "\n".join(marked_options)
        return f"Question: {question}\n{options_block}\n"

    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
        """Return the cue text immediately followed by the ground-truth label."""
        target = self._get_ground_truth(item)
        assert target is not None
        cue = self._get_cue_text(item)
        return f"{cue}{target}"

    def _get_cue_text(self, item: dict[str, Any]) -> str:
        """Return the answer cue placed before the completion."""
        return "Answer:"

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        """Return the space-prefixed label of the correct option."""
        marked_options, gold_index = self._get_possible_completions_marked(item)
        # the first three characters of a marked option are its label in parentheses
        gold_label = marked_options[gold_index][:3]
        return " " + gold_label

    def _get_possible_completions_marked(self, item: dict[str, Any]) -> tuple[list[str], int]:
        """Return the labelled answer options and the index of the correct one.

        The option order is random but reproducible: the RNG is re-seeded with a
        SHA-256 hash of the preprocessed answer texts on every call.
        """
        options = [self._preprocess(item[f"Incorrect Answer {number}"]) for number in (1, 2, 3)]
        gold_answer = self._preprocess(item["Correct Answer"])

        # hashing the answers gives the same seed (and thus order) for the same input
        digest = hashlib.sha256(f"{options} {gold_answer}".encode()).hexdigest()
        rng = self.rnd_choice_shuffle
        rng.seed(int(digest, 16))
        rng.shuffle(options)
        gold_index = rng.randint(0, 3)
        options.insert(gold_index, gold_answer)

        marked_options = [f"({self.keys[index]}) {text}" for index, text in enumerate(options)]
        return marked_options, gold_index

    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
        """Return every option label, space-prefixed and in parentheses."""
        return [f" ({label})" for label in self.keys]

    @staticmethod
    def _preprocess(text: str | None) -> str:
        """Clean an answer text; ``None`` becomes a single space.

        Strips surrounding whitespace, turns " [title]" into ". ", removes any
        remaining bracketed segments and collapses double spaces in one pass.
        """
        if text is None:
            return " "
        cleaned = text.strip().replace(" [title]", ". ")
        cleaned = re.sub("\\[.*?\\]", "", cleaned)
        return cleaned.replace("  ", " ")




# [docs]
class GPQA_OLMES(GPQA):
    """
    GPQA multiple choice in the OLMES/oe_eval style: the prompt lists the options with
    space-prefixed labels (" A.", " B.", " C.", " D.") and the loglikelihood is taken
    over " A"/" B"/" C"/" D".
    """

    NAME = "GPQA_OLMES"

    def _get_possible_completions_marked(self, item: dict[str, Any]) -> tuple[list[str], int]:
        """Return the options labelled as `` X. text`` and the index of the correct one.

        The option order is reproducible because the RNG is re-seeded with a
        SHA-256 hash of the preprocessed answer texts on every call.
        """
        options = [self._preprocess(item[f"Incorrect Answer {number}"]) for number in (1, 2, 3)]
        gold_answer = self._preprocess(item["Correct Answer"])

        digest = hashlib.sha256(f"{options} {gold_answer}".encode()).hexdigest()
        rng = self.rnd_choice_shuffle
        rng.seed(int(digest, 16))
        rng.shuffle(options)
        gold_index = rng.randint(0, 3)
        options.insert(gold_index, gold_answer)

        marked_options = [f" {label}. {text}" for label, text in zip(self.keys, options)]
        return marked_options, gold_index

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        """Return the space-prefixed bare label of the correct option."""
        gold_index = self._get_possible_completions_marked(item)[1]
        gold_label = self.keys[gold_index]
        return f" {gold_label}"

    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
        """Return every option label, space-prefixed and without punctuation."""
        return [f" {label}" for label in self.keys]




# [docs]
class GPQA_IDK(GPQA):
    """GPQA variant that offers an extra " (?)" completion for abstaining."""

    NAME = "GPQA_IDK"
    METRICS = [
        AccuracyLoglikelihood,
        AccuracyNormLoglikelihood,
        ConfidenceWeightedAccuracy,
        DistributionalCorrectnessScore,
        TernaryScore,
    ]

    def _get_initial_prompt_text(self, item: dict[str, Any]) -> str:
        """Return the fixed system prompt that permits answering with '?'."""
        confidence_hint = (
            "Answer only if you are confident, since mistakes may be penalised, while correct answers receive points. "
        )
        abstain_hint = "It is acceptable to answer with '?' if you are unsure, and you will receive 0 points."
        return confidence_hint + abstain_hint

    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
        """Return the parent's completions followed by the abstain option."""
        inherited = super()._get_possible_completions(item)
        # a missing/empty parent result leaves the abstain option as the only entry
        return [*(inherited or []), " (?)"]




# [docs]
class GPQA_COT(GPQA):
    """Chain-of-thought variant of GPQA evaluated on free-text completions.

    The model is asked to reason and to finish with
    "Therefore, the answer is (X)"; the letter captured by ``ANS_RE`` is the
    post-processed completion. Few-shot prompting is not supported.
    """

    NAME = "GPQA_COT"
    RESPONSE_TYPE = ResponseType.COMPLETION
    METRICS = [AccuracyCompletion]
    PERTURBATION_UNMODIFIABLE_WORDS = ["Question", "Therefore", "the", "answer", "is", "ANSWER_LETTER"] + get_n_letters(
        4
    )
    ANS_RE = re.compile(r"Therefore, the answer is \(([ABCDEFGHIJ])\)")

    def __init__(self, num_fewshot: int = 0) -> None:
        """Initialise the task; ``num_fewshot`` must be 0.

        Raises:
            AssertionError: if ``num_fewshot`` is not 0.
        """
        assert num_fewshot == 0, "Fewshot is not supported for GPQA_COT"
        # GPQA.__init__ sets stop_sequences, keys, num_to_letter and rnd_choice_shuffle,
        # so they are not assigned a second time here.
        super().__init__(num_fewshot)

    def _extract_answer(self, completion: str) -> str:
        """Return the answer letter of the first ``ANS_RE`` match, or "[invalid]" if none."""
        match = self.ANS_RE.search(completion)
        if match is None:
            return "[invalid]"
        return match.group(1)

    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
        """Cut the completion at the first stop sequence and extract the answer letter.

        Args:
            completion_text: raw text generated by the model.
            sample: unused by this implementation.

        Returns:
            The extracted answer letter, or "[invalid]" if the answer phrase is absent.
        """
        for stop_sequence in self.stop_sequences:
            if stop_sequence in completion_text:
                completion_text = completion_text.split(stop_sequence)[0]
        return self._extract_answer(completion_text)

    def _get_initial_prompt_text(self, item: dict[str, Any]) -> str:
        """Return an empty system prompt; all instructions are in the instruction text."""
        return ""

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return the reasoning instruction, the question with its labelled options, and a reminder."""
        # using the reasoning prompt from "Figure 44 of Tülu 3 paper: https://arxiv.org/pdf/2411.15124"
        choices, _ = self._get_possible_completions_marked(item)
        instruction_text = (
            "Answer the following multiple-choice question by giving the correct answer letter in parentheses. "
            "Provide CONCISE reasoning for the answer, and make sure to finish the response with "
            '"Therefore, the answer is (ANSWER_LETTER)" where (ANSWER_LETTER) is one of (A), (B), (C), (D), (E), etc.'
        )
        instruction_text += f"\n\nQuestion: {item['Question'].strip()}\n"
        instruction_text += "\n".join(choices)
        instruction_text += (
            "\n\nAnswer the above question and REMEMBER to finish your response with the exact phrase "
            '"Therefore, the answer is (ANSWER_LETTER)" where (ANSWER_LETTER) is one of (A), (B), (C), (D), (E), etc.'
        )
        return instruction_text

    def _get_cue_text(self, item: dict[str, Any]) -> str:
        """Return an empty cue; the completion starts right after the instruction."""
        return ""

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        """Return the bare letter of the correct option (the format ``_extract_answer`` yields)."""
        choices, correct_answer_position = self._get_possible_completions_marked(item)
        # marked options look like "(X) text", so index 1 selects the letter
        answer_key = choices[correct_answer_position][1]
        return answer_key