"""Source code for eval_framework.tasks.benchmarks.lab_bench"""

import random
from typing import Any

from eval_framework.metrics.completion.accuracy_completion import AccuracyCompletion
from eval_framework.metrics.loglikelihood.accuracy_loglikelihood import (
    AccuracyLoglikelihood,
    AccuracyNormLoglikelihood,
)
from eval_framework.metrics.loglikelihood.bits_per_byte import BitsPerByteLoglikelihood
from eval_framework.tasks.base import BaseTask, Language, ResponseType
from eval_framework.tasks.utils import get_n_letters

LAB_BENCH_SUBSETS = ["CloningScenarios", "DbQA", "FigQA", "LitQA2", "ProtocolQA", "SeqQA", "SuppQA", "TableQA"]


class LabBenchCloze(BaseTask[str]):
    """Lab-Bench (futurehouse/lab-bench): QA over scientific protocols; cloze ranks ideal vs distractors."""

    NAME = "LabBenchCloze"
    DATASET_PATH = "futurehouse/lab-bench"
    SAMPLE_SPLIT = "train"
    FEWSHOT_SPLIT = "train"
    RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
    METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood, AccuracyCompletion, BitsPerByteLoglikelihood]
    SUBJECTS = LAB_BENCH_SUBSETS
    PERTURBATION_UNMODIFIABLE_WORDS = ["Question", "Answer"]
    LANGUAGE = Language.ENG

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        # Prompt is just the question; the answer is elicited via the cue text.
        return f"Question: {item.get('question', '')}\n"

    def _get_cue_text(self, item: dict[str, Any]) -> str:
        # Constant cue; the model continues after "Answer:".
        return "Answer:"

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        # Leading space matches how completions are rendered after the cue.
        ideal = item.get("ideal")
        return None if ideal is None else f" {ideal}"

    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
        # Candidate set is every distractor plus the ideal answer, each with the
        # same leading space as the ground truth so loglikelihoods are comparable.
        candidates = [*item.get("distractors", []), item.get("ideal", "")]
        return [f" {choice}" for choice in candidates]
[docs] class LabBenchMC(LabBenchCloze): NAME = "LabBenchMC" def _get_choices_order_keys(self, item: dict[str, Any]) -> tuple[list[str], list[int], list[str]]: """Return (choices, shuffle_order, keys) for consistent ordering across methods.""" choices = list(item.get("distractors", [])) + [item.get("ideal", "")] rng = random.Random(item.get("id", 0)) order = list(range(len(choices))) rng.shuffle(order) keys = get_n_letters(len(choices)) return choices, order, keys def _get_instruction_text(self, item: dict[str, Any]) -> str: question = item.get("question", "") choices, order, keys = self._get_choices_order_keys(item) shuffled_choices = [choices[i] for i in order] options = "\n".join(f" {key}. {c}" for key, c in zip(keys, shuffled_choices)) return f"Question: {question}\n{options}\n" def _get_ground_truth(self, item: dict[str, Any]) -> str | None: choices, order, keys = self._get_choices_order_keys(item) ideal_original_idx = len(choices) - 1 gold_idx = order.index(ideal_original_idx) return f" {keys[gold_idx]}" def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None: _, _, keys = self._get_choices_order_keys(item) return [f" {label}" for label in keys]
class LabBenchMC_OLMES(LabBenchMC):
    """
    LabBenchMC with OLMES-style prompt: space before each label in the prompt (" A.", " B.", ...).
    """

    NAME = "LabBenchMC_OLMES"

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        # Same shuffled ordering as the parent class; only the option lines
        # differ, carrying the OLMES-style leading space before each label.
        choices, order, keys = self._get_choices_order_keys(item)
        option_lines = [f" {label}. {choices[idx]}" for label, idx in zip(keys, order)]
        return f"Question: {item.get('question', '')}\n" + "\n".join(option_lines) + "\n"