"""Source code for eval_framework.tasks.benchmarks.lab_bench"""

import random
from typing import Any

from eval_framework.metrics.completion.accuracy_completion import AccuracyCompletion
from eval_framework.metrics.loglikelihood.accuracy_loglikelihood import (
    AccuracyLoglikelihood,
    AccuracyNormLoglikelihood,
)
from eval_framework.metrics.loglikelihood.bits_per_byte import BitsPerByteLoglikelihood
from eval_framework.tasks.base import BaseTask, Language, ResponseType
from eval_framework.tasks.utils import get_n_letters

LAB_BENCH_SUBSETS = ["CloningScenarios", "DbQA", "FigQA", "LitQA2", "ProtocolQA", "SeqQA", "SuppQA", "TableQA"]


class LabBenchCloze(BaseTask[str]):
    """Lab-Bench (futurehouse/lab-bench): QA over scientific protocols; cloze ranks ideal vs distractors."""

    NAME = "LabBenchCloze"
    DATASET_PATH = "futurehouse/lab-bench"
    SAMPLE_SPLIT = "train"
    FEWSHOT_SPLIT = "train"
    RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
    METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood, AccuracyCompletion, BitsPerByteLoglikelihood]
    SUBJECTS = LAB_BENCH_SUBSETS
    PERTURBATION_UNMODIFIABLE_WORDS = ["Question", "Answer"]
    LANGUAGE = Language.ENG

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        # Prompt is just the question; the answer is elicited via the cue text.
        return f"Question: {item.get('question', '')}\n"

    def _get_cue_text(self, item: dict[str, Any]) -> str:
        # Constant cue; the model continues after "Answer:".
        return "Answer:"

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        # Leading space matches how completions are rendered after the cue.
        ideal = item.get("ideal")
        return None if ideal is None else f" {ideal}"

    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
        # Candidate set is every distractor plus the ideal answer, each with the
        # same leading space as the ground truth so loglikelihoods are comparable.
        candidates = [*item.get("distractors", []), item.get("ideal", "")]
        return [f" {choice}" for choice in candidates]
[docs] class LabBenchMC(LabBenchCloze): NAME = "LabBenchMC" def _get_choices_order_keys(self, item: dict[str, Any]) -> tuple[list[str], list[int], list[str]]: """Return (choices, shuffle_order, keys) for consistent ordering across methods.""" choices = list(item.get("distractors", [])) + [item.get("ideal", "")] rng = random.Random(item.get("id", 0)) order = list(range(len(choices))) rng.shuffle(order) keys = get_n_letters(len(choices)) return choices, order, keys def _get_instruction_text(self, item: dict[str, Any]) -> str: question = item.get("question", "") choices, order, keys = self._get_choices_order_keys(item) shuffled_choices = [choices[i] for i in order] options = "\n".join(f" {key}. {c}" for key, c in zip(keys, shuffled_choices)) return f"Question: {question}\n{options}\n" def _get_ground_truth(self, item: dict[str, Any]) -> str | None: choices, order, keys = self._get_choices_order_keys(item) ideal_original_idx = len(choices) - 1 gold_idx = order.index(ideal_original_idx) return f" {keys[gold_idx]}" def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None: _, _, keys = self._get_choices_order_keys(item) return [f" {label}" for label in keys]
class LabBenchMC_OLMES(LabBenchMC):
    """
    LabBenchMC with OLMES-style prompt: space before each label in the prompt (" A.", " B.", ...).
    """

    NAME = "LabBenchMC_OLMES"

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        # Same shuffled ordering as the parent class; only the option lines
        # differ, carrying the OLMES-style leading space before each label.
        choices, order, keys = self._get_choices_order_keys(item)
        option_lines = [f" {label}. {choices[idx]}" for label, idx in zip(keys, order)]
        return f"Question: {item.get('question', '')}\n" + "\n".join(option_lines) + "\n"