Source code for eval_framework.tasks.benchmarks.medqa

"""
MedQA (English): Open-domain medical question answering from medical exams.
"""

from typing import Any

from eval_framework.metrics.loglikelihood.accuracy_loglikelihood import (
    AccuracyLoglikelihood,
    AccuracyNormLoglikelihood,
)
from eval_framework.metrics.loglikelihood.bits_per_byte import BitsPerByteLoglikelihood
from eval_framework.tasks.base import NO_SUBJECT, BaseTask, Language, ResponseType
from eval_framework.tasks.utils import get_n_letters


[docs] class MedQACloze(BaseTask[str]): """MedQA cloze (loglikelihood over choice text).""" NAME = "MedQACloze" DATASET_PATH = "davidheineman/medqa-en" SAMPLE_SPLIT = "test" FEWSHOT_SPLIT = "dev" RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood, BitsPerByteLoglikelihood] SUBJECTS = [NO_SUBJECT] PERTURBATION_UNMODIFIABLE_WORDS = ["Question"] LANGUAGE = Language.ENG def _get_instruction_text(self, item: dict[str, Any]) -> str: return f"Question: {item['question']}\n" def _get_ground_truth(self, item: dict[str, Any]) -> str | None: choices = item.get("choices", []) answer_idx = item.get("answer_idx") if answer_idx is None or not choices: return None return f" {choices[int(answer_idx)]}" def _get_cue_text(self, item: dict[str, Any]) -> str: return "Answer:" def _get_possible_completions(self, item: dict[str, Any]) -> list[str]: choices = item.get("choices", []) return [f" {c}" for c in choices] def _get_fewshot_target_text(self, item: dict[str, Any]) -> str: ground_truth = self._get_ground_truth(item) assert ground_truth is not None return f"{self._get_cue_text(item)}{ground_truth}"
[docs] class MedQAMC(MedQACloze): """MedQA multiple choice (loglikelihood over A/B/C/D/...).""" NAME = "MedQAMC" def __init__(self, num_fewshot: int = 0) -> None: super().__init__(num_fewshot) self.keys = get_n_letters(5) def _get_instruction_text(self, item: dict[str, Any]) -> str: question = item["question"] choices = item.get("choices", []) options = "\n".join(f"{label}. {choice}" for label, choice in zip(self.keys, choices)) return f"Question: {question}\n{options}\n" def _get_ground_truth(self, item: dict[str, Any]) -> str | None: answer_idx = item.get("answer_idx") if answer_idx is None: return None return f" {self.keys[int(answer_idx)]}" def _get_possible_completions(self, item: dict[str, Any]) -> list[str]: return [f" {label}" for label in self.keys]
[docs] class MedQAMC_OLMES(MedQAMC): """ MedQA multiple choice with OLMES-style prompt: space before each label (" A.", " B.", ...). """ NAME = "MedQAMC_OLMES" FEWSHOT_SPLIT = "train" def _get_instruction_text(self, item: dict[str, Any]) -> str: question = item["question"] choices = item.get("choices", []) options = "\n".join(f" {label}. {choice}" for label, choice in zip(self.keys, choices)) return f"Question: {question}\n{options}\n"