Source code for eval_framework.tasks.benchmarks.chembench

import json
from typing import Any

from eval_framework.metrics.loglikelihood.accuracy_loglikelihood import (
    AccuracyLoglikelihood,
    AccuracyNormLoglikelihood,
)
from eval_framework.tasks.base import BaseTask, Language, ResponseType
from eval_framework.tasks.utils import get_n_letters

CHEMBENCH_SUBJECTS = [
    "analytical_chemistry",
    "chemical_preference",
    "general_chemistry",
    "inorganic_chemistry",
    "materials_science",
    "organic_chemistry",
    "physical_chemistry",
    "technical_chemistry",
    "toxicity_and_safety",
]


class ChemBench(BaseTask[str]):
    """ChemBench dataset: https://huggingface.co/datasets/jablonkagroup/ChemBench"""

    NAME = "ChemBench"
    DATASET_PATH = "jablonkagroup/ChemBench"
    SAMPLE_SPLIT = "train"  # Only has train split
    FEWSHOT_SPLIT = "train"  # Only has train split
    RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
    METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood]
    SUBJECTS = CHEMBENCH_SUBJECTS
    LANGUAGE = Language.ENG

    def __init__(self, num_fewshot: int = 0) -> None:
        assert num_fewshot == 0, "Fewshot is not supported for ChemBench"
        super().__init__(num_fewshot)
        self.keys = get_n_letters(16)

    def _load_dataset(self, subject: str) -> None:
        super()._load_dataset(subject)
        # Keep only the multiple-choice items with exactly one correct answer
        for split in self.dataset.keys():
            filtered_items = []
            for item in self.dataset[split]:
                if item.get("metrics") == ["multiple_choice_grade"]:
                    target_scores = json.loads(item["examples"][0]["target_scores"])
                    correct_answers = [i for i, score in enumerate(target_scores.values()) if score == 1.0]
                    if len(correct_answers) == 1:
                        filtered_items.append(item)
            self.dataset[split] = filtered_items

    def _get_subject_name(self, item: dict[str, Any]) -> str:
        return " ".join(item["subject"].split("_"))

    def _get_initial_prompt_text(self, item: dict[str, Any]) -> str:
        return (
            "The following is a question about chemistry. Please answer by responding with the letter of the correct "
            "answer."
        )

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        question = item["examples"][0]["input"].strip()
        target_scores = json.loads(item["examples"][0]["target_scores"])
        choices = "".join([f"{key}. {choice}\n" for key, choice in zip(self.keys, target_scores.keys())])
        return f"Question: {question}\n{choices}"

    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
        ground_truth = self._get_ground_truth(item)
        return f"{self._get_cue_text(item)}{ground_truth}"

    def _get_cue_text(self, item: dict[str, Any]) -> str:
        return "Answer:"

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        target_scores = json.loads(item["examples"][0]["target_scores"])
        correct_answers = [i for i, score in enumerate(target_scores.values()) if score == 1.0]
        assert len(correct_answers) == 1, f"Expected exactly one correct answer, but got {len(correct_answers)}"
        return f" {self.keys[correct_answers[0]]}"

    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
        target_scores = json.loads(item["examples"][0]["target_scores"])
        return [f" {key}" for key in self.keys[: len(target_scores)]]
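
For reference, below is a minimal, self-contained sketch of how a raw item flows through the filtering and answer-extraction logic above. The item dict is hand-made for illustration (it mirrors only the fields the accessors touch, not the full dataset schema), and string.ascii_uppercase stands in for get_n_letters(16); both are assumptions, not part of this module.

import json
import string

# Hypothetical ChemBench-style item, hand-made for illustration only.
item = {
    "metrics": ["multiple_choice_grade"],
    "examples": [
        {
            "input": "Which element has the symbol Na?",
            "target_scores": json.dumps({"Sodium": 1.0, "Nitrogen": 0.0, "Neon": 0.0}),
        }
    ],
}

keys = list(string.ascii_uppercase[:16])  # stand-in for get_n_letters(16)
target_scores = json.loads(item["examples"][0]["target_scores"])

# _load_dataset keeps this item: it is multiple choice and has exactly one
# option scored 1.0.
correct_answers = [i for i, score in enumerate(target_scores.values()) if score == 1.0]
assert len(correct_answers) == 1

# _get_ground_truth then maps the single correct option to its letter.
print(f"Ground truth: {keys[correct_answers[0]]}")  # -> Ground truth: A

Filtering to exactly one correct option is what makes the loglikelihood metrics well-defined here: each completion in _get_possible_completions is a single letter, and accuracy compares the model's most likely letter against that one ground-truth letter.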