Source code for eval_framework.tasks.benchmarks.chembench
import json
from typing import Any
from eval_framework.metrics.loglikelihood.accuracy_loglikelihood import (
    AccuracyLoglikelihood,
    AccuracyNormLoglikelihood,
)
from eval_framework.tasks.base import BaseTask, Language, ResponseType
from eval_framework.tasks.utils import get_n_letters

CHEMBENCH_SUBJECTS = [
    "analytical_chemistry",
    "chemical_preference",
    "general_chemistry",
    "inorganic_chemistry",
    "materials_science",
    "organic_chemistry",
    "physical_chemistry",
    "technical_chemistry",
    "toxicity_and_safety",
]


class ChemBench(BaseTask[str]):
    """ChemBench dataset: https://huggingface.co/datasets/jablonkagroup/ChemBench"""

    NAME = "ChemBench"
    DATASET_PATH = "jablonkagroup/ChemBench"
    SAMPLE_SPLIT = "train"  # Only has train split
    FEWSHOT_SPLIT = "train"  # Only has train split
    RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
    METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood]
    SUBJECTS = CHEMBENCH_SUBJECTS
    LANGUAGE = Language.ENG

    def __init__(self, num_fewshot: int = 0) -> None:
        assert num_fewshot == 0, "Fewshot is not supported for ChemBench"
        super().__init__(num_fewshot)
        self.keys = get_n_letters(16)  # letter labels ("A", "B", ...) for up to 16 answer options

    def _load_dataset(self, subject: str) -> None:
        super()._load_dataset(subject)
        # Keep only the multiple-choice items that have exactly one correct answer
        for split in self.dataset.keys():
            filtered_items = []
            for item in self.dataset[split]:
                if item.get("metrics") == ["multiple_choice_grade"]:
                    target_scores = json.loads(item["examples"][0]["target_scores"])
                    correct_answers = [i for i, score in enumerate(target_scores.values()) if score == 1.0]
                    if len(correct_answers) == 1:
                        filtered_items.append(item)
            self.dataset[split] = filtered_items
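
    # Illustrative note (the item layout below is inferred from how this method
    # parses it, not taken from the dataset card): each item's first example is
    # expected to carry a JSON-encoded mapping from option text to score, e.g.
    #
    #     item["examples"][0]["target_scores"] == '{"NaCl": 1.0, "KCl": 0.0, "LiCl": 0.0}'
    #
    # and only items whose mapping contains exactly one 1.0 survive the filter.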

    def _get_subject_name(self, item: dict[str, Any]) -> str:
        return " ".join(item["subject"].split("_"))

    def _get_initial_prompt_text(self, item: dict[str, Any]) -> str:
        return (
            "The following is a question about chemistry. Please answer by responding with the letter of the correct "
            "answer."
        )

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        question = item["examples"][0]["input"].strip()
        target_scores = json.loads(item["examples"][0]["target_scores"])
        choices = "".join([f"{key}. {choice}\n" for key, choice in zip(self.keys, target_scores.keys())])
        return f"Question: {question}\n{choices}"

    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
        ground_truth = self._get_ground_truth(item)
        return f"{self._get_cue_text(item)}{ground_truth}"

    def _get_cue_text(self, item: dict[str, Any]) -> str:
        return "Answer:"

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        target_scores = json.loads(item["examples"][0]["target_scores"])
        correct_answers = [i for i, score in enumerate(target_scores.values()) if score == 1.0]
        assert len(correct_answers) == 1, f"Expected exactly one correct answer, but got {len(correct_answers)}"
        return f" {self.keys[correct_answers[0]]}"

    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
        target_scores = json.loads(item["examples"][0]["target_scores"])
        return [f" {key}" for key in self.keys[: len(target_scores)]]