Source code for eval_framework.tasks.benchmarks.aidanbench

from collections.abc import Sequence
from typing import TYPE_CHECKING, Any, Union

from eval_framework.metrics.completion.aidanbench import AidanBenchMetric
from eval_framework.metrics.llm.graders.coherence_grader import CoherenceGrader
from eval_framework.metrics.llm.graders.language import Language as LLMLanguage
from eval_framework.shared.types import Completion
from eval_framework.tasks.base import NO_SUBJECT, BaseTask, ResponseType, Sample
from eval_framework.tasks.base import Language as TaskLanguage
from eval_framework.utils.helpers import pairwise_cosine_similarity
from template_formatting.formatter import Message, Role

if TYPE_CHECKING:
    from eval_framework.llm.base import BaseLLM
    from eval_framework.shared.types import Error


COHERENCE_THRESHOLD = 15
NOVELTY_THRESHOLD = 0.15


class AidanBenchOriginal(BaseTask[str]):
    """AidanBench (https://openreview.net/pdf?id=fz969ahcvJ)."""

    NAME = "AidanBench"
    DATASET_PATH = "Aleph-Alpha-Research/aidanbench"
    SAMPLE_SPLIT = "train"
    FEWSHOT_SPLIT = "train"
    RESPONSE_TYPE = ResponseType.COMPLETION
    METRICS = [AidanBenchMetric]
    SUBJECTS = [NO_SUBJECT]
    LANGUAGE = {NO_SUBJECT: TaskLanguage.ENG}

    def __init__(self, num_fewshot: int = 0) -> None:
        from eval_framework.llm.openai import OpenAIEmbeddingModel, OpenAIModel

        super().__init__(num_fewshot)
        assert num_fewshot == 0, "AidanBench does not support few-shot prompting."
        self._coherence_grader = CoherenceGrader(grading_model=OpenAIModel(model_name="gpt-4o-mini"))
        self._embedding_model = OpenAIEmbeddingModel()

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        item_prompt = item["prompt"]
        # Note the extra dot after the colon. We take this from the original AidanBench code:
        # https://github.com/aidanmclaughlin/AidanBench/blob/a6bb3253ff630c82e7adbc81ce7bc7184c5bd881/benchmark/prompts.py#L7  # noqa: E501
        base_prompt = (
            "Answer the following question:.\n"
            "<question>" + item_prompt + "</question>\n"
            "Provide your answer in <answer></answer> XML tags.\n"
        )
        base_prompt += (
            "Your response should be one direct answer. "
            "Only provide one answer. DO NOT list multiple answers. Please try to be concise.\n"
        )
        return base_prompt

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        return None

    def _calculate_novelty_score(self, messages: list[Message]) -> float:
        assert messages[0].role == Role.USER
        assert all(msg.role != Role.USER for msg in messages[1:]), "Only the first message should be from USER"
        messages_without_instruction_ = messages[1:]
        messages_without_instruction: list[Sequence[Message]] = [
            [m] for m in messages_without_instruction_
        ]  # input format for embedding model
        if len(messages_without_instruction) == 1:
            return 1.0  # if there's only one response, it's by definition novel

        all_embeddings = self._embedding_model.generate_embeddings(messages_without_instruction)
        new_embedding = all_embeddings[-1]
        previous_embeddings = all_embeddings[:-1]

        similarities = pairwise_cosine_similarity([new_embedding], previous_embeddings)
        assert len(similarities) == 1
        similarities_squeezed = similarities[0]  # "squeeze"
        assert len(similarities_squeezed) == len(previous_embeddings)
        return 1 - max(similarities_squeezed)

    def _sample_fewshot_examples(self, item: dict[str, Any]) -> list[dict]:
        return []

    def _fuse_messages(self, messages: list[Message]) -> list[Message]:
        """
        Takes a list of messages and fuses them into a single message: a USER message that also contains all
        previous model responses, wrapped for the next iterative generation step.
        """
        assert len(messages) >= 2, "There must be at least one USER and one ASSISTANT message"
        assert messages[0].role == Role.USER
        assert all(msg.role == Role.ASSISTANT for msg in messages[1:]), (
            "All messages after the first must be from the ASSISTANT"
        )

        instruction_message = messages[0].content
        previous_answers = [msg.content for msg in messages[1:]]
        previous_answers_str = "\n\n".join(
            [
                f"<previous_answer id='{i + 1}'>\n{answer}\n</previous_answer>"
                for i, answer in enumerate(previous_answers)
            ]
        )
        instruction_message += (
            "IMPORTANT: Provide an answer you *HAVE NOT* given previously.\n"
            "Your previous answers are inside of <previous_answers></previous_answers> XML tags.\n"
            "<previous_answers>\n" + previous_answers_str + "\n</previous_answers>"
        )
        return [Message(role=Role.USER, content=instruction_message)]

    def _generation_loop(
        self,
        llm: "BaseLLM",
        stop_sequences: list[str] | None,
        max_tokens: int | None,
        initial_samples: list[Sample],
    ) -> tuple[list[list[Message]], list[Union["Error", None]]]:
        initial_messages = [s.messages for s in initial_samples]
        samples = [(s, False) for s in initial_samples]  # (sample, is_done)
        message_history = [msg for msg in initial_messages]  # to keep track of all iterative model responses
        errors: list[Error | None] = [None for _ in message_history]

        while not all(is_done for _, is_done in samples):  # iterative generation loop
            not_done_idx = [i for i, (_, is_done) in enumerate(samples) if not is_done]
            new_completions = super().generate_completions(
                llm,
                [samples[i][0] for i in not_done_idx],
                stop_sequences=stop_sequences,
                max_tokens=max_tokens,
            )
            new_completion_messages: list[list[Message] | None] = [c.messages for c in new_completions]
            new_errors = [c.error for c in new_completions]

            new_samples = [s for s in samples]
            for idx, completion_msgs, error in zip(not_done_idx, new_completion_messages, new_errors):
                old_sample = samples[idx][0]
                if completion_msgs is not None:
                    message_history[idx].append(completion_msgs[-1])  # add latest model response to history
                    errors[idx] = error
                    assert completion_msgs[0].role == Role.USER and completion_msgs[-1].role == Role.ASSISTANT
                    coherence_score = self._coherence_grader.grade(
                        instruction=old_sample.messages[0].content,  # only pass initial instruction
                        completion=completion_msgs[-1].content,
                        language=LLMLanguage(iso_639_1="en"),
                    ).coherence_score
                else:
                    coherence_score = 0  # if no completion, treat as non-coherent

                novelty_score = self._calculate_novelty_score(message_history[idx])
                fused_message = self._fuse_messages(message_history[idx])
                new_sample = Sample(
                    id=old_sample.id,
                    subject=old_sample.subject,
                    ground_truth=old_sample.ground_truth,
                    messages=fused_message,
                    context=old_sample.context,
                    possible_completions=old_sample.possible_completions,
                )
                if coherence_score < COHERENCE_THRESHOLD or novelty_score < NOVELTY_THRESHOLD:
                    # Fail! Stop generating
                    new_samples[idx] = (new_sample, True)
                else:
                    # Continue generating
                    new_samples[idx] = (new_sample, False)
            samples = new_samples

        return message_history, errors
    def generate_completions(
        self,
        llm: "BaseLLM",
        samples: list[Sample],
        stop_sequences: list[str] | None = None,
        max_tokens: int | None = None,
    ) -> list[Completion]:
        assert all(len(s.messages) == 1 and s.messages[0].role == Role.USER for s in samples), (
            "Each sample must have exactly one USER message."
        )

        all_message_histories, errors = self._generation_loop(llm, stop_sequences, max_tokens, samples)

        completion_list = []
        for idx, sample in enumerate(samples):
            messages = all_message_histories[idx]
            error = errors[idx]
            completion_list.append(
                Completion(
                    id=sample.id,
                    subject=sample.subject,
                    ground_truth=sample.ground_truth,
                    prompt=sample.messages[0].content,
                    prompt_sequence_positions=None,
                    concat_compression=None,
                    messages=messages,
                    completion="".join([msg.content for msg in messages if msg.role == Role.ASSISTANT]),
                    raw_completion="".join([msg.content for msg in messages if msg.role == Role.ASSISTANT]),
                    raw_completion_sequence_positions=None,
                    context=sample.context,
                    error=error,
                )
            )
        return completion_list
class AidanBench(AidanBenchOriginal):
    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        item_prompt = item["prompt"]
        # We correct the prompt here by removing the extra dot after the colon.
        base_prompt = (
            "Answer the following question:\n"
            "<question>" + item_prompt + "</question>\n"
            "Provide your answer in <answer></answer> XML tags.\n"
        )
        base_prompt += (
            "Your response should be one direct answer. "
            "Only provide one answer. DO NOT list multiple answers. Please try to be concise.\n"
        )
        return base_prompt
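
A minimal usage sketch (not part of the module source): it shows how the task could be driven directly, assuming an OpenAI API key is configured (the constructor builds the grader and embedding model), that Sample accepts the keyword arguments used in _generation_loop above with None for the optional fields, and that my_llm is a hypothetical placeholder for any concrete BaseLLM implementation.

    from eval_framework.tasks.base import NO_SUBJECT, Sample
    from eval_framework.tasks.benchmarks.aidanbench import AidanBench
    from template_formatting.formatter import Message, Role

    task = AidanBench()  # requires OpenAI access for the coherence grader and embeddings
    sample = Sample(
        id=0,  # illustrative id; the framework normally constructs samples from the dataset
        subject=NO_SUBJECT,
        ground_truth=None,
        messages=[Message(role=Role.USER, content=task._get_instruction_text({"prompt": "Name a color."}))],
        context=None,
        possible_completions=None,
    )
    completions = task.generate_completions(my_llm, [sample])  # my_llm: hypothetical BaseLLM instance
    print(completions[0].completion)  # concatenation of all answers produced before the loop stopped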