"""Task-style helpers and strategy classes for choice-based evaluation tasks.
This module provides injectable styling strategies — ``MCStyle`` and
``ClozeStyle`` — that reduce boilerplate in multiple-choice and cloze
evaluation tasks. Using a task styler is entirely optional; existing tasks that
override ``BaseTask`` methods directly continue to work unchanged.
Quick-start
-----------
A new choice-based task sets ``TASK_STYLER`` on its ``BaseTask`` subclass and
implements **three data-access methods**:
* ``_get_raw_question(item) -> str`` — the bare question string
* ``_get_choices(item) -> list[str]`` — ordered list of answer options
* ``_get_correct_index(item) -> int`` — 0-based index of the correct answer
``BaseTask`` automatically delegates its styling hooks to the task styler.
``RESPONSE_TYPE`` and ``METRICS`` are read from the styler by callers that need
them (e.g. ``EvaluationGenerator``).
.. code-block:: python
class MyTask(BaseTask[str]):
NAME = "MyTask"
DATASET_PATH = "my/dataset"
SAMPLE_SPLIT = "test"
FEWSHOT_SPLIT = "train"
SUBJECTS = ["my_subject"]
PERTURBATION_UNMODIFIABLE_WORDS = ["Question"]
TASK_STYLER = ClozeStyle(question_prefix="Question: ", cue_text="Answer:")
def _get_raw_question(self, item): return item["question"]
def _get_choices(self, item): return item["choices"]
def _get_correct_index(self, item): return item["answer_idx"]
For task families with both MC and Cloze variants, a shared base class holds the
dataset attributes and data-access methods. Variants only differ in ``TASK_STYLER``:
.. code-block:: python
class _ARC_Base(BaseTask[str]):
DATASET_PATH = "allenai/ai2_arc"
...
def _get_raw_question(self, item): return item["question"]
def _get_choices(self, item): return item["choices"]["text"]
def _get_correct_index(self, item): ...
class ARC(_ARC_Base):
NAME = "ARC"
TASK_STYLER = ClozeStyle()
class ARC_MC(_ARC_Base):
NAME = "ARC_MC"
TASK_STYLER = MCStyle(space_prefixed_labels=True)
class ARC_BPB(_ARC_Base):
NAME = "ARC_BPB"
TASK_STYLER = BPBStyle()
"""

import hashlib
import random
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, Any, Self

from eval_framework.metrics.loglikelihood.accuracy_loglikelihood import (
    AccuracyLoglikelihood,
    AccuracyNormLoglikelihood,
)
from eval_framework.metrics.loglikelihood.bits_per_byte import BitsPerByteLoglikelihood
from eval_framework.tasks.base import Language, ResponseType, TaskStyle
# NOTE: format_mc_prompt is called by MCStyle below but was not imported here;
# it is assumed to live in tasks.utils alongside get_n_letters.
from eval_framework.tasks.utils import format_mc_prompt, get_n_letters

if TYPE_CHECKING:
    from eval_framework.metrics.base import BaseMetric

# Default (question_prefix, cue_text) per language; extend for new languages as needed.
_DEFAULT_QUESTION_CUE_TEXT: dict[Language, tuple[str, str]] = {
    Language.ENG: ("Question: ", "Answer:"),
    Language.DEU: ("Frage: ", "Antwort:"),
}

# ---------------------------------------------------------------------------
# Task styler strategy classes
# ---------------------------------------------------------------------------

class TaskStyler(ABC):
    """Strategy object that controls prompt assembly and scoring for choice-based tasks.

    Concrete implementations (``MCStyle``, ``ClozeStyle``, ``BPBStyle``) are
    assigned to a task's ``TASK_STYLER`` class attribute. ``BaseTask`` delegates
    its styling hooks to this object, so task authors only implement data-access
    methods.

    Attributes:
        response_type: The response type the task should use (e.g. LOGLIKELIHOODS).
        metrics: Default metric classes for tasks using this styler.
        task_style: Discriminator for metadata (MULTIPLE_CHOICE, CLOZE, or BPB).
        question_prefix: String prepended to the raw question.
    """

    response_type: ResponseType
    metrics: list[type["BaseMetric"]]
    task_style: TaskStyle
    question_prefix: str

    @abstractmethod
    def get_instruction_text(self, raw_question: str, choices: list[str]) -> str:
        """Build the instruction/prompt text from a question and answer choices."""

    @abstractmethod
    def get_ground_truth(self, choices: list[str], correct_index: int) -> str:
        """Return the ground-truth string for scoring."""

    @abstractmethod
    def get_possible_completions(self, choices: list[str], correct_index: int | None = None) -> list[str]:
        """Return the list of completion strings to be evaluated.

        ``correct_index`` is only required by ``BPBStyle``, which scores solely the
        ground-truth completion. ``MCStyle`` and ``ClozeStyle`` score all choices and
        ignore it; callers may omit it when using those stylers.
        """

    @abstractmethod
    def get_cue_text(self) -> str:
        """Return the assistant cue appended after the prompt (e.g. ``"Answer:"``)."""

    def get_question_text(self, raw_question: str) -> str:
        """Build the full question line (prefix + raw question).

        Override in a subclass for non-standard question formats (e.g. HellaSwag's
        ``"activity: context"`` form).
"""
return f"{self.question_prefix}{raw_question}"

    def get_fewshot_target_text(self, choices: list[str], correct_index: int) -> str:
        """Return the few-shot target string (cue + ground truth)."""
        return f"{self.get_cue_text()}{self.get_ground_truth(choices, correct_index)}"

    @classmethod
    def for_language(cls, language: Language, **kwargs: Any) -> Self:
        """Factory that fills ``question_prefix`` and ``cue_text`` from language defaults.

        Any explicitly passed keyword arguments take precedence over the defaults.
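
        Example (defaults taken from ``_DEFAULT_QUESTION_CUE_TEXT`` above)::

            styler = ClozeStyle.for_language(Language.DEU)
            assert styler.question_prefix == "Frage: "
            assert styler.get_cue_text() == "Antwort:"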
"""
if language in _DEFAULT_QUESTION_CUE_TEXT:
prefix, cue = _DEFAULT_QUESTION_CUE_TEXT[language]
kwargs.setdefault("question_prefix", prefix)
kwargs.setdefault("cue_text", cue)
return cls(**kwargs)


class MCStyle(TaskStyler):
    """Multiple-choice styler: choices shown in prompt, model scored over letter labels.

    Args:
        question_prefix: Prepended to the raw question (default ``"Question: "``).
        cue_text: Assistant cue after the prompt (default ``"Answer:"``).
        space_prefixed_labels: When ``True``, each option line starts with a space
            (``" A. choice"`` — OLMES-style). Default ``False``.

    Assembled prompt example (default settings, 3 choices)::

        "Question: What is the capital of France?\\nA. Berlin\\nB. Paris\\nC. London\\n"

    Scored completions: [" A", " B", " C"]
    Ground truth: " B"
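
    Usage sketch (``format_mc_prompt`` is assumed to render the lettered option
    lines shown above)::

        styler = MCStyle()
        choices = ["Berlin", "Paris", "London"]
        styler.get_possible_completions(choices)  # [" A", " B", " C"]
        styler.get_ground_truth(choices, 1)       # " B"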
"""
response_type = ResponseType.LOGLIKELIHOODS
metrics: list[type["BaseMetric"]] = [
AccuracyLoglikelihood,
AccuracyNormLoglikelihood,
BitsPerByteLoglikelihood,
]
task_style = TaskStyle.MULTIPLE_CHOICE
def __init__(
self,
question_prefix: str = "Question: ",
cue_text: str = "Answer:",
space_prefixed_labels: bool = False,
) -> None:
self.question_prefix = question_prefix
self._cue_text = cue_text
self.space_prefixed_labels = space_prefixed_labels

    def get_cue_text(self) -> str:
        return self._cue_text

    def get_instruction_text(self, raw_question: str, choices: list[str]) -> str:
        return format_mc_prompt(
            self.get_question_text(raw_question),
            choices,
            space_prefixed_labels=self.space_prefixed_labels,
        )

    def get_ground_truth(self, choices: list[str], correct_index: int) -> str:
        labels = get_n_letters(len(choices))
        return f" {labels[correct_index]}"

    def get_possible_completions(self, choices: list[str], correct_index: int | None = None) -> list[str]:
        """Note: ``correct_index`` is ignored by ``MCStyle``; it is only used by ``BPBStyle``."""
        return [f" {label}" for label in get_n_letters(len(choices))]


class ClozeStyle(TaskStyler):
    """Cloze styler: no choices in prompt, model scored over full choice text.

    Also known as "ranked classification" (RC). The prompt shows only the question;
    the model's score for each full answer text determines the prediction.

    Args:
        question_prefix: Prepended to the raw question (default ``"Question: "``).
        cue_text: Assistant cue after the prompt (default ``"Answer:"``).
        trailing_newline: When ``True`` (default), the instruction ends with ``"\\n"``.
            Set to ``False`` for sentence-completion tasks where the
            model should continue a fragment directly.
        leading_space_continuations: When ``True`` (default), each scored
            completion (and the ground truth) is prefixed with a space.

    Assembled prompt example (3 choices)::

        "Question: What is the capital of France?\\n"

    Scored completions: [" Berlin", " Paris", " London"]
    Ground truth: " Paris"

    Sentence-completion example (question_prefix="", cue_text="", trailing_newline=False)::

        "The cat sat on the"

    Scored completions: [" mat", " floor", " sofa"]
    Ground truth: " mat"
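
    Usage sketch (the sentence-completion configuration from the example above)::

        styler = ClozeStyle(question_prefix="", cue_text="", trailing_newline=False)
        choices = ["mat", "floor", "sofa"]
        styler.get_instruction_text("The cat sat on the", choices)  # "The cat sat on the"
        styler.get_possible_completions(choices)  # [" mat", " floor", " sofa"]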
"""
response_type = ResponseType.LOGLIKELIHOODS
metrics: list[type["BaseMetric"]] = [
AccuracyLoglikelihood,
AccuracyNormLoglikelihood,
BitsPerByteLoglikelihood,
]
task_style = TaskStyle.CLOZE
def __init__(
self,
question_prefix: str = "Question: ",
cue_text: str = "Answer:",
trailing_newline: bool = True,
leading_space_continuations: bool = True,
) -> None:
self.question_prefix = question_prefix
self._cue_text = cue_text
self.trailing_newline = trailing_newline
self.leading_space_continuations = leading_space_continuations

    def get_cue_text(self) -> str:
        return self._cue_text

    def get_instruction_text(self, raw_question: str, choices: list[str]) -> str:
        text = self.get_question_text(raw_question)
        return f"{text}\n" if self.trailing_newline else text

    def get_ground_truth(self, choices: list[str], correct_index: int) -> str:
        return f" {choices[correct_index]}" if self.leading_space_continuations else choices[correct_index]

    def get_possible_completions(self, choices: list[str], correct_index: int | None = None) -> list[str]:
        return [f" {c}" for c in choices] if self.leading_space_continuations else list(choices)


class BPBStyle(ClozeStyle):
    """BPB-only styler: prompt identical to ``ClozeStyle``, but scores only the ground-truth completion.

    One LLM forward pass per sample instead of N (one per choice), making evaluation
    significantly faster when accuracy metrics are not needed.

    Args:
        question_prefix: Prepended to the raw question (default ``"Question: "``).
        cue_text: Assistant cue after the prompt (default ``"Answer:"``).
        trailing_newline: When ``True`` (default), the instruction ends with ``"\\n"``.
        leading_space_continuations: When ``True`` (default), the scored completion
            is prefixed with a space.

    Assembled prompt example (3 choices)::

        "Question: What is the capital of France?\\n"

    Scored completions: [" Paris"] ← ground truth only, one forward pass
    Ground truth: " Paris"
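
    Usage sketch::

        styler = BPBStyle()
        styler.get_possible_completions(["Berlin", "Paris", "London"], correct_index=1)
        # -> [" Paris"], a single scored completion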
"""
metrics: list[type["BaseMetric"]] = [BitsPerByteLoglikelihood]
task_style = TaskStyle.BPB

    def get_possible_completions(self, choices: list[str], correct_index: int | None = None) -> list[str]:
        if correct_index is None:
            raise ValueError(
                "BPBStyle evaluates the loglikelihood of the ground truth answer only, "
                "and thus requires the correct index."
            )
        return [f" {choices[correct_index]}"] if self.leading_space_continuations else [choices[correct_index]]

# ---------------------------------------------------------------------------
# Helper functions
# ---------------------------------------------------------------------------

def shuffle_correct_with_distractors(
    correct: str,
    distractors: list[str],
    seed_text: str,
) -> tuple[list[str], int]:
    """Deterministically shuffle distractors + correct answer; return ``(choices, correct_index)``.

    Calling this multiple times with the same arguments always returns the same result.

    Args:
        correct: The correct answer string.
        distractors: The distractor strings, e.g. wrong answer choices.
        seed_text: Text used as the shuffle seed (typically question + answer).

    Returns:
        A tuple ``(shuffled_choices, correct_index)`` where ``correct_index``
        is the 0-based position of ``correct`` in ``shuffled_choices``.
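
    Example (the shuffled order depends on the SHA-256 digest of ``seed_text``,
    but it is stable across runs)::

        a = shuffle_correct_with_distractors("Paris", ["Berlin", "London"], "q1")
        b = shuffle_correct_with_distractors("Paris", ["Berlin", "London"], "q1")
        assert a == b                 # same inputs, same output
        assert a[0][a[1]] == "Paris"  # correct_index locates the correct answer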
"""
choices = [*distractors, correct]
seed = int(hashlib.sha256(seed_text.encode()).hexdigest(), 16)
rng = random.Random(seed)
order = list(range(len(choices)))
rng.shuffle(order)
shuffled = [choices[i] for i in order]
correct_index = order.index(len(choices) - 1)
return shuffled, correct_index

def answer_key_to_index(key: str) -> int:
    """Convert a letter or 1-based integer answer key to a 0-based index.

    Datasets sometimes encode the correct answer as a letter ("A", "B", ...)
    and sometimes as a 1-based integer string ("1", "2", ...). This function
    normalises both to a 0-based index so task code doesn't need to branch.

    Args:
        key: A single-character string. Either a letter ("A"-"Z") or a digit ("1"-"9").

    Returns:
        0-based index: "A" or "1" → 0, "B" or "2" → 1, etc.
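
    Examples::

        >>> answer_key_to_index("A")
        0
        >>> answer_key_to_index("c")
        2
        >>> answer_key_to_index("3")
        2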
"""
if key.isdigit():
return int(key) - 1 # Shift 1-based integer by 1.
return ord(key.upper()) - ord("A") # Turn letter to 0-based index.