# Source code for eval_framework.metrics.aggregators.aggregators

from typing import Any, Protocol

import numpy as np
import pandas as pd
from scipy.special import comb



[docs]
class Aggregator(Protocol):
    """Structural interface (``typing.Protocol``) for metric aggregators.

    An aggregator reduces the evaluation rows belonging to the same problem (i.e. prompt)
    to a single score per problem. The input DataFrame holds one row per
    (problem, attempt) pair; the output holds one row per problem with a new ``value``.

    Attributes:
        name: Name identifying the aggregator.

    Example input (``identifier_columns=["prompt"]``, 3 attempts per problem):

        | prompt         | value | subject |
        |----------------|-------|---------|
        | "What is 2+2?" |  1.0  | algebra |
        | "What is 2+2?" |  1.0  | algebra |
        | "What is 2+2?" |  0.0  | algebra |
        | "Solve x^2=4"  |  0.0  | algebra |
        | "Solve x^2=4"  |  1.0  | algebra |
        | "Solve x^2=4"  |  0.0  | algebra |
    """

    name: str

    def __call__(
        self,
        response_df: pd.DataFrame,
        identifier_columns: list[str],
        **kwargs: Any,
    ) -> pd.DataFrame:
        """Aggregate per-attempt rows into one row per problem.

        Args:
            response_df: DataFrame where each row is one evaluation attempt. Must contain
                a ``value`` column (the per-attempt score) and all ``identifier_columns``.
            identifier_columns: Columns that uniquely identify a problem (e.g. ``["prompt"]``).
                Rows sharing the same identifier are different attempts at the same problem.
            **kwargs: Extra keyword arguments; their meaning is up to the implementation.

        Returns:
            DataFrame with one row per unique problem and a ``value`` column holding
            the aggregated score. All non-identifier, non-value columns are preserved
            (typically via ``"first"``).
        """
        ...




[docs]
def closed_form_passatk(n: int, c: int, k: int) -> float:
    """Closed-form pass@k estimator (see HumanEval paper).

    pass@k = 1 - C(n-c, k) / C(n, k)

    Given n total samples with c correct, this is the probability that at least one of k
    randomly chosen samples is correct. The ratio C(n-c,k)/C(n,k) is the chance all k picks
    are wrong; subtracting from 1 gives success probability.

    The ratio is evaluated as the equivalent product ``prod(1 - k / i)`` for
    ``i = n-c+1 .. n`` rather than as a quotient of two binomial coefficients. Both
    binomials overflow to ``inf`` in floating point for large ``n``, which would turn
    the result into ``nan``; every factor of the product stays within ``[0, 1]``.

    Args:
        n: Total number of samples for the problem.
        c: Number of correct samples among the ``n``. Must be a whole number
            (an integral float such as ``2.0`` is accepted).
        k: Number of samples drawn.

    Returns:
        The pass@k estimate as a Python float. Special cases:

        * ``n < k`` (fewer samples than draws): ``1.0`` if at least one sample is
          correct, otherwise ``0.0``.
        * ``n - c < k``: there aren't enough wrong samples to fill k slots, so the
          result is trivially ``1.0``.
    """
    if n < k:
        return 1.0 if c > 0 else 0.0
    if n - c < k:
        return 1.0
    # C(n-c, k) / C(n, k) == prod_{i=n-c+1}^{n} (i - k) / i. For c == 0 the range is
    # empty and the product is 1.0, giving a pass rate of 0.0.
    all_wrong_prob = np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
    return float(1.0 - all_wrong_prob)




[docs]
class PassAtK(Aggregator):
    """Aggregates attempts into pass@k: the chance that at least one of k random attempts is correct.

    Rows are grouped by ``identifier_columns``. For every problem the number of correct
    attempts (``c = sum(value)``) and the number of attempts (``n = count(value)``) are
    fed to ``closed_form_passatk`` together with ``k``.

    ``value`` is expected to be binary (0 or 1). For k=1 this is equivalent to the mean.

    Example (k=2, using three attempts per problem with values 1, 1, 0 for
    "What is 2+2?" and 0, 1, 0 for "Solve x^2=4", both with subject "algebra"):
        "What is 2+2?": n=3, c=2, k=2 -> 1.0  (guaranteed correct pick)
        "Solve x^2=4":  n=3, c=1, k=2 -> 0.667 (as computed by the `closed_form_passatk`)

        Output (``groupby`` sorts the group keys by default):
        | prompt         | subject | value |
        |----------------|---------|-------|
        | "Solve x^2=4"  | algebra | 0.667 |
        | "What is 2+2?" | algebra | 1.000 |
    """

    def __init__(self, k: int = 1) -> None:
        self.k = k
        self.name = f"Pass@{k}"

    def __call__(self, response_df: pd.DataFrame, identifier_columns: list[str], **kwargs: Any) -> pd.DataFrame:
        """Return one row per problem with ``value`` replaced by its pass@k score.

        Args:
            response_df: One row per attempt; needs a ``value`` column and all
                ``identifier_columns``.
            identifier_columns: Columns whose combination identifies a problem.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            DataFrame with the identifier columns, the first entry of every other
            column per problem, and the pass@k score in ``value``.
        """
        sum_key = ("value", "sum")
        count_key = ("value", "count")

        # Every column that is neither an identifier nor the score is carried over by
        # taking its first entry per group (the entries are identical within a problem).
        passthrough = [col for col in response_df.columns if not (col in identifier_columns or col == "value")]
        spec: dict[str, Any] = {"value": ["sum", "count"]}
        spec.update(dict.fromkeys(passthrough, "first"))

        grouped = response_df.groupby(identifier_columns).agg(spec)

        # The list-valued spec for `value` makes the result columns a two-level index.
        num_correct = grouped[sum_key].values
        num_attempts = grouped[count_key].values
        pass_rates = np.array(
            [closed_form_passatk(total, correct, self.k) for total, correct in zip(num_attempts, num_correct)]
        )

        result = grouped.drop(columns=[sum_key, count_key])
        if isinstance(result.columns, pd.MultiIndex):
            # Strip the aggregation-name level ("first") so the original column names remain.
            result.columns = result.columns.droplevel(1)
        result = result.assign(value=pass_rates)
        return result.reset_index()




[docs]
class IdentifierMean(Aggregator):
    """Averages ``value`` over all attempts of each problem (arithmetic mean).

    Example (three attempts per problem with values 1, 1, 0 for "What is 2+2?" and
    0, 1, 0 for "Solve x^2=4", both with subject "algebra"):

        "What is 2+2?": mean(1.0, 1.0, 0.0) = 0.667
        "Solve x^2=4":  mean(0.0, 1.0, 0.0) = 0.333

        Output (``groupby`` sorts the group keys by default):
        | prompt         | value | subject |
        |----------------|-------|---------|
        | "Solve x^2=4"  | 0.333 | algebra |
        | "What is 2+2?" | 0.667 | algebra |
    """

    def __init__(self) -> None:
        self.name = "IdentifierMean"

    def __call__(self, response_df: pd.DataFrame, identifier_columns: list[str], **kwargs: Any) -> pd.DataFrame:
        """Return one row per problem with ``value`` replaced by its mean over attempts.

        Args:
            response_df: One row per attempt; needs a ``value`` column and all
                ``identifier_columns``.
            identifier_columns: Columns whose combination identifies a problem.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            DataFrame with the identifier columns, the mean ``value`` and the first
            entry of every other column per problem.
        """
        # Columns that are neither identifiers nor the score are carried over via "first".
        passthrough = [col for col in response_df.columns if not (col in identifier_columns or col == "value")]
        spec = {"value": "mean", **dict.fromkeys(passthrough, "first")}
        per_problem = response_df.groupby(identifier_columns).agg(spec)
        return per_problem.reset_index()




[docs]
class Identity:
    """Pass-through aggregator that performs no aggregation at all.

    Intended for metrics whose rows are already final scores, so nothing has to be
    combined across attempts (e.g. when ``num_samples=1``).
    """

    def __init__(self) -> None:
        self.name = "Identity"

    def __call__(self, response_df: pd.DataFrame, identifier_columns: list[str], **kwargs: Any) -> pd.DataFrame:
        """Return ``response_df`` as is.

        Args:
            response_df: DataFrame of evaluation rows.
            identifier_columns: Not used; present to match the aggregator call signature.
            **kwargs: Not used.

        Returns:
            The very same DataFrame object that was passed in (no copy is made).
        """
        return response_df