Source code for eval_framework.tasks.benchmarks.ifeval
from typing import Any
from eval_framework.metrics.completion.ifeval import IFEvalMetric, IFEvalMetricContext
from eval_framework.tasks.base import NO_SUBJECT, BaseTask, Language, ResponseType


class IFEval(BaseTask[str]):
    """IFEval: Instruction Following Eval (https://arxiv.org/pdf/2311.07911)."""

    NAME = "IFEval"
    DATASET_PATH = "google/IFEval"
    SAMPLE_SPLIT = "train"
    FEWSHOT_SPLIT = "train"
    RESPONSE_TYPE = ResponseType.COMPLETION
    METRICS = [IFEvalMetric]
    SUBJECTS = [NO_SUBJECT]
    LANGUAGE = {NO_SUBJECT: Language.ENG}

    def __init__(self, num_fewshot: int = 0) -> None:
        super().__init__(num_fewshot)
        assert num_fewshot == 0, "IFEval does not support few-shot prompting."

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        return item["prompt"]

    def _get_context(self, item: dict[str, Any]) -> IFEvalMetricContext:
        assert "key" in item, "Expected 'key' in item"
        assert "instruction_id_list" in item, "Expected 'instruction_id_list' in item"
        assert "prompt" in item, "Expected 'prompt' in item"
        assert "kwargs" in item, "Expected 'kwargs' in item"

        # Fix undesired float fields in the dataset: integer-valued fields are
        # stored as floats (e.g. 2.0), so verify they are integral and cast.
        new_kwargs = []
        for d in item["kwargs"]:
            assert all(abs(v - round(v)) < 1e-5 for v in d.values() if isinstance(v, float))
            new_kwargs.append({k: v if not isinstance(v, float) else int(v) for k, v in d.items()})

        # Fix changes to the HF dataset made on Apr 10 2025.
        if item["key"] == 142:
            new_kwargs[2]["relation"] = None
            new_kwargs[2]["frequency"] = None
            new_kwargs[2]["keywords"] = new_kwargs[2]["keyword"]
            del new_kwargs[2]["keyword"]
        if item["key"] == 1512:
            new_kwargs[0]["relation"] = None
        item["kwargs"] = new_kwargs

        return IFEvalMetricContext(
            key=item["key"],
            instruction_id_list=item["instruction_id_list"],
            prompt=item["prompt"],
            additional_kwargs=item["kwargs"],
        )

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        return None

    def _sample_fewshot_examples(self, item: dict[str, Any]) -> list[dict]:
        return []
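

# Illustrative sketch (not part of the original module): the kwargs
# normalization in _get_context coerces integral floats back to ints while
# leaving other values untouched. The field names below are hypothetical
# examples, not taken from the dataset:
#
#     raw = [{"num_words": 300.0, "relation": "at least"}]
#     normalized = [
#         {k: v if not isinstance(v, float) else int(v) for k, v in d.items()}
#         for d in raw
#     ]
#     assert type(normalized[0]["num_words"]) is int  # 300.0 -> 300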


class IFEvalFiSv(IFEval):
    """Machine-translated Finnish and Swedish versions of the Instruction Following Evaluation (IFEval) benchmark."""

    NAME = "IFEval Finnish & Swedish"
    DATASET_PATH = "LumiOpen/ifeval_mt"
    SUBJECTS = ["fi", "sv"]
    LANGUAGE = {"fi": Language.FIN, "sv": Language.SWE}


class IFEvalDe(IFEval):
    """German version of the Instruction Following Evaluation (IFEval) benchmark."""

    NAME = "IFEval German"
    DATASET_PATH = "jzhang86/de_ifeval"
    SUBJECTS = [NO_SUBJECT]
    LANGUAGE = {NO_SUBJECT: Language.DEU}
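

# Usage sketch (illustrative, not part of the original module). It assumes
# only what is visible above: the zero-shot constructor and _get_context's
# required fields. The instruction id and kwargs field names below are
# hypothetical examples of the dataset schema, not verified values:
#
#     task = IFEval()  # num_fewshot defaults to 0; any other value asserts
#     item = {
#         "key": 1,
#         "prompt": "Write a 300+ word summary ...",
#         "instruction_id_list": ["length_constraints:number_words"],
#         "kwargs": [{"num_words": 300.0, "relation": "at least"}],
#     }
#     context = task._get_context(item)  # floats coerced: 300.0 -> 300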