Source code for eval_framework.tasks.benchmarks.zero_scrolls
import re
from typing import Any
from eval_framework.metrics.completion.exponential_similarity import ExponentialSimilarity
from eval_framework.metrics.completion.f1 import F1
from eval_framework.metrics.completion.rouge_geometric_mean import ROUGE_GEOMETRIC_MEAN
from eval_framework.metrics.loglikelihood.accuracy_loglikelihood import (
AccuracyLoglikelihood,
)
from eval_framework.tasks.base import BaseTask, Language, ResponseType, Sample
from eval_framework.tasks.utils import get_n_letters
[docs]
class ZERO_SCROLLS_QUALITY(BaseTask[str]):
    """QuALITY subset of ZeroSCROLLS, evaluated via log-likelihoods of answer keys.

    Dataset: https://huggingface.co/datasets/tau/zero_scrolls

    The prompt is the dataset ``input`` cut at ``query_end_index`` followed by
    the cue ``"Answer:"``; each candidate completion is a single answer key
    preceded by a space.
    """

    NAME = "ZeroSCROLLS QuALITY"
    LANGUAGE = Language.ENG
    DATASET_PATH = "tau/zero_scrolls"
    SUBJECTS = ["quality"]
    # Few-shot examples are rejected in __init__, so FEWSHOT_SPLIT simply
    # mirrors SAMPLE_SPLIT.
    SAMPLE_SPLIT = "validation"
    FEWSHOT_SPLIT = "validation"
    RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
    METRICS = [AccuracyLoglikelihood]
    PERTURBATION_UNMODIFIABLE_WORDS = ["Answer"]

    def __init__(self, num_fewshot: int = 0) -> None:
        """Set up the task; only ``num_fewshot == 0`` is accepted."""
        assert num_fewshot == 0, "ZeroSCROLLS QuALITY only supports zero fewshot examples"
        super().__init__(num_fewshot)
        # Four answer keys; presumably the letters A-D -- confirm against
        # get_n_letters.
        self.keys = get_n_letters(4)

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return ``item["input"]`` truncated at ``query_end_index`` plus a blank line."""
        cutoff = item["query_end_index"]
        prompt_prefix = item["input"][:cutoff]
        return f"{prompt_prefix}\n\n"

    def _get_cue_text(self, item: dict[str, Any]) -> str:
        """Return the fixed cue that precedes the answer key."""
        return "Answer:"

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        """Return the reference output with a leading space, matching the completions' form."""
        reference = item["output"]
        return f" {reference}"

    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
        """Return every answer key, each prefixed with a single space."""
        return [f" {letter}" for letter in self.keys]
[docs]
class ZERO_SCROLLS_COMPLETION(BaseTask[str]):
    """Shared configuration for the free-text completion ZeroSCROLLS tasks.

    Dataset: https://huggingface.co/datasets/tau/zero_scrolls
    """

    RESPONSE_TYPE = ResponseType.COMPLETION
    DATASET_PATH = "tau/zero_scrolls"
    # Samples and few-shot examples are drawn from the same split.
    SAMPLE_SPLIT = "validation"
    FEWSHOT_SPLIT = "validation"

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        """Return the dataset's ``output`` field unchanged as the reference."""
        reference = item["output"]
        return reference
[docs]
class ZERO_SCROLLS_GOV_REPORT(ZERO_SCROLLS_COMPLETION):
    """GovReport subset of ZeroSCROLLS, scored with ROUGE_GEOMETRIC_MEAN."""

    NAME = "ZeroSCROLLS GovReport"
    SUBJECTS = ["gov_report"]
    METRICS = [ROUGE_GEOMETRIC_MEAN]
    PERTURBATION_UNMODIFIABLE_WORDS = ["Summary"]

    def __init__(self, num_fewshot: int = 0) -> None:
        """Set up the task; only ``num_fewshot == 0`` is accepted."""
        assert num_fewshot == 0, "ZeroSCROLLS GovReport only supports zero fewshot examples"
        super().__init__(num_fewshot)

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return ``item["input"]`` truncated at ``query_end_index`` with ``"Summary:"`` appended."""
        cutoff = item["query_end_index"]
        prompt_prefix = item["input"][:cutoff]
        # No separator is inserted here, unlike the "\n\nAnswer:" tasks.
        return f"{prompt_prefix}Summary:"
[docs]
class ZERO_SCROLLS_QMSUM(ZERO_SCROLLS_COMPLETION):
    """QMSum subset of ZeroSCROLLS, scored with ROUGE_GEOMETRIC_MEAN."""

    NAME = "ZeroSCROLLS QMSum"
    SUBJECTS = ["qmsum"]
    METRICS = [ROUGE_GEOMETRIC_MEAN]
    PERTURBATION_UNMODIFIABLE_WORDS = ["Answer"]

    def __init__(self, num_fewshot: int = 0) -> None:
        """Set up the task; only ``num_fewshot == 0`` is accepted."""
        assert num_fewshot == 0, "ZeroSCROLLS QMSum only supports zero fewshot examples"
        super().__init__(num_fewshot)

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return ``item["input"]`` truncated at ``query_end_index``, a blank line, then ``"Answer:"``."""
        cutoff = item["query_end_index"]
        prompt_prefix = item["input"][:cutoff]
        return f"{prompt_prefix}\n\nAnswer:"
[docs]
class ZERO_SCROLLS_SQUALITY(ZERO_SCROLLS_COMPLETION):
    """SQuALITY subset of ZeroSCROLLS, scored with ROUGE_GEOMETRIC_MEAN."""

    NAME = "ZeroSCROLLS SQuALITY"
    SUBJECTS = ["squality"]
    METRICS = [ROUGE_GEOMETRIC_MEAN]
    PERTURBATION_UNMODIFIABLE_WORDS = ["Answer"]

    def __init__(self, num_fewshot: int = 0) -> None:
        """Set up the task; only ``num_fewshot == 0`` is accepted."""
        assert num_fewshot == 0, "ZeroSCROLLS SQuALITY only supports zero fewshot examples"
        super().__init__(num_fewshot)

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return ``item["input"]`` truncated at ``query_end_index``, a blank line, then ``"Answer:"``."""
        cutoff = item["query_end_index"]
        prompt_prefix = item["input"][:cutoff]
        return f"{prompt_prefix}\n\nAnswer:"
[docs]
class ZERO_SCROLLS_QASPER(ZERO_SCROLLS_COMPLETION):
    """Qasper subset of ZeroSCROLLS, scored with F1."""

    NAME = "ZeroSCROLLS Qasper"
    SUBJECTS = ["qasper"]
    METRICS = [F1]
    PERTURBATION_UNMODIFIABLE_WORDS = ["Answer"]

    def __init__(self, num_fewshot: int = 0) -> None:
        """Set up the task; only ``num_fewshot == 0`` is accepted."""
        assert num_fewshot == 0, "ZeroSCROLLS Qasper only supports zero fewshot examples"
        super().__init__(num_fewshot)

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return ``item["input"]`` truncated at ``query_end_index``, a blank line, then ``"Answer:"``."""
        cutoff = item["query_end_index"]
        prompt_prefix = item["input"][:cutoff]
        return f"{prompt_prefix}\n\nAnswer:"
[docs]
class ZERO_SCROLLS_NARRATIVEQA(ZERO_SCROLLS_COMPLETION):
    """NarrativeQA subset of ZeroSCROLLS, scored with F1."""

    NAME = "ZeroSCROLLS NarrativeQA"
    SUBJECTS = ["narrative_qa"]
    METRICS = [F1]
    PERTURBATION_UNMODIFIABLE_WORDS = ["Answer"]

    def __init__(self, num_fewshot: int = 0) -> None:
        """Set up the task; only ``num_fewshot == 0`` is accepted."""
        assert num_fewshot == 0, "ZeroSCROLLS NarrativeQA only supports zero fewshot examples"
        super().__init__(num_fewshot)

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return ``item["input"]`` truncated at ``query_end_index``, a blank line, then ``"Answer:"``."""
        cutoff = item["query_end_index"]
        prompt_prefix = item["input"][:cutoff]
        return f"{prompt_prefix}\n\nAnswer:"
[docs]
class ZERO_SCROLLS_MUSIQUE(ZERO_SCROLLS_COMPLETION):
    """MuSiQue subset of ZeroSCROLLS, scored with F1."""

    NAME = "ZeroSCROLLS MuSiQue"
    SUBJECTS = ["musique"]
    METRICS = [F1]
    PERTURBATION_UNMODIFIABLE_WORDS = ["Answer"]

    def __init__(self, num_fewshot: int = 0) -> None:
        """Set up the task; only ``num_fewshot == 0`` is accepted."""
        assert num_fewshot == 0, "ZeroSCROLLS MuSiQue only supports zero fewshot examples"
        super().__init__(num_fewshot)

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return ``item["input"]`` truncated at ``query_end_index``, a blank line, then ``"Answer:"``."""
        cutoff = item["query_end_index"]
        prompt_prefix = item["input"][:cutoff]
        return f"{prompt_prefix}\n\nAnswer:"
[docs]
class ZERO_SCROLLS_SPACE_DIGEST(ZERO_SCROLLS_COMPLETION):
    """SpaceDigest subset of ZeroSCROLLS, scored with ExponentialSimilarity.

    Both the generated completion and the reference ``output`` are reduced to
    a bare number string by ``post_process_generated_completion``.
    """

    NAME = "ZeroSCROLLS SpaceDigest"
    SUBJECTS = ["space_digest"]
    METRICS = [ExponentialSimilarity]
    PERTURBATION_UNMODIFIABLE_WORDS = ["Answer"]

    def __init__(self, num_fewshot: int = 0) -> None:
        """Set up the task; only ``num_fewshot == 0`` is accepted."""
        assert num_fewshot == 0, "ZeroSCROLLS SpaceDigest only supports zero fewshot examples"
        super().__init__(num_fewshot)

    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
        """Extract the numeric answer from ``completion_text``.

        Percentage-style phrasings are tried first, in priority order; the
        first pattern that matches anywhere in the text (case-insensitively)
        wins. Failing that, a text that is nothing but a number is returned
        stripped, then the first number found anywhere, and finally the
        stripped text itself.

        Args:
            completion_text: Raw text to extract a number from.
            sample: Not used by this implementation.

        Returns:
            The extracted number as a string (without any percent sign), or
            the stripped input when it contains no number.
        """
        # Ordered from most to least specific; the order decides which number
        # is picked when several patterns could match.
        prioritized_patterns = (
            r"(\d+(?:\.\d+)?)%",  # 30%, 30.5%
            r"(\d+(?:\.\d+)?)\s*percent",  # 30 percent, 30.5 percent
            r"(\d+(?:\.\d+)?)\s*percentage",  # 30 percentage, 30.5 percentage
            r"percentage\s*(?:is|of|:)?\s*(\d+(?:\.\d+)?)",  # percentage is 30, percentage: 30.5
            # is 30%, equals 30.5%
            r"(?:is|equals|equal to|about|approximately|around|roughly)\s*(\d+(?:\.\d+)?)\s*%",
            # is 30 percent
            r"(?:is|equals|equal to|about|approximately|around|roughly)\s*(\d+(?:\.\d+)?)\s*percent",
            r"it'?s\s*(\d+(?:\.\d+)?)",  # it's 60, its 60
            r"that'?s\s*(\d+(?:\.\d+)?)",  # that's 60, thats 60
        )

        # Lazily evaluate the searches so later patterns are only tried when
        # every earlier one has failed.
        attempts = (re.search(regex, completion_text, re.IGNORECASE) for regex in prioritized_patterns)
        first_hit = next((hit for hit in attempts if hit), None)
        if first_hit is not None:
            return first_hit.group(1).strip()

        stripped_text = completion_text.strip()

        # The whole text is just a number (optionally padded with whitespace).
        if re.fullmatch(r"\s*(\d+(?:\.\d+)?)\s*", completion_text) is not None:
            return stripped_text

        # Last resort: take the first number appearing anywhere in the text.
        # This is a fallback and might be less accurate.
        any_number = re.search(r"(\d+(?:\.\d+)?)", completion_text)
        if any_number is not None:
            return any_number.group(1).strip()

        # No number at all: hand back the stripped original text.
        return stripped_text

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return ``item["input"]`` truncated at ``query_end_index`` with ``"Answer:"`` appended."""
        cutoff = item["query_end_index"]
        prompt_prefix = item["input"][:cutoff]
        # No separator is inserted before the cue for this task.
        return f"{prompt_prefix}Answer:"

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        """Return the reference ``output`` passed through the same number extraction as completions."""
        reference = item["output"]
        return self.post_process_generated_completion(reference)