Source code for eval_framework.tasks.benchmarks.pawsx
from typing import Any
from eval_framework.metrics.completion.accuracy_completion import AccuracyCompletion
from eval_framework.tasks.base import BaseTask, Language, ResponseType, Sample
class PAWSX(BaseTask[str]):
"""PAWSX dataset: https://huggingface.co/datasets/google-research-datasets/paws-x
used in the way suggested in PARAPHRASUS benchmark (https://arxiv.org/pdf/2409.12060)."""
NAME = "PAWS-X"
DATASET_PATH = "google-research-datasets/paws-x"
SAMPLE_SPLIT = "test"
FEWSHOT_SPLIT = "validation"
    RESPONSE_TYPE = ResponseType.COMPLETION  # LOGLIKELIHOODS would also make sense, but we stay true to PARAPHRASUS
METRICS = [AccuracyCompletion]
    SUBJECTS = ["en", "de"]  # ["es", "fr", "ja", "ko", "zh"] are disabled for the time being as not currently needed
PERTURBATION_UNMODIFIABLE_WORDS = ["Ja", "Nein", "Paraphrasen", "Yes", "No", "paraphrases"]
LANGUAGE = {"en": Language.ENG, "de": Language.DEU}
    def __init__(self, num_fewshot: int = 0) -> None:
        # BaseTask is expected to handle the shared setup, including self.rnd and self.dataset used below.
        super().__init__(num_fewshot=num_fewshot)
def _get_instruction_text(self, item: dict[str, Any]) -> str:
        # PARAPHRASUS appears to use the English prompt for all languages, which is questionable; we localize the prompt instead.
match item["subject"]:
case "de":
return (
"Sind die folgenden Sätze Paraphrasen?\n"
f"Satz 1: {item['sentence1']}\n"
f"Satz 2: {item['sentence2']}\n"
"Antworte mit 'Ja' oder 'Nein'.\n"
)
case _:
                # Translate to other languages as needed.
return (
"Are the following sentences paraphrases?\n"
f"Sentence 1: {item['sentence1']}\n"
f"Sentence 2: {item['sentence2']}\n"
"Answer with 'Yes' or 'No'.\n"
)
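    # For illustration, an English item with (hypothetical) sentences "The house is big."
    # and "The building is large." would be rendered as:
    #   Are the following sentences paraphrases?
    #   Sentence 1: The house is big.
    #   Sentence 2: The building is large.
    #   Answer with 'Yes' or 'No'.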
    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        # In paws-x, the integer label 1 marks a paraphrase pair and 0 a non-paraphrase pair.
        match item["subject"]:
            case "de":
                return "Ja" if item["label"] == 1 else "Nein"
            case _:
                # Translate to other languages as needed.
                return "Yes" if item["label"] == 1 else "No"
def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
return completion_text.strip().strip("\"'.")
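    # For illustration, post-processing maps ' "Yes". ' -> 'Yes' and "'Nein'\n" -> 'Nein':
    # leading/trailing whitespace is stripped first, then surrounding quotes and periods.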
def _sample_fewshot_examples(self, item: dict[str, Any]) -> list[dict]:
        # Note that this, together with BaseTask._get_messages(), produces a different prompt
        # structure than the one PARAPHRASUS suggests in Figure 4, but both approaches seem valid.
examples: list[dict] = []
        for _ in range(1000):  # bounded number of attempts, in case suitable examples are scarce
            if len(examples) >= self.num_fewshot:
                break
            example = self.rnd.choice(self.dataset[self.FEWSHOT_SPLIT])
            # Alternate labels so that half of the examples are negative and half positive,
            # and skip duplicates.
            if example["label"] == (len(examples) % 2) and example not in examples:
                examples.append(example)
        return examples
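
# --- Usage sketch (illustrative only, not part of the benchmark definition) ---
# Exercises the prompt and ground-truth helpers on a hand-written item that mirrors
# the paws-x schema ("sentence1", "sentence2", integer "label") plus the framework's
# "subject" key. Instantiating PAWSX may trigger dataset loading in BaseTask,
# depending on how the base class is implemented.
if __name__ == "__main__":
    demo_item = {
        "subject": "de",
        "sentence1": "Das Haus ist groß.",
        "sentence2": "Das Gebäude ist groß.",
        "label": 1,  # 1 == paraphrase pair in paws-x
    }
    task = PAWSX(num_fewshot=0)
    print(task._get_instruction_text(demo_item))
    print("Ground truth:", task._get_ground_truth(demo_item))  # expected: "Ja"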