Source code for eval_framework.tasks.benchmarks.pawsx
from typing import Any
from eval_framework.metrics.completion.accuracy_completion import AccuracyCompletion
from eval_framework.tasks.base import BaseTask, Language, ResponseType, Sample
class PAWSX(BaseTask[str]):
"""PAWSX dataset: https://huggingface.co/datasets/google-research-datasets/paws-x
used in the way suggested in PARAPHRASUS benchmark (https://arxiv.org/pdf/2409.12060)."""
NAME = "PAWS-X"
DATASET_PATH = "google-research-datasets/paws-x"
SAMPLE_SPLIT = "test"
FEWSHOT_SPLIT = "validation"
    RESPONSE_TYPE = ResponseType.COMPLETION  # LOGLIKELIHOODS would also make sense, but we stay true to PARAPHRASUS
METRICS = [AccuracyCompletion]
    SUBJECTS = ["en", "de"]  # ["es", "fr", "ja", "ko", "zh"] are disabled for the time being as not currently needed
PERTURBATION_UNMODIFIABLE_WORDS = ["Ja", "Nein", "Paraphrasen", "Yes", "No", "paraphrases"]
LANGUAGE = {"en": Language.ENG, "de": Language.DEU}
    def __init__(self, num_fewshot: int = 0) -> None:
        # BaseTask is expected to handle the shared setup, including self.rnd and self.dataset used below.
        super().__init__(num_fewshot=num_fewshot)
def _get_instruction_text(self, item: dict[str, Any]) -> str:
        # PARAPHRASUS appears to use the English prompt for all languages, which is questionable; we localize the prompt instead.
match item["subject"]:
case "de":
return (
"Sind die folgenden Sätze Paraphrasen?\n"
f"Satz 1: {item['sentence1']}\n"
f"Satz 2: {item['sentence2']}\n"
"Antworte mit 'Ja' oder 'Nein'.\n"
)
case _:
                # Translate to other languages as needed.
return (
"Are the following sentences paraphrases?\n"
f"Sentence 1: {item['sentence1']}\n"
f"Sentence 2: {item['sentence2']}\n"
"Answer with 'Yes' or 'No'.\n"
)
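    # For illustration, an English item with (hypothetical) sentences "The house is big."
    # and "The building is large." would be rendered as:
    #   Are the following sentences paraphrases?
    #   Sentence 1: The house is big.
    #   Sentence 2: The building is large.
    #   Answer with 'Yes' or 'No'.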
    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        # In paws-x, the integer label 1 marks a paraphrase pair and 0 a non-paraphrase pair.
        match item["subject"]:
            case "de":
                return "Ja" if item["label"] == 1 else "Nein"
            case _:
                # Translate to other languages as needed.
                return "Yes" if item["label"] == 1 else "No"
def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
return completion_text.strip().strip("\"'.")
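    # For illustration, post-processing maps ' "Yes". ' -> 'Yes' and "'Nein'\n" -> 'Nein':
    # leading/trailing whitespace is stripped first, then surrounding quotes and periods.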
def _sample_fewshot_examples(self, item: dict[str, Any]) -> list[dict]:
        # Note that this, together with BaseTask._get_messages(), produces a different prompt
        # structure than the one PARAPHRASUS suggests in Figure 4, but both approaches seem valid.
examples: list[dict] = []
        for _ in range(1000):  # bounded number of attempts, in case suitable examples are scarce
            if len(examples) >= self.num_fewshot:
                break
            example = self.rnd.choice(self.dataset[self.FEWSHOT_SPLIT])
            # Alternate labels so that half of the examples are negative and half positive,
            # and skip duplicates.
            if example["label"] == (len(examples) % 2) and example not in examples:
                examples.append(example)
        return examples
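
# --- Usage sketch (illustrative only, not part of the benchmark definition) ---
# Exercises the prompt and ground-truth helpers on a hand-written item that mirrors
# the paws-x schema ("sentence1", "sentence2", integer "label") plus the framework's
# "subject" key. Instantiating PAWSX may trigger dataset loading in BaseTask,
# depending on how the base class is implemented.
if __name__ == "__main__":
    demo_item = {
        "subject": "de",
        "sentence1": "Das Haus ist groß.",
        "sentence2": "Das Gebäude ist groß.",
        "label": 1,  # 1 == paraphrase pair in paws-x
    }
    task = PAWSX(num_fewshot=0)
    print(task._get_instruction_text(demo_item))
    print("Ground truth:", task._get_ground_truth(demo_item))  # expected: "Ja"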