Source code for eval_framework.tasks.benchmarks.goldenswag
from typing import Any
from eval_framework.metrics.loglikelihood.accuracy_loglikelihood import (
    AccuracyLoglikelihood,
    AccuracyNormLoglikelihood,
)
from eval_framework.metrics.loglikelihood.confidence_weighted_accuracy import ConfidenceWeightedAccuracy
from eval_framework.metrics.loglikelihood.dcs import DistributionalCorrectnessScore
from eval_framework.metrics.loglikelihood.ternary import TernaryScore
from eval_framework.tasks.benchmarks.hellaswag import HELLASWAG

class GOLDENSWAG(HELLASWAG):
    """GoldenSwag dataset: https://huggingface.co/datasets/PleIAs/GoldenSwag

    Available dataset splits: validation.
    """

    NAME = "GoldenSwag"
    DATASET_PATH = "PleIAs/GoldenSwag"
    SAMPLE_SPLIT = "validation"
    FEWSHOT_SPLIT = "validation"

class GOLDENSWAG_IDK(GOLDENSWAG):
    """GoldenSwag variant that adds an explicit "I do not know." completion,
    scored with confidence-aware metrics that can penalise wrong answers.
    """

    NAME = "GoldenSwag_IDK"
    METRICS = [
        AccuracyLoglikelihood,
        AccuracyNormLoglikelihood,
        ConfidenceWeightedAccuracy,
        DistributionalCorrectnessScore,
        TernaryScore,
    ]

    def _get_initial_prompt_text(self, item: dict[str, Any]) -> str:
        return (
            "Complete the sentence only if you are confident, since mistakes may be penalised, while correct "
            "completions receive points. It is acceptable to answer with 'I do not know' if you are unsure, "
            "and you will receive 0 points."
        )

    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
        # Append the explicit abstention option to the standard completions.
        completions = super()._get_possible_completions(item)
        return (completions or []) + [" I do not know."]
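

# Usage sketch (illustrative, not part of the module): a minimal example of how
# the IDK variant extends the candidate completions. It assumes HellaSwag-style
# items with an "endings" list, and that the task can be constructed without
# arguments; the item below is hypothetical, and real items come from the
# Hugging Face dataset.
if __name__ == "__main__":
    task = GOLDENSWAG_IDK()  # assumption: no-argument constructor
    item = {
        "ctx": "A man is sitting on a roof. He",
        "endings": [
            " starts pulling up roofing on a roof.",
            " is using wrap to wrap a pair of skis.",
            " is ripping level tiles off.",
            " is holding a rubik's cube.",
        ],
        "label": "0",
    }
    # Expect the dataset endings followed by the abstention option " I do not know."
    print(task._get_possible_completions(item))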