import os
import random
import re
from typing import Any
from datasets import DatasetDict
from eval_framework.metrics.completion.struct_eval_metrics import (
    RenderableStructMetric,
    RenderableStructMetricContext,
    StructMetric,
    StructMetricContext,
)
from eval_framework.tasks.base import RANDOM_SEED, BaseTask, Language, ResponseType, Sample

StructEvalSubjects = [
    "CSV to YAML",
    "JSON to XML",
    "JSON to CSV",
    "XML to JSON",
    "XML to YAML",
    "Text to XML",
    "Text to YAML",
    "Text to TOML",
    "YAML to JSON",
    "TOML to JSON",
    "Text to CSV",
    "YAML to XML",
    "JSON to YAML",
    "TOML to YAML",
    "YAML to CSV",
    "CSV to JSON",
    "CSV to XML",
    "Text to JSON",
    "XML to CSV",
]

class StructEval(BaseTask[str]):
    """StructEval task: https://tiger-ai-lab.github.io/StructEval/"""

    NAME = "StructEval"
    DATASET_PATH = "TIGER-Lab/StructEval"
    SAMPLE_SPLIT = "train"
    FEWSHOT_SPLIT = "train"  # The dataset only provides a train split.
    RESPONSE_TYPE = ResponseType.COMPLETION
    METRICS = [StructMetric]  # Metric for structured-output subjects.
    SUBJECTS = StructEvalSubjects
    LANGUAGE = Language.ENG
    HF_REVISION = "b551217560cf225245b0607a21c505e24a58e396"

    def __init__(self, num_fewshot: int = 0) -> None:
        if num_fewshot > 0:
            raise ValueError("StructEval only supports zero-shot evaluation.")
        super().__init__(num_fewshot)

    def _load_dataset(self, subject: str) -> None:
        hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH)
        assert isinstance(hf_dataset, DatasetDict), "Expected a Hugging Face DatasetDict object."
        # Keep only the rows belonging to the requested subject (e.g. "CSV to YAML").
        hf_dataset = hf_dataset.filter(lambda item: item["task_name"] == subject, num_proc=os.cpu_count())
        self.dataset = {}
        self.rnd = random.Random(RANDOM_SEED)
        for split, data in hf_dataset.items():
            if split not in [self.SAMPLE_SPLIT, self.FEWSHOT_SPLIT]:
                continue
            data_list = list(data)
            # Deterministically shuffle the evaluation samples (seeded with RANDOM_SEED).
            if split == self.SAMPLE_SPLIT:
                self.rnd.shuffle(data_list)
            self.dataset[split] = data_list

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        return (
            f"{item['query']}\n\nIMPORTANT: Only output the required output format. "
            "You must start the format/code with <|BEGIN_CODE|> and end the format/code with <|END_CODE|>. "
            "No other text output (explanation, comments, etc.) is allowed. Do not use markdown code fences.\n"
        )
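
    # For illustration only, an assembled instruction looks roughly like this (the query line is
    # invented here; real queries come from the dataset's "query" field):
    #
    #   Convert the following CSV document to YAML: name,age\nAda,36
    #
    #   IMPORTANT: Only output the required output format. You must start the format/code with
    #   <|BEGIN_CODE|> and end the format/code with <|END_CODE|>. ...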

    def _get_context(self, item: dict[str, Any]) -> StructMetricContext | RenderableStructMetricContext:
        return StructMetricContext(
            output_type=item["output_type"],
            paths=item["raw_output_metric"],
        )

    def _get_cue_text(self, item: dict[str, Any]) -> str:
        return "<|BEGIN_CODE|>"

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None | list[str]:
        return None

    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
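        # Extract the payload between <|BEGIN_CODE|> and <|END_CODE|>; as a fallback, also accept a
        # markdown code fence. Illustrative inputs and results:
        #   "<|BEGIN_CODE|>\nname: foo\n<|END_CODE|>"  ->  "name: foo"
        #   "```yaml\nname: foo\n```"                  ->  "name: foo"
        # If no delimiters are found, the whole completion is returned stripped.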
        m = re.search(r"(?:<\|BEGIN_CODE\|>|```[\w+-]*)(.*?)(?:<\|END_CODE\|>|```*)", completion_text, re.DOTALL)
        return m.group(1).strip() if m else completion_text.strip()

# There are more subjects in the StructEval dataset, but currently only the HTML output metric is implemented.
RENDERABLE_STRUCTEVAL_SUBJECTS = [
    "Convert Markdown to HTML",
    "Convert React to HTML",
    "Convert Vue to HTML",
    "Text to HTML",
]

class RenderableStructEval(StructEval):
    """Renderable StructEval task for subjects whose outputs can be rendered visually."""

    NAME = "RenderableStructEval"
    SUBJECTS = RENDERABLE_STRUCTEVAL_SUBJECTS
    METRICS = [RenderableStructMetric]  # Metric for renderable (HTML) subjects.

    def _get_context(self, item: dict[str, Any]) -> RenderableStructMetricContext:
        return RenderableStructMetricContext(
            output_type=item["output_type"],
            keywords=item["raw_output_metric"],
        )
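
# Minimal usage sketch (illustrative; only the constructors defined above are assumed):
#   task = StructEval()                     # zero-shot only; num_fewshot > 0 raises ValueError
#   renderable_task = RenderableStructEval()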