# Source code for eval_framework.tasks.benchmarks.tablebench

import csv
import json
import random
import re
import tempfile
from itertools import product
from typing import Any

from eval_framework.exceptions import LogicError
from eval_framework.metrics.completion.rouge_l import ROUGE_L
from eval_framework.tasks.base import RANDOM_SEED, BaseTask, Language, ResponseType, Sample
from eval_framework.tasks.utils import run_python_code
from template_formatting.formatter import Role

# Question types ("qtype") of TableBench that are evaluated.
# The "Visualization" question type is intentionally left out: it is complex to re-implement,
# of small relevance and of small size (5.6% of the dataset), see
# https://github.com/TableBench/TableBench/blob/main/eval/batch_parse_response_script.py#L56
TABLE_BENCH_SUBJECTS: list[str] = [
    "NumericalReasoning",
    "DataAnalysis",
    "FactChecking",
]

# Prompting strategies ("instruction_type") of TableBench that are evaluated.
# "DP" (Direct Prompting) is not listed because it has been deleted from the dataset:
# https://huggingface.co/datasets/Multilingual-Multimodal-NLP/TableBench-Instructions/commit/534a6d859494c370f2aa6ee0e6076103d9707560 # noqa: E501
TABLE_BENCH_INSTRUCTION_TYPES: list[str] = [
    "PoT",  # Program-of-thought
    "SCoT",  # Symbolic chain-of-thought
    "TCoT",  # Textual chain-of-thought
]



# [docs] (Sphinx viewcode link marker; not part of the module)
class TableBench(BaseTask[tuple[str, str]]):
    """TableBench dataset: https://huggingface.co/datasets/Multilingual-Multimodal-NLP/TableBench

    Every subject is an ``(instruction_type, qtype)`` pair taken from the product of
    ``TABLE_BENCH_INSTRUCTION_TYPES`` and ``TABLE_BENCH_SUBJECTS``. Completions are scored
    with ROUGE-L against the ``answer`` field of the dataset item.
    """

    NAME = "TableBench"
    DATASET_PATH = "Multilingual-Multimodal-NLP/TableBench"
    HF_REVISION = "81b551c744b7f49cfa0ad69cb7a1465d865c206e"  # latest version of the dataset is corrupted
    SAMPLE_SPLIT = "test"
    FEWSHOT_SPLIT = "test"  # (there is no dedicated split, few-shot is not expected for this dataset)
    RESPONSE_TYPE = ResponseType.COMPLETION
    METRICS = [ROUGE_L]
    SUBJECTS = list(product(TABLE_BENCH_INSTRUCTION_TYPES, TABLE_BENCH_SUBJECTS))
    LANGUAGE = Language.ENG

    def __init__(self, num_fewshot: int = 0) -> None:
        """Create the task; only ``num_fewshot == 0`` is accepted."""
        assert num_fewshot == 0, "Fewshot is not supported for TableBench"
        super().__init__(num_fewshot)

    def _load_dataset(self, subject: tuple[str, str]) -> None:
        """Load the rows belonging to ``subject`` into ``self.dataset``.

        Args:
            subject: ``(instruction_type, qtype)`` pair; only rows whose ``instruction_type``
                and ``qtype`` fields both match are kept.

        The rows of the sample split are shuffled with a ``random.Random`` seeded with
        ``RANDOM_SEED``. Only the sample and few-shot splits are stored.
        """
        instruction_type, qtype = subject
        hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH, name=None)
        self.dataset = {}

        self.rnd = random.Random(RANDOM_SEED)

        def belongs_to_subject(row: dict[str, Any]) -> bool:
            return row["qtype"] == qtype and row["instruction_type"] == instruction_type

        wanted_splits = (self.SAMPLE_SPLIT, self.FEWSHOT_SPLIT)
        for split, data in hf_dataset.items():
            # Skip splits that would be discarded anyway before paying for the filter pass.
            if split not in wanted_splits:
                continue

            rows = list(data.filter(belongs_to_subject))
            if split == self.SAMPLE_SPLIT:
                self.rnd.shuffle(rows)
            self.dataset[split] = rows

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Return the ``instruction`` field of the dataset item."""
        return item["instruction"]

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        """Return the ``answer`` field of the dataset item."""
        return item["answer"]

    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
        """Reduce a raw completion to the text following ``Final Answer: ``.

        For "PoT" subjects the completion is expected to contain a fenced python code block.
        The last such block is executed via ``run_python_code`` with the table from the prompt
        made available as ``table.csv``; the output of that run then replaces the completion
        before the final answer is extracted.

        Args:
            completion_text: Raw text generated by the model.
            sample: Sample the completion belongs to; must not be ``None``.

        Returns:
            The stripped text after the first ``Final Answer: `` occurrence, or ``""`` when no
            code block, no table or no final answer is found, or when the output of the
            executed code contains ``"Error"``.

        Raises:
            LogicError: If the table embedded in the prompt cannot be parsed as JSON.
        """
        assert sample is not None
        if "PoT" in sample.subject:
            # Extract the (last) generated code snippet or fail otherwise
            try:
                snippets = re.findall(r"```python\n(.*?)```", completion_text, flags=re.S)
            except TypeError:
                # completion_text is not a string; there is nothing to execute
                return ""
            if not snippets:
                return ""
            code = snippets[-1]

            # Extract the table given in the prompt and prepare it as a file
            instruction = [m.content for m in sample.messages if m.role == Role.USER][-1]
            tables = re.findall(r"\[TABLE\] (.*?) Let's get start!", instruction, flags=re.S)
            if not tables:
                return ""

            # re.findall always returns a list of strings here; the first match is the table
            try:
                table_dict = json.loads(tables[0].strip())
            except json.JSONDecodeError as err:
                raise LogicError(f"TableBench: {instruction} does not seem to contain one table.") from err

            with tempfile.TemporaryDirectory() as tmpdirname:
                filename = f"{tmpdirname}/table.csv"
                # newline="" is required by the csv module to avoid doubled line endings;
                # an explicit encoding keeps the file independent of the host locale.
                with open(filename, "w", newline="", encoding="utf-8") as f:
                    writer = csv.writer(f)
                    writer.writerow(table_dict["columns"])
                    writer.writerows(table_dict["data"])

                # Run the code in a Docker image, providing the table from the prompt
                completion_text = run_python_code(
                    code, image="amancevice/pandas:slim", input_files=[(filename, "/var/lib/pandas/table.csv")]
                )

                if "Error" in completion_text:
                    return ""

        # Extract the answer, be it directly from the model or be it the result of the generated code
        try:
            match = re.search(r"Final Answer: (.+)", completion_text)
        except TypeError:
            return ""
        return match.group(1).strip() if match else ""