Creating Completion Tasks

This guide shows you how to create completion tasks: benchmarks in which the model generates free-form text to complete a prompt, such as math problems, code generation, or question answering.

Quick Start Template

from typing import Any
from eval_framework.tasks.base import BaseTask
from eval_framework.models.sample import ResponseType
from eval_framework.metrics.completion.accuracy_completion import AccuracyCompletion


class YourCompletionTask(BaseTask[str]):
    # Required attributes
    NAME = "YourTaskName"
    DATASET_PATH = "your-dataset/path"
    SAMPLE_SPLIT = "test"
    FEWSHOT_SPLIT = "train"
    RESPONSE_TYPE = ResponseType.COMPLETION
    METRICS = [AccuracyCompletion]
    SUBJECTS = ["default"]

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Generate the question/prompt for the model."""
        return f"Question: {item['question']}"

    def _get_ground_truth(self, item: dict[str, Any]) -> str:
        """Extract the correct answer from the dataset."""
        return item['answer']

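The template assumes each dataset item is a flat dictionary containing the fields the hooks read. The keys below are illustrative only; use whatever fields your dataset actually provides:

# Illustrative item shape for the template above (keys are examples, not framework requirements)
example_item = {
    "question": "What is 7 * 6?",
    "answer": "42",
}
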
Step-by-Step Implementation

1. Basic Setup

Start with the minimal structure:

from eval_framework.tasks.base import BaseTask
from eval_framework.models.sample import ResponseType
from eval_framework.metrics.completion.accuracy_completion import AccuracyCompletion

class MathQATask(BaseTask[str]):
    NAME = "MathQA"
    DATASET_PATH = "math_qa_dataset"
    SAMPLE_SPLIT = "test"
    FEWSHOT_SPLIT = "train"
    RESPONSE_TYPE = ResponseType.COMPLETION
    METRICS = [AccuracyCompletion]
    SUBJECTS = ["arithmetic"]

2. Implement Required Methods

_get_instruction_text()

This method formats the question from your dataset:

def _get_instruction_text(self, item: dict[str, Any]) -> str:
    """Convert dataset item to a question prompt."""
    # Example for math problems
    return f"Solve this math problem: {item['problem']}"

    # Example for code generation
    # return f"Complete this function:\n{item['function_signature']}"

    # Example for Q&A
    # return f"Q: {item['question']}\nA:"

_get_ground_truth()

This method extracts the correct answer:

def _get_ground_truth(self, item: dict[str, Any]) -> str:
    """Extract the correct answer from the dataset item."""
    # Simple case - direct answer
    return item['answer']

    # For numeric answers, you might want to normalize
    # return str(float(item['answer']))

    # For code, might return the complete function
    # return item['complete_code']

3. Common Completion Task Patterns

Pattern 1: Question Answering

class QATask(BaseTask[str]):
    NAME = "QA Task"
    DATASET_PATH = "qa_dataset"
    SAMPLE_SPLIT = "test"
    FEWSHOT_SPLIT = "train"
    RESPONSE_TYPE = ResponseType.COMPLETION
    METRICS = [AccuracyCompletion]
    SUBJECTS = ["general"]

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        return f"Question: {item['question']}\nAnswer:"

    def _get_ground_truth(self, item: dict[str, Any]) -> str:
        return item['answer']

    def _get_cue_text(self, item: dict[str, Any]) -> str:
        return "Answer:"  # Helps model start response correctly

Pattern 2: Math Problem Solving

class MathTask(BaseTask[str]):
    NAME = "Math Problems"
    DATASET_PATH = "math_dataset"
    SAMPLE_SPLIT = "test"
    FEWSHOT_SPLIT = "train"
    RESPONSE_TYPE = ResponseType.COMPLETION
    METRICS = [AccuracyCompletion]
    SUBJECTS = ["algebra", "geometry"]

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        return f"Problem: {item['problem']}\nSolution:"

    def _get_ground_truth(self, item: dict[str, Any]) -> str:
        return item['solution']

    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
        """Extract final numerical answer from solution."""
        import re
        # Look for "The answer is X" pattern
        match = re.search(r'The answer is (\d+(?:\.\d+)?)', completion_text)
        if match:
            return match.group(1)
        return completion_text.strip()

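Because the extraction step is plain string processing, you can sanity-check it in isolation before wiring it into a task; the completion string below is made up for illustration:

import re

completion = "First compute 12 * 4 = 48. The answer is 48"
match = re.search(r'The answer is (\d+(?:\.\d+)?)', completion)
print(match.group(1) if match else completion.strip())  # prints "48"
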
Pattern 3: Code Generation

from eval_framework.metrics.completion.code_execution_pass_at_one import CodeExecutionPassAtOne
from eval_framework.shared.types import BaseMetricContext

class CodeTaskMetricContext(BaseMetricContext):
    """Will be passed to the metric for this task."""
    test_cases: list
    entry_point: str

class CodeTask(BaseTask[str]):
    NAME = "Code Generation"
    DATASET_PATH = "code_dataset"
    SAMPLE_SPLIT = "test"
    FEWSHOT_SPLIT = "train"
    RESPONSE_TYPE = ResponseType.COMPLETION
    METRICS = [CodeExecutionPassAtOne]
    SUBJECTS = ["python"]

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        return f"Complete this function:\n{item['prompt']}"

    def _get_ground_truth(self, item: dict[str, Any]) -> str:
        return item['canonical_solution']

    def _get_context(self, item: dict[str, Any]) -> CodeTaskMetricContext:
        """Provide test cases for code execution."""
        return CodeTaskMetricContext(
            test_cases=item['test_cases'],
            entry_point=item['entry_point'],
        )

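The hooks in Pattern 3 assume an item layout loosely modeled on HumanEval-style code benchmarks; the keys below are assumptions and should be adapted to your dataset:

# Illustrative dataset item for the code generation pattern (field names are assumptions)
example_item = {
    "prompt": 'def add(a: int, b: int) -> int:\n    """Return the sum of a and b."""\n',
    "canonical_solution": "    return a + b\n",
    "test_cases": ["assert add(1, 2) == 3"],
    "entry_point": "add",
}
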
4. Advanced Customization

System Prompts

Add context or instructions:

def _get_system_prompt_text(self, item: dict[str, Any]) -> str:
    return "You are a helpful assistant. Answer questions accurately and concisely."

Few-shot Examples

Customize how examples are formatted:

def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
    """Format the answer for few-shot examples."""
    answer = self._get_ground_truth(item)
    return f"The answer is: {answer}"

Custom Sampling

Control how few-shot examples are selected:

def _sample_fewshot_examples(self, item: dict[str, Any]) -> list[dict]:
    """Sample examples similar to the current item."""
    # Default: random sampling
    examples = self.rnd.sample(self.dataset[self.FEWSHOT_SPLIT], self.num_fewshot)

    # Custom: sample by difficulty or topic
    # same_topic = [ex for ex in self.dataset[self.FEWSHOT_SPLIT]
    #               if ex['topic'] == item['topic']]
    # examples = self.rnd.sample(same_topic, min(self.num_fewshot, len(same_topic)))

    return examples

5. Metrics for Completion Tasks

Choose appropriate metrics based on your task type:

# Exact match accuracy
from eval_framework.metrics.completion.accuracy_completion import AccuracyCompletion

# Text similarity metrics
from eval_framework.metrics.completion.rouge_1 import Rouge1
from eval_framework.metrics.completion.rouge_2 import Rouge2
from eval_framework.metrics.completion.rouge_l import RougeL
from eval_framework.metrics.completion.bleu import Bleu

# Math-specific metrics
from eval_framework.metrics.completion.math_reasoning_completion import MathReasoningCompletion

# Code execution metrics
from eval_framework.metrics.completion.code_execution_pass_at_one import CodeExecutionPassAtOne

# Format validation
from eval_framework.metrics.completion.json_format import JSONFormat
from eval_framework.metrics.completion.csv_format import CSVFormat

# Custom metrics using LLM judges
from eval_framework.metrics.llm.llm_judge_score import LLMJudgeScore

class YourTask(BaseTask[str]):
    # Choose metrics appropriate for your task
    METRICS = [AccuracyCompletion, Rouge1, MathReasoningCompletion]

Complete Example: Geography Quiz

from typing import Any
from eval_framework.tasks.base import BaseTask
from eval_framework.models.sample import ResponseType, Sample  # Sample is assumed to live alongside ResponseType
from eval_framework.metrics.completion.accuracy_completion import AccuracyCompletion

class GeographyQuizTask(BaseTask[str]):
    NAME = "Geography Quiz"
    DATASET_PATH = "geography_quiz"
    SAMPLE_SPLIT = "test"
    FEWSHOT_SPLIT = "train"
    RESPONSE_TYPE = ResponseType.COMPLETION
    METRICS = [AccuracyCompletion]
    SUBJECTS = ["world_capitals", "countries"]

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Format geography question."""
        return f"Question: What is the capital of {item['country']}?"

    def _get_ground_truth(self, item: dict[str, Any]) -> str:
        """Extract the correct capital city."""
        return item['capital']

    def _get_system_prompt_text(self, item: dict[str, Any]) -> str:
        """Provide context about the task."""
        return "Answer geography questions about world capitals."

    def _get_cue_text(self, item: dict[str, Any]) -> str:
        """Start model response with 'Answer:'"""
        return "Answer:"

    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
        """Clean up the generated answer."""
        # Remove common prefixes and clean whitespace
        cleaned = completion_text.strip()
        if cleaned.startswith("Answer:"):
            cleaned = cleaned[7:].strip()
        return cleaned

Testing Your Completion Task

Every task is automatically covered by prompt-formatting tests. Benchmark-specific logic, such as custom post-processing or few-shot sampling, needs a dedicated test file.

Automatic Formatting Tests

All benchmarks are automatically tested for proper prompt formatting across different chat templates. No additional setup is required.

Custom Task Tests (Optional)

If your benchmark has task-specific logic that needs testing (for example, custom post-processing or few-shot sampling), add a test file under tests/tasks/.
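
A minimal sketch of such a test, reusing the GeographyQuizTask from this guide (the import path is hypothetical, and the test bypasses __init__ on the assumption that the constructor loads the dataset; adapt both to your repository layout):

# tests/tasks/test_geography_quiz.py
from typing import Any

from eval_framework.tasks.geography_quiz import GeographyQuizTask  # hypothetical module path


def _bare_task() -> GeographyQuizTask:
    # Bypass __init__ so no dataset is loaded; the hooks under test only use their arguments.
    return GeographyQuizTask.__new__(GeographyQuizTask)


def test_instruction_and_ground_truth() -> None:
    item: dict[str, Any] = {"country": "France", "capital": "Paris"}
    task = _bare_task()
    assert task._get_instruction_text(item) == "Question: What is the capital of France?"
    assert task._get_ground_truth(item) == "Paris"


def test_post_processing_strips_answer_prefix() -> None:
    assert _bare_task().post_process_generated_completion("Answer: Paris") == "Paris"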