Source code for eval_framework.tasks.benchmarks.math_reasoning

import logging
import random
import re
from typing import Any

from eval_framework.metrics.completion.accuracy_completion import AccuracyCompletion
from eval_framework.metrics.completion.language_checker import LanguageRawConsistencyChecker
from eval_framework.metrics.completion.math_minerva_completion import (
    MathMinervaCompletion,
    MathMinervaCompletionRelaxed,
)
from eval_framework.metrics.completion.math_reasoning_completion import MathReasoningCompletion
from eval_framework.metrics.completion.minerva_math_utils import (
    extract_answers,
    normalized_gold_from_solution,
)
from eval_framework.metrics.loglikelihood.bits_per_byte import BitsPerByteLoglikelihood
from eval_framework.tasks.base import NO_SUBJECT, RANDOM_SEED, BaseTask, Language, ResponseType, Sample, SubjectType

# Hendrycks MATH subject splits (shared by MATH, MATHMinervaEvalHarness, MATHMinervaBPB).
# Those task classes assign this list to their SUBJECTS attribute.
MATH_SUBJECTS = [
    "algebra",
    "counting_and_probability",
    "geometry",
    "intermediate_algebra",
    "number_theory",
    "prealgebra",
    "precalculus",
]
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


class MATHReasoning(BaseTask[str]):
    r"""Shared base class for math benchmarks that score an answer extracted from a completion.

    This class provides:

    * ``_extract_answer``: pulls the answer out of the last ``\boxed{...}`` / ``\fbox{...}``
      expression, or out of a regex such as ``Answer: ...``.
    * ``_strip_string`` and its helpers: the Hendrycks-MATH style string normalisation taken from
      the EleutherAI lm-evaluation-harness.

    Subclasses are expected to implement ``_get_ground_truth`` (it raises ``NotImplementedError`` here).
    """

    RESPONSE_TYPE = ResponseType.COMPLETION
    METRICS = [MathReasoningCompletion]
    SUBJECTS = [NO_SUBJECT]
    ANSWER_PATTERN = r"(?i)Answer\s*:\s*(.*)"
    LANGUAGE = Language.ENG

    def __init__(self, num_fewshot: int = 0) -> None:
        super().__init__(num_fewshot)
        # Max tokens are going to be determined by the model.
        # however GPT paper and results used 1024 tokens, s1 used 2048

    def _extract_answer(
        self, string: str, extract_from_boxed: bool = True, extract_regex: str = ANSWER_PATTERN
    ) -> str | None:
        r"""Extract the answer string from a ``\boxed``/``\fbox`` expression or via a regex.

        :param string: text to search (model completion or reference solution).
        :param extract_from_boxed: if True, return the content of the last ``\boxed{...}`` or
            ``\fbox{...}`` expression; if False, return group 1 of ``extract_regex``.
        :param extract_regex: pattern with one capturing group, used when ``extract_from_boxed`` is False.
        :return: the extracted answer, or None when nothing could be extracted.
        """
        if not extract_from_boxed:
            match = re.search(extract_regex, string)
            return match.group(1) if match else None

        # Start from whichever of the two commands occurs last in the text.
        start = max(string.rfind("\\boxed"), string.rfind("\\fbox"))
        if start < 0:
            return None

        # Walk forward until the first brace group after the command is balanced again.
        depth = 0
        end = None
        for pos in range(start, len(string)):
            char = string[pos]
            if char == "{":
                depth += 1
            elif char == "}":
                depth -= 1
                if depth == 0:
                    end = pos
                    break
        if end is None:
            return None

        # Explicit prefix checks instead of ``assert`` so the result does not depend on ``python -O``;
        # ``\fbox{`` is accepted as well because the search above deliberately looks for it.
        boxed = string[start : end + 1]
        for prefix in ("\\boxed{", "\\fbox{"):
            if boxed.startswith(prefix):
                return boxed[len(prefix) : -1]
        return None

    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
        """Return the normalised boxed answer of ``completion_text`` or ``"[no_answer]"``.

        :param completion_text: raw model completion.
        :param sample: unused here.
        :return: ``_strip_string`` applied to the extracted boxed answer, or ``"[no_answer]"``.
        """
        assert isinstance(completion_text, str)
        extracted_answer = self._extract_answer(completion_text)
        if extracted_answer is None:
            return "[no_answer]"
        return self._strip_string(extracted_answer)

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None | list[str]:
        raise NotImplementedError("This method should be implemented in subclasses")

    # The following code is coming from the Eleuther AI lm-evaluation-harness repository
    # Subject to MIT License
    # This needs a major refactoring but is kept as is for consistency with the original code

    def _find_closing_bracket(self, string: str, start_index: int) -> int:
        """
        Finds the index of the closing '}' for a '{' at the given start index.

        :param string: The input string containing '{' and '}' brackets.
        :param start_index: The index where the opening '{' is located.
        :return: The index of the corresponding closing '}' or -1 if not found.
        :raises ValueError: if ``start_index`` is out of range or does not point to a '{'.
        """
        if start_index < 0 or start_index >= len(string) or string[start_index] != "{":
            raise ValueError("The start_index must point to a '{' character.")

        depth = 0  # Track the nesting level of brackets
        for i in range(start_index, len(string)):
            if string[i] == "{":
                depth += 1  # Increase depth for each opening bracket
            elif string[i] == "}":
                depth -= 1  # Decrease depth for each closing bracket
                if depth == 0:
                    return i  # Found the matching closing bracket
        return -1  # No matching '}' found

    def _split_text_command(self, string: str, search: str = r"\text{") -> tuple[str, str, str]:
        r"""
        Extracts the content inside a LaTeX \text{...} command and returns three parts:
        1. Everything before `\text{`
        2. The content inside `\text{...}`
        3. Everything after the closing `}`

        :param string: The input LaTeX string.
        :param search: The command to search for (default: `\text{`); its last character must be `{`.
        :return: Tuple (before_text, inside_text, after_text).
                 If no `\text{}` is found, returns (string, "", "").
                 If no closing bracket `}` is found, returns (before_text, remaining_string, "").
        """
        search_len = len(search)
        search_start = string.find(search)

        # If \text{ is not found, return the entire string in `before_text`
        if search_start == -1:
            return string, "", ""

        # Ensure `{` follows the search term
        content_start = search_start + search_len - 1
        if content_start >= len(string) or string[content_start] != "{":
            return string, "", ""

        # Find the corresponding closing bracket
        closing_index = self._find_closing_bracket(string, start_index=content_start)

        # If no closing bracket is found, return remaining string as "inside_text"
        if closing_index == -1:
            return string[:search_start], string[content_start + 1 :], ""

        before_text = string[:search_start]  # Everything before `\text{`
        inside_text = string[content_start + 1 : closing_index]  # Content inside `\text{...}`
        after_text = string[closing_index + 1 :]  # Everything after the closing `}`
        return before_text, inside_text, after_text

    # https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/hendrycks_math/utils.py#L144
    def _remove_right_units(self, string: str) -> str:
        r"""Drop ``\text{...}`` unit annotations from ``string``.

        * no ``\text{``: the string is returned unchanged;
        * more than one ``\text{``: everything before the first one is returned;
        * exactly one: the first non-blank of (text before, text after, text inside), stripped.
        """
        # "\text{ " only ever occurs (at least in the val set) when describing units
        count = string.count(r"\text{")
        if count == 0:
            return string
        if count > 1:
            return string.split(r"\text{", maxsplit=1)[0]
        before, inside, after = self._split_text_command(string)
        return before.strip() or after.strip() or inside.strip()

    # Based on https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/hendrycks_math/utils.py#L154
    def _fix_sqrt(self, string: str) -> str:
        r"""Wrap a bare single-character ``\sqrt`` argument in braces (``\sqrt3`` --> ``\sqrt{3}``)."""
        if "\\sqrt" not in string:
            return string
        head, *tail = string.split("\\sqrt")
        pieces = [head]
        for part in tail:
            if not part:
                # "\sqrt" directly followed by another "\sqrt" or by the end of the string has no
                # argument to wrap; keep it verbatim instead of indexing into an empty string.
                pieces.append("\\sqrt")
            elif part[0] == "{":
                pieces.append("\\sqrt" + part)
            else:
                pieces.append("\\sqrt{" + part[0] + "}" + part[1:])
        return "".join(pieces)

    # Based on https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/hendrycks_math/utils.py#L97
    def _fix_fracs(self, string: str) -> str:
        r"""Add the missing braces of ``\frac`` arguments (``\frac12`` --> ``\frac{1}{2}``).

        Returns ``string`` unchanged when a ``\frac`` is followed by fewer than two characters
        and does not start with ``{``.
        """
        parts = string.split("\\frac")
        if len(parts) <= 1:
            return string
        new_str = parts[0]
        for part in parts[1:]:
            new_str += "\\frac"
            if not part:
                continue
            if part[0] == "{":
                new_str += part
                continue
            if len(part) < 2:
                # cannot tell numerator from denominator; give up and keep the input
                return string
            a = part[0]
            b = part[1]
            new_str += "{" + a + "}{"
            if b != "{":
                new_str += b + "}"
            new_str += part[2:]
        return new_str

    # Based on https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/hendrycks_math/utils.py#L129
    def _fix_a_slash_b(self, string: str) -> str:
        r"""Rewrite a plain integer ratio ``a/b`` as ``\frac{a}{b}``; anything else is returned as is."""
        pieces = string.split("/")
        if len(pieces) != 2:
            return string
        try:
            a_int = int(pieces[0])
            b_int = int(pieces[1])
        except ValueError:
            return string
        # Only rewrite when the text is exactly the canonical form (no signs like "+1", no padding).
        if string != f"{a_int}/{b_int}":
            return string
        return "\\frac{" + str(a_int) + "}{" + str(b_int) + "}"

    # Based on https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/hendrycks_math/utils.py#L169
    def _strip_string(self, string: str) -> str:
        r"""Normalise a LaTeX answer string so that equivalent answers compare equal more often.

        Steps, in order: literal clean-ups (newlines, ``\!``, ``\\``, ``tfrac``/``dfrac``,
        ``\left``/``\right``, degree marks, ``\$``), unit removal, percent removal, ``.5`` --> ``0.5``,
        dropping a short ``x = `` prefix, ``\sqrt``/``\frac`` brace fixes, space removal,
        ``0.5`` --> ``\frac{1}{2}``, ``a/b`` --> ``\frac{a}{b}`` and leading-zero removal.

        :param string: extracted answer text.
        :return: the normalised answer (may be empty).
        """
        # These are plain ``str.replace`` patterns, so they must NOT be raw strings:
        # "\n" is a real newline and "\\left" is one backslash followed by "left".
        replacements = [
            ("\n", ""),  # linebreaks
            ("\\!", ""),  # remove inverse spaces
            ("\\\\", "\\"),  # replace \\ with \
            ("tfrac", "frac"),  # replace tfrac with frac
            ("dfrac", "frac"),  # replace dfrac with frac
            ("\\left", ""),  # remove \left
            ("\\right", ""),  # remove \right
            ("^{\\circ}", ""),  # remove circ
            ("^\\circ", ""),  # remove circ
            ("\\$", ""),  # remove $
        ]
        for pattern, replacement in replacements:
            string = string.replace(pattern, replacement)

        # remove units (on the right)
        string = self._remove_right_units(string)

        replacements = [
            ("\\%", ""),  # remove percentage
            (" .", " 0."),  # " 0." equivalent to " ."
            ("{.", "{0."),  # "{0." equivalent to "{."
        ]
        for pattern, replacement in replacements:
            string = string.replace(pattern, replacement)

        # if empty, return empty string
        if len(string) == 0:
            return string

        # Add "0" if "." is the start of the string
        if string[0] == ".":
            string = "0" + string

        # Get rid of e.g. "k = " or "x = y = " at beginning
        parts = [s.strip() for s in string.split("=")]
        if len(parts) == 2 and len(parts[0]) <= 2:
            string = parts[1]
        elif len(parts) > 2:
            if all(len(part) <= 2 and re.match(r"^[a-zA-Z]\w*$", part) for part in parts[:-1]):
                string = parts[-1]

        # fix sqrt3 --> sqrt{3}
        string = self._fix_sqrt(string)

        # remove spaces
        string = string.replace(" ", "")

        # \frac1b or \frac12 --> \frac{1}{b} and \frac{1}{2},
        # etc. Even works with \frac1{72} (but not \frac{72}1). Also does a/b --> \\frac{a}{b}
        string = self._fix_fracs(string)

        # manually change 0.5 --> \frac{1}{2}
        if string == "0.5":
            string = "\\frac{1}{2}"

        # NOTE: X/Y changed to \frac{X}{Y} in dataset, but in simple cases fix in case the model output is X/Y
        string = self._fix_a_slash_b(string)

        # Remove leading zeros of a number ("007" --> "7") but keep the zero in front of a decimal
        # point ("0.5"). Zeros that follow ".", "," or a word character are digits of the number
        # itself ("1.05", "10,080", "100") and must not be touched.
        string = re.sub(r"(?<![\w.,])0+(?=\d)", "", string)

        return string
class AIME2024(MATHReasoning):
    """AIME 2024 dataset: https://huggingface.co/datasets/HuggingFaceH4/aime_2024

    This dataset contains a single train split of 30 questions.
    Data contains
    ID | Problem | Solution | Answer

    pass@1 evaluation
    """

    NAME = "AIME2024"
    DATASET_PATH = "HuggingFaceH4/aime_2024"
    SAMPLE_SPLIT = "train"
    FEWSHOT_SPLIT = "train"
    RESPONSE_TYPE = ResponseType.COMPLETION
    METRICS = [MathReasoningCompletion, LanguageRawConsistencyChecker]
    SUBJECTS = [NO_SUBJECT]
    LANGUAGE = Language.ENG

    # https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/prompt/config/llama3-instruct/math.yaml
    # NOTE(review): the literal below is reproduced exactly as it appears in the reviewed source, where it
    # sits on a single line; confirm the intended line breaks against the upstream prompt.
    QUERY_TEMPLATE = """Solve the following math problem efficiently and clearly: - For simple problems (2 steps or fewer): Provide a concise solution with minimal explanation. - For complex problems (3 steps or more): Use this step-by-step format: ## Step 1: [Concise description] [Brief explanation and calculations] ## Step 2: [Concise description] [Brief explanation and calculations] ... Regardless of the approach, always conclude with: Therefore, the final answer is: $\\boxed{{answer}}$. I hope it is correct. Where [answer] is just the final number or expression that solves the problem. Problem: {Question}"""  # noqa: E501

    ANSWER_PATTERN = r"Therefore, the final answer is:(.*?). I hope it is correct."

    def __init__(self, num_fewshot: int = 0) -> None:
        assert num_fewshot == 0, "AIME evaluation does not include few shot"
        super().__init__(num_fewshot)

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Fill ``QUERY_TEMPLATE`` with the item's ``problem`` text."""
        return self.QUERY_TEMPLATE.format(Question=item["problem"])

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None | list[str]:
        """Return the reference answer without its zero padding.

        Valid answers in this dataset range from 0-999 and have leading zeros. An answer that
        consists only of zeros must stay "0" rather than collapse to an empty string.
        """
        return item["answer"].lstrip("0") or "0"
class AIME2025(AIME2024):
    """AIME 2025 dataset: https://huggingface.co/datasets/math-ai/aime25

    Single test split with 30 questions; columns are
    problem | answer | id

    Evaluated as pass@1. Prompt and post-processing are inherited from AIME2024.
    """

    NAME = "AIME2025"
    DATASET_PATH = "math-ai/aime25"
    SAMPLE_SPLIT = "test"
    FEWSHOT_SPLIT = "test"

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None | list[str]:
        """Return the item's ``answer`` field unchanged (no zero stripping, unlike AIME2024)."""
        reference = item["answer"]
        return reference
class AIME2026(AIME2024):
    """AIME 2026 dataset: https://huggingface.co/datasets/math-ai/aime26

    Single test split with 30 questions; columns are
    problem | answer | id

    Evaluated as pass@1. Prompt and post-processing are inherited from AIME2024.
    """

    NAME = "AIME2026"
    DATASET_PATH = "math-ai/aime26"
    SAMPLE_SPLIT = "test"
    FEWSHOT_SPLIT = "test"

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None | list[str]:
        """Return the item's ``answer`` field unchanged (no zero stripping, unlike AIME2024)."""
        reference = item["answer"]
        return reference
class MATH500(MATHReasoning):
    """MATH500 dataset: https://huggingface.co/datasets/HuggingFaceH4/MATH-500

    Single test split with 500 questions; columns are
    ID | Problem | Solution | Answer

    Evaluated as pass@1, zero-shot only.
    """

    NAME = "MATH500"
    DATASET_PATH = "HuggingFaceH4/MATH-500"
    SAMPLE_SPLIT = "test"
    FEWSHOT_SPLIT = "test"
    RESPONSE_TYPE = ResponseType.COMPLETION
    METRICS = [MathReasoningCompletion, LanguageRawConsistencyChecker]
    SUBJECTS = [NO_SUBJECT]
    LANGUAGE = Language.ENG

    # Adapted from OpenAI's math_eval.py (c) 2024 OpenAI – MIT License – https://github.com/openai/simple-evals/blob/main/math_eval.py
    # NOTE(review): the literal below is reproduced exactly as it appears in the reviewed source, where it
    # sits on a single line; confirm the intended line breaks (e.g. around {Question}).
    QUERY_TEMPLATE = """ Solve the following math problem step by step. The last line of your response should be of the form Answer: $ANSWER (without quotes) where $ANSWER is the answer to the problem. {Question} Remember to put your answer in $\\boxed{{answer}}$ where [answer] is just the final number or expression that solves the problem. """.strip()  # noqa: E501

    def __init__(self, num_fewshot: int = 0) -> None:
        assert num_fewshot == 0, "MATH-500 evaluation does not include few shot"
        super().__init__(num_fewshot)

    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
        """Normalise the boxed answer, falling back to the ``Answer: ...`` line.

        :param completion_text: raw model completion.
        :param sample: unused here.
        :return: normalised answer, or ``"[no_answer]"`` when neither form is present.
        """
        from_box = self._extract_answer(completion_text)
        from_answer_line = self._extract_answer(
            completion_text, extract_from_boxed=False, extract_regex=self.ANSWER_PATTERN
        )
        # the boxed answer wins; the "Answer: " line is only used when no box was found
        chosen = from_answer_line if from_box is None else from_box
        return "[no_answer]" if chosen is None else self._strip_string(chosen)

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Fill ``QUERY_TEMPLATE`` with the item's ``problem`` text."""
        return self.QUERY_TEMPLATE.format(Question=item["problem"])

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None | list[str]:
        """Return the item's ``answer`` field unchanged."""
        return item["answer"]
class MATH(MATHReasoning):
    """MATH dataset: https://huggingface.co/datasets/EleutherAI/hendrycks_math"""

    NAME = "Math"
    DATASET_PATH = "EleutherAI/hendrycks_math"
    SAMPLE_SPLIT = "test"
    FEWSHOT_SPLIT = "train"
    RESPONSE_TYPE = ResponseType.COMPLETION
    METRICS = [MathReasoningCompletion, LanguageRawConsistencyChecker]
    SUBJECTS = MATH_SUBJECTS
    LANGUAGE = Language.ENG

    # Adapted from OpenAI's math_eval.py (c) 2024 OpenAI – MIT License – https://github.com/openai/simple-evals/blob/main/math_eval.py
    # NOTE(review): the literal below is reproduced exactly as it appears in the reviewed source, where it
    # sits on a single line; confirm the intended line breaks (e.g. around {Question}).
    QUERY_TEMPLATE = """ Solve the following math problem step by step. The last line of your response should be of the form Answer: $ANSWER (without quotes) where $ANSWER is the answer to the problem. {Question} Remember to put your answer in $\\boxed{{answer}}$ where [answer] is just the final number or expression that solves the problem. """.strip()  # noqa: E501

    def __init__(self, num_fewshot: int = 0) -> None:
        super().__init__(num_fewshot)
        self.stop_sequences = ["\nProblem:", "\nProblem", "\n\nProblem:", "\n\nProblem"]

    def extract_last_two_dollar_text(self, s: str) -> str:
        """
        Return the text enclosed by the last ``$...$`` pair found in ``s``.

        :param s: the string to extract text from
        :returns: the enclosed text, or an empty string when there is no such pair
        """
        spans = re.findall(r"\$(.*?)\$", s)
        return spans[-1] if spans else ""

    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
        """
        Extract the answer with a three-step fallback and normalise it.

        1. the last boxed answer of the whole completion;
        2. otherwise a boxed answer inside the last ``$...$`` span;
        3. otherwise the text captured by ``ANSWER_PATTERN`` ("Answer: ...").

        An empty extraction counts as missing. Returns ``"[no_answer]"`` when all three fail.
        """
        # NOTE(review): step 2 still requires a boxed expression inside the dollar span; confirm
        # whether the bare span content was meant to be used instead.
        candidates = (
            self._extract_answer(completion_text),
            self._extract_answer(self.extract_last_two_dollar_text(completion_text)),
            self._extract_answer(completion_text, extract_from_boxed=False, extract_regex=self.ANSWER_PATTERN),
        )
        for candidate in candidates:
            if candidate:
                return self._strip_string(candidate)
        return "[no_answer]"

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Fill ``QUERY_TEMPLATE`` with the item's ``problem`` text and append a newline."""
        prompt = self.QUERY_TEMPLATE.format(Question=item["problem"])
        return prompt + "\n"

    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
        """Few-shot target: the item's full ``solution`` prefixed with ``Answer: ``."""
        return f"Answer: {item['solution']}"

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None | list[str]:
        """Reference answer: the boxed answer extracted from the item's ``solution``."""
        return self._extract_answer(item["solution"])
class MATHMinervaEvalHarness(MATHReasoning):
    r"""
    MATH with Minerva-style prompt and scoring (lm-evaluation-harness / oe_eval parity).

    Uses strict final-answer string matching:
    "Final Answer: The final answer is ... I hope it is correct."

    Prompt: "Problem:\n" + problem + "\n\n" + "Solution:"
    Gold: normalized_gold_from_solution(solution)
    Metrics: Exact Match, Exact Match (Flex) via MathMinervaCompletion.
    """

    NAME = "MATHMinervaEvalHarness"
    DATASET_PATH = "EleutherAI/hendrycks_math"
    SAMPLE_SPLIT = "test"
    FEWSHOT_SPLIT = "train"
    RESPONSE_TYPE = ResponseType.COMPLETION
    METRICS = [MathMinervaCompletion]
    SUBJECTS = MATH_SUBJECTS
    LANGUAGE = Language.ENG

    def __init__(self, num_fewshot: int = 0) -> None:
        super().__init__(num_fewshot)
        self.stop_sequences = ["Problem:", "\n\n"]
        self.max_tokens = 1024

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Build the Minerva prompt around the item's ``problem`` text."""
        return "".join(("Problem:\n", item["problem"], "\n\n", "Solution:"))

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None | list[str]:
        """Gold answer as produced by ``normalized_gold_from_solution`` for the item's ``solution``."""
        solution = item["solution"]
        return normalized_gold_from_solution(solution)

    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
        """Few-shot target: a single space followed by the item's ``solution``."""
        return "".join((" ", item["solution"]))

    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
        """Primary answer for storage; metric uses raw_completion for exact_match_flex (strict matching)."""
        found = extract_answers(completion_text, use_cot=True, cot_style="minerva", relaxed=False)
        if not found:
            return "[no_answer]"
        return found[0]
class MATHMinerva(MATHMinervaEvalHarness):
    """
    MATH with Minerva-style prompt and relaxed final-answer string matching.

    Same as MATHMinervaEvalHarness but allows flexible whitespace and case for variations of
    "(The )Final Answer: The (final )answer is ...( I hope it is correct.)", where parentheses
    are optional.
    """

    NAME = "MATHMinerva"
    METRICS = [MathMinervaCompletionRelaxed]

    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
        """Primary answer for storage; uses relaxed final-answer extraction."""
        found = extract_answers(completion_text, use_cot=True, cot_style="minerva", relaxed=True)
        if not found:
            return "[no_answer]"
        return found[0]
class MATH500Minerva(MATHMinerva):
    """
    MATH-500 with Minerva-style prompt and scoring (OLMES minerva_math_500 parity).

    Uses HuggingFaceH4/MATH-500 which has a single 'default' config (no subject splits).
    Zero-shot only: any other ``num_fewshot`` fails the assertion in ``__init__``.
    """

    NAME = "MATH500Minerva"
    DATASET_PATH = "HuggingFaceH4/MATH-500"
    SAMPLE_SPLIT = "test"
    FEWSHOT_SPLIT = "test"
    SUBJECTS = [NO_SUBJECT]

    def __init__(self, num_fewshot: int = 0) -> None:
        # few-shot prompting is not supported for this task
        assert num_fewshot == 0, "MATH500Minerva evaluation does not include few shot"
        super().__init__(num_fewshot)
class MATHMinervaBPB(MATHReasoning):
    """
    MATH (Hendrycks) with Minerva-style prompt, evaluated via loglikelihood of the gold
    answer string (bits-per-byte).

    Same prompt as MATHMinerva; scores P(normalized_gold_answer | prompt).
    """

    NAME = "MATHMinervaBPB"
    DATASET_PATH = "EleutherAI/hendrycks_math"
    SAMPLE_SPLIT = "test"
    FEWSHOT_SPLIT = "train"
    RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
    METRICS = [BitsPerByteLoglikelihood]
    SUBJECTS = MATH_SUBJECTS
    LANGUAGE = Language.ENG

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Build the Minerva prompt around the item's ``problem`` text."""
        return "".join(("Problem:\n", item["problem"], "\n\n", "Solution:"))

    def _get_cue_text(self, item: dict[str, Any]) -> str:
        """No cue text is used for this task."""
        return ""

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        """Gold answer with a leading space, or None when the solution yields no gold answer."""
        gold = self._normalized_gold_from_solution(item["solution"])
        return None if gold is None else " " + gold

    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
        """Single-element list holding the space-prefixed gold answer, or None when there is none."""
        gold = self._normalized_gold_from_solution(item["solution"])
        return None if gold is None else [" " + gold]

    def _normalized_gold_from_solution(self, solution: str) -> str | None:
        """Thin wrapper around the module-level ``normalized_gold_from_solution`` helper."""
        return normalized_gold_from_solution(solution)
class MATHLvl5(MATH):
    """MATH restricted to the items whose ``level`` field equals ``"Level 5"``."""

    NAME = "Math Lvl 5"

    def _load_dataset(self, subject: SubjectType) -> None:
        """Load the HF dataset for ``subject`` and keep only Level 5 rows of the used splits.

        The sample split is shuffled with a ``random.Random(RANDOM_SEED)`` generator before
        filtering; the few-shot split keeps its original order.
        """
        config_name = None if subject == NO_SUBJECT else subject
        hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH, name=config_name)
        self.dataset = {}
        self.rnd = random.Random(RANDOM_SEED)

        used_splits = (self.SAMPLE_SPLIT, self.FEWSHOT_SPLIT)
        for split, data in hf_dataset.items():
            rows = list(data)
            if split == self.SAMPLE_SPLIT:
                self.rnd.shuffle(rows)
            if split in used_splits:
                self.dataset[split] = [row for row in rows if row["level"] == "Level 5"]

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None | list[str]:
        """Reference answer: the boxed answer extracted from the item's ``solution``."""
        solution = item["solution"]
        return self._extract_answer(solution)
class GSM8KReasoning(MATHReasoning):
    """GSM8K dataset with reasoning prompt: https://huggingface.co/datasets/openai/gsm8k

    Zero-shot reasoning version that expects answers in boxed format.
    """

    NAME = "GSM8KReasoning"
    DATASET_PATH = "openai/gsm8k"
    SAMPLE_SPLIT = "test"
    FEWSHOT_SPLIT = "train"
    RESPONSE_TYPE = ResponseType.COMPLETION
    METRICS = [AccuracyCompletion, LanguageRawConsistencyChecker]
    SUBJECTS = ["main"]
    PERTURBATION_UNMODIFIABLE_WORDS = ["Question", "Answer"]
    LANGUAGE = Language.ENG

    # Reasoning prompt template that encourages step-by-step thinking with boxed answers
    # NOTE(review): apart from the leading line continuation, the literal below is reproduced exactly
    # as it appears in the reviewed source, where it sits on a single line; confirm the intended
    # line breaks (e.g. before "Question:" and "Answer:").
    QUERY_TEMPLATE = """\
Solve the following math problem step by step. Think through the problem carefully and show your reasoning. Please provide your answer in the format: $\\boxed{{answer}}$ where answer is the final numerical result. Question: {question} Answer:"""

    def __init__(self, num_fewshot: int = 0) -> None:
        assert num_fewshot == 0, "GSM8K Reasoning is designed for zero-shot evaluation only"
        super().__init__(num_fewshot)
        self.stop_sequences: list[str] = []

    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
        """Cut the completion at every configured stop sequence, then extract the answer."""
        truncated = completion_text
        for stop_sequence in self.stop_sequences:
            # partition() leaves the text untouched when the stop sequence does not occur
            truncated = truncated.partition(stop_sequence)[0]
        return self._extract_answer_with_fallback(truncated)

    def _extract_answer_fallback(self, completion: str) -> str:
        """Fallback answer extraction using #### pattern for compatibility.

        :return: the number following ``#### `` with commas removed, or ``"[invalid]"``.
        """
        found = re.search(r"#### (\-?[0-9\.\,]+)", completion)
        if found is None:
            return "[invalid]"
        return found.group(1).strip().replace(",", "")

    def _extract_answer_with_fallback(self, completion: str) -> str:
        """Extract answer from completion, trying boxed format first, then fallback"""
        boxed = self._extract_answer(completion)
        if boxed is None:
            # no boxed answer: fall back to the #### pattern
            return self._extract_answer_fallback(completion)
        # Clean the boxed answer by removing commas and surrounding whitespace
        return boxed.replace(",", "").strip()

    def _get_instruction_text(self, item: dict[str, Any]) -> str:
        """Fill ``QUERY_TEMPLATE`` with the item's ``question`` text."""
        return self.QUERY_TEMPLATE.format(question=item["question"])

    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
        """Reference answer: the ``#### <number>`` value parsed from the item's ``answer`` field."""
        return self._extract_answer_fallback(item["answer"])
# Fixed few-shot examples; MATHMinerva_OLMES._sample_fewshot_examples returns a prefix of this list.
# Every entry has the keys "problem", "solution" and "few_shot"; each solution ends with
# "Final Answer: The final answer is $...$. I hope it is correct.".
_OLMES_FEWSHOTS = [
    ## https://github.com/huggingface/lm-evaluation-harness/blob/add_leaderboard_tasks/lm_eval/tasks/leaderboard/math/utils.py
    {
        "problem": "Find the domain of the expression $\\frac{\\sqrt{x-2}}{\\sqrt{5-x}}$.}",
        "solution": "The expressions inside each square root must be non-negative. Therefore, $x-2 \\ge 0$, so "
        "$x\\ge2$, and $5 - x \\ge 0$, so $x \\le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$,"
        " which gives $x<5$. Therefore, the domain of the expression is $\\boxed{[2,5)}$.\nFinal Answer: The "
        "final answer is $[2,5)$. I hope it is correct.",
        "few_shot": "1",
    },
    {
        "problem": "If $\\det \\mathbf{A} = 2$ and $\\det \\mathbf{B} = 12,$ then find $\\det (\\mathbf{A} "
        "\\mathbf{B}).$",
        "solution": "We have that $\\det (\\mathbf{A} \\mathbf{B}) = (\\det \\mathbf{A})(\\det \\mathbf{B})"
        " = (2)(12) = \\boxed{24}.$\nFinal Answer: The final answer is $24$. I hope it is correct.",
        "few_shot": "1",
    },
    {
        "problem": "Terrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, "
        "how many times must Terrell lift them in order to lift the same total weight?",
        "solution": "If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\\cdot 12\\cdot20=480$ "
        "pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of "
        "$2\\cdot15\\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$:\n\\"
        "begin{align*}\n30n&=480\\\n\\Rightarrow\\qquad n&=480/30=\\boxed{16}\n\\end{align*}\nFinal Answer:"
        " The final answer is $16$. I hope it is correct.",
        "few_shot": "1",
    },
    {
        "problem": "If the system of equations\n\\begin{align*}\n6x-4y&=a,\\\n6y-9x &=b.\n\\end{align*}\nhas a "
        "solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{a}{b},$ assuming $b$ is nonzero.",
        "solution": "If we multiply the first equation by $-\\frac{3}{2}$, we obtain $$6y-9x=-\\frac{3}{2}a.$$"
        "Since we also know that $6y-9x=b$, we have $$-\\frac{3}{2}a=b\\Rightarrow\\frac{a}{b}=\\boxed{-\\frac"
        "{2}{3}}.$$\nFinal Answer: The final answer is $-\\frac{2}{3}$. I hope it is correct.",
        "few_shot": "1",
    },
]
class MATHMinerva_OLMES(MATHMinerva):
    """MATHMinerva variant that always prompts with the fixed ``_OLMES_FEWSHOTS`` examples.

    Reports both the strict and the relaxed Minerva completion metrics.
    """

    NAME = "MATHMinerva_OLMES"
    METRICS = [MathMinervaCompletion, MathMinervaCompletionRelaxed]

    def __init__(self, num_fewshot: int = 4) -> None:
        # The few-shot count is pinned to 4; any other request is only reported, then overridden.
        requested_other_count = num_fewshot != 4
        if requested_other_count:
            logger.warning("MATHMinerva_OLMES supports a fixed num_fewshot of 4.")
        super().__init__(num_fewshot=4)

    def _sample_fewshot_examples(self, item: dict[str, Any]) -> list[dict]:
        """Return the first ``self.num_fewshot`` fixed examples; ``item`` is not used."""
        shot_count = self.num_fewshot
        return _OLMES_FEWSHOTS[:shot_count]