Source code for eval_framework.run

import argparse
import datetime
import logging
from pathlib import Path
from typing import Any

try:
    from eval_framework.context.determined import DeterminedContext
except ImportError:
    DeterminedContext = None  # type: ignore


from eval_framework.context.local import LocalContext
from eval_framework.main import main
from eval_framework.tasks.task_loader import load_extra_tasks
from eval_framework.utils.logging import setup_logging

logger = logging.getLogger(__name__)

CONTEXT = {
    "local": LocalContext,
    "determined": DeterminedContext,
}
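
# Note (descriptive comment, not in the original source page): the "determined"
# entry above is None when the optional Determined dependency is not installed
# (see the guarded import at the top), so selecting `--context determined` then
# fails when the context is constructed.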


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--context",
        type=str,
        required=False,
        default="local",
        choices=["local", "determined"],
        help="The context in which the evaluation is run.",
    )
    parser.add_argument(
        "--models",
        type=Path,
        required=False,
        default=Path(__file__).parent / "llm" / "models.py",
        help="The path to the Python module file containing model classes.",
    )
    parser.add_argument(
        "--extra-task-modules",
        nargs="*",
        default=[],
        required=False,
        help="List of files and folders containing additional task definitions.",
    )
    parser.add_argument(
        "--llm-name",
        type=str,
        required=False,
        help=(
            "Either a full import path for a model (e.g., `eval_framework.llm.huggingface.HFLLM`) or the "
            "name of a class derived from `eval_framework.llm.base.BaseLLM` that can be found in the "
            "models file. The resulting model is instantiated with the arguments provided via `--llm-args`."
        ),
    )
    parser.add_argument(
        "--llm-args",
        type=str,
        nargs="+",
        default=(),
        required=False,
        help="The arguments to pass to the LLM as key=value pairs.",
    )
    parser.add_argument(
        "--num-samples", type=int, required=False, help="The number of samples per subject to evaluate."
    )
    parser.add_argument(
        "--max-tokens",
        type=int,
        required=False,
        help="The maximum number of tokens to generate for each sample. Overwrites any task default value.",
    )
    parser.add_argument(
        "--num-fewshot", type=int, required=False, default=0, help="The number of fewshot examples to use."
    )
    parser.add_argument(
        "--repeats",
        type=int,
        required=False,
        default=1,
        help="The number of times to repeat each sample in the evaluation.",
    )
    parser.add_argument("--task-name", type=str, required=False, help="The name of the task to evaluate.")
    parser.add_argument(
        "--randomize-judge-order",
        action="store_true",
        help="Randomize the order of answers presented to the LLM judge to mitigate position bias.",
    )

    # Perturbation arguments
    parser.add_argument(
        "--perturbation-type",
        type=str,
        required=False,
        choices=[
            "editor",
            "permute",
            "replace",
            "delete",
            "uppercase",
        ],
        help=(
            "The type of perturbation to apply. Note that this may not make sense for some prompts, "
            "for example those containing math and code."
        ),
    )
    parser.add_argument(
        "--perturbation-probability",
        type=float,
        required=False,
        default=None,
        help="The probability of applying a perturbation to each word or character (between 0.0 and 1.0).",
    )
    parser.add_argument(
        "--perturbation-seed",
        type=int,
        required=False,
        default=42,
        help="Random seed controlling perturbations.",
    )
    parser.add_argument(
        "--task-subjects",
        type=str,
        nargs="+",
        required=False,
        help=(
            "The subjects of the task to evaluate. If empty, all subjects are evaluated. Subjects in the form of "
            "tuples can be specified in a comma-delimited way, possibly using wildcard * in some dimensions of a "
            "tuple, e.g., 'DE_DE, *' or 'FR_FR, astronomy'."
        ),
    )
    parser.add_argument(
        "--hf-revision",
        type=str,
        required=False,
        default=None,
        help="A tag name, a branch name, or commit hash for the task HF dataset.",
    )
    parser.add_argument(
        "--judge-models",
        type=Path,
        required=False,
        help="The path to the Python module file containing LLM judge model classes.",
    )
    parser.add_argument(
        "--output-dir",
        type=Path,
        default="outputs",
        required=False,
        help="The path for the evaluation outputs.",
    )
    parser.add_argument(
        "--hf-upload-repo",
        type=str,
        default="Aleph-Alpha/evaluation-results",
        required=False,
        help="Customizable path for the HuggingFace git repository where runs will be stored.",
    )
    parser.add_argument(
        "--hf-upload-dir",
        type=str,
        default="",
        required=False,
        help="Folder name for the HuggingFace git repository where runs will be stored.",
    )
    parser.add_argument(
        "--wandb-project",
        type=str,
        default=None,
        required=False,
        help=(
            "The name of the Weights & Biases project to log runs to. "
            "The environment variable WANDB_API_KEY must be set."
        ),
    )
    parser.add_argument(
        "--wandb-entity",
        type=str,
        default=None,
        required=False,
        help="The name of the Weights & Biases entity to log runs to. Defaults to the user's default entity.",
    )
    parser.add_argument(
        "--wandb-run-id",
        type=str,
        default=None,
        required=False,
        help=(
            "The ID of an existing Weights & Biases run to resume. "
            "If not given, a new run is created. If the given run exists, it is continued, "
            "but the Python command logged in wandb is overwritten."
        ),
    )
    parser.add_argument(
        "--wandb-upload-results",
        action=argparse.BooleanOptionalAction,
        required=False,
        default=True,
        help="Whether to upload results as an artifact to Weights & Biases (default: True). Needs `--wandb-project`.",
    )
    parser.add_argument(
        "--description",
        type=str,
        required=False,
        help="Description of the run. This will be added to the metadata of the run to help with bookkeeping.",
    )
    parser.add_argument(
        "--batch-size",
        type=int,
        default=1,
        required=False,
        help=(
            "Size of the batch of samples to send to the LLM for evaluation in parallel. "
            "Use 1 for sequential running (default)."
        ),
    )
    parser.add_argument(
        "--save-logs",
        action="store_true",
        default=True,
        required=False,
        help="Whether to save logs to a file in the output directory (default: True).",
    )
    parser.add_argument(
        "--judge-model-name",
        type=str,
        required=False,
        help=(
            "Either a full import path for a judge (e.g., `eval_framework.llm.huggingface.HFLLM`) or the "
            "name of a class derived from `eval_framework.llm.base.BaseLLM` that can be found in the "
            "models file. The resulting judge model is instantiated with the arguments provided via "
            "`--judge-model-args`."
        ),
    )
    parser.add_argument(
        "--judge-model-args",
        type=str,
        required=False,
        nargs="+",
        default=(),
        help="The arguments to pass to the judge model as key=value pairs.",
    )
    parser.add_argument(
        "--resource-cleanup",
        action="store_true",
        required=False,
        default=False,
        help="Add this flag to free up GPU resources between response generation and evaluation.",
    )
    parser.add_argument(
        "--delete-output-dir-after-upload",
        action="store_true",
        required=False,
        default=False,
        help="Add this flag to remove the output directory after a successful upload to HF or WandB.",
    )
    parser.add_argument(
        "-v",
        "--verbosity",
        type=int,
        nargs="?",
        default=1,
        choices=[0, 1, 2],
        help="Set the logging verbosity level: 0=critical, 1=info, 2=debug.",
    )

    llm_args: dict[str, Any] = {}
    args = parser.parse_args()

    # Fold `--llm-args` key=value pairs into a (possibly nested) dict.
    for arg in args.llm_args:
        if "=" in arg:
            key, value = arg.split("=", 1)
            # Handle nested keys like "sampling_params.temperature=0.7"
            if "." in key:
                nested_key, sub_key = key.split(".", 1)
                if nested_key not in llm_args:
                    llm_args[nested_key] = {}
                llm_args[nested_key][sub_key] = value
            else:
                llm_args[key] = value
    args.llm_args = llm_args

    # Fold `--judge-model-args` key=value pairs into a flat dict.
    judge_model_args = {}
    for arg in args.judge_model_args:
        if "=" in arg:
            key, value = arg.split("=", 1)
            judge_model_args[key] = value
    args.judge_model_args = judge_model_args

    # if args.extra_task_modules:
    #     # Convert the comma-separated string into a list
    #     args.extra_task_modules = [file_or_dir.strip() for file_or_dir in args.extra_task_modules.split(",")]
    # else:
    #     args.extra_task_modules = None

    return args
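

# Illustrative sketch of the `--llm-args` parsing above (the key names here are
# hypothetical, not taken from the framework). Invoking
#
#   python -m eval_framework.run --llm-args temperature=0.7 sampling_params.top_p=0.9 ...
#
# yields args.llm_args == {"temperature": "0.7", "sampling_params": {"top_p": "0.9"}}.
# Values are kept as strings here; type coercion is presumably handled downstream by
# the instantiated LLM class.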
def run_with_kwargs(kwargs: dict) -> None:
    # Setup logging for the output directory
    output_dir = kwargs.get("output_dir", "results")
    log_level = kwargs.get("verbosity", 1)
    setup_logging(output_dir, log_level=log_level)

    logger.info(kwargs)
    now = datetime.datetime.now()
    logger.info(f"starting time: {now}")

    if kwargs.get("extra_task_modules"):
        load_extra_tasks(kwargs["extra_task_modules"])

    context_name = kwargs.pop("context")
    context = CONTEXT[context_name](
        llm_name=kwargs["llm_name"],
        models_path=kwargs["models"],
        num_samples=kwargs["num_samples"],
        max_tokens=kwargs["max_tokens"],
        num_fewshot=kwargs["num_fewshot"],
        repeats=kwargs["repeats"],
        task_name=kwargs["task_name"],
        task_subjects=kwargs["task_subjects"],
        hf_revision=kwargs["hf_revision"],
        output_dir=kwargs["output_dir"],
        wandb_project=kwargs["wandb_project"],
        wandb_entity=kwargs["wandb_entity"],
        wandb_run_id=kwargs["wandb_run_id"],
        wandb_upload_results=kwargs["wandb_upload_results"],
        hf_upload_dir=kwargs["hf_upload_dir"],
        hf_upload_repo=kwargs["hf_upload_repo"],
        llm_args=kwargs["llm_args"],
        judge_models_path=kwargs["judge_models"],
        judge_model_name=kwargs["judge_model_name"],
        judge_model_args=kwargs["judge_model_args"],
        batch_size=kwargs["batch_size"],
        description=kwargs["description"],
        perturbation_type=kwargs["perturbation_type"],
        perturbation_probability=kwargs["perturbation_probability"],
        perturbation_seed=kwargs["perturbation_seed"],
        randomize_judge_order=kwargs["randomize_judge_order"],
        delete_output_dir_after_upload=kwargs["delete_output_dir_after_upload"],
        # save_logs=kwargs["save_logs"],
    )

    with context as ctx:
        if ctx.config is None:
            raise ValueError(f"Context configuration is not set for '{type(ctx)}'.")
        main(
            llm=ctx.config.llm_class(**ctx.config.llm_args),
            config=ctx.config,
            should_preempt_callable=ctx.should_preempt,
            trial_id=ctx.get_trial_id(),
            resource_cleanup=kwargs.pop("resource_cleanup", False),
            verbosity=log_level,
        )

    logger.info(f"time since start: {datetime.datetime.now() - now}")
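

# Note (descriptive comment, not in the original source page): when calling
# run_with_kwargs() directly rather than via the CLI, the dict must contain
# essentially every key that parse_args() produces, since most of them are read
# by direct indexing above; only "verbosity", "extra_task_modules" and
# "resource_cleanup" have fallbacks, and "save_logs" is currently unused.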
def run() -> None:
    run_with_kwargs(vars(parse_args()))
# Enable execution via `python -m eval_framework.run`. Useful for
# debugging via `debugpy -m eval_framework.run`.
if __name__ == "__main__":
    run()
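

# Example invocation (illustrative; the model identifier, task name, and the
# `model_name` argument key are placeholders, not values shipped with the framework):
#
#   python -m eval_framework.run \
#       --llm-name eval_framework.llm.huggingface.HFLLM \
#       --llm-args model_name=<hf-model-id> \
#       --task-name <task-name> \
#       --num-samples 10 \
#       --output-dir outputs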