import argparse
import datetime
import logging
from pathlib import Path
from typing import Any
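# DeterminedContext requires an optional dependency; fall back to None if it is not installed.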
try:
    from eval_framework.context.determined import DeterminedContext
except ImportError:
    DeterminedContext = None  # type: ignore
from eval_framework.context.local import LocalContext
from eval_framework.main import main
from eval_framework.tasks.task_loader import load_extra_tasks
from eval_framework.utils.logging import setup_logging
logger = logging.getLogger(__name__)
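# Maps the --context CLI choice to the context class used to run the evaluation.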
CONTEXT = {
    "local": LocalContext,
    "determined": DeterminedContext,
}
def parse_args() -> argparse.Namespace:
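    """Parse command-line arguments for an evaluation run.

    Returns an `argparse.Namespace` in which `llm_args` and `judge_model_args` have been
    converted from key=value strings into dictionaries.
    """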
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--context",
        type=str,
        required=False,
        default="local",
        choices=["local", "determined"],
        help="The context in which the evaluation is run.",
    )
    parser.add_argument(
        "--models",
        type=Path,
        required=False,
        default=Path(__file__).parent / "llm" / "models.py",
        help="The path to the Python module file containing model classes.",
    )
    parser.add_argument(
        "--extra-task-modules",
        nargs="*",
        default=[],
        required=False,
        help="List of files and folders containing additional task definitions.",
    )
    parser.add_argument(
        "--llm-name",
        type=str,
        required=False,
        help=(
            "Either a full import path for a model (e.g., `eval_framework.llm.huggingface.HFLLM`) or the "
            "name of a class derived from `eval_framework.llm.base.BaseLLM` that can be found in the "
            "models file. The resulting model is instantiated with the arguments provided via `--llm-args`."
        ),
    )
    parser.add_argument(
        "--llm-args",
        type=str,
        nargs="+",
        default=(),
        required=False,
        help="The arguments to pass to the LLM as key=value pairs.",
    )
    parser.add_argument(
        "--num-samples", type=int, required=False, help="The number of samples per subject to evaluate."
    )
    parser.add_argument(
        "--max-tokens",
        type=int,
        required=False,
        help="The maximum number of tokens to generate for each sample. Overrides any task default value.",
    )
    parser.add_argument(
        "--num-fewshot", type=int, required=False, default=0, help="The number of few-shot examples to use."
    )
    parser.add_argument(
        "--repeats",
        type=int,
        required=False,
        default=1,
        help="The number of times to repeat each sample in the evaluation.",
    )
    parser.add_argument("--task-name", type=str, required=False, help="The name of the task to evaluate.")
    parser.add_argument(
        "--randomize-judge-order",
        action="store_true",
        help="Randomize the order of answers presented to the LLM judge to mitigate position bias.",
    )

    # Perturbation arguments
    parser.add_argument(
        "--perturbation-type",
        type=str,
        required=False,
        choices=[
            "editor",
            "permute",
            "replace",
            "delete",
            "uppercase",
        ],
        help=(
            "The type of perturbation to apply. Note that this may not make sense for some prompts, for example those "
            "containing math and code."
        ),
    )
    parser.add_argument(
        "--perturbation-probability",
        type=float,
        required=False,
        default=None,
        help="The probability of applying a perturbation to each word or character (between 0.0 and 1.0).",
    )
    parser.add_argument(
        "--perturbation-seed",
        type=int,
        required=False,
        default=42,
        help="Random seed controlling perturbations.",
    )
    parser.add_argument(
        "--task-subjects",
        type=str,
        nargs="+",
        required=False,
        help=(
            "The subjects of the task to evaluate. If empty, all subjects are evaluated. Tuple-valued subjects "
            "are given as comma-separated values, and the wildcard * can be used for any position of the tuple, "
            "e.g., 'DE_DE, *' or 'FR_FR, astronomy'."
        ),
    )
    parser.add_argument(
        "--hf-revision",
        type=str,
        required=False,
        default=None,
        help="A tag name, a branch name, or commit hash for the task HF dataset.",
    )
    parser.add_argument(
        "--judge-models",
        type=Path,
        required=False,
        help="The path to the Python module file containing LLM judge model classes.",
    )
    parser.add_argument(
        "--output-dir",
        type=Path,
        default="outputs",
        required=False,
        help="The path for the evaluation outputs.",
    )
    parser.add_argument(
        "--hf-upload-repo",
        type=str,
        default="Aleph-Alpha/evaluation-results",
        required=False,
        help="Customizable path for the HuggingFace git repository where runs will be stored.",
    )
    parser.add_argument(
        "--hf-upload-dir",
        type=str,
        default="",
        required=False,
        help="Folder within the HuggingFace git repository under which runs will be stored.",
    )
    parser.add_argument(
        "--wandb-project",
        type=str,
        default=None,
        required=False,
        help=(
            "The name of the Weights & Biases project to log runs to. "
            "The environment variable WANDB_API_KEY must be set."
        ),
    )
    parser.add_argument(
        "--wandb-entity",
        type=str,
        default=None,
        required=False,
        help="The name of the Weights & Biases entity to log runs to. Defaults to the user's default entity.",
    )
    parser.add_argument(
        "--wandb-run-id",
        type=str,
        default=None,
        required=False,
        help=(
            "The ID of an existing Weights & Biases run to resume. "
            "If not given, a new run is created. If the given run exists, it is resumed, "
            "but the Python command logged in W&B is overwritten."
        ),
    )
    parser.add_argument(
        "--wandb-upload-results",
        action=argparse.BooleanOptionalAction,
        required=False,
        default=True,
        help="Whether to upload results as an artifact to Weights & Biases (default: True). Needs `--wandb-project`.",
    )
    parser.add_argument(
        "--description",
        type=str,
        required=False,
        help="Description of the run. This will be added to the metadata of the run to help with bookkeeping.",
    )
    parser.add_argument(
        "--batch-size",
        type=int,
        default=1,
        required=False,
        help=(
            "Size of batch of samples to send to the LLM for evaluation in parallel. "
            "Use 1 for sequential running (default)."
        ),
    )
    parser.add_argument(
        "--save-logs",
        action=argparse.BooleanOptionalAction,
        default=True,
        required=False,
        help="Whether to save logs to a file in the output directory (default: True).",
    )
    parser.add_argument(
        "--judge-model-name",
        type=str,
        required=False,
        help=(
            "Either a full import path for a judge (e.g., `eval_framework.llm.huggingface.HFLLM`) or the "
            "name of a class derived from `eval_framework.llm.base.BaseLLM` that can be found in the "
            "models file. The resulting judge model is instantiated with the arguments provided via "
            "`--judge-model-args`."
        ),
    )
    parser.add_argument(
        "--judge-model-args",
        type=str,
        required=False,
        nargs="+",
        default=(),
        help="The arguments to pass to the judge model as key=value pairs.",
    )
    parser.add_argument(
        "--resource-cleanup",
        action="store_true",
        required=False,
        default=False,
        help="Add this flag to free up GPU resources between response generation and evaluation.",
    )
    parser.add_argument(
        "--delete-output-dir-after-upload",
        action="store_true",
        required=False,
        default=False,
        help="Add this flag to remove the output directory after a successful upload to HF or WandB.",
    )
    parser.add_argument(
        "-v",
        "--verbosity",
        type=int,
        nargs="?",
        default=1,
        choices=[0, 1, 2],
        help="Set the logging verbosity level: 0=critical, 1=info, 2=debug",
    )
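    # Post-process --llm-args from key=value strings into a dict; one dotted level of nesting is
    # supported, e.g. (illustrative values) "sampling_params.temperature=0.7" becomes
    # {"sampling_params": {"temperature": "0.7"}}. Values are kept as strings here.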
    llm_args: dict[str, Any] = {}
    args = parser.parse_args()
    for arg in args.llm_args:
        if "=" in arg:
            key, value = arg.split("=", 1)
            # Handle nested keys like "sampling_params.temperature=0.7"
            if "." in key:
                nested_key, sub_key = key.split(".", 1)
                if nested_key not in llm_args:
                    llm_args[nested_key] = {}
                llm_args[nested_key][sub_key] = value
            else:
                llm_args[key] = value
    args.llm_args = llm_args
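    # --judge-model-args uses the same key=value format, but dotted keys are not expanded.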
    judge_model_args = {}
    for arg in args.judge_model_args:
        if "=" in arg:
            key, value = arg.split("=", 1)
            judge_model_args[key] = value
    args.judge_model_args = judge_model_args
    # if args.extra_task_modules:
    #     # Convert the comma-separated string into a list
    #     args.extra_task_modules = [file_or_dir.strip() for file_or_dir in args.extra_task_modules.split(",")]
    # else:
    #     args.extra_task_modules = None
    return args
def run_with_kwargs(kwargs: dict) -> None:
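    """Run an evaluation from a dictionary of parsed CLI arguments.

    Sets up logging, loads any extra task modules, builds the requested execution context,
    and hands the instantiated LLM and config over to `eval_framework.main.main`.
    """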
    # Set up logging for the output directory
    output_dir = kwargs.get("output_dir", "outputs")
    log_level = kwargs.get("verbosity", 1)
    setup_logging(output_dir, log_level=log_level)
    logger.info(kwargs)
    now = datetime.datetime.now()
    logger.info(f"starting time: {now}")

    if kwargs.get("extra_task_modules"):
        load_extra_tasks(kwargs["extra_task_modules"])
    context_name = kwargs.pop("context")
    context_cls = CONTEXT[context_name]
    if context_cls is None:
        raise ValueError(
            f"Context '{context_name}' is unavailable because its optional dependency is not installed."
        )
    context = context_cls(
        llm_name=kwargs["llm_name"],
        models_path=kwargs["models"],
        num_samples=kwargs["num_samples"],
        max_tokens=kwargs["max_tokens"],
        num_fewshot=kwargs["num_fewshot"],
        repeats=kwargs["repeats"],
        task_name=kwargs["task_name"],
        task_subjects=kwargs["task_subjects"],
        hf_revision=kwargs["hf_revision"],
        output_dir=kwargs["output_dir"],
        wandb_project=kwargs["wandb_project"],
        wandb_entity=kwargs["wandb_entity"],
        wandb_run_id=kwargs["wandb_run_id"],
        wandb_upload_results=kwargs["wandb_upload_results"],
        hf_upload_dir=kwargs["hf_upload_dir"],
        hf_upload_repo=kwargs["hf_upload_repo"],
        llm_args=kwargs["llm_args"],
        judge_models_path=kwargs["judge_models"],
        judge_model_name=kwargs["judge_model_name"],
        judge_model_args=kwargs["judge_model_args"],
        batch_size=kwargs["batch_size"],
        description=kwargs["description"],
        perturbation_type=kwargs["perturbation_type"],
        perturbation_probability=kwargs["perturbation_probability"],
        perturbation_seed=kwargs["perturbation_seed"],
        randomize_judge_order=kwargs["randomize_judge_order"],
        delete_output_dir_after_upload=kwargs["delete_output_dir_after_upload"],
        # save_logs=kwargs["save_logs"],
    )
    with context as ctx:
        if ctx.config is None:
            raise ValueError(f"Context configuration is not set for '{type(ctx)}'.")
        main(
            llm=ctx.config.llm_class(**ctx.config.llm_args),
            config=ctx.config,
            should_preempt_callable=ctx.should_preempt,
            trial_id=ctx.get_trial_id(),
            resource_cleanup=kwargs.pop("resource_cleanup", False),
            verbosity=log_level,
        )
    logger.info(f"time since start: {datetime.datetime.now() - now}")
def run() -> None:
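    """Entry point: parse CLI arguments and run the evaluation."""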
    run_with_kwargs(vars(parse_args()))
# Enable execution via `python -m eval_framework.run`. Useful for
# debugging via `debugpy -m eval_framework.run`
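# Example invocation (model, task, and argument values below are placeholders):
#   python -m eval_framework.run --llm-name HFLLM --llm-args model_name=my-model \
#       --task-name my_task --num-samples 10 --output-dir outputs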
if __name__ == "__main__":
    run()