Source code for eval_framework.context.eval
import importlib.util
import inspect
import sys
from contextlib import AbstractContextManager
from os import PathLike
from pathlib import Path
from typing import Any

import eval_framework
from eval_framework.llm.base import BaseLLM
from eval_framework.tasks.eval_config import EvalConfig
from eval_framework.tasks.perturbation import PerturbationConfig


def import_models(models_file: PathLike | str) -> dict[str, type[BaseLLM]]:
    """Import ``models_file`` and return every ``BaseLLM`` subclass it defines, keyed by class name."""
    models_file = Path(models_file).resolve()
    library_path = Path(eval_framework.__path__[0]).resolve()
    # Files inside the eval_framework package need special care to avoid
    # import issues, so they are imported by their qualified module name.
    if models_file.is_relative_to(library_path):
        relative_path = models_file.relative_to(library_path.parent)
        module_name = ".".join(relative_path.with_suffix("").parts)
        module = importlib.import_module(module_name)
    else:
        # Any other file is loaded as a standalone module from its path.
        module_name = models_file.stem
        spec = importlib.util.spec_from_file_location(module_name, str(models_file))
        if spec is None:
            raise ImportError(f"Could not load module '{models_file}'.")
        module = importlib.util.module_from_spec(spec)
        sys.modules[module_name] = module
        if spec.loader is None:
            raise ImportError(f"Could not load module '{models_file}'.")
        spec.loader.exec_module(module)

    subclasses = {}
    for name, clazz in inspect.getmembers(module, inspect.isclass):
        if issubclass(clazz, BaseLLM) and clazz is not BaseLLM:
            subclasses[name] = clazz
    return subclasses
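
# A minimal usage sketch, not part of the module: it assumes a hypothetical
# file ``my_models.py`` that defines at least one ``BaseLLM`` subclass; the
# file name and class name below are illustrative only.
#
#     # my_models.py
#     from eval_framework.llm.base import BaseLLM
#
#     class MyLLM(BaseLLM):
#         ...
#
#     # elsewhere: discover the subclasses defined in that file
#     from eval_framework.context.eval import import_models
#
#     models = import_models("my_models.py")
#     llm_cls = models["MyLLM"]   # e.g. {"MyLLM": <class 'my_models.MyLLM'>}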


class EvalContext(AbstractContextManager):
    """Context manager holding the configuration of a single evaluation run."""

    def __init__(
        self,
        llm_name: str,
        models_path: Path,
        num_samples: int | None = None,
        max_tokens: int | None = None,
        num_fewshot: int | None = None,
        task_name: str | None = None,
        task_subjects: list[str] | None = None,
        hf_revision: str | None = None,
        output_dir: Path | None = None,
        wandb_project: str | None = None,
        wandb_entity: str | None = None,
        wandb_run_id: str | None = None,
        wandb_upload_results: bool | None = None,
        hf_upload_dir: str | None = None,
        hf_upload_repo: str | None = None,
        llm_args: dict[str, Any] | None = None,
        judge_models_path: Path | None = None,
        judge_model_name: str | None = None,
        judge_model_args: dict[str, Any] | None = None,
        batch_size: int | None = None,
        description: str | None = None,
        perturbation_type: str | None = None,
        perturbation_probability: float | None = None,
        perturbation_seed: int | None = None,
        randomize_judge_order: bool = False,
        delete_output_dir_after_upload: bool | None = None,
        repeats: int | None = None,
    ) -> None:
        self.llm_name = llm_name
        self.models_path = models_path
        self.num_samples = num_samples
        self.max_tokens = max_tokens
        self.num_fewshot = num_fewshot
        self.task_name = task_name
        self.task_subjects = task_subjects
        self.hf_revision = hf_revision
        self.output_dir = output_dir
        self.wandb_project = wandb_project
        self.wandb_entity = wandb_entity
        self.wandb_run_id = wandb_run_id
        self.wandb_upload_results = wandb_upload_results
        self.hf_upload_dir = hf_upload_dir
        self.hf_upload_repo = hf_upload_repo
        self.llm_args = llm_args if llm_args is not None else {}
        self.judge_models_path = judge_models_path
        self.judge_model_name = judge_model_name
        self.judge_model_args = judge_model_args if judge_model_args is not None else {}
        self.batch_size = batch_size
        self.description = description
        self.randomize_judge_order = randomize_judge_order
        self.delete_output_dir_after_upload = delete_output_dir_after_upload
        self.repeats = repeats
        # Build a PerturbationConfig only if a perturbation type or probability
        # was given; None-valued fields are dropped before construction.
        if perturbation_type or perturbation_probability is not None:
            perturbation = {
                "type": perturbation_type,
                "probability": perturbation_probability,
                "seed": perturbation_seed,
            }
            self.perturbation_config: PerturbationConfig | None = PerturbationConfig(
                **{k: v for k, v in perturbation.items() if v is not None}
            )
        else:
            self.perturbation_config = None

        self.config: EvalConfig | None = None

    def should_preempt(self) -> bool:
        """Return whether the run should be preempted; this context never requests preemption."""
        return False

    def get_trial_id(self) -> int | None:
        """Return the trial id for this run; this context has none."""
        return None
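
# A minimal construction sketch with hypothetical values: only ``llm_name`` and
# ``models_path`` are required, every other setting is optional, and the
# perturbation arguments are folded into ``perturbation_config`` when supplied.
#
#     from pathlib import Path
#     from eval_framework.context.eval import EvalContext
#
#     ctx = EvalContext(
#         llm_name="MyLLM",                  # hypothetical model class name
#         models_path=Path("my_models.py"),  # hypothetical models file
#         num_samples=10,
#         perturbation_type="typos",         # hypothetical perturbation type
#         perturbation_probability=0.1,
#     )
#     assert ctx.perturbation_config is not None
#     assert ctx.config is None              # starts unset (None)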