Source code for eval_framework.context.eval
import importlib.util
import inspect
import sys
from contextlib import AbstractContextManager
from os import PathLike
from pathlib import Path
from typing import Any

import eval_framework
from eval_framework.llm.base import BaseLLM
from eval_framework.tasks.eval_config import EvalConfig
from eval_framework.tasks.perturbation import PerturbationConfig


def import_models(models_file: PathLike | str) -> dict[str, type[BaseLLM]]:
    """Import ``models_file`` and return every ``BaseLLM`` subclass it defines, keyed by class name."""
    models_file = Path(models_file).resolve()
    library_path = Path(eval_framework.__path__[0]).resolve()
    # Files inside the eval_framework package need special care to avoid
    # import issues, so they are imported by their qualified module name.
    if models_file.is_relative_to(library_path):
        relative_path = models_file.relative_to(library_path.parent)
        module_name = ".".join(relative_path.with_suffix("").parts)
        module = importlib.import_module(module_name)
    else:
        # Any other file is loaded as a standalone module from its path.
        module_name = models_file.stem
        spec = importlib.util.spec_from_file_location(module_name, str(models_file))
        if spec is None:
            raise ImportError(f"Could not load module '{models_file}'.")
        module = importlib.util.module_from_spec(spec)
        sys.modules[module_name] = module
        if spec.loader is None:
            raise ImportError(f"Could not load module '{models_file}'.")
        spec.loader.exec_module(module)

    subclasses = {}
    for name, clazz in inspect.getmembers(module, inspect.isclass):
        if issubclass(clazz, BaseLLM) and clazz is not BaseLLM:
            subclasses[name] = clazz
    return subclasses
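
# A minimal usage sketch, not part of the module: it assumes a hypothetical
# file ``my_models.py`` that defines at least one ``BaseLLM`` subclass; the
# file name and class name below are illustrative only.
#
#     # my_models.py
#     from eval_framework.llm.base import BaseLLM
#
#     class MyLLM(BaseLLM):
#         ...
#
#     # elsewhere: discover the subclasses defined in that file
#     from eval_framework.context.eval import import_models
#
#     models = import_models("my_models.py")
#     llm_cls = models["MyLLM"]   # e.g. {"MyLLM": <class 'my_models.MyLLM'>}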


class EvalContext(AbstractContextManager):
    """Context manager holding the configuration of a single evaluation run."""

    def __init__(
        self,
        llm_name: str,
        models_path: Path,
        num_samples: int | None = None,
        max_tokens: int | None = None,
        num_fewshot: int | None = None,
        task_name: str | None = None,
        task_subjects: list[str] | None = None,
        hf_revision: str | None = None,
        output_dir: Path | None = None,
        wandb_project: str | None = None,
        wandb_entity: str | None = None,
        wandb_run_id: str | None = None,
        wandb_upload_results: bool | None = None,
        hf_upload_dir: str | None = None,
        hf_upload_repo: str | None = None,
        llm_args: dict[str, Any] | None = None,
        judge_models_path: Path | None = None,
        judge_model_name: str | None = None,
        judge_model_args: dict[str, Any] | None = None,
        batch_size: int | None = None,
        description: str | None = None,
        perturbation_type: str | None = None,
        perturbation_probability: float | None = None,
        perturbation_seed: int | None = None,
        randomize_judge_order: bool = False,
        delete_output_dir_after_upload: bool | None = None,
        repeats: int | None = None,
    ) -> None:
        self.llm_name = llm_name
        self.models_path = models_path
        self.num_samples = num_samples
        self.max_tokens = max_tokens
        self.num_fewshot = num_fewshot
        self.task_name = task_name
        self.task_subjects = task_subjects
        self.hf_revision = hf_revision
        self.output_dir = output_dir
        self.wandb_project = wandb_project
        self.wandb_entity = wandb_entity
        self.wandb_run_id = wandb_run_id
        self.wandb_upload_results = wandb_upload_results
        self.hf_upload_dir = hf_upload_dir
        self.hf_upload_repo = hf_upload_repo
        self.llm_args = llm_args if llm_args is not None else {}
        self.judge_models_path = judge_models_path
        self.judge_model_name = judge_model_name
        self.judge_model_args = judge_model_args if judge_model_args is not None else {}
        self.batch_size = batch_size
        self.description = description
        self.randomize_judge_order = randomize_judge_order
        self.delete_output_dir_after_upload = delete_output_dir_after_upload
        self.repeats = repeats
        # Build a PerturbationConfig only if a perturbation type or probability
        # was given; None-valued fields are dropped before construction.
        if perturbation_type or perturbation_probability is not None:
            perturbation = {
                "type": perturbation_type,
                "probability": perturbation_probability,
                "seed": perturbation_seed,
            }
            self.perturbation_config: PerturbationConfig | None = PerturbationConfig(
                **{k: v for k, v in perturbation.items() if v is not None}
            )
        else:
            self.perturbation_config = None

        self.config: EvalConfig | None = None

    def should_preempt(self) -> bool:
        """Return whether the run should be preempted; this context never requests preemption."""
        return False

    def get_trial_id(self) -> int | None:
        """Return the trial id for this run; this context has none."""
        return None
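
# A minimal construction sketch with hypothetical values: only ``llm_name`` and
# ``models_path`` are required, every other setting is optional, and the
# perturbation arguments are folded into ``perturbation_config`` when supplied.
#
#     from pathlib import Path
#     from eval_framework.context.eval import EvalContext
#
#     ctx = EvalContext(
#         llm_name="MyLLM",                  # hypothetical model class name
#         models_path=Path("my_models.py"),  # hypothetical models file
#         num_samples=10,
#         perturbation_type="typos",         # hypothetical perturbation type
#         perturbation_probability=0.1,
#     )
#     assert ctx.perturbation_config is not None
#     assert ctx.config is None              # starts unset (None)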