API Reference
Modules
- eval_framework package
- Subpackages
- Submodules
- eval_framework.base_config module
- eval_framework.evaluation_generator module
- eval_framework.exceptions module
- eval_framework.logger module
- eval_framework.main module
- eval_framework.response_generator module
- eval_framework.run module
- eval_framework.run_direct module
- Module contents
- eval_framework.context package
- eval_framework.llm package
- eval_framework.metrics package
- eval_framework.metrics.completion package
- Submodules
- eval_framework.metrics.completion.accuracy_completion module
- eval_framework.metrics.completion.aidanbench module
- eval_framework.metrics.completion.bleu module
- eval_framework.metrics.completion.chrf module
- eval_framework.metrics.completion.code_assertion module
- eval_framework.metrics.completion.code_execution_pass_at_one module
- eval_framework.metrics.completion.comet module
- eval_framework.metrics.completion.concordance_index module
- eval_framework.metrics.completion.csv_format module
- eval_framework.metrics.completion.cwe_accuracy module
- eval_framework.metrics.completion.exponential_similarity module
- eval_framework.metrics.completion.f1 module
- eval_framework.metrics.completion.format_checker module
- eval_framework.metrics.completion.grid_difference module
- eval_framework.metrics.completion.ifeval module
- eval_framework.metrics.completion.json_format module
- eval_framework.metrics.completion.language_checker module
- eval_framework.metrics.completion.length_control module
- eval_framework.metrics.completion.math_reasoning_completion module
- eval_framework.metrics.completion.niah_accuracy module
- eval_framework.metrics.completion.placeholder_checker module
- eval_framework.metrics.completion.repetition module
- eval_framework.metrics.completion.rouge_1 module
- eval_framework.metrics.completion.rouge_2 module
- eval_framework.metrics.completion.rouge_geometric_mean module
- eval_framework.metrics.completion.rouge_l module
- eval_framework.metrics.completion.struct_eval_metrics module
- eval_framework.metrics.completion.ter module
- eval_framework.metrics.completion.text_counter module
- Module contents
- eval_framework.metrics.efficiency package
- eval_framework.metrics.llm package
- Submodules
- eval_framework.metrics.llm.base module
- eval_framework.metrics.llm.llm_judge_chatbot_style module
- eval_framework.metrics.llm.llm_judge_coherence module
- eval_framework.metrics.llm.llm_judge_completion_accuracy module
- eval_framework.metrics.llm.llm_judge_conciseness module
- eval_framework.metrics.llm.llm_judge_contains_names module
- eval_framework.metrics.llm.llm_judge_format_correctness module
- eval_framework.metrics.llm.llm_judge_instruction module
- eval_framework.metrics.llm.llm_judge_mtbench_pair module
- eval_framework.metrics.llm.llm_judge_mtbench_single module
- eval_framework.metrics.llm.llm_judge_refusal module
- eval_framework.metrics.llm.llm_judge_sql module
- eval_framework.metrics.llm.llm_judge_world_knowledge module
- eval_framework.metrics.llm.utils module
- Module contents
- eval_framework.metrics.loglikelihood package
- Submodules
- eval_framework.metrics.loglikelihood.accuracy_loglikelihood module
- eval_framework.metrics.loglikelihood.base module
- eval_framework.metrics.loglikelihood.confidence_weighted_accuracy module
- eval_framework.metrics.loglikelihood.dcs module
- eval_framework.metrics.loglikelihood.probability_mass module
- eval_framework.metrics.loglikelihood.ternary module
- Module contents
- eval_framework.result_processors package
- eval_framework.tasks package
- Subpackages
- Submodules
- eval_framework.tasks.base module
- eval_framework.tasks.eval_config module
- eval_framework.tasks.perturbation module
- eval_framework.tasks.registry module
- eval_framework.tasks.task_loader module
- eval_framework.tasks.task_names module
- eval_framework.tasks.utils module
- Module contents
- eval_framework.tasks.benchmarks package
- Submodules
- eval_framework.tasks.benchmarks.aidanbench module
- eval_framework.tasks.benchmarks.arc module
- eval_framework.tasks.benchmarks.arc_de module
- eval_framework.tasks.benchmarks.arc_fi module
- eval_framework.tasks.benchmarks.belebele module
- eval_framework.tasks.benchmarks.bigcodebench module
- eval_framework.tasks.benchmarks.casehold module
- eval_framework.tasks.benchmarks.chembench module
- eval_framework.tasks.benchmarks.copa module
- eval_framework.tasks.benchmarks.duc module
- eval_framework.tasks.benchmarks.flores200 module
- eval_framework.tasks.benchmarks.flores_plus module
- eval_framework.tasks.benchmarks.gpqa module
- eval_framework.tasks.benchmarks.gsm8k module
- eval_framework.tasks.benchmarks.hellaswag module
- eval_framework.tasks.benchmarks.hellaswag_de module
- eval_framework.tasks.benchmarks.humaneval module
- eval_framework.tasks.benchmarks.ifeval module
- eval_framework.tasks.benchmarks.include module
- eval_framework.tasks.benchmarks.infinitebench module
- eval_framework.tasks.benchmarks.math_reasoning module
- eval_framework.tasks.benchmarks.mbpp module
- eval_framework.tasks.benchmarks.mmlu module
- eval_framework.tasks.benchmarks.mmlu_de module
- eval_framework.tasks.benchmarks.mmlu_pro module
- eval_framework.tasks.benchmarks.mmmlu module
- eval_framework.tasks.benchmarks.openbookqa module
- eval_framework.tasks.benchmarks.opengptx_eu20 module
- eval_framework.tasks.benchmarks.pawsx module
- eval_framework.tasks.benchmarks.piqa module
- eval_framework.tasks.benchmarks.quality module
- eval_framework.tasks.benchmarks.sciq module
- eval_framework.tasks.benchmarks.sphyr module
- eval_framework.tasks.benchmarks.squad module
- eval_framework.tasks.benchmarks.struct_eval module
- eval_framework.tasks.benchmarks.tablebench module
- eval_framework.tasks.benchmarks.triviaqa module
- eval_framework.tasks.benchmarks.truthfulqa module
- eval_framework.tasks.benchmarks.winogender module
- eval_framework.tasks.benchmarks.winogrande module
- eval_framework.tasks.benchmarks.winox module
- eval_framework.tasks.benchmarks.wmt module
- eval_framework.tasks.benchmarks.zero_scrolls module
- Module contents
- eval_framework