Source code for oumi.core.evaluation.evaluator

# Copyright 2025 - Oumi
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import copy
import time
from dataclasses import fields
from datetime import datetime
from typing import Any, Callable, Optional, Union

from oumi.core.configs import (
    AlpacaEvalTaskParams,
    EvaluationConfig,
    EvaluationTaskParams,
    LMHarnessTaskParams,
)
from oumi.core.configs.params.evaluation_params import EvaluationBackend
from oumi.core.evaluation.backends.alpaca_eval import evaluate as evaluate_alpaca_eval
from oumi.core.evaluation.backends.lm_harness import evaluate as evaluate_lm_harness
from oumi.core.evaluation.evaluation_result import EvaluationResult
from oumi.core.registry import REGISTRY
from oumi.evaluation.platform_prerequisites import check_prerequisites
from oumi.evaluation.save_utils import save_evaluation_output


class Evaluator:
    """A class for evaluating language models on various tasks.

    Currently, the evaluator supports a wide range of tasks that are handled by three
    separate backends: LM Harness, Alpaca Eval, and Custom.

    - LM Harness: Framework by EleutherAI for evaluating language models (mostly) on
      standardized benchmarks (multiple-choice, word match, etc.). The backend
      supports a large number of popular benchmarks, which can be found at:
      https://github.com/EleutherAI/lm-evaluation-harness/tree/main/lm_eval/tasks.
    - Alpaca Eval: Framework for evaluating the instruction-following capabilities of
      language models, as well as whether their responses are helpful, accurate, and
      relevant. The instruction set consists of 805 open-ended questions, while the
      evaluation is based on "LLM-as-judge" and prioritizes human-alignment, aiming
      to assess whether the model responses meet the expectations of human
      evaluators.
    - Custom: Users can register their own evaluation functions using the decorator
      `@register_evaluation_function` and run custom evaluations based on their
      functions. Note that the `task_name` should be the registry key for the custom
      evaluation function to be used.
    """
    def evaluate(self, config: EvaluationConfig, **kwargs) -> list[EvaluationResult]:
        """Evaluates a model using the provided evaluation configuration.

        Args:
            config: The desired configuration for evaluation.
            kwargs: Additional keyword arguments required by evaluator backends.

        Returns:
            A list of evaluation results (one per task, in the same order as `tasks`).
        """
        # Create a copy of the evaluation config, without tasks, so that there is no
        # redundant information in the `config` input parameter of
        # `self.evaluate_task`.
        config_without_tasks = copy.deepcopy(config)
        config_without_tasks.tasks = []

        # Evaluate on each task included in the configuration, serially.
        evaluation_results = []
        for task in config.tasks:
            evaluation_result = self.evaluate_task(
                task_params=task, config=config_without_tasks, **kwargs
            )
            evaluation_results.append(evaluation_result)
        return evaluation_results
    def evaluate_task(
        self,
        task_params: EvaluationTaskParams,
        config: EvaluationConfig,
        **kwargs,
    ) -> EvaluationResult:
        """Evaluates a model on a specific task, using the provided configuration.

        Args:
            task_params: The task parameters for evaluation.
            config: The desired configuration for evaluation.
            kwargs: Additional keyword arguments required by evaluator backends.

        Returns:
            The results of evaluating on the task.
        """
        # Find the proper backend to execute the evaluation task.
        evaluation_backend: EvaluationBackend = task_params.get_evaluation_backend()

        # Ensure the task prerequisites are satisfied; fail fast if not.
        check_prerequisites(
            evaluation_backend=evaluation_backend,
            task_name=task_params.task_name,
        )

        # Get a timestamp at the beginning of the current run.
        start_time_str = datetime.now().strftime("%Y%m%d_%H%M%S")
        start_time = time.time()

        # Redirect the evaluation execution to the appropriate evaluation backend.
        if evaluation_backend == EvaluationBackend.LM_HARNESS:
            lm_harness_task_params = Evaluator._get_backend_task_params(task_params)
            assert isinstance(lm_harness_task_params, LMHarnessTaskParams)
            evaluation_result = evaluate_lm_harness(
                task_params=lm_harness_task_params,
                config=config,
                **kwargs,  # random_seed, numpy_random_seed, torch_random_seed
            )
        elif evaluation_backend == EvaluationBackend.ALPACA_EVAL:
            alpaca_eval_task_params = Evaluator._get_backend_task_params(task_params)
            assert isinstance(alpaca_eval_task_params, AlpacaEvalTaskParams)
            evaluation_result = evaluate_alpaca_eval(
                task_params=alpaca_eval_task_params,
                config=config,
                **kwargs,
            )
        elif evaluation_backend == EvaluationBackend.CUSTOM:
            evaluation_fn = Evaluator._get_custom_evaluation_fn(task_params.task_name)
            evaluation_result = evaluation_fn(
                task_params=task_params,
                config=config,
                **kwargs,
            )
        else:
            raise ValueError(f"Unknown evaluation backend: {evaluation_backend}")

        # Calculate the elapsed time for the evaluation run.
        evaluation_result.elapsed_time_sec = int(time.time() - start_time)
        evaluation_result.start_time = start_time_str

        # Save the output, if an output directory has been provided.
        if config.output_dir:
            self.save_output(
                task_params=task_params,
                evaluation_result=evaluation_result,
                base_output_dir=config.output_dir,
                config=config,
            )

        return evaluation_result
    def save_output(
        self,
        task_params: EvaluationTaskParams,
        evaluation_result: EvaluationResult,
        base_output_dir: str,
        config: Optional[EvaluationConfig],
    ) -> None:
        """Saves the evaluation's output to the specified output directory.

        Args:
            task_params: The task parameters used for this evaluation.
            evaluation_result: The evaluation result.
            base_output_dir: The directory where the evaluation results will be saved.
            config: The evaluation configuration.

        Returns:
            None
        """
        save_evaluation_output(
            backend_name=task_params.evaluation_backend,
            task_params=task_params,
            evaluation_result=evaluation_result,
            base_output_dir=base_output_dir,
            config=config,
        )
    @staticmethod
    def _get_custom_evaluation_fn(task_name: Optional[str]) -> Callable:
        """Retrieves the evaluation function of the custom task."""
        if not task_name:
            raise ValueError(
                "Missing `task_name` for custom Oumi evaluation. Please specify the "
                "task name, which should correspond to an evaluation function "
                "registered with the decorator `@register_evaluation_function`."
            )
        if evaluation_fn := REGISTRY.get_evaluation_function(task_name):
            return evaluation_fn
        else:
            raise ValueError(
                f"Task name `{task_name}` not found in the registry. For custom Oumi "
                "evaluations, the task name must match the name of a registered "
                "evaluation function. You can register a new function with the "
                "decorator `@register_evaluation_function`."
            )

    @staticmethod
    def _get_backend_task_params(
        task_params: EvaluationTaskParams,
    ) -> Union[LMHarnessTaskParams, AlpacaEvalTaskParams]:
        """Returns the evaluation backend-specific task parameters."""
        if task_params.get_evaluation_backend() == EvaluationBackend.LM_HARNESS:
            target_class = LMHarnessTaskParams
        elif task_params.get_evaluation_backend() == EvaluationBackend.ALPACA_EVAL:
            target_class = AlpacaEvalTaskParams
        elif task_params.get_evaluation_backend() == EvaluationBackend.CUSTOM:
            raise ValueError(
                "The custom evaluation backend does not use a subclass of "
                "`EvaluationTaskParams`, so `Evaluator._get_backend_task_params()` "
                "should not be called when `evaluation_backend` is set to "
                "`EvaluationBackend.CUSTOM`."
            )
        else:
            raise ValueError(f"Unknown backend: {task_params.evaluation_backend}")

        init_kwargs = Evaluator._get_init_kwargs_for_task_params_class(
            task_params=task_params, target_class=target_class
        )
        return target_class(**init_kwargs)

    @staticmethod
    def _get_init_kwargs_for_task_params_class(
        task_params: EvaluationTaskParams,
        target_class: type[EvaluationTaskParams],
    ) -> dict[str, Any]:
        """Returns the init keyword arguments for a `target_class` named *TaskParams.

        Given a target class named <evaluation backend>TaskParams, which subclasses
        `EvaluationTaskParams`, this method returns a 'flattened' dict with all the
        arguments needed to instantiate it. The dict includes all the parameters that
        are already members of `EvaluationTaskParams`, as well as additional
        parameters that are only known to the target class (stored under
        `eval_kwargs`).

        By 'flattened', we mean that all known parameters that are stored under the
        `eval_kwargs` dict are moved one level up, to the (flat) dict that is
        returned. In contrast, all parameters unknown to the target class remain
        (unflattened) inside the `eval_kwargs` dict.

        Example:
            Assuming these are the input parameters:
                task_params: EvaluationTaskParams(  # <- `num_fewshot` is NOT a member
                    evaluation_backend=EvaluationBackend.LM_HARNESS,
                    task_name="mmlu",
                    eval_kwargs={"num_fewshot": 10, "some_param": 20},
                )
                target_class: LMHarnessTaskParams  # <- `num_fewshot` is a member

            This function will return:
                {
                    "evaluation_backend": EvaluationBackend.LM_HARNESS,
                    "task_name": "mmlu",
                    "num_fewshot": 10,
                    "eval_kwargs": {"some_param": 20},
                }
        """
        task_params = copy.deepcopy(task_params)

        # Find all keys in `eval_kwargs` which are known to the target class.
        known_keys = []
        if task_params.eval_kwargs:
            field_names = [field.name for field in fields(target_class)]
            known_keys.extend(k for k in task_params.eval_kwargs if k in field_names)

        # Identify all kwargs known to the current class.
        init_keys = [
            key
            for key in dir(task_params)
            if not callable(getattr(task_params, key)) and not key.startswith("_")
        ]
        init_kwargs = {key: getattr(task_params, key) for key in init_keys}

        # Move known kwargs one level up: from `eval_kwargs` to the top-level dict.
        for key in known_keys:
            if key in init_kwargs:
                raise ValueError(
                    f"Parameter `{key}` is present twice, in both the task parameters "
                    "and the `eval_kwargs` dictionary. Please remove it from one of "
                    "them."
                )
            init_kwargs[key] = init_kwargs["eval_kwargs"].pop(key)

        return init_kwargs
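
For orientation, the sketch below shows how the custom backend described in the class docstring might be wired up end to end. It is a minimal, hypothetical example, not part of this module: the import path of `register_evaluation_function`, the registry key `"my_custom_eval"`, the `evaluation_backend="custom"` string, and the bare `EvaluationResult()` return value are assumptions to verify against the Oumi registry and config definitions.

from oumi.core.configs import EvaluationConfig, EvaluationTaskParams
from oumi.core.evaluation.evaluation_result import EvaluationResult
from oumi.core.evaluation.evaluator import Evaluator
from oumi.core.registry import register_evaluation_function  # import path assumed


# Hypothetical custom evaluation function. Its signature mirrors how `evaluate_task`
# invokes it above: evaluation_fn(task_params=..., config=..., **kwargs).
@register_evaluation_function("my_custom_eval")  # registry key is illustrative
def my_custom_eval(task_params, config, **kwargs):
    # Compute metrics here; returning a bare EvaluationResult assumes its fields
    # have default values.
    return EvaluationResult()


config = EvaluationConfig(
    tasks=[
        EvaluationTaskParams(
            evaluation_backend="custom",  # assumed string for EvaluationBackend.CUSTOM
            task_name="my_custom_eval",  # registry key of the function above
        ),
    ],
    output_dir="eval_output",  # results are persisted via `save_output`
)
results = Evaluator().evaluate(config)  # one EvaluationResult per task, in order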