Source code for oumi.cli.evaluate

# Copyright 2025 - Oumi
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Annotated

import typer
from rich.table import Table

import oumi.cli.cli_utils as cli_utils
from oumi.cli.alias import AliasType, try_get_config_name_for_alias
from oumi.utils.logging import logger


def evaluate(
    ctx: typer.Context,
    config: Annotated[
        str,
        typer.Option(
            *cli_utils.CONFIG_FLAGS,
            help="Path to the configuration file for evaluation.",
        ),
    ],
    level: cli_utils.LOG_LEVEL_TYPE = None,
):
    """Evaluate a model.

    Args:
        ctx: The Typer context object.
        config: Path to the configuration file for evaluation.
        level: The logging level for the specified command.
    """
    extra_args = cli_utils.parse_extra_cli_args(ctx)

    config = str(
        cli_utils.resolve_and_fetch_config(
            try_get_config_name_for_alias(config, AliasType.EVAL),
        )
    )
    with cli_utils.CONSOLE.status(
        "[green]Loading configuration...[/green]", spinner="dots"
    ):
        # Delayed imports
        from oumi import evaluate as oumi_evaluate
        from oumi.core.configs import EvaluationConfig
        # End imports

        # Load configuration
        parsed_config: EvaluationConfig = EvaluationConfig.from_yaml_and_arg_list(
            config, extra_args, logger=logger
        )
        parsed_config.finalize_and_validate()

    # Run evaluation
    with cli_utils.CONSOLE.status(
        "[green]Running evaluation...[/green]", spinner="dots"
    ):
        results = oumi_evaluate(parsed_config)

    # Make a best-effort attempt at parsing metrics.
    for task_result in results:
        table = Table(
            title="Evaluation Results",
            title_style="bold magenta",
            show_lines=True,
        )
        table.add_column("Benchmark", style="cyan")
        table.add_column("Metric", style="yellow")
        table.add_column("Score", style="green")
        table.add_column("Std Error", style="dim")

        parsed_results = task_result.get("results", {})
        if not isinstance(parsed_results, dict):
            continue

        for task_name, metrics in parsed_results.items():
            # Get the benchmark display name from our benchmarks list
            if not isinstance(metrics, dict):
                # Skip if the metrics are not in a dict format
                table.add_row(
                    task_name,
                    "<unknown>",
                    "<unknown>",
                    "-",
                )
                continue

            benchmark_name: str = metrics.get("alias", task_name)

            # Process metrics
            for metric_name, value in metrics.items():
                metric_name: str = str(metric_name)
                if isinstance(value, (int, float)):
                    # Extract base metric name and type
                    base_name, *metric_type = metric_name.split(",")

                    # Skip if this is a stderr metric;
                    # we'll handle it with the main metric.
                    if base_name.endswith("_stderr"):
                        continue

                    # Get corresponding stderr if it exists
                    stderr_key = f"{base_name}_stderr,{metric_type[0] if metric_type else 'none'}"  # noqa: E501
                    stderr_value = metrics.get(stderr_key)
                    stderr_display = (
                        f"{stderr_value:.2%}" if stderr_value is not None else "-"
                    )

                    # Clean up metric name
                    clean_metric = base_name.replace("_", " ").title()

                    if isinstance(value, float):
                        if value > 1:
                            value_str = f"{value:.2f}"
                        else:
                            value_str = f"{value:.2%}"
                    else:  # Includes ints
                        value_str = str(value)

                    table.add_row(
                        benchmark_name,
                        clean_metric,
                        value_str,
                        stderr_display,
                    )

        cli_utils.CONSOLE.print(table)
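
For reference, a minimal sketch of how this command could be wired into a standalone Typer app and invoked; the `app` object, the script name, and the example config path below are illustrative assumptions, not part of this module.

# Illustrative sketch (assumption): registering `evaluate` on a Typer app.
import typer

from oumi.cli.evaluate import evaluate

app = typer.Typer()
# Allow unknown/extra CLI args so `cli_utils.parse_extra_cli_args(ctx)` can pick up
# `key=value` overrides passed after the known options.
app.command(
    context_settings={"allow_extra_args": True, "ignore_unknown_options": True}
)(evaluate)

if __name__ == "__main__":
    # Example invocation (hypothetical config path):
    #   python eval_cli.py --config configs/eval.yaml
    app()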