Source code for oumi.core.evaluation.backends.alpaca_eval

# Copyright 2025 - Oumi
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from datetime import datetime
from pprint import pformat
from typing import Any

try:
    import alpaca_eval  # pyright: ignore[reportMissingImports]
except ImportError:
    alpaca_eval = None

import pandas as pd

from oumi.builders.inference_engines import build_inference_engine
from oumi.core.configs import (
    AlpacaEvalTaskParams,
    EvaluationConfig,
    InferenceConfig,
)
from oumi.core.distributed import is_world_process_zero
from oumi.core.evaluation.evaluation_result import EvaluationResult
from oumi.datasets.evaluation import AlpacaEvalDataset, utils
from oumi.utils.logging import logger


def evaluate(
    task_params: AlpacaEvalTaskParams,
    config: EvaluationConfig,
) -> EvaluationResult:
    """Evaluates a model using the Alpaca Eval framework.

    For detailed documentation on the AlpacaEval framework, we refer you to the
    following readme: https://github.com/tatsu-lab/alpaca_eval.

    Args:
        task_params: The AlpacaEval parameters to use for evaluation.
        config: The desired configuration for evaluation.

    Returns:
        The evaluation result (including metrics and their values).
    """
    # Prerequisites
    if not alpaca_eval:
        raise RuntimeError(
            "The `alpaca_eval` package is NOT installed. Please either install all "
            "evaluation dependencies with `pip install oumi[evaluation]` or directly "
            "install the missing package with `pip install alpaca_eval`."
        )
    open_ai_key = os.environ.get("OPENAI_API_KEY")
    if not open_ai_key:
        logger.warning(
            "`OPENAI_API_KEY` environment variable is NOT set. If you are using an "
            "OpenAI model as an annotator (judge), the execution will fail."
        )

    # Set the annotators config and metric function based on the version.
    if task_params.version == 1.0:
        os.environ["IS_ALPACA_EVAL_2"] = str(False)
        annotators_config = "alpaca_eval_gpt4"
        fn_metric = "get_winrate"
        sort_by_metric = "win_rate"
    elif task_params.version == 2.0:
        os.environ["IS_ALPACA_EVAL_2"] = str(True)
        annotators_config = "weighted_alpaca_eval_gpt4_turbo"
        fn_metric = "get_length_controlled_winrate"
        sort_by_metric = "length_controlled_winrate"
    else:
        raise ValueError(
            "The `version` field in `AlpacaEvalTaskParams` must be either 1.0 or 2.0."
        )

    # Get a timestamp for the current run.
    start_time_str = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Load the evaluation dataset.
    logger.info("Loading the `tatsu-lab/alpaca_eval` dataset.")
    alpaca_dataset = AlpacaEvalDataset(
        dataset_name="tatsu-lab/alpaca_eval"
    ).conversations()
    if task_params.num_samples:
        alpaca_dataset = alpaca_dataset[: task_params.num_samples]

    # Run inference for the alpaca_dataset.
    logger.info(f"Running inference with {config.inference_engine}.")
    logger.info(f"\tAlpacaEval inference `model_params`:\n{pformat(config.model)}")
    logger.info(
        f"\tAlpacaEval inference `generation_params`:\n{pformat(config.generation)}"
    )
    inference_config = InferenceConfig(
        model=config.model,
        generation=config.generation,
        engine=config.inference_engine,
        remote_params=config.inference_remote_params,
    )
    inference_engine = build_inference_engine(
        engine_type=config.inference_engine,
        model_params=config.model,
        remote_params=config.inference_remote_params,
    )
    responses = inference_engine.infer(
        input=alpaca_dataset, inference_config=inference_config
    )

    # Convert the model responses from Oumi format to Alpaca format.
    # If no run name was provided, fall back to the run's timestamp.
    generator_display_name = config.run_name or start_time_str
    responses_json = utils.conversations_to_alpaca_format(responses)
    responses_df = pd.DataFrame(responses_json)
    responses_df["generator"] = generator_display_name

    # Run AlpacaEval evaluation, i.e. annotate the model's responses.
    logger.info("Running AlpacaEval annotation.")
    logger.info(f"\tAlpacaEval `task_params`:\n{pformat(task_params)}")
    df_leaderboard, _ = alpaca_eval.evaluate(
        model_outputs=responses_df,
        annotators_config=annotators_config,
        fn_metric=fn_metric,
        is_return_instead_of_print=True,
        is_overwrite_leaderboard=True,
        max_instances=task_params.num_samples,
        sort_by=sort_by_metric,
        **task_params.eval_kwargs,
    )  # type: ignore

    # Metrics are only available on the main process, and `None` on others.
    if not is_world_process_zero():
        return EvaluationResult()

    metric_dict: dict[str, Any] = {}
    if df_leaderboard is not None:
        if generator_display_name in df_leaderboard.index:
            metrics = df_leaderboard.loc[generator_display_name]
            metric_dict = {
                str(metric): value for metric, value in metrics.items()
            }
            logger.info(f"AlpacaEval's metric dict is {pformat(metric_dict)}.")
        else:
            logger.error("AlpacaEval results not found in leaderboard.")
    else:
        logger.error("The `alpaca_eval` API did not return a leaderboard.")

    backend_task_config = {
        "IS_ALPACA_EVAL_2": os.environ.get("IS_ALPACA_EVAL_2", "None"),
        "annotators_config": annotators_config,
        "fn_metric": fn_metric,
        "max_instances": task_params.num_samples,
        "other_params": task_params.eval_kwargs,
        "model_outputs": responses_json,
    }
    return EvaluationResult(
        task_name=task_params.task_name,
        task_result={"results": metric_dict},
        backend_config=backend_task_config,
    )
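

# Example usage sketch (illustrative only): the model name, engine type, and
# sampling settings below are assumptions chosen for demonstration, and the
# `ModelParams`, `GenerationParams`, and `InferenceEngineType` imports are
# assumed to be available from `oumi.core.configs`. Running this requires the
# `alpaca_eval` package and, for the default GPT-4 annotators, an
# `OPENAI_API_KEY` set in the environment.
if __name__ == "__main__":
    from oumi.core.configs import (  # assumed imports for this sketch
        GenerationParams,
        InferenceEngineType,
        ModelParams,
    )

    example_task_params = AlpacaEvalTaskParams(version=2.0, num_samples=5)
    example_config = EvaluationConfig(
        model=ModelParams(model_name="HuggingFaceTB/SmolLM2-135M-Instruct"),
        generation=GenerationParams(max_new_tokens=256),
        inference_engine=InferenceEngineType.NATIVE,
        run_name="alpaca_eval_demo",
    )
    example_result = evaluate(task_params=example_task_params, config=example_config)
    print(example_result.task_result["results"])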