Source code for oumi.core.evaluation.backends.alpaca_eval
# Copyright 2025 - Oumi
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from datetime import datetime
from pprint import pformat
from typing import Any
try:
import alpaca_eval # pyright: ignore[reportMissingImports]
except ImportError:
alpaca_eval = None
import pandas as pd
from oumi.builders.inference_engines import build_inference_engine
from oumi.core.configs import (
AlpacaEvalTaskParams,
EvaluationConfig,
InferenceConfig,
)
from oumi.core.distributed import is_world_process_zero
from oumi.core.evaluation.evaluation_result import EvaluationResult
from oumi.datasets.evaluation import AlpacaEvalDataset, utils
from oumi.utils.logging import logger
def evaluate(
task_params: AlpacaEvalTaskParams,
config: EvaluationConfig,
) -> EvaluationResult:
"""Evaluates a model using the Alpaca Eval framework.
For detailed documentation on the AlpacaEval framework, we refer you to the
following readme: https://github.com/tatsu-lab/alpaca_eval.
Args:
task_params: The AlpacaEval parameters to use for evaluation.
config: The desired configuration for evaluation.
Returns:
The evaluation result (including metrics and their values).
"""
# Prerequisites
if not alpaca_eval:
raise RuntimeError(
"The `alpaca_eval` package is NOT installed. Please either install all "
"evaluation dependencies with `pip install oumi[evaluation]` or directly "
"install the missing package with `pip install alpaca_eval`."
)
open_ai_key = os.environ.get("OPENAI_API_KEY")
if not open_ai_key:
logger.warning(
"`OPENAI_API_KEY` environment variable is NOT set. If you are using an "
"OpenAI model as an annotator (judge), the execution will fail."
)
# Set the annotators config and metric function based on the version.
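    # AlpacaEval 1.0 reports a plain GPT-4-judged win rate; AlpacaEval 2.0 uses a
    # GPT-4 Turbo annotator with a length-controlled win rate that corrects for
    # verbosity bias in the judge.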
if task_params.version == 1.0:
os.environ["IS_ALPACA_EVAL_2"] = str(False)
annotators_config = "alpaca_eval_gpt4"
fn_metric = "get_winrate"
sort_by_metric = "win_rate"
elif task_params.version == 2.0:
os.environ["IS_ALPACA_EVAL_2"] = str(True)
annotators_config = "weighted_alpaca_eval_gpt4_turbo"
fn_metric = "get_length_controlled_winrate"
sort_by_metric = "length_controlled_winrate"
else:
raise ValueError(
"The `version` field in `AlpacaEvalTaskParams` must be either 1.0 or 2.0."
)
# Get a timestamp for the current run.
start_time_str = datetime.now().strftime("%Y%m%d_%H%M%S")
# Load the evaluation dataset.
logger.info("Loading the `tatsu-lab/alpaca_eval` dataset.")
alpaca_dataset = AlpacaEvalDataset(
dataset_name="tatsu-lab/alpaca_eval"
).conversations()
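    # Optionally cap the number of evaluated examples (e.g., for quicker runs).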
if task_params.num_samples:
alpaca_dataset = alpaca_dataset[: task_params.num_samples]
# Run inference for the alpaca_dataset.
logger.info("Running inference with {inference_engine_type}.")
logger.info(f"\tAlpacaEval inference `model_params`:\n{pformat(config.model)}")
logger.info(
f"\tAlpacaEval inference `generation_params`:\n{pformat(config.generation)}"
)
inference_config = InferenceConfig(
model=config.model,
generation=config.generation,
engine=config.inference_engine,
remote_params=config.inference_remote_params,
)
inference_engine = build_inference_engine(
engine_type=config.inference_engine,
model_params=config.model,
remote_params=config.inference_remote_params,
)
responses = inference_engine.infer(
input=alpaca_dataset, inference_config=inference_config
)
# Convert the model responses from Oumi format to Alpaca format.
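    # `alpaca_eval` expects records with `instruction` and `output` columns, plus a
    # `generator` column naming the model being evaluated.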
    # Use the run name as the generator name; fall back to the start timestamp.
    generator_display_name = config.run_name or start_time_str
responses_json = utils.conversations_to_alpaca_format(responses)
responses_df = pd.DataFrame(responses_json)
responses_df["generator"] = generator_display_name
# Run AlpacaEval evaluation, i.e. annotate the model's responses.
logger.info("Running AlpacaEval annotation.")
logger.info(f"\tAlpacaEval `task_params`:\n{pformat(task_params)}")
df_leaderboard, _ = alpaca_eval.evaluate(
model_outputs=responses_df,
annotators_config=annotators_config,
fn_metric=fn_metric,
is_return_instead_of_print=True,
is_overwrite_leaderboard=True,
max_instances=task_params.num_samples,
sort_by=sort_by_metric,
**task_params.eval_kwargs,
) # type: ignore
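    # `df_leaderboard` is indexed by generator name; the row for our model (if
    # present) contains the computed metrics.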
    # Metrics are only available on the main process; return an empty result on all others.
if not is_world_process_zero():
return EvaluationResult()
    metric_dict: dict[str, Any] = {}
    if df_leaderboard is not None:
        if generator_display_name in df_leaderboard.index:
            metrics = df_leaderboard.loc[generator_display_name]
            metric_dict = {
                str(metric): value for metric, value in metrics.items()
            }
logger.info(f"AlpacaEval's metric dict is {pformat(metric_dict)}.")
else:
logger.error("AlpacaEval results not found in leaderboard.")
else:
logger.error("The `alpaca_eval` API did not return a leaderboard.")
backend_task_config = {
"IS_ALPACA_EVAL_2": os.environ.get("IS_ALPACA_EVAL_2", "None"),
"annotators_config": annotators_config,
"fn_metric": fn_metric,
"max_instances": task_params.num_samples,
"other_params": task_params.eval_kwargs,
"model_outputs": responses_json,
}
return EvaluationResult(
task_name=task_params.task_name,
task_result={"results": metric_dict},
backend_config=backend_task_config,
)
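Below is a minimal usage sketch for invoking this backend directly. It assumes that `AlpacaEvalTaskParams` and `EvaluationConfig` accept the fields referenced above (`version`, `num_samples`, and the model/generation/engine settings) as constructor arguments, and that `EvaluationResult.task_result` is directly accessible; in practice this backend is typically driven through Oumi's evaluation entry points rather than called by hand.

from oumi.core.configs import AlpacaEvalTaskParams, EvaluationConfig
from oumi.core.evaluation.backends.alpaca_eval import evaluate

# Hypothetical field values; consult `oumi.core.configs` for the exact schema.
task_params = AlpacaEvalTaskParams(version=2.0, num_samples=100)
config = EvaluationConfig()  # populate model, generation, and inference engine settings
result = evaluate(task_params=task_params, config=config)
# `task_result` mirrors the dict built above; attribute access is assumed here.
print(result.task_result["results"])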