Source code for oumi.core.evaluation.backends.alpaca_eval
# Copyright 2025 - Oumi
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from datetime import datetime
from pprint import pformat
from typing import Any

try:
    import alpaca_eval  # pyright: ignore[reportMissingImports]
except ImportError:
    alpaca_eval = None

import pandas as pd

from oumi.core.configs import (
    AlpacaEvalTaskParams,
    EvaluationConfig,
)
from oumi.core.distributed import is_world_process_zero
from oumi.core.evaluation.evaluation_result import EvaluationResult
from oumi.core.inference import BaseInferenceEngine
from oumi.datasets.evaluation import AlpacaEvalDataset, utils
from oumi.utils.logging import logger

ALPACA_EVAL_TASK_NAME = "alpaca_eval"
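# Note: `alpaca_eval` is imported above as an optional dependency. The try/except
# guard lets this module be imported even when the package is absent; `evaluate()`
# below checks for it at call time and raises a RuntimeError with install
# instructions if it is missing.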
def evaluate(
    task_params: AlpacaEvalTaskParams,
    config: EvaluationConfig,
    inference_engine: BaseInferenceEngine,
) -> EvaluationResult:
    """Evaluates a model using the Alpaca Eval framework.

    For detailed documentation on the AlpacaEval framework, we refer you to the
    following readme: https://github.com/tatsu-lab/alpaca_eval.

    Args:
        task_params: The AlpacaEval parameters to use for evaluation.
        config: The desired configuration for evaluation.
        inference_engine: The inference engine to use for generating responses.

    Returns:
        The evaluation result (including metrics and their values).
    """
    # Prerequisites
    if not alpaca_eval:
        raise RuntimeError(
            "The `alpaca_eval` package is NOT installed. Please either install all "
            "evaluation dependencies with `pip install oumi[evaluation]` or directly "
            "install the missing package with `pip install alpaca_eval`."
        )
    open_ai_key = os.environ.get("OPENAI_API_KEY")
    if not open_ai_key:
        logger.warning(
            "`OPENAI_API_KEY` environment variable is NOT set. If you are using an "
            "OpenAI model as an annotator (judge), the execution will fail."
        )

    # Set the annotators config and metric function based on the version.
    if task_params.version == 1.0:
        os.environ["IS_ALPACA_EVAL_2"] = str(False)
        annotators_config = "alpaca_eval_gpt4"
        fn_metric = "get_winrate"
        sort_by_metric = "win_rate"
    elif task_params.version == 2.0:
        os.environ["IS_ALPACA_EVAL_2"] = str(True)
        annotators_config = "weighted_alpaca_eval_gpt4_turbo"
        fn_metric = "get_length_controlled_winrate"
        sort_by_metric = "length_controlled_winrate"
    else:
        raise ValueError(
            "The `version` field in `AlpacaEvalTaskParams` must be either 1.0 or 2.0."
        )

    # Get a timestamp for the current run.
    start_time_str = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Load the evaluation dataset.
    logger.info("Loading the `tatsu-lab/alpaca_eval` dataset.")
    alpaca_dataset = AlpacaEvalDataset(
        dataset_name="tatsu-lab/alpaca_eval"
    ).conversations()
    if task_params.num_samples:
        alpaca_dataset = alpaca_dataset[: task_params.num_samples]

    # Run inference for the alpaca_dataset.
    logger.info(
        "\tAlpacaEval inference `model_params`:\n"
        f"{pformat(inference_engine._model_params)}\n"
        "\tAlpacaEval inference `generation_params`:\n"
        f"{pformat(inference_engine._generation_params)}"
    )
    responses = inference_engine.infer(input=alpaca_dataset)

    # Convert the model responses from Oumi format to Alpaca format.
    generator_display_name = config.run_name or start_time_str  # No run name? use time.
    responses_json = utils.conversations_to_alpaca_format(responses)
    responses_df = pd.DataFrame(responses_json)
    responses_df["generator"] = generator_display_name
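    # Note: at this point `responses_df` is expected to hold one row per prompt,
    # labeled with the "generator" column set above. The remaining columns come
    # from `utils.conversations_to_alpaca_format`; assuming the standard Alpaca
    # format, that is typically an "instruction" and an "output" field per row,
    # which is the shape `alpaca_eval.evaluate` reads from `model_outputs`.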
    # Run AlpacaEval evaluation, i.e. annotate the model's responses.
    logger.info("Running AlpacaEval annotation.")
    logger.info(f"\tAlpacaEval `task_params`:\n{pformat(task_params)}")
    df_leaderboard, _ = alpaca_eval.evaluate(
        model_outputs=responses_df,
        annotators_config=annotators_config,
        fn_metric=fn_metric,
        is_return_instead_of_print=True,
        is_overwrite_leaderboard=True,
        max_instances=task_params.num_samples,
        sort_by=sort_by_metric,
        **task_params.eval_kwargs,
    )  # type: ignore

    # Metrics are only available on the main process, and `None` on others.
    if not is_world_process_zero():
        return EvaluationResult()

    metric_dict = {}
    if df_leaderboard is not None:
        if generator_display_name in df_leaderboard.index:
            metrics = df_leaderboard.loc[generator_display_name]
            metric_dict: dict[str, Any] = {
                str(metric): value for metric, value in metrics.items()
            }
            logger.info(f"AlpacaEval's metric dict is {pformat(metric_dict)}.")
        else:
            logger.error("AlpacaEval results not found in leaderboard.")
    else:
        logger.error("The `alpaca_eval` API did not return a leaderboard.")

    backend_task_config = {
        "IS_ALPACA_EVAL_2": os.environ.get("IS_ALPACA_EVAL_2", "None"),
        "annotators_config": annotators_config,
        "fn_metric": fn_metric,
        "max_instances": task_params.num_samples,
        "other_params": task_params.eval_kwargs,
    }
    if task_params.log_samples:
        backend_task_config["model_outputs"] = responses_json

    return EvaluationResult(
        task_name=task_params.task_name,
        task_result={"results": {ALPACA_EVAL_TASK_NAME: metric_dict}},
        backend_config=backend_task_config,
    )
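The snippet below is a minimal usage sketch of how this backend might be driven directly, not an exact recipe from the Oumi docs. Only the `evaluate()` signature and the fields passed to `EvaluationResult` are taken from the source above; the keyword construction of `AlpacaEvalTaskParams` and `EvaluationConfig`, and the assumption that the result exposes a `task_result` attribute, are illustrative guesses.

# Hedged usage sketch (not part of the module above); assumptions are marked.
import os

from oumi.core.configs import AlpacaEvalTaskParams, EvaluationConfig
from oumi.core.evaluation.backends.alpaca_eval import evaluate

os.environ["OPENAI_API_KEY"] = "sk-..."  # required for the OpenAI-based annotators

# Assumption: both config classes accept plain keyword construction; the field
# names (`version`, `num_samples`, `run_name`) are taken from the source above.
task_params = AlpacaEvalTaskParams(version=2.0, num_samples=5)
config = EvaluationConfig(run_name="my-model-smoke-test")

# Any concrete BaseInferenceEngine implementation works here; how it is built
# depends on your model and is deliberately left out of this sketch.
inference_engine = ...

result = evaluate(task_params, config, inference_engine)

# Assuming `task_result` is exposed as an attribute, it mirrors the structure
# built at the end of `evaluate()`:
# {"results": {"alpaca_eval": {"length_controlled_winrate": ..., ...}}}
print(result.task_result)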