Source code for oumi.inference.remote_vllm_inference_engine
# Copyright 2025 - Oumi
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Any, Optional

from typing_extensions import override

from oumi.core.configs import GenerationParams, ModelParams
from oumi.core.types.conversation import Conversation
from oumi.inference.remote_inference_engine import RemoteInferenceEngine

_CONTENT_KEY: str = "content"
_ROLE_KEY: str = "role"
class RemoteVLLMInferenceEngine(RemoteInferenceEngine):
    """Engine for running inference against Remote vLLM."""

    @property
    @override
    def base_url(self) -> Optional[str]:
        """Return the default base URL for the Remote vLLM API."""
        return None

    @property
    @override
    def api_key_env_varname(self) -> Optional[str]:
        """Return the default environment variable name for the Remote vLLM API key."""
        return None
    @override
    def get_supported_params(self) -> set[str]:
        """Returns a set of supported generation parameters for this engine."""
        return {
            "frequency_penalty",
            "logit_bias",
            "presence_penalty",
            "seed",
            "stop_strings",
            "stop_token_ids",
            "temperature",
            "top_p",
            "guided_decoding",
            "max_new_tokens",
        }
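    # Illustrative sketch (not part of the original module): the set above can be
    # compared against a `GenerationParams` instance to spot fields the remote
    # vLLM server would ignore. This assumes `GenerationParams` is a dataclass,
    # so `dataclasses.fields` exposes its field names:
    #
    #     import dataclasses
    #     all_fields = {f.name for f in dataclasses.fields(generation_params)}
    #     possibly_ignored = all_fields - engine.get_supported_params()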
    @override
    def _convert_conversation_to_api_input(
        self,
        conversation: Conversation,
        generation_params: GenerationParams,
        model_params: ModelParams,
    ) -> dict[str, Any]:
        """Converts a conversation to an OpenAI input.

        Documentation: https://platform.openai.com/docs/api-reference/chat/create

        Args:
            conversation: The conversation to convert.
            generation_params: Parameters for generation during inference.
            model_params: Model parameters to use during inference.

        Returns:
            Dict[str, Any]: A dictionary representing the OpenAI input.
        """
        # Prefer the adapter model name if one is configured.
        if model_params.adapter_model:
            model = model_params.adapter_model
        else:
            model = model_params.model_name

        api_input = {
            "model": model,
            "messages": self._get_list_of_message_json_dicts(
                conversation.messages, group_adjacent_same_role_turns=True
            ),
            "max_tokens": generation_params.max_new_tokens,
            # "max_completion_tokens": generation_params.max_new_tokens,
            # Future transition instead of `max_tokens`.
            # See https://github.com/vllm-project/vllm/issues/9845
            "temperature": generation_params.temperature,
            "top_p": generation_params.top_p,
            "frequency_penalty": generation_params.frequency_penalty,
            "presence_penalty": generation_params.presence_penalty,
            "n": 1,  # Number of completions to generate for each prompt.
            "seed": generation_params.seed,
            "logit_bias": generation_params.logit_bias,
        }

        # The guided decoding modes are mutually exclusive: JSON schema takes
        # precedence over regex, which takes precedence over choice.
        if generation_params.guided_decoding:
            if generation_params.guided_decoding.json:
                api_input["guided_json"] = generation_params.guided_decoding.json
            elif generation_params.guided_decoding.regex is not None:
                api_input["guided_regex"] = generation_params.guided_decoding.regex
            elif generation_params.guided_decoding.choice is not None:
                api_input["guided_choice"] = generation_params.guided_decoding.choice

        if generation_params.stop_strings:
            api_input["stop"] = generation_params.stop_strings
        if generation_params.stop_token_ids:
            api_input["stop_token_ids"] = generation_params.stop_token_ids

        return api_input
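# ---------------------------------------------------------------------------
# Usage sketch (not part of the original module). This assumes the constructor
# and `infer` entry point inherited from `RemoteInferenceEngine` /
# `BaseInferenceEngine`, and that `RemoteParams.api_url` points at a vLLM
# server exposing the OpenAI-compatible /v1/chat/completions route. The model
# name and URL below are placeholders.
#
#     from oumi.core.configs import ModelParams, RemoteParams
#     from oumi.core.types.conversation import Conversation, Message, Role
#     from oumi.inference.remote_vllm_inference_engine import (
#         RemoteVLLMInferenceEngine,
#     )
#
#     engine = RemoteVLLMInferenceEngine(
#         model_params=ModelParams(model_name="meta-llama/Llama-3.1-8B-Instruct"),
#         remote_params=RemoteParams(
#             api_url="http://localhost:8000/v1/chat/completions"
#         ),
#     )
#     conversations = engine.infer(
#         [Conversation(messages=[Message(role=Role.USER, content="Hello!")])]
#     )
#     print(conversations[0].messages[-1])
# ---------------------------------------------------------------------------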