# Copyright 2025 - Oumi
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Supported models configuration for Oumi framework.

This module defines the configuration for all non-standard models in the Oumi framework,
including both language models (LLMs) and vision-language models (VLMs). It provides
a centralized registry of model configurations that specify how different models
should be handled during training, inference, and evaluation.

Note that most models should work without any special configuration, and therefore do
not need to be added to this module.

Key Components:
    - InternalModelConfig: Configuration parameters for model behavior
    - Model-specific config creators: Functions that create configs for specific models
    - Registry functions: For looking up and accessing model configurations
    - _ModelTypeInfo: Metadata for each supported model type

How to Add a New Model:
    1. Create a configuration function::

        def _create_my_model_config() -> InternalModelConfig:
            config = InternalModelConfig()
            # Configure the model's specific settings
            config.chat_template = "my_template"
            # Add any special features or parameters
            return config

    2. Add the model to get_all_models_map()::

        _ModelTypeInfo(
            model_type="my_model",  # Must match HF config.model_type
            model_class=transformers.AutoModelForCausalLM,  # Or appropriate class
            tested=False,  # Set to True once tests are added
            config=_create_my_model_config(),
        )

    3. For VLMs, configure visual features::

        vlm_config = _create_default_vlm_config(
            supports_multiple_images=True,  # If model supports multiple images
            pixel_values_variable_shape=True,  # If images can have different sizes
        )
        # Add any model-specific image features
        vlm_config.model_input_features.update({...})
"""
import copy
import functools
import types
from collections.abc import Mapping
from typing import Any, NamedTuple, cast
import transformers
from oumi.core.configs import ModelParams
from oumi.core.configs.internal.internal_model_config import (
InternalFeatureFirstDimAction,
InternalFeatureSpec,
InternalModelConfig,
InternalPaddingSide,
InternalVisualModelConfig,
)
from oumi.core.registry import REGISTRY, RegistryType
from oumi.utils.cache_utils import dict_cache
from oumi.utils.logging import logger
from oumi.utils.packaging import is_transformers_v5
[docs]
@dict_cache
def find_model_hf_config(
    model_name: str,
    *,
    trust_remote_code: bool,
    revision: str | None = None,
    **kwargs: Any,
) -> transformers.PretrainedConfig:
    """Loads the HuggingFace config object for `model_name`.

    The lookup is delegated to `transformers.AutoConfig.from_pretrained`, and the
    function is wrapped with the `dict_cache` decorator.

    Args:
        model_name: Model identifier forwarded to `AutoConfig.from_pretrained`.
        trust_remote_code: Forwarded as-is to `AutoConfig.from_pretrained`.
        revision: Optional model revision, forwarded as-is.
        **kwargs: Extra keyword arguments forwarded to `AutoConfig.from_pretrained`.

    Returns:
        The loaded config, cast to `transformers.PretrainedConfig`.
    """
    # `return_unused_kwargs=True` makes `from_pretrained` return a
    # (config, leftover kwargs) pair instead of just the config.
    loaded = transformers.AutoConfig.from_pretrained(
        model_name,
        trust_remote_code=trust_remote_code,
        return_unused_kwargs=True,
        revision=revision,
        **kwargs,
    )
    hf_config, leftover_kwargs = loaded
    if leftover_kwargs:
        # Surface kwargs the config did not consume; they are likely typos.
        logger.warning(
            f"Unused kwargs found in '{model_name}' config: {leftover_kwargs}."
        )
    return cast(transformers.PretrainedConfig, hf_config)
class _ModelTypeInfo(NamedTuple):
    """Registry record describing one supported model type.

    One record per model type is stored in the mapping returned by
    `get_all_models_map()`, keyed by `model_type`.

    Attributes:
        model_type: Identifier that must match the `model_type` field of the
            model's HuggingFace config (e.g. "gpt2", "llava", "qwen2_vl"). It is
            the key used to look the record up.
        model_class: The HuggingFace `transformers` auto-class used to load models
            of this type, for example `transformers.AutoModelForCausalLM` for
            plain language models, or `transformers.AutoModelForVision2Seq` /
            `transformers.AutoModelForImageTextToText` for vision-language models.
        config: The `InternalModelConfig` holding the model-specific settings:
            chat template, expected input features (`input_ids`, `pixel_values`,
            ...), tokenizer overrides, and the visual config for VLMs.
        tested: True only when the configuration has been verified by tests
            (see test_supported_models.py). Defaults to False.
    """

    model_type: str
    model_class: type
    config: InternalModelConfig
    tested: bool = False
def _create_default_vlm_config(
    *,
    supports_multiple_images: bool = False,
    pixel_values_variable_shape: bool = False,
    pixel_values_first_dim_action: InternalFeatureFirstDimAction = (
        InternalFeatureFirstDimAction.DROP_IF_DUMMY
    ),
) -> InternalModelConfig:
    """Builds the baseline configuration shared by vision-language models.

    Model-specific factories in this module start from this config and then
    override the chat template or register additional input features.

    Args:
        supports_multiple_images: Whether the model accepts more than one image
            in a single prompt. Stored on the visual config.
        pixel_values_variable_shape: Whether `pixel_values` may differ in shape
            between examples. Stored both on the `pixel_values` feature spec and
            on the visual config.
        pixel_values_first_dim_action: How the first dimension of the
            `pixel_values` tensor is handled. Defaults to `DROP_IF_DUMMY`.

    Returns:
        A new `InternalModelConfig` with:
        - the "llava" chat template,
        - a required, image-dependent `pixel_values` input feature,
        - an `InternalVisualModelConfig` attached as `visual_config`.

    Example:
        >>> config = _create_default_vlm_config(supports_multiple_images=True)
        >>> config.visual_config.supports_multiple_images
        True
    """
    pixel_values_spec = InternalFeatureSpec(
        name="pixel_values",
        required=True,
        variable_shape=pixel_values_variable_shape,
        first_dim_action=pixel_values_first_dim_action,
        image_dependent=True,
    )

    visual_config = InternalVisualModelConfig()
    visual_config.variable_shape_image_features = pixel_values_variable_shape
    visual_config.supports_multiple_images = supports_multiple_images

    config = InternalModelConfig()
    config.chat_template = "llava"
    config.model_input_features["pixel_values"] = pixel_values_spec
    config.visual_config = visual_config
    return config
def _create_gpt2_config() -> InternalModelConfig:
    """Builds the config for GPT-2.

    Returns:
        An `InternalModelConfig` using the "gpt2" chat template and
        "<|endoftext|>" as the tokenizer pad token.
    """
    gpt2_config = InternalModelConfig(
        tokenizer_pad_token="<|endoftext|>",
        chat_template="gpt2",
    )
    return gpt2_config
[docs]
@functools.cache
def get_default_vlm_model_config() -> InternalModelConfig:
    """Returns the default VLM model config.

    The result of `_create_default_vlm_config()` (all defaults) is memoized by
    `functools.cache`, so every call returns the same config object.
    """
    default_config = _create_default_vlm_config()
    return default_config
def _create_llava_vlm_config() -> InternalModelConfig:
    """Builds the config for LLaVA models.

    Starts from the default VLM config, uses the "llava" chat template, and adds
    the `patch_size` and `vision_feature_select_strategy` processor kwargs.
    """
    llava_config = _create_default_vlm_config()
    assert llava_config.visual_config is not None
    llava_config.chat_template = "llava"
    llava_config.processor_kwargs["patch_size"] = 14
    llava_config.processor_kwargs["vision_feature_select_strategy"] = "default"
    return llava_config
def _create_blip2_vlm_config() -> InternalModelConfig:
    """Builds the config for BLIP-2 models.

    Starts from the default VLM config, switches to the "default" chat template,
    and adds the `num_query_tokens` processor kwarg.
    """
    blip2_config = _create_default_vlm_config()
    assert blip2_config.visual_config is not None
    blip2_config.chat_template = "default"
    blip2_config.processor_kwargs["num_query_tokens"] = 32
    return blip2_config
def _create_mllama_vlm_config() -> InternalModelConfig:
    """Builds the config for MLLaMA models.

    Starts from the default VLM config with multi-image support enabled, uses
    the "llama3-instruct" chat template, and registers three additional
    required, fixed-shape, image-dependent input features.
    """
    config = _create_default_vlm_config(supports_multiple_images=True)
    config.chat_template = "llama3-instruct"

    extra_feature_names = (
        "aspect_ratio_ids",
        "aspect_ratio_mask",
        "cross_attention_mask",
    )
    for feature_name in extra_feature_names:
        config.model_input_features[feature_name] = InternalFeatureSpec(
            name=feature_name,
            required=True,
            variable_shape=False,
            image_dependent=True,
        )
    return config
def _create_qwen2_vl_vlm_config() -> InternalModelConfig:
    """Builds the config for Qwen2-VL models.

    Starts from the default VLM config with variable-shape `pixel_values` and
    single-image support, uses the "qwen2-vl-instruct" chat template, registers
    the `image_grid_thw` feature, and sets the processor pixel bounds.
    """
    config = _create_default_vlm_config(
        # FIXME OPE-355 Set to True once multi-image issues are resolved for the model.
        supports_multiple_images=False,
        pixel_values_variable_shape=True,
    )
    config.chat_template = "qwen2-vl-instruct"
    # FIXME OPE-946 Consider updating to "right":
    # config.padding_side = InternalPaddingSide.PAD_RIGHT

    config.model_input_features["image_grid_thw"] = InternalFeatureSpec(
        name="image_grid_thw",
        required=True,
        variable_shape=False,
        image_dependent=True,
    )

    config.processor_kwargs["min_pixels"] = 256 * 28 * 28
    config.processor_kwargs["max_pixels"] = 1280 * 28 * 28
    return config
def _create_qwen2_5_vl_vlm_config() -> InternalModelConfig:
    """Builds the config for Qwen2.5-VL models.

    Reuses the Qwen2-VL config and overrides only what differs: right padding
    and the `min_pixels` / `max_pixels` processor bounds.
    """
    config = _create_qwen2_vl_vlm_config()

    # Update default parameters that differ from Qwen2:
    config.padding_side = InternalPaddingSide.PAD_RIGHT

    # Defaults per Qwen2.5-VL:
    # https://github.com/QwenLM/Qwen2.5-VL/blob/main/qwen-vl-utils/src/qwen_vl_utils/vision_process.py # noqa: E501
    config.processor_kwargs["min_pixels"] = 4 * 28 * 28
    config.processor_kwargs["max_pixels"] = 16384 * 28 * 28
    return config
def _create_qwen3_vl_vlm_config() -> InternalModelConfig:
    """Builds the config for Qwen3-VL models.

    Starts from the default VLM config with variable-shape `pixel_values` and
    multi-image support, uses the "qwen3-vl-instruct" chat template, registers
    the `image_grid_thw` and optional `mm_token_type_ids` features, and sets the
    processor pixel bounds and patch size.
    """
    config = _create_default_vlm_config(
        supports_multiple_images=True,
        pixel_values_variable_shape=True,
    )
    config.chat_template = "qwen3-vl-instruct"
    # FIXME OPE-946 Consider updating to "right":
    # config.padding_side = InternalPaddingSide.PAD_RIGHT

    features = config.model_input_features
    features["image_grid_thw"] = InternalFeatureSpec(
        name="image_grid_thw",
        required=True,
        variable_shape=False,
        image_dependent=True,
    )
    # mm_token_type_ids is produced by the Qwen3-VL processor with a leading
    # batch dimension that must be stripped when processing individual examples.
    features["mm_token_type_ids"] = InternalFeatureSpec(
        name="mm_token_type_ids",
        required=False,
        variable_shape=False,
        first_dim_action=InternalFeatureFirstDimAction.DROP_IF_DUMMY,
    )

    # Defaults per Qwen3-VL:
    # https://github.com/QwenLM/Qwen3-VL/blob/main/qwen-vl-utils/src/qwen_vl_utils/vision_process.py
    config.processor_kwargs["min_pixels"] = 4 * 28 * 28
    config.processor_kwargs["max_pixels"] = 16384 * 28 * 28
    config.processor_kwargs["patch_size"] = 16
    return config
def _create_phi3_vlm_config() -> InternalModelConfig:
    """Builds the config for Phi-3 vision models.

    Starts from the default VLM config with variable-shape `pixel_values` and
    single-image support, uses the "phi3-instruct" chat template, clears
    `label_ignore_index`, enables `sanitize_negative_labels`, and registers the
    `image_sizes` feature.
    """
    config = _create_default_vlm_config(
        # FIXME OPE-355 Set to True once multi-image issues are resolved for the model.
        supports_multiple_images=False,
        pixel_values_variable_shape=True,
    )
    config.chat_template = "phi3-instruct"
    config.sanitize_negative_labels = True
    config.label_ignore_index = None

    config.model_input_features["image_sizes"] = InternalFeatureSpec(
        name="image_sizes",
        required=True,
        variable_shape=False,
        image_dependent=True,
    )
    return config
def _create_phi4_vlm_config() -> InternalModelConfig:
    """Builds the config for Phi-4 multimodal models.

    Unlike most VLM factories here, this one does not start from the default
    VLM config: there is no `pixel_values` feature, and the main image feature
    is `input_image_embeds`. Audio-related features are listed in
    `ignore_features`.
    """
    config = InternalModelConfig()
    config.chat_template = "phi3-instruct"
    # We won't use audio features.
    config.ignore_features = [
        "audio_attention_mask",
        "audio_embed_sizes",
        "input_audio_embeds",
    ]

    features = config.model_input_features
    # Variable-shape image features.
    for feature_name in ("input_image_embeds", "image_attention_mask"):
        features[feature_name] = InternalFeatureSpec(
            name=feature_name,
            required=True,
            variable_shape=True,
            image_dependent=True,
            first_dim_action=InternalFeatureFirstDimAction.DROP_IF_DUMMY,
        )
    # Fixed-shape image feature.
    features["image_sizes"] = InternalFeatureSpec(
        name="image_sizes",
        required=True,
        variable_shape=False,
        image_dependent=True,
    )

    visual_config = InternalVisualModelConfig()
    visual_config.main_image_feature = "input_image_embeds"
    visual_config.variable_shape_image_features = True
    # FIXME OPE-355 Set to True once multi-image issues are resolved for the model.
    visual_config.supports_multiple_images = False
    config.visual_config = visual_config
    return config
def _create_internvl_config() -> InternalModelConfig:
    """Builds the config for InternVL models.

    Starts from the default VLM config with variable-shape `pixel_values` and
    single-image support, uses the "internvl3" chat template, and asks the
    processor to return a dict.
    """
    config = _create_default_vlm_config(
        # FIXME OPE-355 Set to True once multi-image issues are resolved for the model.
        supports_multiple_images=False,
        pixel_values_variable_shape=True,
    )
    config.chat_template = "internvl3"

    # Add to processor to return key-values pairs (e.g., "pixel_values": torch.Tensor):
    config.processor_kwargs["return_dict"] = True

    # Guard against the default first-dim handling of `pixel_values` changing.
    pixel_values_spec = config.model_input_features["pixel_values"]
    assert (
        pixel_values_spec.first_dim_action
        == InternalFeatureFirstDimAction.DROP_IF_DUMMY
    )
    return config
def _create_idefics3_vlm_config() -> InternalModelConfig:
    """Builds the config for Idefics3 models.

    Starts from the default VLM config with multi-image support and
    variable-shape `pixel_values`, and registers the `pixel_attention_mask`
    feature.
    """
    config = _create_default_vlm_config(
        pixel_values_variable_shape=True,
        supports_multiple_images=True,
    )
    # FIXME OPE-697 Create model-specific chat template
    config.chat_template = "llava"

    config.model_input_features["pixel_attention_mask"] = InternalFeatureSpec(
        name="pixel_attention_mask",
        required=True,
        variable_shape=False,
        image_dependent=True,
    )
    return config
def _create_molmo_vlm_config() -> InternalModelConfig:
    """Builds the config for the Molmo VLM.

    Molmo does not start from the default VLM config. All six input features
    registered here are required, variable-shape, and keep their first
    dimension; only `images`, `image_masks` and `image_input_idx` are marked as
    image-dependent. `images` is the main image feature.
    """
    non_image_feature_names = ("attention_mask", "input_ids", "labels")
    image_feature_names = ("images", "image_masks", "image_input_idx")

    config = InternalModelConfig()
    for feature_name in (*non_image_feature_names, *image_feature_names):
        config.model_input_features[feature_name] = InternalFeatureSpec(
            name=feature_name,
            required=True,
            variable_shape=True,
            first_dim_action=InternalFeatureFirstDimAction.KEEP,
            image_dependent=feature_name in image_feature_names,
        )
    config.chat_template = "molmo"

    visual_config = InternalVisualModelConfig()
    visual_config.main_image_feature = "images"
    visual_config.variable_shape_image_features = True
    visual_config.supports_multiple_images = False
    config.visual_config = visual_config
    return config
[docs]
@functools.cache
def get_all_models_map() -> Mapping[
    str,  # model type
    _ModelTypeInfo,
]:
    """Builds the registry of non-standard models supported by Oumi.

    Each entry maps a model type (the `model_type` value of the HuggingFace
    model config) to the `_ModelTypeInfo` describing how to load and configure
    it. The result is memoized by `functools.cache`, so the registry is built
    once and the same mapping object is returned on every call.

    Returns:
        A read-only mapping (`types.MappingProxyType`) from model type to
        `_ModelTypeInfo`, covering both LLMs and VLMs.
    """
    base_vlm_config: InternalModelConfig = _create_default_vlm_config()
    llm_class = transformers.AutoModelForCausalLM
    # Pick the VLM auto-class according to the installed transformers version.
    vlm_class = (
        transformers.AutoModelForImageTextToText
        if is_transformers_v5()
        else transformers.AutoModelForVision2Seq  # type: ignore[attr-defined]
    )

    def _plain_vlm(model_type: str) -> _ModelTypeInfo:
        # Entry for a VLM that uses the unmodified default config. Each entry
        # gets its own deep copy so entries never share a config object.
        return _ModelTypeInfo(
            model_type=model_type,
            model_class=vlm_class,
            config=copy.deepcopy(base_vlm_config),
        )

    entries: list[_ModelTypeInfo] = [
        _ModelTypeInfo(
            model_type="gpt2",
            model_class=llm_class,
            config=_create_gpt2_config(),
            tested=True,
        ),
        _ModelTypeInfo(
            model_type="blip-2",
            model_class=vlm_class,
            config=_create_blip2_vlm_config(),
            tested=True,
        ),
        _plain_vlm("blip"),
        _plain_vlm("chameleon"),
        _plain_vlm("idefics"),
        _plain_vlm("idefics2"),
        _ModelTypeInfo(
            model_type="idefics3",
            model_class=vlm_class,
            config=_create_idefics3_vlm_config(),
        ),
        _plain_vlm("instructblip"),
        _ModelTypeInfo(
            model_type="llava",
            model_class=vlm_class,
            config=_create_llava_vlm_config(),
            tested=True,
        ),
        _ModelTypeInfo(
            model_type="mllama",
            model_class=vlm_class,
            config=_create_mllama_vlm_config(),
            tested=True,
        ),
        _plain_vlm("paligemma"),
        _ModelTypeInfo(
            model_type="qwen2_vl",
            model_class=vlm_class,
            config=_create_qwen2_vl_vlm_config(),
            tested=True,
        ),
        _ModelTypeInfo(
            model_type="qwen2_5_vl",
            model_class=vlm_class,
            config=_create_qwen2_5_vl_vlm_config(),
            tested=True,
        ),
        _ModelTypeInfo(
            model_type="qwen3_vl",
            model_class=vlm_class,
            config=_create_qwen3_vl_vlm_config(),
            tested=True,
        ),
        _ModelTypeInfo(
            model_type="qwen3_vl_moe",
            model_class=vlm_class,
            config=_create_qwen3_vl_vlm_config(),
        ),
        _plain_vlm("vipllava"),
        _ModelTypeInfo(
            model_type="molmo",
            model_class=transformers.AutoModelForCausalLM,
            config=_create_molmo_vlm_config(),
        ),
        _ModelTypeInfo(
            model_type="phi3_v",
            model_class=transformers.AutoModelForCausalLM,
            config=_create_phi3_vlm_config(),
            tested=True,
        ),
        _ModelTypeInfo(
            model_type="phi4mm",
            model_class=transformers.AutoModelForCausalLM,
            config=_create_phi4_vlm_config(),
        ),
        _ModelTypeInfo(
            model_type="internvl",
            model_class=transformers.AutoModelForImageTextToText,
            config=_create_internvl_config(),
        ),
    ]

    models_by_type: dict[str, _ModelTypeInfo] = {}
    for info in entries:
        models_by_type[info.model_type] = info
    # Make it immutable.
    return types.MappingProxyType(models_by_type)
[docs]
def get_custom_model_type_from_path(path: str) -> str | None:
"""Extracts model_type from a saved custom model directory's config.json."""
import json
from pathlib import Path as PathLib
config_path = PathLib(path) / "config.json"
if not config_path.exists():
return None
try:
with open(config_path, encoding="utf-8") as f:
config_data = json.load(f)
model_type = config_data.get("model_type")
if model_type and REGISTRY.contains(name=model_type, type=RegistryType.MODEL):
return model_type
except (json.JSONDecodeError, OSError):
pass
return None
[docs]
def is_custom_model(model_name: str) -> bool:
    """Tells whether `model_name` refers to a custom model in the Oumi registry.

    Args:
        model_name: Either a name registered in the Oumi model registry, or a
            path to a directory whose `config.json` names a registered model.

    Returns:
        False for an empty name; True if the name itself is registered or
        `get_custom_model_type_from_path` finds a registered model type;
        False otherwise.
    """
    if not model_name:
        return False
    registered_by_name = REGISTRY.contains(name=model_name, type=RegistryType.MODEL)
    if registered_by_name:
        return True
    # Fall back to treating the name as a saved-model directory.
    custom_type = get_custom_model_type_from_path(model_name)
    return custom_type is not None
[docs]
def find_internal_model_config_using_model_name(
    model_name: str, trust_remote_code: bool
) -> InternalModelConfig | None:
    """Looks up the internal model config for a model given its name.

    Args:
        model_name: The model name, either:
            - A HuggingFace model ID (e.g., "meta-llama/Llama-2-7b-hf")
            - A local path to a model directory
            - A custom model name registered in Oumi
        trust_remote_code: Whether to trust external code associated with the
            model. Forwarded to the HuggingFace config lookup.

    Returns:
        The `InternalModelConfig` registered for the model's type, or None if:
        - The model is a custom Oumi model (handled separately)
        - The model type is not in the supported models registry
    """
    # Custom models are not described by a HuggingFace config; skip the lookup.
    if is_custom_model(model_name):
        return None

    hf_config = find_model_hf_config(model_name, trust_remote_code=trust_remote_code)
    model_info = get_all_models_map().get(hf_config.model_type)
    if model_info is None:
        return None
    return model_info.config
[docs]
def find_internal_model_config(
    model_params: ModelParams,
) -> InternalModelConfig | None:
    """Looks up the internal model config for the model in `model_params`.

    Thin wrapper over `find_internal_model_config_using_model_name` that reads
    the model name and the `trust_remote_code` flag from `model_params`.

    Args:
        model_params: The model parameters.

    Returns:
        Model config, or `None` if model is not recognized.
    """
    name = model_params.model_name
    trust_remote_code = model_params.trust_remote_code
    return find_internal_model_config_using_model_name(name, trust_remote_code)