# Source code for oumi.core.configs.internal.internal_model_config
# Copyright 2025 - Oumi
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from dataclasses import dataclass, field
from enum import Enum
from typing import Any, NamedTuple
from oumi.core.configs.base_config import BaseConfig
from oumi.core.constants import LABEL_IGNORE_INDEX
class InternalPaddingSide(Enum):
    """Enum representing how to do padding for the model."""

    PAD_LEFT = "left"
    """Left padding."""

    PAD_RIGHT = "right"
    """Right padding."""
class InternalFeatureFirstDimAction(Enum):
    """Enum representing how to handle the first feature dimension in datasets."""

    DROP_ALWAYS = "drop_always"
    """The first dimension is commonly dummy (length: 1) and must be dropped.

    In effect, this operation is applied: `x = x[0, ...]`, which reduces
    `x`'s rank by 1 (e.g., 3D->2D), and discards the following elements: `x[1:, ...]`.
    """

    DROP_IF_DUMMY = "drop_if_dummy"
    """Drop the first dimension only if it's dummy (length: 1)."""

    KEEP = "keep"
    """Always preserve the first dimension."""
class InternalFeatureSpec(NamedTuple):
    """Immutable description of a single model input feature.

    Each spec records the feature's name together with flags describing
    whether it is required, whether its shape may vary, how its first
    dimension should be treated, and whether it depends on image data.
    """

    name: str
    """Feature name."""

    required: bool = False
    """Whether the feature must be always present (vs optional)."""

    variable_shape: bool = False
    """Whether the feature can be of variable shape.

    For example, `input_ids` is normally of variable length.
    """

    first_dim_action: InternalFeatureFirstDimAction = (
        InternalFeatureFirstDimAction.DROP_ALWAYS
    )
    """Action to apply to the first feature dimension."""

    image_dependent: bool = False
    """Whether the feature depends on image data.

    For example, `pixel_values`, `cross_attention_mask`.
    """
@dataclass
class InternalVisualModelConfig(BaseConfig):
    """Settings describing how a visual model consumes image features.

    Covers the name of the main image feature, whether image features may
    have variable shapes, and whether several images per prompt are supported.
    """

    main_image_feature: str = "pixel_values"
    """The key corresponding to the main image feature consumed by the model.

    E.g., raw pixels, transformed image patches, etc. resulting from data
    preprocessing and consumed by the underlying model."""

    variable_shape_image_features: bool = False
    """Whether image features can be of variable shape.

    In this case, the image features can be difficult to collate
    (`torch.stack()` requires compatible shapes) and some workaround
    is needed: either require `batch_size=1`, or group examples
    so that each mini-batch only contains same-sized features.
    """

    supports_multiple_images: bool = False
    """Whether the visual language model supports multiple images in one prompt."""
def _default_model_input_features_factory() -> dict[str, InternalFeatureSpec]:
    """Build the default mapping of model input feature specs.

    Returns:
        A new dict, keyed by feature name, holding specs for `input_ids`
        (required), `attention_mask` and `labels` (both optional). All three
        are declared with `variable_shape=True`. A fresh dict is created on
        every call, which makes this function usable as a dataclass
        `default_factory`.
    """
    default_specs: tuple[InternalFeatureSpec, ...] = (
        InternalFeatureSpec(name="input_ids", required=True, variable_shape=True),
        InternalFeatureSpec(name="attention_mask", required=False, variable_shape=True),
        InternalFeatureSpec(name="labels", required=False, variable_shape=True),
    )
    # Index by name; insertion order follows the declaration order above.
    return {spec.name: spec for spec in default_specs}
@dataclass
class InternalModelConfig(BaseConfig):
    """Internal, per-model-type configuration defaults.

    Groups the model type, chat template, tokenizer padding settings, input
    feature specs, label handling options, processor keyword arguments,
    features to ignore, and an optional visual-model configuration.
    """

    model_type: str = ""
    """Model type."""

    chat_template: str = "default"
    """Default chat template."""

    tokenizer_pad_token: str | None = None
    """The default padding token used by the tokenizer.

    If specified in internal model type config and unspecified
    in `ModelParams.tokenizer_pad_token`, then this value will be used.
    """

    padding_side: InternalPaddingSide | None = None
    """Padding side for the model."""

    model_input_features: dict[str, InternalFeatureSpec] = field(
        default_factory=_default_model_input_features_factory
    )
    """Model input features specs."""

    label_ignore_index: int | None = LABEL_IGNORE_INDEX
    """Special label value to be excluded from loss computation."""

    sanitize_negative_labels: bool = False
    """Replace negative label values.

    Some VLM processors can generate negative `input_ids` for image tokens,
    which can cause problems if a negative integer is used as a label
    to compute loss e.g., cross-entropy loss may expect [0, num_classes) range.
    """

    processor_kwargs: dict[str, Any] = field(default_factory=dict)
    """Extra params to pass to processor constructor."""

    ignore_features: list[str] = field(default_factory=list)
    """Features from processing the input to ignore in the model's forward method."""

    visual_config: InternalVisualModelConfig | None = None
    """Configuration specific to visual models."""