oumi.utils#

Submodules#

oumi.utils.batching module#

oumi.utils.batching.batch(dataset: list[T], batch_size: int) list[list[T]][source]#

Batches the provided dataset.

Parameters:
  • dataset – The dataset to batch, which is a flat list of items.

  • batch_size – The desired size of each batch.

Returns:

A list of batches. Each batch is a list of batch_size items, assuming that the dataset’s size is a multiple of batch_size. Otherwise, the last batch will contain fewer than batch_size items.

oumi.utils.batching.unbatch(dataset: list[list[T]]) list[T][source]#

Unbatches (flattens) the provided dataset.
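
Example (a minimal sketch; the values are illustrative):

from oumi.utils.batching import batch, unbatch

items = list(range(10))
batches = batch(items, batch_size=4)
# -> [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]; the last batch may be smaller.
assert unbatch(batches) == items  # flattening restores the original list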

oumi.utils.conversation_utils module#

oumi.utils.conversation_utils.base64encode_content_item_image_bytes(item: ContentItem, *, add_mime_prefix: bool = True) str[source]#

Creates base-64 encoded image bytes as an ASCII string value.

Parameters:
  • item – An input message content item of image type (one of IMAGE_BINARY, IMAGE_PATH, IMAGE_URL) with the pre-populated binary field.

  • add_mime_prefix – Whether to add MIME prefix data:image/png;base64,

Returns:

String containing base64 encoded image bytes <BASE64_VALUE>. If add_mime_prefix is True, then the following format is used: data:image/png;base64,<BASE64_VALUE>.
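
Example (a sketch only; the import path for ContentItem and Type, and the ContentItem constructor arguments, are assumptions to verify against your Oumi version):

from oumi.core.types import ContentItem, Type  # import path assumed
from oumi.utils.conversation_utils import (
    base64encode_content_item_image_bytes,
    load_image_bytes_to_content_item,
)

# Start from an image path item, load its bytes, then base64-encode them.
item = ContentItem(type=Type.IMAGE_PATH, content="photo.png")  # hypothetical file
item = load_image_bytes_to_content_item(item)  # populates the binary field
data_url = base64encode_content_item_image_bytes(item, add_mime_prefix=True)
# data_url looks like "data:image/png;base64,<BASE64_VALUE>"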

oumi.utils.conversation_utils.convert_content_items_to_json_list(content_items: list[ContentItem]) list[dict[str, Any]][source]#

Converts content items to a list of JSON dicts.

Parameters:

content_items – A list of content items.

Returns:

The list of all content items encoded as JSON dicts.

Return type:

list[Dict[str, Any]]

oumi.utils.conversation_utils.convert_message_content_item_to_json_dict(item: ContentItem) dict[str, Any][source]#

Returns the content for a message content item.

Parameters:

item – The message content item to get the content for.

Returns:

The content for the message.

Return type:

Dict[str, Any]

oumi.utils.conversation_utils.convert_message_to_json_content(message: Message) str | list[dict[str, Any]][source]#

Returns the message content.

Parameters:

message – The message to get the content for.

Returns:

The content for the message returned either as a single string, or as a list of content items.

oumi.utils.conversation_utils.convert_message_to_json_content_list(message: Message) list[dict[str, Any]][source]#

Returns the message content as a list of its content items encoded as JSON dicts.

Parameters:

message – The message to get the content for.

Returns:

The content for the message for all content items.

Return type:

list[Dict[str, Any]]

oumi.utils.conversation_utils.create_list_of_message_json_dicts(messages: list[Message], *, group_adjacent_same_role_turns: bool) list[dict[str, Any]][source]#

Returns a list of JSON dictionaries representing messages.

Loads image bytes and encodes them as base64.

Parameters:
  • messages – The input messages.

  • group_adjacent_same_role_turns – Whether to pack adjacent messages from the same role into a single element of the output list.

Returns:

The list of messages encoded as nested JSON dicts.

Return type:

list[Dict[str, Any]]
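
Example (a sketch, assuming Message and Role can be imported from oumi.core.types):

from oumi.core.types import Message, Role  # import path assumed
from oumi.utils.conversation_utils import create_list_of_message_json_dicts

messages = [
    Message(role=Role.USER, content="Describe this image."),
    Message(role=Role.USER, content="Focus on the foreground."),
]
# With grouping enabled, the two adjacent USER turns are packed into one dict.
json_dicts = create_list_of_message_json_dicts(
    messages, group_adjacent_same_role_turns=True
)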

oumi.utils.conversation_utils.load_image_bytes_to_content_item(item: ContentItem, mode: str = 'RGB') ContentItem[source]#

Ensures that message content item contains inline image bytes if it’s an image.

Loads image content if the image type is IMAGE_URL or IMAGE_PATH. Otherwise, returns the input content item without any changes.

Parameters:
  • item – An input message content item.

  • mode – The requested PIL image mode (e.g., "RGB").

Returns:

A content item guaranteed to be IMAGE_BINARY if an input content item was any of image types (IMAGE_URL, IMAGE_PATH, IMAGE_BINARY).

oumi.utils.conversation_utils.load_pil_image_from_content_item(image_item: ContentItem, mode: str = 'RGB') Image[source]#

Loads a PIL image from a message content item.

Parameters:
  • image_item – A message content item of an image type.

  • mode – The requested PIL image mode (e.g., "RGB").

Returns:

A PIL image.

Return type:

Image.Image

oumi.utils.conversation_utils.remove_excessive_images(messages: list[Message], *, max_images: int) list[Message][source]#

Returns a list of messages with excessive images dropped.

Parameters:
  • messages – The input messages.

  • max_images – The maximum number of images to keep. If the limit is exceeded, the first max_images images are retained, and the rest are discarded. If negative, all images are kept. If 0, all images are dropped.

Returns:

The list of messages with excessive images discarded.

Return type:

list[Message]

oumi.utils.conversation_utils.remove_excessive_images_from_conversation(conversation: Conversation, *, max_images: int) Conversation[source]#

Returns a conversation with excessive images dropped.

Parameters:
  • conversation – The input conversation.

  • max_images – The maximum number of images to keep. If the limit is exceeded, the first max_images images are retained, and the rest are discarded. If negative, all images are kept. If 0, all images are dropped.

Returns:

The conversation with excessive images discarded.

Return type:

Conversation

oumi.utils.conversation_utils.truncate_text_in_content_items(messages: list[Message], tokenizer: PreTrainedTokenizerBase, *, max_tokens: int, truncation_side: str = 'right') list[Message][source]#

Truncates text contents in Messages to max_length total tokens.

Note that we have to truncate plain text before applying the chat template, as the final processed prompt is generally unsafe to truncate at an arbitrary offset: doing so may break invariants (e.g., that the prompt contains N image tokens), leading to runtime errors in the processor.

Parameters:
  • messages – A list of messages.

  • tokenizer – The tokenizer used for encoding the data.

  • max_tokens – Maximum number of tokens to keep in all text pieces combined.

  • truncation_side – The side to truncate the tokens (“right” or “left”).

Returns:

A list of messages with potentially truncated text prompts. The returned list contains the same messages as the input list, except that the text content items may be truncated.
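
Example (a sketch; the tokenizer name and token budget are illustrative):

from transformers import AutoTokenizer

from oumi.core.types import Message, Role  # import path assumed
from oumi.utils.conversation_utils import truncate_text_in_content_items

tokenizer = AutoTokenizer.from_pretrained("gpt2")
messages = [Message(role=Role.USER, content="A very long prompt ... " * 200)]
# Keep at most 512 text tokens across all messages, dropping tokens on the left.
truncated = truncate_text_in_content_items(
    messages, tokenizer, max_tokens=512, truncation_side="left"
)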

oumi.utils.device_utils module#

class oumi.utils.device_utils.NVidiaGpuRuntimeInfo(device_index: int, device_count: int, used_memory_mb: float | None = None, temperature: int | None = None, fan_speed: int | None = None, fan_speeds: Sequence[int] | None = None, power_usage_watts: float | None = None, power_limit_watts: float | None = None, gpu_utilization: int | None = None, memory_utilization: int | None = None, performance_state: int | None = None, clock_speed_graphics: int | None = None, clock_speed_sm: int | None = None, clock_speed_memory: int | None = None)[source]#

Bases: NamedTuple

Contains misc NVIDIA GPU measurements and stats retrieved by pynvml.

The majority of fields are optional. You can control whether they are populated by setting the boolean query parameters of _get_nvidia_gpu_runtime_info_impl(...) such as memory, temperature, fan_speed, etc.

clock_speed_graphics: int | None#

Graphics clock speed (NVML_CLOCK_GRAPHICS) in MHz.

clock_speed_memory: int | None#

Memory clock speed (NVML_CLOCK_MEM) in MHz.

clock_speed_sm: int | None#

SM clock speed (NVML_CLOCK_SM) in MHz.

device_count: int#

Total number of GPU devices on this node.

device_index: int#

Zero-based device index.

fan_speed: int | None#

GPU fan speed in [0,100] range.

fan_speeds: Sequence[int] | None#

An array of GPU fan speeds.

The array’s length is equal to the number of fans per GPU (can be multiple). Speed values are in [0, 100] range.

gpu_utilization: int | None#

GPU compute utilization. Range: [0, 100].

memory_utilization: int | None#

GPU memory utilization. Range: [0, 100].

performance_state: int | None#

See nvmlPstates_t. Valid values are in [0,15] range, or 32 if unknown.

0 corresponds to Maximum Performance. 15 corresponds to Minimum Performance.

power_limit_watts: float | None#

GPU power limit in Watts.

power_usage_watts: float | None#

GPU power usage in Watts.

temperature: int | None#

GPU temperature in Celsius.

used_memory_mb: float | None#

Used GPU memory in MB.

oumi.utils.device_utils.get_nvidia_gpu_fan_speeds(device_index: int = 0) Sequence[int][source]#

Returns the current fan speeds for NVIDIA GPU device.

oumi.utils.device_utils.get_nvidia_gpu_memory_utilization(device_index: int = 0) float[source]#

Returns amount of memory being used on an Nvidia GPU in MiB.

oumi.utils.device_utils.get_nvidia_gpu_power_usage(device_index: int = 0) float[source]#

Returns the current power usage for NVIDIA GPU device.

oumi.utils.device_utils.get_nvidia_gpu_runtime_info(device_index: int = 0) NVidiaGpuRuntimeInfo | None[source]#

Returns runtime stats for Nvidia GPU.
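
Example (a minimal sketch; the function returns None when GPU stats cannot be retrieved, e.g., on machines without an NVIDIA GPU):

from oumi.utils.device_utils import get_nvidia_gpu_runtime_info

info = get_nvidia_gpu_runtime_info(device_index=0)
if info is not None:
    print(
        f"GPU {info.device_index}/{info.device_count}: "
        f"{info.used_memory_mb} MB used, {info.temperature} C, "
        f"{info.gpu_utilization}% utilization"
    )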

oumi.utils.device_utils.get_nvidia_gpu_temperature(device_index: int = 0) int[source]#

Returns the current temperature readings for the device, in degrees C.

oumi.utils.device_utils.log_nvidia_gpu_fan_speeds(device_index: int = 0, log_prefix: str = '') None[source]#

Prints the current NVIDIA GPU fan speeds.

oumi.utils.device_utils.log_nvidia_gpu_memory_utilization(device_index: int = 0, log_prefix: str = '') None[source]#

Prints amount of memory being used on an Nvidia GPU.

oumi.utils.device_utils.log_nvidia_gpu_power_usage(device_index: int = 0, log_prefix: str = '') None[source]#

Prints the current NVIDIA GPU power usage.

oumi.utils.device_utils.log_nvidia_gpu_runtime_info(device_index: int = 0, log_prefix: str = '') None[source]#

Prints the current NVIDIA GPU runtime info.

oumi.utils.device_utils.log_nvidia_gpu_temperature(device_index: int = 0, log_prefix: str = '') None[source]#

Prints the current temperature readings for the device, in degrees C.

oumi.utils.distributed_utils module#

oumi.utils.distributed_utils.is_using_accelerate() bool[source]#

Returns whether the current job was launched with the Accelerate launcher.

We do this by checking if the ACCELERATE_DYNAMO_* environment variables are set. These variables should always be set by Accelerate. We check for all of them in case Accelerate changes the environment variables in the future.

oumi.utils.distributed_utils.is_using_accelerate_fsdp() bool[source]#

Returns whether the current job is requesting Accelerate FSDP training.

oumi.utils.git_utils module#

oumi.utils.git_utils.get_git_revision_hash() str | None[source]#

Get the current git revision hash.

Returns:

The current git revision hash, or None if it cannot be retrieved.

Return type:

Optional[str]

oumi.utils.git_utils.get_git_root_dir() Path | None[source]#

Get the root directory of the current git repository.

Returns:

The root directory of the current git repository, or None if it cannot be retrieved.

Return type:

Optional[Path]

oumi.utils.git_utils.get_git_tag() str | None[source]#

Get the current git tag.

Returns:

The current git tag, or None if it cannot be retrieved.

Return type:

Optional[str]

oumi.utils.hf_utils module#

oumi.utils.hf_utils.find_hf_token() str | None[source]#

Attempts to find HuggingFace access token.

Returns:

A valid HF access token, or None if not found.

oumi.utils.hf_utils.get_hf_chat_template(tokenizer_name: str, *, trust_remote_code: bool = False) str | None[source]#

Returns chat template provided by HF for tokenizer_name.

oumi.utils.hf_utils.is_cached_to_disk_hf_dataset(dataset_folder: str | Path) bool[source]#

Detects whether a dataset was saved using dataset.save_to_disk().

Such datasets should be loaded using datasets.Dataset.load_from_disk().

Returns:

Whether the dataset was saved using dataset.save_to_disk() method.
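
Example (a sketch; the dataset folder path is hypothetical):

import datasets

from oumi.utils.hf_utils import is_cached_to_disk_hf_dataset

dataset_folder = "/data/my_dataset"  # hypothetical path
if is_cached_to_disk_hf_dataset(dataset_folder):
    # Saved via dataset.save_to_disk(), so load it the matching way.
    ds = datasets.Dataset.load_from_disk(dataset_folder)
else:
    ds = datasets.load_dataset(dataset_folder)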

oumi.utils.image_utils module#

oumi.utils.image_utils.convert_pil_image_mode(image: Image, *, mode: str | None) Image[source]#

Converts a PIL image to the requested mode (if it’s not in that mode already).

Parameters:
  • image – An input PIL image.

  • mode – The requested PIL image mode (e.g., "RGB").

Returns:

An image in the requested mode. If the input image was already in the correct mode, it is returned as-is for efficiency. Otherwise, a different image object is returned.

oumi.utils.image_utils.create_png_bytes_from_image(pil_image: Image) bytes[source]#

Encodes PIL image into PNG format, and returns PNG image bytes.

Parameters:

pil_image – An input image.

Returns:

PNG bytes representation of the image.

Return type:

bytes

oumi.utils.image_utils.create_png_bytes_from_image_bytes(image_bytes: bytes | None, mode: str = 'RGB') bytes[source]#

Loads an image from raw image bytes, and converts to PNG image bytes.

Parameters:
  • image_bytes – Raw bytes of the input image.

  • mode – The requested PIL image mode (e.g., "RGB").

Returns:

PNG bytes representation of the image.

Return type:

bytes

oumi.utils.image_utils.create_png_bytes_from_image_list(pil_images: list[Image]) list[bytes][source]#

Encodes PIL images into PNG format, and returns PNG image bytes.

Parameters:

pil_images – A list of input images.

Returns:

A list of PNG-encoded images.

oumi.utils.image_utils.load_image_png_bytes_from_path(input_image_filepath: str | Path, mode: str = 'RGB') bytes[source]#

Loads an image from a path, converts it to PNG, and returns image bytes.

Parameters:
  • input_image_filepath – Path to the input image file.

  • mode – The requested PIL image mode (e.g., "RGB").

Returns:

PNG bytes representation of the image.

Return type:

bytes

oumi.utils.image_utils.load_image_png_bytes_from_url(input_image_url: str, mode: str = 'RGB') bytes[source]#

Loads an image from a URL, converts it to PNG, and returns image bytes.

Parameters:
  • input_image_url – URL of the input image.

  • mode – The requested PIL image mode (e.g., "RGB").

Returns:

PNG bytes representation of the image.

Return type:

bytes

oumi.utils.image_utils.load_pdf_pages_from_bytes(pdf_bytes: bytes | None, *, dpi: int = 200, mode: str = 'RGB') list[Image][source]#

Loads PDF pages as PIL images from raw PDF file bytes.

Parameters:
  • pdf_bytes – Raw bytes of the input PDF file.

  • dpi – Resolution, in dots per inch, at which to render the PDF pages.

  • mode – The requested PIL image mode (e.g., "RGB").

Returns:

PDF pages as PIL images (PIL.Image.Image).

oumi.utils.image_utils.load_pdf_pages_from_path(input_pdf_filepath: str | Path, *, dpi: int = 200, mode: str = 'RGB') list[Image][source]#

Loads PDF pages as PIL images from a path.

Parameters:
  • input_pdf_filepath – Path to the input PDF file.

  • dpi – Resolution, in dots per inch, at which to render the PDF pages.

  • mode – The requested PIL image mode (e.g., "RGB").

Returns:

PDF pages as PIL images (PIL.Image.Image).

oumi.utils.image_utils.load_pdf_pages_from_url(pdf_url: str, *, dpi: int = 200, mode: str = 'RGB') list[Image][source]#

Loads PDF pages as PIL images from a PDF URL.

Parameters:
  • pdf_url – URL of the input PDF file.

  • dpi – Resolution, in dots per inch, at which to render the PDF pages.

  • mode – The requested PIL image mode (e.g., "RGB").

Returns:

PDF pages as PIL images (PIL.Image.Image).

oumi.utils.image_utils.load_pil_image_from_bytes(image_bytes: bytes | None, mode: str = 'RGB') Image[source]#

Loads an image from raw image bytes.

Parameters:
  • image_bytes – Raw bytes of the input image.

  • mode – The requested PIL image mode (e.g., "RGB").

Returns:

PIL representation of the image.

Return type:

PIL.Image.Image

oumi.utils.image_utils.load_pil_image_from_path(input_image_filepath: str | Path, mode: str = 'RGB') Image[source]#

Loads an image from a path.

Parameters:
  • input_image_filepath – Path to the input image file.

  • mode – The requested PIL image mode (e.g., "RGB").

Returns:

A PIL representation of the image.

Return type:

PIL.Image.Image

oumi.utils.image_utils.load_pil_image_from_url(input_image_url: str, mode: str = 'RGB') Image[source]#

Loads a PIL image from a URL.

Parameters:
  • input_image_url – URL of the input image.

  • mode – The requested PIL image mode (e.g., "RGB").

Returns:

A PIL representation of the image.

Return type:

PIL.Image.Image
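
Example combining a few of the loaders above (paths and URLs are illustrative):

from oumi.utils.image_utils import (
    create_png_bytes_from_image,
    load_image_png_bytes_from_url,
    load_pil_image_from_path,
)

# Load an image from disk as a PIL image, then re-encode it as PNG bytes.
pil_image = load_pil_image_from_path("photo.jpg", mode="RGB")
png_bytes = create_png_bytes_from_image(pil_image)

# Or fetch an image from a URL directly as PNG bytes.
png_from_url = load_image_png_bytes_from_url("https://example.com/cat.jpg")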

oumi.utils.io_utils module#

oumi.utils.io_utils.get_oumi_root_directory() Path[source]#

Get the root directory of the Oumi project.

Returns:

The absolute path to the Oumi project’s root directory.

Return type:

Path

oumi.utils.io_utils.load_file(filename: str | Path, encoding: str = 'utf-8') str[source]#

Load a file as a string.

Parameters:
  • filename – Path to the file.

  • encoding – Encoding to use when reading the file. Defaults to “utf-8”.

Returns:

The content of the file.

Return type:

str

Raises:

FileNotFoundError – If the file doesn’t exist.

oumi.utils.io_utils.load_json(filename: str | Path) Any[source]#

Load JSON data from a file.

Parameters:

filename – Path to the JSON file.

Returns:

Parsed JSON data.

Return type:

dict

Raises:
  • FileNotFoundError – If the file doesn’t exist.

  • json.JSONDecodeError – If the file contains invalid JSON.

oumi.utils.io_utils.load_jsonlines(filename: str | Path) list[dict[str, Any]][source]#

Load a jsonlines file.

Parameters:

filename – Path to the jsonlines file.

Returns:

A list of dictionaries, each representing a JSON object from the file.

Return type:

List[Dict[str, Any]]

Raises:
  • FileNotFoundError – If the file doesn’t exist.

  • jsonlines.InvalidLineError – If the file contains invalid JSON.

oumi.utils.io_utils.save_json(data: dict[str, Any], filename: str | Path, indent: int = 2) None[source]#

Save data as a formatted JSON file.

Parameters:
  • data – The data to be saved as JSON.

  • filename – Path where the JSON file will be saved.

  • indent – Number of spaces for indentation. Defaults to 2.

Raises:

TypeError – If the data is not JSON serializable.

oumi.utils.io_utils.save_jsonlines(filename: str | Path, data: list[dict[str, Any]]) None[source]#

Save a list of dictionaries to a jsonlines file.

Parameters:
  • filename – Path to the jsonlines file to be created or overwritten.

  • data – A list of dictionaries to be saved as JSON objects.

Raises:

IOError – If there’s an error writing to the file.
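
Example round-trip (a minimal sketch; filenames are illustrative). Note that save_jsonlines takes the filename first, while save_json takes the data first:

from oumi.utils.io_utils import load_json, load_jsonlines, save_json, save_jsonlines

records = [{"prompt": "Hi", "response": "Hello!"}, {"prompt": "Bye", "response": "See you."}]
save_jsonlines("records.jsonl", records)
assert load_jsonlines("records.jsonl") == records

save_json({"run": 1, "lr": 1e-4}, "config.json", indent=2)
config = load_json("config.json")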

oumi.utils.logging module#

oumi.utils.logging.configure_dependency_warnings(level: str | int = 'info') None[source]#

Ignores non-critical warnings from dependencies, unless in debug mode.

Parameters:

level (str, optional) – The log level to set for the logger. Defaults to “info”.

oumi.utils.logging.configure_logger(name: str, level: str = 'info', log_dir: str | Path | None = None) None[source]#

Configures a logger with the specified name and log level.

oumi.utils.logging.get_logger(name: str, level: str = 'info', log_dir: str | Path | None = None) Logger[source]#

Gets a logger instance with the specified name and log level.

Parameters:
  • name – The name of the logger.

  • level (optional) – The log level to set for the logger. Defaults to “info”.

  • log_dir (optional) – Directory to store log files. Defaults to None.

Returns:

The logger instance.

Return type:

logging.Logger

oumi.utils.logging.update_logger_level(name: str, level: str = 'info') None[source]#

Updates the log level of the logger.

Parameters:
  • name (str) – The name of the logger to update.

  • level (str, optional) – The log level to set for the logger. Defaults to “info”.
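
Example (a sketch; the logger name and directory are illustrative):

from oumi.utils.logging import get_logger, update_logger_level

logger = get_logger("my_experiment", level="info", log_dir="./logs")
logger.info("Starting run...")

# Later, make the same logger more verbose.
update_logger_level("my_experiment", level="debug")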

oumi.utils.model_caching module#

oumi.utils.model_caching.get_local_filepath_for_gguf(repo_id: str, filename: str, cache_dir='.cache/huggingface') str[source]#

Return a local path for the provided GGUF file, downloading it if necessary.

Parameters:
  • repo_id – HuggingFace Hub repo ID (e.g., bartowski/Llama-3.2-3B-Instruct-GGUF)

  • filename – HuggingFace Hub filename (e.g., Llama-3.2-3B-Instruct-Q8_0.gguf)

  • cache_dir – Local path to cached models. Defaults to HUGGINGFACE_CACHE.

Returns:

A local path caching the GGUF file.

oumi.utils.packaging module#

class oumi.utils.packaging.PackagePrerequisites(package_name, min_package_version, max_package_version)#

Bases: tuple

max_package_version#

Alias for field number 2

min_package_version#

Alias for field number 1

package_name#

Alias for field number 0

oumi.utils.packaging.check_package_prerequisites(package_prerequisites: list[PackagePrerequisites], runtime_error_prefix: str = 'The current run cannot be launched because the platform prerequisites are not satisfied. In order to proceed, the following package(s) must be installed and have the correct version:\n', runtime_error_suffix: str = '') None[source]#

Checks if the package prerequisites are satisfied and raises an error if not.
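
Example (a sketch; the package names and version bounds are illustrative, and the exact version-comparison semantics should be checked against the implementation):

from oumi.utils.packaging import PackagePrerequisites, check_package_prerequisites

prerequisites = [
    PackagePrerequisites("torch", "2.0.0", "3.0.0"),
    PackagePrerequisites("transformers", "4.40.0", "5.0.0"),
]
# Raises an error with the configured prefix/suffix if any prerequisite is unmet.
check_package_prerequisites(prerequisites)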

oumi.utils.peft_utils module#

oumi.utils.peft_utils.get_lora_rank(adapter_dir: str | Path) int[source]#

Gets the LoRA rank for a saved adapter model.

Example config: huggingface/peft

Parameters:

adapter_dir – The directory containing the adapter model.

Returns:

The LoRA rank.

Return type:

int

Raises:

ValueError – If the LoRA rank is not found in the adapter config or isn’t an int.

oumi.utils.saver module#

oumi.utils.saver.load_infer_prob(input_filepath: str) list[list[list[float]]][source]#

Retrieve batched probabilities from a parquet file.

oumi.utils.saver.load_infer_prob_csv(input_filepath: str) list[list[list[float]]][source]#

Retrieve batched probabilities from a csv file.

oumi.utils.saver.save_infer_prob(output_filepath: str, probabilities: list[list[list[float]]])[source]#

Save batched probabilities into a parquet file.

oumi.utils.saver.save_infer_prob_csv(output_filepath: str, probabilities: list[list[list[float]]])[source]#

Save batched probabilities into a csv file.

oumi.utils.saver.str_to_float_list(input: str) list[float][source]#

Convert an str representing a list of floats to an actual list of floats.

Example: input: "[1.1, 2.2, 3.3]" => output: [1.1, 2.2, 3.3]
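
Example (a sketch; the nesting of batches, examples, and per-class probabilities is assumed, and the file path is illustrative):

from oumi.utils.saver import load_infer_prob, save_infer_prob

# Assumed nesting: [batch][example][class probability].
probabilities = [
    [[0.7, 0.3], [0.2, 0.8]],  # batch 0
    [[0.5, 0.5]],              # batch 1
]
save_infer_prob("probs.parquet", probabilities)
loaded = load_infer_prob("probs.parquet")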

oumi.utils.serialization_utils module#

class oumi.utils.serialization_utils.TorchJsonEncoder(*, skipkeys=False, ensure_ascii=True, check_circular=True, allow_nan=True, sort_keys=False, indent=None, separators=None, default=None)[source]#

Bases: JSONEncoder

default(obj)[source]#

Extends Python’s JSONEncoder to serialize torch dtypes.

oumi.utils.serialization_utils.convert_all_keys_to_serializable_types(dictionary: dict) None[source]#

Converts all keys in a hierarchical dictionary to serializable types.

oumi.utils.serialization_utils.json_serializer(obj: Any) str[source]#

Serializes a Python obj to a JSON formatted string.
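
Example (a minimal sketch showing both entry points):

import json

import torch

from oumi.utils.serialization_utils import TorchJsonEncoder, json_serializer

config = {"dtype": torch.float16, "lr": 1e-4}
# TorchJsonEncoder extends json.JSONEncoder, so it plugs into json.dumps.
as_json = json.dumps(config, cls=TorchJsonEncoder)
# json_serializer returns a JSON-formatted string for a Python object.
as_json_2 = json_serializer(config)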

oumi.utils.str_utils module#

oumi.utils.str_utils.compute_utf8_len(s: str) int[source]#

Computes string length in UTF-8 bytes.

oumi.utils.str_utils.get_editable_install_override_env_var() bool[source]#

Returns whether OUMI_FORCE_EDITABLE_INSTALL env var is set to a truthy value.

oumi.utils.str_utils.sanitize_run_name(run_name: str | None) str | None[source]#

Computes a sanitized version of wandb run name.

A valid run name may only contain alphanumeric characters, dashes, underscores, and dots, with length not exceeding max limit.

Parameters:

run_name – The original raw value of run name.

oumi.utils.str_utils.set_oumi_install_editable(setup: str) str[source]#

Tries to replace oumi PyPi installs with editable installation from source.

For example, the following line:

pip install uv && uv pip -q install oumi[gpu,dev] vllm

will be replaced with:

pip install uv && uv pip -q install -e '.[gpu,dev]' vllm

Parameters:

setup (str) – The bash setup script to modify. May be multi-line.

Returns:

The modified setup script.

oumi.utils.str_utils.str_to_bool(s: str) bool[source]#

Convert a string representation to a boolean value.

This function interprets various string inputs as boolean values. It is case-insensitive and recognizes common boolean representations.

Parameters:

s – The string to convert to a boolean.

Returns:

The boolean interpretation of the input string.

Return type:

bool

Raises:

ValueError – If the input string cannot be interpreted as a boolean.

Examples

>>> str_to_bool("true")
True
>>> str_to_bool("FALSE")
False
>>> str_to_bool("1")
True
>>> str_to_bool("no")
False
oumi.utils.str_utils.truncate_text_pieces_to_max_tokens_limit(text_pieces: list[str], tokenizer: PreTrainedTokenizerBase, *, max_tokens: int, truncation_side: str = 'right') list[str][source]#

Truncates text pieces to total length not exceeding max_length.

Parameters:
  • text_pieces – A list of text prompts.

  • tokenizer – The tokenizer used for encoding the data.

  • max_tokens – Maximum number of tokens to keep in all text pieces combined.

  • truncation_side – The side to truncate the tokens (“right” or “left”).

Returns:

A list of truncated text prompts.

oumi.utils.str_utils.truncate_to_max_tokens_limit(text: str, tokenizer: PreTrainedTokenizerBase, *, max_tokens: int, truncation_side: str = 'right') tuple[str, int][source]#

Truncates text to max_length in tokens.

Parameters:
  • text – A text prompt.

  • tokenizer – The tokenizer used for encoding the data.

  • max_tokens – Maximum number of tokens to keep.

  • truncation_side – The side to truncate the tokens (“right” or “left”).

Returns:

A tuple containing truncated text prompt and the number of tokens.

oumi.utils.str_utils.try_str_to_bool(s: str) bool | None[source]#

Attempts to convert a string representation to a boolean value.

This function interprets various string inputs as boolean values. It is case-insensitive and recognizes common boolean representations.

Parameters:

s – The string to convert to a boolean.

Returns:

The boolean interpretation of the input string, or None for unrecognized string values.

Return type:

Optional[bool]

Examples

>>> try_str_to_bool("true")
True
>>> try_str_to_bool("FALSE")
False
>>> try_str_to_bool("1")
True
>>> try_str_to_bool("no")
False
>>> try_str_to_bool("peach")
None

oumi.utils.torch_naming_heuristics module#

Utility functions which use detect-by-name heuristics.

# TODO(OPE-303): These should be replaced with something more robust.

oumi.utils.torch_naming_heuristics.disable_dropout(hf_config: PretrainedConfig) None[source]#

Detects dropout probabilities in config and sets them to 0.0.

This essentially removes the dropout layer, which can aid the compiled model’s speed. Dropout is normally not used for LLM training, and also hinders the effectiveness of model compilation. We assume any attribute with “drop” in the name and a float value is a dropout param. For example, this includes attn_pdrop and summary_first_dropout for GPT2.

Parameters:

hf_config – The HuggingFace model config.

oumi.utils.torch_naming_heuristics.group_trainable_params(model: Module, weight_decay: float) list[dict[str, Any]][source]#

Groups trainable params by weight decay for optimization.

As a rule of thumb, we generally want to weight decay all 2d matrices, i.e. weight tensors for matmuls/embeddings, and not biases/layernorms.

Parameters:
  • model – The model whose parameters will be optimized.

  • weight_decay – The weight decay to apply to the appropriate parameters.

Returns:

A list containing two dictionaries: the first with parameters that should be weight decayed and the second with parameters that shouldn’t.

Return type:

List[Dict[str, Any]]
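
Example (a sketch; the toy model and hyperparameters are illustrative):

import torch

from oumi.utils.torch_naming_heuristics import group_trainable_params

model = torch.nn.Linear(16, 4)  # toy model
# Two parameter groups: 2D weight matrices get weight decay, biases do not.
param_groups = group_trainable_params(model, weight_decay=0.01)
optimizer = torch.optim.AdamW(param_groups, lr=1e-4)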

oumi.utils.torch_naming_heuristics.guess_transformer_layer_cls(model: Module) type[Module][source]#

Guess the transformer layer class based on the model architecture.

oumi.utils.torch_naming_heuristics.resolve_transformer_layer_cls_string_as_module_set(class_names: str) set[type[Module]][source]#

Get a module class from its string name.

oumi.utils.torch_naming_heuristics.simplify_transformer_layer_cls_string(class_names: str) str[source]#

Replaces fully-qualified class names with pure class names.

For example, converts ‘foo.Block,foo.util.Decoder’ to ‘Block,Decoder’.

The accelerate library expects the simplified format, while OUMI trainer requires fully-qualified class names.

oumi.utils.torch_utils module#

class oumi.utils.torch_utils.ModelParameterCount(all_params: int, trainable_params: int, embedding_params: int)[source]#

Bases: object

__post_init__()[source]#

Ensure that the parameters are valid.

all_params: int#

embedding_params: int#

property frozen_params_percent: float#

Percentage of frozen parameters [0.0, 100.0].

trainable_params: int#

property trainable_params_percent: float#

Percentage of trainable parameters [0.0, 100.0].

oumi.utils.torch_utils.coerce_model_to_dtype(model: Module, dtype: dtype) None[source]#

Coerces the model to the desired dtype.

This is needed as a temporary workaround to support QLoRA FSDP training. See: huggingface/accelerate#1620

oumi.utils.torch_utils.convert_to_list_of_tensors(values: list[T]) list[Tensor][source]#

Converts a list of array-like objects into a list of torch tensors.

oumi.utils.torch_utils.count_model_parameters(model: Module) ModelParameterCount[source]#

Creates a basic counter of the parameters in a neural model.

Parameters:

model – The torch-implemented neural network.

Returns:

A ModelParameterCount for the underlying model.

Return type:

ModelParameterCount

oumi.utils.torch_utils.create_model_summary(model: Any) str[source]#

Creates a model summary as a free-formed string.

oumi.utils.torch_utils.create_ones_like(values: T) T[source]#

Converts an array-like object into an object of the same type filled with ones.

Supports nested lists, in which case all elements must be of the same type.

oumi.utils.torch_utils.device_cleanup() None[source]#

Empties the GPU cache; good to do before and after training for cleanup.

oumi.utils.torch_utils.estimate_sample_dict_size_in_bytes(sample: dict[str, Any]) int[source]#

Estimates the approximate total number of bytes in a provided sample.

The training sample is expected to be a dictionary whose values are lists, tensors, or numpy arrays.

The function works on a best-effort basis, i.e., 100% accuracy isn’t guaranteed. The implementation is slow and shouldn’t be called in performance-sensitive code.

oumi.utils.torch_utils.format_cudnn_version(v: int | None) str[source]#

Formats the cuDNN version number.

Parameters:

v – The cuDNN version number.

Returns:

A formatted string.

oumi.utils.torch_utils.freeze_model_layers(model: Module, freeze_layers: list[str]) int[source]#

Recursively freezes model layers.

Parameters:
  • model – A model to freeze layers in.

  • freeze_layers – A list of layer names to freeze. Nested layers can be specified using a dot (‘.’) separator. For example, “visual.child.grandchild”. Layer names not found in the model are ignored.

Returns:

The total number of layers successfully frozen.

oumi.utils.torch_utils.get_device_name() str[source]#

Returns the name of the device, assuming all are identical.

oumi.utils.torch_utils.get_dtype_size_in_bytes(dtype: str | dtype | dtype[Any] | None | type[Any] | _SupportsDType[dtype[Any]] | tuple[Any, int] | tuple[Any, SupportsIndex | Sequence[SupportsIndex]] | list[Any] | _DTypeDict | tuple[Any, Any]) int[source]#

Returns size of this dtype in bytes.

oumi.utils.torch_utils.get_first_dim_len(x: Any) int[source]#

Returns length of the first dimension.

oumi.utils.torch_utils.get_shape_as_list(x: Any) list[int][source]#

Returns shape of an object (tensor or numpy array) as Python list.

oumi.utils.torch_utils.get_torch_dtype(torch_dtype_str: str) dtype[source]#

Converts string dtype to torch.dtype.

oumi.utils.torch_utils.limit_per_process_memory(percent: float = 0.95) None[source]#

Limits process memory by a certain percentage.

On Windows and WSL, there’s a pool of ‘shared gpu memory’. This pool is using the RAM (slow) on one’s machine rather than actual VRAM (fast). Setting this value ensures your machine never uses the slow memory and OOMs instead. Note that this may not be needed on Linux machines since this is an OS-level feature.

oumi.utils.torch_utils.log_devices_info(filepath: Path | None = None) None[source]#

Logs high-level info about all available accelerator devices.

oumi.utils.torch_utils.log_model_summary(model, filepath: Path | None = None) None[source]#

Logs a model summary.

oumi.utils.torch_utils.log_number_of_model_parameters(model: Module, use_icons: bool = True) None[source]#

Logs the number of parameters of the model.

Parameters:
  • model – The torch-implemented neural network.

  • use_icons – Whether to display emojis/icons in the log output.

oumi.utils.torch_utils.log_peak_gpu_memory()[source]#

Log the peak GPU memory usage.

oumi.utils.torch_utils.log_versioning_info() None[source]#

Logs misc versioning information.

oumi.utils.torch_utils.pad_sequences(sequences: list[T], *, padding_value: float = 0, padding_side: str | None = None) Tensor[source]#

Pads a list of variable-length tensors to a single tensor.

Parameters:
  • sequences – list of variable length sequences.

  • padding_value – value for padded elements. Default: 0.

  • padding_side – side to apply padding to. Valid values: ‘right’, ‘left’. If unspecified (None), defaults to right.

Returns:

A tensor with shape (B, L, …), where B is a batch size (len(sequences)), L is the longest length (max(len(sequences[i])))

oumi.utils.torch_utils.pad_sequences_left_side(sequences: list[T], *, padding_value: float = 0) Tensor[source]#

Pads a list of variable-length tensors to a single tensor.

Prepends padding_value to the left side of each sequence to expand to the longest length.

Parameters:
  • sequences – list of variable length sequences.

  • padding_value – value for padded elements. Default: 0.

Returns:

A tensor with shape (B, L, …), where B is a batch size (len(sequences)), L is the longest length (max(len(sequences[i])))

oumi.utils.torch_utils.pad_sequences_right_side(sequences: list[T], *, padding_value: float = 0) Tensor[source]#

Pads a list of variable-length tensors to a single tensor.

Appends padding_value to the right side of each sequence to expand to the longest length.

Parameters:
  • sequences – list of variable length sequences.

  • padding_value – value for padded elements. Default: 0.

Returns:

A tensor with shape (B, L, …), where B is a batch size (len(sequences)), L is the longest length (max(len(sequences[i])))

oumi.utils.torch_utils.pad_to_max_dim_and_stack(tensors_list: list[T], *, max_variable_sized_dims: int = -1, padding_value: float = 0, padding_side: str | None = None) Tensor[source]#

Stacks variable-length tensors to a single tensor with dimension expansion.

Some examples:

  1. Two tensors with shapes [24,8], [32,8] are combined to [2,32,8].

  2. Two tensors with shapes [24,1,8], [32,4,8] are combined to [2,32,4,8].

  3. Three tensors with shapes [7,3,5], [8,2,6], [9,1,7] are combined to [3,9,3,7].

For 1D input tensors, the function is equivalent to pad_sequences().

If all tensors have the same shape and no padding is required, then the function is equivalent to torch.stack().

Parameters:
  • tensors_list – A list of tensors, potentially with different sizes along some dimensions.

  • max_variable_sized_dims – Maximum number of variable-sized dimensions. Negative values mean unlimited. If you know that your tensors have a pre-defined number N of variable-sized dimensions (e.g., 1 for sequence_length), then it’s a good idea to set this parameter to catch abnormal inputs (a ValueError will be raised in such cases).

  • padding_value – value for padded elements. Default: 0.

  • padding_side – side to apply padding to. Valid values: ‘right’, ‘left’. If unspecified (None), defaults to right.

Returns:

A tensor with shape (B, L, …), where B is a batch size (len(sequences)), L is the longest length (max(len(sequences[i])))
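
Example (a sketch illustrating both padding helpers):

import torch

from oumi.utils.torch_utils import pad_sequences, pad_to_max_dim_and_stack

# Variable-length 1D sequences -> a (B, L) batch, padded on the left.
seqs = [torch.tensor([1, 2, 3]), torch.tensor([4, 5])]
batch = pad_sequences(seqs, padding_value=0, padding_side="left")
# tensor([[1, 2, 3],
#         [0, 4, 5]])

# Variable-sized 2D tensors -> a (B, L, D) batch with dimension expansion.
feats = [torch.zeros(24, 8), torch.zeros(32, 8)]
stacked = pad_to_max_dim_and_stack(feats, max_variable_sized_dims=1)
assert stacked.shape == (2, 32, 8)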

oumi.utils.version_utils module#

oumi.utils.version_utils.get_python_package_versions() dict[str, str][source]#

Returns a dictionary of the installed package names and their versions.

oumi.utils.version_utils.is_dev_build() bool[source]#

Checks if the current version of Oumi is a development build.