oumi.core.processors#

Submodules#

oumi.core.processors.base_image_processor module#

class oumi.core.processors.base_image_processor.BaseImageProcessor[source]#

Bases: ABC

Base class for oumi image processors.

abstract __call__(*, images: list[Image], return_tensors: str | None = 'pt') → BatchFeature[source]#

Extracts image features.

Parameters:
  • images – A list of input images.

  • return_tensors – The format of returned tensors.

Returns:

The model-specific input features.

Return type:

transformers.BatchFeature

oumi.core.processors.base_processor module#

class oumi.core.processors.base_processor.BaseProcessor[source]#

Bases: ABC

Base class for oumi processors.

The high-level purpose of a processor is to generate model-specific input features from input data such as text, images, conversations, etc.

abstract __call__(*, text: list[str], padding: bool, images: list[Image] | None = None, return_tensors: str | None = 'pt') → BatchEncoding[source]#

Invokes the processor to extract features.

Parameters:
  • text – A list of text prompts.

  • padding – Whether to pad sequences to a common length.

  • images – A list of input images.

  • return_tensors – The format of returned tensors.

Returns:

The model-specific input features.

Return type:

transformers.BatchEncoding

abstract apply_chat_template(conversation: list[Message], add_generation_prompt: bool = False) → str[source]#

Applies a chat template.

Parameters:
  • conversation – A list of messages (conversation “turns”).

  • add_generation_prompt – Whether to append a generation prompt to the output.

Returns:

A text prompt, which includes all input messages formatted into a string.

abstract property chat_template: str#

Returns a chat template.

abstract property image_processor: BaseImageProcessor | None#

Returns an image processor.

abstract property image_token: str | None#

Returns an image token.

abstract property image_token_id: int | None#

Returns an image token id.

abstract property label_ignore_index: int | None#

Returns a label ignore index.

abstract property processor_name: str#

Returns a processor name.

abstract save_config(output_dir: Path | str) → None[source]#

Saves the processor config to the output_dir directory.

abstract property tokenizer: PreTrainedTokenizerBase#

Returns a tokenizer associated with this processor.

oumi.core.processors.default_image_processor module#

class oumi.core.processors.default_image_processor.DefaultImageProcessor(worker_processor: Any)[source]#

Bases: BaseImageProcessor

Default implementation of an image processor that wraps a callable object.

__call__(*, images: list[Image], return_tensors: str | None = 'pt') → BatchFeature[source]#

Extracts image features.

Parameters:
  • images – A list of input images.

  • return_tensors – The format of returned tensors.

Returns:

The model-specific input features.

Return type:

transformers.BatchFeature

oumi.core.processors.default_processor module#

class oumi.core.processors.default_processor.DefaultProcessor(processor_name: str, worker_processor: Any, tokenizer: PreTrainedTokenizerBase, *, label_ignore_index: int | None)[source]#

Bases: BaseProcessor

Default implementation of a processor that wraps a worker processor.

Validates that the worker processor conforms to the basic required invariants.

__call__(*, text: list[str], padding: bool, images: list[Image] | None = None, return_tensors: str | None = 'pt') → BatchEncoding[source]#

Invokes the processor to extract features.

Parameters:
  • text – A list of text prompts.

  • padding – Whether to pad sequences to a common length.

  • images – A list of input images.

  • return_tensors – The format of returned tensors.

Returns:

The model-specific input features.

Return type:

transformers.BatchEncoding

apply_chat_template(conversation: list[Message], add_generation_prompt: bool = False) → str[source]#

Applies a chat template.

Parameters:
  • conversation – A list of messages (conversation “turns”).

  • add_generation_prompt – Whether to append a generation prompt to the output.

Returns:

A text prompt, which includes all input messages formatted into a string.

property chat_template: str#

Returns a chat template.

property image_processor: BaseImageProcessor | None#

Returns an image processor.

property image_token: str | None#

Returns an image token.

property image_token_id: int | None#

Returns an image token id.

property label_ignore_index: int | None#

Returns a label ignore index.

property processor_name: str#

Returns a processor name.

save_config(output_dir: Path | str) → None[source]#

Saves the processor config to the output_dir directory.

property tokenizer: PreTrainedTokenizerBase#

Returns a tokenizer associated with this processor.