oumi.datasets.vision_language#

Vision-Language datasets module.

class oumi.datasets.vision_language.COCOCaptionsDataset(*, tokenizer: PreTrainedTokenizerBase | None = None, processor: BaseProcessor | None = None, processor_name: str | None = None, limit: int | None = None, trust_remote_code: bool = False, **kwargs)[source]#

Bases: VisionLanguageSftDataset

Dataset class for the HuggingFaceM4/COCO dataset.

dataset_name: str#
default_dataset: str | None = 'HuggingFaceM4/COCO'#
default_prompt = 'Describe this image:'#
transform_conversation(example: dict) → Conversation[source]#

Transform a single conversation example into a Conversation object.

trust_remote_code: bool#
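
A minimal construction sketch for this loader; the tokenizer and processor names below are illustrative (they mirror the VLJsonlinesDataset example later on this page), not requirements of the class:

>>> from oumi.builders import build_tokenizer
>>> from oumi.core.configs import ModelParams
>>> from oumi.datasets.vision_language import COCOCaptionsDataset
>>> tokenizer = build_tokenizer(
...     ModelParams(model_name="Qwen/Qwen2-1.5B-Instruct")
... )
>>> dataset = COCOCaptionsDataset(
...     tokenizer=tokenizer,
...     processor_name="openai/clip-vit-base-patch32",
...     limit=32,  # optional: cap the number of examples for a quick smoke test
... )
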
class oumi.datasets.vision_language.Flickr30kDataset(*, tokenizer: PreTrainedTokenizerBase | None = None, processor: BaseProcessor | None = None, processor_name: str | None = None, limit: int | None = None, trust_remote_code: bool = False, **kwargs)[source]#

Bases: VisionLanguageSftDataset

Dataset class for the nlphuji/flickr30k dataset.

dataset_name: str#
default_dataset: str | None = 'nlphuji/flickr30k'#
transform_conversation(example: dict) → Conversation[source]#

Transform a single conversation example into a Conversation object.

trust_remote_code: bool#
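
Construction follows the same pattern as COCOCaptionsDataset above; a brief sketch with illustrative names. The constructor accepts trust_remote_code, which some Hub datasets require; check the dataset card before enabling it:

>>> from oumi.datasets.vision_language import Flickr30kDataset
>>> dataset = Flickr30kDataset(
...     tokenizer=tokenizer,  # built as in the COCOCaptionsDataset sketch
...     processor_name="openai/clip-vit-base-patch32",
...     trust_remote_code=True,  # only if the dataset card calls for it
... )
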
class oumi.datasets.vision_language.LlavaInstructMixVsftDataset(*, tokenizer: PreTrainedTokenizerBase | None = None, processor: BaseProcessor | None = None, processor_name: str | None = None, limit: int | None = None, trust_remote_code: bool = False, **kwargs)[source]#

Bases: VisionLanguageSftDataset

Dataset class for the HuggingFaceH4/llava-instruct-mix-vsft dataset.

dataset_name: str#
default_dataset: str | None = 'HuggingFaceH4/llava-instruct-mix-vsft'#
transform_conversation(example: dict) → Conversation[source]#

Transform a dataset example into a Conversation object.

trust_remote_code: bool#
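
The same construction pattern applies here; a minimal sketch with illustrative names:

>>> from oumi.datasets.vision_language import LlavaInstructMixVsftDataset
>>> dataset = LlavaInstructMixVsftDataset(
...     tokenizer=tokenizer,  # built as in the COCOCaptionsDataset sketch
...     processor_name="openai/clip-vit-base-patch32",
... )
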
class oumi.datasets.vision_language.MnistSftDataset(*, dataset_name: str | None = None, **kwargs)[source]#

Bases: VisionLanguageSftDataset

MNIST dataset formatted as SFT data.

MNIST is a well-known small dataset that can be useful for quick tests, prototyping, and debugging.

dataset_name: str#
default_dataset: str | None = 'ylecun/mnist'#
transform_conversation(example: dict) → Conversation[source]#

Transform a single MNIST example into a Conversation object.

trust_remote_code: bool#
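
Because MNIST is small, this loader is handy for smoke tests; a minimal sketch. The signature shows only dataset_name plus **kwargs, so tokenizer and processor_name are assumed to be forwarded to the base class, as with the other loaders here:

>>> from oumi.datasets.vision_language import MnistSftDataset
>>> dataset = MnistSftDataset(
...     tokenizer=tokenizer,  # built as in the COCOCaptionsDataset sketch
...     processor_name="openai/clip-vit-base-patch32",
... )
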
class oumi.datasets.vision_language.VLJsonlinesDataset(dataset_path: str | Path | None = None, data: list | None = None, **kwargs)[source]#

Bases: VisionLanguageSftDataset

VLJsonlinesDataset for loading Vision-Language SFT data in Oumi format.

This dataset class is designed to work with JSON Lines (.jsonl) files containing Vision-Language supervised fine-tuning (SFT) data. It supports loading data either from a file or from a provided list of data samples.

Examples:
Loading from a file:
>>> from oumi.datasets import VLJsonlinesDataset
>>> dataset = VLJsonlinesDataset( 
...     dataset_path="/path/to/your/dataset.jsonl",
... )
Loading from a list of data samples:
>>> from oumi.builders import build_processor, build_tokenizer
>>> from oumi.core.configs import ModelParams
>>> from oumi.datasets import VLJsonlinesDataset
>>> data_samples = [
...     {
...         "messages": [
...             {
...                 "role": "user",
...                 "content": "Describe this image:",
...                 "type": "text"
...             },
...             {
...                 "role": "user",
...                 "content": "path/to/image.jpg",
...                 "type": "image_path"
...             },
...             {
...                 "role": "assistant",
...                 "content": "A scenic view of Puget Sound.",
...                 "type": "text",
...             },
...         ]
...     }
... ]
>>> tokenizer = build_tokenizer(
...     ModelParams(model_name="Qwen/Qwen2-1.5B-Instruct")
... )
>>> dataset = VLJsonlinesDataset(
...     data=data_samples,
...     tokenizer=tokenizer,
...     processor_name="openai/clip-vit-base-patch32",
... )
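
When loading from dataset_path, each line of the .jsonl file is one JSON object with the same shape as an entry of data_samples above. As a sketch, the list from the example could be serialized into a compatible file like this:

>>> import json
>>> with open("/path/to/your/dataset.jsonl", "w") as f:
...     for sample in data_samples:  # the list defined in the example above
...         _ = f.write(json.dumps(sample) + "\n")
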
dataset_name: str#
default_dataset: str | None = 'custom'#
transform_conversation(example: dict) → Conversation[source]#

Transform a single conversation example into a Conversation object.

trust_remote_code: bool#
class oumi.datasets.vision_language.Vqav2SmallDataset(*, tokenizer: PreTrainedTokenizerBase | None = None, processor: BaseProcessor | None = None, processor_name: str | None = None, limit: int | None = None, trust_remote_code: bool = False, **kwargs)[source]#

Bases: VisionLanguageSftDataset

Dataset class for the merve/vqav2-small dataset.

dataset_name: str#
default_dataset: str | None = 'merve/vqav2-small'#
transform_conversation(example: dict) → Conversation[source]#

Transform a single conversation example into a Conversation object.

trust_remote_code: bool#
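
A minimal construction sketch, following the same pattern as the loaders above (names illustrative):

>>> from oumi.datasets.vision_language import Vqav2SmallDataset
>>> dataset = Vqav2SmallDataset(
...     tokenizer=tokenizer,  # built as in the COCOCaptionsDataset sketch
...     processor_name="openai/clip-vit-base-patch32",
...     limit=8,  # optional: take a small slice for debugging
... )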