Source code for oumi.datasets.vision_language.vision_jsonlines

# Copyright 2025 - Oumi
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from pathlib import Path
from typing import Optional, Union

import pandas as pd
from typing_extensions import override

from oumi.core.datasets import VisionLanguageSftDataset
from oumi.core.registry import register_dataset
from oumi.core.types.conversation import Conversation
from oumi.utils.io_utils import load_jsonlines


[docs] @register_dataset("vl_sft") @register_dataset("vision_language_jsonl") class VLJsonlinesDataset(VisionLanguageSftDataset): """VLJsonlinesDataset for loading Vision-Language SFT data in Oumi format. This dataset class is designed to work with JSON Lines (.jsonl) files containing Vision-Language supervised fine-tuning (SFT) data. It supports loading data either from a file or from a provided list of data samples. Usage example: Examples: Loading from a file: >>> from oumi.datasets import VLJsonlinesDataset >>> dataset = VLJsonlinesDataset( # doctest: +SKIP ... dataset_path="/path/to/your/dataset.jsonl", ... ) Loading from a list of data samples: >>> from oumi.builders import build_processor, build_tokenizer >>> from oumi.core.configs import ModelParams >>> from oumi.datasets import VLJsonlinesDataset >>> data_samples = [ ... { ... "messages": [ ... { ... "role": "user", ... "content": "Describe this image:", ... "type": "text" ... }, ... { ... "role": "user", ... "content": "path/to/image.jpg", ... "type": "image_path" ... }, ... { ... "role": "assistant", ... "content": "A scenic view of the puget sound.", ... "type": "text", ... }, ... ] ... } ... ] >>> tokenizer = build_tokenizer( ... ModelParams(model_name="Qwen/Qwen2-1.5B-Instruct") ... ) >>> dataset = VLJsonlinesDataset( ... data=data_samples, ... tokenizer=tokenizer, ... processor_name="openai/clip-vit-base-patch32", ... ) """ default_dataset = "custom" def __init__( self, dataset_path: Optional[Union[str, Path]] = None, data: Optional[list] = None, **kwargs, ): """Initializes a new instance of the VLJsonlinesDataset class.""" if dataset_path is not None and data is not None: raise ValueError( "Either dataset_path or data must be provided, but not both" ) self._data_column: str = "_messages_column" self._dataset_path: Optional[Path] = ( Path(dataset_path) if dataset_path else None ) if data is not None: data_frame = pd.DataFrame({self._data_column: data}) elif self._dataset_path is not None: data = load_jsonlines(self._dataset_path) data_frame = pd.DataFrame({self._data_column: data}) else: raise ValueError("Dataset path or data must be provided") assert data_frame is not None self._data: pd.DataFrame = data_frame super().__init__(**kwargs) @override def _load_data(self) -> pd.DataFrame: # no-op, data is already loaded in __init__ return self._data
[docs] @override def transform_conversation(self, example: dict) -> Conversation: """Transform a single conversation example into a Conversation object.""" messages = example[self._data_column] return Conversation.from_dict(messages)