# Source code for oumi.datasets.vision_language.vision_jsonlines
# Copyright 2025 - Oumi
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from pathlib import Path
from typing import Optional, Union

import pandas as pd
from typing_extensions import override

from oumi.core.datasets import VisionLanguageSftDataset
from oumi.core.registry import register_dataset
from oumi.core.types.conversation import Conversation
from oumi.utils.io_utils import load_jsonlines
@register_dataset("vl_sft")
@register_dataset("vision_language_jsonl")
class VLJsonlinesDataset(VisionLanguageSftDataset):
    """VLJsonlinesDataset for loading Vision-Language SFT data in Oumi format.

    This dataset class is designed to work with JSON Lines (.jsonl) files
    containing Vision-Language supervised fine-tuning (SFT) data. It supports
    loading data either from a file or from a provided list of data samples.

    Usage example:

    Examples:
        Loading from a file:
            >>> from oumi.datasets import VLJsonlinesDataset
            >>> dataset = VLJsonlinesDataset( # doctest: +SKIP
            ...     dataset_path="/path/to/your/dataset.jsonl",
            ... )

        Loading from a list of data samples:
            >>> from oumi.builders import build_processor, build_tokenizer
            >>> from oumi.core.configs import ModelParams
            >>> from oumi.datasets import VLJsonlinesDataset
            >>> data_samples = [
            ...     {
            ...         "messages": [
            ...             {
            ...                 "role": "user",
            ...                 "content": "Describe this image:",
            ...                 "type": "text"
            ...             },
            ...             {
            ...                 "role": "user",
            ...                 "content": "path/to/image.jpg",
            ...                 "type": "image_path"
            ...             },
            ...             {
            ...                 "role": "assistant",
            ...                 "content": "A scenic view of the puget sound.",
            ...                 "type": "text",
            ...             },
            ...         ]
            ...     }
            ... ]
            >>> tokenizer = build_tokenizer(
            ...     ModelParams(model_name="Qwen/Qwen2-1.5B-Instruct")
            ... )
            >>> dataset = VLJsonlinesDataset(
            ...     data=data_samples,
            ...     tokenizer=tokenizer,
            ...     processor_name="openai/clip-vit-base-patch32",
            ... )
    """

    default_dataset = "custom"

    def __init__(
        self,
        dataset_path: Optional[Union[str, Path]] = None,
        data: Optional[list] = None,
        **kwargs,
    ):
        """Initializes a new instance of the VLJsonlinesDataset class.

        Args:
            dataset_path: Path to a JSON Lines (.jsonl) file to load samples
                from. Mutually exclusive with ``data``.
            data: In-memory list of data samples (each an Oumi-format
                conversation dict). Mutually exclusive with ``dataset_path``.
            **kwargs: Additional arguments forwarded to
                ``VisionLanguageSftDataset.__init__`` (e.g. tokenizer,
                processor settings).

        Raises:
            ValueError: If both ``dataset_path`` and ``data`` are provided,
                or if neither is provided.
        """
        if dataset_path is not None and data is not None:
            raise ValueError(
                "Either dataset_path or data must be provided, but not both"
            )

        # Internal single-column DataFrame layout: each row holds one
        # conversation (list of message dicts) under this column name.
        self._data_column: str = "_messages_column"
        self._dataset_path: Optional[Path] = (
            Path(dataset_path) if dataset_path else None
        )

        if data is not None:
            data_frame = pd.DataFrame({self._data_column: data})
        elif self._dataset_path is not None:
            data = load_jsonlines(self._dataset_path)
            data_frame = pd.DataFrame({self._data_column: data})
        else:
            raise ValueError("Dataset path or data must be provided")

        assert data_frame is not None
        self._data: pd.DataFrame = data_frame

        # Base-class init runs after self._data is populated because it may
        # invoke _load_data() below.
        super().__init__(**kwargs)

    @override
    def _load_data(self) -> pd.DataFrame:
        # No-op: data is already loaded in __init__.
        return self._data

    @override
    def transform_conversation(self, example: dict) -> Conversation:
        """Transform a single conversation example into a Conversation object.

        Args:
            example: A row dict whose ``self._data_column`` entry holds the
                Oumi-format messages for one conversation.

        Returns:
            The parsed :class:`Conversation` built via
            ``Conversation.from_dict``.
        """
        messages = example[self._data_column]
        return Conversation.from_dict(messages)