# Copyright 2025 - Oumi## Licensed under the Apache License, Version 2.0 (the "License");# you may not use this file except in compliance with the License.# You may obtain a copy of the License at## http://www.apache.org/licenses/LICENSE-2.0## Unless required by applicable law or agreed to in writing, software# distributed under the License is distributed on an "AS IS" BASIS,# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.# See the License for the specific language governing permissions and# limitations under the License."""Generic class for using HuggingFace datasets with messages column.Allows users to specify the messages column at the config level."""fromtypingimportUnionimportpandasaspdfromoumi.core.datasetsimportBaseSftDatasetfromoumi.core.registryimportregister_datasetfromoumi.core.types.conversationimportConversation,Message,Role
@register_dataset("HuggingFaceDataset")
class HuggingFaceDataset(BaseSftDataset):
    """Converts HuggingFace Datasets with messages to Oumi Message format.

    Example:
        dataset = HuggingFaceDataset(
            hf_dataset_path="oumi-ai/oumi-synthetic-document-claims",
            messages_column="messages"
        )
    """

    def __init__(
        self,
        *,
        hf_dataset_path: str = "",
        messages_column: str = "messages",
        exclude_final_assistant_message: bool = False,
        **kwargs,
    ) -> None:
        """Initializes a new instance of the HuggingFaceDataset class.

        Args:
            hf_dataset_path: Path of the HuggingFace dataset. It is forwarded
                to the parent class as `dataset_name`. Must be non-empty.
            messages_column: Name of the column that holds the list of
                messages in each example. Must be non-empty.
            exclude_final_assistant_message: If True, the last message of a
                conversation is dropped when its role is assistant.
            **kwargs: Additional keyword arguments forwarded to the parent
                class constructor.

        Raises:
            ValueError: If `hf_dataset_path` or `messages_column` is empty.
        """
        if not hf_dataset_path:
            raise ValueError("The `hf_dataset_path` parameter must be provided.")
        if not messages_column:
            raise ValueError("The `messages_column` parameter must be provided.")

        self.messages_column = messages_column
        self.exclude_final_assistant_message = exclude_final_assistant_message

        # The parent class identifies the dataset via `dataset_name`; any
        # caller-supplied `dataset_name` in kwargs is overwritten here.
        kwargs["dataset_name"] = hf_dataset_path
        super().__init__(**kwargs)

    def transform_conversation(self, example: Union[dict, pd.Series]) -> Conversation:
        """Converts the messages of an example into a Conversation.

        Args:
            example: An example containing an entry under `self.messages_column`
                whose value is an iterable of messages, each with the keys
                'role' and 'content'.

        Returns:
            Conversation: A Conversation object containing the messages.

        Raises:
            ValueError: If the messages column is missing from the example, if
                a message lacks the 'role' or 'content' key, or if a role is
                neither 'user' nor 'assistant'.
        """
        if self.messages_column not in example:
            raise ValueError(
                f"The column '{self.messages_column}' is not present in the example."
            )

        messages = []
        for message in example[self.messages_column]:
            if "role" not in message or "content" not in message:
                raise ValueError(
                    "The message format is invalid. Expected keys: 'role', 'content'."
                )

            if message["role"] == "user":
                role = Role.USER
            elif message["role"] == "assistant":
                role = Role.ASSISTANT
            else:
                raise ValueError(
                    f"Invalid role '{message['role']}'. Expected 'user' or 'assistant'."
                )

            # Falsy content (e.g. None) is normalized to an empty string.
            content = message["content"] or ""
            messages.append(Message(role=role, content=content))

        # Guard on `messages` being non-empty: indexing [-1] on an empty list
        # would raise IndexError for examples that contain no messages.
        if (
            self.exclude_final_assistant_message
            and messages
            and messages[-1].role == Role.ASSISTANT
        ):
            messages = messages[:-1]

        return Conversation(messages=messages)