# Copyright 2025 - Oumi## Licensed under the Apache License, Version 2.0 (the "License");# you may not use this file except in compliance with the License.# You may obtain a copy of the License at## http://www.apache.org/licenses/LICENSE-2.0## Unless required by applicable law or agreed to in writing, software# distributed under the License is distributed on an "AS IS" BASIS,# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.# See the License for the specific language governing permissions and# limitations under the License.fromtypingimportUnionimportpandasaspdfromoumi.core.datasetsimportBaseSftDatasetfromoumi.core.registryimportregister_datasetfromoumi.core.types.conversationimportConversation,Message,Role
@register_dataset("argilla/magpie-ultra-v0.1")
class ArgillaMagpieUltraDataset(BaseSftDataset):
    """SFT dataset wrapper for ``argilla/magpie-ultra-v0.1``."""

    default_dataset = "argilla/magpie-ultra-v0.1"

    def transform_conversation(self, example: Union[dict, pd.Series]) -> Conversation:
        """Build a two-turn conversation from one dataset row.

        Args:
            example: A row exposing ``.get``; the ``"instruction"`` field feeds
                the user turn and the ``"response"`` field feeds the assistant
                turn.

        Returns:
            A ``Conversation`` holding the user message followed by the
            assistant message. A field that is missing or falsy (e.g. ``None``)
            is replaced by an empty string.
        """
        # (role, source field) pairs, in the order the turns must appear.
        turn_sources = (
            (Role.USER, "instruction"),
            (Role.ASSISTANT, "response"),
        )
        turns = [
            Message(role=speaker, content=example.get(field) or "")
            for speaker, field in turn_sources
        ]
        return Conversation(messages=turns)
@register_dataset("Magpie-Align/Llama-3-Magpie-Pro-1M-v0.1")
@register_dataset("Magpie-Align/Magpie-Pro-300K-Filtered")
class MagpieProDataset(BaseSftDataset):
    """Dataset class for the Magpie-Pro conversation datasets.

    Registered under both ``Magpie-Align/Llama-3-Magpie-Pro-1M-v0.1`` and
    ``Magpie-Align/Magpie-Pro-300K-Filtered``; the former is the default.
    """

    default_dataset = "Magpie-Align/Llama-3-Magpie-Pro-1M-v0.1"

    def transform_conversation(self, example: Union[dict, pd.Series]) -> Conversation:
        """Transform a dataset example into a Conversation object.

        Args:
            example: A row exposing ``.get`` whose ``"conversations"`` field is
                an iterable of turns. Each turn is read via ``turn["from"]``
                (``"human"`` or ``"gpt"``) and ``turn.get("value")``.

        Returns:
            A ``Conversation`` with one message per turn, in the original
            order. ``"human"`` turns map to the user role and ``"gpt"`` turns
            map to the assistant role.

        Raises:
            ValueError: If ``"conversations"`` is absent/``None``, or a turn's
                ``"from"`` value is neither ``"human"`` nor ``"gpt"``.
        """
        conversation = example.get("conversations")
        if conversation is None:
            raise ValueError("Conversation is None")

        messages = []
        for message in conversation:
            if message["from"] == "human":
                role = Role.USER
            elif message["from"] == "gpt":
                role = Role.ASSISTANT
            else:
                raise ValueError(f"Unknown role: {message['from']}")

            # `.get("value", "")` only covers a missing key; a key that is
            # present but null would leak None into the message content.
            # Coerce it to "" the same way ArgillaMagpieUltraDataset does for
            # its null fields.
            content = message.get("value") or ""
            messages.append(Message(role=role, content=content))

        return Conversation(messages=messages)