oumi.datasets.sft#

Supervised fine-tuning datasets module.

class oumi.datasets.sft.AlpacaDataset(*, include_system_prompt: bool = True, **kwargs)[source]#

Bases: BaseSftDataset

dataset_name: str#
default_dataset: str | None = 'tatsu-lab/alpaca'#
system_prompt_with_context = 'Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.'#
system_prompt_without_context = 'Below is an instruction that describes a task. Write a response that appropriately completes the request.'#
transform_conversation(example: dict | Series) Conversation[source]#

Transforms the example into a Conversation using the Alpaca prompt format.

Parameters:

example (dict or Pandas Series) – An example containing input (optional), instruction, and output entries.

Returns:

The input example converted to a Conversation.

Return type:

Conversation

trust_remote_code: bool#
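The two system prompts above hint at how an Alpaca record becomes a conversation: the prompt with context is used when the example has a non-empty input field, the other one otherwise. The helper below is an illustrative, self-contained sketch of that assembly; it is not the actual AlpacaDataset.transform_conversation implementation, and the exact way instruction and input are concatenated is an assumption.

```python
# Illustrative sketch only: the real AlpacaDataset builds an Oumi
# Conversation object, not a plain list of message dicts.
SYSTEM_WITH_CONTEXT = (
    "Below is an instruction that describes a task, paired with an input "
    "that provides further context. Write a response that appropriately "
    "completes the request."
)
SYSTEM_WITHOUT_CONTEXT = (
    "Below is an instruction that describes a task. Write a response that "
    "appropriately completes the request."
)

def to_messages(example: dict, include_system_prompt: bool = True) -> list[dict]:
    """Convert an Alpaca record (instruction, optional input, output) to messages."""
    has_context = bool(example.get("input", "").strip())
    # Assumption: instruction and input are joined with a blank line.
    user_content = example["instruction"]
    if has_context:
        user_content = f"{example['instruction']}\n\n{example['input']}"
    messages = []
    if include_system_prompt:
        system = SYSTEM_WITH_CONTEXT if has_context else SYSTEM_WITHOUT_CONTEXT
        messages.append({"role": "system", "content": system})
    messages.append({"role": "user", "content": user_content})
    messages.append({"role": "assistant", "content": example["output"]})
    return messages
```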
class oumi.datasets.sft.ArgillaDollyDataset(*, use_new_fields: bool = True, **kwargs)[source]#

Bases: BaseSftDataset

Dataset class for the Databricks Dolly 15k curated dataset.

dataset_name: str#
default_dataset: str | None = 'argilla/databricks-dolly-15k-curated-en'#
transform_conversation(example: dict | Series) Conversation[source]#

Transform a dataset example into a Conversation object.

Parameters:

example – A single example from the dataset.

Returns:

A Conversation object containing the transformed messages.

Return type:

Conversation

trust_remote_code: bool#
class oumi.datasets.sft.ArgillaMagpieUltraDataset(*, dataset_name: str | None = None, dataset_path: str | None = None, split: str | None = None, tokenizer: PreTrainedTokenizerBase | None = None, task: Literal['sft', 'generation', 'auto'] = 'auto', return_tensors: bool = False, text_col: str = 'text', assistant_only: bool = False, response_template: str | None = None, instruction_template: str | None = None, **kwargs)[source]#

Bases: BaseSftDataset

Dataset class for the argilla/magpie-ultra-v0.1 dataset.

dataset_name: str#
default_dataset: str | None = 'argilla/magpie-ultra-v0.1'#
transform_conversation(example: dict | Series) Conversation[source]#

Transform a dataset example into a Conversation object.

trust_remote_code: bool#
class oumi.datasets.sft.AyaDataset(*, dataset_name: str | None = None, dataset_path: str | None = None, split: str | None = None, tokenizer: PreTrainedTokenizerBase | None = None, task: Literal['sft', 'generation', 'auto'] = 'auto', return_tensors: bool = False, text_col: str = 'text', assistant_only: bool = False, response_template: str | None = None, instruction_template: str | None = None, **kwargs)[source]#

Bases: BaseSftDataset

Dataset class for the CohereForAI/aya_dataset dataset.

dataset_name: str#
default_dataset: str | None = 'CohereForAI/aya_dataset'#
transform_conversation(example: dict | Series) Conversation[source]#

Transform a dataset example into a Conversation object.

trust_remote_code: bool#
class oumi.datasets.sft.ChatRAGBenchDataset(*, split: str = 'test', task: str = 'generation', subset: str | None = None, num_context_docs: int = 5, **kwargs)[source]#

Bases: BaseSftDataset

dataset_name: str#
default_dataset: str = 'nvidia/ChatRAG-Bench'#
default_subset: str = 'doc2dial'#
default_system_message: str = "This is a chat between a user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions based on the context. The assistant should also indicate when the answer cannot be found in the context."#
transform_conversation(example: dict | Series) Conversation[source]#

Transforms a given example into a Conversation object.

Parameters:

example (Union[dict, pd.Series]) – The example to transform.

Returns:

The transformed Conversation object.

Return type:

Conversation

trust_remote_code: bool#
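The constructor's num_context_docs parameter and the default system message suggest how a ChatRAG-Bench prompt is grounded: the system message is followed by up to num_context_docs retrieved documents. The sketch below is purely hypothetical; the field names and the joining format are assumptions for illustration, not the dataset's actual layout.

```python
# Hypothetical sketch: prepend the system message to the first
# num_context_docs grounding documents, joined by blank lines.
def build_context_prompt(
    system_message: str,
    context_docs: list[str],
    num_context_docs: int = 5,
) -> str:
    selected = context_docs[:num_context_docs]
    return "\n\n".join([system_message, *selected])
```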
class oumi.datasets.sft.ChatqaDataset(*, dataset_name: str | None = None, dataset_path: str | None = None, split: str | None = None, tokenizer: PreTrainedTokenizerBase | None = None, task: Literal['sft', 'generation', 'auto'] = 'auto', return_tensors: bool = False, text_col: str = 'text', assistant_only: bool = False, response_template: str | None = None, instruction_template: str | None = None, **kwargs)[source]#

Bases: BaseSftDataset

dataset_name: str#
default_dataset: str | None = 'nvidia/ChatQA-Training-Data'#
default_subset: str | None = 'sft'#
transform_conversation(raw_conversation: dict | Series) Conversation[source]#

Preprocesses the inputs of the example and returns an Oumi Conversation.

ChatQA is a conversational question answering dataset. It contains 10 subsets. Some subsets contain grounding documents.

See the dataset page for more information: https://huggingface.co/datasets/nvidia/ChatQA-Training-Data

Parameters:

raw_conversation – The raw conversation example.

Returns:

The preprocessed inputs as an Oumi conversation.

Return type:

Conversation

trust_remote_code: bool#
class oumi.datasets.sft.ChatqaTatqaDataset(*, dataset_name: str | None = None, dataset_path: str | None = None, split: str | None = None, tokenizer: PreTrainedTokenizerBase | None = None, task: Literal['sft', 'generation', 'auto'] = 'auto', return_tensors: bool = False, text_col: str = 'text', assistant_only: bool = False, response_template: str | None = None, instruction_template: str | None = None, **kwargs)[source]#

Bases: ChatqaDataset

ChatQA Subclass to handle tatqa subsets.

The tatqa subsets require loading a specific file from the dataset repository, so this subclass overrides the default loading behavior.

dataset_name: str#
default_subset: str | None = 'tatqa-arithmetic'#
trust_remote_code: bool#
class oumi.datasets.sft.MagpieProDataset(*, dataset_name: str | None = None, dataset_path: str | None = None, split: str | None = None, tokenizer: PreTrainedTokenizerBase | None = None, task: Literal['sft', 'generation', 'auto'] = 'auto', return_tensors: bool = False, text_col: str = 'text', assistant_only: bool = False, response_template: str | None = None, instruction_template: str | None = None, **kwargs)[source]#

Bases: BaseSftDataset

Dataset class for the Magpie-Align/Llama-3-Magpie-Pro-1M-v0.1 dataset.

dataset_name: str#
default_dataset: str | None = 'Magpie-Align/Llama-3-Magpie-Pro-1M-v0.1'#
transform_conversation(example: dict | Series) Conversation[source]#

Transform a dataset example into a Conversation object.

trust_remote_code: bool#
class oumi.datasets.sft.OpenO1SFTDataset(**kwargs)[source]#

Bases: PromptResponseDataset

Synthetic reasoning SFT dataset.

dataset_name: str#
default_dataset: str | None = 'O1-OPEN/OpenO1-SFT'#
trust_remote_code: bool#
class oumi.datasets.sft.PromptResponseDataset(*, hf_dataset_path: str = 'O1-OPEN/OpenO1-SFT', prompt_column: str = 'instruction', response_column: str = 'output', **kwargs)[source]#

Bases: BaseSftDataset

Converts HuggingFace Datasets with input/output columns to Message format.

Example

>>> dataset = PromptResponseDataset(
...     hf_dataset_path="O1-OPEN/OpenO1-SFT",
...     prompt_column="instruction",
...     response_column="output",
... )

dataset_name: str#
default_dataset: str | None = 'O1-OPEN/OpenO1-SFT'#
transform_conversation(example: dict | Series) Conversation[source]#

Transforms the example into a Conversation.

Parameters:

example (dict or Pandas Series) – An example containing the configured prompt and response columns (instruction and output by default).

Returns:

The input example converted to a Conversation with user and assistant messages.

Return type:

Conversation

trust_remote_code: bool#
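The class description above says it converts prompt/response columns into message format. A minimal sketch of that mapping, assuming the default column names from the signature (the helper name is illustrative, not part of the oumi API):

```python
# Sketch of the prompt/response-to-messages conversion: one user turn
# from the prompt column, one assistant turn from the response column.
def prompt_response_to_messages(
    example: dict,
    prompt_column: str = "instruction",
    response_column: str = "output",
) -> list[dict]:
    return [
        {"role": "user", "content": example[prompt_column]},
        {"role": "assistant", "content": example[response_column]},
    ]
```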
class oumi.datasets.sft.TextSftJsonLinesDataset(dataset_path: str | Path | None = None, data: list[dict[str, Any]] | None = None, format: str | None = None, **kwargs)[source]#

Bases: BaseSftDataset

TextSftJsonLinesDataset for loading SFT data in oumi and alpaca formats.

This dataset class is designed to work with JSON Lines (.jsonl) or JSON (.json) files containing text-based supervised fine-tuning (SFT) data. It supports loading data either from a file or from a provided list of data samples in oumi and alpaca formats.

Supported formats:

  1. JSONL or JSON of conversations (Oumi format)

  2. JSONL or JSON of Alpaca-style turns (instruction, input, output)

Parameters:
  • dataset_path (Optional[Union[str, Path]]) – Path to the dataset file (.jsonl or .json).

  • data (Optional[List[Dict[str, Any]]]) – List of conversation dicts if not loading from a file.

  • format (Optional[str]) – The format of the data. Either “conversations” or “alpaca”. If not provided, the format will be auto-detected.

  • **kwargs – Additional arguments to pass to the parent class.

Examples

Loading conversations from a JSONL file with auto-detection:
>>> from oumi.datasets import TextSftJsonLinesDataset
>>> dataset = TextSftJsonLinesDataset( 
...     dataset_path="/path/to/your/dataset.jsonl"
... )
Loading Alpaca-style data from a JSON file:
>>> from oumi.datasets import TextSftJsonLinesDataset
>>> dataset = TextSftJsonLinesDataset( 
...     dataset_path="/path/to/your/dataset.json",
...     format="alpaca"
... )
Loading from a list of data samples:
>>> from oumi.datasets import TextSftJsonLinesDataset
>>> data_samples = [
...     {"messages": [{"role": "user", "content": "Hello"},
...                   {"role": "assistant", "content": "Hi there!"}]},
...     {"messages": [{"role": "user", "content": "How are you?"},
...                   {"role": "assistant", "content": "great!"}]}
... ]
>>> dataset = TextSftJsonLinesDataset(
...     data=data_samples,
... )
dataset_name: str#
default_dataset: str | None = 'custom'#
transform_conversation(example: dict) Conversation[source]#

Transform a single conversation example into a Conversation object.

Parameters:

example – The input example containing the messages or Alpaca-style turn.

Returns:

A Conversation object containing the messages.

Return type:

Conversation

trust_remote_code: bool#
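When format is not provided, the class auto-detects whether records are Oumi conversations or Alpaca-style turns. A hedged sketch of how such detection could work on a sample record, keyed on the fields named in the examples above (the real detection logic may differ):

```python
# Illustrative format detection: Oumi conversations carry a "messages"
# list, while Alpaca records carry "instruction" and "output" fields.
def detect_format(sample: dict) -> str:
    if "messages" in sample:
        return "conversations"
    if "instruction" in sample and "output" in sample:
        return "alpaca"
    raise ValueError(f"Unrecognized SFT record keys: {sorted(sample)}")
```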
class oumi.datasets.sft.UltrachatH4Dataset(*, dataset_name: str | None = None, dataset_path: str | None = None, split: str | None = None, tokenizer: PreTrainedTokenizerBase | None = None, task: Literal['sft', 'generation', 'auto'] = 'auto', return_tensors: bool = False, text_col: str = 'text', assistant_only: bool = False, response_template: str | None = None, instruction_template: str | None = None, **kwargs)[source]#

Bases: BaseSftDataset

Dataset class for the HuggingFaceH4/ultrachat_200k dataset.

dataset_name: str#
default_dataset: str | None = 'HuggingFaceH4/ultrachat_200k'#
transform_conversation(example: dict | Series) Conversation[source]#

Transform a dataset example into a Conversation object.

trust_remote_code: bool#