# Source code for oumi.core.tokenizers.special_tokens

# Copyright 2025 - Oumi
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Optional

from transformers import SpecialTokensMixin

from oumi.core.tokenizers import BaseTokenizer
from oumi.utils.logging import logger

# The vocabularies of Llama 3.1/3.2 models already contain the
# `<|finetune_right_pad_id|>` token, so it is used as their padding token.
LLAMA_SPECIAL_TOKENS_MIXIN = SpecialTokensMixin(pad_token="<|finetune_right_pad_id|>")

# Table of known models -> default special tokens. Every listed model shares the
# same `LLAMA_SPECIAL_TOKENS_MIXIN` instance. Keys are stored lowercased so that
# lookups can be case-insensitive.
special_tokens = {
    model_name.lower(): LLAMA_SPECIAL_TOKENS_MIXIN
    for model_name in (
        "meta-llama/Llama-3.1-8B",
        "meta-llama/Llama-3.1-8B-Instruct",
        "meta-llama/Llama-3.1-70B",
        "meta-llama/Llama-3.1-70B-Instruct",
        "meta-llama/Llama-3.1-405B",
        "meta-llama/Llama-3.1-405B-Instruct",
        "meta-llama/Llama-3.1-405B-FP8",
        "meta-llama/Llama-3.1-405B-Instruct-FP8",
        "meta-llama/Meta-Llama-3.1-8B",
        "meta-llama/Meta-Llama-3.1-8B-Instruct",
        "meta-llama/Meta-Llama-3.1-70B",
        "meta-llama/Meta-Llama-3.1-70B-Instruct",
        "meta-llama/Meta-Llama-3.1-405B",
        "meta-llama/Meta-Llama-3.1-405B-Instruct",
        "meta-llama/Meta-Llama-3.1-405B-FP8",
        "meta-llama/Meta-Llama-3.1-405B-Instruct-FP8",
        "meta-llama/Llama-3.2-1B",
        "meta-llama/Llama-3.2-1B-Instruct",
        "meta-llama/Llama-3.2-3B",
        "meta-llama/Llama-3.2-3B-Instruct",
    )
}



# [docs]
def get_default_special_tokens(
    tokenizer: Optional[BaseTokenizer],
) -> SpecialTokensMixin:
    """Returns the default special tokens for the tokenizer that was provided.

    The tokenizer's `name_or_path` is lowercased and looked up in the module-level
    `special_tokens` table of known models. This serves as a fallback when a special
    token is required but is missing from the tokenizer's configuration. The primary
    use case is retrieving the padding token (`pad_token`), which is oftentimes not
    included in the tokenizer's configuration, even if it exists in the tokenizer's
    vocabulary.

    Args:
        tokenizer: The tokenizer to get special tokens for. May be `None`.

    Returns:
        The special tokens mixin registered for the tokenizer's `name_or_path`
        (the instance stored in `special_tokens`, not a copy). If the tokenizer is
        `None`, has an empty `name_or_path`, or is not a known model, a new empty
        `SpecialTokensMixin` is returned. In the unknown-model case a warning is
        logged as well.
    """
    # Compare against `None` explicitly instead of relying on truthiness: a
    # tokenizer class may define `__len__`, in which case `if tokenizer` would
    # depend on (and possibly compute) the vocabulary size.
    name_or_path = tokenizer.name_or_path if tokenizer is not None else None
    if not name_or_path:
        # Nothing to look up; stay silent, there is no model name to report.
        return SpecialTokensMixin()

    # Keys of `special_tokens` are lowercased, so normalize once and look up once.
    known_tokens = special_tokens.get(name_or_path.lower())
    if known_tokens is not None:
        return known_tokens

    logger.warning(f"Special tokens lookup for tokenizer {name_or_path} failed.")
    return SpecialTokensMixin()