Source code for oumi.core.tokenizers.special_tokens
# Copyright 2025 - Oumi
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Optional

from transformers import SpecialTokensMixin

from oumi.core.tokenizers import BaseTokenizer
from oumi.utils.logging import logger

# The Llama 3.1/3.2 vocabularies already contain `<|finetune_right_pad_id|>`,
# so it can serve as the padding token without adding a new entry.
LLAMA_SPECIAL_TOKENS_MIXIN = SpecialTokensMixin(pad_token="<|finetune_right_pad_id|>")

# Model identifiers that all share `LLAMA_SPECIAL_TOKENS_MIXIN`.
_LLAMA_MODEL_NAMES = (
    "meta-llama/Llama-3.1-8B",
    "meta-llama/Llama-3.1-8B-Instruct",
    "meta-llama/Llama-3.1-70B",
    "meta-llama/Llama-3.1-70B-Instruct",
    "meta-llama/Llama-3.1-405B",
    "meta-llama/Llama-3.1-405B-Instruct",
    "meta-llama/Llama-3.1-405B-FP8",
    "meta-llama/Llama-3.1-405B-Instruct-FP8",
    "meta-llama/Meta-Llama-3.1-8B",
    "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "meta-llama/Meta-Llama-3.1-70B",
    "meta-llama/Meta-Llama-3.1-70B-Instruct",
    "meta-llama/Meta-Llama-3.1-405B",
    "meta-llama/Meta-Llama-3.1-405B-Instruct",
    "meta-llama/Meta-Llama-3.1-405B-FP8",
    "meta-llama/Meta-Llama-3.1-405B-Instruct-FP8",
    "meta-llama/Llama-3.2-1B",
    "meta-llama/Llama-3.2-1B-Instruct",
    "meta-llama/Llama-3.2-3B",
    "meta-llama/Llama-3.2-3B-Instruct",
)

# Lookup table from model identifier to its default special tokens. Keys are
# stored lowercased so that lookups can be case-insensitive.
special_tokens = {
    model_name.lower(): LLAMA_SPECIAL_TOKENS_MIXIN for model_name in _LLAMA_MODEL_NAMES
}
def get_default_special_tokens(
    tokenizer: Optional[BaseTokenizer],
) -> SpecialTokensMixin:
    """Look up the default special tokens registered for a tokenizer.

    The tokenizer's `name_or_path` is lowercased and used as the key into the
    module-level `special_tokens` table of known models. This serves as a
    fallback for special tokens (most notably `pad_token`) that a tokenizer's
    configuration does not provide.

    Args:
        tokenizer: The tokenizer whose defaults are requested. May be `None`.

    Returns:
        The registered special tokens mixin when the tokenizer's name is found
        in the table; otherwise a new, empty `SpecialTokensMixin`. A warning is
        logged only when a non-empty name was looked up and not found.
    """
    # Nothing to look up: no tokenizer (or a falsy one), or no name/path on it.
    if not tokenizer or not tokenizer.name_or_path:
        return SpecialTokensMixin()

    # Table keys are lowercased, so normalize the name the same way.
    lookup_key = tokenizer.name_or_path.lower()
    registered_tokens = special_tokens.get(lookup_key)
    if registered_tokens is not None:
        return registered_tokens

    logger.warning(
        f"Special tokens lookup for tokenizer {tokenizer.name_or_path} failed."
    )
    return SpecialTokensMixin()