# Copyright 2025 - Oumi## Licensed under the Apache License, Version 2.0 (the "License");# you may not use this file except in compliance with the License.# You may obtain a copy of the License at## http://www.apache.org/licenses/LICENSE-2.0## Unless required by applicable law or agreed to in writing, software# distributed under the License is distributed on an "AS IS" BASIS,# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.# See the License for the specific language governing permissions and# limitations under the License.importfunctoolsimportosfrompathlibimportPathfromtypingimportOptional,Unionimporttransformersfromoumi.core.configs.internal.supported_modelsimport(is_custom_model,)fromoumi.utils.loggingimportlogger
def is_cached_to_disk_hf_dataset(dataset_folder: Union[str, Path]) -> bool:
    """Detects datasets that were written with `dataset.save_to_disk()`.

    Datasets stored this way should be loaded using
    `datasets.Dataset.load_from_disk()`.

    Args:
        dataset_folder: Path to the candidate dataset directory.

    Returns:
        `True` if the folder is an existing directory that contains both
        `dataset_info.json` and `state.json` as regular files; `False`
        otherwise. A warning is logged when the directory exists but one of
        the two files is missing.
    """
    if not dataset_folder:
        return False

    root = Path(dataset_folder)
    # `is_dir()` is already False for paths that do not exist.
    if not root.is_dir():
        return False

    required_files = ("dataset_info.json", "state.json")
    # Only the first missing file (in the order above) is reported.
    first_missing = next(
        (name for name in required_files if not (root / name).is_file()),
        None,
    )
    if first_missing is None:
        return True

    logger.warning(
        f"The dataset {str(root)} is missing "
        f"a required file: {first_missing}."
    )
    return False
def find_hf_token() -> Optional[str]:
    """Attempts to find HuggingFace access token.

    Lookup order:
        1. The `HF_TOKEN` environment variable.
        2. The file named by the `HF_TOKEN_PATH` environment variable.
        3. The `token` file under `HF_HOME` (default: `~/.cache/huggingface`).

    A leading `~` in the resolved token path is expanded to the user's home
    directory before the file is looked up.

    Returns:
        The HF access token with surrounding whitespace stripped when read
        from a file, or `None` if no non-empty token is found.

    Raises:
        FileNotFoundError: If `HF_TOKEN_PATH` is set but that path
            does not exist.
    """
    hf_token = os.environ.get("HF_TOKEN")
    if hf_token:
        return hf_token

    default_hf_home = "~/.cache/huggingface"
    explicit_token_path = os.environ.get("HF_TOKEN_PATH")
    if explicit_token_path:
        token_path = Path(explicit_token_path)
    else:
        # An empty `HF_HOME` is treated the same as an unset one.
        token_path = Path(os.environ.get("HF_HOME") or default_hf_home) / "token"

    # `Path` does not expand `~` on its own: without this, the default location
    # would be resolved relative to the current working directory and the
    # token file would never be found.
    try:
        token_path = token_path.expanduser()
    except RuntimeError:
        # The home directory cannot be determined; keep the literal path so
        # the lookup below simply reports "not found" instead of crashing.
        pass

    if token_path.exists():
        # A path that exists but is not a regular file (e.g. a directory)
        # is ignored rather than treated as an error.
        if token_path.is_file():
            hf_token = token_path.read_text().strip()
    elif explicit_token_path:
        # Only an explicitly configured path is required to exist.
        raise FileNotFoundError(f"Missing HF token file: '{token_path}'")

    return hf_token if hf_token else None
@functools.cache
def get_hf_chat_template(
    tokenizer_name: str, *, trust_remote_code: bool = False
) -> Optional[str]:
    """Looks up the chat template that HF provides for `tokenizer_name`.

    Results are memoized by `functools.cache`, keyed on the call arguments.

    Args:
        tokenizer_name: Name passed to
            `transformers.AutoTokenizer.from_pretrained()`.
        trust_remote_code: Forwarded to `from_pretrained()`.

    Returns:
        The tokenizer's chat template, or `None` if `tokenizer_name` is empty,
        names a custom model, or the tokenizer has no chat template.

    Raises:
        RuntimeError: If the tokenizer's chat template is set but is not
            a string.
    """
    if not tokenizer_name:
        return None
    if is_custom_model(tokenizer_name):
        return None

    hf_tokenizer = transformers.AutoTokenizer.from_pretrained(
        tokenizer_name, trust_remote_code=trust_remote_code
    )

    template = hf_tokenizer.chat_template
    if not template:
        return None
    if isinstance(template, str):
        return template

    raise RuntimeError(
        f"Chat template for tokenizer_name: {tokenizer_name} "
        "is not a string! "
        f"Actual type: {type(template)}"
    )