# Copyright 2025 - Oumi
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import functools
from typing import Any, Optional

import transformers

import oumi.core.constants as constants
from oumi.core.configs.internal.supported_models import (
    find_internal_model_config_using_model_name,
)
from oumi.core.processors.base_processor import BaseProcessor
from oumi.core.processors.default_processor import DefaultProcessor
from oumi.core.tokenizers.base_tokenizer import BaseTokenizer


def build_processor(
    processor_name: str,
    tokenizer: BaseTokenizer,
    *,
    processor_kwargs: Optional[dict[str, Any]] = None,
    trust_remote_code: bool = False,
) -> BaseProcessor:
    """Builds a processor for the given name and tokenizer.

    The underlying worker processor is loaded with
    `transformers.AutoProcessor.from_pretrained` and then wrapped, together
    with `tokenizer`, in a `DefaultProcessor`.

    Args:
        processor_name: Name of the processor (usually the same as the model
            name). Must be non-empty.
        tokenizer: Tokenizer to use with the processor.
        processor_kwargs: Optional processor-specific parameters that are
            passed to the processor constructor. They are applied on top of
            the model-specific parameters taken from the internal model
            config (when one is found), so they win on conflicting keys.
        trust_remote_code: Whether to allow loading remote code for this
            processor. Some processors come with downloadable executable
            Python files, which can be a security risk unless they come from
            a trusted source.

    Returns:
        BaseProcessor: The newly created processor.

    Raises:
        ValueError: If `processor_name` is empty.
    """
    if not processor_name:
        raise ValueError("Empty model name.")

    internal_config = find_internal_model_config_using_model_name(
        processor_name, trust_remote_code=trust_remote_code
    )

    # Start from the model-specific settings when the model is known,
    # otherwise fall back to the library-wide defaults.
    if internal_config is None:
        label_ignore_index: Optional[int] = constants.LABEL_IGNORE_INDEX
        ignore_features: Optional[list[str]] = None
        merged_kwargs: dict[str, Any] = {}
    else:
        label_ignore_index = internal_config.label_ignore_index
        ignore_features = internal_config.ignore_features
        # Copy, so the user overrides below never touch the config's own dict.
        merged_kwargs = dict(internal_config.processor_kwargs)

    # User-defined parameters take precedence over model-specific ones.
    if processor_kwargs:
        merged_kwargs.update(processor_kwargs)

    # NOTE: keyword arguments given when a `functools.partial` object is
    # called take precedence over the ones bound at construction, so a
    # `trust_remote_code` entry in `merged_kwargs` replaces the value bound
    # here instead of raising a duplicate-keyword error.
    load_worker = functools.partial(
        transformers.AutoProcessor.from_pretrained,
        processor_name,
        trust_remote_code=trust_remote_code,
    )
    worker_processor = load_worker(**merged_kwargs)

    return DefaultProcessor(
        processor_name,
        worker_processor,
        tokenizer,
        label_ignore_index=label_ignore_index,
        ignore_features=ignore_features,
    )