# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Auto Tokenizer class."""
import importlib
import json
import os
from collections import OrderedDict
from typing import Any
from transformers.utils.import_utils import is_mistral_common_available
from ...configuration_utils import PreTrainedConfig
from ...dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code
from ...modeling_gguf_pytorch_utils import load_gguf_checkpoint
from ...tokenization_utils_base import TOKENIZER_CONFIG_FILE
from ...utils import (
extract_commit_hash,
is_g2p_en_available,
is_sentencepiece_available,
is_tokenizers_available,
logging,
)
from ...utils.hub import cached_file
from ..encoder_decoder import EncoderDecoderConfig
from .auto_factory import _LazyAutoMapping
from .configuration_auto import (
CONFIG_MAPPING_NAMES,
AutoConfig,
config_class_to_model_type,
model_type_to_module_name,
replace_list_option_in_docstrings,
)
# Optional backends: bind the name to None when the dependency is missing so the
# rest of the module can feature-check with a simple `is None` / truthiness test.
if is_tokenizers_available():
    from ...tokenization_utils_tokenizers import TokenizersBackend
else:
    TokenizersBackend = None
if is_sentencepiece_available():
    from ...tokenization_utils_sentencepiece import SentencePieceBackend
else:
    SentencePieceBackend = None

logger = logging.get_logger(__name__)

# V5: Simplified mapping - single tokenizer class per model type (always prefer tokenizers-based)
# Populated at runtime by `AutoTokenizer.register`; consulted first by `tokenizer_class_from_name`.
REGISTERED_TOKENIZER_CLASSES: dict[str, type[Any]] = {}
# Maps a registered slow-tokenizer class name to its fast counterpart (legacy register() calls).
REGISTERED_FAST_ALIASES: dict[str, type[Any]] = {}
# Maps a model type (config `model_type` string) to the name of its canonical tokenizer
# class. Entries guarded by `is_tokenizers_available()` / `is_sentencepiece_available()` /
# `is_g2p_en_available()` / `is_mistral_common_available()` resolve to None when the
# corresponding optional dependency is not installed.
TOKENIZER_MAPPING_NAMES = OrderedDict[str, str | None](
    [
        ("aimv2", "CLIPTokenizer" if is_tokenizers_available() else None),
        ("albert", "AlbertTokenizer" if is_tokenizers_available() else None),
        ("align", "BertTokenizer" if is_tokenizers_available() else None),
        ("audioflamingo3", "Qwen2Tokenizer" if is_tokenizers_available() else None),
        ("aya_vision", "CohereTokenizer" if is_tokenizers_available() else None),
        ("bark", "BertTokenizer" if is_tokenizers_available() else None),
        ("bart", "RobertaTokenizer" if is_tokenizers_available() else None),
        ("barthez", "BarthezTokenizer" if is_tokenizers_available() else None),
        ("bartpho", "BartphoTokenizer"),
        ("bert", "BertTokenizer" if is_tokenizers_available() else None),
        ("bert-generation", "BertGenerationTokenizer" if is_sentencepiece_available() else None),
        ("bert-japanese", "BertJapaneseTokenizer"),
        ("bertweet", "BertweetTokenizer"),
        ("big_bird", "BigBirdTokenizer" if is_tokenizers_available() else None),
        ("bigbird_pegasus", "PegasusTokenizer" if is_tokenizers_available() else None),
        ("biogpt", "BioGptTokenizer"),
        ("blenderbot", "BlenderbotTokenizer" if is_tokenizers_available() else None),
        ("blenderbot-small", "BlenderbotSmallTokenizer"),
        ("blip", "BertTokenizer" if is_tokenizers_available() else None),
        ("blip-2", "GPT2Tokenizer" if is_tokenizers_available() else None),
        ("bridgetower", "RobertaTokenizer"),
        ("bros", "BertTokenizer" if is_tokenizers_available() else None),
        ("byt5", "ByT5Tokenizer"),
        ("camembert", "CamembertTokenizer" if is_tokenizers_available() else None),
        ("canine", "CanineTokenizer"),
        ("chinese_clip", "BertTokenizer" if is_tokenizers_available() else None),
        ("clap", "RobertaTokenizer"),
        ("clip", "CLIPTokenizer" if is_tokenizers_available() else None),
        ("clipseg", "CLIPTokenizer" if is_tokenizers_available() else None),
        ("clvp", "ClvpTokenizer"),
        ("code_llama", "CodeLlamaTokenizer" if is_tokenizers_available() else None),
        ("codegen", "GPT2Tokenizer" if is_tokenizers_available() else None),
        ("cohere", "CohereTokenizer" if is_tokenizers_available() else None),
        ("cohere2", "CohereTokenizer" if is_tokenizers_available() else None),
        ("colqwen2", "Qwen2Tokenizer" if is_tokenizers_available() else None),
        ("convbert", "BertTokenizer" if is_tokenizers_available() else None),
        ("cpm", "CpmTokenizer" if is_tokenizers_available() else None),
        ("cpmant", "CpmAntTokenizer"),
        ("ctrl", "CTRLTokenizer"),
        ("data2vec-audio", "Wav2Vec2CTCTokenizer"),
        ("data2vec-text", "RobertaTokenizer"),
        ("dbrx", "GPT2Tokenizer" if is_tokenizers_available() else None),
        ("deberta", "DebertaTokenizer" if is_tokenizers_available() else None),
        ("deberta-v2", "DebertaV2Tokenizer" if is_tokenizers_available() else None),
        ("dia", "DiaTokenizer"),
        ("distilbert", "BertTokenizer" if is_tokenizers_available() else None),
        ("dpr", "DPRQuestionEncoderTokenizer" if is_tokenizers_available() else None),
        ("electra", "BertTokenizer" if is_tokenizers_available() else None),
        ("emu3", "GPT2Tokenizer" if is_tokenizers_available() else None),
        ("ernie", "BertTokenizer" if is_tokenizers_available() else None),
        ("esm", "EsmTokenizer"),
        ("falcon_mamba", "GPTNeoXTokenizer" if is_tokenizers_available() else None),
        ("fastspeech2_conformer", "FastSpeech2ConformerTokenizer" if is_g2p_en_available() else None),
        ("flaubert", "FlaubertTokenizer"),
        ("flava", "BertTokenizer" if is_tokenizers_available() else None),
        ("flex_olmo", "GPT2Tokenizer" if is_tokenizers_available() else None),
        ("florence2", "BartTokenizer" if is_tokenizers_available() else None),
        ("fnet", "FNetTokenizer" if is_tokenizers_available() else None),
        ("fsmt", "FSMTTokenizer"),
        ("funnel", "FunnelTokenizer" if is_tokenizers_available() else None),
        ("gemma", "GemmaTokenizer" if is_tokenizers_available() else None),
        ("gemma2", "GemmaTokenizer" if is_tokenizers_available() else None),
        ("gemma3", "GemmaTokenizer" if is_tokenizers_available() else None),
        ("gemma3_text", "GemmaTokenizer" if is_tokenizers_available() else None),
        ("gemma3n", "GemmaTokenizer" if is_tokenizers_available() else None),
        ("gemma3n_text", "GemmaTokenizer" if is_tokenizers_available() else None),
        ("git", "BertTokenizer" if is_tokenizers_available() else None),
        ("glm", "TokenizersBackend" if is_tokenizers_available() else None),
        ("glm4", "TokenizersBackend" if is_tokenizers_available() else None),
        ("glm4_moe", "TokenizersBackend" if is_tokenizers_available() else None),
        ("glm4_moe_lite", "TokenizersBackend" if is_tokenizers_available() else None),
        ("glm4v", "TokenizersBackend" if is_tokenizers_available() else None),
        ("glm4v_moe", "TokenizersBackend" if is_tokenizers_available() else None),
        ("glm_image", "TokenizersBackend" if is_tokenizers_available() else None),
        ("glmasr", "TokenizersBackend" if is_tokenizers_available() else None),
        ("got_ocr2", "TokenizersBackend" if is_tokenizers_available() else None),
        ("gpt-sw3", "GPTSw3Tokenizer" if is_sentencepiece_available() else None),
        ("gpt2", "GPT2Tokenizer" if is_tokenizers_available() else None),
        ("gpt_bigcode", "GPT2Tokenizer" if is_tokenizers_available() else None),
        ("gpt_neo", "GPT2Tokenizer" if is_tokenizers_available() else None),
        ("gpt_neox", "GPTNeoXTokenizer" if is_tokenizers_available() else None),
        ("gpt_neox_japanese", "GPTNeoXJapaneseTokenizer"),
        ("gptj", "GPT2Tokenizer" if is_tokenizers_available() else None),
        ("granite", "GPT2Tokenizer"),
        ("granitemoe", "GPT2Tokenizer"),
        ("granitemoehybrid", "GPT2Tokenizer"),
        ("granitemoeshared", "GPT2Tokenizer"),
        ("grounding-dino", "BertTokenizer" if is_tokenizers_available() else None),
        ("groupvit", "CLIPTokenizer" if is_tokenizers_available() else None),
        ("herbert", "HerbertTokenizer" if is_tokenizers_available() else None),
        ("hubert", "Wav2Vec2CTCTokenizer"),
        ("ibert", "RobertaTokenizer"),
        ("idefics", "LlamaTokenizer" if is_tokenizers_available() else None),
        ("idefics2", "LlamaTokenizer" if is_tokenizers_available() else None),
        ("instructblip", "GPT2Tokenizer" if is_tokenizers_available() else None),
        ("instructblipvideo", "GPT2Tokenizer" if is_tokenizers_available() else None),
        ("internvl", "Qwen2Tokenizer" if is_tokenizers_available() else None),
        ("jais2", "GPT2Tokenizer" if is_tokenizers_available() else None),
        ("kosmos-2", "XLMRobertaTokenizer" if is_tokenizers_available() else None),
        ("lasr_ctc", "ParakeetTokenizer" if is_tokenizers_available() else None),
        ("lasr_encoder", "ParakeetTokenizer" if is_tokenizers_available() else None),
        ("layoutlm", "BertTokenizer" if is_tokenizers_available() else None),
        ("layoutlmv2", "LayoutLMv2Tokenizer" if is_tokenizers_available() else None),
        ("layoutlmv3", "LayoutLMv3Tokenizer" if is_tokenizers_available() else None),
        ("layoutxlm", "LayoutXLMTokenizer" if is_tokenizers_available() else None),
        ("led", "LEDTokenizer" if is_tokenizers_available() else None),
        ("lighton_ocr", "Qwen2TokenizerFast" if is_tokenizers_available() else None),
        ("lilt", "RobertaTokenizer" if is_tokenizers_available() else None),
        ("longformer", "RobertaTokenizer" if is_tokenizers_available() else None),
        ("longt5", "T5Tokenizer" if is_tokenizers_available() else None),
        ("luke", "LukeTokenizer"),
        ("lxmert", "LxmertTokenizer" if is_tokenizers_available() else None),
        ("m2m_100", "M2M100Tokenizer" if is_sentencepiece_available() else None),
        ("mamba", "GPTNeoXTokenizer" if is_tokenizers_available() else None),
        ("mamba2", "GPTNeoXTokenizer" if is_tokenizers_available() else None),
        ("marian", "MarianTokenizer" if is_sentencepiece_available() else None),
        ("markuplm", "MarkupLMTokenizer" if is_tokenizers_available() else None),
        ("mbart", "MBartTokenizer" if is_tokenizers_available() else None),
        ("mbart50", "MBart50Tokenizer" if is_tokenizers_available() else None),
        ("mega", "RobertaTokenizer"),
        ("megatron-bert", "BertTokenizer" if is_tokenizers_available() else None),
        ("metaclip_2", "XLMRobertaTokenizer" if is_tokenizers_available() else None),
        ("mgp-str", "MgpstrTokenizer"),
        # Mistral-family models prefer the mistral-common backend when installed.
        (
            "ministral3",
            "MistralCommonBackend"
            if is_mistral_common_available()
            else ("TokenizersBackend" if is_tokenizers_available() else None),
        ),
        (
            "mistral",
            "MistralCommonBackend"
            if is_mistral_common_available()
            else ("TokenizersBackend" if is_tokenizers_available() else None),
        ),
        (
            "mistral3",
            "MistralCommonBackend"
            if is_mistral_common_available()
            else ("TokenizersBackend" if is_tokenizers_available() else None),
        ),
        (
            "mixtral",
            "MistralCommonBackend"
            if is_mistral_common_available()
            else ("TokenizersBackend" if is_tokenizers_available() else None),
        ),
        ("mluke", "MLukeTokenizer" if is_sentencepiece_available() else None),
        ("mm-grounding-dino", "BertTokenizer" if is_tokenizers_available() else None),
        ("mobilebert", "MobileBertTokenizer" if is_tokenizers_available() else None),
        ("mpnet", "MPNetTokenizer" if is_tokenizers_available() else None),
        ("mpt", "GPTNeoXTokenizer" if is_tokenizers_available() else None),
        ("mra", "RobertaTokenizer"),
        ("mt5", "T5Tokenizer" if is_tokenizers_available() else None),
        ("musicgen", "T5Tokenizer" if is_tokenizers_available() else None),
        ("musicgen_melody", "T5Tokenizer" if is_tokenizers_available() else None),
        ("mvp", "MvpTokenizer" if is_tokenizers_available() else None),
        ("myt5", "MyT5Tokenizer"),
        ("nezha", "BertTokenizer" if is_tokenizers_available() else None),
        ("nllb", "NllbTokenizer" if is_tokenizers_available() else None),
        ("nllb-moe", "NllbTokenizer" if is_tokenizers_available() else None),
        ("nougat", "NougatTokenizer" if is_tokenizers_available() else None),
        ("nystromformer", "AlbertTokenizer" if is_tokenizers_available() else None),
        ("olmo", "GPTNeoXTokenizer" if is_tokenizers_available() else None),
        ("olmo2", "GPTNeoXTokenizer" if is_tokenizers_available() else None),
        ("olmo3", "GPT2Tokenizer" if is_tokenizers_available() else None),
        ("olmoe", "GPTNeoXTokenizer" if is_tokenizers_available() else None),
        ("omdet-turbo", "CLIPTokenizer" if is_tokenizers_available() else None),
        ("oneformer", "CLIPTokenizer" if is_tokenizers_available() else None),
        ("openai-gpt", "OpenAIGPTTokenizer" if is_tokenizers_available() else None),
        ("opt", "GPT2Tokenizer" if is_tokenizers_available() else None),
        ("ovis2", "Qwen2Tokenizer" if is_tokenizers_available() else None),
        ("owlv2", "CLIPTokenizer" if is_tokenizers_available() else None),
        ("owlvit", "CLIPTokenizer" if is_tokenizers_available() else None),
        ("pegasus", "PegasusTokenizer" if is_tokenizers_available() else None),
        ("pegasus_x", "PegasusTokenizer" if is_tokenizers_available() else None),
        ("perceiver", "PerceiverTokenizer"),
        ("phi", "GPT2Tokenizer" if is_tokenizers_available() else None),
        ("phobert", "PhobertTokenizer"),
        ("pix2struct", "T5Tokenizer" if is_tokenizers_available() else None),
        (
            "pixtral",
            "MistralCommonBackend"
            if is_mistral_common_available()
            else ("TokenizersBackend" if is_tokenizers_available() else None),
        ),
        ("plbart", "PLBartTokenizer" if is_tokenizers_available() else None),
        ("prophetnet", "ProphetNetTokenizer"),
        ("qdqbert", "BertTokenizer" if is_tokenizers_available() else None),
        ("qwen2", "Qwen2Tokenizer" if is_tokenizers_available() else None),
        ("qwen2_5_omni", "Qwen2Tokenizer" if is_tokenizers_available() else None),
        ("qwen2_5_vl", "Qwen2Tokenizer" if is_tokenizers_available() else None),
        ("qwen2_audio", "Qwen2Tokenizer" if is_tokenizers_available() else None),
        ("qwen2_moe", "Qwen2Tokenizer" if is_tokenizers_available() else None),
        ("qwen2_vl", "Qwen2Tokenizer" if is_tokenizers_available() else None),
        ("qwen3", "Qwen2Tokenizer" if is_tokenizers_available() else None),
        ("qwen3_moe", "Qwen2Tokenizer" if is_tokenizers_available() else None),
        ("qwen3_next", "Qwen2Tokenizer" if is_tokenizers_available() else None),
        ("qwen3_omni_moe", "Qwen2Tokenizer" if is_tokenizers_available() else None),
        ("qwen3_vl", "Qwen2Tokenizer" if is_tokenizers_available() else None),
        ("qwen3_vl_moe", "Qwen2Tokenizer" if is_tokenizers_available() else None),
        ("rag", "RagTokenizer"),
        ("realm", "BertTokenizer" if is_tokenizers_available() else None),
        ("recurrent_gemma", "GemmaTokenizer" if is_tokenizers_available() else None),
        ("reformer", "ReformerTokenizer" if is_tokenizers_available() else None),
        ("rembert", "RemBertTokenizer" if is_tokenizers_available() else None),
        ("retribert", "BertTokenizer" if is_tokenizers_available() else None),
        ("roberta", "RobertaTokenizer"),
        ("roberta-prelayernorm", "RobertaTokenizer"),
        ("roc_bert", "RoCBertTokenizer"),
        ("roformer", "RoFormerTokenizer" if is_tokenizers_available() else None),
        ("rwkv", "GPTNeoXTokenizer" if is_tokenizers_available() else None),
        ("sam3", "CLIPTokenizer" if is_tokenizers_available() else None),
        ("sam3_video", "CLIPTokenizer" if is_tokenizers_available() else None),
        ("seamless_m4t", "SeamlessM4TTokenizer" if is_tokenizers_available() else None),
        ("seamless_m4t_v2", "SeamlessM4TTokenizer" if is_tokenizers_available() else None),
        ("shieldgemma2", "GemmaTokenizer" if is_tokenizers_available() else None),
        ("siglip", "SiglipTokenizer" if is_sentencepiece_available() else None),
        ("siglip2", "Siglip2Tokenizer" if is_tokenizers_available() else None),
        ("speech_to_text", "Speech2TextTokenizer" if is_sentencepiece_available() else None),
        ("speecht5", "SpeechT5Tokenizer" if is_sentencepiece_available() else None),
        ("splinter", "SplinterTokenizer"),
        ("squeezebert", "BertTokenizer" if is_tokenizers_available() else None),
        ("stablelm", "GPTNeoXTokenizer" if is_tokenizers_available() else None),
        ("starcoder2", "GPT2Tokenizer" if is_tokenizers_available() else None),
        ("switch_transformers", "T5Tokenizer" if is_tokenizers_available() else None),
        ("t5", "T5Tokenizer" if is_tokenizers_available() else None),
        ("t5gemma", "GemmaTokenizer" if is_tokenizers_available() else None),
        ("tapas", "TapasTokenizer"),
        ("trocr", "XLMRobertaTokenizer" if is_tokenizers_available() else None),
        ("tvp", "BertTokenizer" if is_tokenizers_available() else None),
        ("udop", "UdopTokenizer" if is_tokenizers_available() else None),
        ("umt5", "T5Tokenizer" if is_tokenizers_available() else None),
        ("unispeech", "Wav2Vec2CTCTokenizer"),
        ("unispeech-sat", "Wav2Vec2CTCTokenizer"),
        ("vilt", "BertTokenizer" if is_tokenizers_available() else None),
        ("visual_bert", "BertTokenizer" if is_tokenizers_available() else None),
        ("vits", "VitsTokenizer"),
        (
            "voxtral",
            "MistralCommonBackend"
            if is_mistral_common_available()
            else ("TokenizersBackend" if is_tokenizers_available() else None),
        ),
        ("wav2vec2", "Wav2Vec2CTCTokenizer"),
        ("wav2vec2-bert", "Wav2Vec2CTCTokenizer"),
        ("wav2vec2-conformer", "Wav2Vec2CTCTokenizer"),
        ("wav2vec2_phoneme", "Wav2Vec2PhonemeCTCTokenizer"),
        ("whisper", "WhisperTokenizer" if is_tokenizers_available() else None),
        ("xclip", "CLIPTokenizer" if is_tokenizers_available() else None),
        ("xglm", "XGLMTokenizer" if is_tokenizers_available() else None),
        ("xlm", "XLMTokenizer"),
        ("xlm-roberta", "XLMRobertaTokenizer" if is_tokenizers_available() else None),
        ("xlm-roberta-xl", "XLMRobertaTokenizer" if is_tokenizers_available() else None),
        ("xlnet", "XLNetTokenizer" if is_tokenizers_available() else None),
        ("xlstm", "GPTNeoXTokenizer" if is_tokenizers_available() else None),
        ("xmod", "XLMRobertaTokenizer" if is_tokenizers_available() else None),
        ("yoso", "AlbertTokenizer" if is_tokenizers_available() else None),
    ]
)

# Lazy config-class -> tokenizer-class mapping used by `AutoTokenizer.from_pretrained`.
TOKENIZER_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TOKENIZER_MAPPING_NAMES)
# Reverse lookup: config class name -> model type string.
CONFIG_TO_TYPE = {v: k for k, v in CONFIG_MAPPING_NAMES.items()}
def load_vocab(vocab_file):
    """Read a JSON vocabulary file and return its contents as a dictionary."""
    with open(vocab_file, "r", encoding="utf-8") as handle:
        return json.load(handle)
def load_merges(merges_file):
    """Read a BPE merges file and return its merge rules as a list of tuples.

    Blank lines and lines starting with ``#`` (version headers / comments) are skipped.
    """
    with open(merges_file, "r", encoding="utf-8") as handle:
        stripped = (raw.strip() for raw in handle)
        return [tuple(entry.split()) for entry in stripped if entry and not entry.startswith("#")]
def tokenizer_class_from_name(class_name: str) -> type[Any] | None:
    """Resolve a tokenizer class name (e.g. ``"BertTokenizer"``) to the actual class.

    Lookup order: legacy Bloom aliases, user-registered aliases/classes, the literal
    backend name, the static name mapping (with a lazy module import), extra content
    registered on TOKENIZER_MAPPING, and finally the main ``transformers`` init (which
    may hold a dummy object carrying a helpful missing-dependency error). Returns None
    when nothing matches.
    """
    # Bloom tokenizer classes were removed but should map to the fast backend for BC
    if class_name in {"BloomTokenizer", "BloomTokenizerFast"}:
        return TokenizersBackend
    if class_name in REGISTERED_FAST_ALIASES:
        return REGISTERED_FAST_ALIASES[class_name]
    if class_name in REGISTERED_TOKENIZER_CLASSES:
        return REGISTERED_TOKENIZER_CLASSES[class_name]
    if class_name == "TokenizersBackend":
        return TokenizersBackend
    # V5: TOKENIZER_MAPPING_NAMES now maps to single strings, not tuples
    for module_name, tokenizer_class in TOKENIZER_MAPPING_NAMES.items():
        if tokenizer_class == class_name:
            module_name = model_type_to_module_name(module_name)
            # MistralCommonBackend lives outside transformers.models, in its own module.
            if (
                module_name in ["mistral", "mistral3", "mixtral", "ministral", "ministral3", "pixtral", "voxtral"]
                and class_name == "MistralCommonBackend"
            ):
                module = importlib.import_module(".tokenization_mistral_common", "transformers")
            else:
                module = importlib.import_module(f".{module_name}", "transformers.models")
            try:
                return getattr(module, class_name)
            except AttributeError:
                # Several model types may share a class name; keep scanning the mapping.
                continue
    for tokenizer in TOKENIZER_MAPPING._extra_content.values():
        if getattr(tokenizer, "__name__", None) == class_name:
            return tokenizer
    # We did not find the class, but maybe it's because a dep is missing. In that case, the class will be in the main
    # init and we return the proper dummy to get an appropriate error message.
    main_module = importlib.import_module("transformers")
    if hasattr(main_module, class_name):
        return getattr(main_module, class_name)
    return None
def get_tokenizer_config(
    pretrained_model_name_or_path: str | os.PathLike[str],
    cache_dir: str | os.PathLike[str] | None = None,
    force_download: bool = False,
    proxies: dict[str, str] | None = None,
    token: bool | str | None = None,
    revision: str | None = None,
    local_files_only: bool = False,
    subfolder: str = "",
    **kwargs,
) -> dict[str, Any]:
    """
    Load the tokenizer configuration from a pretrained model tokenizer configuration.

    Args:
        pretrained_model_name_or_path (`str` or `os.PathLike`):
            Either the *model id* of a pretrained model configuration hosted on huggingface.co, or a path to a
            *directory* containing a configuration file saved with [`~PreTrainedTokenizer.save_pretrained`],
            e.g., `./my_model_directory/`.
        cache_dir (`str` or `os.PathLike`, *optional*):
            Directory in which a downloaded pretrained model configuration should be cached if the standard
            cache should not be used.
        force_download (`bool`, *optional*, defaults to `False`):
            Whether to force a (re-)download of the configuration files, overriding cached versions.
        proxies (`dict[str, str]`, *optional*):
            Proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
            'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
        token (`str` or *bool*, *optional*):
            HTTP bearer authorization token for remote files. If `True`, uses the token generated by
            `hf auth login` (stored in `~/.huggingface`). Required for private models.
        revision (`str`, *optional*, defaults to `"main"`):
            Model version to use: a branch name, tag name, or commit id (any git identifier works, since models
            are stored with a git-based system on huggingface.co).
        local_files_only (`bool`, *optional*, defaults to `False`):
            If `True`, only attempt to load the tokenizer configuration from local files.
        subfolder (`str`, *optional*, defaults to `""`):
            Folder name, in case the tokenizer config lives inside a subfolder of the model repo.

    Returns:
        `dict`: The configuration of the tokenizer, or an empty dict when no tokenizer config file exists.

    Examples:

    ```python
    # Download configuration from huggingface.co and cache.
    tokenizer_config = get_tokenizer_config("google-bert/bert-base-uncased")
    # This model does not have a tokenizer config so the result will be an empty dict.
    tokenizer_config = get_tokenizer_config("FacebookAI/xlm-roberta-base")
    # Save a pretrained tokenizer locally and you can reload its config
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
    tokenizer.save_pretrained("tokenizer-test")
    tokenizer_config = get_tokenizer_config("tokenizer-test")
    ```"""
    commit_hash = kwargs.get("_commit_hash")
    # Resolve the file without raising: a missing tokenizer config is an expected, recoverable case.
    config_path = cached_file(
        pretrained_model_name_or_path,
        TOKENIZER_CONFIG_FILE,
        cache_dir=cache_dir,
        force_download=force_download,
        proxies=proxies,
        token=token,
        revision=revision,
        local_files_only=local_files_only,
        subfolder=subfolder,
        _raise_exceptions_for_gated_repo=False,
        _raise_exceptions_for_missing_entries=False,
        _raise_exceptions_for_connection_errors=False,
        _commit_hash=commit_hash,
    )
    if config_path is None:
        logger.info("Could not locate the tokenizer configuration file, will try to use the model config instead.")
        return {}
    commit_hash = extract_commit_hash(config_path, commit_hash)

    with open(config_path, encoding="utf-8") as fp:
        tokenizer_config = json.load(fp)
    # Record the resolved commit so downstream loads can pin the same revision.
    tokenizer_config["_commit_hash"] = commit_hash
    return tokenizer_config
class AutoTokenizer:
    r"""
    This is a generic tokenizer class that will be instantiated as one of the tokenizer classes of the library when
    created with the [`AutoTokenizer.from_pretrained`] class method.

    This class cannot be instantiated directly using `__init__()` (throws an error).
    """

    def __init__(self):
        raise OSError(
            "AutoTokenizer is designed to be instantiated "
            "using the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)` method."
        )

    @classmethod
    @replace_list_option_in_docstrings(TOKENIZER_MAPPING_NAMES)
    def from_pretrained(
        cls, pretrained_model_name_or_path, *inputs, **kwargs
        # BUGFIX: the annotation must be a string — `TokenizersBackend` and `SentencePieceBackend`
        # are bound to `None` when their optional dependency is missing, and evaluating
        # `None | None` eagerly would raise a TypeError at class-definition time.
    ) -> "TokenizersBackend | SentencePieceBackend":
        r"""
        Instantiate one of the tokenizer classes of the library from a pretrained model vocabulary.

        The tokenizer class to instantiate is selected based on the `model_type` property of the config object (either
        passed as an argument or loaded from `pretrained_model_name_or_path` if possible), or when it's missing, by
        falling back to using pattern matching on `pretrained_model_name_or_path`:

        List options

        Params:
            pretrained_model_name_or_path (`str` or `os.PathLike`):
                Can be either:

                - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.
                - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved
                  using the [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
                - A path or url to a single saved vocabulary file if and only if the tokenizer only requires a
                  single vocabulary file (like Bert or XLNet), e.g.: `./my_model_directory/vocab.txt`. (Not
                  applicable to all derived classes)
            inputs (additional positional arguments, *optional*):
                Will be passed along to the Tokenizer `__init__()` method.
            config ([`PreTrainedConfig`], *optional*)
                The configuration object used to determine the tokenizer class to instantiate.
            cache_dir (`str` or `os.PathLike`, *optional*):
                Path to a directory in which a downloaded pretrained model configuration should be cached if the
                standard cache should not be used.
            force_download (`bool`, *optional*, defaults to `False`):
                Whether or not to force the (re-)download the model weights and configuration files and override the
                cached versions if they exist.
            proxies (`dict[str, str]`, *optional*):
                A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
            revision (`str`, *optional*, defaults to `"main"`):
                The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
                git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
                identifier allowed by git.
            subfolder (`str`, *optional*):
                In case the relevant files are located inside a subfolder of the model repo on huggingface.co (e.g. for
                facebook/rag-token-base), specify it here.
            tokenizer_type (`str`, *optional*):
                Tokenizer type to be loaded.
            backend (`str`, *optional*, defaults to `"tokenizers"`):
                Backend to use for tokenization. Valid options are:

                - `"tokenizers"`: Use the HuggingFace tokenizers library backend (default)
                - `"sentencepiece"`: Use the SentencePiece backend
            trust_remote_code (`bool`, *optional*, defaults to `False`):
                Whether or not to allow for custom models defined on the Hub in their own modeling files. This option
                should only be set to `True` for repositories you trust and in which you have read the code, as it will
                execute code present on the Hub on your local machine.
            kwargs (additional keyword arguments, *optional*):
                Will be passed to the Tokenizer `__init__()` method. Can be used to set special tokens like
                `bos_token`, `eos_token`, `unk_token`, `sep_token`, `pad_token`, `cls_token`, `mask_token`,
                `additional_special_tokens`. See parameters in the `__init__()` for more details.

        Examples:

        ```python
        >>> from transformers import AutoTokenizer

        >>> # Download vocabulary from huggingface.co and cache.
        >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")

        >>> # Download vocabulary from huggingface.co (user-uploaded) and cache.
        >>> tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-german-cased")

        >>> # If vocabulary files are in a directory (e.g. tokenizer was saved using *save_pretrained('./test/saved_model/')*)
        >>> # tokenizer = AutoTokenizer.from_pretrained("./test/bert_saved_model/")

        >>> # Download vocabulary from huggingface.co and define model-specific arguments
        >>> tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base", add_prefix_space=True)

        >>> # Explicitly use the tokenizers backend
        >>> tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer", backend="tokenizers")

        >>> # Explicitly use the sentencepiece backend
        >>> tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer", backend="sentencepiece")
        ```"""
        config = kwargs.pop("config", None)
        kwargs["_from_auto"] = True
        # V5: Always use fast tokenizers, ignore use_fast parameter
        _ = kwargs.pop("use_fast", None)
        tokenizer_type = kwargs.pop("tokenizer_type", None)
        trust_remote_code = kwargs.pop("trust_remote_code", None)
        gguf_file = kwargs.get("gguf_file")

        # First, let's see whether the tokenizer_type is passed so that we can leverage it
        if tokenizer_type is not None:
            tokenizer_class_name = TOKENIZER_MAPPING_NAMES.get(tokenizer_type, None)
            if tokenizer_class_name is None:
                raise ValueError(
                    f"Passed `tokenizer_type` {tokenizer_type} does not exist. `tokenizer_type` should be one of "
                    f"{', '.join(c for c in TOKENIZER_MAPPING_NAMES)}."
                )
            tokenizer_class = tokenizer_class_from_name(tokenizer_class_name)
            if tokenizer_class is None:
                raise ValueError(f"Tokenizer class {tokenizer_class_name} is not currently imported.")
            return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)

        if gguf_file:
            # GGUF checkpoints carry the config inline; extract it instead of fetching config.json.
            gguf_path = cached_file(pretrained_model_name_or_path, gguf_file, **kwargs)
            config_dict = load_gguf_checkpoint(gguf_path, return_tensors=False)["config"]
            config = AutoConfig.for_model(**config_dict)
        elif config is None:
            try:
                config = AutoConfig.from_pretrained(
                    pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs
                )
            except Exception:
                # Some repos only ship a tokenizer; fall back to the generic config loader.
                config = PreTrainedConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
        config_model_type = config.model_type

        # Next, let's try to use the tokenizer_config file to get the tokenizer class.
        tokenizer_config = get_tokenizer_config(pretrained_model_name_or_path, **kwargs)
        tokenizer_config_class = tokenizer_config.get("tokenizer_class", None)

        # Check for auto_map early to handle dynamic tokenizers properly
        tokenizer_auto_map = None
        if "auto_map" in tokenizer_config:
            if isinstance(tokenizer_config["auto_map"], (tuple, list)):
                # Legacy format for dynamic tokenizers
                tokenizer_auto_map = tokenizer_config["auto_map"]
            else:
                tokenizer_auto_map = tokenizer_config["auto_map"].get("AutoTokenizer", None)

        # if there is a config, we can check that the tokenizer class != than model class and can thus assume we need to use TokenizersBackend
        # Skip this early exit if auto_map is present (custom tokenizer with trust_remote_code)
        if (
            tokenizer_auto_map is None
            and tokenizer_config_class is not None
            and config_model_type is not None
            and config_model_type != ""
            # BUGFIX: mapping values may be `None` (optional dependency missing); `or ""`
            # avoids calling `.replace` on `None`.
            and (TOKENIZER_MAPPING_NAMES.get(config_model_type) or "").replace("Fast", "")
            != tokenizer_config_class.replace("Fast", "")
        ):
            # new model, but we ignore it unless the model type is the same
            try:
                return TokenizersBackend.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
            except Exception:
                fallback_class = tokenizer_class_from_name(tokenizer_config_class)
                if fallback_class is None:
                    # BUGFIX: re-raise the original failure instead of crashing on
                    # `None.from_pretrained` when the named class cannot be resolved.
                    raise
                return fallback_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)

        if "_commit_hash" in tokenizer_config:
            kwargs["_commit_hash"] = tokenizer_config["_commit_hash"]
        if tokenizer_config_class:
            # V5: normalize away the legacy "Fast" suffix — there is a single class per model type.
            tokenizer_config_class = tokenizer_config_class.replace("Fast", "")

        has_remote_code = tokenizer_auto_map is not None
        has_local_code = type(config) in TOKENIZER_MAPPING or (
            tokenizer_config_class is not None
            and (
                tokenizer_class_from_name(tokenizer_config_class) is not None
                or tokenizer_class_from_name(tokenizer_config_class + "Fast") is not None
            )
        )
        if has_remote_code:
            # V5: Always prefer fast tokenizer (index 1), fallback to slow (index 0)
            if isinstance(tokenizer_auto_map, (tuple, list)):
                class_ref = tokenizer_auto_map[1] if tokenizer_auto_map[1] is not None else tokenizer_auto_map[0]
            else:
                # BUGFIX: some repos store a single class reference string rather than a
                # (slow, fast) pair; indexing it would silently yield single characters.
                class_ref = tokenizer_auto_map
            upstream_repo = class_ref.split("--")[0] if "--" in class_ref else None
            trust_remote_code = resolve_trust_remote_code(
                trust_remote_code, pretrained_model_name_or_path, has_local_code, has_remote_code, upstream_repo
            )

        if has_remote_code and trust_remote_code:
            tokenizer_class = get_class_from_dynamic_module(class_ref, pretrained_model_name_or_path, **kwargs)
            _ = kwargs.pop("code_revision", None)
            tokenizer_class.register_for_auto_class()
            return tokenizer_class.from_pretrained(
                pretrained_model_name_or_path, *inputs, trust_remote_code=trust_remote_code, **kwargs
            )
        elif tokenizer_config_class is not None:
            tokenizer_class_candidate = tokenizer_config_class
            tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate)
            if tokenizer_class is None and not tokenizer_class_candidate.endswith("Fast"):
                tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate + "Fast")
            if tokenizer_class is not None and tokenizer_class.__name__ == "PythonBackend":
                tokenizer_class = TokenizersBackend
            # Fallback to TokenizersBackend if the class wasn't found
            if tokenizer_class is None:
                tokenizer_class = TokenizersBackend
            return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
        elif getattr(config, "tokenizer_class", None):
            _class = config.tokenizer_class
            if "PreTrainedTokenizerFast" not in _class:
                _class = _class.replace("Fast", "")
            tokenizer_class = tokenizer_class_from_name(_class)
            if tokenizer_class is None:
                # BUGFIX: raise a clear error instead of `AttributeError: 'NoneType' object
                # has no attribute 'from_pretrained'`.
                raise ValueError(f"Tokenizer class {_class} does not exist or is not currently imported.")
            return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)

        # Otherwise we have to be creative.
        # if model is an encoder decoder, the encoder tokenizer class is used by default
        if isinstance(config, EncoderDecoderConfig):
            if type(config.decoder) is not type(config.encoder):
                logger.warning(
                    f"The encoder model config class: {config.encoder.__class__} is different from the decoder model "
                    f"config class: {config.decoder.__class__}. It is not recommended to use the "
                    "`AutoTokenizer.from_pretrained()` method in this case. Please use the encoder and decoder "
                    "specific tokenizer classes."
                )
            config = config.encoder

        model_type = config_class_to_model_type(type(config).__name__) or getattr(config, "model_type", None)
        if model_type is not None:
            tokenizer_class = TOKENIZER_MAPPING.get(type(config), TokenizersBackend)
            if tokenizer_class is not None:
                return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
            # Fallback: try tokenizer_class from tokenizer_config.json
            tokenizer_config_class = tokenizer_config.get("tokenizer_class", None)
            if tokenizer_config_class is not None:
                if tokenizer_config_class != "TokenizersBackend" and "Fast" in tokenizer_config_class:
                    tokenizer_config_class = tokenizer_config_class[:-4]
                tokenizer_class = tokenizer_class_from_name(tokenizer_config_class)
                if tokenizer_class is None and not tokenizer_config_class.endswith("Fast"):
                    tokenizer_class = tokenizer_class_from_name(tokenizer_config_class + "Fast")
                if tokenizer_class is not None and tokenizer_class.__name__ == "PythonBackend":
                    tokenizer_class = TokenizersBackend
                if tokenizer_class is None:
                    tokenizer_class = TokenizersBackend
                return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)

        raise ValueError(
            f"Unrecognized configuration class {config.__class__} to build an AutoTokenizer.\n"
            f"Model type should be one of {', '.join(c.__name__ for c in TOKENIZER_MAPPING)}."
        )

    @staticmethod
    def register(
        config_class, tokenizer_class=None, slow_tokenizer_class=None, fast_tokenizer_class=None, exist_ok=False
    ):
        """
        Register a new tokenizer in this mapping.

        Args:
            config_class ([`PreTrainedConfig`]):
                The configuration corresponding to the model to register.
            tokenizer_class: The tokenizer class to register (V5 - preferred parameter).
            slow_tokenizer_class: (Deprecated) The slow tokenizer to register.
            fast_tokenizer_class: (Deprecated) The fast tokenizer to register.
            exist_ok (`bool`, *optional*, defaults to `False`):
                If `True`, silently overwrite an existing entry for `config_class`.

        Raises:
            ValueError: If no tokenizer class is provided through any of the three parameters.
        """
        if tokenizer_class is None:
            # Legacy: prefer fast over slow
            if fast_tokenizer_class is not None:
                tokenizer_class = fast_tokenizer_class
            elif slow_tokenizer_class is not None:
                tokenizer_class = slow_tokenizer_class
            else:
                raise ValueError("You need to pass a `tokenizer_class`")
        # Make every provided class resolvable by name via `tokenizer_class_from_name`.
        for candidate in (slow_tokenizer_class, fast_tokenizer_class, tokenizer_class):
            if candidate is not None:
                REGISTERED_TOKENIZER_CLASSES[candidate.__name__] = candidate
        if slow_tokenizer_class is not None and fast_tokenizer_class is not None:
            # Alias the slow name to the fast class so legacy lookups keep working.
            REGISTERED_FAST_ALIASES[slow_tokenizer_class.__name__] = fast_tokenizer_class
        TOKENIZER_MAPPING.register(config_class, tokenizer_class, exist_ok=exist_ok)
__all__ = ["TOKENIZER_MAPPING", "AutoTokenizer"]