@@ -13,7 +13,18 @@ def clean_and_create_vocabulary(
1313 vocabulary_to_add : list [str ],
1414 token_remove_regex : re .Pattern [str ] | None ,
1515) -> TokenizerModel :
16- """Clean a vocabulary by removing duplicates and tokens that were already in the vocabulary."""
16+ """
17+ Clean a vocabulary by removing duplicates and tokens that were already in the vocabulary.
18+
19+ This function removes duplicate tokens and tokens that are already in the model's vocabulary.
20+ It also removes the tokenizer's post-processor, which we do not use anyway, and calls `prune_added_tokens` on the model before the new tokens are added.
21+
22+ :param model: The tokenizer model to clean.
23+ :param vocabulary_to_add: The vocabulary to add to the model. Any tokens in this vocabulary that
24+ are split according to the pretokenizer are added as multiword tokens.
25+ :param token_remove_regex: A regex pattern to remove tokens from the vocabulary.
26+ :return: The cleaned tokenizer model.
27+ """
1728 seen_tokens = set ()
1829
1930 n_duplicate = 0
@@ -39,7 +50,9 @@ def clean_and_create_vocabulary(
3950 if len (preprocessed ) > 1 :
4051 tokens_as_str = [f"'{ subword } '" for subword in preprocessed ]
4152 split_into = "," .join (tokens_as_str )
42- logger .warning (f"Token '{ token } ' was split into multiple tokens after preprocessing: [{ split_into } ]" )
53+ logger .warning (
54+ f"Token '{ token } ' was split into multiple tokens after preprocessing: [{ split_into } ], adding it as a multi-word token."
55+ )
4356 added_tokens_to_add .append (token )
4457 continue
4558 token = preprocessed [0 ]
@@ -54,6 +67,8 @@ def clean_and_create_vocabulary(
5467 seen_tokens .add (token )
5568 tokens_to_add .append (token )
5669
70+ model .post_processor = None
71+ model = model .prune_added_tokens ()
5772 model = model .add_tokens_to_vocabulary (tokens_to_add , preprocess_tokens = True )
5873 model = model .add_addedtokens (added_tokens_to_add , is_special = False , single_word = False , normalized = True )
5974
0 commit comments