Skip to content

Commit b3012ee

Browse files
authored
fix: multiword broken (#317)
* fix: multiword broken
* increase version
1 parent 226316c commit b3012ee

4 files changed

Lines changed: 25 additions & 9 deletions

File tree

model2vec/distill/distillation.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -88,12 +88,6 @@ def distill_from_model(
8888

8989
# Create the vocabulary in the new tokenizer.
9090
tokenizer_model = clean_and_create_vocabulary(tokenizer_model, vocabulary, token_remove_regex=token_remove_regex)
91-
# Remove the post processor, this is not necessary.
92-
tokenizer_model.post_processor = None
93-
# Prune again now that the post processor is gone.
94-
# We can't do this before because we need the post processor and associated
95-
# tokens before to add eos/bos.
96-
tokenizer_model = tokenizer_model.prune_added_tokens()
9791

9892
# All tokens in a single list.
9993
all_tokens = tokenizer_model.sorted_vocabulary

model2vec/tokenizer/tokenizer.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,18 @@ def clean_and_create_vocabulary(
1313
vocabulary_to_add: list[str],
1414
token_remove_regex: re.Pattern[str] | None,
1515
) -> TokenizerModel:
16-
"""Clean a vocabulary by removing duplicates and tokens that were already in the vocabulary."""
16+
"""
17+
Clean a vocabulary by removing duplicates and tokens that were already in the vocabulary.
18+
19+
This function removes duplicate tokens and tokens that are already in the model's vocabulary.
20+
It also removes the tokenizer's post-processor, which we do not use anyway.
21+
22+
:param model: The tokenizer model to clean.
23+
:param vocabulary_to_add: The vocabulary to add to the model. Any tokens in this vocabulary that
24+
are split according to the pretokenizer are added as multiword tokens.
25+
:param token_remove_regex: A regex pattern to remove tokens from the vocabulary.
26+
:return: The cleaned tokenizer model.
27+
"""
1728
seen_tokens = set()
1829

1930
n_duplicate = 0
@@ -39,7 +50,9 @@ def clean_and_create_vocabulary(
3950
if len(preprocessed) > 1:
4051
tokens_as_str = [f"'{subword}'" for subword in preprocessed]
4152
split_into = ",".join(tokens_as_str)
42-
logger.warning(f"Token '{token}' was split into multiple tokens after preprocessing: [{split_into}]")
53+
logger.warning(
54+
f"Token '{token}' was split into multiple tokens after preprocessing: [{split_into}], adding it as a multi-word token."
55+
)
4356
added_tokens_to_add.append(token)
4457
continue
4558
token = preprocessed[0]
@@ -54,6 +67,8 @@ def clean_and_create_vocabulary(
5467
seen_tokens.add(token)
5568
tokens_to_add.append(token)
5669

70+
model.post_processor = None
71+
model = model.prune_added_tokens()
5772
model = model.add_tokens_to_vocabulary(tokens_to_add, preprocess_tokens=True)
5873
model = model.add_addedtokens(added_tokens_to_add, is_special=False, single_word=False, normalized=True)
5974

model2vec/version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
__version_triple__ = (0, 8, 1)
1+
__version_triple__ = (0, 8, 2)
22
__version__ = ".".join(map(str, __version_triple__))

tests/test_distillation.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
(None, 1024, None), # Subword, PCA set high, SIF off
3838
(None, None, 1e-4), # No PCA, SIF on
3939
(None, 0.9, 1e-4), # PCA as float (variance), SIF on
40+
(["star wars"], 8, None), # Multiword vocabulary
4041
],
4142
)
4243
@patch.object(import_module("model2vec.distill.distillation"), "model_info")
@@ -79,6 +80,12 @@ def test_distill_from_model(
7980
assert json.loads(static_model.tokenizer.to_str()) == json.loads(static_model2.tokenizer.to_str())
8081
assert static_model.base_model_name == static_model2.base_model_name
8182

83+
for token in vocabulary or []:
84+
# Normalized tokens are for single-word tokens.
85+
# Other tokens are added as addedtokens, as is.
86+
normalized = static_model.tokenizer.normalizer.normalize_str(token)
87+
assert token in static_model.tokens or normalized in static_model.tokens
88+
8289

8390
@patch.object(import_module("model2vec.distill.distillation"), "model_info")
8491
@patch("transformers.AutoModel.from_pretrained")

0 commit comments

Comments (0)