
Commit 19d1f5f

Merge branch 'main' of github.com:huggingface/tokenizers into fix-stub
2 parents 810aa57 + b83d7c9

6 files changed: +23 -14 lines changed


bindings/python/src/decoders.rs

Lines changed: 1 addition & 1 deletion
@@ -690,7 +690,7 @@ impl PyDecodeStream {
     ///     The tokenizer to use for decoding
     /// id (:obj:`int` or `List[int]`):
     ///     The next token id or list of token ids to add to the stream
-    ///
+    ///
     ///
     /// Returns:
     ///     :obj:`Optional[str]`: The next decoded string chunk, or None if not enough
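
The docstring above belongs to the streaming decode API. For context, a minimal usage sketch of the step-wise decoding it describes (the `DecodeStream` import path, the `skip_special_tokens` argument, and the use of the `gpt2` tokenizer are assumptions, not part of this diff):

from tokenizers import Tokenizer
from tokenizers.decoders import DecodeStream

# Any trained tokenizer works; fetching "gpt2" requires network access.
tokenizer = Tokenizer.from_pretrained("gpt2")

stream = DecodeStream(skip_special_tokens=True)
for token_id in tokenizer.encode("incremental decoding").ids:
    # Returns the next decoded chunk, or None if not enough ids were seen yet.
    chunk = stream.step(tokenizer, token_id)
    if chunk is not None:
        print(chunk, end="")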

bindings/python/src/lib.rs

Lines changed: 1 addition & 10 deletions
@@ -37,22 +37,13 @@ pub const VERSION: &str = env!("CARGO_PKG_VERSION");
 
 // For users using multiprocessing in python, it is quite easy to fork the process running
 // tokenizers, ending up with a deadlock because we internally make use of multithreading. So
-// we register a callback to be called in the event of a fork so that we can warn the user.
+// we register a callback to be called in the event of a fork to disable parallelism.
 #[cfg(target_family = "unix")]
 static mut REGISTERED_FORK_CALLBACK: bool = false;
 #[cfg(target_family = "unix")]
 extern "C" fn child_after_fork() {
     use tk::parallelism::*;
     if has_parallelism_been_used() && !is_parallelism_configured() {
-        eprintln!(
-            "huggingface/tokenizers: The current process just got forked, after parallelism has \
-            already been used. Disabling parallelism to avoid deadlocks..."
-        );
-        eprintln!("To disable this warning, you can either:");
-        eprintln!(
-            "\t- Avoid using `tokenizers` before the fork if possible\n\
-            \t- Explicitly set the environment variable {ENV_VARIABLE}=(true | false)"
-        );
         set_parallelism(false);
     }
 }
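
With the warning removed, the fork handler now silently turns parallelism off unless it was configured explicitly. A minimal sketch of opting in or out up front from Python (assuming the variable referred to by `ENV_VARIABLE` is the usual `TOKENIZERS_PARALLELISM`):

import os

# Setting the variable before the library is used counts as an explicit
# configuration, so child_after_fork() leaves the setting untouched.
os.environ["TOKENIZERS_PARALLELISM"] = "false"  # or "true"

from tokenizers import Tokenizer  # imported after configuring the env var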

bindings/python/src/processors.rs

Lines changed: 5 additions & 0 deletions
@@ -484,6 +484,11 @@ impl PyRobertaProcessing {
 /// Args:
 ///     trim_offsets (:obj:`bool`):
 ///         Whether to trim the whitespaces from the produced offsets.
+///
+///     add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
+///         If :obj:`True`, keeps the first token's offset as is. If :obj:`False`, increments
+///         the start of the first token's offset by 1. Only has an effect if :obj:`trim_offsets`
+///         is set to :obj:`True`.
 #[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name = "ByteLevel")]
 pub struct PyByteLevel {}
 #[pymethods]
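
A minimal sketch of the documented combinations from Python (only construction is shown here; the resulting offsets for each case are asserted in the test diff further below):

from tokenizers.processors import ByteLevel

# Default: the first token's offset is kept as is while trimming whitespace.
keep_prefix = ByteLevel(trim_offsets=True, add_prefix_space=True)

# Start of the first token's offset is incremented by 1, excluding the
# encoded prefix space ("Ġ") from the reported span.
drop_prefix = ByteLevel(trim_offsets=True, add_prefix_space=False)

# Without trim_offsets, add_prefix_space has no effect on the offsets.
untrimmed = ByteLevel(trim_offsets=False, add_prefix_space=True)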

bindings/python/src/token.rs

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 use pyo3::prelude::*;
 use tk::Token;
 
-#[pyclass(module = "tokenizers", name = "Token")]
+#[pyclass(module = "tokenizers", name = "Token", frozen)]
 #[derive(Clone)]
 pub struct PyToken {
     token: Token,

bindings/python/src/utils/regex.rs

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@ use pyo3::prelude::*;
 use tk::utils::SysRegex;
 
 /// Instantiate a new Regex with the given pattern
-#[pyclass(module = "tokenizers", name = "Regex")]
+#[pyclass(module = "tokenizers", name = "Regex", frozen)]
 pub struct PyRegex {
     pub inner: SysRegex,
     pub pattern: String,
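
On the PyO3 side, `frozen` marks `Regex` (and `Token` above) as immutable once built, which matches how they are used: the pattern is compiled at construction time and only read afterwards. A small usage sketch (the `Split` pre-tokenizer and `pre_tokenize_str` call are existing tokenizers APIs, not part of this diff):

from tokenizers import Regex
from tokenizers.pre_tokenizers import Split

# The compiled pattern is only read after construction.
digits = Regex(r"\d+")
splitter = Split(pattern=digits, behavior="isolated")
print(splitter.pre_tokenize_str("abc123def"))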

bindings/python/tests/bindings/test_processors.py

Lines changed: 14 additions & 1 deletion
@@ -66,6 +66,7 @@ class TestByteLevelProcessing:
     def test_instantiate(self):
         assert ByteLevel() is not None
         assert ByteLevel(trim_offsets=True) is not None
+        assert ByteLevel(add_prefix_space=True) is not None
         assert isinstance(ByteLevel(), PostProcessor)
         assert isinstance(ByteLevel(), ByteLevel)
         assert isinstance(pickle.loads(pickle.dumps(ByteLevel())), ByteLevel)
@@ -82,11 +83,23 @@ def test_processing(self, roberta_files):
         assert output.offsets == [(0, 2), (2, 7), (7, 10), (10, 15)]
 
         # Trims offsets when activated
-        tokenizer.post_processor = ByteLevel(trim_offsets=True)
+        tokenizer.post_processor = ByteLevel(trim_offsets=True, add_prefix_space=True)
         output = tokenizer.encode("My name is John")
         assert output.tokens == ["ĠMy", "Ġname", "Ġis", "ĠJohn"]
         assert output.offsets == [(0, 2), (3, 7), (8, 10), (11, 15)]
 
+        # Trims offsets without adding prefix space at first token
+        tokenizer.post_processor = ByteLevel(trim_offsets=True, add_prefix_space=False)
+        output = tokenizer.encode("My name is John")
+        assert output.tokens == ["ĠMy", "Ġname", "Ġis", "ĠJohn"]
+        assert output.offsets == [(1, 2), (3, 7), (8, 10), (11, 15)]
+
+        # add_prefix_space without trimming offsets has no effect
+        tokenizer.post_processor = ByteLevel(trim_offsets=False, add_prefix_space=True)
+        output = tokenizer.encode("My name is John")
+        assert output.tokens == ["ĠMy", "Ġname", "Ġis", "ĠJohn"]
+        assert output.offsets == [(0, 2), (2, 7), (7, 10), (10, 15)]
+
     def test_manual_reload(self):
         byte_level = ByteLevel()
         state = json.loads(byte_level.__getstate__())
