
Commit 19d1f5f

Merge branch 'main' of github.com:huggingface/tokenizers into fix-stub
2 parents 810aa57 + b83d7c9

6 files changed: +23 -14 lines changed


bindings/python/src/decoders.rs

Lines changed: 1 addition & 1 deletion
@@ -690,7 +690,7 @@ impl PyDecodeStream {
     ///     The tokenizer to use for decoding
     /// id (:obj:`int` or `List[int]`):
     ///     The next token id or list of token ids to add to the stream
-    ///
+    ///
     ///
     /// Returns:
     ///     :obj:`Optional[str]`: The next decoded string chunk, or None if not enough
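
The docstring above belongs to the streaming decode API. For context, a minimal usage sketch of the step-wise decoding it describes (the `DecodeStream` import path, the `skip_special_tokens` argument, and the use of the `gpt2` tokenizer are assumptions, not part of this diff):

from tokenizers import Tokenizer
from tokenizers.decoders import DecodeStream

# Any trained tokenizer works; fetching "gpt2" requires network access.
tokenizer = Tokenizer.from_pretrained("gpt2")

stream = DecodeStream(skip_special_tokens=True)
for token_id in tokenizer.encode("incremental decoding").ids:
    # Returns the next decoded chunk, or None if not enough ids were seen yet.
    chunk = stream.step(tokenizer, token_id)
    if chunk is not None:
        print(chunk, end="")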

bindings/python/src/lib.rs

Lines changed: 1 addition & 10 deletions
@@ -37,22 +37,13 @@ pub const VERSION: &str = env!("CARGO_PKG_VERSION");
 
 // For users using multiprocessing in python, it is quite easy to fork the process running
 // tokenizers, ending up with a deadlock because we internally make use of multithreading. So
-// we register a callback to be called in the event of a fork so that we can warn the user.
+// we register a callback to be called in the event of a fork to disable parallelism.
 #[cfg(target_family = "unix")]
 static mut REGISTERED_FORK_CALLBACK: bool = false;
 #[cfg(target_family = "unix")]
 extern "C" fn child_after_fork() {
     use tk::parallelism::*;
     if has_parallelism_been_used() && !is_parallelism_configured() {
-        eprintln!(
-            "huggingface/tokenizers: The current process just got forked, after parallelism has \
-            already been used. Disabling parallelism to avoid deadlocks..."
-        );
-        eprintln!("To disable this warning, you can either:");
-        eprintln!(
-            "\t- Avoid using `tokenizers` before the fork if possible\n\
-            \t- Explicitly set the environment variable {ENV_VARIABLE}=(true | false)"
-        );
         set_parallelism(false);
     }
 }
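
With the warning removed, the fork handler now silently turns parallelism off unless it was configured explicitly. A minimal sketch of opting in or out up front from Python (assuming the variable referred to by `ENV_VARIABLE` is the usual `TOKENIZERS_PARALLELISM`):

import os

# Setting the variable before the library is used counts as an explicit
# configuration, so child_after_fork() leaves the setting untouched.
os.environ["TOKENIZERS_PARALLELISM"] = "false"  # or "true"

from tokenizers import Tokenizer  # imported after configuring the env var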

bindings/python/src/processors.rs

Lines changed: 5 additions & 0 deletions
@@ -484,6 +484,11 @@ impl PyRobertaProcessing {
 /// Args:
 ///     trim_offsets (:obj:`bool`):
 ///         Whether to trim the whitespaces from the produced offsets.
+///
+///     add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
+///         If :obj:`True`, keeps the first token's offset as is. If :obj:`False`, increments
+///         the start of the first token's offset by 1. Only has an effect if :obj:`trim_offsets`
+///         is set to :obj:`True`.
 #[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name = "ByteLevel")]
 pub struct PyByteLevel {}
 #[pymethods]
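
A minimal sketch of the documented combinations from Python (only construction is shown here; the resulting offsets for each case are asserted in the test diff further below):

from tokenizers.processors import ByteLevel

# Default: the first token's offset is kept as is while trimming whitespace.
keep_prefix = ByteLevel(trim_offsets=True, add_prefix_space=True)

# Start of the first token's offset is incremented by 1, excluding the
# encoded prefix space ("Ġ") from the reported span.
drop_prefix = ByteLevel(trim_offsets=True, add_prefix_space=False)

# Without trim_offsets, add_prefix_space has no effect on the offsets.
untrimmed = ByteLevel(trim_offsets=False, add_prefix_space=True)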

bindings/python/src/token.rs

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 use pyo3::prelude::*;
 use tk::Token;
 
-#[pyclass(module = "tokenizers", name = "Token")]
+#[pyclass(module = "tokenizers", name = "Token", frozen)]
 #[derive(Clone)]
 pub struct PyToken {
     token: Token,

bindings/python/src/utils/regex.rs

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@ use pyo3::prelude::*;
 use tk::utils::SysRegex;
 
 /// Instantiate a new Regex with the given pattern
-#[pyclass(module = "tokenizers", name = "Regex")]
+#[pyclass(module = "tokenizers", name = "Regex", frozen)]
 pub struct PyRegex {
     pub inner: SysRegex,
     pub pattern: String,
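
On the PyO3 side, `frozen` marks `Regex` (and `Token` above) as immutable once built, which matches how they are used: the pattern is compiled at construction time and only read afterwards. A small usage sketch (the `Split` pre-tokenizer and `pre_tokenize_str` call are existing tokenizers APIs, not part of this diff):

from tokenizers import Regex
from tokenizers.pre_tokenizers import Split

# The compiled pattern is only read after construction.
digits = Regex(r"\d+")
splitter = Split(pattern=digits, behavior="isolated")
print(splitter.pre_tokenize_str("abc123def"))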

bindings/python/tests/bindings/test_processors.py

Lines changed: 14 additions & 1 deletion
@@ -66,6 +66,7 @@ class TestByteLevelProcessing:
     def test_instantiate(self):
         assert ByteLevel() is not None
         assert ByteLevel(trim_offsets=True) is not None
+        assert ByteLevel(add_prefix_space=True) is not None
         assert isinstance(ByteLevel(), PostProcessor)
         assert isinstance(ByteLevel(), ByteLevel)
         assert isinstance(pickle.loads(pickle.dumps(ByteLevel())), ByteLevel)
@@ -82,11 +83,23 @@ def test_processing(self, roberta_files):
         assert output.offsets == [(0, 2), (2, 7), (7, 10), (10, 15)]
 
         # Trims offsets when activated
-        tokenizer.post_processor = ByteLevel(trim_offsets=True)
+        tokenizer.post_processor = ByteLevel(trim_offsets=True, add_prefix_space=True)
         output = tokenizer.encode("My name is John")
         assert output.tokens == ["ĠMy", "Ġname", "Ġis", "ĠJohn"]
         assert output.offsets == [(0, 2), (3, 7), (8, 10), (11, 15)]
 
+        # Trims offsets without adding prefix space at first token
+        tokenizer.post_processor = ByteLevel(trim_offsets=True, add_prefix_space=False)
+        output = tokenizer.encode("My name is John")
+        assert output.tokens == ["ĠMy", "Ġname", "Ġis", "ĠJohn"]
+        assert output.offsets == [(1, 2), (3, 7), (8, 10), (11, 15)]
+
+        # add_prefix_space without trimming offsets has no effect
+        tokenizer.post_processor = ByteLevel(trim_offsets=False, add_prefix_space=True)
+        output = tokenizer.encode("My name is John")
+        assert output.tokens == ["ĠMy", "Ġname", "Ġis", "ĠJohn"]
+        assert output.offsets == [(0, 2), (2, 7), (7, 10), (10, 15)]
+
     def test_manual_reload(self):
         byte_level = ByteLevel()
         state = json.loads(byte_level.__getstate__())
