Commit ad9a3c3 (parent d92b3cd): "some update"

10 files changed, 56 insertions(+), 7 deletions(-)

bindings/python/py_src/tokenizers/__init__.pyi (9 additions, 0 deletions)

@@ -125,6 +125,9 @@ class Encoding:
     """
     The :class:`~tokenizers.Encoding` represents the output of a :class:`~tokenizers.Tokenizer`.
     """
+    def __init__(self):
+        pass
+
     @property
     def attention_mask(self):
         """
@@ -627,6 +630,9 @@ class NormalizedString:
         sequence: str:
            The string sequence used to initialize this NormalizedString
     """
+    def __init__(self, sequence):
+        pass
+
     def append(self, s):
         """
         Append the given sequence to the string
@@ -887,6 +893,9 @@ class Regex:
         pass

 class Token:
+    def __init__(self, id, value, offsets):
+        pass
+
     @property
     def id(self):
         """ """

bindings/python/py_src/tokenizers/decoders/__init__.pyi (17 additions, 0 deletions)

@@ -7,6 +7,23 @@ class DecodeStream:
     def __init__(self, ids=None, skip_special_tokens=False):
         pass

+    def step(self, tokenizer, id):
+        """
+        Streaming decode step
+
+        Args:
+            tokenizer (:class:`~tokenizers.Tokenizer`):
+                The tokenizer to use for decoding
+            id (:obj:`int` or `List[int]`):
+                The next token id or list of token ids to add to the stream
+
+
+        Returns:
+            :obj:`Optional[str]`: The next decoded string chunk, or None if not enough
+            tokens have been provided yet.
+        """
+        pass
+
 class Decoder:
     """
     Base class for all decoders
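The new `step` docstring documents the streaming-decode loop. A short sketch of the intended usage; "gpt2" is only an example checkpoint, any tokenizer works:

    from tokenizers import Tokenizer
    from tokenizers.decoders import DecodeStream

    tokenizer = Tokenizer.from_pretrained("gpt2")
    stream = DecodeStream(skip_special_tokens=True)

    # step() buffers ids and returns None until a full chunk is decodable
    for token_id in tokenizer.encode("Hello world").ids:
        chunk = stream.step(tokenizer, token_id)
        if chunk is not None:
            print(chunk, end="")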

bindings/python/py_src/tokenizers/models/__init__.pyi (3 additions, 0 deletions)

@@ -8,6 +8,9 @@ class Model:

     This class cannot be constructed directly. Please use one of the concrete models.
     """
+    def __init__(self):
+        pass
+
     def get_trainer(self):
        """
        Get the associated :class:`~tokenizers.trainers.Trainer`
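Note the stub sits under a docstring saying `Model` cannot be constructed directly; per the comment in models.rs below, the default constructor exists mainly so an empty model can be instantiated for pickling. In practice you build a concrete model, roughly:

    from tokenizers.models import BPE

    model = BPE()                  # a concrete Model subclass
    trainer = model.get_trainer()  # its associated trainer (a BpeTrainer)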

bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi (3 additions, 0 deletions)

@@ -197,6 +197,9 @@ class CharDelimiterSplit(PreTokenizer):
         delimiter: str:
             The delimiter char that will be used to split input
     """
+    def __init__(self, delimiter):
+        pass
+
     @property
     def delimiter(self):
         """ """

bindings/python/src/decoders.rs (13 additions, 0 deletions)

@@ -678,6 +678,19 @@ impl PyDecodeStream {
             prefix_index: 0,
         }
     }
+
+    /// Streaming decode step
+    ///
+    /// Args:
+    ///     tokenizer (:class:`~tokenizers.Tokenizer`):
+    ///         The tokenizer to use for decoding
+    ///     id (:obj:`int` or `List[int]`):
+    ///         The next token id or list of token ids to add to the stream
+    ///
+    ///
+    /// Returns:
+    ///     :obj:`Optional[str]`: The next decoded string chunk, or None if not enough
+    ///     tokens have been provided yet.
     #[pyo3(signature = (tokenizer, id), text_signature = "(self, tokenizer, id)")]
     fn step(&mut self, tokenizer: &PyTokenizer, id: StreamInput) -> PyResult<Option<String>> {
         let id: Vec<u32> = match id {

bindings/python/src/encoding.rs (1 addition, 1 deletion)

@@ -23,7 +23,7 @@ impl From<tk::tokenizer::Encoding> for PyEncoding {
 #[pymethods]
 impl PyEncoding {
     #[new]
-    #[pyo3(text_signature = None)]
+    #[pyo3(signature = (), text_signature = "(self)")]
     fn new() -> Self {
         Self {
             encoding: tk::tokenizer::Encoding::default(),
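`text_signature = None` suppresses the generated signature entirely; pairing an explicit `signature` with a `text_signature` string is what makes the constructor introspectable from Python. Roughly, with a build that includes this commit:

    import inspect
    from tokenizers import Encoding

    # pyo3 attaches text_signature on #[new] to the class itself
    print(Encoding.__text_signature__)  # expected: "(self)"
    print(inspect.signature(Encoding))  # expected: ()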

bindings/python/src/models.rs (1 addition, 1 deletion)

@@ -102,7 +102,7 @@ where
 #[pymethods]
 impl PyModel {
     #[new]
-    #[pyo3(text_signature = None)]
+    #[pyo3(signature = (), text_signature = "(self)")]
     fn __new__() -> Self {
         // Instantiate a default empty model. This doesn't really make sense, but we need
         // to be able to instantiate an empty model for pickle capabilities.

bindings/python/src/pre_tokenizers.rs (1 addition, 1 deletion)

@@ -507,7 +507,7 @@ impl PyCharDelimiterSplit {
     }

     #[new]
-    #[pyo3(text_signature = None)]
+    #[pyo3(signature = (delimiter), text_signature = "(self, delimiter)")]
     pub fn new(delimiter: char) -> PyResult<(Self, PyPreTokenizer)> {
         Ok((
             PyCharDelimiterSplit {},
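With the delimiter argument declared, `CharDelimiterSplit(delimiter)` is self-documenting. A quick sketch:

    from tokenizers.pre_tokenizers import CharDelimiterSplit

    pre = CharDelimiterSplit("-")
    print(pre.pre_tokenize_str("pre-tokenize-me"))
    # expected: [('pre', (0, 3)), ('tokenize', (4, 12)), ('me', (13, 15))]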

bindings/python/src/token.rs (5 additions, 1 deletion)

@@ -19,8 +19,12 @@ impl From<PyToken> for Token {

 #[pymethods]
 impl PyToken {
+    /// Create a token from id, string value and byte offsets
     #[new]
-    #[pyo3(text_signature = None)]
+    #[pyo3(
+        signature = (id, value, offsets),
+        text_signature = "(self, id, value, offsets)"
+    )]
     fn new(id: u32, value: String, offsets: (usize, usize)) -> PyToken {
         Token::new(id, value, offsets).into()
     }

bindings/python/src/utils/normalization.rs (3 additions, 3 deletions)

@@ -205,9 +205,9 @@ pub struct PyNormalizedString {
 #[pymethods]
 impl PyNormalizedString {
     #[new]
-    #[pyo3(text_signature = None)]
-    fn new(s: &str) -> Self {
-        NormalizedString::from(s).into()
+    #[pyo3(signature = (sequence), text_signature = "(self, sequence)")]
+    fn new(sequence: &str) -> Self {
+        NormalizedString::from(sequence).into()
     }

     /// The normalized part of the string
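Renaming `s` to `sequence` aligns the Rust constructor with the parameter name the `.pyi` docstring already documents, so the keyword form lines up too. A small sketch, assuming a build with this commit:

    from tokenizers import NormalizedString

    ns = NormalizedString(sequence="Héllo")  # keyword matches the documented name
    ns.nfd()
    ns.lowercase()
    print(ns.normalized)  # expected: "héllo" (NFD-decomposed)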
