Commit b3889ab

Add copy to a DecodeStream (#1930)

* Fix warnings: remove a print and remove some deprecation warnings (#1924)
* make sure the warning is just a warning
* update
* nits
* fix tests
* add copy test and prefill copy
1 parent b874abe commit b3889ab

File tree

2 files changed: +29 −0 lines

bindings/python/src/decoders.rs

Lines changed: 7 additions & 0 deletions
```diff
@@ -685,6 +685,13 @@ impl PyDecodeStream {
         ))
         .into()
     }
+
+    fn __copy__(&self) -> Self {
+        self.clone()
+    }
+
+    fn __deepcopy__(&self, _memo: &Bound<'_, PyDict>) -> Self {
+        self.clone()
+    }
 }
 
 #[cfg(test)]
```
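With these hooks in place, a `DecodeStream` can be duplicated from Python through the standard `copy` module. A minimal sketch of the enabled usage, reusing the toy tokenizer from the test below (the expected chunks follow the same decoding behaviour the test asserts):

```python
import copy

from tokenizers import Tokenizer
from tokenizers.decoders import DecodeStream
from tokenizers.models import BPE

tokenizer = Tokenizer(BPE())
tokenizer.add_tokens(["my", "name", "is", "john"])

stream = DecodeStream(skip_special_tokens=False)
assert stream.step(tokenizer, 0) == "my"

# copy.copy() now routes through the new __copy__ binding and clones the
# stream's internal state, so the fork decodes independently of the original.
fork = copy.copy(stream)
assert stream.step(tokenizer, 1) == " name"
assert fork.step(tokenizer, 1) == " name"
```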

bindings/python/tests/bindings/test_tokenizer.py

Lines changed: 22 additions & 0 deletions
```diff
@@ -1,4 +1,5 @@
 import pickle
+import copy
 import concurrent.futures
 import pytest
 import numpy as np
@@ -374,6 +375,27 @@ def test_decode(self):
         stream = DecodeStream(ids=[0, 1, 2])
         assert stream.step(tokenizer, 3) == " john"
 
+    def test_decode_stream_copy_and_prefix_ids(self):
+        tokenizer = Tokenizer(BPE())
+        tokenizer.add_tokens(["my", "name", "is", "john"])
+        token_ids = [0, 1, 2, 3]
+
+        stream = DecodeStream(skip_special_tokens=False)
+        assert stream.step(tokenizer, token_ids[0]) == "my"
+        assert stream.step(tokenizer, token_ids[1]) == " name"
+        stream_copy = copy.copy(stream)
+        assert stream.step(tokenizer, token_ids[2]) == " is"
+        assert stream_copy.step(tokenizer, token_ids[2]) == " is"
+        assert stream.step(tokenizer, token_ids[3]) == " john"
+        assert stream_copy.step(tokenizer, token_ids[3]) == " john"
+
+        stream_steps = DecodeStream([])
+        last_chunk = None
+        for tid in token_ids:
+            last_chunk = stream_steps.step(tokenizer, tid)
+        stream_prefill = DecodeStream(token_ids[:-1])
+        assert stream_prefill.step(tokenizer, token_ids[-1]) == last_chunk
+
     def test_decode_stream_fallback(self):
         tokenizer = Tokenizer.from_pretrained("gpt2")
         # tokenizer.decode([255]) fails because its a fallback
```