@@ -66,6 +66,7 @@ class TestByteLevelProcessing:
def test_instantiate(self):
    """ByteLevel builds under every keyword combination, is a PostProcessor
    subclass, and survives a pickle round-trip."""
    # Construction succeeds with defaults and with each supported flag.
    for kwargs in ({}, {"trim_offsets": True}, {"add_prefix_space": True}):
        assert ByteLevel(**kwargs) is not None
    # Type relationships: an instance is both its concrete type and the base.
    processor = ByteLevel()
    assert isinstance(processor, PostProcessor)
    assert isinstance(processor, ByteLevel)
    # Serialization round-trip must preserve the concrete type.
    revived = pickle.loads(pickle.dumps(ByteLevel()))
    assert isinstance(revived, ByteLevel)
@@ -82,11 +83,23 @@ def test_processing(self, roberta_files):
8283 assert output .offsets == [(0 , 2 ), (2 , 7 ), (7 , 10 ), (10 , 15 )]
8384
8485 # Trims offsets when activated
85- tokenizer .post_processor = ByteLevel (trim_offsets = True )
86+ tokenizer .post_processor = ByteLevel (trim_offsets = True , add_prefix_space = True )
8687 output = tokenizer .encode ("My name is John" )
8788 assert output .tokens == ["ĠMy" , "Ġname" , "Ġis" , "ĠJohn" ]
8889 assert output .offsets == [(0 , 2 ), (3 , 7 ), (8 , 10 ), (11 , 15 )]
8990
91+ # Trims offsets without adding prefix space at first token
92+ tokenizer .post_processor = ByteLevel (trim_offsets = True , add_prefix_space = False )
93+ output = tokenizer .encode ("My name is John" )
94+ assert output .tokens == ["ĠMy" , "Ġname" , "Ġis" , "ĠJohn" ]
95+ assert output .offsets == [(1 , 2 ), (3 , 7 ), (8 , 10 ), (11 , 15 )]
96+
97+ # add_prefix_space without trimming offsets has no effect
98+ tokenizer .post_processor = ByteLevel (trim_offsets = False , add_prefix_space = True )
99+ output = tokenizer .encode ("My name is John" )
100+ assert output .tokens == ["ĠMy" , "Ġname" , "Ġis" , "ĠJohn" ]
101+ assert output .offsets == [(0 , 2 ), (2 , 7 ), (7 , 10 ), (10 , 15 )]
102+
90103 def test_manual_reload (self ):
91104 byte_level = ByteLevel ()
92105 state = json .loads (byte_level .__getstate__ ())
0 commit comments