@@ -30,7 +30,9 @@ def benchmark_batch(model: str, documents: list[str], num_threads: int, document
     num_bytes = sum(map(len, map(str.encode, documents)))
     readable_size, unit = format_byte_size(num_bytes)
     print(f"==============")
-    print(f"num_threads: {num_threads}, data size: {readable_size}, documents: {len(documents)} Avg Length: {document_length:.0f}")
+    print(
+        f"num_threads: {num_threads}, data size: {readable_size}, documents: {len(documents)} Avg Length: {document_length:.0f}"
+    )
     filename = hf_hub_download(MODEL_ID, "original/tokenizer.model")
     mergeable_ranks = load_tiktoken_bpe(filename)
     pat_str = r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"
@@ -46,20 +48,15 @@ def benchmark_batch(model: str, documents: list[str], num_threads: int, document
4648 "<|end_header_id|>" ,
4749 "<|reserved_special_token_4|>" ,
4850 "<|eot_id|>" , # end of turn
49- ] + [
50- f"<|reserved_special_token_{ i } |>"
51- for i in range (5 , num_reserved_special_tokens - 5 )
52- ]
51+ ] + [f"<|reserved_special_token_{ i } |>" for i in range (5 , num_reserved_special_tokens - 5 )]
5352 num_base_tokens = len (mergeable_ranks )
54- special_tokens = {
55- token : num_base_tokens + i for i , token in enumerate (special_tokens )
56- }
53+ special_tokens = {token : num_base_tokens + i for i , token in enumerate (special_tokens )}
5754 enc = tiktoken .Encoding (
58- name = model ,
59- pat_str = pat_str ,
60- mergeable_ranks = mergeable_ranks ,
61- special_tokens = special_tokens ,
62- )
55+ name = model ,
56+ pat_str = pat_str ,
57+ mergeable_ranks = mergeable_ranks ,
58+ special_tokens = special_tokens ,
59+ )
6360 out = enc .encode ("This is a test" )
6461
6562 hf_enc = Tokenizer .from_pretrained (model )
@@ -74,7 +71,6 @@ def benchmark_batch(model: str, documents: list[str], num_threads: int, document
     readable_size, unit = format_byte_size(num_bytes / (end - start) * 1e9)
     print(f"tiktoken \t{readable_size} / s")
 
-
     start = time.perf_counter_ns()
     hf_enc.encode_batch_fast(documents)
     end = time.perf_counter_ns()
@@ -98,7 +94,7 @@ def test(model: str, dataset: str, dataset_config: str, threads: List[int]):
             else:
                 documents.append(item["premise"]["en"])
         if fuse:
-                documents = ["".join(documents)]
+            documents = ["".join(documents)]
 
         document_length = sum(len(d) for d in documents) / len(documents)
 
@@ -115,15 +111,14 @@ def test(model: str, dataset: str, dataset_config: str, threads: List[int]):
 
 
 def main():
-
     parser = argparse.ArgumentParser(
-        prog='bench_tokenizer',
-        description='Getting a feel for speed when tokenizing',
+        prog="bench_tokenizer",
+        description="Getting a feel for speed when tokenizing",
     )
-    parser.add_argument('-m', '--model', default=MODEL_ID, type=str)
-    parser.add_argument('-d', '--dataset', default=DATASET, type=str)
-    parser.add_argument('-ds', '--dataset-config', default=DATASET_CONFIG, type=str)
-    parser.add_argument('-t', '--threads', nargs='+', default=DEFAULT_THREADS, type=int)
+    parser.add_argument("-m", "--model", default=MODEL_ID, type=str)
+    parser.add_argument("-d", "--dataset", default=DATASET, type=str)
+    parser.add_argument("-ds", "--dataset-config", default=DATASET_CONFIG, type=str)
+    parser.add_argument("-t", "--threads", nargs="+", default=DEFAULT_THREADS, type=int)
     args = parser.parse_args()
     test(args.model, args.dataset, args.dataset_config, args.threads)
 
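For context, the script this commit reformats compares batch-encoding throughput between a hand-built tiktoken Encoding and a Hugging Face Tokenizer. Below is a minimal runnable sketch of that comparison; the rank-file path, encoding name, model id, document list, and thread count are illustrative stand-ins, not values from the commit:

import time

import tiktoken
from tiktoken.load import load_tiktoken_bpe
from tokenizers import Tokenizer

# Illustrative inputs; the real script downloads "original/tokenizer.model"
# from the Hub and reads its documents from a dataset.
mergeable_ranks = load_tiktoken_bpe("tokenizer.model")
enc = tiktoken.Encoding(
    name="bench",
    pat_str=r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+",
    mergeable_ranks=mergeable_ranks,
    special_tokens={},  # the script also appends the Llama 3 special tokens
)
hf_enc = Tokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")
documents = ["This is a test"] * 10_000

start = time.perf_counter_ns()
enc.encode_ordinary_batch(documents, num_threads=4)
print(f"tiktoken: {(time.perf_counter_ns() - start) / 1e6:.1f} ms")

start = time.perf_counter_ns()
hf_enc.encode_batch_fast(documents)
print(f"huggingface: {(time.perf_counter_ns() - start) / 1e6:.1f} ms")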