Commit 72c8f84

Commit message: wtf
Parent: d7f249f

9 files changed: +730, -31 lines changed

bindings/python/benches/test_tiktoken.py

Lines changed: 17 additions & 22 deletions
@@ -30,7 +30,9 @@ def benchmark_batch(model: str, documents: list[str], num_threads: int, document
     num_bytes = sum(map(len, map(str.encode, documents)))
     readable_size, unit = format_byte_size(num_bytes)
     print(f"==============")
-    print(f"num_threads: {num_threads}, data size: {readable_size}, documents: {len(documents)} Avg Length: {document_length:.0f}")
+    print(
+        f"num_threads: {num_threads}, data size: {readable_size}, documents: {len(documents)} Avg Length: {document_length:.0f}"
+    )
     filename = hf_hub_download(MODEL_ID, "original/tokenizer.model")
     mergeable_ranks = load_tiktoken_bpe(filename)
     pat_str = r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"
@@ -46,20 +48,15 @@ def benchmark_batch(model: str, documents: list[str], num_threads: int, document
         "<|end_header_id|>",
         "<|reserved_special_token_4|>",
         "<|eot_id|>",  # end of turn
-    ] + [
-        f"<|reserved_special_token_{i}|>"
-        for i in range(5, num_reserved_special_tokens - 5)
-    ]
+    ] + [f"<|reserved_special_token_{i}|>" for i in range(5, num_reserved_special_tokens - 5)]
     num_base_tokens = len(mergeable_ranks)
-    special_tokens = {
-        token: num_base_tokens + i for i, token in enumerate(special_tokens)
-    }
+    special_tokens = {token: num_base_tokens + i for i, token in enumerate(special_tokens)}
     enc = tiktoken.Encoding(
-            name=model,
-            pat_str=pat_str,
-            mergeable_ranks=mergeable_ranks,
-            special_tokens=special_tokens,
-            )
+        name=model,
+        pat_str=pat_str,
+        mergeable_ranks=mergeable_ranks,
+        special_tokens=special_tokens,
+    )
     out = enc.encode("This is a test")
 
     hf_enc = Tokenizer.from_pretrained(model)
@@ -74,7 +71,6 @@ def benchmark_batch(model: str, documents: list[str], num_threads: int, document
     readable_size, unit = format_byte_size(num_bytes / (end - start) * 1e9)
     print(f"tiktoken \t{readable_size} / s")
 
-
     start = time.perf_counter_ns()
     hf_enc.encode_batch_fast(documents)
     end = time.perf_counter_ns()
@@ -98,7 +94,7 @@ def test(model: str, dataset: str, dataset_config: str, threads: List[int]):
         else:
             documents.append(item["premise"]["en"])
     if fuse:
-        documents=["".join(documents)]
+        documents = ["".join(documents)]
 
     document_length = sum(len(d) for d in documents) / len(documents)
 
@@ -115,15 +111,14 @@ def test(model: str, dataset: str, dataset_config: str, threads: List[int]):
 
 
 def main():
-
     parser = argparse.ArgumentParser(
-        prog='bench_tokenizer',
-        description='Getting a feel for speed when tokenizing',
+        prog="bench_tokenizer",
+        description="Getting a feel for speed when tokenizing",
     )
-    parser.add_argument('-m', '--model', default=MODEL_ID, type=str)
-    parser.add_argument('-d', '--dataset', default=DATASET, type=str)
-    parser.add_argument('-ds', '--dataset-config', default=DATASET_CONFIG, type=str)
-    parser.add_argument('-t', '--threads', nargs='+', default=DEFAULT_THREADS, type=int)
+    parser.add_argument("-m", "--model", default=MODEL_ID, type=str)
+    parser.add_argument("-d", "--dataset", default=DATASET, type=str)
+    parser.add_argument("-ds", "--dataset-config", default=DATASET_CONFIG, type=str)
+    parser.add_argument("-t", "--threads", nargs="+", default=DEFAULT_THREADS, type=int)
     args = parser.parse_args()
     test(args.model, args.dataset, args.dataset_config, args.threads)
 

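For orientation, below is a minimal sketch of the comparison this benchmark file runs, reconstructed only from the calls visible in the hunks above (hf_hub_download, load_tiktoken_bpe, tiktoken.Encoding, Tokenizer.from_pretrained, encode_batch_fast, time.perf_counter_ns). The MODEL_ID value, the placeholder special_tokens map, and the raw bytes-per-second output are assumptions for illustration; the real script also defines format_byte_size, registers the full reserved-special-token list, and exposes the dataset/threading options via the argparse flags shown in the last hunk.

# Minimal sketch (not part of the commit): time tiktoken vs. Hugging Face tokenizers
# on the same documents, mirroring the pattern in the diff above.
import time

import tiktoken
from huggingface_hub import hf_hub_download
from tiktoken.load import load_tiktoken_bpe
from tokenizers import Tokenizer

MODEL_ID = "meta-llama/Meta-Llama-3-8B"  # assumed value; the script's MODEL_ID is not shown in this diff


def compare(documents: list[str]) -> None:
    # Build a tiktoken Encoding from the original tokenizer.model, as in the hunks above.
    filename = hf_hub_download(MODEL_ID, "original/tokenizer.model")
    mergeable_ranks = load_tiktoken_bpe(filename)
    pat_str = r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"
    special_tokens = {"<|begin_of_text|>": len(mergeable_ranks)}  # placeholder; the script maps the full token list
    enc = tiktoken.Encoding(
        name=MODEL_ID,
        pat_str=pat_str,
        mergeable_ranks=mergeable_ranks,
        special_tokens=special_tokens,
    )
    # The Hugging Face tokenizers side of the comparison.
    hf_enc = Tokenizer.from_pretrained(MODEL_ID)

    num_bytes = sum(len(d.encode()) for d in documents)

    # Time tiktoken batch encoding.
    start = time.perf_counter_ns()
    enc.encode_ordinary_batch(documents)
    end = time.perf_counter_ns()
    print(f"tiktoken  \t{num_bytes / (end - start) * 1e9:.0f} bytes / s")

    # Time tokenizers' encode_batch_fast, the call exercised by this benchmark.
    start = time.perf_counter_ns()
    hf_enc.encode_batch_fast(documents)
    end = time.perf_counter_ns()
    print(f"tokenizers\t{num_bytes / (end - start) * 1e9:.0f} bytes / s")


if __name__ == "__main__":
    compare(["This is a test"] * 1_000)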