Skip to content

Commit 30c92dc

Browse files
author
Lőrinc
committed
Add possessive quantifiers to legacy encodings as well
1 parent 814a09d commit 30c92dc

File tree

2 files changed

+5
-4
lines changed

2 files changed

+5
-4
lines changed

tests/test_encoding.py

+1
Original file line number | Diff line number | Diff line change
@@ -11,6 +11,7 @@
 from .test_helpers import ENCODING_FACTORIES, MAX_EXAMPLES
 
 
+@pytest.mark.skip(reason="Takes a really long time to finish, but was added to reproduce a crash.")
 def test_extremely_big_encoding():
     big_value = "^" * 1000000
     enc = tiktoken.get_encoding("r50k_base")

tiktoken_ext/openai_public.py

+4 -4
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,7 @@ def gpt2():
         # The pattern in the original GPT-2 release is:
         # r"""'s|'t|'re|'ve|'m|'ll|'d| ?[\p{L}]+| ?[\p{N}]+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
         # This is equivalent, but executes faster:
-        "pat_str": r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
+        "pat_str": r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]++|\s+(?!\S)|\s+""",
         "mergeable_ranks": mergeable_ranks,
         "special_tokens": {ENDOFTEXT: 50256},
     }
@@ -34,7 +34,7 @@ def r50k_base():
     return {
         "name": "r50k_base",
         "explicit_n_vocab": 50257,
-        "pat_str": r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
+        "pat_str": r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]++|\s+(?!\S)|\s+""",
         "mergeable_ranks": mergeable_ranks,
         "special_tokens": {ENDOFTEXT: 50256},
     }
@@ -48,7 +48,7 @@ def p50k_base():
     return {
         "name": "p50k_base",
         "explicit_n_vocab": 50281,
-        "pat_str": r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
+        "pat_str": r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]++|\s+(?!\S)|\s+""",
         "mergeable_ranks": mergeable_ranks,
         "special_tokens": {ENDOFTEXT: 50256},
     }
@@ -62,7 +62,7 @@ def p50k_edit():
     special_tokens = {ENDOFTEXT: 50256, FIM_PREFIX: 50281, FIM_MIDDLE: 50282, FIM_SUFFIX: 50283}
     return {
         "name": "p50k_edit",
-        "pat_str": r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
+        "pat_str": r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]++|\s+(?!\S)|\s+""",
         "mergeable_ranks": mergeable_ranks,
         "special_tokens": special_tokens,
     }

0 commit comments

Comments
 (0)