@@ -20,7 +20,7 @@ def gpt2():
20
20
# The pattern in the original GPT-2 release is:
21
21
# r"""'s|'t|'re|'ve|'m|'ll|'d| ?[\p{L}]+| ?[\p{N}]+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
22
22
# This is equivalent, but executes faster:
23
- "pat_str" : r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""" ,
23
+ "pat_str" : r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}++| ?\p{N}++| ?[^\s\p{L}\p{N}]++|\s+(?!\S)|\s++""" ,
24
24
"mergeable_ranks" : mergeable_ranks ,
25
25
"special_tokens" : {ENDOFTEXT : 50256 },
26
26
}
@@ -34,7 +34,7 @@ def r50k_base():
34
34
return {
35
35
"name" : "r50k_base" ,
36
36
"explicit_n_vocab" : 50257 ,
37
- "pat_str" : r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""" ,
37
+ "pat_str" : r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}++| ?\p{N}++| ?[^\s\p{L}\p{N}]++|\s+(?!\S)|\s++""" ,
38
38
"mergeable_ranks" : mergeable_ranks ,
39
39
"special_tokens" : {ENDOFTEXT : 50256 },
40
40
}
@@ -48,7 +48,7 @@ def p50k_base():
48
48
return {
49
49
"name" : "p50k_base" ,
50
50
"explicit_n_vocab" : 50281 ,
51
- "pat_str" : r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""" ,
51
+ "pat_str" : r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}++| ?\p{N}++| ?[^\s\p{L}\p{N}]++|\s+(?!\S)|\s++""" ,
52
52
"mergeable_ranks" : mergeable_ranks ,
53
53
"special_tokens" : {ENDOFTEXT : 50256 },
54
54
}
@@ -62,7 +62,7 @@ def p50k_edit():
62
62
special_tokens = {ENDOFTEXT : 50256 , FIM_PREFIX : 50281 , FIM_MIDDLE : 50282 , FIM_SUFFIX : 50283 }
63
63
return {
64
64
"name" : "p50k_edit" ,
65
- "pat_str" : r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""" ,
65
+ "pat_str" : r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}++| ?\p{N}++| ?[^\s\p{L}\p{N}]++|\s+(?!\S)|\s++""" ,
66
66
"mergeable_ranks" : mergeable_ranks ,
67
67
"special_tokens" : special_tokens ,
68
68
}
@@ -82,7 +82,7 @@ def cl100k_base():
82
82
}
83
83
return {
84
84
"name" : "cl100k_base" ,
85
- "pat_str" : r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+""" ,
85
+ "pat_str" : r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}++|\p{N}{1,3}+| ?[^\s\p{L}\p{N}]++[\r\n]*+|\s*[\r\n]|\s+(?!\S)|\s++""" ,
86
86
"mergeable_ranks" : mergeable_ranks ,
87
87
"special_tokens" : special_tokens ,
88
88
}
0 commit comments