@@ -20,7 +20,7 @@ def gpt2():
20
20
# The pattern in the original GPT-2 release is:
21
21
# r"""'s|'t|'re|'ve|'m|'ll|'d| ?[\p{L}]+| ?[\p{N}]+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
22
22
# This is equivalent, but executes faster:
23
- "pat_str" : r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""" ,
23
+ "pat_str" : r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]++|\s+(?!\S)|\s+""" ,
24
24
"mergeable_ranks" : mergeable_ranks ,
25
25
"special_tokens" : {ENDOFTEXT : 50256 },
26
26
}
@@ -34,7 +34,7 @@ def r50k_base():
34
34
return {
35
35
"name" : "r50k_base" ,
36
36
"explicit_n_vocab" : 50257 ,
37
- "pat_str" : r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""" ,
37
+ "pat_str" : r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]++|\s+(?!\S)|\s+""" ,
38
38
"mergeable_ranks" : mergeable_ranks ,
39
39
"special_tokens" : {ENDOFTEXT : 50256 },
40
40
}
@@ -48,7 +48,7 @@ def p50k_base():
48
48
return {
49
49
"name" : "p50k_base" ,
50
50
"explicit_n_vocab" : 50281 ,
51
- "pat_str" : r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""" ,
51
+ "pat_str" : r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]++|\s+(?!\S)|\s+""" ,
52
52
"mergeable_ranks" : mergeable_ranks ,
53
53
"special_tokens" : {ENDOFTEXT : 50256 },
54
54
}
@@ -62,7 +62,7 @@ def p50k_edit():
62
62
special_tokens = {ENDOFTEXT : 50256 , FIM_PREFIX : 50281 , FIM_MIDDLE : 50282 , FIM_SUFFIX : 50283 }
63
63
return {
64
64
"name" : "p50k_edit" ,
65
- "pat_str" : r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""" ,
65
+ "pat_str" : r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]++|\s+(?!\S)|\s+""" ,
66
66
"mergeable_ranks" : mergeable_ranks ,
67
67
"special_tokens" : special_tokens ,
68
68
}
0 commit comments