Skip to content

Commit 33f5b35

Browse files
author
wozulong
committed
add gpt-4o
Signed-off-by: wozulong <>
1 parent 475cdcd commit 33f5b35

11 files changed

+124
-44
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
.idea/

README.md

+14-9
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ package main
4444

4545
import (
4646
"fmt"
47-
"github.com/pkoukk/tiktoken-go"
47+
"github.com/linux-do/tiktoken-go"
4848
)
4949

5050
func main() {
@@ -76,7 +76,7 @@ package main
7676

7777
import (
7878
"fmt"
79-
"github.com/pkoukk/tiktoken-go"
79+
"github.com/linux-do/tiktoken-go"
8080
)
8181

8282
func main() {
@@ -117,7 +117,7 @@ package main
117117
import (
118118
"fmt"
119119

120-
"github.com/pkoukk/tiktoken-go"
120+
"github.com/linux-do/tiktoken-go"
121121
"github.com/sashabaranov/go-openai"
122122
)
123123

@@ -174,17 +174,20 @@ func NumTokensFromMessages(messages []openai.ChatCompletionMessage, model string
174174

175175

176176
# Available Encodings
177-
| Encoding name | OpenAI models |
178-
| ----------------------- | ---------------------------------------------------- |
179-
| `cl100k_base` | `gpt-4`, `gpt-3.5-turbo`, `text-embedding-ada-002` |
180-
| `p50k_base` | Codex models, `text-davinci-002`, `text-davinci-003` |
181-
| `r50k_base` (or `gpt2`) | GPT-3 models like `davinci` |
177+
| Encoding name | OpenAI models |
178+
|-------------------------|------------------------------------------------------|
179+
| `o200k_base` | `gpt-4o` |
180+
| `cl100k_base` | `gpt-4`, `gpt-3.5-turbo`, `text-embedding-ada-002` |
181+
| `cl100k_base` | `text-embedding-3-large`, `text-embedding-3-small` |
182+
| `p50k_base` | Codex models, `text-davinci-002`, `text-davinci-003` |
183+
| `r50k_base` (or `gpt2`) | GPT-3 models like `davinci` |
182184

183185

184186

185187
# Available Models
186188
| Model name | Encoding name |
187-
| ---------------------------- | ------------- |
189+
|------------------------------| ------------- |
190+
| gpt-4o-* | o200k_base |
188191
| gpt-4-* | cl100k_base |
189192
| gpt-3.5-turbo-* | cl100k_base |
190193
| gpt-4 | cl100k_base |
@@ -208,6 +211,8 @@ func NumTokensFromMessages(messages []openai.ChatCompletionMessage, model string
208211
| text-davinci-edit-001 | p50k_edit |
209212
| code-davinci-edit-001 | p50k_edit |
210213
| text-embedding-ada-002 | cl100k_base |
214+
| text-embedding-3-small | cl100k_base |
215+
| text-embedding-3-large | cl100k_base |
211216
| text-similarity-davinci-001 | r50k_base |
212217
| text-similarity-curie-001 | r50k_base |
213218
| text-similarity-babbage-001 | r50k_base |

README_zh-hans.md

+18-13
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ package main
4343

4444
import (
4545
"fmt"
46-
"github.com/pkoukk/tiktoken-go"
46+
"github.com/linux-do/tiktoken-go"
4747
)
4848

4949
func main() {
@@ -75,7 +75,7 @@ package main
7575

7676
import (
7777
"fmt"
78-
"github.com/pkoukk/tiktoken-go"
78+
"github.com/linux-do/tiktoken-go"
7979
)
8080

8181
func main() {
@@ -113,7 +113,7 @@ package main
113113
import (
114114
"fmt"
115115

116-
"github.com/pkoukk/tiktoken-go"
116+
"github.com/linux-do/tiktoken-go"
117117
"github.com/sashabaranov/go-openai"
118118
)
119119

@@ -166,21 +166,24 @@ func NumTokensFromMessages(messages []openai.ChatCompletionMessage, model string
166166
}
167167
```
168168

169-
# available encodings
170-
| Encoding name | OpenAI models |
171-
| ----------------------- | ---------------------------------------------------- |
172-
| `cl100k_base` | `gpt-4`, `gpt-3.5-turbo`, `text-embedding-ada-002` |
173-
| `p50k_base` | Codex models, `text-davinci-002`, `text-davinci-003` |
174-
| `r50k_base` (or `gpt2`) | GPT-3 models like `davinci` |
169+
# Available Encodings
170+
| Encoding name | OpenAI models |
171+
|-------------------------|------------------------------------------------------|
172+
| `o200k_base` | `gpt-4o` |
173+
| `cl100k_base` | `gpt-4`, `gpt-3.5-turbo`, `text-embedding-ada-002` |
174+
| `cl100k_base` | `text-embedding-3-large`, `text-embedding-3-small` |
175+
| `p50k_base` | Codex models, `text-davinci-002`, `text-davinci-003` |
176+
| `r50k_base` (or `gpt2`) | GPT-3 models like `davinci` |
175177

176178

177-
# available models
179+
# Available Models
178180
| Model name | Encoding name |
179-
| ---------------------------- | ------------- |
180-
| gpt-4 | cl100k_base |
181+
|------------------------------| ------------- |
182+
| gpt-4o-* | o200k_base |
181183
| gpt-4-* | cl100k_base |
182-
| gpt-3.5-turbo | cl100k_base |
183184
| gpt-3.5-turbo-* | cl100k_base |
185+
| gpt-4 | cl100k_base |
186+
| gpt-3.5-turbo | cl100k_base |
184187
| text-davinci-003 | p50k_base |
185188
| text-davinci-002 | p50k_base |
186189
| text-davinci-001 | r50k_base |
@@ -200,6 +203,8 @@ func NumTokensFromMessages(messages []openai.ChatCompletionMessage, model string
200203
| text-davinci-edit-001 | p50k_edit |
201204
| code-davinci-edit-001 | p50k_edit |
202205
| text-embedding-ada-002 | cl100k_base |
206+
| text-embedding-3-small | cl100k_base |
207+
| text-embedding-3-large | cl100k_base |
203208
| text-similarity-davinci-001 | r50k_base |
204209
| text-similarity-curie-001 | r50k_base |
205210
| text-similarity-babbage-001 | r50k_base |

doc/test_result.md

+12
Original file line numberDiff line numberDiff line change
@@ -1,42 +1,54 @@
11
# Encoding Test Result
22
| python tiktoken | golang tiktoken-go |
33
| :------------------------------------------------------- | :------------------------------------------------------- |
4+
| text: hallo world!, encoding: o200k_base, token: 4 | text: hallo world!, encoding: o200k_base, token: 4 |
45
| text: hallo world!, encoding: cl100k_base, token: 4 | text: hallo world!, encoding: cl100k_base, token: 4 |
56
| text: hallo world!, encoding: p50k_base, token: 4 | text: hallo world!, encoding: p50k_base, token: 4 |
67
| text: hallo world!, encoding: r50k_base, token: 4 | text: hallo world!, encoding: r50k_base, token: 4 |
8+
| text: 你好世界!, encoding: o200k_base, token: 3 | text: 你好世界!, encoding: o200k_base, token: 3 |
79
| text: 你好世界!, encoding: cl100k_base, token: 6 | text: 你好世界!, encoding: cl100k_base, token: 6 |
810
| text: 你好世界!, encoding: p50k_base, token: 11 | text: 你好世界!, encoding: p50k_base, token: 11 |
911
| text: 你好世界!, encoding: r50k_base, token: 11 | text: 你好世界!, encoding: r50k_base, token: 11 |
12+
| text: こんにちは世界!, encoding: o200k_base, token: 3 | text: こんにちは世界!, encoding: o200k_base, token: 3 |
1013
| text: こんにちは世界!, encoding: cl100k_base, token: 5 | text: こんにちは世界!, encoding: cl100k_base, token: 5 |
1114
| text: こんにちは世界!, encoding: p50k_base, token: 13 | text: こんにちは世界!, encoding: p50k_base, token: 13 |
1215
| text: こんにちは世界!, encoding: r50k_base, token: 13 | text: こんにちは世界!, encoding: r50k_base, token: 13 |
16+
| text: 안녕하세요 세계!, encoding: o200k_base, token: 4 | text: 안녕하세요 세계!, encoding: o200k_base, token: 4 |
1317
| text: 안녕하세요 세계!, encoding: cl100k_base, token: 10 | text: 안녕하세요 세계!, encoding: cl100k_base, token: 10 |
1418
| text: 안녕하세요 세계!, encoding: p50k_base, token: 21 | text: 안녕하세요 세계!, encoding: p50k_base, token: 21 |
1519
| text: 안녕하세요 세계!, encoding: r50k_base, token: 21 | text: 안녕하세요 세계!, encoding: r50k_base, token: 21 |
20+
| text: Привет мир!, encoding: o200k_base, token: 4 | text: Привет мир!, encoding: o200k_base, token: 4 |
1621
| text: Привет мир!, encoding: cl100k_base, token: 6 | text: Привет мир!, encoding: cl100k_base, token: 6 |
1722
| text: Привет мир!, encoding: p50k_base, token: 12 | text: Привет мир!, encoding: p50k_base, token: 12 |
1823
| text: Привет мир!, encoding: r50k_base, token: 12 | text: Привет мир!, encoding: r50k_base, token: 12 |
24+
| text: ¡Hola mundo!, encoding: o200k_base, token: 4 | text: ¡Hola mundo!, encoding: o200k_base, token: 4 |
1925
| text: ¡Hola mundo!, encoding: cl100k_base, token: 4 | text: ¡Hola mundo!, encoding: cl100k_base, token: 4 |
2026
| text: ¡Hola mundo!, encoding: p50k_base, token: 7 | text: ¡Hola mundo!, encoding: p50k_base, token: 7 |
2127
| text: ¡Hola mundo!, encoding: r50k_base, token: 7 | text: ¡Hola mundo!, encoding: r50k_base, token: 7 |
28+
| text: Hallo Welt!, encoding: o200k_base, token: 3 | text: Hallo Welt!, encoding: o200k_base, token: 3 |
2229
| text: Hallo Welt!, encoding: cl100k_base, token: 3 | text: Hallo Welt!, encoding: cl100k_base, token: 3 |
2330
| text: Hallo Welt!, encoding: p50k_base, token: 5 | text: Hallo Welt!, encoding: p50k_base, token: 5 |
2431
| text: Hallo Welt!, encoding: r50k_base, token: 5 | text: Hallo Welt!, encoding: r50k_base, token: 5 |
32+
| text: Bonjour le monde!, encoding: o200k_base, token: 4 | text: Bonjour le monde!, encoding: o200k_base, token: 4 |
2533
| text: Bonjour le monde!, encoding: cl100k_base, token: 4 | text: Bonjour le monde!, encoding: cl100k_base, token: 4 |
2634
| text: Bonjour le monde!, encoding: p50k_base, token: 7 | text: Bonjour le monde!, encoding: p50k_base, token: 7 |
2735
| text: Bonjour le monde!, encoding: r50k_base, token: 7 | text: Bonjour le monde!, encoding: r50k_base, token: 7 |
36+
| text: Ciao mondo!, encoding: o200k_base, token: 4 | text: Ciao mondo!, encoding: o200k_base, token: 4 |
2837
| text: Ciao mondo!, encoding: cl100k_base, token: 4 | text: Ciao mondo!, encoding: cl100k_base, token: 4 |
2938
| text: Ciao mondo!, encoding: p50k_base, token: 5 | text: Ciao mondo!, encoding: p50k_base, token: 5 |
3039
| text: Ciao mondo!, encoding: r50k_base, token: 5 | text: Ciao mondo!, encoding: r50k_base, token: 5 |
40+
| text: Hej världen!, encoding: o200k_base, token: 3 | text: Hej världen!, encoding: o200k_base, token: 3 |
3141
| text: Hej världen!, encoding: cl100k_base, token: 7 | text: Hej världen!, encoding: cl100k_base, token: 7 |
3242
| text: Hej världen!, encoding: p50k_base, token: 8 | text: Hej världen!, encoding: p50k_base, token: 8 |
3343
| text: Hej världen!, encoding: r50k_base, token: 8 | text: Hej världen!, encoding: r50k_base, token: 8 |
44+
| text: Hallo wereld!, encoding: o200k_base, token: 3 | text: Hallo wereld!, encoding: o200k_base, token: 3 |
3445
| text: Hallo wereld!, encoding: cl100k_base, token: 3 | text: Hallo wereld!, encoding: cl100k_base, token: 3 |
3546
| text: Hallo wereld!, encoding: p50k_base, token: 5 | text: Hallo wereld!, encoding: p50k_base, token: 5 |
3647
| text: Hallo wereld!, encoding: r50k_base, token: 5 | text: Hallo wereld!, encoding: r50k_base, token: 5 |
3748
| text: Hallo verden!, encoding: cl100k_base, token: 4 | text: Hallo verden!, encoding: cl100k_base, token: 4 |
3849
| text: Hallo verden!, encoding: p50k_base, token: 5 | text: Hallo verden!, encoding: p50k_base, token: 5 |
3950
| text: Hallo verden!, encoding: r50k_base, token: 5 | text: Hallo verden!, encoding: r50k_base, token: 5 |
51+
| text: Hallo wereld!, encoding: o200k_base, token: 3 | text: Hallo wereld!, encoding: o200k_base, token: 3 |
4052
| text: Hallo wereld!, encoding: cl100k_base, token: 3 | text: Hallo wereld!, encoding: cl100k_base, token: 3 |
4153
| text: Hallo wereld!, encoding: p50k_base, token: 5 | text: Hallo wereld!, encoding: p50k_base, token: 5 |
4254
| text: Hallo wereld!, encoding: r50k_base, token: 5 | text: Hallo wereld!, encoding: r50k_base, token: 5 |

encoding.go

+64-15
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package tiktoken
22

33
import (
44
"errors"
5+
"strings"
56
"sync"
67
)
78

@@ -12,40 +13,51 @@ const FIM_SUFFIX string = "<|fim_suffix|>"
1213
const ENDOFPROMPT string = "<|endofprompt|>"
1314

1415
// Encoding-name constants. They are used as the values of the
// MODEL_TO_ENCODING and MODEL_PREFIX_TO_ENCODING lookup tables, and
// initEncoding switches on them to choose which encoding to build.
const (
16+
MODEL_O200K_BASE string = "o200k_base"
1517
MODEL_CL100K_BASE string = "cl100k_base"
1618
MODEL_P50K_BASE string = "p50k_base"
1719
MODEL_P50K_EDIT string = "p50k_edit"
1820
MODEL_R50K_BASE string = "r50k_base"
21+
MODEL_GPT2 string = "gpt2"
1922
)
2023

2124
var MODEL_TO_ENCODING = map[string]string{
2225
// chat
26+
"gpt-4o": MODEL_O200K_BASE,
2327
"gpt-4": MODEL_CL100K_BASE,
2428
"gpt-3.5-turbo": MODEL_CL100K_BASE,
25-
// text
29+
"gpt-3.5": MODEL_CL100K_BASE, // Common shorthand
30+
"gpt-35-turbo": MODEL_CL100K_BASE, // Azure deployment name
31+
// base
32+
"davinci-002": MODEL_CL100K_BASE,
33+
"babbage-002": MODEL_CL100K_BASE,
34+
// embeddings
35+
"text-embedding-ada-002": MODEL_CL100K_BASE,
36+
"text-embedding-3-small": MODEL_CL100K_BASE,
37+
"text-embedding-3-large": MODEL_CL100K_BASE,
38+
// DEPRECATED MODELS
39+
// text (DEPRECATED)
2640
"text-davinci-003": MODEL_P50K_BASE,
2741
"text-davinci-002": MODEL_P50K_BASE,
28-
"text-davinci-001": MODEL_R50K_BASE,
29-
"text-curie-001": MODEL_R50K_BASE,
30-
"text-babbage-001": MODEL_R50K_BASE,
31-
"text-ada-001": MODEL_R50K_BASE,
32-
"davinci": MODEL_R50K_BASE,
33-
"curie": MODEL_R50K_BASE,
34-
"babbage": MODEL_R50K_BASE,
35-
"ada": MODEL_R50K_BASE,
36-
// code
42+
// The original GPT-3 completion models tokenize with r50k_base; of the
// text-davinci family only -002 and -003 use p50k_base. Mapping these
// entries to p50k_base would select the wrong vocabulary.
"text-davinci-001": MODEL_R50K_BASE,
"text-curie-001": MODEL_R50K_BASE,
"text-babbage-001": MODEL_R50K_BASE,
"text-ada-001": MODEL_R50K_BASE,
"davinci": MODEL_R50K_BASE,
"curie": MODEL_R50K_BASE,
"babbage": MODEL_R50K_BASE,
"ada": MODEL_R50K_BASE,
50+
// code (DEPRECATED)
3751
"code-davinci-002": MODEL_P50K_BASE,
3852
"code-davinci-001": MODEL_P50K_BASE,
3953
"code-cushman-002": MODEL_P50K_BASE,
4054
"code-cushman-001": MODEL_P50K_BASE,
4155
"davinci-codex": MODEL_P50K_BASE,
4256
"cushman-codex": MODEL_P50K_BASE,
43-
// edit
57+
// edit (DEPRECATED)
4458
"text-davinci-edit-001": MODEL_P50K_EDIT,
4559
"code-davinci-edit-001": MODEL_P50K_EDIT,
46-
// embeddings
47-
"text-embedding-ada-002": MODEL_CL100K_BASE,
48-
// old embeddings
60+
// old embeddings (DEPRECATED)
4961
"text-similarity-davinci-001": MODEL_R50K_BASE,
5062
"text-similarity-curie-001": MODEL_R50K_BASE,
5163
"text-similarity-babbage-001": MODEL_R50K_BASE,
@@ -57,13 +69,21 @@ var MODEL_TO_ENCODING = map[string]string{
5769
"code-search-babbage-code-001": MODEL_R50K_BASE,
5870
"code-search-ada-code-001": MODEL_R50K_BASE,
5971
// open source
60-
"gpt2": "gpt2",
72+
"gpt2": MODEL_GPT2,
73+
"gpt-2": MODEL_GPT2, // Maintains consistency with gpt-4
6174
}
6275

6376
// MODEL_PREFIX_TO_ENCODING maps a model-name prefix to an encoding name, for
// model families whose concrete names carry a version or variant suffix.
// NOTE(review): the lookup code is not visible here; presumably each key is
// tested as a string prefix of the requested model name — confirm at the
// call site. Go map iteration order is unspecified, so overlapping prefixes
// must resolve to the same encoding.
var MODEL_PREFIX_TO_ENCODING = map[string]string{
6477
// chat
78+
"gpt-4o-": MODEL_O200K_BASE, // e.g., gpt-4o-2024-05-13
6579
"gpt-4-": MODEL_CL100K_BASE, // e.g., gpt-4-0314, etc., plus gpt-4-32k
6680
"gpt-3.5-turbo-": MODEL_CL100K_BASE, // e.g., gpt-3.5-turbo-0301, -0401, etc.
81+
"gpt-35-turbo-": MODEL_CL100K_BASE, // Azure deployment name
82+
// fine-tuned
83+
// NOTE(review): "ft:gpt-4" is also a string prefix of "ft:gpt-4o..." names,
// which would therefore resolve to cl100k_base — confirm this is intended.
"ft:gpt-4": MODEL_CL100K_BASE,
84+
"ft:gpt-3.5-turbo": MODEL_CL100K_BASE,
85+
"ft:davinci-002": MODEL_CL100K_BASE,
86+
"ft:babbage-002": MODEL_CL100K_BASE,
6787
}
6888

6989
var encodingMap map[string]*Encoding
@@ -98,6 +118,8 @@ func getEncoding(encodingName string) (*Encoding, error) {
98118

99119
func initEncoding(encodingName string) (*Encoding, error) {
100120
switch encodingName {
121+
case MODEL_O200K_BASE:
122+
return o200k_base()
101123
case MODEL_CL100K_BASE:
102124
return cl100k_base()
103125
case MODEL_P50K_BASE:
@@ -111,6 +133,33 @@ func initEncoding(encodingName string) (*Encoding, error) {
111133
}
112134
}
113135

136+
func o200k_base() (*Encoding, error) {
137+
ranks, err := bpeLoader.LoadTiktokenBpe("https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken")
138+
if err != nil {
139+
return nil, err
140+
}
141+
special_tokens := map[string]int{
142+
ENDOFTEXT: 199999,
143+
ENDOFPROMPT: 200018,
144+
}
145+
patStr := []string{
146+
`[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?`,
147+
`[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?`,
148+
`\p{N}{1,3}`,
149+
` ?[^\s\p{L}\p{N}]+[\r\n/]*`,
150+
`\s*[\r\n]+`,
151+
`\s+(?!\S)`,
152+
`\s+`,
153+
}
154+
155+
return &Encoding{
156+
Name: MODEL_O200K_BASE,
157+
PatStr: strings.Join(patStr, "|"),
158+
MergeableRanks: ranks,
159+
SpecialTokens: special_tokens,
160+
}, nil
161+
}
162+
114163
func cl100k_base() (*Encoding, error) {
115164
ranks, err := bpeLoader.LoadTiktokenBpe("https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken")
116165
if err != nil {

go.mod

+3-3
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
1-
module github.com/pkoukk/tiktoken-go
1+
module github.com/linux-do/tiktoken-go
22

33
go 1.19
44

55
require (
6-
github.com/dlclark/regexp2 v1.10.0
7-
github.com/google/uuid v1.3.0
6+
github.com/dlclark/regexp2 v1.11.0
7+
github.com/google/uuid v1.6.0
88
github.com/stretchr/testify v1.8.2
99
)
1010

go.sum

+4
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,12 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c
33
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
44
github.com/dlclark/regexp2 v1.10.0 h1:+/GIL799phkJqYW+3YbOd8LCcbHzT0Pbo8zl70MHsq0=
55
github.com/dlclark/regexp2 v1.10.0/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8=
6+
github.com/dlclark/regexp2 v1.11.0 h1:G/nrcoOa7ZXlpoa/91N3X7mM3r8eIlMBBJZvsz/mxKI=
7+
github.com/dlclark/regexp2 v1.11.0/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8=
68
github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I=
79
github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
10+
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
11+
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
812
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
913
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
1014
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=

test/benchmark_test.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ import (
77
"strings"
88
"testing"
99

10-
"github.com/pkoukk/tiktoken-go"
10+
"github.com/linux-do/tiktoken-go"
1111
)
1212

1313
func BenchmarkEncodingInFullLanguage(b *testing.B) {

test/test.txt

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
hallo world!,你好世界!,こんにちは世界!,안녕하세요 세계!,Привет мир!,¡Hola mundo!,Hallo Welt!,Bonjour le monde!,Ciao mondo!,Hej världen!,Hallo wereld!,Hallo verden!,Hallo wereld!,Hallo verden!
2-
gpt-4,gpt-3.5-turbo,text-davinci-003,text-davinci-002,text-davinci-001,text-curie-001,text-babbage-001,text-ada-001,davinci,curie,babbage,ada,code-davinci-002,code-davinci-001,code-cushman-002,code-cushman-001,davinci-codex,cushman-codex,text-davinci-edit-001,code-davinci-edit-001,text-embedding-ada-002,text-similarity-davinci-001
3-
cl100k_base,p50k_base,r50k_base
2+
gpt-4o,gpt-4-turbo,gpt-4,gpt-3.5-turbo,text-davinci-003,text-davinci-002,text-davinci-001,text-curie-001,text-babbage-001,text-ada-001,davinci,curie,babbage,ada,code-davinci-002,code-davinci-001,code-cushman-002,code-cushman-001,davinci-codex,cushman-codex,text-davinci-edit-001,code-davinci-edit-001,text-embedding-ada-002,text-similarity-davinci-001
3+
o200k_base,cl100k_base,p50k_base,r50k_base

test/token_num.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ import (
77
"os"
88
"strings"
99

10-
"github.com/pkoukk/tiktoken-go"
10+
"github.com/linux-do/tiktoken-go"
1111
)
1212

1313
// main

tiktoken.go

+4
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,10 @@ func (t *Tiktoken) Decode(tokens []int) string {
9090
return string(t.bpe.decodeNative(tokens))
9191
}
9292

93+
func (t *Tiktoken) EncoderName() string {
94+
return t.pbeEncoding.Name
95+
}
96+
9397
func (t *Tiktoken) SpecialTokenRegex(disallowedSpecialSet map[string]any) *regexp2.Regexp {
9498
specialRegexStrs := make([]string, 0, len(disallowedSpecialSet))
9599
for k := range disallowedSpecialSet {

0 commit comments

Comments
 (0)