Skip to content

Commit 290a8f5

Browse files
Conarnaragrima1304
authored and committed
Added JS bindings for tokenizers library (pytorch#13566)
### Summary Added JavaScript bindings for the tokenizer library so that we can use them to run LLMs in a web browser. ### Test plan I will add end to end tests later.
1 parent f9593d2 commit 290a8f5

File tree

5 files changed

+306
-0
lines changed

5 files changed

+306
-0
lines changed

CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -880,6 +880,10 @@ if(EXECUTORCH_BUILD_WASM)
880880
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/wasm)
881881
endif()
882882

883+
# Build the JavaScript bindings for the tokenizers library (Wasm/Emscripten).
if(EXECUTORCH_BUILD_TOKENIZERS_WASM)
  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/wasm/tokenizers)
endif()
886+
883887
if(EXECUTORCH_BUILD_EXTENSION_TRAINING)
884888
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/training)
885889
list(APPEND _executorch_extensions extension_training)
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Please keep this file formatted by running:
# ~~~
# cmake-format -i CMakeLists.txt
# ~~~

cmake_minimum_required(VERSION 3.29)

if(NOT CMAKE_CXX_STANDARD)
  set(CMAKE_CXX_STANDARD 17)
endif()

# These bindings target WebAssembly only; a native toolchain cannot build them.
if(NOT EMSCRIPTEN)
  message(FATAL_ERROR "Emscripten is required to build this target")
endif()

# Source root directory for executorch.
if(NOT EXECUTORCH_ROOT)
  set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..)
endif()

include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)

set(_common_compile_options -Wno-deprecated-declarations -fPIC -Wall -Werror)
set(_common_include_directories ${EXECUTORCH_ROOT}/..)

set(link_libraries)
list(APPEND link_libraries embind tokenizers::tokenizers)

add_library(tokenizers_wasm OBJECT tokenizers.cpp)

# PRIVATE: warnings-as-errors (-Werror) is a build policy for this target and
# must not propagate to consumers that link against the object library.
target_compile_options(tokenizers_wasm PRIVATE ${_common_compile_options})
target_include_directories(
  tokenizers_wasm PUBLIC ${_common_include_directories}
)

target_link_libraries(tokenizers_wasm PUBLIC ${link_libraries})
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
# Tokenizers JavaScript Bindings
2+
3+
This directory contains the JavaScript bindings for the [LLM Tokenizers](../../llm/README.md#tokenizer) library.
4+
5+
## Building
6+
7+
To build Tokenizers for Wasm, make sure to use the `emcmake cmake` command and to have `EXECUTORCH_BUILD_TOKENIZERS_WASM` and `EXECUTORCH_BUILD_EXTENSION_LLM` enabled. For example:
8+
9+
```bash
10+
# Configure the build with the Emscripten environment variables
11+
emcmake cmake . -DEXECUTORCH_BUILD_TOKENIZERS_WASM=ON \
12+
-DEXECUTORCH_BUILD_EXTENSION_LLM=ON \
13+
-DCMAKE_BUILD_TYPE=Release \
14+
-Bcmake-out-wasm
15+
16+
# Build the Wasm extension
17+
cmake --build cmake-out-wasm --target tokenizers_wasm -j32
18+
```
19+
20+
Emscripten modules are loaded into the global `Module` object by default. This means you cannot have multiple modules in the same page. If you are also using the ExecuTorch Wasm bindings, it is recommended to use the `MODULARIZE` option to avoid conflicts.
21+
22+
In your CMakeLists.txt, add the following lines:
23+
24+
```cmake
25+
add_executable(tokenizers_wasm_lib) # Emscripten outputs this as a JS and Wasm file
26+
target_link_libraries(tokenizers_wasm_lib PRIVATE tokenizers_wasm)
27+
target_link_options(tokenizers_wasm_lib PRIVATE -sMODULARIZE=1 -sEXPORT_NAME=loadTokenizers) # If EXPORT_NAME is not set, the default is Module, which will conflict with ExecuTorch
28+
```
29+
30+
You can then access the module with `mod = await loadTokenizers();` or `loadTokenizers().then(mod => { /* ... */ });`.
31+
32+
For example, to load the module in an HTML file, you can use the following:
33+
34+
```html
35+
<script src="tokenizers_wasm_lib.js"></script>
36+
<script>
37+
var Module = {
38+
onRuntimeInitialized: async function() {
39+
// Load Tokenizers Module after ExecuTorch Module is initialized
40+
const tokenizersModule = await loadTokenizers();
41+
const sp = new tokenizersModule.SPTokenizer();
42+
// ...
43+
}
44+
}
45+
</script>
46+
<script src="executorch_wasm_lib.js"></script>
47+
```
48+
49+
You can read more about Modularized Output in the [Emscripten docs](https://emscripten.org/docs/compiling/Modularized-Output.html).
50+
51+
## JavaScript API
52+
53+
### Supported Tokenizers
54+
- `HFTokenizer`
- `SPTokenizer`
- `Tiktoken`
- `Llama2cTokenizer`
- `Tekken`
58+
59+
### Tokenizer API
60+
- `load(data)`: Load tokenizer data from a file or a buffer.
61+
- `encode(text, bos=0, eos=0)`: Encode a string into a list of tokens with the number of bos tokens to prepend and eos tokens to append to the result.
62+
- `decode(tokens)`: Decode a list of tokens into a string.
63+
- `vocabSize`: The number of tokens in the vocabulary.
64+
- `eosTok`: The end-of-sequence token.
65+
- `bosTok`: The beginning-of-sequence token.
66+
- `isLoaded`: Whether the tokenizer is loaded.
Lines changed: 186 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,186 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#include <emscripten.h>
10+
#include <emscripten/bind.h>
11+
#include <executorch/runtime/platform/compiler.h>
12+
#include <pytorch/tokenizers/hf_tokenizer.h>
13+
#include <pytorch/tokenizers/llama2c_tokenizer.h>
14+
#include <pytorch/tokenizers/sentencepiece.h>
15+
#include <pytorch/tokenizers/tekken.h>
16+
#include <pytorch/tokenizers/tiktoken.h>
17+
#include <cstdio>
18+
19+
using namespace emscripten;
20+
using tokenizers::Error;
21+
using tokenizers::HFTokenizer;
22+
using tokenizers::Llama2cTokenizer;
23+
using tokenizers::SPTokenizer;
24+
using tokenizers::Tekken;
25+
using tokenizers::Tiktoken;
26+
using tokenizers::Tokenizer;
27+
28+
// Throws a JavaScript exception of type `errorType` with a printf-style
// formatted message. Short messages use a fixed stack buffer; longer ones are
// reformatted into an exactly-sized std::string.
//
// Fix: snprintf returns int and may be negative on encoding failure; comparing
// it directly against sizeof(msg_buf) (size_t) converted a negative value to a
// huge unsigned, taking the overflow branch and calling msg.resize() with a
// bogus size. Guard on len >= 0 first and cast before comparing.
#define THROW_JS_ERROR(errorType, message, ...)                            \
  ({                                                                       \
    char msg_buf[256];                                                     \
    int len = snprintf(msg_buf, sizeof(msg_buf), message, ##__VA_ARGS__);  \
    if (len >= 0 && static_cast<size_t>(len) < sizeof(msg_buf)) {          \
      EM_ASM(throw new errorType(UTF8ToString($0)), msg_buf);              \
    } else if (len >= 0) {                                                 \
      /* Message was truncated; reformat into a buffer of the exact size. */ \
      std::string msg;                                                     \
      msg.resize(len);                                                     \
      snprintf(&msg[0], len + 1, message, ##__VA_ARGS__);                  \
      EM_ASM(throw new errorType(UTF8ToString($0)), msg.c_str());          \
    } else {                                                               \
      /* snprintf itself failed; throw with a generic message. */          \
      EM_ASM(throw new errorType("Unknown error"));                        \
    }                                                                      \
    __builtin_unreachable();                                               \
  })

/// Throws a JavaScript Error with the provided message if `error` is not `Ok`.
#define THROW_IF_ERROR(error, message, ...)                                \
  ({                                                                       \
    if ET_UNLIKELY ((error) != Error::Ok) {                                \
      THROW_JS_ERROR(Error, message, ##__VA_ARGS__);                       \
    }                                                                      \
  })
50+
51+
namespace executorch {
52+
namespace extension {
53+
namespace wasm {
54+
namespace tokenizers {
55+
56+
namespace {
57+
58+
// X-macro listing every tokenizer type exposed to JavaScript; expanded with
// JS_BIND_TOKENIZER inside EMSCRIPTEN_BINDINGS to register each binding.
#define JS_FORALL_TOKENIZERS(_) \
  _(HFTokenizer)                \
  _(Tiktoken)                   \
  _(SPTokenizer)                \
  _(Llama2cTokenizer)           \
  _(Tekken)
64+
65+
/**
66+
* EXPERIMENTAL: JavaScript wrapper for Tokenizer.
67+
*/
68+
template <typename T>
69+
class ET_EXPERIMENTAL JsTokenizer {
70+
static_assert(
71+
std::is_base_of<Tokenizer, T>::value,
72+
"T must be a subclass of Tokenizer");
73+
74+
public:
75+
JsTokenizer() : tokenizer_(std::make_unique<T>()) {}
76+
JsTokenizer(const JsTokenizer&) = delete;
77+
JsTokenizer& operator=(const JsTokenizer&) = delete;
78+
JsTokenizer(JsTokenizer&&) = default;
79+
JsTokenizer& operator=(JsTokenizer&&) = default;
80+
81+
void load_from_uint8_array(val data) {
82+
// Tokenizer API can't load from a buffer, so we need to write the buffer to
83+
// a temporary file and load from there.
84+
static const char* tmpFileName = "tokenizer_input_buffer.tmp";
85+
FILE* tmp_file = fopen(tmpFileName, "wb");
86+
if (tmp_file == nullptr) {
87+
THROW_JS_ERROR(Error, "Failed to open file");
88+
}
89+
size_t length = data["length"].as<size_t>();
90+
std::vector<uint8_t> buffer(length);
91+
val memory_view = val(typed_memory_view(length, buffer.data()));
92+
memory_view.call<void>("set", data);
93+
fwrite(buffer.data(), sizeof(uint8_t), length, tmp_file);
94+
fclose(tmp_file);
95+
Error error = tokenizer_->load(tmpFileName);
96+
THROW_IF_ERROR(error, "Failed to load tokenizer");
97+
remove(tmpFileName);
98+
}
99+
100+
void load(val data) {
101+
if (data.isString()) {
102+
Error error = tokenizer_->load(data.as<std::string>());
103+
THROW_IF_ERROR(error, "Failed to load tokenizer");
104+
} else if (data.instanceof (val::global("Uint8Array"))) {
105+
return load_from_uint8_array(data);
106+
} else if (data.instanceof (val::global("ArrayBuffer"))) {
107+
return load_from_uint8_array(val::global("Uint8Array").new_(data));
108+
} else {
109+
THROW_JS_ERROR(
110+
TypeError,
111+
"Unsupported data type: %s",
112+
data.typeOf().as<std::string>().c_str());
113+
}
114+
}
115+
116+
val encode(const std::string& text, int8_t bos, int8_t eos) const {
117+
auto res = tokenizer_->encode(text, bos, eos);
118+
THROW_IF_ERROR(res.error(), "Failed to encode text");
119+
return val::array(res.get().begin(), res.get().end());
120+
}
121+
122+
val encode(const std::string& text, int8_t bos) const {
123+
return encode(text, bos, 0);
124+
}
125+
126+
val encode(const std::string& text) const {
127+
return encode(text, 0);
128+
}
129+
130+
std::string decode(uint64_t prev, uint64_t current) const {
131+
auto res = tokenizer_->decode(prev, current);
132+
THROW_IF_ERROR(res.error(), "Failed to decode token");
133+
return res.get();
134+
}
135+
136+
uint64_t vocab_size() const {
137+
return tokenizer_->vocab_size();
138+
}
139+
140+
uint64_t bos_tok() const {
141+
return tokenizer_->bos_tok();
142+
}
143+
144+
uint64_t eos_tok() const {
145+
return tokenizer_->eos_tok();
146+
}
147+
148+
bool is_loaded() const {
149+
return tokenizer_->is_loaded();
150+
}
151+
152+
private:
153+
std::unique_ptr<T> tokenizer_;
154+
};
155+
156+
} // namespace
157+
158+
// Registers every tokenizer in JS_FORALL_TOKENIZERS with embind. Each is
// exported under its C++ type name (e.g. "SPTokenizer") with a default
// constructor, load/encode/decode methods (encode has three overloads for
// optional bos/eos counts), and read-only properties.
EMSCRIPTEN_BINDINGS(TokenizerModule) {
#define JS_BIND_TOKENIZER(NAME)                                           \
  class_<JsTokenizer<NAME>>(#NAME)                                        \
      .constructor<>()                                                    \
      .function("load", &JsTokenizer<NAME>::load)                         \
      .function(                                                          \
          "encode",                                                       \
          select_overload<val(const std::string&) const>(                 \
              &JsTokenizer<NAME>::encode))                                \
      .function(                                                          \
          "encode",                                                       \
          select_overload<val(const std::string&, int8_t) const>(         \
              &JsTokenizer<NAME>::encode))                                \
      .function(                                                          \
          "encode",                                                       \
          select_overload<val(const std::string&, int8_t, int8_t) const>( \
              &JsTokenizer<NAME>::encode))                                \
      .function("decode", &JsTokenizer<NAME>::decode)                     \
      .property("vocabSize", &JsTokenizer<NAME>::vocab_size)              \
      .property("bosTok", &JsTokenizer<NAME>::bos_tok)                    \
      .property("eosTok", &JsTokenizer<NAME>::eos_tok)                    \
      .property("isLoaded", &JsTokenizer<NAME>::is_loaded);
  JS_FORALL_TOKENIZERS(JS_BIND_TOKENIZER)
}
182+
183+
} // namespace tokenizers
184+
} // namespace wasm
185+
} // namespace extension
186+
} // namespace executorch

tools/cmake/preset/default.cmake

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,10 @@ define_overridable_option(
155155
define_overridable_option(
156156
EXECUTORCH_BUILD_WASM "Build the ExecuTorch JavaScript API" BOOL OFF
157157
)
158+
# Off by default: requires the Emscripten toolchain to build.
define_overridable_option(
  EXECUTORCH_BUILD_TOKENIZERS_WASM "Build the JavaScript Tokenizers API" BOOL
  OFF
)
158162

159163
if(EXECUTORCH_BUILD_ARM_BAREMETAL)
160164
set(_default_executorch_build_pthreadpool OFF)
@@ -333,6 +337,11 @@ check_required_options_on(
333337
EXECUTORCH_BUILD_EXTENSION_TENSOR
334338
)
335339

340+
# The tokenizers Wasm bindings build against the LLM extension's tokenizer
# library, so that extension must be enabled too.
check_required_options_on(
  IF_ON EXECUTORCH_BUILD_TOKENIZERS_WASM REQUIRES
  EXECUTORCH_BUILD_EXTENSION_LLM
)
344+
336345
if(NOT EXISTS ${EXECUTORCH_PAL_DEFAULT_FILE_PATH})
337346
message(
338347
FATAL_ERROR

0 commit comments

Comments
 (0)