-
Notifications
You must be signed in to change notification settings - Fork 651
Added JS bindings for tokenizers library #13566
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Please keep this file formatted by running:
# ~~~
# cmake-format -i CMakeLists.txt
# ~~~

cmake_minimum_required(VERSION 3.29)

if(NOT CMAKE_CXX_STANDARD)
  set(CMAKE_CXX_STANDARD 17)
endif()

# This target binds C++ to JavaScript via embind, which only exists in the
# Emscripten toolchain; fail fast with a clear message otherwise.
if(NOT EMSCRIPTEN)
  message(FATAL_ERROR "Emscripten is required to build this target")
endif()

# Source root directory for executorch.
if(NOT EXECUTORCH_ROOT)
  set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..)
endif()

include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
set(_common_compile_options -Wno-deprecated-declarations -fPIC -Wall -Werror)
set(_common_include_directories ${EXECUTORCH_ROOT}/..)

set(link_libraries)
list(APPEND link_libraries embind tokenizers::tokenizers)

# OBJECT library: consumers link tokenizers_wasm into their own Emscripten
# executable target (see the README for a MODULARIZE example).
add_library(tokenizers_wasm OBJECT tokenizers.cpp)

target_compile_options(tokenizers_wasm PUBLIC ${_common_compile_options})
target_include_directories(
  tokenizers_wasm PUBLIC ${_common_include_directories}
)

target_link_libraries(tokenizers_wasm PUBLIC ${link_libraries})
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
# Tokenizers JavaScript Bindings | ||
|
||
This directory contains the JavaScript bindings for the [LLM Tokenizers](../../llm/README.md#tokenizer) library. | ||
|
||
## Building | ||
|
||
To build Tokenizers for Wasm, make sure to use the `emcmake cmake` command and to have `EXECUTORCH_BUILD_TOKENIZERS_WASM` and `EXECUTORCH_BUILD_EXTENSION_LLM` enabled. For example: | ||
|
||
```bash | ||
# Configure the build with the Emscripten environment variables | ||
emcmake cmake . -DEXECUTORCH_BUILD_TOKENIZERS_WASM=ON \ | ||
-DEXECUTORCH_BUILD_EXTENSION_LLM=ON \ | ||
-DCMAKE_BUILD_TYPE=Release \ | ||
-Bcmake-out-wasm | ||
|
||
# Build the Wasm extension | ||
cmake --build cmake-out-wasm --target tokenizers_wasm -j32 | ||
``` | ||
|
||
Emscripten modules are loaded into the global `Module` object by default. This means you cannot have multiple modules in the same page. If you are also using the ExecuTorch Wasm bindings, it is recommended to use the `MODULARIZE` option to avoid conflicts. | ||
|
||
In your CMakeLists.txt, add the following lines: | ||
|
||
```cmake | ||
add_executable(tokenizers_wasm_lib) # Emscripten outputs this as a JS and Wasm file | ||
target_link_libraries(tokenizers_wasm_lib PRIVATE tokenizers_wasm) | ||
target_link_options(tokenizers_wasm_lib PRIVATE -sMODULARIZE=1 -sEXPORT_NAME=loadTokenizers) # If EXPORT_NAME is not set, the default is Module, which will conflict with ExecuTorch | ||
``` | ||
|
||
You can then access the module with `mod = await loadTokenizers();` or `loadTokenizers().then(mod => { /* ... */ });`. | ||
|
||
For example, to load the module in an HTML file, you can use the following:
|
||
```html | ||
<script src="tokenizers_wasm_lib.js"></script> | ||
<script> | ||
var Module = { | ||
onRuntimeInitialized: async function() { | ||
// Load Tokenizers Module after ExecuTorch Module is initialized | ||
const tokenizersModule = await loadTokenizers(); | ||
const sp = new tokenizersModule.SpTokenizer(); | ||
// ... | ||
} | ||
} | ||
</script> | ||
<script src="executorch_wasm_lib.js"></script> | ||
``` | ||
|
||
You can read more about Modularized Output in the [Emscripten docs](https://emscripten.org/docs/compiling/Modularized-Output.html). | ||
|
||
## JavaScript API | ||
|
||
### Supported Tokenizers | ||
- `HFTokenizer` | ||
- `SpTokenizer` | ||
- `Tiktoken` | ||
- `Llama2cTokenizer` | ||
|
||
### Tokenizer API | ||
- `load(data)`: Load tokenizer data from a file or a buffer. | ||
- `encode(text, bos=0, eos=0)`: Encode a string into a list of tokens with the number of bos tokens to prepend and eos tokens to append to the result. | ||
- `decode(tokens)`: Decode a list of tokens into a string. | ||
- `vocabSize`: The number of tokens in the vocabulary. | ||
- `eosTok`: The end-of-sequence token. | ||
- `bosTok`: The beginning-of-sequence token.
- `isLoaded`: Whether the tokenizer is loaded. |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,186 @@ | ||
/* | ||
* Copyright (c) Meta Platforms, Inc. and affiliates. | ||
* All rights reserved. | ||
* | ||
* This source code is licensed under the BSD-style license found in the | ||
* LICENSE file in the root directory of this source tree. | ||
*/ | ||
|
||
#include <emscripten.h> | ||
#include <emscripten/bind.h> | ||
#include <executorch/runtime/platform/compiler.h> | ||
#include <pytorch/tokenizers/hf_tokenizer.h> | ||
#include <pytorch/tokenizers/llama2c_tokenizer.h> | ||
#include <pytorch/tokenizers/sentencepiece.h> | ||
#include <pytorch/tokenizers/tekken.h> | ||
#include <pytorch/tokenizers/tiktoken.h> | ||
#include <cstdio> | ||
|
||
using namespace emscripten; | ||
using tokenizers::Error; | ||
using tokenizers::HFTokenizer; | ||
using tokenizers::Llama2cTokenizer; | ||
using tokenizers::SPTokenizer; | ||
using tokenizers::Tekken; | ||
using tokenizers::Tiktoken; | ||
using tokenizers::Tokenizer; | ||
|
||
// Throws a JavaScript error of type `errorType` with a printf-style
// formatted message; never returns. A fixed 256-byte stack buffer covers the
// common case; longer messages are reformatted into an exact-size heap
// string. snprintf may return a negative value on encoding errors, so the
// result is checked for sign before comparing against the buffer size —
// comparing a negative int against size_t would silently wrap to a huge
// unsigned value and drive resize() with a negative length (UB).
#define THROW_JS_ERROR(errorType, message, ...)                          \
  ({                                                                     \
    char msg_buf[256];                                                   \
    int len = snprintf(msg_buf, sizeof(msg_buf), message, ##__VA_ARGS__); \
    if (len >= 0 && static_cast<size_t>(len) < sizeof(msg_buf)) {        \
      EM_ASM(throw new errorType(UTF8ToString($0)), msg_buf);            \
    } else if (len >= 0) {                                               \
      /* Message was truncated; reformat into a buffer of exact size. */ \
      std::string msg;                                                   \
      msg.resize(len);                                                   \
      snprintf(&msg[0], len + 1, message, ##__VA_ARGS__);                \
      EM_ASM(throw new errorType(UTF8ToString($0)), msg.c_str());        \
    } else {                                                             \
      /* Formatting failed; fall back to the raw format string. */       \
      EM_ASM(throw new errorType(UTF8ToString($0)), message);            \
    }                                                                    \
    __builtin_unreachable();                                             \
  })
|
||
/// Throws a JavaScript Error with the provided message if `error` is not `Ok`.
/// Note the unusual spelling `if ET_UNLIKELY ((error) != ...)`: ET_UNLIKELY
/// wraps the parenthesized condition, hinting the compiler that the error
/// branch is cold.
#define THROW_IF_ERROR(error, message, ...)                \
  ({                                                       \
    if ET_UNLIKELY ((error) != Error::Ok) {                \
      THROW_JS_ERROR(Error, message, ##__VA_ARGS__);       \
    }                                                      \
  })
|
||
namespace executorch { | ||
namespace extension { | ||
namespace wasm { | ||
namespace tokenizers { | ||
|
||
namespace { | ||
|
||
// X-macro listing every tokenizer type exposed to JavaScript: applies the
// macro `_` to each concrete tokenizer class. Used to stamp out one embind
// registration per type in EMSCRIPTEN_BINDINGS below.
#define JS_FORALL_TOKENIZERS(_) \
  _(HFTokenizer)                \
  _(Tiktoken)                   \
  _(SPTokenizer)                \
  _(Llama2cTokenizer)           \
  _(Tekken)
|
||
/** | ||
* EXPERIMENTAL: JavaScript wrapper for Tokenizer. | ||
*/ | ||
template <typename T> | ||
class ET_EXPERIMENTAL JsTokenizer { | ||
static_assert( | ||
std::is_base_of<Tokenizer, T>::value, | ||
"T must be a subclass of Tokenizer"); | ||
|
||
public: | ||
JsTokenizer() : tokenizer_(std::make_unique<T>()) {} | ||
JsTokenizer(const JsTokenizer&) = delete; | ||
JsTokenizer& operator=(const JsTokenizer&) = delete; | ||
JsTokenizer(JsTokenizer&&) = default; | ||
JsTokenizer& operator=(JsTokenizer&&) = default; | ||
|
||
void load_from_uint8_array(val data) { | ||
// Tokenizer API can't load from a buffer, so we need to write the buffer to | ||
// a temporary file and load from there. | ||
static const char* tmpFileName = "tokenizer_input_buffer.tmp"; | ||
FILE* tmp_file = fopen(tmpFileName, "wb"); | ||
if (tmp_file == nullptr) { | ||
THROW_JS_ERROR(Error, "Failed to open file"); | ||
} | ||
size_t length = data["length"].as<size_t>(); | ||
std::vector<uint8_t> buffer(length); | ||
val memory_view = val(typed_memory_view(length, buffer.data())); | ||
memory_view.call<void>("set", data); | ||
fwrite(buffer.data(), sizeof(uint8_t), length, tmp_file); | ||
fclose(tmp_file); | ||
Error error = tokenizer_->load(tmpFileName); | ||
THROW_IF_ERROR(error, "Failed to load tokenizer"); | ||
remove(tmpFileName); | ||
} | ||
|
||
void load(val data) { | ||
if (data.isString()) { | ||
Error error = tokenizer_->load(data.as<std::string>()); | ||
THROW_IF_ERROR(error, "Failed to load tokenizer"); | ||
} else if (data.instanceof (val::global("Uint8Array"))) { | ||
return load_from_uint8_array(data); | ||
} else if (data.instanceof (val::global("ArrayBuffer"))) { | ||
return load_from_uint8_array(val::global("Uint8Array").new_(data)); | ||
} else { | ||
THROW_JS_ERROR( | ||
TypeError, | ||
"Unsupported data type: %s", | ||
data.typeOf().as<std::string>().c_str()); | ||
} | ||
} | ||
|
||
val encode(const std::string& text, int8_t bos, int8_t eos) const { | ||
auto res = tokenizer_->encode(text, bos, eos); | ||
THROW_IF_ERROR(res.error(), "Failed to encode text"); | ||
return val::array(res.get().begin(), res.get().end()); | ||
} | ||
|
||
val encode(const std::string& text, int8_t bos) const { | ||
return encode(text, bos, 0); | ||
} | ||
|
||
val encode(const std::string& text) const { | ||
return encode(text, 0); | ||
} | ||
|
||
std::string decode(uint64_t prev, uint64_t current) const { | ||
auto res = tokenizer_->decode(prev, current); | ||
THROW_IF_ERROR(res.error(), "Failed to decode token"); | ||
return res.get(); | ||
} | ||
|
||
uint64_t vocab_size() const { | ||
return tokenizer_->vocab_size(); | ||
} | ||
|
||
uint64_t bos_tok() const { | ||
return tokenizer_->bos_tok(); | ||
} | ||
|
||
uint64_t eos_tok() const { | ||
return tokenizer_->eos_tok(); | ||
} | ||
|
||
bool is_loaded() const { | ||
return tokenizer_->is_loaded(); | ||
} | ||
|
||
private: | ||
std::unique_ptr<T> tokenizer_; | ||
}; | ||
|
||
} // namespace | ||
|
||
// Registers every tokenizer in JS_FORALL_TOKENIZERS with embind under its
// C++ class name, exposing load/encode/decode plus read-only properties.
// Each template instantiation must be bound individually; the X-macro keeps
// that boilerplate to one definition.
EMSCRIPTEN_BINDINGS(TokenizerModule) {
#define JS_BIND_TOKENIZER(NAME)                                           \
  class_<JsTokenizer<NAME>>(#NAME)                                        \
      .constructor<>()                                                    \
      .function("load", &JsTokenizer<NAME>::load)                         \
      .function(                                                          \
          "encode",                                                       \
          select_overload<val(const std::string&) const>(                 \
              &JsTokenizer<NAME>::encode))                                \
      .function(                                                          \
          "encode",                                                       \
          select_overload<val(const std::string&, int8_t) const>(         \
              &JsTokenizer<NAME>::encode))                                \
      .function(                                                          \
          "encode",                                                       \
          select_overload<val(const std::string&, int8_t, int8_t) const>( \
              &JsTokenizer<NAME>::encode))                                \
      .function("decode", &JsTokenizer<NAME>::decode)                     \
      .property("vocabSize", &JsTokenizer<NAME>::vocab_size)              \
      .property("bosTok", &JsTokenizer<NAME>::bos_tok)                    \
      .property("eosTok", &JsTokenizer<NAME>::eos_tok)                    \
      .property("isLoaded", &JsTokenizer<NAME>::is_loaded);
  JS_FORALL_TOKENIZERS(JS_BIND_TOKENIZER)
}
|
||
} // namespace tokenizers | ||
} // namespace wasm | ||
} // namespace extension | ||
} // namespace executorch |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I wonder if we have too many cmake options lmao.