Skip to content

Commit 290a8f5

Browse files
Conarnaragrima1304
authored and committed
Added JS bindings for tokenizers library (pytorch#13566)
### Summary Added JavaScript bindings for the tokenizer library so that we can use them to run LLMs in a web browser. ### Test plan I will add end to end tests later.
1 parent f9593d2 commit 290a8f5

File tree

5 files changed

+306
-0
lines changed

5 files changed

+306
-0
lines changed

CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -880,6 +880,10 @@ if(EXECUTORCH_BUILD_WASM)
880880
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/wasm)
881881
endif()
882882

883+
# Build the JavaScript bindings for the tokenizers library (Wasm/Emscripten).
if(EXECUTORCH_BUILD_TOKENIZERS_WASM)
  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/wasm/tokenizers)
endif()
886+
883887
if(EXECUTORCH_BUILD_EXTENSION_TRAINING)
884888
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/training)
885889
list(APPEND _executorch_extensions extension_training)
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Please keep this file formatted by running:
# ~~~
# cmake-format -i CMakeLists.txt
# ~~~

cmake_minimum_required(VERSION 3.29)

if(NOT CMAKE_CXX_STANDARD)
  set(CMAKE_CXX_STANDARD 17)
endif()

# These bindings target WebAssembly only; a native toolchain cannot build them.
if(NOT EMSCRIPTEN)
  message(FATAL_ERROR "Emscripten is required to build this target")
endif()

# Source root directory for executorch.
if(NOT EXECUTORCH_ROOT)
  set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..)
endif()

include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)

set(_common_compile_options -Wno-deprecated-declarations -fPIC -Wall -Werror)
set(_common_include_directories ${EXECUTORCH_ROOT}/..)

set(link_libraries)
list(APPEND link_libraries embind tokenizers::tokenizers)

add_library(tokenizers_wasm OBJECT tokenizers.cpp)

# PRIVATE: warnings-as-errors (-Werror) is a build policy for this target and
# must not propagate to consumers that link against the object library.
target_compile_options(tokenizers_wasm PRIVATE ${_common_compile_options})
target_include_directories(
  tokenizers_wasm PUBLIC ${_common_include_directories}
)

target_link_libraries(tokenizers_wasm PUBLIC ${link_libraries})
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
# Tokenizers JavaScript Bindings
2+
3+
This directory contains the JavaScript bindings for the [LLM Tokenizers](../../llm/README.md#tokenizer) library.
4+
5+
## Building
6+
7+
To build Tokenizers for Wasm, make sure to use the `emcmake cmake` command and to have `EXECUTORCH_BUILD_TOKENIZERS_WASM` and `EXECUTORCH_BUILD_EXTENSION_LLM` enabled. For example:
8+
9+
```bash
10+
# Configure the build with the Emscripten environment variables
11+
emcmake cmake . -DEXECUTORCH_BUILD_TOKENIZERS_WASM=ON \
12+
-DEXECUTORCH_BUILD_EXTENSION_LLM=ON \
13+
-DCMAKE_BUILD_TYPE=Release \
14+
-Bcmake-out-wasm
15+
16+
# Build the Wasm extension
17+
cmake --build cmake-out-wasm --target tokenizers_wasm -j32
18+
```
19+
20+
Emscripten modules are loaded into the global `Module` object by default. This means you cannot have multiple modules in the same page. If you are also using the ExecuTorch Wasm bindings, it is recommended to use the `MODULARIZE` option to avoid conflicts.
21+
22+
In your CMakeLists.txt, add the following lines:
23+
24+
```cmake
25+
add_executable(tokenizers_wasm_lib) # Emscripten outputs this as a JS and Wasm file
26+
target_link_libraries(tokenizers_wasm_lib PRIVATE tokenizers_wasm)
27+
target_link_options(tokenizers_wasm_lib PRIVATE -sMODULARIZE=1 -sEXPORT_NAME=loadTokenizers) # If EXPORT_NAME is not set, the default is Module, which will conflict with ExecuTorch
28+
```
29+
30+
You can then access the module with `mod = await loadTokenizers();` or `loadTokenizers().then(mod => { /* ... */ });`.
31+
32+
For example, to load the module in an HTML file, you can use the following:
33+
34+
```html
35+
<script src="tokenizers_wasm_lib.js"></script>
36+
<script>
37+
var Module = {
38+
onRuntimeInitialized: async function() {
39+
// Load Tokenizers Module after ExecuTorch Module is initialized
40+
const tokenizersModule = await loadTokenizers();
41+
const sp = new tokenizersModule.SPTokenizer();
42+
// ...
43+
}
44+
}
45+
</script>
46+
<script src="executorch_wasm_lib.js"></script>
47+
```
48+
49+
You can read more about Modularized Output in the [Emscripten docs](https://emscripten.org/docs/compiling/Modularized-Output.html).
50+
51+
## JavaScript API
52+
53+
### Supported Tokenizers
54+
- `HFTokenizer`
- `SPTokenizer`
- `Tiktoken`
- `Llama2cTokenizer`
- `Tekken`
58+
59+
### Tokenizer API
60+
- `load(data)`: Load tokenizer data from a file or a buffer.
61+
- `encode(text, bos=0, eos=0)`: Encode a string into a list of tokens with the number of bos tokens to prepend and eos tokens to append to the result.
62+
- `decode(tokens)`: Decode a list of tokens into a string.
63+
- `vocabSize`: The number of tokens in the vocabulary.
64+
- `eosTok`: The end-of-sequence token.
65+
- `bosTok`: The beginning-of-sequence token.
66+
- `isLoaded`: Whether the tokenizer is loaded.
Lines changed: 186 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,186 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#include <emscripten.h>
10+
#include <emscripten/bind.h>
11+
#include <executorch/runtime/platform/compiler.h>
12+
#include <pytorch/tokenizers/hf_tokenizer.h>
13+
#include <pytorch/tokenizers/llama2c_tokenizer.h>
14+
#include <pytorch/tokenizers/sentencepiece.h>
15+
#include <pytorch/tokenizers/tekken.h>
16+
#include <pytorch/tokenizers/tiktoken.h>
17+
#include <cstdio>
18+
19+
using namespace emscripten;
20+
using tokenizers::Error;
21+
using tokenizers::HFTokenizer;
22+
using tokenizers::Llama2cTokenizer;
23+
using tokenizers::SPTokenizer;
24+
using tokenizers::Tekken;
25+
using tokenizers::Tiktoken;
26+
using tokenizers::Tokenizer;
27+
28+
// Throws a JavaScript exception of type `errorType` with a printf-style
// formatted message. Short messages use a fixed stack buffer; longer ones are
// reformatted into an exactly-sized std::string.
//
// Fix: snprintf returns int and may be negative on encoding failure; comparing
// it directly against sizeof(msg_buf) (size_t) converted a negative value to a
// huge unsigned, taking the overflow branch and calling msg.resize() with a
// bogus size. Guard on len >= 0 first and cast before comparing.
#define THROW_JS_ERROR(errorType, message, ...)                            \
  ({                                                                       \
    char msg_buf[256];                                                     \
    int len = snprintf(msg_buf, sizeof(msg_buf), message, ##__VA_ARGS__);  \
    if (len >= 0 && static_cast<size_t>(len) < sizeof(msg_buf)) {          \
      EM_ASM(throw new errorType(UTF8ToString($0)), msg_buf);              \
    } else if (len >= 0) {                                                 \
      /* Message was truncated; reformat into a buffer of the exact size. */ \
      std::string msg;                                                     \
      msg.resize(len);                                                     \
      snprintf(&msg[0], len + 1, message, ##__VA_ARGS__);                  \
      EM_ASM(throw new errorType(UTF8ToString($0)), msg.c_str());          \
    } else {                                                               \
      /* snprintf itself failed; throw with a generic message. */          \
      EM_ASM(throw new errorType("Unknown error"));                        \
    }                                                                      \
    __builtin_unreachable();                                               \
  })

/// Throws a JavaScript Error with the provided message if `error` is not `Ok`.
#define THROW_IF_ERROR(error, message, ...)                                \
  ({                                                                       \
    if ET_UNLIKELY ((error) != Error::Ok) {                                \
      THROW_JS_ERROR(Error, message, ##__VA_ARGS__);                       \
    }                                                                      \
  })
50+
51+
namespace executorch {
52+
namespace extension {
53+
namespace wasm {
54+
namespace tokenizers {
55+
56+
namespace {
57+
58+
// X-macro listing every tokenizer type exposed to JavaScript; expanded with
// JS_BIND_TOKENIZER inside EMSCRIPTEN_BINDINGS to register each binding.
#define JS_FORALL_TOKENIZERS(_) \
  _(HFTokenizer)                \
  _(Tiktoken)                   \
  _(SPTokenizer)                \
  _(Llama2cTokenizer)           \
  _(Tekken)
64+
65+
/**
66+
* EXPERIMENTAL: JavaScript wrapper for Tokenizer.
67+
*/
68+
template <typename T>
69+
class ET_EXPERIMENTAL JsTokenizer {
70+
static_assert(
71+
std::is_base_of<Tokenizer, T>::value,
72+
"T must be a subclass of Tokenizer");
73+
74+
public:
75+
JsTokenizer() : tokenizer_(std::make_unique<T>()) {}
76+
JsTokenizer(const JsTokenizer&) = delete;
77+
JsTokenizer& operator=(const JsTokenizer&) = delete;
78+
JsTokenizer(JsTokenizer&&) = default;
79+
JsTokenizer& operator=(JsTokenizer&&) = default;
80+
81+
void load_from_uint8_array(val data) {
82+
// Tokenizer API can't load from a buffer, so we need to write the buffer to
83+
// a temporary file and load from there.
84+
static const char* tmpFileName = "tokenizer_input_buffer.tmp";
85+
FILE* tmp_file = fopen(tmpFileName, "wb");
86+
if (tmp_file == nullptr) {
87+
THROW_JS_ERROR(Error, "Failed to open file");
88+
}
89+
size_t length = data["length"].as<size_t>();
90+
std::vector<uint8_t> buffer(length);
91+
val memory_view = val(typed_memory_view(length, buffer.data()));
92+
memory_view.call<void>("set", data);
93+
fwrite(buffer.data(), sizeof(uint8_t), length, tmp_file);
94+
fclose(tmp_file);
95+
Error error = tokenizer_->load(tmpFileName);
96+
THROW_IF_ERROR(error, "Failed to load tokenizer");
97+
remove(tmpFileName);
98+
}
99+
100+
void load(val data) {
101+
if (data.isString()) {
102+
Error error = tokenizer_->load(data.as<std::string>());
103+
THROW_IF_ERROR(error, "Failed to load tokenizer");
104+
} else if (data.instanceof (val::global("Uint8Array"))) {
105+
return load_from_uint8_array(data);
106+
} else if (data.instanceof (val::global("ArrayBuffer"))) {
107+
return load_from_uint8_array(val::global("Uint8Array").new_(data));
108+
} else {
109+
THROW_JS_ERROR(
110+
TypeError,
111+
"Unsupported data type: %s",
112+
data.typeOf().as<std::string>().c_str());
113+
}
114+
}
115+
116+
val encode(const std::string& text, int8_t bos, int8_t eos) const {
117+
auto res = tokenizer_->encode(text, bos, eos);
118+
THROW_IF_ERROR(res.error(), "Failed to encode text");
119+
return val::array(res.get().begin(), res.get().end());
120+
}
121+
122+
val encode(const std::string& text, int8_t bos) const {
123+
return encode(text, bos, 0);
124+
}
125+
126+
val encode(const std::string& text) const {
127+
return encode(text, 0);
128+
}
129+
130+
std::string decode(uint64_t prev, uint64_t current) const {
131+
auto res = tokenizer_->decode(prev, current);
132+
THROW_IF_ERROR(res.error(), "Failed to decode token");
133+
return res.get();
134+
}
135+
136+
uint64_t vocab_size() const {
137+
return tokenizer_->vocab_size();
138+
}
139+
140+
uint64_t bos_tok() const {
141+
return tokenizer_->bos_tok();
142+
}
143+
144+
uint64_t eos_tok() const {
145+
return tokenizer_->eos_tok();
146+
}
147+
148+
bool is_loaded() const {
149+
return tokenizer_->is_loaded();
150+
}
151+
152+
private:
153+
std::unique_ptr<T> tokenizer_;
154+
};
155+
156+
} // namespace
157+
158+
// Registers every tokenizer in JS_FORALL_TOKENIZERS with embind. Each is
// exported under its C++ type name (e.g. "SPTokenizer") with a default
// constructor, load/encode/decode methods (encode has three overloads for
// optional bos/eos counts), and read-only properties.
EMSCRIPTEN_BINDINGS(TokenizerModule) {
#define JS_BIND_TOKENIZER(NAME)                                           \
  class_<JsTokenizer<NAME>>(#NAME)                                        \
      .constructor<>()                                                    \
      .function("load", &JsTokenizer<NAME>::load)                         \
      .function(                                                          \
          "encode",                                                       \
          select_overload<val(const std::string&) const>(                 \
              &JsTokenizer<NAME>::encode))                                \
      .function(                                                          \
          "encode",                                                       \
          select_overload<val(const std::string&, int8_t) const>(         \
              &JsTokenizer<NAME>::encode))                                \
      .function(                                                          \
          "encode",                                                       \
          select_overload<val(const std::string&, int8_t, int8_t) const>( \
              &JsTokenizer<NAME>::encode))                                \
      .function("decode", &JsTokenizer<NAME>::decode)                     \
      .property("vocabSize", &JsTokenizer<NAME>::vocab_size)              \
      .property("bosTok", &JsTokenizer<NAME>::bos_tok)                    \
      .property("eosTok", &JsTokenizer<NAME>::eos_tok)                    \
      .property("isLoaded", &JsTokenizer<NAME>::is_loaded);
  JS_FORALL_TOKENIZERS(JS_BIND_TOKENIZER)
}
182+
183+
} // namespace tokenizers
184+
} // namespace wasm
185+
} // namespace extension
186+
} // namespace executorch

tools/cmake/preset/default.cmake

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,10 @@ define_overridable_option(
155155
define_overridable_option(
156156
EXECUTORCH_BUILD_WASM "Build the ExecuTorch JavaScript API" BOOL OFF
157157
)
158+
# Off by default: requires the Emscripten toolchain to build.
define_overridable_option(
  EXECUTORCH_BUILD_TOKENIZERS_WASM "Build the JavaScript Tokenizers API" BOOL
  OFF
)
158162

159163
if(EXECUTORCH_BUILD_ARM_BAREMETAL)
160164
set(_default_executorch_build_pthreadpool OFF)
@@ -333,6 +337,11 @@ check_required_options_on(
333337
EXECUTORCH_BUILD_EXTENSION_TENSOR
334338
)
335339

340+
# The tokenizers Wasm bindings build against the LLM extension's tokenizer
# library, so that extension must be enabled too.
check_required_options_on(
  IF_ON EXECUTORCH_BUILD_TOKENIZERS_WASM REQUIRES
  EXECUTORCH_BUILD_EXTENSION_LLM
)
344+
336345
if(NOT EXISTS ${EXECUTORCH_PAL_DEFAULT_FILE_PATH})
337346
message(
338347
FATAL_ERROR

0 commit comments

Comments
 (0)