[Model] Support Qwen3 models with enable_thinking field #686

Merged · 2 commits · May 5, 2025
8 changes: 8 additions & 0 deletions examples/qwen3/README.md
@@ -0,0 +1,8 @@
### OpenAI API Demos w/ Qwen3

Run `npm install` first, followed by `npm start`.

Note: if you would like to hack on the WebLLM core package, you can change the
web-llm dependency to `"file:../.."` and follow the project's build-from-source
instructions to build WebLLM locally. This option is only recommended if you
need to modify the WebLLM core package.
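For reference, that swap would replace the published package version in the `dependencies` block of the `package.json` below with a local file reference (a sketch; the `../..` path assumes the example directory sits two levels below the repository root):

```json
{
  "dependencies": {
    "@mlc-ai/web-llm": "file:../.."
  }
}
```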
20 changes: 20 additions & 0 deletions examples/qwen3/package.json
@@ -0,0 +1,20 @@
{
  "name": "qwen3_example",
  "version": "0.1.0",
  "private": true,
  "scripts": {
    "start": "parcel src/qwen3_example.html --port 8883",
    "build": "parcel build src/qwen3_example.html --dist-dir lib"
  },
  "devDependencies": {
    "buffer": "^5.7.1",
    "parcel": "^2.8.3",
    "process": "^0.11.10",
    "tslib": "^2.3.1",
    "typescript": "^4.9.5",
    "url": "^0.11.3"
  },
  "dependencies": {
    "@mlc-ai/web-llm": "^0.2.78"
  }
}
17 changes: 17 additions & 0 deletions examples/qwen3/src/qwen3_example.html
@@ -0,0 +1,17 @@
<!doctype html>
<html>
  <script>
    webLLMGlobal = {};
  </script>

  <body>
    <h2>WebLLM Test Page</h2>
    Open console to see output
    <br />
    <br />
    <label id="init-label"> </label>
    <h3>Response</h3>
    <label id="generate-label"> </label>
    <script type="module" src="./qwen3_example.ts"></script>
  </body>
</html>
147 changes: 147 additions & 0 deletions examples/qwen3/src/qwen3_example.ts
@@ -0,0 +1,147 @@
import * as webllm from "@mlc-ai/web-llm";

function setLabel(id: string, text: string) {
  const label = document.getElementById(id);
  if (label == null) {
    throw Error("Cannot find label " + id);
  }
  label.innerText = text;
}

// Helper method to stream responses from the engine
async function streamResponse(
  engine: webllm.MLCEngineInterface,
  request: webllm.ChatCompletionRequestStreaming,
): Promise<void> {
  console.log("Requesting chat completion with request:", request);
  const asyncChunkGenerator = await engine.chat.completions.create(request);
  let message = "";
  for await (const chunk of asyncChunkGenerator) {
    message += chunk.choices[0]?.delta?.content || "";
    setLabel("generate-label", message);
    if (chunk.usage) {
      console.log(chunk.usage); // only the last chunk has usage
    }
    // engine.interruptGenerate(); // works with interrupt as well
  }
  console.log("Final message:\n", await engine.getMessage()); // the concatenated message
}

/**
 * We demonstrate how Qwen3's best practices can be followed in WebLLM. For more, see
 * https://huggingface.co/Qwen/Qwen3-8B#best-practices.
 */
async function main() {
  const initProgressCallback = (report: webllm.InitProgressReport) => {
    setLabel("init-label", report.text);
  };
  const selectedModel = "Qwen3-4B-q4f16_1-MLC";
  const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(
    selectedModel,
    { initProgressCallback: initProgressCallback },
  );

  /**
   * 1. Default behavior: enable thinking
   */
  let request: webllm.ChatCompletionRequest = {
    stream: true,
    stream_options: { include_usage: true },
    messages: [
      {
        role: "user",
        content: "How many r's are there in the word strawberry?",
      },
    ],
    // Specifying `enable_thinking` is optional, as thinking is enabled by default.
    // extra_body: {
    //   enable_thinking: true,
    // }
  };
  await streamResponse(engine, request);

  /**
   * 2. Disable thinking with `enable_thinking: false`.
   */
  request = {
    stream: true,
    stream_options: { include_usage: true },
    messages: [
      {
        role: "user",
        content: "How many r's are there in the word strawberry?",
      },
    ],
    extra_body: {
      enable_thinking: false,
    },
  };
  await streamResponse(engine, request);

  /**
   * 3. Disable thinking with the soft switch /no_think,
   * or enable thinking with the soft switch /think.
   * From the Qwen3 best practices: "When enable_thinking=True, regardless of
   * whether the user uses /think or /no_think, the model will always output a
   * block wrapped in <think>...</think>. However, the content inside this block
   * may be empty if thinking is disabled. When enable_thinking=False, the soft
   * switches are not valid. Regardless of any /think or /no_think tags input by
   * the user, the model will not generate think content and will not include a
   * <think>...</think> block."
   */
  request = {
    stream: true,
    stream_options: { include_usage: true },
    messages: [
      {
        role: "user",
        content: "How many r's are there in the word strawberry? /no_think",
        // content: "How many r's are there in the word strawberry? /think",
      },
    ],
  };
  await streamResponse(engine, request);

  /**
   * 4. For multi-turn messages, it is recommended to
   * parse out the thinking content in the history
   * messages as described in the Best Practices section.
   */
  const history: webllm.ChatCompletionMessageParam[] = [
    {
      role: "user",
      content: "How many r's are there in the word strawberry? /think",
    },
    {
      role: "assistant",
      content:
        "<think>Dummy thinking content here...</think>\n\nThe answer is 3.",
    },
  ];
  // Preprocess the history to remove thinking content
  const preprocessedHistory = history.map((msg) => {
    if (msg.role === "assistant") {
      // Remove the <think>...</think> block at the start of assistant messages,
      // along with the (up to two) \n line breaks that may follow it.
      const thinkRegex = /<think>.*?<\/think>\n?\n?/s; // match <think>...</think> plus optional \n\n
      const contentWithoutThink = msg.content!.replace(thinkRegex, "").trim();
      return { ...msg, content: contentWithoutThink };
    }
    return msg; // User messages remain unchanged
  });
  console.log("Preprocessed history:", preprocessedHistory);

  // Now use the preprocessed history in the request
  const newMessage: webllm.ChatCompletionMessageParam = {
    role: "user",
    content: "What about blueberries?",
  };

  request = {
    stream: true,
    stream_options: { include_usage: true },
    messages: [...preprocessedHistory, newMessage],
  };
  await streamResponse(engine, request);
}

main();
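A practical follow-up: when thinking is enabled, the streamed message arrives with the reasoning wrapped in `<think>...</think>` ahead of the final answer, so a UI may want to display the two parts separately. Below is a minimal sketch of such a splitter; `splitThink` is a hypothetical helper written for this note, not part of the WebLLM API, and it assumes the literal `<think>...</think>` framing described in the best practices quoted above.

```ts
// Hypothetical helper (not part of WebLLM): split text accumulated during
// streaming into thinking content and the visible answer. Assumes the model
// emits a literal <think>...</think> prefix, per the Qwen3 best practices.
function splitThink(message: string): { thinking: string; answer: string } {
  const start = message.indexOf("<think>");
  if (start === -1) {
    // No think block, e.g. when enable_thinking is false.
    return { thinking: "", answer: message };
  }
  const end = message.indexOf("</think>");
  if (end === -1) {
    // Still inside the think block: everything after <think> is reasoning so far.
    return { thinking: message.slice(start + "<think>".length), answer: "" };
  }
  return {
    thinking: message.slice(start + "<think>".length, end).trim(),
    answer: message.slice(end + "</think>".length).trim(),
  };
}
```

Inside the `for await` loop of `streamResponse`, one could then call `splitThink(message)` on each iteration and route the two parts to separate labels.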
6 changes: 6 additions & 0 deletions examples/simple-chat-ts/src/simple_chat.ts
@@ -303,6 +303,12 @@ class ChatUI
      stream: true,
      messages: this.chatHistory,
      stream_options: { include_usage: true },
      // if the model name starts with "Qwen3", disable thinking.
      extra_body: this.selectedModel.startsWith("Qwen3")
        ? {
            enable_thinking: false,
          }
        : undefined,
    });
    // TODO(Charlie): Processing of � requires changes
    for await (const chunk of completion) {