Merged

38 commits
1f34a3d  Add new webui from llama.cpp (Sep 18, 2025)
fef7ee7  Add new webui (Sep 18, 2025)
2fa124c  feat: Improve mobile UI for Settings Dialog (#16084) (allozaur, Sep 19, 2025)
129106e  webui : fix handling incomplete chunks (#16107) (Bramas, Sep 22, 2025)
4570905  Always show message actions for mobile UI + improvements for user mes… (allozaur, Sep 26, 2025)
790f2ed  webui: switch to hash-based routing (alternative of #16079) (#16157) (isaac-mcfadyen, Sep 26, 2025)
8164bdb  Allow viewing conversations even when llama server is down (#16255) (allozaur, Sep 26, 2025)
95c7ea9  Enhance text file detection logic for file attachments (#16199) (allozaur, Sep 26, 2025)
c0b391b  Show message actions by default (#16289) (allozaur, Sep 27, 2025)
c8b0b33  fix: preserved zero values in chat settings inputs and textareas by s… (ServeurpersoCom, Sep 29, 2025)
45436ce  Improve Mobile UI for dialogs and action dropdowns (#16222) (allozaur, Sep 29, 2025)
7fb2c90  Fix thinking blocks with quotes + add handling `[THINK]...[/THINK]` b… (ServeurpersoCom, Sep 29, 2025)
cb7da5d  Chatapi ignore empty sampling (#16330) (ServeurpersoCom, Sep 30, 2025)
8d078b7  webui: Remove running `llama-server` within WebUI `dev.sh` script (#1… (allozaur, Oct 1, 2025)
10eb1b2  Add optional setting for showing "Model used:" information (#16337) (allozaur, Oct 1, 2025)
bc28333  Improve code block color theming (#16325) (allozaur, Oct 1, 2025)
b6a2f3b  Conversation action dialogs as singletons from Chat Sidebar + apply c… (allozaur, Oct 1, 2025)
08124e4  fix: track viewportHeight via window.innerHeight to avoid unwanted sc… (ServeurpersoCom, Oct 3, 2025)
69382e8  webui : Fix messages payload sent to chat completions (#16402) (allozaur, Oct 3, 2025)
e98af0e  Capture model name only after first token (streaming) or completed re… (allozaur, Oct 3, 2025)
9d2d950  Fix missing messages on sibling navigation (#16408) (allozaur, Oct 3, 2025)
d9c2652  webui : added download action (#13552) (#16282) (srogmann, Oct 7, 2025)
dd4b428  refactor: centralize CoT parsing in backend for streaming mode (#16394) (ServeurpersoCom, Oct 8, 2025)
3bc9b1c  No markdown in cot (#16483) (ServeurpersoCom, Oct 9, 2025)
5b7ea5d  webui: updated the chat service to only include max_tokens in the req… (ServeurpersoCom, Oct 9, 2025)
e98d9db  feat: render user content as markdown option (#16358) (ServeurpersoCom, Oct 11, 2025)
4cfb56d  webui: remove client-side context pre-check and rely on backend for l… (ServeurpersoCom, Oct 12, 2025)
18863dd  fix: add remark plugin to render raw HTML as literal text (#16505) (ServeurpersoCom, Oct 13, 2025)
79376ac  Add server-driven parameter defaults and syncing (#16515) (allozaur, Oct 15, 2025)
6989f76  fix: added a normalization step for MathJax-style \[\] and \(\) delim… (ServeurpersoCom, Oct 16, 2025)
eabdc08  webui: reorganize settings layout (#16607) (ServeurpersoCom, Oct 17, 2025)
e3e1e06  Enable per-conversation loading states to allow having parallel conve… (allozaur, Oct 20, 2025)
a786701  Import/Export UX improvements (#16619) (allozaur, Oct 20, 2025)
f1fd58c  Prevent premature submission on IME input (#16673) (allozaur, Oct 20, 2025)
6061dae  Handle legacy 'context' attachments (#16687) (allozaur, Oct 20, 2025)
8b15ffb  webui: introduce OpenAI-compatible model selector in JSON payload (#1… (ServeurpersoCom, Oct 22, 2025)
fe930b7  webui: support q URL parameter (#16728) (odrling, Oct 24, 2025)
3384792  build fix (Oct 26, 2025)

13 changes: 13 additions & 0 deletions .gitignore
@@ -130,3 +130,16 @@ poetry.toml

# Scripts
!/scripts/install-oneapi.bat
/examples/server/webui_llamacpp/.gitignore

# Test models for lora adapters
/lora-tests

# Local scripts
/run-vim.sh
/run-chat.sh
.ccache/

# IDE
*.code-workspace
.windsurf/
138 changes: 125 additions & 13 deletions common/chat-parser.cpp
@@ -3,9 +3,12 @@
#include "log.h"
#include "regex-partial.h"

#include <algorithm>
#include <cctype>
#include <optional>
#include <stdexcept>
#include <string>
#include <string_view>
#include <vector>

using json = nlohmann::ordered_json;
@@ -137,6 +140,27 @@ void common_chat_msg_parser::consume_literal(const std::string & literal) {
}

bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think, const std::string & end_think) {
std::string pending_reasoning_prefix;

if (syntax_.reasoning_format == COMMON_REASONING_FORMAT_NONE) {
return false;
}

auto set_reasoning_prefix = [&](size_t prefix_pos) {
if (!syntax_.thinking_forced_open || syntax_.reasoning_in_content) {
return;
}
if (prefix_pos + start_think.size() > input_.size()) {
pending_reasoning_prefix.clear();
return;
}
// Capture the exact literal that opened the reasoning section so we can
// surface it back to callers. This ensures formats that force the
// reasoning tag open (e.g. DeepSeek R1) retain their original prefix
// instead of dropping it during parsing.
pending_reasoning_prefix = input_.substr(prefix_pos, start_think.size());
};

auto handle_reasoning = [&](const std::string & reasoning, bool closed) {
auto stripped_reasoning = string_strip(reasoning);
if (stripped_reasoning.empty()) {
@@ -149,28 +173,116 @@ bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think
add_content(syntax_.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK ? "</think>" : end_think);
}
} else {
if (!pending_reasoning_prefix.empty()) {
add_reasoning_content(pending_reasoning_prefix);
pending_reasoning_prefix.clear();
}
add_reasoning_content(stripped_reasoning);
}
};
if (syntax_.reasoning_format != COMMON_REASONING_FORMAT_NONE) {
if (syntax_.thinking_forced_open || try_consume_literal(start_think)) {
if (auto res = try_find_literal(end_think)) {
handle_reasoning(res->prelude, /* closed */ true);
consume_spaces();
return true;
}
auto rest = consume_rest();

const size_t saved_pos = pos_;
const size_t saved_content_size = result_.content.size();
const size_t saved_reasoning_size = result_.reasoning_content.size();

auto restore_state = [&]() {
move_to(saved_pos);
result_.content.resize(saved_content_size);
result_.reasoning_content.resize(saved_reasoning_size);
};

// Allow leading whitespace to be preserved as content when reasoning is present at the start
size_t cursor = pos_;
size_t whitespace_end = cursor;
while (whitespace_end < input_.size() && std::isspace(static_cast<unsigned char>(input_[whitespace_end]))) {
++whitespace_end;
}

if (whitespace_end >= input_.size()) {
restore_state();
if (syntax_.thinking_forced_open) {
auto rest = input_.substr(saved_pos);
if (!rest.empty()) {
handle_reasoning(rest, /* closed */ !is_partial());
}
// Allow unclosed thinking tags, for now (https://github.com/ggml-org/llama.cpp/issues/13812, https://github.com/ggml-org/llama.cpp/issues/13877)
// if (!syntax_.thinking_forced_open) {
// throw common_chat_msg_partial_exception(end_think);
// }
move_to(input_.size());
return true;
}
return false;
}

cursor = whitespace_end;
const size_t remaining = input_.size() - cursor;
const size_t start_prefix = std::min(start_think.size(), remaining);
const bool has_start_tag = input_.compare(cursor, start_prefix, start_think, 0, start_prefix) == 0;

if (has_start_tag && start_prefix < start_think.size()) {
move_to(input_.size());
return true;
}

if (has_start_tag) {
if (whitespace_end > pos_) {
add_content(input_.substr(pos_, whitespace_end - pos_));
}
set_reasoning_prefix(cursor);
cursor += start_think.size();
} else if (syntax_.thinking_forced_open) {
cursor = whitespace_end;
} else {
restore_state();
return false;
}
while (true) {
if (cursor >= input_.size()) {
move_to(input_.size());
return true;
}

size_t end_pos = input_.find(end_think, cursor);
if (end_pos == std::string::npos) {
std::string_view remaining_view(input_.data() + cursor, input_.size() - cursor);
size_t partial_off = string_find_partial_stop(remaining_view, end_think);
size_t reasoning_end = partial_off == std::string::npos ? input_.size() : cursor + partial_off;
if (reasoning_end > cursor) {
handle_reasoning(input_.substr(cursor, reasoning_end - cursor), /* closed */ partial_off == std::string::npos && !is_partial());
}
move_to(input_.size());
return true;
}

if (end_pos > cursor) {
handle_reasoning(input_.substr(cursor, end_pos - cursor), /* closed */ true);
} else {
handle_reasoning("", /* closed */ true);
}

cursor = end_pos + end_think.size();

while (cursor < input_.size() && std::isspace(static_cast<unsigned char>(input_[cursor]))) {
++cursor;
}

const size_t next_remaining = input_.size() - cursor;
if (next_remaining == 0) {
move_to(cursor);
return true;
}

const size_t next_prefix = std::min(start_think.size(), next_remaining);
if (input_.compare(cursor, next_prefix, start_think, 0, next_prefix) == 0) {
if (next_prefix < start_think.size()) {
move_to(input_.size());
return true;
}
set_reasoning_prefix(cursor);
cursor += start_think.size();
continue;
}

move_to(cursor);
return true;
}
return false;
}

std::string common_chat_msg_parser::consume_rest() {
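The streaming path above leans on `string_find_partial_stop` to detect an end tag that has only partially arrived in the current chunk, so that a trailing fragment like `</thi` is neither emitted as reasoning text nor lost. The helper itself is not shown in this diff; below is a minimal, self-contained sketch of the idea (an illustrative reimplementation, not the actual function from `common/`):

```cpp
#include <algorithm>
#include <cstddef>
#include <string>
#include <string_view>

// Illustrative only: return the offset where a trailing, incomplete prefix of
// `stop` begins in `text`, or npos if `text` cannot end in a partial stop tag.
static size_t find_partial_stop(std::string_view text, std::string_view stop) {
    if (stop.empty() || text.empty()) {
        return std::string_view::npos;
    }
    // Try the longest proper prefix of `stop` first ("</think", "</thin", ...).
    for (size_t len = std::min(stop.size() - 1, text.size()); len > 0; --len) {
        if (text.compare(text.size() - len, len, stop.substr(0, len)) == 0) {
            return text.size() - len;
        }
    }
    return std::string_view::npos;
}

int main() {
    // A chunk that ends mid-tag: everything before "</thi" is finished
    // reasoning text; the "</thi" suffix is held back until more tokens
    // arrive, which is what the parser above achieves via move_to/is_partial.
    std::string_view chunk = "The answer is 42.</thi";
    const size_t off = find_partial_stop(chunk, "</think>");
    // off == 17 here, so chunk.substr(0, off) is safe to surface as
    // reasoning_content while the tag fragment waits for the next chunk.
    return off == std::string_view::npos ? 1 : 0;
}
```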
3 changes: 3 additions & 0 deletions common/chat.cpp
@@ -1207,6 +1207,8 @@ static common_chat_params common_chat_params_init_llama_3_x(const common_chat_te
return data;
}
static void common_chat_parse_llama_3_1(common_chat_msg_parser & builder, bool with_builtin_tools = false) {
builder.try_parse_reasoning("<think>", "</think>");

if (!builder.syntax().parse_tool_calls) {
builder.add_content(builder.consume_rest());
return;
@@ -2411,6 +2413,7 @@ common_chat_params common_chat_templates_apply(
}

static void common_chat_parse_content_only(common_chat_msg_parser & builder) {
builder.try_parse_reasoning("<think>", "</think>");
builder.add_content(builder.consume_rest());
}

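Because the content-only and Llama 3.x parsers now call `try_parse_reasoning("<think>", "</think>")` before consuming the rest of the message, a response that carries thinking tags is split into `reasoning_content` and `content` instead of leaking the tags to the client. A toy splitter that mirrors the intended outcome for a fully closed block (not the real parser, which also handles streamed partial tags as in `chat-parser.cpp` above):

```cpp
#include <cassert>
#include <string>
#include <utility>

// Toy approximation of the closed-tag case; whitespace trimming and partial
// tags are handled differently by the real common_chat_msg_parser.
static std::pair<std::string, std::string> split_reasoning(const std::string & s) {
    const std::string open  = "<think>";
    const std::string close = "</think>";
    const size_t b = s.find(open);
    const size_t e = s.find(close);
    if (b == std::string::npos || e == std::string::npos || e < b) {
        return {"", s}; // no reasoning block: everything stays in content
    }
    std::string reasoning = s.substr(b + open.size(), e - b - open.size());
    std::string content   = s.substr(0, b) + s.substr(e + close.size());
    return {reasoning, content};
}

int main() {
    const auto [reasoning, content] =
        split_reasoning("<think>Check the units first.</think>The result is 12 km.");
    assert(reasoning == "Check the units first.");
    assert(content   == "The result is 12 km.");
    return 0;
}
```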
26 changes: 26 additions & 0 deletions common/common.cpp
@@ -200,6 +200,20 @@ int32_t cpu_get_num_math() {
return cpu_get_num_physical_cores();
}

common_webui common_webui_from_name(const std::string& format) {
if (format == "none") {
return COMMON_WEBUI_NONE;
}
else if (format == "auto") {
return COMMON_WEBUI_AUTO;
}
else if (format == "llamacpp") {
return COMMON_WEBUI_LLAMACPP;
}
else {
return COMMON_WEBUI_AUTO;
}
}

static std::string read_file(const std::string& fname) {
std::ifstream file(fname);
@@ -1417,6 +1431,11 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.public_path = argv[i];
return true;
}
if (arg == "--webui") {
CHECK_ARG
params.webui = common_webui_from_name(std::string(argv[i]));
return true;
}
if (arg == "--api-key") {
CHECK_ARG
params.api_keys.push_back(argv[i]);
@@ -1888,6 +1907,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
"controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
"- none: leaves thoughts unparsed in `message.content`\n"
"- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n"
"- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`\n"
"(default: none)", });
options.push_back({ "main", " --chat-template-kwargs JSON", "sets additional params for the json template parser"});
options.push_back({ "main", " --reasoning-budget N", "controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)" });
@@ -2046,6 +2066,12 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "server", " --port PORT", "port to listen (default: %d)", params.port });
options.push_back({ "server", " --path PATH", "path to serve static files from (default: %s)", params.public_path.c_str() });
options.push_back({ "server", " --embedding(s)", "restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled" });
options.push_back({ "server", " --webui NAME",
"controls which webui to server:\n"
"- none: disable webui\n"
"- auto: default webui \n"
"- llamacpp: llamacpp webui \n"
"(default: auto)", });
options.push_back({ "server", " --api-key KEY", "API key to use for authentication (default: none)" });
options.push_back({ "server", " --api-key-file FNAME", "path to file containing API keys (default: none)" });
options.push_back({ "server", " --ssl-key-file FNAME", "path to file a PEM-encoded SSL private key" });
14 changes: 11 additions & 3 deletions common/common.h
@@ -109,6 +109,14 @@ enum common_reasoning_format {
COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
};

enum common_webui {
COMMON_WEBUI_NONE,
COMMON_WEBUI_AUTO,
COMMON_WEBUI_LLAMACPP,
};

common_webui common_webui_from_name(const std::string& format);

struct model_paths {
std::string path = ""; // model local path // NOLINT
std::string url = ""; // model url to download // NOLINT
@@ -288,7 +296,7 @@ struct gpt_params {
bool use_jinja = false; // NOLINT
std::string system_prompt = "";
bool enable_chat_template = true;
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_AUTO;
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
int reasoning_budget = -1;
bool prefill_assistant = true;

@@ -300,8 +308,8 @@ struct gpt_params {
std::map<std::string, std::string> default_template_kwargs;

// "advanced" endpoints are disabled by default for better security
bool webui = true;
bool endpoint_slots = false;
common_webui webui = COMMON_WEBUI_AUTO;
bool endpoint_slots = true;
bool endpoint_props = false; // only control POST requests, not GET
bool endpoint_metrics = false;

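How the server picks a frontend from the new `common_webui` value is not part of this section (the `server.cpp` changes are not shown here). The sketch below re-states the enum and name mapping from the diff and adds a hypothetical dispatch; `webui_asset_for` and its returned filenames are placeholders for illustration, not code from this PR:

```cpp
#include <cstdio>
#include <string>

// Re-stated from common/common.h and common/common.cpp in this diff.
enum common_webui {
    COMMON_WEBUI_NONE,
    COMMON_WEBUI_AUTO,
    COMMON_WEBUI_LLAMACPP,
};

static common_webui common_webui_from_name(const std::string & format) {
    if (format == "none")     return COMMON_WEBUI_NONE;
    if (format == "llamacpp") return COMMON_WEBUI_LLAMACPP;
    return COMMON_WEBUI_AUTO; // "auto" and anything unrecognized
}

// Hypothetical dispatch: the asset names mirror the CMake section below, but
// the real serving logic lives in examples/server/server.cpp.
static const char * webui_asset_for(common_webui webui) {
    switch (webui) {
        case COMMON_WEBUI_NONE:     return nullptr;                  // webui disabled
        case COMMON_WEBUI_LLAMACPP: return "index_llamacpp.html.gz"; // new webui
        case COMMON_WEBUI_AUTO:
        default:                    return "index.html.gz";          // current default
    }
}

int main() {
    const char * asset = webui_asset_for(common_webui_from_name("llamacpp"));
    std::printf("serving %s\n", asset ? asset : "(webui disabled)");
    return 0;
}
```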
24 changes: 23 additions & 1 deletion examples/server/CMakeLists.txt
@@ -17,7 +17,7 @@ set(TARGET_SRCS
)
set(PUBLIC_ASSETS
index.html.gz
loading.html

)

foreach(asset ${PUBLIC_ASSETS})
@@ -29,10 +29,32 @@ foreach(asset ${PUBLIC_ASSETS})
OUTPUT "${output}"
COMMAND "${CMAKE_COMMAND}" "-DINPUT=${input}" "-DOUTPUT=${output}" -P "${PROJECT_SOURCE_DIR}/scripts/xxd.cmake"
)
message("TARGET_SRCS contains: ${input}")
set_source_files_properties(${output} PROPERTIES GENERATED TRUE)

endforeach()

# include new llamacpp webui
set(ALT_PUBLIC_ASSETS
index_llamacpp.html.gz
loading.html
)

foreach(asset ${ALT_PUBLIC_ASSETS})
set(input "${CMAKE_CURRENT_SOURCE_DIR}/public_llamacpp/${asset}")
set(output "${CMAKE_CURRENT_BINARY_DIR}/${asset}.hpp")
list(APPEND TARGET_SRCS ${output})
add_custom_command(
DEPENDS "${input}"
OUTPUT "${output}"
COMMAND "${CMAKE_COMMAND}" "-DINPUT=${input}" "-DOUTPUT=${output}" -P "${PROJECT_SOURCE_DIR}/scripts/xxd.cmake"
)
message("TARGET_SRCS contains: ${input}")
set_source_files_properties(${output} PROPERTIES GENERATED TRUE)

endforeach()


add_executable(${TARGET} ${TARGET_SRCS})
install(TARGETS ${TARGET} RUNTIME)
target_compile_definitions(${TARGET} PRIVATE
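Each asset listed above is run through `scripts/xxd.cmake`, which emits a generated `.hpp` holding the gzipped file as a byte array that the server can return with `Content-Encoding: gzip`. A hedged sketch of what consuming such a header could look like; the symbol names are assumptions for illustration (the generated header is the source of truth), and the byte contents are a stand-in rather than a real payload:

```cpp
#include <cstddef>
#include <cstdio>

// Stand-ins for what a header generated from index_llamacpp.html.gz might
// provide; the actual identifiers come from scripts/xxd.cmake and are an
// assumption here, not something verified from this diff.
static const unsigned char index_llamacpp_html_gz[] = { 0x1f, 0x8b, 0x08, 0x00 /* ...gzip payload... */ };
static const size_t index_llamacpp_html_gz_len = sizeof(index_llamacpp_html_gz);

int main() {
    // The server route for the llamacpp webui would hand this buffer back
    // verbatim with Content-Type: text/html and Content-Encoding: gzip.
    std::printf("embedded webui asset: %zu bytes\n", index_llamacpp_html_gz_len);
    return 0;
}
```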