Merged

38 commits
1f34a3d  Add new webui from llama.cpp (Sep 18, 2025)
fef7ee7  Add new webui (Sep 18, 2025)
2fa124c  feat: Improve mobile UI for Settings Dialog (#16084) (allozaur, Sep 19, 2025)
129106e  webui : fix handling incomplete chunks (#16107) (Bramas, Sep 22, 2025)
4570905  Always show message actions for mobile UI + improvements for user mes… (allozaur, Sep 26, 2025)
790f2ed  webui: switch to hash-based routing (alternative of #16079) (#16157) (isaac-mcfadyen, Sep 26, 2025)
8164bdb  Allow viewing conversations even when llama server is down (#16255) (allozaur, Sep 26, 2025)
95c7ea9  Enhance text file detection logic for file attachments (#16199) (allozaur, Sep 26, 2025)
c0b391b  Show message actions by default (#16289) (allozaur, Sep 27, 2025)
c8b0b33  fix: preserved zero values in chat settings inputs and textareas by s… (ServeurpersoCom, Sep 29, 2025)
45436ce  Improve Mobile UI for dialogs and action dropdowns (#16222) (allozaur, Sep 29, 2025)
7fb2c90  Fix thinking blocks with quotes + add handling `[THINK]...[/THINK]` b… (ServeurpersoCom, Sep 29, 2025)
cb7da5d  Chatapi ignore empty sampling (#16330) (ServeurpersoCom, Sep 30, 2025)
8d078b7  webui: Remove running `llama-server` within WebUI `dev.sh` script (#1… (allozaur, Oct 1, 2025)
10eb1b2  Add optional setting for showing "Model used:" information (#16337) (allozaur, Oct 1, 2025)
bc28333  Improve code block color theming (#16325) (allozaur, Oct 1, 2025)
b6a2f3b  Conversation action dialogs as singletons from Chat Sidebar + apply c… (allozaur, Oct 1, 2025)
08124e4  fix: track viewportHeight via window.innerHeight to avoid unwanted sc… (ServeurpersoCom, Oct 3, 2025)
69382e8  webui : Fix messages payload sent to chat completions (#16402) (allozaur, Oct 3, 2025)
e98af0e  Capture model name only after first token (streaming) or completed re… (allozaur, Oct 3, 2025)
9d2d950  Fix missing messages on sibling navigation (#16408) (allozaur, Oct 3, 2025)
d9c2652  webui : added download action (#13552) (#16282) (srogmann, Oct 7, 2025)
dd4b428  refactor: centralize CoT parsing in backend for streaming mode (#16394) (ServeurpersoCom, Oct 8, 2025)
3bc9b1c  No markdown in cot (#16483) (ServeurpersoCom, Oct 9, 2025)
5b7ea5d  webui: updated the chat service to only include max_tokens in the req… (ServeurpersoCom, Oct 9, 2025)
e98d9db  feat: render user content as markdown option (#16358) (ServeurpersoCom, Oct 11, 2025)
4cfb56d  webui: remove client-side context pre-check and rely on backend for l… (ServeurpersoCom, Oct 12, 2025)
18863dd  fix: add remark plugin to render raw HTML as literal text (#16505) (ServeurpersoCom, Oct 13, 2025)
79376ac  Add server-driven parameter defaults and syncing (#16515) (allozaur, Oct 15, 2025)
6989f76  fix: added a normalization step for MathJax-style \[\] and \(\) delim… (ServeurpersoCom, Oct 16, 2025)
eabdc08  webui: reorganize settings layout (#16607) (ServeurpersoCom, Oct 17, 2025)
e3e1e06  Enable per-conversation loading states to allow having parallel conve… (allozaur, Oct 20, 2025)
a786701  Import/Export UX improvements (#16619) (allozaur, Oct 20, 2025)
f1fd58c  Prevent premature submission on IME input (#16673) (allozaur, Oct 20, 2025)
6061dae  Handle legacy 'context' attachments (#16687) (allozaur, Oct 20, 2025)
8b15ffb  webui: introduce OpenAI-compatible model selector in JSON payload (#1… (ServeurpersoCom, Oct 22, 2025)
fe930b7  webui: support q URL parameter (#16728) (odrling, Oct 24, 2025)
3384792  build fix (Oct 26, 2025)

13 changes: 13 additions & 0 deletions .gitignore
@@ -130,3 +130,16 @@ poetry.toml

# Scripts
!/scripts/install-oneapi.bat
/examples/server/webui_llamacpp/.gitignore

# Test models for lora adapters
/lora-tests

# Local scripts
/run-vim.sh
/run-chat.sh
.ccache/

# IDE
*.code-workspace
.windsurf/
138 changes: 125 additions & 13 deletions common/chat-parser.cpp
@@ -3,9 +3,12 @@
#include "log.h"
#include "regex-partial.h"

#include <algorithm>
#include <cctype>
#include <optional>
#include <stdexcept>
#include <string>
#include <string_view>
#include <vector>

using json = nlohmann::ordered_json;
@@ -137,6 +140,27 @@ void common_chat_msg_parser::consume_literal(const std::string & literal) {
}

bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think, const std::string & end_think) {
std::string pending_reasoning_prefix;

if (syntax_.reasoning_format == COMMON_REASONING_FORMAT_NONE) {
return false;
}

auto set_reasoning_prefix = [&](size_t prefix_pos) {
if (!syntax_.thinking_forced_open || syntax_.reasoning_in_content) {
return;
}
if (prefix_pos + start_think.size() > input_.size()) {
pending_reasoning_prefix.clear();
return;
}
// Capture the exact literal that opened the reasoning section so we can
// surface it back to callers. This ensures formats that force the
// reasoning tag open (e.g. DeepSeek R1) retain their original prefix
// instead of dropping it during parsing.
pending_reasoning_prefix = input_.substr(prefix_pos, start_think.size());
};

auto handle_reasoning = [&](const std::string & reasoning, bool closed) {
auto stripped_reasoning = string_strip(reasoning);
if (stripped_reasoning.empty()) {
@@ -149,28 +173,116 @@ bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think
add_content(syntax_.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK ? "</think>" : end_think);
}
} else {
if (!pending_reasoning_prefix.empty()) {
add_reasoning_content(pending_reasoning_prefix);
pending_reasoning_prefix.clear();
}
add_reasoning_content(stripped_reasoning);
}
};
if (syntax_.reasoning_format != COMMON_REASONING_FORMAT_NONE) {
if (syntax_.thinking_forced_open || try_consume_literal(start_think)) {
if (auto res = try_find_literal(end_think)) {
handle_reasoning(res->prelude, /* closed */ true);
consume_spaces();
return true;
}
auto rest = consume_rest();

const size_t saved_pos = pos_;
const size_t saved_content_size = result_.content.size();
const size_t saved_reasoning_size = result_.reasoning_content.size();

auto restore_state = [&]() {
move_to(saved_pos);
result_.content.resize(saved_content_size);
result_.reasoning_content.resize(saved_reasoning_size);
};

// Allow leading whitespace to be preserved as content when reasoning is present at the start
size_t cursor = pos_;
size_t whitespace_end = cursor;
while (whitespace_end < input_.size() && std::isspace(static_cast<unsigned char>(input_[whitespace_end]))) {
++whitespace_end;
}

if (whitespace_end >= input_.size()) {
restore_state();
if (syntax_.thinking_forced_open) {
auto rest = input_.substr(saved_pos);
if (!rest.empty()) {
handle_reasoning(rest, /* closed */ !is_partial());
}
// Allow unclosed thinking tags, for now (https://github.com/ggml-org/llama.cpp/issues/13812, https://github.com/ggml-org/llama.cpp/issues/13877)
// if (!syntax_.thinking_forced_open) {
// throw common_chat_msg_partial_exception(end_think);
// }
move_to(input_.size());
return true;
}
return false;
}

cursor = whitespace_end;
const size_t remaining = input_.size() - cursor;
const size_t start_prefix = std::min(start_think.size(), remaining);
const bool has_start_tag = input_.compare(cursor, start_prefix, start_think, 0, start_prefix) == 0;

if (has_start_tag && start_prefix < start_think.size()) {
move_to(input_.size());
return true;
}

if (has_start_tag) {
if (whitespace_end > pos_) {
add_content(input_.substr(pos_, whitespace_end - pos_));
}
set_reasoning_prefix(cursor);
cursor += start_think.size();
} else if (syntax_.thinking_forced_open) {
cursor = whitespace_end;
} else {
restore_state();
return false;
}
while (true) {
if (cursor >= input_.size()) {
move_to(input_.size());
return true;
}

size_t end_pos = input_.find(end_think, cursor);
if (end_pos == std::string::npos) {
std::string_view remaining_view(input_.data() + cursor, input_.size() - cursor);
size_t partial_off = string_find_partial_stop(remaining_view, end_think);
size_t reasoning_end = partial_off == std::string::npos ? input_.size() : cursor + partial_off;
if (reasoning_end > cursor) {
handle_reasoning(input_.substr(cursor, reasoning_end - cursor), /* closed */ partial_off == std::string::npos && !is_partial());
}
move_to(input_.size());
return true;
}

if (end_pos > cursor) {
handle_reasoning(input_.substr(cursor, end_pos - cursor), /* closed */ true);
} else {
handle_reasoning("", /* closed */ true);
}

cursor = end_pos + end_think.size();

while (cursor < input_.size() && std::isspace(static_cast<unsigned char>(input_[cursor]))) {
++cursor;
}

const size_t next_remaining = input_.size() - cursor;
if (next_remaining == 0) {
move_to(cursor);
return true;
}

const size_t next_prefix = std::min(start_think.size(), next_remaining);
if (input_.compare(cursor, next_prefix, start_think, 0, next_prefix) == 0) {
if (next_prefix < start_think.size()) {
move_to(input_.size());
return true;
}
set_reasoning_prefix(cursor);
cursor += start_think.size();
continue;
}

move_to(cursor);
return true;
}
return false;
}

std::string common_chat_msg_parser::consume_rest() {
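The streaming path above leans on `string_find_partial_stop` to detect an end tag that has only partially arrived in the current chunk, so that a trailing fragment like `</thi` is neither emitted as reasoning text nor lost. The helper itself is not shown in this diff; below is a minimal, self-contained sketch of the idea (an illustrative reimplementation, not the actual function from `common/`):

```cpp
#include <algorithm>
#include <cstddef>
#include <string>
#include <string_view>

// Illustrative only: return the offset where a trailing, incomplete prefix of
// `stop` begins in `text`, or npos if `text` cannot end in a partial stop tag.
static size_t find_partial_stop(std::string_view text, std::string_view stop) {
    if (stop.empty() || text.empty()) {
        return std::string_view::npos;
    }
    // Try the longest proper prefix of `stop` first ("</think", "</thin", ...).
    for (size_t len = std::min(stop.size() - 1, text.size()); len > 0; --len) {
        if (text.compare(text.size() - len, len, stop.substr(0, len)) == 0) {
            return text.size() - len;
        }
    }
    return std::string_view::npos;
}

int main() {
    // A chunk that ends mid-tag: everything before "</thi" is finished
    // reasoning text; the "</thi" suffix is held back until more tokens
    // arrive, which is what the parser above achieves via move_to/is_partial.
    std::string_view chunk = "The answer is 42.</thi";
    const size_t off = find_partial_stop(chunk, "</think>");
    // off == 17 here, so chunk.substr(0, off) is safe to surface as
    // reasoning_content while the tag fragment waits for the next chunk.
    return off == std::string_view::npos ? 1 : 0;
}
```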
3 changes: 3 additions & 0 deletions common/chat.cpp
@@ -1207,6 +1207,8 @@ static common_chat_params common_chat_params_init_llama_3_x(const common_chat_te
return data;
}
static void common_chat_parse_llama_3_1(common_chat_msg_parser & builder, bool with_builtin_tools = false) {
builder.try_parse_reasoning("<think>", "</think>");

if (!builder.syntax().parse_tool_calls) {
builder.add_content(builder.consume_rest());
return;
@@ -2411,6 +2413,7 @@ common_chat_params common_chat_templates_apply(
}

static void common_chat_parse_content_only(common_chat_msg_parser & builder) {
builder.try_parse_reasoning("<think>", "</think>");
builder.add_content(builder.consume_rest());
}

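Because the content-only and Llama 3.x parsers now call `try_parse_reasoning("<think>", "</think>")` before consuming the rest of the message, a response that carries thinking tags is split into `reasoning_content` and `content` instead of leaking the tags to the client. A toy splitter that mirrors the intended outcome for a fully closed block (not the real parser, which also handles streamed partial tags as in `chat-parser.cpp` above):

```cpp
#include <cassert>
#include <string>
#include <utility>

// Toy approximation of the closed-tag case; whitespace trimming and partial
// tags are handled differently by the real common_chat_msg_parser.
static std::pair<std::string, std::string> split_reasoning(const std::string & s) {
    const std::string open  = "<think>";
    const std::string close = "</think>";
    const size_t b = s.find(open);
    const size_t e = s.find(close);
    if (b == std::string::npos || e == std::string::npos || e < b) {
        return {"", s}; // no reasoning block: everything stays in content
    }
    std::string reasoning = s.substr(b + open.size(), e - b - open.size());
    std::string content   = s.substr(0, b) + s.substr(e + close.size());
    return {reasoning, content};
}

int main() {
    const auto [reasoning, content] =
        split_reasoning("<think>Check the units first.</think>The result is 12 km.");
    assert(reasoning == "Check the units first.");
    assert(content   == "The result is 12 km.");
    return 0;
}
```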
26 changes: 26 additions & 0 deletions common/common.cpp
@@ -200,6 +200,20 @@ int32_t cpu_get_num_math() {
return cpu_get_num_physical_cores();
}

common_webui common_webui_from_name(const std::string& format) {
if (format == "none") {
return COMMON_WEBUI_NONE;
}
else if (format == "auto") {
return COMMON_WEBUI_AUTO;
}
else if (format == "llamacpp") {
return COMMON_WEBUI_LLAMACPP;
}
else {
return COMMON_WEBUI_AUTO;
}
}

static std::string read_file(const std::string& fname) {
std::ifstream file(fname);
@@ -1417,6 +1431,11 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.public_path = argv[i];
return true;
}
if (arg == "--webui") {
CHECK_ARG
params.webui = common_webui_from_name(std::string(argv[i]));
return true;
}
if (arg == "--api-key") {
CHECK_ARG
params.api_keys.push_back(argv[i]);
@@ -1888,6 +1907,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
"controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
"- none: leaves thoughts unparsed in `message.content`\n"
"- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n"
"- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`\n"
"(default: none)", });
options.push_back({ "main", " --chat-template-kwargs JSON", "sets additional params for the json template parser"});
options.push_back({ "main", " --reasoning-budget N", "controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)" });
@@ -2046,6 +2066,12 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "server", " --port PORT", "port to listen (default: %d)", params.port });
options.push_back({ "server", " --path PATH", "path to serve static files from (default: %s)", params.public_path.c_str() });
options.push_back({ "server", " --embedding(s)", "restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled" });
options.push_back({ "server", " --webui NAME",
"controls which webui to server:\n"
"- none: disable webui\n"
"- auto: default webui \n"
"- llamacpp: llamacpp webui \n"
"(default: auto)", });
options.push_back({ "server", " --api-key KEY", "API key to use for authentication (default: none)" });
options.push_back({ "server", " --api-key-file FNAME", "path to file containing API keys (default: none)" });
options.push_back({ "server", " --ssl-key-file FNAME", "path to file a PEM-encoded SSL private key" });
14 changes: 11 additions & 3 deletions common/common.h
@@ -109,6 +109,14 @@ enum common_reasoning_format {
COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
};

enum common_webui {
COMMON_WEBUI_NONE,
COMMON_WEBUI_AUTO,
COMMON_WEBUI_LLAMACPP,
};

common_webui common_webui_from_name(const std::string& format);

struct model_paths {
std::string path = ""; // model local path // NOLINT
std::string url = ""; // model url to download // NOLINT
@@ -288,7 +296,7 @@ struct gpt_params {
bool use_jinja = false; // NOLINT
std::string system_prompt = "";
bool enable_chat_template = true;
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_AUTO;
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
int reasoning_budget = -1;
bool prefill_assistant = true;

@@ -300,8 +308,8 @@ struct gpt_params {
std::map<std::string, std::string> default_template_kwargs;

// "advanced" endpoints are disabled by default for better security
bool webui = true;
bool endpoint_slots = false;
common_webui webui = COMMON_WEBUI_AUTO;
bool endpoint_slots = true;
bool endpoint_props = false; // only control POST requests, not GET
bool endpoint_metrics = false;

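How the server picks a frontend from the new `common_webui` value is not part of this section (the `server.cpp` changes are not shown here). The sketch below re-states the enum and name mapping from the diff and adds a hypothetical dispatch; `webui_asset_for` and its returned filenames are placeholders for illustration, not code from this PR:

```cpp
#include <cstdio>
#include <string>

// Re-stated from common/common.h and common/common.cpp in this diff.
enum common_webui {
    COMMON_WEBUI_NONE,
    COMMON_WEBUI_AUTO,
    COMMON_WEBUI_LLAMACPP,
};

static common_webui common_webui_from_name(const std::string & format) {
    if (format == "none")     return COMMON_WEBUI_NONE;
    if (format == "llamacpp") return COMMON_WEBUI_LLAMACPP;
    return COMMON_WEBUI_AUTO; // "auto" and anything unrecognized
}

// Hypothetical dispatch: the asset names mirror the CMake section below, but
// the real serving logic lives in examples/server/server.cpp.
static const char * webui_asset_for(common_webui webui) {
    switch (webui) {
        case COMMON_WEBUI_NONE:     return nullptr;                  // webui disabled
        case COMMON_WEBUI_LLAMACPP: return "index_llamacpp.html.gz"; // new webui
        case COMMON_WEBUI_AUTO:
        default:                    return "index.html.gz";          // current default
    }
}

int main() {
    const char * asset = webui_asset_for(common_webui_from_name("llamacpp"));
    std::printf("serving %s\n", asset ? asset : "(webui disabled)");
    return 0;
}
```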
24 changes: 23 additions & 1 deletion examples/server/CMakeLists.txt
@@ -17,7 +17,7 @@ set(TARGET_SRCS
)
set(PUBLIC_ASSETS
index.html.gz
loading.html

)

foreach(asset ${PUBLIC_ASSETS})
@@ -29,10 +29,32 @@ foreach(asset ${PUBLIC_ASSETS})
OUTPUT "${output}"
COMMAND "${CMAKE_COMMAND}" "-DINPUT=${input}" "-DOUTPUT=${output}" -P "${PROJECT_SOURCE_DIR}/scripts/xxd.cmake"
)
message("TARGET_SRCS contains: ${input}")
set_source_files_properties(${output} PROPERTIES GENERATED TRUE)

endforeach()

# include new llamacpp webui
set(ALT_PUBLIC_ASSETS
index_llamacpp.html.gz
loading.html
)

foreach(asset ${ALT_PUBLIC_ASSETS})
set(input "${CMAKE_CURRENT_SOURCE_DIR}/public_llamacpp/${asset}")
set(output "${CMAKE_CURRENT_BINARY_DIR}/${asset}.hpp")
list(APPEND TARGET_SRCS ${output})
add_custom_command(
DEPENDS "${input}"
OUTPUT "${output}"
COMMAND "${CMAKE_COMMAND}" "-DINPUT=${input}" "-DOUTPUT=${output}" -P "${PROJECT_SOURCE_DIR}/scripts/xxd.cmake"
)
message("TARGET_SRCS contains: ${input}")
set_source_files_properties(${output} PROPERTIES GENERATED TRUE)

endforeach()


add_executable(${TARGET} ${TARGET_SRCS})
install(TARGETS ${TARGET} RUNTIME)
target_compile_definitions(${TARGET} PRIVATE
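Each asset listed above is run through `scripts/xxd.cmake`, which emits a generated `.hpp` holding the gzipped file as a byte array that the server can return with `Content-Encoding: gzip`. A hedged sketch of what consuming such a header could look like; the symbol names are assumptions for illustration (the generated header is the source of truth), and the byte contents are a stand-in rather than a real payload:

```cpp
#include <cstddef>
#include <cstdio>

// Stand-ins for what a header generated from index_llamacpp.html.gz might
// provide; the actual identifiers come from scripts/xxd.cmake and are an
// assumption here, not something verified from this diff.
static const unsigned char index_llamacpp_html_gz[] = { 0x1f, 0x8b, 0x08, 0x00 /* ...gzip payload... */ };
static const size_t index_llamacpp_html_gz_len = sizeof(index_llamacpp_html_gz);

int main() {
    // The server route for the llamacpp webui would hand this buffer back
    // verbatim with Content-Type: text/html and Content-Encoding: gzip.
    std::printf("embedded webui asset: %zu bytes\n", index_llamacpp_html_gz_len);
    return 0;
}
```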