Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions llama.cpp.patches/patches/main_main.cpp.patch
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@

static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
(void) level;
@@ -128,7 +146,91 @@ static std::string chat_add_and_format(struct llama_model * model, std::vector<l
@@ -128,7 +146,94 @@ static std::string chat_add_and_format(struct llama_model * model, std::vector<l
return formatted;
}

Expand Down Expand Up @@ -164,6 +164,10 @@
+ __builtin_unreachable();
+ }
+
+ // Load .args file BEFORE determining program type
+ // so that flags like --server --v2 in .args are seen
+ argc = cosmo_args("/zip/.args", &argv);
+
+ enum Program prog = determine_program(argv);
+ if (prog == LLAMAFILER)
+ return lf::server::main(argc, argv);
Expand All @@ -172,7 +176,6 @@
+ mallopt(M_MMAP_THRESHOLD, 16 * 1024 * 1024);
+ mallopt(M_TRIM_THRESHOLD, 128 * 1024 * 1024);
+ ShowCrashReports();
+ argc = cosmo_args("/zip/.args", &argv);
+
+ if (prog == SERVER)
+ return server_cli(argc, argv);
Expand Down
36 changes: 30 additions & 6 deletions llamafile/flags.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -313,15 +313,39 @@ void llamafile_get_flags(int argc, char **argv) {
if (!strcmp(flag, "--url-prefix")) {
if (i == argc)
missing("--url-prefix");
FLAG_url_prefix = argv[i++];
if (!IsAcceptablePath(FLAG_url_prefix, -1)) {
tinyprint(2, "error: --url-prefix must not have // or /. or /./ or /../\n", NULL);
exit(1);

std::string url_prefix = argv[i++];

// Consolidate consecutive slashes
size_t pos = 0;
while ((pos = url_prefix.find("//", pos)) != std::string::npos) {
url_prefix.replace(pos, 2, "/");
}
if (endswith(FLAG_url_prefix, "/")) {
tinyprint(2, "error: --url-prefix must not be slash or end with slash\n", NULL);

// Ensure single slash at start
if (url_prefix.empty() || url_prefix[0] != '/') {
url_prefix = "/" + url_prefix;
}

// Remove trailing slash if present
if (url_prefix.length() > 1 && url_prefix.back() == '/') {
url_prefix.pop_back();
}

// If only a single slash remains, convert to empty string
if (url_prefix == "/") {
url_prefix = "";
}

// Validate the normalized path
if (!url_prefix.empty() && !IsAcceptablePath(url_prefix.c_str(), url_prefix.length())) {
tinyprint(2, "error: --url-prefix must not have /. or /./ or /../ after normalization\n", NULL);
exit(1);
}

// Store in static storage (persists for program lifetime)
static std::string stored_prefix = url_prefix;
FLAG_url_prefix = stored_prefix.c_str();
continue;
}

Expand Down
32 changes: 25 additions & 7 deletions llamafile/server/client.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -522,12 +522,30 @@ Client::send_response_finish()
// Writes all `n` bytes starting at `p` to `fd_`.
//
// A single write() may accept fewer bytes than requested, so the call
// is repeated on the unsent remainder until the whole buffer has been
// handed off. There must be exactly one write path here: issuing a
// one-shot write() before the loop would transmit the leading bytes a
// second time whenever that first write was short.
//
// @param p  start of the buffer to send
// @param n  number of bytes to send; when zero, nothing is written
// @return   true once every byte has been written; false on failure,
//           in which case close_connection_ has been set to true
bool
Client::send_binary(const void* p, size_t n)
{
    const char* ptr = static_cast<const char*>(p);
    size_t total_sent = 0;

    while (total_sent < n) {
        ssize_t sent = write(fd_, ptr + total_sent, n - total_sent);

        if (sent > 0) {
            // Partial or complete progress; advance past what was accepted.
            total_sent += sent;
            continue;
        }

        // Interrupted by a signal before any byte was written: retry.
        if (sent == -1 && errno == EINTR)
            continue;

        // A zero-byte result for a non-empty request is treated as a
        // failed send without logging. EAGAIN and ECONNRESET are not
        // logged either; any other error is.
        if (sent == -1 && errno != EAGAIN && errno != ECONNRESET)
            SLOG("write failed %m");

        close_connection_ = true;
        return false;
    }
    return true;
}
Expand Down Expand Up @@ -775,7 +793,7 @@ Client::dispatcher()
should_send_error_if_canceled_ = false;
if (!send(std::string_view(obuf_.p, p - obuf_.p)))
return false;
char buf[512];
char buf[16384];
size_t i, chunk;
for (i = 0; i < size; i += chunk) {
chunk = size - i;
Expand Down
10 changes: 3 additions & 7 deletions llamafile/server/worker.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,13 +56,9 @@ Worker::begin()
tokens = tokenbucket_acquire(client_.client_ip_);
server_->lock();
dll_remove(&server_->idle_workers, &elem_);
if (dll_is_empty(server_->idle_workers)) {
Dll* slowbro;
if ((slowbro = dll_last(server_->active_workers))) {
SLOG("all threads active! dropping oldest client");
WORKER(slowbro)->kill();
}
}
// Remove aggressive client cancellation - let TCP backlog handle overflow
// The kernel's listen backlog will naturally queue incoming connections
// until a worker becomes available, providing better user experience
working_ = true;
if (tokens > FLAG_token_burst) {
dll_make_last(&server_->active_workers, &elem_);
Expand Down