mozilla-ai · anivar · Dec 4, 2025
diff --git a/llama.cpp.patches/patches/main_main.cpp.patch b/llama.cpp.patches/patches/main_main.cpp.patch
@@ -107,7 +107,7 @@
 
  static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
      (void) level;
-@@ -128,7 +146,91 @@ static std::string chat_add_and_format(struct llama_model * model, std::vector<l
+@@ -128,7 +146,94 @@ static std::string chat_add_and_format(struct llama_model * model, std::vector<l
      return formatted;
  }
 
@@ -164,6 +164,10 @@
 +        __builtin_unreachable();
 +    }
 +
++    // Apply /zip/.args to argc/argv before determine_program(argv) runs,
++    // so flags such as --server --v2 placed in .args affect program choice.
++    argc = cosmo_args("/zip/.args", &argv);
++
 +    enum Program prog = determine_program(argv);
 +    if (prog == LLAMAFILER)
 +        return lf::server::main(argc, argv);
@@ -172,7 +176,6 @@
 +    mallopt(M_MMAP_THRESHOLD, 16 * 1024 * 1024);
 +    mallopt(M_TRIM_THRESHOLD, 128 * 1024 * 1024);
 +    ShowCrashReports();
-+    argc = cosmo_args("/zip/.args", &argv);
 +
 +    if (prog == SERVER)
 +        return server_cli(argc, argv);

diff --git a/llamafile/flags.cpp b/llamafile/flags.cpp
@@ -313,15 +313,43 @@ void llamafile_get_flags(int argc, char **argv) {
         if (!strcmp(flag, "--url-prefix")) {
             if (i == argc)
                 missing("--url-prefix");
-            FLAG_url_prefix = argv[i++];
-            if (!IsAcceptablePath(FLAG_url_prefix, -1)) {
-                tinyprint(2, "error: --url-prefix must not have // or /. or /./ or /../\n", NULL);
-                exit(1);
+
+            std::string url_prefix = argv[i++];
+
+            // Consolidate consecutive slashes
+            size_t pos = 0;
+            while ((pos = url_prefix.find("//", pos)) != std::string::npos) {
+                url_prefix.replace(pos, 2, "/");
             }
-            if (endswith(FLAG_url_prefix, "/")) {
-                tinyprint(2, "error: --url-prefix must not be slash or end with slash\n", NULL);
+
+            // Ensure single slash at start
+            if (url_prefix.empty() || url_prefix[0] != '/') {
+                url_prefix = "/" + url_prefix;
+            }
+
+            // Remove trailing slash if present
+            if (url_prefix.length() > 1 && url_prefix.back() == '/') {
+                url_prefix.pop_back();
+            }
+
+            // If only a single slash remains, convert to empty string
+            if (url_prefix == "/") {
+                url_prefix = "";
+            }
+
+            // Validate the normalized path
+            if (!url_prefix.empty() && !IsAcceptablePath(url_prefix.c_str(), url_prefix.length())) {
+                tinyprint(2, "error: --url-prefix must not have /. or /./ or /../ after normalization\n", NULL);
                 exit(1);
             }
+
+            // FLAG_url_prefix is a C string, so keep the normalized value in
+            // static storage that outlives this loop iteration. Assign instead
+            // of initializing: a static local is initialized only once, which
+            // would silently ignore every later --url-prefix occurrence.
+            static std::string stored_prefix;
+            stored_prefix = url_prefix;
+            FLAG_url_prefix = stored_prefix.c_str();
             continue;
         }
 

diff --git a/llamafile/server/client.cpp b/llamafile/server/client.cpp
@@ -522,12 +522,30 @@ Client::send_response_finish()
 bool
 Client::send_binary(const void* p, size_t n)
 {
-    ssize_t sent;
-    if ((sent = write(fd_, p, n)) != n) {
-        if (sent == -1 && errno != EAGAIN && errno != ECONNRESET)
-            SLOG("write failed %m");
-        close_connection_ = true;
-        return false;
+    size_t total_sent = 0;
+    const char* ptr = (const char*)p;
+
+    while (total_sent < n) {
+        ssize_t sent = write(fd_, ptr + total_sent, n - total_sent);
+
+        if (sent > 0) {
+            total_sent += sent;
+        } else if (sent == 0) {
+            // write() made no progress (returned 0); give up on this connection
+            close_connection_ = true;
+            return false;
+        } else {
+            // write() returned a negative value; errno decides what happens next
+            if (errno == EINTR) {
+                // Interrupted by a signal: retry the same write
+                continue;
+            }
+            if (errno != EAGAIN && errno != ECONNRESET) {
+                SLOG("write failed %m");
+            }
+            close_connection_ = true;
+            return false;
+        }
     }
     return true;
 }
@@ -775,7 +793,7 @@ Client::dispatcher()
     should_send_error_if_canceled_ = false;
     if (!send(std::string_view(obuf_.p, p - obuf_.p)))
         return false;
-    char buf[512];
+    char buf[16384];
     size_t i, chunk;
     for (i = 0; i < size; i += chunk) {
         chunk = size - i;

diff --git a/llamafile/server/worker.cpp b/llamafile/server/worker.cpp
@@ -56,13 +56,9 @@ Worker::begin()
         tokens = tokenbucket_acquire(client_.client_ip_);
     server_->lock();
     dll_remove(&server_->idle_workers, &elem_);
-    if (dll_is_empty(server_->idle_workers)) {
-        Dll* slowbro;
-        if ((slowbro = dll_last(server_->active_workers))) {
-            SLOG("all threads active! dropping oldest client");
-            WORKER(slowbro)->kill();
-        }
-    }
+    // Intentionally do not kill the oldest active worker when the idle list
+    // becomes empty. NOTE(review): this presumably relies on the kernel listen
+    // backlog to queue new connections until a worker frees up; confirm.
     working_ = true;
     if (tokens > FLAG_token_burst) {
         dll_make_last(&server_->active_workers, &elem_);