From 67bc8767ca4f4acf174c964ea98f300bb71a5707 Mon Sep 17 00:00:00 2001
From: Jaeic Lee <2484055+jaeiclee@users.noreply.github.com>
Date: Wed, 18 Mar 2026 04:20:51 +0900
Subject: [PATCH] Add auto-unload timeout for idle models (env var only)

Implements automatic unloading of models after a configurable idle period via "LEMONADE_GLOBAL_AUTO_UNLOAD_TIMER" env variable.

- Uses condition-variable-based timers for non-blocking cancellation, preventing deadlocks between timer threads and the model loading mutex.
- Timers are cancelled on all eviction paths (LRU, NPU exclusivity, nuclear option) to prevent orphaned timers.
- Added the new environment variable to the documentation.
---
 docs/server/lemonade-server-cli.md     |   1 +
 src/cpp/include/lemon/cli_parser.h     |   3 +
 src/cpp/include/lemon/router.h         |  46 +++-
 src/cpp/include/lemon/server.h         |   3 +-
 src/cpp/include/lemon/wrapped_server.h |  10 +
 src/cpp/server/cli_parser.cpp          |  13 +
 src/cpp/server/main.cpp                |   2 +-
 src/cpp/server/router.cpp              | 316 +++++++++++++++++++++++--
 src/cpp/server/server.cpp              |  17 +-
 9 files changed, 382 insertions(+), 29 deletions(-)
diff --git a/docs/server/lemonade-server-cli.md b/docs/server/lemonade-server-cli.md
index d9c314533..3af574583 100644
--- a/docs/server/lemonade-server-cli.md
+++ b/docs/server/lemonade-server-cli.md
@@ -93,6 +93,7 @@ These settings can also be provided via environment variables that Lemonade Serv
 | `LEMONADE_DISABLE_MODEL_FILTERING` | Set to `1` to disable hardware-based model filtering (e.g., RAM amount, NPU availability) and show all models regardless of system capabilities         |
 | `LEMONADE_ENABLE_DGPU_GTT`         | Set to `1` to include GTT for hardware-based model filtering |
 | `LEMONADE_GLOBAL_TIMEOUT`          | Global default timeout for HTTP requests, inference, and readiness checks in seconds |
+| `LEMONADE_GLOBAL_AUTO_UNLOAD_TIMER` | Automatically unload models after this many seconds of idle time. Set to a positive integer to enable, or `0` to disable. Default: `0` (disabled) |
 
 #### Custom Backend Binaries
 
diff --git a/src/cpp/include/lemon/cli_parser.h b/src/cpp/include/lemon/cli_parser.h
index 882492cfb..eef6a6a61 100644
--- a/src/cpp/include/lemon/cli_parser.h
+++ b/src/cpp/include/lemon/cli_parser.h
@@ -20,6 +20,9 @@ struct ServerConfig {
 
     // Multi-model support: Max loaded models per type slot
     int max_loaded_models = 1;
+
+    // Auto-unload support: Global default timer timeout in seconds after inactivity (0 = disabled)
+    long global_auto_unload_timer = 0;
 };
 
 struct TrayConfig {
diff --git a/src/cpp/include/lemon/router.h b/src/cpp/include/lemon/router.h
index 414b3dbd7..4c6d3b2ca 100644
--- a/src/cpp/include/lemon/router.h
+++ b/src/cpp/include/lemon/router.h
@@ -5,6 +5,11 @@
 #include <mutex>
 #include <condition_variable>
 #include <vector>
+#include <unordered_map>
+#include <atomic>
+#include <thread>
+#include <chrono>
+#include <functional>
 #include <nlohmann/json.hpp>
 #include <httplib.h>
 #include "wrapped_server.h"
@@ -21,7 +26,8 @@ class Router {
            const std::string& log_level,
            ModelManager* model_manager,
            int max_loaded_models,
-           BackendManager* backend_manager);
+           BackendManager* backend_manager,
+           long auto_unload_timeout);  // Auto-unload timeout in seconds (0 = disabled)
 
     ~Router();
 
@@ -88,7 +94,33 @@ class Router {
     // Update prompt_tokens field from usage
     void update_prompt_tokens(int prompt_tokens);
 
+    // Auto-unload support: Get/set timeout at runtime
+    long get_auto_unload_timeout() const { return auto_unload_timeout_sec_; }
+    void set_auto_unload_timeout(long timeout_seconds);
+
+    // Auto-unload support: Reset timer for a specific model (called on each request)
+    void reset_auto_unload_timer(const std::string& model_name);
+
 private:
+    // One auto-unload countdown for a single model: the timer thread plus the state it shares with Router
+    struct ModelTimerEntry {
+        // Shared state between Router and timer thread, kept alive by shared_ptr
+        // so the thread can safely detect cancellation even after entry is erased from map
+        struct State {
+            std::mutex mtx;  // Held by cancel() while setting `cancelled`, and by the timer thread while it waits on cv
+            std::condition_variable cv;  // The timer thread sleeps on this so that cancel() can wake it early
+            bool cancelled{false};     // Set by cancel(), wakes the timer thread
+            bool shutdown{false};      // Checked by the timer thread before firing. NOTE(review): no visible code sets it - confirm
+            std::chrono::seconds timeout_duration{0};  // Duration recorded by start_timer()
+        };
+        std::shared_ptr<State> state;  // Null until start_timer() has been called
+        std::thread timer_thread;  // Detached by cancel(); no join appears in the visible code
+
+        void start_timer(std::chrono::seconds duration, std::function<void()> on_timeout);  // Spawns the thread; on_timeout runs on it
+        void cancel();  // Non-blocking: marks cancelled + notifies cv, no thread join
+        bool is_active() const { return state != nullptr && !state->cancelled; }  // NOTE(review): reads `cancelled` without locking mtx
+    };
+
     // Multi-model support: Manage multiple WrappedServers
     std::vector<std::unique_ptr<WrappedServer>> loaded_servers_;
 
@@ -106,6 +138,12 @@ class Router {
     bool is_loading_ = false;                    // True when a load operation is in progress
     std::condition_variable load_cv_;            // Signals when load completes
 
+    // Auto-unload support
+    long auto_unload_timeout_sec_ = 0;           // Global timeout in seconds (0 = disabled)
+    mutable std::mutex timers_mutex_;            // Protects model_timers_
+    std::unordered_map<std::string, std::unique_ptr<ModelTimerEntry>> model_timers_;
+    std::atomic<bool> unload_in_progress_{false}; // Prevents new timer creation during unload
+
     // Helper methods for multi-model management
     WrappedServer* find_server_by_model_name(const std::string& model_name) const;
     WrappedServer* get_most_recent_server() const;
@@ -116,10 +154,14 @@ class Router {
     WrappedServer* find_npu_server_by_recipe(const std::string& recipe) const;
     WrappedServer* find_flm_server_by_type(ModelType type) const;
     void evict_all_npu_servers();
-    void evict_server(WrappedServer* server);
+    bool evict_server(WrappedServer* server);  // Returns true if evicted, false if skipped (still busy)
     void evict_all_servers();
     std::unique_ptr<WrappedServer> create_backend_server(const ModelInfo& model_info);
 
+    // Auto-unload helper methods
+    void start_auto_unload_timer(const std::string& model_name, long timeout_seconds);
+    void cancel_auto_unload_timer(const std::string& model_name);
+
     // Generic inference wrapper that handles locking and busy state
     template<typename Func>
     auto execute_inference(const json& request, Func&& inference_func) -> decltype(inference_func(nullptr));
diff --git a/src/cpp/include/lemon/server.h b/src/cpp/include/lemon/server.h
index b16675d39..5899a8221 100644
--- a/src/cpp/include/lemon/server.h
+++ b/src/cpp/include/lemon/server.h
@@ -30,7 +30,8 @@ class Server {
            int max_loaded_models,
            const std::string& extra_models_dir,
            bool no_broadcast,
-           long http_timeout);
+           long http_timeout,
+           long auto_unload_timeout);
 
     ~Server();
 
diff --git a/src/cpp/include/lemon/wrapped_server.h b/src/cpp/include/lemon/wrapped_server.h
index 75bd1729c..fa414024a 100644
--- a/src/cpp/include/lemon/wrapped_server.h
+++ b/src/cpp/include/lemon/wrapped_server.h
@@ -96,6 +96,16 @@ class WrappedServer : public ICompletionServer {
         }
     }
 
+    // Blocks until the server stops being busy or `timeout` elapses.
+    // Returns true once idle, false if it is still busy when the timeout expires.
+    bool wait_until_not_busy_with_timeout(std::chrono::seconds timeout) const {
+        std::unique_lock<std::mutex> guard(busy_mutex_);
+        // wait_for evaluates the predicate before it ever sleeps, so an idle
+        // server yields true right away without a separate up-front check.
+        const auto idle = [this] { return !is_busy_; };
+        return busy_cv_.wait_for(guard, timeout, idle);
+    }
+
     // Multi-model support: Model metadata
     void set_model_metadata(const std::string& model_name, const std::string& checkpoint,
                            ModelType type, DeviceType device, const RecipeOptions& recipe_options) {
diff --git a/src/cpp/server/cli_parser.cpp b/src/cpp/server/cli_parser.cpp
index 61e360055..eb55934b9 100644
--- a/src/cpp/server/cli_parser.cpp
+++ b/src/cpp/server/cli_parser.cpp
@@ -77,6 +77,19 @@ static void add_serve_options(CLI::App* serve, ServerConfig& config) {
                 return "Value must be a positive integer or -1 for unlimited (got '" + val + "')";
             }
         });
+
+    // Auto-unload support: Global default timer timeout in seconds (0 = disabled, via env var)
+    if (auto env_val = std::getenv("LEMONADE_GLOBAL_AUTO_UNLOAD_TIMER")) {
+        try {
+            config.global_auto_unload_timer = std::stol(env_val);
+            if (config.global_auto_unload_timer < 0) {
+                config.global_auto_unload_timer = 0;
+            }
+        } catch (...) {
+            config.global_auto_unload_timer = 0;
+        }
+    }
+
     RecipeOptions::add_cli_options(*serve, config.recipe_options);
 }
 
diff --git a/src/cpp/server/main.cpp b/src/cpp/server/main.cpp
index 29296d5ad..5be4f0609 100644
--- a/src/cpp/server/main.cpp
+++ b/src/cpp/server/main.cpp
@@ -79,7 +79,7 @@ int main(int argc, char** argv) {
         Server server(config.port, config.host, config.log_level,
                     config.recipe_options, config.max_loaded_models,
                     config.extra_models_dir, config.no_broadcast,
-                    config.global_timeout);
+                    config.global_timeout, config.global_auto_unload_timer);
 
         // Register signal handler for Ctrl+C
         g_server_instance = &server;
diff --git a/src/cpp/server/router.cpp b/src/cpp/server/router.cpp
index 99b9334de..6040494f4 100644
--- a/src/cpp/server/router.cpp
+++ b/src/cpp/server/router.cpp
@@ -15,15 +15,23 @@
 namespace lemon {
 
 Router::Router(const json& default_options, const std::string& log_level, ModelManager* model_manager,
-               int max_loaded_models, BackendManager* backend_manager)
+               int max_loaded_models, BackendManager* backend_manager,
+               long auto_unload_timeout)
     : default_options_(default_options), log_level_(log_level), model_manager_(model_manager),
-      max_loaded_models_(max_loaded_models), backend_manager_(backend_manager) {
+      max_loaded_models_(max_loaded_models), backend_manager_(backend_manager),
+      auto_unload_timeout_sec_(auto_unload_timeout) {
 
     if (max_loaded_models_ == -1) {
     LOG(DEBUG, "Router") << "Max loaded models per type: unlimited" << std::endl;
     } else {
     LOG(DEBUG, "Router") << "Max loaded models per type: " << max_loaded_models_ << std::endl;
     }
+
+    if (auto_unload_timeout_sec_ > 0) {
+        LOG(INFO, "Router") << "Auto-unload timeout enabled: " << auto_unload_timeout_sec_ << " seconds" << std::endl;
+    } else {
+        LOG(DEBUG, "Router") << "Auto-unload timeout disabled" << std::endl;
+    }
 }
 
 Router::~Router() {
@@ -31,6 +39,170 @@ Router::~Router() {
     unload_model("");  // Unload all
 }
 
+// ============================================================================
+// Auto-unload timer methods
+// ============================================================================
+
+// Starts the countdown thread: sleeps up to `duration` (interruptible via cancel()),
+// then runs `on_timeout` on that thread unless the timer was cancelled or shut down.
+void Router::ModelTimerEntry::start_timer(std::chrono::seconds duration,
+                                          std::function<void()> on_timeout) {
+    state = std::make_shared<State>();
+    state->timeout_duration = duration;
+
+    // Capture shared_ptr by value to keep State alive for thread's lifetime
+    auto s = state;
+    timer_thread = std::thread([s, duration, on_timeout]() {
+        // Decide whether to fire while still holding mtx: cancel() writes the flags
+        // under this mutex, so reading them after unlocking would be a data race.
+        bool should_fire = false;
+        {
+            std::unique_lock<std::mutex> lock(s->mtx);
+            const bool cancelled = s->cv.wait_for(lock, duration, [&s] { return s->cancelled; });
+            should_fire = !cancelled && !s->shutdown && static_cast<bool>(on_timeout);
+        }
+        if (!should_fire) {
+            return;
+        }
+        try {
+            on_timeout();
+        } catch (...) {
+            // Swallow: an exception escaping a thread entry point calls std::terminate
+        }
+    });
+}
+
+// Requests cancellation and returns immediately; the timer thread is never joined here.
+void Router::ModelTimerEntry::cancel() {
+    if (state != nullptr) {
+        std::lock_guard<std::mutex> guard(state->mtx);
+        state->cancelled = true;
+        state->cv.notify_one();
+    }
+    // A sleeping timer thread wakes, sees cancelled=true and returns on its own.
+    // Detaching keeps ~thread() from calling std::terminate on a joinable thread.
+    if (timer_thread.joinable()) {
+        timer_thread.detach();
+    }
+}
+
+// Changes the global idle timeout; a value <= 0 turns auto-unload off and drops every pending timer.
+void Router::set_auto_unload_timeout(long timeout_seconds) {
+    std::lock_guard<std::mutex> guard(timers_mutex_);
+    auto_unload_timeout_sec_ = timeout_seconds;
+
+    if (auto_unload_timeout_sec_ > 0) {
+        LOG(INFO, "Router") << "Auto-unload timeout set to: " << auto_unload_timeout_sec_ << " seconds" << std::endl;
+        return;  // timers that are already running keep the duration they were started with
+    }
+
+    LOG(INFO, "Router") << "Auto-unload timeout disabled" << std::endl;
+    // Disabled: cancel every registered timer, then forget the entries
+    for (auto& [name, entry] : model_timers_) {
+        if (entry) entry->cancel();  // non-blocking: flag + notify + detach
+    }
+    model_timers_.clear();
+}
+
+// Restarts the idle countdown for `model_name`; called after each request finishes.
+// No-op when auto-unload is disabled or while unload_model() is in progress.
+void Router::reset_auto_unload_timer(const std::string& model_name) {
+    // Don't create new timers during unload (prevents race with unload_model)
+    if (unload_in_progress_.load()) {
+        return;
+    }
+
+    std::lock_guard<std::mutex> lock(timers_mutex_);
+
+    // Read the timeout only under timers_mutex_: set_auto_unload_timeout() writes it
+    // under the same mutex. Reading it before locking would let a concurrent "disable"
+    // slip in between the check and the timer creation, arming a 0-second timer.
+    const long timeout_sec = auto_unload_timeout_sec_;
+    if (timeout_sec <= 0) {
+        return;  // Auto-unload disabled
+    }
+
+    // Cancel the previous timer, if any. cancel() only locks the entry's own State
+    // mutex (never timers_mutex_), so calling it here cannot self-deadlock.
+    auto it = model_timers_.find(model_name);
+    if (it != model_timers_.end() && it->second) {
+        it->second->cancel();
+    }
+
+    // Always create a new timer. The callback runs on the timer thread; unload_model()
+    // throws when the model is not loaded or still busy, which is only logged.
+    auto timer_entry = std::make_unique<ModelTimerEntry>();
+    timer_entry->start_timer(
+        std::chrono::seconds(timeout_sec),
+        [this, model_name]() {
+            LOG(INFO, "Router") << "Auto-unload timeout expired for: " << model_name << std::endl;
+            try {
+                unload_model(model_name);
+            } catch (const std::exception& e) {
+                LOG(DEBUG, "Router") << "Auto-unload skipped: " << e.what() << std::endl;
+            }
+        }
+    );
+    model_timers_[model_name] = std::move(timer_entry);
+
+    LOG(DEBUG, "Router") << "Reset auto-unload timer for: " << model_name << std::endl;
+}
+
+// Arms the idle-unload countdown for `model_name`, replacing any countdown already
+// registered for it.
+// Does nothing when `timeout_seconds` is not positive (auto-unload disabled) or
+// while unload_model() is running.
+void Router::start_auto_unload_timer(const std::string& model_name, long timeout_seconds) {
+    const bool disabled = timeout_seconds <= 0;
+    if (disabled) {
+        return;
+    }
+
+    // unload_model() raises this flag for its whole duration; skip scheduling meanwhile.
+    if (unload_in_progress_.load()) {
+        return;
+    }
+
+    std::lock_guard<std::mutex> guard(timers_mutex_);
+
+    // Retire the timer currently registered for this model, if there is one.
+    // cancel() flags it, wakes its thread and detaches it; it locks only the
+    // entry's own State mutex, so holding timers_mutex_ here is fine.
+    const auto previous = model_timers_.find(model_name);
+    if (previous != model_timers_.end() && previous->second) {
+        previous->second->cancel();
+    }
+
+    // Runs on the timer thread once the countdown elapses. unload_model() throws
+    // when the model is not loaded or stays busy; that is logged and ignored.
+    auto on_expiry = [this, model_name]() {
+        LOG(INFO, "Router") << "Auto-unload timeout expired for: " << model_name << std::endl;
+        try {
+            unload_model(model_name);
+        } catch (const std::exception& e) {
+            LOG(DEBUG, "Router") << "Auto-unload skipped: " << e.what() << std::endl;
+        }
+    };
+
+    auto fresh_entry = std::make_unique<ModelTimerEntry>();
+    fresh_entry->start_timer(std::chrono::seconds(timeout_seconds),
+                             std::move(on_expiry));
+    model_timers_[model_name] = std::move(fresh_entry);
+
+    LOG(DEBUG, "Router") << "Started auto-unload timer for: " << model_name
+                         << " (" << timeout_seconds << " seconds)" << std::endl;
+}
+
+// Stops and forgets the pending auto-unload timer for `model_name`, if one is registered.
+void Router::cancel_auto_unload_timer(const std::string& model_name) {
+    std::lock_guard<std::mutex> guard(timers_mutex_);
+    const auto found = model_timers_.find(model_name);
+    if (found == model_timers_.end() || !found->second) return;  // nothing to cancel
+    found->second->cancel();  // Non-blocking: sets flag + detaches thread
+    model_timers_.erase(found);
+    LOG(DEBUG, "Router") << "Cancelled auto-unload timer for: " << model_name << std::endl;
+}
+
 WrappedServer* Router::find_server_by_model_name(const std::string& model_name) const {
     for (const auto& server : loaded_servers_) {
         if (server->get_model_name() == model_name) {
@@ -128,19 +300,32 @@ void Router::evict_all_npu_servers() {
     }
     for (auto* server : npu_servers) {
         LOG(INFO, "Router") << "Evicting NPU server: " << server->get_model_name() << std::endl;
-        evict_server(server);
+        evict_server(server);  // Ignore return value for bulk eviction
     }
 }
 
 // Helper: Evict a specific server
-void Router::evict_server(WrappedServer* server) {
-    if (!server) return;
+// Returns true if successfully evicted, false if skipped (still busy)
+// NOTE: Called with load_mutex_ held. Timer cancellation must be non-blocking.
+bool Router::evict_server(WrappedServer* server) {
+    if (!server) return true;
 
     std::string model_name = server->get_model_name();
     LOG(INFO, "Router") << "Evicting model: " << model_name << std::endl;
 
-    // Wait for any ongoing inference to complete
-    server->wait_until_not_busy();
+    // Cancel auto-unload timer for this model (non-blocking, safe under load_mutex_)
+    if (auto_unload_timeout_sec_ > 0) {
+        cancel_auto_unload_timer(model_name);
+    }
+
+    // Wait for any ongoing inference to complete (with timeout to avoid deadlock)
+    // Use 30 second timeout - if model is still busy after this, skip unload
+    const std::chrono::seconds wait_timeout(30);
+    if (!server->wait_until_not_busy_with_timeout(wait_timeout)) {
+        LOG(WARNING, "Router") << "Model still busy after " << wait_timeout.count() 
+                               << " seconds, skipping unload: " << model_name << std::endl;
+        return false;  // Don't unload - caller should reschedule the timer
+    }
 
     // Unload the server
     server->unload();
@@ -155,19 +340,34 @@ void Router::evict_server(WrappedServer* server) {
     );
 
     LOG(INFO, "Router") << "Evicted model: " << model_name << std::endl;
+    return true;
 }
 
 void Router::evict_all_servers() {
     LOG(INFO, "Router") << "Evicting all models (" << loaded_servers_.size() << " total)" << std::endl;
 
-    // Wait for all servers to finish
+    // Cancel all auto-unload timers first (non-blocking, safe under load_mutex_)
+    if (auto_unload_timeout_sec_ > 0) {
+        std::lock_guard<std::mutex> timer_lock(timers_mutex_);
+        for (auto& [name, timer_entry] : model_timers_) {
+            if (timer_entry) {
+                timer_entry->cancel();
+            }
+        }
+        model_timers_.clear();
+    }
+
+    // Wait for all servers to finish (with timeout to avoid indefinite blocking)
+    const std::chrono::seconds wait_timeout(30);
     for (const auto& server : loaded_servers_) {
-        server->wait_until_not_busy();
+        if (!server->wait_until_not_busy_with_timeout(wait_timeout)) {
+            LOG(WARNING, "Router") << "Model still busy after timeout: " << server->get_model_name() << std::endl;
+        }
     }
 
     // Unload all
     for (const auto& server : loaded_servers_) {
-    LOG(INFO, "Router") << "Unloading: " << server->get_model_name() << std::endl;
+        LOG(INFO, "Router") << "Unloading: " << server->get_model_name() << std::endl;
         server->unload();
     }
 
@@ -344,6 +544,11 @@ void Router::load_model(const std::string& model_name,
 
         LOG(INFO, "Router") << "Model loaded successfully. Total loaded: "
                       << loaded_servers_.size() << std::endl;
+
+        // Start auto-unload timer if enabled
+        if (auto_unload_timeout_sec_ > 0) {
+            start_auto_unload_timer(model_name, auto_unload_timeout_sec_);
+        }
         } else {
             // ERROR HANDLING (from spec: Error Handling section)
             // Check if error is "file not found" (exception to nuclear policy)
@@ -386,6 +591,11 @@ void Router::load_model(const std::string& model_name,
                 load_cv_.notify_all();
 
             LOG(DEBUG, "Router") << "Retry successful!" << std::endl;
+
+            // Start auto-unload timer if enabled
+            if (auto_unload_timeout_sec_ > 0) {
+                start_auto_unload_timer(model_name, auto_unload_timeout_sec_);
+            }
             } catch (const std::exception& retry_error) {
                 lock.lock();
                 is_loading_ = false;
@@ -410,21 +620,48 @@ void Router::load_model(const std::string& model_name,
 }
 
 void Router::unload_model(const std::string& model_name) {
-    std::lock_guard<std::mutex> lock(load_mutex_);
+    // Set flag to prevent new timer creation during unload
+    unload_in_progress_ = true;
 
-    if (model_name.empty()) {
-        // Unload all models
-    LOG(INFO, "Router") << "Unload all models called" << std::endl;
-        evict_all_servers();
+    // Cancel auto-unload timer(s) BEFORE acquiring load_mutex_
+    // This prevents deadlock with timer thread (which also needs load_mutex_)
+    if (!model_name.empty()) {
+        cancel_auto_unload_timer(model_name);
     } else {
-        // Unload specific model
-    LOG(INFO, "Router") << "Unload model called: " << model_name << std::endl;
-        WrappedServer* server = find_server_by_model_name(model_name);
-        if (!server) {
-            throw std::runtime_error("Model not loaded: " + model_name);
+        // Cancel all timers when unloading all models
+        std::lock_guard<std::mutex> timer_lock(timers_mutex_);
+        for (auto& [name, timer_entry] : model_timers_) {
+            if (timer_entry) {
+                timer_entry->cancel();
+            }
         }
-        evict_server(server);
+        model_timers_.clear();
     }
+
+    std::lock_guard<std::mutex> lock(load_mutex_);
+
+    try {
+        if (model_name.empty()) {
+            // Unload all models
+            LOG(INFO, "Router") << "Unload all models called" << std::endl;
+            evict_all_servers();
+        } else {
+            // Unload specific model
+            LOG(INFO, "Router") << "Unload model called: " << model_name << std::endl;
+            WrappedServer* server = find_server_by_model_name(model_name);
+            if (!server) {
+                throw std::runtime_error("Model not loaded: " + model_name);
+            }
+            if (!evict_server(server)) {
+                throw std::runtime_error("Model is busy and could not be unloaded within timeout: " + model_name);
+            }
+        }
+    } catch (...) {
+        unload_in_progress_ = false;
+        throw;
+    }
+
+    unload_in_progress_ = false;
 }
 
 std::string Router::get_loaded_model() const {
@@ -509,12 +746,12 @@ std::string Router::get_backend_address() const {
 template<typename Func>
 auto Router::execute_inference(const json& request, Func&& inference_func) -> decltype(inference_func(nullptr)) {
     WrappedServer* server = nullptr;
+    std::string requested_model;
 
     {
         std::lock_guard<std::mutex> lock(load_mutex_);
 
         // Extract model from request - required field, no fallback to avoid silent misrouting
-        std::string requested_model;
         if (request.contains("model") && request["model"].is_string()) {
             requested_model = request["model"].get<std::string>();
         }
@@ -528,6 +765,11 @@ auto Router::execute_inference(const json& request, Func&& inference_func) -> de
             return ErrorResponse::from_exception(ModelNotLoadedException(requested_model));
         }
 
+        // Cancel auto-unload timer while processing request
+        if (auto_unload_timeout_sec_ > 0) {
+            cancel_auto_unload_timer(requested_model);
+        }
+
         // Mark as busy and update access time
         server->set_busy(true);
         server->update_access_time();
@@ -537,9 +779,21 @@ auto Router::execute_inference(const json& request, Func&& inference_func) -> de
     try {
         auto response = inference_func(server);
         server->set_busy(false);
+
+        // Reset auto-unload timer when model becomes idle (after request completes)
+        if (!requested_model.empty() && auto_unload_timeout_sec_ > 0) {
+            reset_auto_unload_timer(requested_model);
+        }
+
         return response;
     } catch (...) {
         server->set_busy(false);
+
+        // Reset auto-unload timer when model becomes idle (after error)
+        if (!requested_model.empty() && auto_unload_timeout_sec_ > 0) {
+            reset_auto_unload_timer(requested_model);
+        }
+
         throw;
     }
 }
@@ -548,12 +802,12 @@ auto Router::execute_inference(const json& request, Func&& inference_func) -> de
 template<typename Func>
 void Router::execute_streaming(const std::string& request_body, httplib::DataSink& sink, Func&& streaming_func) {
     WrappedServer* server = nullptr;
+    std::string requested_model;
 
     {
         std::lock_guard<std::mutex> lock(load_mutex_);
 
         // Extract model from request body if present (same logic as execute_inference)
-        std::string requested_model;
         try {
             json request = json::parse(request_body);
             if (request.contains("model") && request["model"].is_string()) {
@@ -579,6 +833,11 @@ void Router::execute_streaming(const std::string& request_body, httplib::DataSin
             return;
         }
 
+        // Cancel auto-unload timer while processing request
+        if (auto_unload_timeout_sec_ > 0) {
+            cancel_auto_unload_timer(requested_model);
+        }
+
         server->set_busy(true);
         server->update_access_time();
     }
@@ -586,8 +845,19 @@ void Router::execute_streaming(const std::string& request_body, httplib::DataSin
     try {
         streaming_func(server);
         server->set_busy(false);
+
+        // Reset auto-unload timer when model becomes idle (after streaming completes)
+        if (!requested_model.empty() && auto_unload_timeout_sec_ > 0) {
+            reset_auto_unload_timer(requested_model);
+        }
     } catch (...) {
         server->set_busy(false);
+
+        // Reset auto-unload timer when model becomes idle (after error)
+        if (!requested_model.empty() && auto_unload_timeout_sec_ > 0) {
+            reset_auto_unload_timer(requested_model);
+        }
+
         throw;
     }
 }
diff --git a/src/cpp/server/server.cpp b/src/cpp/server/server.cpp
index 4c8d837d4..2fc6be7ba 100644
--- a/src/cpp/server/server.cpp
+++ b/src/cpp/server/server.cpp
@@ -67,7 +67,7 @@ static const json MIME_TYPES = {
 Server::Server(int port, const std::string& host, const std::string& log_level,
                const json& default_options, int max_loaded_models,
                const std::string& extra_models_dir, bool no_broadcast,
-               long global_timeout)
+               long global_timeout, long auto_unload_timeout)
     : port_(port), host_(host), log_level_(log_level), default_options_(default_options),
       no_broadcast_(no_broadcast), running_(false), udp_beacon_() {
 
@@ -112,9 +112,22 @@ Server::Server(int port, const std::string& host, const std::string& log_level,
 
     backend_manager_ = std::make_unique<BackendManager>();
 
+    // Read global auto-unload timer from environment variable
+    long global_auto_unload_timer = 0;
+    if (auto env_val = std::getenv("LEMONADE_GLOBAL_AUTO_UNLOAD_TIMER")) {
+        try {
+            global_auto_unload_timer = std::stol(env_val);
+            if (global_auto_unload_timer < 0) {
+                global_auto_unload_timer = 0;
+            }
+        } catch (...) {
+            global_auto_unload_timer = 0;
+        }
+    }
+
     router_ = std::make_unique<Router>(default_options_, log_level_,
                                        model_manager_.get(), max_loaded_models,
-                                       backend_manager_.get());
+                                       backend_manager_.get(), global_auto_unload_timer);
 
     LOG(DEBUG, "Server") << "Debug logging enabled - subprocess output will be visible" << std::endl;