lemonade-sdk · jaeiclee · Mar 17, 2026
diff --git a/docs/server/lemonade-server-cli.md b/docs/server/lemonade-server-cli.md
@@ -93,6 +93,7 @@ These settings can also be provided via environment variables that Lemonade Serv
 | `LEMONADE_DISABLE_MODEL_FILTERING` | Set to `1` to disable hardware-based model filtering (e.g., RAM amount, NPU availability) and show all models regardless of system capabilities         |
 | `LEMONADE_ENABLE_DGPU_GTT`         | Set to `1` to include GTT for hardware-based model filtering |
 | `LEMONADE_GLOBAL_TIMEOUT`          | Global default timeout for HTTP requests, inference, and readiness checks in seconds |
+| `LEMONADE_GLOBAL_AUTO_UNLOAD_TIMER` | Automatically unload models after this many seconds of idle time. Set to a positive integer to enable, or `0` to disable. Negative or unparseable values are treated as `0`. Default: `0` (disabled) |
 
 #### Custom Backend Binaries
 

diff --git a/src/cpp/include/lemon/cli_parser.h b/src/cpp/include/lemon/cli_parser.h
@@ -20,6 +20,9 @@ struct ServerConfig {
 
     // Multi-model support: Max loaded models per type slot
     int max_loaded_models = 1;
+
+    // Auto-unload support: Global default timer timeout in seconds after inactivity (0 = disabled)
+    long global_auto_unload_timer = 0;
 };
 
 struct TrayConfig {

diff --git a/src/cpp/include/lemon/router.h b/src/cpp/include/lemon/router.h
@@ -5,6 +5,11 @@
 #include <mutex>
 #include <condition_variable>
 #include <vector>
+#include <unordered_map>
+#include <atomic>
+#include <thread>
+#include <chrono>
+#include <functional>
 #include <nlohmann/json.hpp>
 #include <httplib.h>
 #include "wrapped_server.h"
@@ -21,7 +26,8 @@ class Router {
            const std::string& log_level,
            ModelManager* model_manager,
            int max_loaded_models,
-           BackendManager* backend_manager);
+           BackendManager* backend_manager,
+           long auto_unload_timeout);  // Auto-unload timeout in seconds (0 = disabled)
 
     ~Router();
 
@@ -88,7 +94,33 @@ class Router {
     // Update prompt_tokens field from usage
     void update_prompt_tokens(int prompt_tokens);
 
+    // Auto-unload support: Get/set the idle timeout, in seconds, at runtime (0 = disabled)
+    long get_auto_unload_timeout() const { return auto_unload_timeout_sec_; }  // NOTE(review): unsynchronized read of a plain long; confirm no concurrent set_auto_unload_timeout()
+    void set_auto_unload_timeout(long timeout_seconds);
+
+    // Auto-unload support: Reset timer for a specific model (called on each request)
+    void reset_auto_unload_timer(const std::string& model_name);
+
 private:
+    // Auto-unload timer entry: pairs a timer thread with the state it shares with Router
+    struct ModelTimerEntry {
+        // Shared state between Router and timer thread, kept alive by shared_ptr
+        // so the thread can safely detect cancellation even after entry is erased from map
+        struct State {
+            std::mutex mtx;                // NOTE(review): presumably guards the fields below; confirm in the implementation
+            std::condition_variable cv;    // Notified by cancel() to wake the timer thread
+            bool cancelled{false};     // Set by cancel(), wakes the timer thread
+            bool shutdown{false};      // Set during unload to prevent rescheduling
+            std::chrono::seconds timeout_duration{0};  // Zero-initialized; presumably assigned by start_timer() (confirm)
+        };
+        std::shared_ptr<State> state;   // Default-constructed null; is_active() treats null as inactive
+        std::thread timer_thread;       // NOTE(review): cancel() never joins; confirm the thread is detached or joined before destruction
+
+        void start_timer(std::chrono::seconds duration, std::function<void()> on_timeout);
+        void cancel();  // Non-blocking: marks cancelled + notifies cv, no thread join
+        bool is_active() const { return state != nullptr && !state->cancelled; }  // NOTE(review): reads cancelled without locking mtx; confirm callers serialize with cancel()
+    };
+
     // Multi-model support: Manage multiple WrappedServers
     std::vector<std::unique_ptr<WrappedServer>> loaded_servers_;
 
@@ -106,6 +138,12 @@ class Router {
     bool is_loading_ = false;                    // True when a load operation is in progress
     std::condition_variable load_cv_;            // Signals when load completes
 
+    // Auto-unload support
+    long auto_unload_timeout_sec_ = 0;           // Global timeout in seconds (0 = disabled)
+    mutable std::mutex timers_mutex_;            // Protects model_timers_
+    std::unordered_map<std::string, std::unique_ptr<ModelTimerEntry>> model_timers_;
+    std::atomic<bool> unload_in_progress_{false}; // Prevents new timer creation during unload
+
     // Helper methods for multi-model management
     WrappedServer* find_server_by_model_name(const std::string& model_name) const;
     WrappedServer* get_most_recent_server() const;
@@ -116,10 +154,14 @@ class Router {
     WrappedServer* find_npu_server_by_recipe(const std::string& recipe) const;
     WrappedServer* find_flm_server_by_type(ModelType type) const;
     void evict_all_npu_servers();
-    void evict_server(WrappedServer* server);
+    bool evict_server(WrappedServer* server);  // Returns true if evicted, false if skipped (still busy)
     void evict_all_servers();
     std::unique_ptr<WrappedServer> create_backend_server(const ModelInfo& model_info);
 
+    // Auto-unload helper methods
+    void start_auto_unload_timer(const std::string& model_name, long timeout_seconds);
+    void cancel_auto_unload_timer(const std::string& model_name);
+
     // Generic inference wrapper that handles locking and busy state
     template<typename Func>
     auto execute_inference(const json& request, Func&& inference_func) -> decltype(inference_func(nullptr));

diff --git a/src/cpp/include/lemon/server.h b/src/cpp/include/lemon/server.h
@@ -30,7 +30,8 @@ class Server {
            int max_loaded_models,
            const std::string& extra_models_dir,
            bool no_broadcast,
-           long http_timeout);
+           long http_timeout,
+           long auto_unload_timeout);
 
     ~Server();
 

diff --git a/src/cpp/include/lemon/wrapped_server.h b/src/cpp/include/lemon/wrapped_server.h
@@ -96,6 +96,16 @@ class WrappedServer : public ICompletionServer {
         }
     }
 
+    // Bounded wait for the server to become idle.
+    // Returns true once is_busy_ is false, or false if `timeout` elapses first.
+    bool wait_until_not_busy_with_timeout(std::chrono::seconds timeout) const {
+        std::unique_lock<std::mutex> guard(busy_mutex_);
+        const auto idle = [this] { return !is_busy_; };
+        // Fast path: skip the timed wait entirely when already idle.
+        // Otherwise wait_for yields the predicate's value at wake-up or timeout.
+        return idle() || busy_cv_.wait_for(guard, timeout, idle);
+    }
+
     // Multi-model support: Model metadata
     void set_model_metadata(const std::string& model_name, const std::string& checkpoint,
                            ModelType type, DeviceType device, const RecipeOptions& recipe_options) {

diff --git a/src/cpp/server/cli_parser.cpp b/src/cpp/server/cli_parser.cpp
@@ -77,6 +77,19 @@ static void add_serve_options(CLI::App* serve, ServerConfig& config) {
                 return "Value must be a positive integer or -1 for unlimited (got '" + val + "')";
             }
         });
+
+    // Auto-unload support: Global default timer timeout in seconds (0 = disabled, via env var)
+    if (auto env_val = std::getenv("LEMONADE_GLOBAL_AUTO_UNLOAD_TIMER")) {
+        try {
+            config.global_auto_unload_timer = std::stol(env_val);
+            if (config.global_auto_unload_timer < 0) {
+                config.global_auto_unload_timer = 0;
+            }
+        } catch (...) {
+            config.global_auto_unload_timer = 0;
+        }
+    }
+
     RecipeOptions::add_cli_options(*serve, config.recipe_options);
 }
 

diff --git a/src/cpp/server/main.cpp b/src/cpp/server/main.cpp
@@ -79,7 +79,7 @@ int main(int argc, char** argv) {
         Server server(config.port, config.host, config.log_level,
                     config.recipe_options, config.max_loaded_models,
                     config.extra_models_dir, config.no_broadcast,
-                    config.global_timeout);
+                    config.global_timeout, config.global_auto_unload_timer);
 
         // Register signal handler for Ctrl+C
         g_server_instance = &server;