Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/server/lemonade-server-cli.md
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ These settings can also be provided via environment variables that Lemonade Serv
| `LEMONADE_DISABLE_MODEL_FILTERING` | Set to `1` to disable hardware-based model filtering (e.g., RAM amount, NPU availability) and show all models regardless of system capabilities |
| `LEMONADE_ENABLE_DGPU_GTT` | Set to `1` to include GTT for hardware-based model filtering |
| `LEMONADE_GLOBAL_TIMEOUT` | Global default timeout for HTTP requests, inference, and readiness checks in seconds |
| `LEMONADE_GLOBAL_AUTO_UNLOAD_TIMER` | Automatically unload models after this many seconds of idle time. Set to a positive integer to enable, or `0` to disable. Negative or unparsable values are treated as `0`. Default: `0` (disabled) |

#### Custom Backend Binaries

Expand Down
3 changes: 3 additions & 0 deletions src/cpp/include/lemon/cli_parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@ struct ServerConfig {

// Multi-model support: Max loaded models per type slot
int max_loaded_models = 1;

// Auto-unload support: global idle timeout in seconds before loaded models are unloaded (0 = disabled)
long global_auto_unload_timer = 0;
};

struct TrayConfig {
Expand Down
46 changes: 44 additions & 2 deletions src/cpp/include/lemon/router.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,11 @@
#include <mutex>
#include <condition_variable>
#include <vector>
#include <unordered_map>
#include <atomic>
#include <thread>
#include <chrono>
#include <functional>
#include <nlohmann/json.hpp>
#include <httplib.h>
#include "wrapped_server.h"
Expand All @@ -21,7 +26,8 @@ class Router {
const std::string& log_level,
ModelManager* model_manager,
int max_loaded_models,
BackendManager* backend_manager);
BackendManager* backend_manager,
long auto_unload_timeout); // Auto-unload timeout in seconds (0 = disabled)

~Router();

Expand Down Expand Up @@ -88,7 +94,33 @@ class Router {
// Update prompt_tokens field from usage
void update_prompt_tokens(int prompt_tokens);

// Auto-unload support: runtime accessors for the global idle timeout.
// The getter hands back the stored value in seconds (0 = disabled).
long get_auto_unload_timeout() const {
    return auto_unload_timeout_sec_;
}
void set_auto_unload_timeout(long timeout_seconds);

// Auto-unload support: Reset timer for a specific model (called on each request)
void reset_auto_unload_timer(const std::string& model_name);

private:
// Bookkeeping for a single model's auto-unload countdown.
struct ModelTimerEntry {
    // The state lives behind a shared_ptr so that the timer thread can keep it
    // alive and still notice a cancellation after this entry has been erased
    // from the owning map.
    struct State {
        std::mutex mtx;
        std::condition_variable cv;
        bool cancelled = false;  // Set by cancel(), wakes the timer thread
        bool shutdown = false;   // Set during unload to prevent rescheduling
        std::chrono::seconds timeout_duration = std::chrono::seconds{0};
    };

    std::shared_ptr<State> state;
    std::thread timer_thread;

    // Begins a countdown of `duration`; `on_timeout` is the callback supplied
    // by the caller (defined out of line).
    void start_timer(std::chrono::seconds duration, std::function<void()> on_timeout);

    // Non-blocking: marks cancelled + notifies cv, no thread join
    void cancel();

    // True only when a State object exists and has not been flagged cancelled.
    // NOTE(review): `cancelled` is read here without holding `mtx` -- confirm
    // that callers provide the necessary synchronisation.
    bool is_active() const {
        if (!state) {
            return false;
        }
        return !state->cancelled;
    }
};

// Multi-model support: Manage multiple WrappedServers
std::vector<std::unique_ptr<WrappedServer>> loaded_servers_;

Expand All @@ -106,6 +138,12 @@ class Router {
bool is_loading_ = false; // True when a load operation is in progress
std::condition_variable load_cv_; // Signals when load completes

// Auto-unload support
long auto_unload_timeout_sec_ = 0; // Global timeout in seconds (0 = disabled)
mutable std::mutex timers_mutex_; // Protects model_timers_
std::unordered_map<std::string, std::unique_ptr<ModelTimerEntry>> model_timers_;
std::atomic<bool> unload_in_progress_{false}; // Prevents new timer creation during unload

// Helper methods for multi-model management
WrappedServer* find_server_by_model_name(const std::string& model_name) const;
WrappedServer* get_most_recent_server() const;
Expand All @@ -116,10 +154,14 @@ class Router {
WrappedServer* find_npu_server_by_recipe(const std::string& recipe) const;
WrappedServer* find_flm_server_by_type(ModelType type) const;
void evict_all_npu_servers();
void evict_server(WrappedServer* server);
bool evict_server(WrappedServer* server); // Returns true if evicted, false if skipped (still busy)
void evict_all_servers();
std::unique_ptr<WrappedServer> create_backend_server(const ModelInfo& model_info);

// Auto-unload helper methods
void start_auto_unload_timer(const std::string& model_name, long timeout_seconds);
void cancel_auto_unload_timer(const std::string& model_name);

// Generic inference wrapper that handles locking and busy state
template<typename Func>
auto execute_inference(const json& request, Func&& inference_func) -> decltype(inference_func(nullptr));
Expand Down
3 changes: 2 additions & 1 deletion src/cpp/include/lemon/server.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@ class Server {
int max_loaded_models,
const std::string& extra_models_dir,
bool no_broadcast,
long http_timeout);
long http_timeout,
long auto_unload_timeout);

~Server();

Expand Down
10 changes: 10 additions & 0 deletions src/cpp/include/lemon/wrapped_server.h
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,16 @@ class WrappedServer : public ICompletionServer {
}
}

// Blocks until the server stops being busy or `timeout` elapses.
// Returns true when the server is idle, false when the wait timed out
// while it was still busy.
bool wait_until_not_busy_with_timeout(std::chrono::seconds timeout) const {
    std::unique_lock<std::mutex> guard(busy_mutex_);
    const auto idle = [this] { return !is_busy_; };
    // Fast path: already idle, so skip the condition-variable wait entirely.
    return idle() || busy_cv_.wait_for(guard, timeout, idle);
}

// Multi-model support: Model metadata
void set_model_metadata(const std::string& model_name, const std::string& checkpoint,
ModelType type, DeviceType device, const RecipeOptions& recipe_options) {
Expand Down
13 changes: 13 additions & 0 deletions src/cpp/server/cli_parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,19 @@ static void add_serve_options(CLI::App* serve, ServerConfig& config) {
return "Value must be a positive integer or -1 for unlimited (got '" + val + "')";
}
});

// Auto-unload support: the global idle timeout (seconds) is taken from the
// LEMONADE_GLOBAL_AUTO_UNLOAD_TIMER environment variable when it is set.
// Negative values and anything std::stol rejects both fall back to 0 (disabled).
if (const char* raw_timer = std::getenv("LEMONADE_GLOBAL_AUTO_UNLOAD_TIMER")) {
    long parsed_timer = 0;
    try {
        parsed_timer = std::stol(raw_timer);
    } catch (...) {
        parsed_timer = 0;
    }
    config.global_auto_unload_timer = (parsed_timer < 0) ? 0 : parsed_timer;
}

RecipeOptions::add_cli_options(*serve, config.recipe_options);
}

Expand Down
2 changes: 1 addition & 1 deletion src/cpp/server/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ int main(int argc, char** argv) {
Server server(config.port, config.host, config.log_level,
config.recipe_options, config.max_loaded_models,
config.extra_models_dir, config.no_broadcast,
config.global_timeout);
config.global_timeout, config.global_auto_unload_timer);

// Register signal handler for Ctrl+C
g_server_instance = &server;
Expand Down
Loading