1 change: 1 addition & 0 deletions docs/server/custom-models.md
@@ -141,6 +141,7 @@ This file configures per-model runtime settings. Each key is a **full model name
|--------|---------|-------------|-------------|
| `ctx_size` | 4096 | `LEMONADE_CTX_SIZE` | Context window size in tokens |
| `llamacpp_backend` | vulkan (Windows/Linux), metal (macOS) | `LEMONADE_LLAMACPP` | Inference backend: `vulkan`, `rocm`, `cpu`, `metal` |
| `llamacpp_device` | (empty) | `LEMONADE_LLAMACPP_DEVICE` | Comma-separated list of accelerator devices to use (e.g. `Vulkan0`) |
| `llamacpp_args` | (empty) | `LEMONADE_LLAMACPP_ARGS` | Extra arguments passed to llama-server |
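
As a sketch of these settings in use, assuming the file is a JSON object keyed by full model name (the model name below is made up; the keys are the settings documented above), pinning a model to a single Vulkan device might look like:

```json
{
  "llamacpp.Qwen3-8B-GGUF": {
    "ctx_size": 8192,
    "llamacpp_backend": "vulkan",
    "llamacpp_device": "Vulkan0"
  }
}
```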

#### whispercpp
1 change: 1 addition & 0 deletions docs/server/lemonade-cli-experimental.md
@@ -260,6 +260,7 @@ The following options are available depending on the recipe being used:
|--------|-------------|---------|
| `--ctx-size SIZE` | Context size for the model | `4096` |
| `--llamacpp BACKEND` | LlamaCpp backend to use | Auto-detected |
| `--llamacpp-device DEVICES` | Comma-separated list of accelerator devices to use (e.g. `Vulkan0`) | (empty) |
| `--llamacpp-args ARGS` | Custom arguments to pass to llama-server (must not conflict with managed args) | `""` |
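
As a sketch only (the `lemonade` entry point and model name are assumptions, not documented here; the flags are from the table above), loading a model across two Vulkan devices might look like:

```bash
# Hypothetical invocation of the experimental CLI.
lemonade run llamacpp.Qwen3-8B-GGUF \
  --ctx-size 8192 \
  --llamacpp vulkan \
  --llamacpp-device Vulkan0,Vulkan1
```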

#### FLM (`flm` recipe)
1 change: 1 addition & 0 deletions docs/server/lemonade-server-cli.md
@@ -56,6 +56,7 @@ lemonade-server run MODEL_NAME [options]
| `--llamacpp [vulkan\|rocm\|cpu]` | Default LlamaCpp backend to use when loading models. Can be overridden per-model via the `/api/v1/load` endpoint. | vulkan |
| `--ctx-size [size]` | Default context size for models. For llamacpp recipes, this sets the `--ctx-size` parameter for the llama server. For other recipes, prompts exceeding this size will be truncated. Can be overridden per-model via the `/api/v1/load` endpoint. | 4096 |
| `--llamacpp-args [args]` | Default custom arguments to pass to llama-server. Must not conflict with arguments managed by Lemonade (e.g., `-m`, `--port`, `--ctx-size`, `-ngl`). Can be overridden per-model via the `/api/v1/load` endpoint. Example: `--llamacpp-args "--flash-attn on --no-mmap"` | "" |
| `--llamacpp-device [devices]` | Comma-separated list of devices to use for offloading (none = don't offload). Corresponds to `llama-server`'s `--device` option. Can be overridden per-model via the `/api/v1/load` endpoint. Examples: `Vulkan0`, `Rocm0,Rocm1` | None |
| `--whispercpp-args [args]` | Default custom arguments to pass to whisper-server. Must not conflict with arguments managed by Lemonade (currently `-m`, `--model`, and `--port`). Can be overridden per-model via the `/api/v1/load` endpoint. Example: `--whispercpp-args "--convert"` | "" |
| `--flm-args [args]` | Custom arguments to pass to FLM (FastFlowLM) server. Must not conflict with arguments managed by Lemonade (e.g., `--host`, `--port`, `--ctx-len`). Commonly used for NPU concurrency tuning. Can be overridden per-model via the `/api/v1/load` endpoint. Example: `--flm-args "-s 20 -q 15"` (socket connections and queue length). | "" |
| `--extra-models-dir [path]` | Experimental feature. Secondary directory to scan for LLM GGUF model files. Audio, embedding, reranking, and non-GGUF files are not supported yet. | None |
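
For example, following the `lemonade-server run MODEL_NAME [options]` syntax above (the model name is a placeholder):

```bash
# Run on a single Vulkan device with a larger context window.
lemonade-server run Qwen3-8B-GGUF --llamacpp vulkan --llamacpp-device Vulkan0 --ctx-size 8192

# Split across two ROCm devices via a comma-separated list.
lemonade-server run Qwen3-8B-GGUF --llamacpp rocm --llamacpp-device Rocm0,Rocm1
```
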
5 changes: 3 additions & 2 deletions docs/server/server_spec.md
@@ -133,7 +133,7 @@ Each type has its own independent LRU cache, all sharing the same slot limit set
- **NPU Exclusivity:** `flm`, `ryzenai-llm`, and `whispercpp` are mutually exclusive on the NPU.
- Loading a model from one of these backends will automatically evict all NPU models from the other backends.
- `flm` supports loading 1 ASR model, 1 LLM, and 1 embedding model on the NPU at the same time.
- `ryzenai-llm` supports loading exactly 1 LLM, which uses the entire NPU.
- `whispercpp` supports loading exactly 1 ASR model at a time, which uses the entire NPU.
- **CPU/GPU:** No inherent limits beyond available RAM. Multiple models can coexist on CPU or GPU.

@@ -1266,7 +1266,8 @@ Explicitly load a registered model into memory. This is useful to ensure that th
| `save_options` | No | All | Boolean. If true, saves recipe options to `recipe_options.json`. Any previously stored value for `model_name` is replaced. |
| `ctx_size` | No | llamacpp, flm, ryzenai-llm | Context size for the model. Overrides the default value. |
| `llamacpp_backend` | No | llamacpp | LlamaCpp backend to use (`vulkan`, `rocm`, `metal` or `cpu`). |
| `llamacpp_args` | No | llamacpp | Custom arguments to pass to llama-server. The following are NOT allowed: `-m`, `--port`, `--ctx-size`, `-ngl`, `--jinja`, `--mmproj`, `--embeddings`, `--reranking`. |
| `llamacpp_device` | No | llamacpp | Comma-separated list of accelerator devices to use (e.g. `Vulkan0`). |
| `llamacpp_args` | No | llamacpp | Custom arguments to pass to llama-server. The following are NOT allowed: `-m`, `--port`, `--ctx-size`, `--device`, `-ngl`, `--jinja`, `--mmproj`, `--embeddings`, `--reranking`. |
| `whispercpp_backend` | No | whispercpp | WhisperCpp backend: `npu` or `cpu` on Windows; `cpu` or `vulkan` on Linux. Default is `npu` if supported. |
| `whispercpp_args` | No | whispercpp | Custom arguments to pass to whisper-server. The following are NOT allowed: `-m`, `--model`, `--port`. Example: `--convert`. |
| `steps` | No | sd-cpp | Number of inference steps for image generation. Default: 20. |
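
As a sketch of the new parameter in a load request (host, port, and model name are placeholders, not documented values):

```bash
curl -X POST http://localhost:8000/api/v1/load \
  -H "Content-Type: application/json" \
  -d '{
        "model_name": "llamacpp.Qwen3-8B-GGUF",
        "llamacpp_device": "Vulkan0",
        "ctx_size": 8192
      }'
```
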
6 changes: 6 additions & 0 deletions src/cpp/server/backends/llamacpp_server.cpp
@@ -158,6 +158,7 @@ void LlamaCppServer::load(const std::string& model_name,
    LOG(DEBUG, "LlamaCpp") << "Per-model settings: " << options.to_log_string() << std::endl;

    int ctx_size = options.get_option("ctx_size");
    std::string llamacpp_device = options.get_option("llamacpp_device");
    std::string llamacpp_backend = options.get_option("llamacpp_backend");
    std::string llamacpp_args = options.get_option("llamacpp_args");

@@ -200,6 +201,11 @@

    push_arg(args, reserved_flags, "-m", gguf_path, std::vector<std::string>{"--model"});
    push_arg(args, reserved_flags, "--ctx-size", std::to_string(ctx_size), std::vector<std::string>{"-c"});
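    // If a per-model device list was provided, forward it to llama-server's
    // --device flag; otherwise reserve --device/-dev so it cannot be injected
    // through llamacpp_args.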
    if (llamacpp_device != "") {
        push_arg(args, reserved_flags, "--device", llamacpp_device, std::vector<std::string>{"-dev"});
    } else {
        push_reserved(reserved_flags, "--device", std::vector<std::string>{"-dev"});
    }
    push_arg(args, reserved_flags, "--port", std::to_string(port_));
    push_arg(args, reserved_flags, "--jinja", std::vector<std::string>{"--no-jinja"});

9 changes: 8 additions & 1 deletion src/cpp/server/recipe_options.cpp
@@ -9,6 +9,7 @@ using json = nlohmann::json;

static const json DEFAULTS = {
    {"ctx_size", 4096},
    {"llamacpp_device", ""},
    {"llamacpp_backend", ""},  // Will be overridden dynamically
    {"llamacpp_args", ""},
    {"sd-cpp_backend", ""},  // sd.cpp backend selection (cpu or rocm)
@@ -32,6 +33,12 @@ static const json CLI_OPTIONS = {
{"envname", "LEMONADE_CTX_SIZE"},
{"help", "Context size for the model"}
}},
{"--llamacpp-device", {
{"option_name", "llamacpp_device"},
{"type_name", "DEVICES"},
{"envname", "LEMONADE_LLAMACPP_DEVICE"},
{"help", "Comma-separated list of accelerator devices to use (e.g. Vulkan0)"}
}},
{"--llamacpp", {
{"option_name", "llamacpp_backend"},
{"type_name", "BACKEND"},
@@ -100,7 +107,7 @@

static std::vector<std::string> get_keys_for_recipe(const std::string& recipe) {
    if (recipe == "llamacpp") {
        return {"ctx_size", "llamacpp_device", "llamacpp_backend", "llamacpp_args"};
    } else if (recipe == "whispercpp") {
        return {"whispercpp_backend", "whispercpp_args"};
    } else if (recipe == "flm") {