ls1intum
diff --git a/‎logos/db/init.sql‎
Lines changed: 9 additions & 0 deletions b/‎logos/db/init.sql‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎logos/db/migrations/027_logosnode_dynamic_deployments.sql‎
Lines changed: 31 additions & 0 deletions b/‎logos/db/migrations/027_logosnode_dynamic_deployments.sql‎
Lines changed: 31 additions & 0 deletions
diff --git a/‎logos/db/migrations/AGENTS.md‎
Lines changed: 1 addition & 0 deletions b/‎logos/db/migrations/AGENTS.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎logos/db/migrations/run_all_migrations.sh‎
Lines changed: 1 addition & 0 deletions b/‎logos/db/migrations/run_all_migrations.sh‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎logos/logos-workernode/logos_worker_node/calibration.py‎
Lines changed: 82 additions & 15 deletions b/‎logos/logos-workernode/logos_worker_node/calibration.py‎
Lines changed: 82 additions & 15 deletions
diff --git a/‎logos/logos-workernode/logos_worker_node/logos_bridge.py‎
Lines changed: 13 additions & 7 deletions b/‎logos/logos-workernode/logos_worker_node/logos_bridge.py‎
Lines changed: 13 additions & 7 deletions
diff --git a/‎logos/logos-workernode/logos_worker_node/main.py‎
Lines changed: 5 additions & 3 deletions b/‎logos/logos-workernode/logos_worker_node/main.py‎
Lines changed: 5 additions & 3 deletions
diff --git a/‎logos/src/logos/capacity/capacity_planner.py‎
Lines changed: 10 additions & 9 deletions b/‎logos/src/logos/capacity/capacity_planner.py‎
Lines changed: 10 additions & 9 deletions
@@ -25,6 +25,7 @@ DROP TABLE IF EXISTS token_prices CASCADE;
 DROP TABLE IF EXISTS jobs CASCADE;
 DROP TABLE IF EXISTS ollama_provider_snapshots CASCADE;
 DROP TABLE IF EXISTS model_profiles CASCADE;
+DROP TABLE IF EXISTS logosnode_provider_keys CASCADE;
 DROP TABLE IF EXISTS schema_migrations CASCADE;
 
 CREATE TABLE users (
@@ -111,6 +112,14 @@ CREATE TABLE model_api_keys (
     UNIQUE(model_id, provider_id)
 );
 
+-- Per-provider marker row for logosnode workers (replaces per-model model_api_keys rows for workers); the key itself stays in providers.api_key
+CREATE TABLE logosnode_provider_keys (
+    id SERIAL PRIMARY KEY,
+    provider_id INTEGER NOT NULL REFERENCES providers(id) ON DELETE CASCADE,
+    created_at TIMESTAMPTZ NOT NULL DEFAULT CURRENT_TIMESTAMP,
+    UNIQUE(provider_id)
+);
+
 CREATE TABLE profile_model_permissions (
     id SERIAL PRIMARY KEY,
     profile_id INTEGER REFERENCES profiles(id) ON DELETE CASCADE,
 
@@ -0,0 +1,31 @@
+-- Migration 027: Dynamic logosnode deployments
+--
+-- For logosnode (worker-node) providers, models are announced dynamically via
+-- WebSocket capabilities. This migration:
+--
+-- 1. Creates a logosnode_provider_keys table so workernode providers don't need
+--    per-model entries in model_api_keys (they use a single shared key).
+-- 2. Auto-syncing of model_provider rows is handled at the application layer
+--    when capabilities are announced, so existing deployment queries continue
+--    to work without schema changes.
+
+-- Step 1: Create logosnode_provider_keys table
+-- This replaces the need for per-model model_api_keys rows for logosnode providers.
+-- Each logosnode provider has exactly one key (stored in providers.api_key already),
+-- but this table makes the deployment query work without model_api_keys.
+CREATE TABLE IF NOT EXISTS logosnode_provider_keys (
+    id SERIAL PRIMARY KEY,
+    provider_id INTEGER NOT NULL REFERENCES providers(id) ON DELETE CASCADE,
+    created_at TIMESTAMPTZ NOT NULL DEFAULT CURRENT_TIMESTAMP,
+    UNIQUE(provider_id)
+);
+
+-- Seed the table from existing logosnode providers
+INSERT INTO logosnode_provider_keys (provider_id)
+SELECT id FROM providers WHERE provider_type = 'logosnode'
+ON CONFLICT (provider_id) DO NOTHING;
+
+-- Record migration
+INSERT INTO schema_migrations (filename)
+VALUES ('027_logosnode_dynamic_deployments.sql')
+ON CONFLICT (filename) DO NOTHING;
@@ -0,0 +1 @@
+If you create a migration in this folder, ensure it is also added to "run_all_migrations.sh"!
@@ -61,6 +61,7 @@ MIGRATIONS=(
     "024_store_logosnode_runtime_payload.sql"
     "025_create_model_profiles_table.sql"
     "026_create_schema_migrations.sql"
+    "027_logosnode_dynamic_deployments.sql"
 )
 
 FAILED=0
 
@@ -60,6 +60,7 @@
 _CALIBRATION_PORT = 11499
 _KV_CACHE_MIN_STEP_MB = 1024.0  # binary search precision and safety margin
 _KV_CACHE_VRAM_CAP_RATIO = 0.8  # fraction of total GPU VRAM used as KV search ceiling
+_FAILED_COMMANDS_FILE = "calibration_failed_commands.txt"
 
 # ---------------------------------------------------------------------------
 # KV-cache size parsing
@@ -96,6 +97,47 @@ def _round_up_gb(mb: float) -> float:
     return math.ceil(mb / 1024.0) * 1024.0
 
 
+# ---------------------------------------------------------------------------
+# Failed-command blacklist
+# ---------------------------------------------------------------------------
+
+
+def _cmd_fingerprint(cmd: list[str]) -> str:
+    """Build a canonical one-line string from a vLLM command list.
+
+    Strips ``--host`` and ``--port`` (calibration infra, not model-specific)
+    so that retries on a different port aren't falsely considered "new".
+    """
+    filtered: list[str] = []
+    skip_next = False
+    for i, tok in enumerate(cmd):
+        if skip_next:
+            skip_next = False
+            continue
+        if tok in ("--host", "--port"):
+            skip_next = True
+            continue
+        filtered.append(tok)
+    return " ".join(filtered)
+
+
+def _load_failed_commands(failed_path: Path) -> set[str]:
+    if not failed_path.exists():
+        return set()
+    return {
+        line.strip()
+        for line in failed_path.read_text(encoding="utf-8").splitlines()
+        if line.strip() and not line.strip().startswith("#")
+    }
+
+
+def _record_failed_command(failed_path: Path, fingerprint: str) -> None:
+    failed_path.parent.mkdir(parents=True, exist_ok=True)
+    with failed_path.open("a", encoding="utf-8") as f:
+        f.write(fingerprint + "\n")
+    logger.info("  Blacklisted command → %s", failed_path)
+
+
 # ---------------------------------------------------------------------------
 # GPU VRAM helpers
 # ---------------------------------------------------------------------------
@@ -198,30 +240,23 @@ def _post(
 # ---------------------------------------------------------------------------
 
 
-def spawn_vllm(
+def _build_vllm_cmd(
     plan: dict[str, Any],
     vllm_binary: str,
     host: str,
     port: int,
-    log_path: Path,
     kv_cache_memory_bytes: str,
-) -> subprocess.Popen[str]:
+) -> list[str]:
+    """Build the vLLM command list without spawning a process."""
     model = plan["model"]
     tp = int(plan.get("tensor_parallel_size", 1))
     dtype = str(plan.get("dtype", "auto"))
     quant = str(plan.get("quantization") or "")
     max_model_len = plan.get("max_model_len")
     enforce_eager = bool(plan.get("enforce_eager", False))
     disable_custom_all_reduce = bool(plan.get("disable_custom_all_reduce", False))
-    disable_nccl_p2p = bool(plan.get("disable_nccl_p2p", False))
     extra_args: list[str] = list(plan.get("extra_args") or [])
     kv_bytes = str(plan.get("kv_cache_memory_bytes") or kv_cache_memory_bytes)
-
-    # When kv_cache_memory_bytes is set, omit --gpu-memory-utilization and let
-    # vLLM default to 0.9. kv_cache_memory_bytes controls the KV pool size
-    # directly; adding gpu_memory_utilization=0.1 caps total VRAM to 10% which
-    # prevents the model weights from loading at all.
-    # An explicit per-model override takes precedence.
     explicit_gmu = plan.get("gpu_memory_utilization")
 
     cmd = [
@@ -250,8 +285,23 @@ def spawn_vllm(
         cmd.append("--enforce-eager")
     if disable_custom_all_reduce:
         cmd.append("--disable-custom-all-reduce")
-    # disable_nccl_p2p is applied via NCCL_P2P_DISABLE env var below (not a vLLM CLI flag)
     cmd.extend(extra_args)
+    return cmd
+
+
+def spawn_vllm(
+    plan: dict[str, Any],
+    vllm_binary: str,
+    host: str,
+    port: int,
+    log_path: Path,
+    kv_cache_memory_bytes: str,
+) -> tuple[subprocess.Popen[str], list[str]]:
+    """Spawn vLLM and return ``(process, cmd_list)``."""
+    tp = int(plan.get("tensor_parallel_size", 1))
+    disable_nccl_p2p = bool(plan.get("disable_nccl_p2p", False))
+
+    cmd = _build_vllm_cmd(plan, vllm_binary, host, port, kv_cache_memory_bytes)
 
     env = os.environ.copy()
     env["VLLM_SERVER_DEV_MODE"] = "1"
@@ -284,7 +334,7 @@ def spawn_vllm(
 
     logger.info("  Spawned PID=%d  log=%s", proc.pid, log_path)
     logger.info("  Command: %s", " ".join(cmd))
-    return proc
+    return proc, cmd
 
 
 def _kill_stale_vllm_workers() -> None:
@@ -554,13 +604,28 @@ def calibrate_model(
             kv_cache_sent_mb, _KV_CACHE_MIN_STEP_MB,
         )
 
+    failed_path = log_dir / _FAILED_COMMANDS_FILE
+    failed_commands = _load_failed_commands(failed_path)
+
     def _try_start(kv_mb: float) -> subprocess.Popen[str] | None:
         """Try to start vLLM with the given KV cache.  Returns the
         running process on success, ``None`` on failure (process is
-        cleaned up)."""
+        cleaned up).  Blacklisted commands are skipped immediately."""
         kv_str = _format_kv_mb(kv_mb)
-        proc = spawn_vllm(
-            {**plan, "kv_cache_memory_bytes": kv_str},
+        planned = {**plan, "kv_cache_memory_bytes": kv_str}
+        # Check the blacklist *before* spawning to avoid wasting time.
+        fingerprint = _cmd_fingerprint(
+            _build_vllm_cmd(planned, vllm_binary, host, port, kv_str)
+        )
+        if fingerprint in failed_commands:
+            logger.warning(
+                "        SKIP kv_cache=%s — command previously failed "
+                "(remove line from %s to retry)",
+                kv_str, failed_path,
+            )
+            return None
+        proc, _ = spawn_vllm(
+            planned,
             vllm_binary, host, port, log_path,
             kv_cache_memory_bytes=kv_str,
         )
@@ -585,6 +650,8 @@ def _try_start(kv_mb: float) -> subprocess.Popen[str] | None:
                 logger.warning("  -- vLLM log tail --\n%s", log_tail)
             stop_vllm(proc)
             time.sleep(_VRAM_SETTLE_S)
+            _record_failed_command(failed_path, fingerprint)
+            failed_commands.add(fingerprint)
             return None
 
     proc: subprocess.Popen[str] | None = None
 
@@ -490,11 +490,15 @@ async def _execute_command(self, action: str, params: dict[str, Any]) -> dict[st
         raise ValueError(f"Unsupported bridge command '{action}'")
 
     @staticmethod
-    def _lane_target_url(lane_status: dict[str, Any], payload: dict[str, Any] | None = None) -> str:
-        # Detect embeddings requests from payload shape: has "input" but not "messages".
-        # vLLM exposes embeddings at /v1/embeddings, not /v1/chat/completions.
-        if payload and "input" in payload and "messages" not in payload:
-            endpoint = "v1/embeddings"
+    def _lane_target_url(
+        lane_status: dict[str, Any],
+        payload: dict[str, Any] | None = None,
+        request_path: str | None = None,
+    ) -> str:
+        # If the caller forwarded the original API path (e.g. "v1/embeddings",
+        # "v2/embed", "tokenize"), use it directly so vLLM decides what it supports.
+        if request_path:
+            endpoint = request_path.strip("/")
         else:
             endpoint = str(lane_status.get("inference_endpoint") or "/v1/chat/completions").lstrip("/")
         return f"http://127.0.0.1:{lane_status['port']}/{endpoint}"
@@ -515,7 +519,8 @@ async def _execute_infer_command(self, params: dict[str, Any]) -> dict[str, Any]
             raise ValueError("payload must be an object")
 
         lane_status = await self._resolve_lane_for_infer(lane_id)
-        target_url = self._lane_target_url(lane_status, payload)
+        request_path = params.get("request_path")
+        target_url = self._lane_target_url(lane_status, payload, request_path=request_path)
 
         await lane_manager.increment_active_requests(lane_id)
         try:
@@ -547,7 +552,8 @@ async def _execute_stream_command(self, ws, cmd_id: str, params: dict[str, Any])
 
         try:
             lane_status = await self._resolve_lane_for_infer(lane_id)
-            target_url = self._lane_target_url(lane_status, payload)
+            request_path = params.get("request_path")
+            target_url = self._lane_target_url(lane_status, payload, request_path=request_path)
         except Exception as exc:  # noqa: BLE001
             await self._send_json(ws, {"type": "stream_end", "cmd_id": cmd_id, "success": False, "error": str(exc)})
             return
 
@@ -58,12 +58,14 @@ async def _auto_calibrate_if_needed(
         elif profile.sleeping_residual_mb is None:
             reason = "sleeping_residual_mb is null"
         elif (
-            profile.residency_source in ("calibrated", "measured")
+            profile.residency_source == "calibrated"
             and profile.loaded_vram_mb is not None
             and abs(profile.base_residency_mb - profile.loaded_vram_mb) > 1.0
         ):
-            # Old-format profile: base_residency was stored as weights-only.
-            # New format stores full loaded VRAM. Force recalibration.
+            # Old-format calibrated profile: base_residency was stored as
+            # weights-only. New format stores full loaded VRAM. Force recalibration.
+            # Note: "measured" profiles intentionally differ (base=weights-only,
+            # loaded=weights+KV) and must NOT be flagged as stale.
             reason = f"stale format (base={profile.base_residency_mb:.0f} != loaded={profile.loaded_vram_mb:.0f})"
         if reason:
             logger.info("  %s needs calibration: %s", model_name, reason)
 
@@ -289,7 +289,7 @@ def _log_cluster_summary(self, provider_ids: List[int]) -> None:
             snap = self._registry.peek_runtime_snapshot(pid) if self._registry else None
             if snap is None:
                 name = self._facade.get_provider_name(pid) or "?"
-                lines.append(f"{paint('⊘', RED)} provider={pid} worker={paint(name, BOLD)} {paint('offline', DIM)}")
+                lines.append(f"{paint('⊘', RED)} provider={paint(name, BOLD)} {paint('offline', DIM)}")
                 continue
 
             connected += 1
@@ -311,7 +311,7 @@ def _log_cluster_summary(self, provider_ids: List[int]) -> None:
             worker_color = GREEN if heartbeat_age_s <= 15 else YELLOW if heartbeat_age_s <= 30 else RED
 
             lines.append(
-                f"{paint('●', worker_color)} provider={pid} worker={paint(str(worker_id), BOLD)} "
+                f"{paint('●', worker_color)} provider={paint(str(worker_id), BOLD)} "
                 f"status={paint('active', worker_color)} hb={heartbeat_age_s:.0f}s "
                 f"vram={paint(f'{total_vram - free_vram:.0f}/{total_vram:.0f}MB', BOLD)} ({used_pct:.0f}%)"
             )
@@ -442,9 +442,10 @@ def _log_action_plan(self, actions: list[CapacityPlanAction]) -> None:
         }
         for action in actions:
             color = action_colors.get(action.action, CYAN)
+            pname = self._facade.get_provider_name(action.provider_id) or str(action.provider_id)
             lines.append(
                 f"{paint('→', color)} {paint(action.action, color, BOLD)} "
-                f"provider={action.provider_id} lane={action.lane_id}"
+                f"provider={pname} lane={action.lane_id}"
             )
             lines.extend(wrap_plain(f"model: {action.model_name}", indent="    "))
             lines.extend(wrap_plain(f"reason: {action.reason}", indent="    "))
@@ -565,7 +566,7 @@ async def _cold_load_for_request(
         profile = self._safe_get_profiles(provider_id).get(model_name)
         capacity = self._safe_get_capacity(provider_id)
         if capacity is None:
-            logger.debug("No capacity info for provider %s, cannot cold-load %s", provider_id, model_name)
+            logger.debug("No capacity info for provider %s, cannot cold-load %s", self._facade.get_provider_name(provider_id) or provider_id, model_name)
             return None
 
         # No early feasibility bail-out here — the reclaim loop below will
@@ -662,11 +663,11 @@ async def _cold_load_for_request(
             if not ok:
                 logger.info(
                     "Cannot reclaim enough VRAM for cold load of %s on provider %s",
-                    model_name, provider_id,
+                    model_name, self._facade.get_provider_name(provider_id) or provider_id,
                 )
                 return None
 
-        logger.info("Cold-loading %s on provider %s (lane=%s)", model_name, provider_id, lane_id)
+        logger.info("Cold-loading %s on provider %s (lane=%s)", model_name, self._facade.get_provider_name(provider_id) or provider_id, lane_id)
         async with self._lane_lock(provider_id, lane_id):
             loaded = await self._execute_action_with_confirmation(
                 load_action, timeout_seconds=max(timeout_seconds, 180.0),
@@ -1725,12 +1726,12 @@ def _compute_preemptive_sleep_actions(
         if candidates:
             logger.info(
                 "Preemptive sleep candidates for provider=%s: %s",
-                provider_id,
+                self._facade.get_provider_name(provider_id) or provider_id,
                 ", ".join(f"{name}(demand={score:.2f}, residual={profile.sleeping_residual_mb:.0f}MB)"
                           for score, name, profile in candidates),
             )
         else:
-            logger.info("Preemptive sleep: no candidates for provider=%s (no stopped models with known residual and demand>0)", provider_id)
+            logger.info("Preemptive sleep: no candidates for provider=%s (no stopped models with known residual and demand>0)", self._facade.get_provider_name(provider_id) or provider_id)
 
         now = time.time()
 
@@ -2779,7 +2780,7 @@ def _validate_vram_budget(
                     - self.get_pending_vram_mb(provider_id)
                 )
             except Exception:
-                logger.debug("Cannot check VRAM for provider %s, rejecting %s", provider_id, action.action)
+                logger.debug("Cannot check VRAM for provider %s, rejecting %s", self._facade.get_provider_name(provider_id) or provider_id, action.action)
                 continue
 
             try:
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+If you create a migration in this folder, ensure it is also added to "run_all_migrations.sh"!`
Original file line number	Diff line number	Diff line change
`@@ -61,6 +61,7 @@ MIGRATIONS=(`
`61`	`61`	`"024_store_logosnode_runtime_payload.sql"`
`62`	`62`	`"025_create_model_profiles_table.sql"`
`63`	`63`	`"026_create_schema_migrations.sql"`
	`64`	`+ "027_logosnode_dynamic_deployments.sql"`
`64`	`65`	`)`
`65`	`66`
`66`	`67`	`FAILED=0`