Logos: Skip calibration retry for unsupported architectures and upgrade transformers

wasnertobias · claude · wasnertobias · commit 4e5e7376786c · 2026-04-08T09:18:16.000+02:00
When vLLM fails with "does not recognize this architecture", skip the
tensor-parallel (TP) escalation retries, since more GPUs cannot fix an
unsupported model type.
Also upgrade transformers in the Dockerfile to support newer architectures
like gemma4 out of the box.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/logos/logos-workernode/Dockerfile b/logos/logos-workernode/Dockerfile
@@ -147,7 +147,9 @@ RUN set -e \
          && if [ -n "$NCCL_PIP_SPEC" ]; then \
               pip install --no-cache-dir --force-reinstall --no-deps "$NCCL_PIP_SPEC"; \
             fi \
-         # 4. FlashInfer (install AFTER torch so it picks up the correct version)
+         # 4. Upgrade transformers for latest model architecture support
+         && pip install --no-cache-dir --upgrade transformers \
+         # 5. FlashInfer (install AFTER torch so it picks up the correct version)
          && pip install --no-cache-dir --no-deps flashinfer-python; \
        fi
 
diff --git a/logos/logos-workernode/logos_worker_node/calibration.py b/logos/logos-workernode/logos_worker_node/calibration.py
@@ -500,9 +500,12 @@ def calibrate_model(
         stop_vllm(proc)
         time.sleep(_VRAM_SETTLE_S)
 
+        error_detail = str(exc)
+        if log_tail:
+            error_detail = f"{error_detail}\n{log_tail}"
         partial.error = (
             f"Model failed to start with KV cache {kv_bytes_str} on "
-            f"tp={tp}: {exc}"
+            f"tp={tp}: {error_detail}"
         )
         logger.warning("  %s", partial.error)
         return partial
@@ -862,7 +865,10 @@ def auto_calibrate_models(
         result = _try_calibrate(plan, **cal_kwargs)
 
         # tp escalation: if calibration failed, retry with doubled tp
-        while not result.success and tp * 2 <= max_tp:
+        # Skip escalation for errors that cannot be solved by more GPUs
+        # (e.g. unsupported model architecture).
+        _fatal = "does not recognize this architecture" in (result.error or "")
+        while not result.success and not _fatal and tp * 2 <= max_tp:
             next_tp = tp * 2
             logger.info(
                 "  %s failed with tp=%d — retrying with tp=%d",