feat: enable dynamic LoRA adapter loading on gpt-oss-120b

felixboelter · felixboelter · commit 70aa5724b950 · 2026-05-08T09:57:47.000+02:00
Adds --enable-lora with --max-loras 4 and --max-lora-rank 64, sets
VLLM_ALLOW_RUNTIME_LORA_UPDATING=true, and mounts /adapters from a
new gpt-oss-120b-adapters PVC (50Gi, ReadWriteMany, nfs-csi).
max-loras=4 fits comfortably on the H100 (80GB).

Pins the vLLM image (previously the floating latest-cu130 tag) to the
same digest as the ministral commit:
sha256:04563c302537a91aa49ebdfbceda96111c5712275999b7e8804fa598f0b5641d

DO NOT APPLY without first running Phase 0.3 from the rollout
plan: gpt-oss-120b is MoE; vLLM LoRA support for MoE has been
incremental and must be verified against the pinned digest before
this lands in prod. Acceptable evidence is either a successful
runtime POST to /v1/load_lora_adapter with a small public adapter or
a release-note confirmation for the pinned vLLM build.

If MoE+LoRA turns out to be unsupported, revert just this commit;
the ministral rollout and shared infra (template, docs, litellm
configmap) stay intact.
diff --git a/models/gpt-oss-120b/deployment.yaml b/models/gpt-oss-120b/deployment.yaml
@@ -18,7 +18,7 @@ spec:
         accelerator: h100
       containers:
         - name: vllm
-          image: vllm/vllm-openai:latest-cu130
+          image: vllm/vllm-openai@sha256:04563c302537a91aa49ebdfbceda96111c5712275999b7e8804fa598f0b5641d
           args:
             - "--model"
             - "openai/gpt-oss-120b"
@@ -29,10 +29,19 @@ spec:
             - "--tensor-parallel-size"
             - "1"
             - "--tool-call-parser"
-            - "openai" 
+            - "openai"
             - "--enable-auto-tool-choice"
             - "--gpu-memory-utilization"
             - "0.90"
+            - "--enable-lora"
+            - "--max-loras"
+            - "4"
+            - "--max-lora-rank"
+            - "64"
+            # Preload static adapters by uncommenting and listing name=path pairs.
+            # Paths may be /adapters/<dir> (PVC) or HF repo IDs (e.g. user/my-lora).
+            # - "--lora-modules"
+            # - "my-adapter=/adapters/my-adapter"
             - "--port"
             - "8000"
           ports:
@@ -45,11 +54,15 @@ spec:
                 secretKeyRef:
                   name: litellm-secret
                   key: HF_TOKEN
+            - name: VLLM_ALLOW_RUNTIME_LORA_UPDATING
+              value: "true"
           volumeMounts:
             - name: model-cache
               mountPath: /root/.cache/huggingface
             - name: vllm-config
               mountPath: /etc/vllm
+            - name: adapters
+              mountPath: /adapters
           resources:
             limits:
               nvidia.com/gpu: "1"
@@ -66,3 +79,6 @@ spec:
         - name: vllm-config
           configMap:
             name: gpt-oss-120b-config
+        - name: adapters
+          persistentVolumeClaim:
+            claimName: gpt-oss-120b-adapters
diff --git a/models/gpt-oss-120b/pvc.yaml b/models/gpt-oss-120b/pvc.yaml
@@ -9,3 +9,15 @@ spec:
   resources:
     requests:
       storage: 200Gi
+---
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: gpt-oss-120b-adapters
+spec:
+  accessModes:
+    - ReadWriteMany
+  storageClassName: nfs-csi
+  resources:
+    requests:
+      storage: 50Gi