Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions .project/docker-compose.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
services:
  # LLM inference NIM — serves an OpenAI-compatible API on host port 8000.
  llm-nim:
    # NOTE(review): pin an explicit version tag instead of :latest for
    # reproducible deployments.
    image: nvcr.io/nim/meta/llama-3.1-8b-instruct:latest
    environment:
      - NGC_API_KEY=${NGC_API_KEY}
      - NIM_DISABLE_CUDA_GRAPH=1
      - NIM_VLLM_FLAGS=--enforce-eager
      - NIM_MAX_MODEL_LEN=16384
    ports:
      - "8000:8000"
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]

  # Embedding NIM — serves an OpenAI-compatible API on host port 8001.
  embed-nim:
    image: nvcr.io/nim/nvidia/llama-3.2-nemoretriever-300m-embed-v1:latest
    environment:
      - NGC_API_KEY=${NGC_API_KEY}
      - NIM_DISABLE_CUDA_GRAPH=1
    ports:
      - "8001:8000"
    # GPU reservation mirrors llm-nim's; the original omitted it for this
    # service, which would leave the embedding NIM without GPU access.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
23 changes: 21 additions & 2 deletions .project/spec.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# spec.yaml
specVersion: v2
specMinorVersion: 2
meta:
Expand Down Expand Up @@ -34,6 +35,20 @@ environment:
entrypoint_script: ""
labels:
- cuda12.2
# NOTE(review): this hunk belongs inside the `environment:` section of
# spec.yaml (see the @@ hunk header); re-indent to match that section
# when applying — the scrape stripped the original indentation.
variables:
  - name: NVIDIA_LLM_URL
    value: "http://host.docker.internal:8000/v1"
  - name: NVIDIA_EMBED_URL
    value: "http://host.docker.internal:8001/v1"
  - name: NIM_MAX_MODEL_LEN
    value: "16384"
  # NGC_API_KEY is intentionally NOT listed here: this file already
  # declares it under `secrets:` below, and the original hard-coded
  # placeholder value " " would shadow the injected secret with
  # whitespace, breaking NIM authentication.
# Points AI Workbench at the docker-compose file for the sidecar NIMs.
compose_file_path: "docker-compose.yaml"
apps:
- name: jupyterlab
type: jupyterlab
Expand Down Expand Up @@ -150,10 +165,14 @@ execution:
proxy:
trim_prefix: false
url: http://localhost:8501
# NOTE(review): the diff scrape fused the removed lines (requested: 0,
# sharedMemoryMB: 1024) with the added ones, yielding duplicate keys;
# only the new values are kept here. The dead commented-out copy of the
# old block is dropped as well.
resources:
  gpu:
    requested: 1
  sharedMemoryMB: 16384  # increased for Spark's unified memory usage
secrets:
- variable: NGC_API_KEY
description: NGC Personal Key from https://org.ngc.nvidia.com/setup/personal-keys
Expand Down
12 changes: 12 additions & 0 deletions code/chain_server/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,18 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# Use the environment variables we set in spec.yaml
# NOTE(review): this block was inserted ABOVE the module docstring (below)
# and ABOVE `import os` (visible further down in this file). As placed it
# raises NameError on `os` at import time — and on ChatNVIDIA /
# NVIDIAEmbeddings unless they are imported earlier than what is shown —
# and it demotes the module docstring to a plain expression statement.
# Move these two definitions below the file's import block.
llm = ChatNVIDIA(
base_url=os.getenv("NVIDIA_LLM_URL"),  # e.g. http://host.docker.internal:8000/v1 (set in variables.env)
model=os.getenv("LLM_MODEL_NAME"),  # e.g. meta/llama-3.1-8b-instruct (set in variables.env)
temperature=0.1
)

embedder = NVIDIAEmbeddings(
base_url=os.getenv("NVIDIA_EMBED_URL"),  # e.g. http://host.docker.internal:8001/v1 (set in variables.env)
model=os.getenv("EMBEDDING_MODEL_NAME")  # e.g. nvidia/llama-3.2-nemoretriever-300m-embed-v1
)

"""The definition of the NVIDIA Conversational RAG API server."""

import os
Expand Down
1,421 changes: 1,407 additions & 14 deletions code/upload-pdfs.ipynb

Large diffs are not rendered by default.

1 change: 0 additions & 1 deletion compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -116,4 +116,3 @@ volumes:
milvus:
redis:
nim-cache:

1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,4 @@ redis==5.2.1
sse-starlette==2.2.1
uvicorn==0.34.0
watchfiles==1.0.3
pydantic==2.10.6
8 changes: 8 additions & 0 deletions variables.env
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ APP_MILVUS__URL=http://milvus:19530
APP_REDIS_DSN=redis://redis:6379/0
NGC_CLI_ORG=default
NGC_HOME=~/.cache/nvidia-nims
# NOTE(review): an empty NGC_API_KEY set here may override the value the
# AI Workbench secret injects (last-writer-wins depends on load order);
# prefer leaving the key unset in this file so the secret wins — confirm.
NGC_API_KEY=""

LLM_NIM_0_MODEL=meta/llama3-8b-instruct
LLM_NIM_0_NIM_VERSION=1.0.0
Expand All @@ -24,3 +25,10 @@ NV_EMBEDQA_E5_V5_NIM_VERSION=1.0.1
NV_EMBEDQA_E5_V5_NIM_GPUS=all

SHELL=/bin/bash

# Hardcode these to ensure the Python app ignores the cloud defaults
# NOTE(review): host.docker.internal resolves out-of-the-box on Docker
# Desktop only; on a native Linux engine the consuming container needs
# `extra_hosts: ["host.docker.internal:host-gateway"]` (or the host IP).
# Confirm for the target deployment environment.
NVIDIA_LLM_URL=http://host.docker.internal:8000/v1
NVIDIA_EMBED_URL=http://host.docker.internal:8001/v1
LLM_MODEL_NAME=meta/llama-3.1-8b-instruct
EMBEDDING_MODEL_NAME=nvidia/llama-3.2-nemoretriever-300m-embed-v1