Commit f867663 — feat: add nvidia gpu support (#86)
1 parent: dd27df3

File tree: 12 files changed, +514 −14 lines

.env.example

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,15 @@ MYSQL_DATABASE=slurm_acct_db
2424
# Leave commented for default file-based job completion logging
2525
#ELASTICSEARCH_HOST=http://elasticsearch:9200
2626

27-
# sshd options
27+
# SSHD options
2828
SSH_ENABLE=false
2929
SSH_AUTHORIZED_KEYS=$HOME/.ssh/authorized_keys
3030
SSH_PORT=3022
31+
32+
# Uncomment to enable NVIDIA GPU support with dedicated GPU node 'g1' (OPTIONAL)
33+
# Host requirement: nvidia-container-toolkit (one-time install)
34+
# CUDA toolkit is installed in the container (no host CUDA needed)
35+
# When enabled, 'make up' automatically starts GPU node with GRES configuration
36+
#GPU_ENABLE=true
37+
#GPU_COUNT=1
38+
#CUDA_VERSION=12.6

Dockerfile

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ RUN set -ex \
1919
&& dnf -y install dnf-plugins-core epel-release \
2020
&& dnf config-manager --set-enabled crb \
2121
&& dnf makecache \
22-
&& dnf -y install \
22+
&& dnf -y install --nobest --exclude='*.i686' \
2323
autoconf \
2424
automake \
2525
bzip2 \
@@ -92,6 +92,8 @@ LABEL org.opencontainers.image.source="https://github.com/giovtorres/slurm-docke
9292

9393
ARG SLURM_VERSION
9494
ARG TARGETARCH
95+
ARG GPU_ENABLE
96+
ARG CUDA_VERSION
9597

9698
# Enable CRB and EPEL repositories, then install runtime dependencies
9799
RUN set -ex \
@@ -152,6 +154,24 @@ RUN set -ex \
152154
&& gosu --version \
153155
&& gosu nobody true
154156

157+
# Conditionally install CUDA toolkit for GPU support
158+
RUN if [ "$GPU_ENABLE" = "true" ]; then \
159+
set -ex && \
160+
echo "Installing CUDA ${CUDA_VERSION} runtime for GPU support..." && \
161+
dnf config-manager --add-repo \
162+
https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo && \
163+
dnf -y install \
164+
cuda-nvml-devel-$(echo ${CUDA_VERSION} | tr '.' '-') \
165+
cuda-cudart-$(echo ${CUDA_VERSION} | tr '.' '-') \
166+
cuda-nvcc-$(echo ${CUDA_VERSION} | tr '.' '-') \
167+
nvidia-driver-cuda-libs && \
168+
dnf clean all && \
169+
rm -rf /var/cache/dnf && \
170+
echo "CUDA ${CUDA_VERSION} installed successfully"; \
171+
else \
172+
echo "GPU support disabled, skipping CUDA installation"; \
173+
fi
174+
155175
COPY --from=builder /root/rpmbuild/RPMS/*/*.rpm /tmp/rpms/
156176

157177
# Install Slurm RPMs
@@ -208,6 +228,14 @@ RUN set -ex \
208228
echo "Using common cgroup.conf"; \
209229
cp /tmp/slurm-config/common/cgroup.conf /etc/slurm/cgroup.conf; \
210230
fi \
231+
&& if [ "$GPU_ENABLE" = "true" ]; then \
232+
echo "GPU support enabled, installing gres.conf"; \
233+
cp /tmp/slurm-config/common/gres.conf /etc/slurm/gres.conf; \
234+
chown slurm:slurm /etc/slurm/gres.conf; \
235+
chmod 644 /etc/slurm/gres.conf; \
236+
else \
237+
echo "GPU support disabled, skipping gres.conf"; \
238+
fi \
211239
&& chown slurm:slurm /etc/slurm/slurm.conf /etc/slurm/cgroup.conf /etc/slurm/slurmdbd.conf \
212240
&& chmod 644 /etc/slurm/slurm.conf /etc/slurm/cgroup.conf \
213241
&& chmod 600 /etc/slurm/slurmdbd.conf \

Makefile

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
.PHONY: help build build-no-cache up start down clean logs test test-monitoring status shell logs-slurmctld logs-slurmdbd update-slurm reload-slurm version set-version build-all test-all test-version rebuild jobs quick-test run-examples
1+
.PHONY: help build build-no-cache up start down clean logs test test-monitoring test-gpu status shell logs-slurmctld logs-slurmdbd update-slurm reload-slurm version set-version build-all test-all test-version rebuild jobs quick-test run-examples
22

33
# Default target
44
.DEFAULT_GOAL := help
@@ -8,10 +8,19 @@ SUPPORTED_VERSIONS := 24.11.7 25.05.6 25.11.2
88
# Read default version from .env.example (source of truth)
99
DEFAULT_VERSION := $(shell grep '^SLURM_VERSION=' .env.example | cut -d= -f2)
1010

11-
# Auto-detect monitoring profile based on .env configuration
12-
# If ELASTICSEARCH_HOST is set, automatically enable monitoring profile
11+
# Auto-detect profiles based on .env configuration
1312
ELASTICSEARCH_HOST := $(shell grep -E '^ELASTICSEARCH_HOST=' .env 2>/dev/null | cut -d= -f2)
14-
PROFILE_FLAG := $(if $(ELASTICSEARCH_HOST),--profile monitoring,)
13+
GPU_ENABLE := $(shell grep -E '^GPU_ENABLE=' .env 2>/dev/null | cut -d= -f2)
14+
15+
# Build profile flags
16+
PROFILES :=
17+
ifdef ELASTICSEARCH_HOST
18+
PROFILES += --profile monitoring
19+
endif
20+
ifeq ($(GPU_ENABLE),true)
21+
PROFILES += --profile gpu
22+
endif
23+
PROFILE_FLAG := $(PROFILES)
1524

1625
# Colors for help output
1726
CYAN := $(shell tput -Txterm setaf 6)
@@ -44,6 +53,7 @@ help: ## Show this help message
4453
@printf " ${CYAN}%-15s${RESET} %s\n" "shell" "Open shell in slurmctld"
4554
@printf " ${CYAN}%-15s${RESET} %s\n" "test" "Run test suite"
4655
@printf " ${CYAN}%-15s${RESET} %s\n" "test-monitoring" "Run monitoring profile tests"
56+
@printf " ${CYAN}%-15s${RESET} %s\n" "test-gpu" "Run GPU profile tests"
4757
@printf " ${CYAN}%-15s${RESET} %s\n" "quick-test" "Submit a quick test job"
4858
@printf " ${CYAN}%-15s${RESET} %s\n" "run-examples" "Run example jobs"
4959
@echo ""
@@ -62,6 +72,10 @@ help: ## Show this help message
6272
@echo "Monitoring:"
6373
@echo " Enable: Set ELASTICSEARCH_HOST=http://elasticsearch:9200 in .env"
6474
@echo " Disable: Comment out or remove ELASTICSEARCH_HOST from .env"
75+
@echo ""
76+
@echo "GPU Support (NVIDIA):"
77+
@echo " Enable: Set GPU_ENABLE=true in .env (requires nvidia-container-toolkit on host)"
78+
@echo " Disable: Set GPU_ENABLE=false or remove GPU_ENABLE from .env"
6579

6680
build: ## Build Docker images
6781
docker compose --progress plain build
@@ -87,6 +101,9 @@ test: ## Run test suite
87101
test-monitoring: ## Run monitoring profile test suite
88102
./test_monitoring.sh
89103

104+
test-gpu: ## Run GPU profile test suite
105+
./test_gpu.sh
106+
90107
status: ## Show cluster status
91108
@echo "=== Containers ==="
92109
@docker compose ps

README.md

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ make help
2828

2929
```bash
3030
cp .env.example .env
31-
# Edit .env to change SLURM_VERSION or enable ELASTICSEARCH_HOST
31+
# Edit .env to change SLURM_VERSION, enable ELASTICSEARCH_HOST, or enable GPU_ENABLE
3232
make up
3333
```
3434

@@ -45,6 +45,7 @@ make up
4545
- **slurmctld** - Controller for job scheduling
4646
- **slurmrestd** - REST API daemon (HTTP/JSON access)
4747
- **c1, c2** - Compute nodes
48+
- **g1** - (optional) GPU compute node with NVIDIA support
4849
- **elasticsearch** - (optional) indexing jobs
4950
- **kibana** - (optional) visualization for elasticsearch
5051

@@ -114,6 +115,34 @@ make test-monitoring
114115

115116
**Indexed data:** Job ID, user, partition, state, times, nodes, exit code
116117

118+
## 🎮 GPU Support (NVIDIA)
119+
120+
Enable optional NVIDIA GPU support with dedicated GPU node:
121+
122+
```bash
123+
# 1. One-time host setup (add NVIDIA repo and install nvidia-container-toolkit)
124+
curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo \
125+
| sudo tee /etc/yum.repos.d/nvidia-container-toolkit.repo
126+
sudo dnf install -y nvidia-container-toolkit
127+
sudo nvidia-ctk runtime configure --runtime=docker
128+
sudo systemctl restart docker
129+
130+
# 2. Enable GPU in .env (CUDA toolkit installed in container automatically)
131+
GPU_ENABLE=true
132+
CUDA_VERSION=12.6 # Optional, defaults to 12.6
133+
134+
# 3. Build with GPU support
135+
make rebuild
136+
137+
# 4. Verify GPU detection
138+
docker exec g1 nvidia-smi
139+
140+
# Test GPU functionality
141+
make test-gpu
142+
```
143+
144+
> **Note:** GPU testing is not included in CI (GitHub-hosted runners have no GPUs). Run `make test-gpu` manually on a host with an NVIDIA GPU and `nvidia-container-toolkit` installed.
145+
117146
## 🔄 Cluster Management
118147

119148
```bash

config/24.11/slurm.conf

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ SlurmctldHost=slurmctld
1515
#EpilogSlurmctld=
1616
#FirstJobId=1
1717
#MaxJobId=67043328
18-
#GresTypes=
18+
GresTypes=gpu
1919
#GroupUpdateForce=0
2020
#GroupUpdateTime=600
2121
#JobFileAppend=0
@@ -42,6 +42,7 @@ ProctrackType=proctrack/linuxproc
4242
#PropagateResourceLimitsExcept=
4343
#RebootProgram=
4444
ReturnToService=1
45+
SlurmdParameters=config_overrides
4546
SlurmctldPidFile=/var/run/slurm/slurmctld.pid
4647
SlurmctldPort=6817
4748
SlurmdPidFile=/var/run/slurm/slurmd.pid
@@ -151,8 +152,11 @@ AuthType=auth/munge
151152
# COMPUTE NODES
152153
NodeName=c1 CPUs=2 RealMemory=1000 State=UNKNOWN
153154
NodeName=c2 CPUs=2 RealMemory=1000 State=UNKNOWN
155+
# GPU node (only available when GPU_ENABLE=true in .env)
156+
NodeName=g1 Gres=gpu:nvidia:1 CPUs=4 RealMemory=2000 State=UNKNOWN
154157

155158
PartitionName=normal Nodes=c1,c2 Default=YES MaxTime=INFINITE State=UP
159+
PartitionName=gpu Nodes=g1 Default=NO MaxTime=INFINITE State=UP
156160

157161
# add JWT as an alternative authentication type
158162
AuthAltTypes=auth/jwt

config/25.05/slurm.conf

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ SlurmctldHost=slurmctld
1414
#EpilogSlurmctld=
1515
#FirstJobId=1
1616
#MaxJobId=67043328
17-
#GresTypes=
17+
GresTypes=gpu
1818
#GroupUpdateForce=0
1919
#GroupUpdateTime=600
2020
#JobFileAppend=0
@@ -41,6 +41,7 @@ ProctrackType=proctrack/linuxproc
4141
#PropagateResourceLimitsExcept=
4242
#RebootProgram=
4343
ReturnToService=1
44+
SlurmdParameters=config_overrides
4445
SlurmctldPidFile=/var/run/slurm/slurmctld.pid
4546
SlurmctldPort=6817
4647
SlurmdPidFile=/var/run/slurm/slurmd.pid
@@ -150,8 +151,11 @@ AuthType=auth/munge
150151
# COMPUTE NODES
151152
NodeName=c1 CPUs=2 RealMemory=1000 State=UNKNOWN
152153
NodeName=c2 CPUs=2 RealMemory=1000 State=UNKNOWN
154+
# GPU node (only available when GPU_ENABLE=true in .env)
155+
NodeName=g1 Gres=gpu:nvidia:1 CPUs=4 RealMemory=2000 State=UNKNOWN
153156

154157
PartitionName=normal Nodes=c1,c2 Default=YES MaxTime=INFINITE State=UP
158+
PartitionName=gpu Nodes=g1 Default=NO MaxTime=INFINITE State=UP
155159

156160
# add JWT as an alternative authentication type
157161
AuthAltTypes=auth/jwt

config/25.11/slurm.conf

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ SlurmctldHost=slurmctld
1414
#EpilogSlurmctld=
1515
#FirstJobId=1
1616
#MaxJobId=67043328
17-
#GresTypes=
17+
GresTypes=gpu
1818
#GroupUpdateForce=0
1919
#GroupUpdateTime=600
2020
#JobFileAppend=0
@@ -41,6 +41,7 @@ ProctrackType=proctrack/linuxproc
4141
#PropagateResourceLimitsExcept=
4242
#RebootProgram=
4343
ReturnToService=1
44+
SlurmdParameters=config_overrides
4445
SlurmctldPidFile=/var/run/slurm/slurmctld.pid
4546
SlurmctldPort=6817
4647
SlurmdPidFile=/var/run/slurm/slurmd.pid
@@ -150,9 +151,12 @@ AuthType=auth/munge
150151
# COMPUTE NODES
151152
NodeName=c1 CPUs=2 RealMemory=1000 State=UNKNOWN
152153
NodeName=c2 CPUs=2 RealMemory=1000 State=UNKNOWN
154+
# GPU node (only available when GPU_ENABLE=true in .env)
155+
NodeName=g1 Gres=gpu:nvidia:1 CPUs=4 RealMemory=2000 State=UNKNOWN
153156

154157
PartitionName=normal Nodes=c1,c2 Default=YES MaxTime=INFINITE State=UP
158+
PartitionName=gpu Nodes=g1 Default=NO MaxTime=INFINITE State=UP
155159

156160
# add JWT as an alternative authentication type
157161
AuthAltTypes=auth/jwt
158-
AuthAltParameters=jwt_key=/etc/slurm/jwt_hs256.key
162+
AuthAltParameters=jwt_key=/etc/slurm/jwt_hs256.key

config/common/gres.conf

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# gres.conf - Generic Resource (GRES) configuration
2+
# Defines GPU resources available on compute nodes
3+
#
4+
# This file is only used when GPU_ENABLE=true in .env
5+
# GPU node 'g1' is configured with NVIDIA GPUs
6+
7+
# GPU node configuration
8+
NodeName=g1 Name=gpu Type=nvidia File=/dev/nvidia[0-9] AutoDetect=nvml

docker-compose.yml

Lines changed: 42 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@ services:
2525
context: .
2626
args:
2727
SLURM_VERSION: ${SLURM_VERSION:-25.11.2}
28+
GPU_ENABLE: ${GPU_ENABLE:-false}
29+
CUDA_VERSION: ${CUDA_VERSION:-12.6}
2830
cache_from:
2931
- slurm-docker-cluster:${SLURM_VERSION:-25.11.2}
3032
command: ["slurmdbd"]
@@ -59,14 +61,15 @@ services:
5961
privileged: true
6062
working_dir: /data
6163
environment:
62-
- ELASTICSEARCH_HOST=${ELASTICSEARCH_HOST:-}
63-
- SSH_ENABLE=${SSH_ENABLE:-false}
64+
ELASTICSEARCH_HOST: ${ELASTICSEARCH_HOST:-}
65+
GPU_COUNT: ${GPU_COUNT:-1}
66+
SSH_ENABLE: ${SSH_ENABLE:-false}
6467
volumes:
6568
- etc_munge:/etc/munge:z
6669
- etc_slurm:/etc/slurm:z
6770
- slurm_jobdir:/data:z
6871
- var_log_slurm:/var/log/slurm:z
69-
- ${SSH_AUTHORIZED_KEYS:-/dev/null}:/tmp/authorized_keys_host:ro,z
72+
- ${SSH_AUTHORIZED_KEYS:-/dev/null}:/tmp/authorized_keys_host:ro,z
7073
ports:
7174
- "${SSH_PORT:-3022}:22"
7275
expose:
@@ -161,6 +164,42 @@ services:
161164
retries: 3
162165
start_period: 20s
163166

167+
# GPU compute node (NVIDIA)
168+
# Requires: GPU_ENABLE=true in .env and nvidia-container-toolkit on host
169+
g1:
170+
image: slurm-docker-cluster:${SLURM_VERSION:-25.11.2}
171+
command: ["slurmd"]
172+
container_name: g1
173+
hostname: g1
174+
working_dir: /data
175+
privileged: true
176+
profiles: ["gpu"]
177+
volumes:
178+
- etc_munge:/etc/munge
179+
- etc_slurm:/etc/slurm
180+
- slurm_jobdir:/data
181+
- var_log_slurm:/var/log/slurm
182+
expose:
183+
- "6818"
184+
depends_on:
185+
slurmctld:
186+
condition: service_healthy
187+
networks:
188+
- slurm-network
189+
deploy:
190+
resources:
191+
reservations:
192+
devices:
193+
- driver: nvidia
194+
count: ${GPU_COUNT:-1}
195+
capabilities: [gpu, compute, utility]
196+
healthcheck:
197+
test: ["CMD-SHELL", "pidof slurmd"]
198+
interval: 10s
199+
timeout: 5s
200+
retries: 3
201+
start_period: 20s
202+
164203
# Optional Elasticsearch for job completion logging
165204
# Start with: docker compose --profile monitoring up -d
166205
# or with: make up-with-monitoring

docker-entrypoint.sh

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,12 @@ then
8686
echo "---> Job completion configured for Elasticsearch"
8787
fi
8888

89+
# Sync GPU count in slurm.conf with GPU_COUNT env var
90+
if grep -q "Gres=gpu:nvidia:" /etc/slurm/slurm.conf 2>/dev/null; then
91+
sed -i "s/Gres=gpu:nvidia:[0-9]*/Gres=gpu:nvidia:${GPU_COUNT:-1}/" /etc/slurm/slurm.conf
92+
echo "---> Configured Slurm GPU GRES count to ${GPU_COUNT:-1}"
93+
fi
94+
8995
echo "---> Starting the Slurm Controller Daemon (slurmctld) ..."
9096
exec gosu slurm /usr/sbin/slurmctld -i -Dvvv
9197
fi

Comments (0)