Commit f867663 — feat: add nvidia gpu support (#86)
1 parent: dd27df3

File tree: 12 files changed, +514 −14 lines

.env.example

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,15 @@ MYSQL_DATABASE=slurm_acct_db
2424
# Leave commented for default file-based job completion logging
2525
#ELASTICSEARCH_HOST=http://elasticsearch:9200
2626

27-
# sshd options
27+
# SSHD options
2828
SSH_ENABLE=false
2929
SSH_AUTHORIZED_KEYS=$HOME/.ssh/authorized_keys
3030
SSH_PORT=3022
31+
32+
# Uncomment to enable NVIDIA GPU support with dedicated GPU node 'g1' (OPTIONAL)
33+
# Host requirement: nvidia-container-toolkit (one-time install)
34+
# CUDA toolkit is installed in the container (no host CUDA needed)
35+
# When enabled, 'make up' automatically starts GPU node with GRES configuration
36+
#GPU_ENABLE=true
37+
#GPU_COUNT=1
38+
#CUDA_VERSION=12.6

Dockerfile

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ RUN set -ex \
1919
&& dnf -y install dnf-plugins-core epel-release \
2020
&& dnf config-manager --set-enabled crb \
2121
&& dnf makecache \
22-
&& dnf -y install \
22+
&& dnf -y install --nobest --exclude='*.i686' \
2323
autoconf \
2424
automake \
2525
bzip2 \
@@ -92,6 +92,8 @@ LABEL org.opencontainers.image.source="https://github.com/giovtorres/slurm-docke
9292

9393
ARG SLURM_VERSION
9494
ARG TARGETARCH
95+
ARG GPU_ENABLE
96+
ARG CUDA_VERSION
9597

9698
# Enable CRB and EPEL repositories, then install runtime dependencies
9799
RUN set -ex \
@@ -152,6 +154,24 @@ RUN set -ex \
152154
&& gosu --version \
153155
&& gosu nobody true
154156

157+
# Conditionally install CUDA toolkit for GPU support
158+
RUN if [ "$GPU_ENABLE" = "true" ]; then \
159+
set -ex && \
160+
echo "Installing CUDA ${CUDA_VERSION} runtime for GPU support..." && \
161+
dnf config-manager --add-repo \
162+
https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo && \
163+
dnf -y install \
164+
cuda-nvml-devel-$(echo ${CUDA_VERSION} | tr '.' '-') \
165+
cuda-cudart-$(echo ${CUDA_VERSION} | tr '.' '-') \
166+
cuda-nvcc-$(echo ${CUDA_VERSION} | tr '.' '-') \
167+
nvidia-driver-cuda-libs && \
168+
dnf clean all && \
169+
rm -rf /var/cache/dnf && \
170+
echo "CUDA ${CUDA_VERSION} installed successfully"; \
171+
else \
172+
echo "GPU support disabled, skipping CUDA installation"; \
173+
fi
174+
155175
COPY --from=builder /root/rpmbuild/RPMS/*/*.rpm /tmp/rpms/
156176

157177
# Install Slurm RPMs
@@ -208,6 +228,14 @@ RUN set -ex \
208228
echo "Using common cgroup.conf"; \
209229
cp /tmp/slurm-config/common/cgroup.conf /etc/slurm/cgroup.conf; \
210230
fi \
231+
&& if [ "$GPU_ENABLE" = "true" ]; then \
232+
echo "GPU support enabled, installing gres.conf"; \
233+
cp /tmp/slurm-config/common/gres.conf /etc/slurm/gres.conf; \
234+
chown slurm:slurm /etc/slurm/gres.conf; \
235+
chmod 644 /etc/slurm/gres.conf; \
236+
else \
237+
echo "GPU support disabled, skipping gres.conf"; \
238+
fi \
211239
&& chown slurm:slurm /etc/slurm/slurm.conf /etc/slurm/cgroup.conf /etc/slurm/slurmdbd.conf \
212240
&& chmod 644 /etc/slurm/slurm.conf /etc/slurm/cgroup.conf \
213241
&& chmod 600 /etc/slurm/slurmdbd.conf \

Makefile

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
.PHONY: help build build-no-cache up start down clean logs test test-monitoring status shell logs-slurmctld logs-slurmdbd update-slurm reload-slurm version set-version build-all test-all test-version rebuild jobs quick-test run-examples
1+
.PHONY: help build build-no-cache up start down clean logs test test-monitoring test-gpu status shell logs-slurmctld logs-slurmdbd update-slurm reload-slurm version set-version build-all test-all test-version rebuild jobs quick-test run-examples
22

33
# Default target
44
.DEFAULT_GOAL := help
@@ -8,10 +8,19 @@ SUPPORTED_VERSIONS := 24.11.7 25.05.6 25.11.2
88
# Read default version from .env.example (source of truth)
99
DEFAULT_VERSION := $(shell grep '^SLURM_VERSION=' .env.example | cut -d= -f2)
1010

11-
# Auto-detect monitoring profile based on .env configuration
12-
# If ELASTICSEARCH_HOST is set, automatically enable monitoring profile
11+
# Auto-detect profiles based on .env configuration
1312
ELASTICSEARCH_HOST := $(shell grep -E '^ELASTICSEARCH_HOST=' .env 2>/dev/null | cut -d= -f2)
14-
PROFILE_FLAG := $(if $(ELASTICSEARCH_HOST),--profile monitoring,)
13+
GPU_ENABLE := $(shell grep -E '^GPU_ENABLE=' .env 2>/dev/null | cut -d= -f2)
14+
15+
# Build profile flags
16+
PROFILES :=
17+
ifdef ELASTICSEARCH_HOST
18+
PROFILES += --profile monitoring
19+
endif
20+
ifeq ($(GPU_ENABLE),true)
21+
PROFILES += --profile gpu
22+
endif
23+
PROFILE_FLAG := $(PROFILES)
1524

1625
# Colors for help output
1726
CYAN := $(shell tput -Txterm setaf 6)
@@ -44,6 +53,7 @@ help: ## Show this help message
4453
@printf " ${CYAN}%-15s${RESET} %s\n" "shell" "Open shell in slurmctld"
4554
@printf " ${CYAN}%-15s${RESET} %s\n" "test" "Run test suite"
4655
@printf " ${CYAN}%-15s${RESET} %s\n" "test-monitoring" "Run monitoring profile tests"
56+
@printf " ${CYAN}%-15s${RESET} %s\n" "test-gpu" "Run GPU profile tests"
4757
@printf " ${CYAN}%-15s${RESET} %s\n" "quick-test" "Submit a quick test job"
4858
@printf " ${CYAN}%-15s${RESET} %s\n" "run-examples" "Run example jobs"
4959
@echo ""
@@ -62,6 +72,10 @@ help: ## Show this help message
6272
@echo "Monitoring:"
6373
@echo " Enable: Set ELASTICSEARCH_HOST=http://elasticsearch:9200 in .env"
6474
@echo " Disable: Comment out or remove ELASTICSEARCH_HOST from .env"
75+
@echo ""
76+
@echo "GPU Support (NVIDIA):"
77+
@echo " Enable: Set GPU_ENABLE=true in .env (requires nvidia-container-toolkit on host)"
78+
@echo " Disable: Set GPU_ENABLE=false or remove GPU_ENABLE from .env"
6579

6680
build: ## Build Docker images
6781
docker compose --progress plain build
@@ -87,6 +101,9 @@ test: ## Run test suite
87101
test-monitoring: ## Run monitoring profile test suite
88102
./test_monitoring.sh
89103

104+
test-gpu: ## Run GPU profile test suite
105+
./test_gpu.sh
106+
90107
status: ## Show cluster status
91108
@echo "=== Containers ==="
92109
@docker compose ps

README.md

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ make help
2828

2929
```bash
3030
cp .env.example .env
31-
# Edit .env to change SLURM_VERSION or enable ELASTICSEARCH_HOST
31+
# Edit .env to change SLURM_VERSION, enable ELASTICSEARCH_HOST, or enable GPU_ENABLE
3232
make up
3333
```
3434

@@ -45,6 +45,7 @@ make up
4545
- **slurmctld** - Controller for job scheduling
4646
- **slurmrestd** - REST API daemon (HTTP/JSON access)
4747
- **c1, c2** - Compute nodes
48+
- **g1** - (optional) GPU compute node with NVIDIA support
4849
- **elasticsearch** - (optional) indexing jobs
4950
- **kibana** - (optional) visualization for elasticsearch
5051

@@ -114,6 +115,34 @@ make test-monitoring
114115

115116
**Indexed data:** Job ID, user, partition, state, times, nodes, exit code
116117

118+
## 🎮 GPU Support (NVIDIA)
119+
120+
Enable optional NVIDIA GPU support with dedicated GPU node:
121+
122+
```bash
123+
# 1. One-time host setup (add NVIDIA repo and install nvidia-container-toolkit)
124+
curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo \
125+
| sudo tee /etc/yum.repos.d/nvidia-container-toolkit.repo
126+
sudo dnf install -y nvidia-container-toolkit
127+
sudo nvidia-ctk runtime configure --runtime=docker
128+
sudo systemctl restart docker
129+
130+
# 2. Enable GPU in .env (CUDA toolkit installed in container automatically)
131+
GPU_ENABLE=true
132+
CUDA_VERSION=12.6 # Optional, defaults to 12.6
133+
134+
# 3. Build with GPU support
135+
make rebuild
136+
137+
# 4. Verify GPU detection
138+
docker exec g1 nvidia-smi
139+
140+
# Test GPU functionality
141+
make test-gpu
142+
```
143+
144+
> **Note:** GPU testing is not included in CI (GitHub-hosted runners have no GPUs). Run `make test-gpu` manually on a host with an NVIDIA GPU and `nvidia-container-toolkit` installed.
145+
117146
## 🔄 Cluster Management
118147

119148
```bash

config/24.11/slurm.conf

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ SlurmctldHost=slurmctld
1515
#EpilogSlurmctld=
1616
#FirstJobId=1
1717
#MaxJobId=67043328
18-
#GresTypes=
18+
GresTypes=gpu
1919
#GroupUpdateForce=0
2020
#GroupUpdateTime=600
2121
#JobFileAppend=0
@@ -42,6 +42,7 @@ ProctrackType=proctrack/linuxproc
4242
#PropagateResourceLimitsExcept=
4343
#RebootProgram=
4444
ReturnToService=1
45+
SlurmdParameters=config_overrides
4546
SlurmctldPidFile=/var/run/slurm/slurmctld.pid
4647
SlurmctldPort=6817
4748
SlurmdPidFile=/var/run/slurm/slurmd.pid
@@ -151,8 +152,11 @@ AuthType=auth/munge
151152
# COMPUTE NODES
152153
NodeName=c1 CPUs=2 RealMemory=1000 State=UNKNOWN
153154
NodeName=c2 CPUs=2 RealMemory=1000 State=UNKNOWN
155+
# GPU node (only available when GPU_ENABLE=true in .env)
156+
NodeName=g1 Gres=gpu:nvidia:1 CPUs=4 RealMemory=2000 State=UNKNOWN
154157

155158
PartitionName=normal Nodes=c1,c2 Default=YES MaxTime=INFINITE State=UP
159+
PartitionName=gpu Nodes=g1 Default=NO MaxTime=INFINITE State=UP
156160

157161
# add JWT as an alternative authentication type
158162
AuthAltTypes=auth/jwt

config/25.05/slurm.conf

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ SlurmctldHost=slurmctld
1414
#EpilogSlurmctld=
1515
#FirstJobId=1
1616
#MaxJobId=67043328
17-
#GresTypes=
17+
GresTypes=gpu
1818
#GroupUpdateForce=0
1919
#GroupUpdateTime=600
2020
#JobFileAppend=0
@@ -41,6 +41,7 @@ ProctrackType=proctrack/linuxproc
4141
#PropagateResourceLimitsExcept=
4242
#RebootProgram=
4343
ReturnToService=1
44+
SlurmdParameters=config_overrides
4445
SlurmctldPidFile=/var/run/slurm/slurmctld.pid
4546
SlurmctldPort=6817
4647
SlurmdPidFile=/var/run/slurm/slurmd.pid
@@ -150,8 +151,11 @@ AuthType=auth/munge
150151
# COMPUTE NODES
151152
NodeName=c1 CPUs=2 RealMemory=1000 State=UNKNOWN
152153
NodeName=c2 CPUs=2 RealMemory=1000 State=UNKNOWN
154+
# GPU node (only available when GPU_ENABLE=true in .env)
155+
NodeName=g1 Gres=gpu:nvidia:1 CPUs=4 RealMemory=2000 State=UNKNOWN
153156

154157
PartitionName=normal Nodes=c1,c2 Default=YES MaxTime=INFINITE State=UP
158+
PartitionName=gpu Nodes=g1 Default=NO MaxTime=INFINITE State=UP
155159

156160
# add JWT as an alternative authentication type
157161
AuthAltTypes=auth/jwt

config/25.11/slurm.conf

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ SlurmctldHost=slurmctld
1414
#EpilogSlurmctld=
1515
#FirstJobId=1
1616
#MaxJobId=67043328
17-
#GresTypes=
17+
GresTypes=gpu
1818
#GroupUpdateForce=0
1919
#GroupUpdateTime=600
2020
#JobFileAppend=0
@@ -41,6 +41,7 @@ ProctrackType=proctrack/linuxproc
4141
#PropagateResourceLimitsExcept=
4242
#RebootProgram=
4343
ReturnToService=1
44+
SlurmdParameters=config_overrides
4445
SlurmctldPidFile=/var/run/slurm/slurmctld.pid
4546
SlurmctldPort=6817
4647
SlurmdPidFile=/var/run/slurm/slurmd.pid
@@ -150,9 +151,12 @@ AuthType=auth/munge
150151
# COMPUTE NODES
151152
NodeName=c1 CPUs=2 RealMemory=1000 State=UNKNOWN
152153
NodeName=c2 CPUs=2 RealMemory=1000 State=UNKNOWN
154+
# GPU node (only available when GPU_ENABLE=true in .env)
155+
NodeName=g1 Gres=gpu:nvidia:1 CPUs=4 RealMemory=2000 State=UNKNOWN
153156

154157
PartitionName=normal Nodes=c1,c2 Default=YES MaxTime=INFINITE State=UP
158+
PartitionName=gpu Nodes=g1 Default=NO MaxTime=INFINITE State=UP
155159

156160
# add JWT as an alternative authentication type
157161
AuthAltTypes=auth/jwt
158-
AuthAltParameters=jwt_key=/etc/slurm/jwt_hs256.key
162+
AuthAltParameters=jwt_key=/etc/slurm/jwt_hs256.key

config/common/gres.conf

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# gres.conf - Generic Resource (GRES) configuration
2+
# Defines GPU resources available on compute nodes
3+
#
4+
# This file is only used when GPU_ENABLE=true in .env
5+
# GPU node 'g1' is configured with NVIDIA GPUs
6+
7+
# GPU node configuration
8+
NodeName=g1 Name=gpu Type=nvidia File=/dev/nvidia[0-9] AutoDetect=nvml

docker-compose.yml

Lines changed: 42 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@ services:
2525
context: .
2626
args:
2727
SLURM_VERSION: ${SLURM_VERSION:-25.11.2}
28+
GPU_ENABLE: ${GPU_ENABLE:-false}
29+
CUDA_VERSION: ${CUDA_VERSION:-12.6}
2830
cache_from:
2931
- slurm-docker-cluster:${SLURM_VERSION:-25.11.2}
3032
command: ["slurmdbd"]
@@ -59,14 +61,15 @@ services:
5961
privileged: true
6062
working_dir: /data
6163
environment:
62-
- ELASTICSEARCH_HOST=${ELASTICSEARCH_HOST:-}
63-
- SSH_ENABLE=${SSH_ENABLE:-false}
64+
ELASTICSEARCH_HOST: ${ELASTICSEARCH_HOST:-}
65+
GPU_COUNT: ${GPU_COUNT:-1}
66+
SSH_ENABLE: ${SSH_ENABLE:-false}
6467
volumes:
6568
- etc_munge:/etc/munge:z
6669
- etc_slurm:/etc/slurm:z
6770
- slurm_jobdir:/data:z
6871
- var_log_slurm:/var/log/slurm:z
69-
- ${SSH_AUTHORIZED_KEYS:-/dev/null}:/tmp/authorized_keys_host:ro,z
72+
- ${SSH_AUTHORIZED_KEYS:-/dev/null}:/tmp/authorized_keys_host:ro,z
7073
ports:
7174
- "${SSH_PORT:-3022}:22"
7275
expose:
@@ -161,6 +164,42 @@ services:
161164
retries: 3
162165
start_period: 20s
163166

167+
# GPU compute node (NVIDIA)
168+
# Requires: GPU_ENABLE=true in .env and nvidia-container-toolkit on host
169+
g1:
170+
image: slurm-docker-cluster:${SLURM_VERSION:-25.11.2}
171+
command: ["slurmd"]
172+
container_name: g1
173+
hostname: g1
174+
working_dir: /data
175+
privileged: true
176+
profiles: ["gpu"]
177+
volumes:
178+
- etc_munge:/etc/munge
179+
- etc_slurm:/etc/slurm
180+
- slurm_jobdir:/data
181+
- var_log_slurm:/var/log/slurm
182+
expose:
183+
- "6818"
184+
depends_on:
185+
slurmctld:
186+
condition: service_healthy
187+
networks:
188+
- slurm-network
189+
deploy:
190+
resources:
191+
reservations:
192+
devices:
193+
- driver: nvidia
194+
count: ${GPU_COUNT:-1}
195+
capabilities: [gpu, compute, utility]
196+
healthcheck:
197+
test: ["CMD-SHELL", "pidof slurmd"]
198+
interval: 10s
199+
timeout: 5s
200+
retries: 3
201+
start_period: 20s
202+
164203
# Optional Elasticsearch for job completion logging
165204
# Start with: docker compose --profile monitoring up -d
166205
# or with: make up-with-monitoring

docker-entrypoint.sh

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,12 @@ then
8686
echo "---> Job completion configured for Elasticsearch"
8787
fi
8888

89+
# Sync GPU count in slurm.conf with GPU_COUNT env var
90+
if grep -q "Gres=gpu:nvidia:" /etc/slurm/slurm.conf 2>/dev/null; then
91+
sed -i "s/Gres=gpu:nvidia:[0-9]*/Gres=gpu:nvidia:${GPU_COUNT:-1}/" /etc/slurm/slurm.conf
92+
echo "---> Configured Slurm GPU GRES count to ${GPU_COUNT:-1}"
93+
fi
94+
8995
echo "---> Starting the Slurm Controller Daemon (slurmctld) ..."
9096
exec gosu slurm /usr/sbin/slurmctld -i -Dvvv
9197
fi

Comments (0)