Adds support for configuring MIG #656
```diff
@@ -48,6 +48,20 @@
         name: cuda
         tasks_from: "{{ 'runtime.yml' if appliances_mode == 'configure' else 'install.yml' }}"
 
+- name: Setup vGPU
+  hosts: vgpu
+  become: yes
+  gather_facts: yes
+  tags: vgpu
+  tasks:
+    - include_role:
+        name: stackhpc.linux.vgpu
+        tasks_from: "{{ 'configure.yml' if appliances_mode == 'configure' else 'install.yml' }}"
+  handlers:
+    - name: reboot
+      fail:
+        msg: Reboot handler for stackhpc.linux.vgpu role fired unexpectedly. This was supposed to be unreachable.
+
 - name: Persist hostkeys across rebuilds
   # Must be after filesystems.yml (for storage)
   # and before portal.yml (where OOD login node hostkeys are scanned)
```

> **Review comment (on `Setup vGPU`):** This runs before slurm when run from site.yml, is that OK?
>
> **Reply:** Yep, at this point we are just creating the MIG devices.
```diff
@@ -250,6 +250,16 @@
         name: cloudalchemy.grafana
         tasks_from: install.yml
 
+- name: Add support for NVIDIA GPU auto detection to Slurm
+  hosts: cuda
+  become: yes
+  tasks:
+    - name: Recompile slurm
+      import_role:
+        name: slurm_recompile
+      vars:
+        slurm_recompile_nvml: "{{ groups.cuda | length > 0 }}"
+
 - name: Run post.yml hook
   vars:
     appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}"
```

> **Review comment (on the play name):** I don't like having these tasks outside a role - we've always regretted that. It can't be run with …
>
> **Reply:** Also - we should be really clear about idempotency/when it's safe to run this. If it's in the cuda role it's obvious where to state that!
>
> **Reply:** Sure, sounds reasonable. I did wonder if we'd want to recompile slurm for other reasons so could live in a slurm-recompile role?
>
> **Reply:** Possibly - for this specifically either way there's a cuda/slurm dependency so I'd go with sticking it in cuda for the moment, probably.
>
> **Reply:** I stuck it in slurm_recompile, but will move if you prefer.
```diff
@@ -0,0 +1,5 @@
+---
+
+- name: Set cuda_facts_version_short
+  set_fact:
+    cuda_facts_version_short: "{{ cuda_version_short }}"
```
```diff
@@ -0,0 +1,3 @@
+---
+slurm_recompile_nvml: false
+
```
```diff
@@ -0,0 +1,41 @@
+---
+- name: Get facts about CUDA installation
+  import_role:
+    name: cuda
+    tasks_from: facts.yml
+
+- name: Gather the package facts
+  ansible.builtin.package_facts:
+    manager: auto
+
+- name: Set fact containing slurm package facts
+  set_fact:
+    slurm_package: "{{ ansible_facts.packages['slurm-slurmd-ohpc'].0 }}"
+
+- name: Recompile and install slurm packages
+  shell: |
+    #!/bin/bash
+    source /etc/profile
+    set -eux
+    dnf download -y --source slurm-slurmd-ohpc-{{ slurm_package.version }}-{{ slurm_package.release }}
+    rpm -i slurm-ohpc-*.src.rpm
+    cd /root/rpmbuild/SPECS
+    dnf builddep -y slurm.spec
+    rpmbuild -bb{% if slurm_recompile_nvml | bool %} -D "_with_nvml --with-nvml=/usr/local/cuda-{{ cuda_facts_version_short }}/targets/x86_64-linux/"{% endif %} slurm.spec
+    dnf reinstall -y /root/rpmbuild/RPMS/x86_64/*.rpm
+  become: true
+
+- name: Workaround missing symlink
+  # Workaround path issue: https://groups.google.com/g/slurm-users/c/cvGb4JnK8BY
+  command: ln -s /lib64/libnvidia-ml.so.1 /lib64/libnvidia-ml.so
+  args:
+    creates: /lib64/libnvidia-ml.so
+  when: slurm_recompile_nvml | bool
+
+- name: Cleanup Dependencies
+  shell: |
+    #!/bin/bash
+    set -eux
+    set -o pipefail
+    dnf history list | grep Install | grep 'builddep -y slurm.spec' | head -n 1 | awk '{print $1}' | xargs dnf history -y undo
+  become: true
```
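After the rebuild, it is worth confirming that the reinstalled `slurmd` binary really is linked against NVML. A minimal sketch of that check, operating on captured `ldd` output (the sample text and the `/usr/sbin/slurmd` path are illustrative assumptions, not taken from a real host):

```python
# Sketch: check captured `ldd` output for an NVML linkage.
# On a real node you would capture something like:
#     ldd /usr/sbin/slurmd      # path assumed for the OHPC package
# SAMPLE_LDD below is illustrative output, not from a real host.
SAMPLE_LDD = """\
linux-vdso.so.1 (0x00007ffd7f5fe000)
libnvidia-ml.so.1 => /lib64/libnvidia-ml.so.1 (0x00007f0c4a000000)
libc.so.6 => /lib64/libc.so.6 (0x00007f0c49c00000)
"""

def links_nvml(ldd_output: str) -> bool:
    """Return True if any linked library is libnvidia-ml."""
    return any(
        line.strip().startswith("libnvidia-ml.so")
        for line in ldd_output.splitlines()
    )

print(links_nvml(SAMPLE_LDD))
```

If this reports no NVML linkage, the `_with_nvml` branch of the `rpmbuild` invocation above did not take effect (for example because `slurm_recompile_nvml` was left at its default of `false`).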
```diff
@@ -0,0 +1,209 @@
```

# vGPU/MIG configuration

This page details how to configure Multi-Instance GPU (MIG) in Slurm.

## Pre-requisites

- An image built with CUDA support. This should automatically recompile Slurm against NVML.

## Inventory

Add the relevant hosts to the ``vgpu`` group, for example in ``environments/$ENV/inventory/groups``:

```
[vgpu:children]
cuda
```

## Configuration

Use variables from the [stackhpc.linux.vgpu](https://github.com/stackhpc/ansible-collection-linux/tree/main/roles/vgpu) role.

For example, in ``environments/<environment>/inventory/group_vars/all/vgpu``:

```
---
vgpu_definitions:
  - pci_address: "0000:17:00.0"
    mig_devices:
      "1g.10gb": 4
      "4g.40gb": 1
  - pci_address: "0000:81:00.0"
    mig_devices:
      "1g.10gb": 4
      "4g.40gb": 1
```
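To get an intuition for what such a definition amounts to on the node, the role's work can be sketched as a series of `nvidia-smi mig` calls. This is purely an illustration: the profile-name-to-ID mapping (`1g.10gb` → 19, `4g.40gb` → 5) is read off the `nvidia-smi mig -lgip` output shown further down for an H100, and the exact commands the `stackhpc.linux.vgpu` role actually issues may differ.

```python
# Illustrative sketch only: roughly the nvidia-smi invocations implied by
# one vgpu_definitions entry. The profile-name -> GPU-instance-profile-ID
# mapping is taken from `nvidia-smi mig -lgip` output for an H100; the
# real role may construct its commands differently.
PROFILE_IDS = {"1g.10gb": 19, "4g.40gb": 5}

def mig_commands(definition: dict) -> list[str]:
    pci = definition["pci_address"]
    cmds = [f"nvidia-smi -i {pci} -mig 1"]  # enable MIG mode on this GPU
    for profile, count in definition["mig_devices"].items():
        ids = ",".join([str(PROFILE_IDS[profile])] * count)
        # -cgi creates GPU instances; -C also creates the compute instances
        cmds.append(f"nvidia-smi mig -i {pci} -cgi {ids} -C")
    return cmds

example = {
    "pci_address": "0000:17:00.0",
    "mig_devices": {"1g.10gb": 4, "4g.40gb": 1},
}
for cmd in mig_commands(example):
    print(cmd)
```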

The appliance will use the driver installed via the ``cuda`` role.

Use ``lspci`` to determine the PCI addresses of the GPUs, e.g.:

```
[root@io-io-gpu-02 ~]# lspci -nn | grep -i nvidia
06:00.0 3D controller [0302]: NVIDIA Corporation GH100 [H100 SXM5 80GB] [10de:2330] (rev a1)
0c:00.0 3D controller [0302]: NVIDIA Corporation GH100 [H100 SXM5 80GB] [10de:2330] (rev a1)
46:00.0 3D controller [0302]: NVIDIA Corporation GH100 [H100 SXM5 80GB] [10de:2330] (rev a1)
4c:00.0 3D controller [0302]: NVIDIA Corporation GH100 [H100 SXM5 80GB] [10de:2330] (rev a1)
```

The supported profiles can be discovered by consulting the [NVIDIA documentation](https://docs.nvidia.com/datacenter/tesla/mig-user-guide/index.html#supported-mig-profiles)
or interactively by running the following on one of the compute nodes with GPU resources:

```
[rocky@io-io-gpu-05 ~]$ sudo nvidia-smi -i 0 -mig 1
Enabled MIG Mode for GPU 00000000:06:00.0
All done.
[rocky@io-io-gpu-05 ~]$ sudo nvidia-smi mig -lgip
+-----------------------------------------------------------------------------+
| GPU instance profiles: |
| GPU Name ID Instances Memory P2P SM DEC ENC |
| Free/Total GiB CE JPEG OFA |
|=============================================================================|
| 0 MIG 1g.10gb 19 7/7 9.75 No 16 1 0 |
| 1 1 0 |
+-----------------------------------------------------------------------------+
| 0 MIG 1g.10gb+me 20 1/1 9.75 No 16 1 0 |
| 1 1 1 |
+-----------------------------------------------------------------------------+
| 0 MIG 1g.20gb 15 4/4 19.62 No 26 1 0 |
| 1 1 0 |
+-----------------------------------------------------------------------------+
| 0 MIG 2g.20gb 14 3/3 19.62 No 32 2 0 |
| 2 2 0 |
+-----------------------------------------------------------------------------+
| 0 MIG 3g.40gb 9 2/2 39.50 No 60 3 0 |
| 3 3 0 |
+-----------------------------------------------------------------------------+
| 0 MIG 4g.40gb 5 1/1 39.50 No 64 4 0 |
| 4 4 0 |
+-----------------------------------------------------------------------------+
| 0 MIG 7g.80gb 0 1/1 79.25 No 132 7 0 |
| 8 7 1 |
+-----------------------------------------------------------------------------+
| 1 MIG 1g.10gb 19 7/7 9.75 No 16 1 0 |
| 1 1 0 |
+-----------------------------------------------------------------------------+
| 1 MIG 1g.10gb+me 20 1/1 9.75 No 16 1 0 |
| 1 1 1 |
+-----------------------------------------------------------------------------+
| 1 MIG 1g.20gb 15 4/4 19.62 No 26 1 0 |
| 1 1 0 |
+-----------------------------------------------------------------------------+
| 1 MIG 2g.20gb 14 3/3 19.62 No 32 2 0 |
| 2 2 0 |
+-----------------------------------------------------------------------------+
| 1 MIG 3g.40gb 9 2/2 39.50 No 60 3 0 |
| 3 3 0 |
+-----------------------------------------------------------------------------+
| 1 MIG 4g.40gb 5 1/1 39.50 No 64 4 0 |
| 4 4 0 |
+-----------------------------------------------------------------------------+
| 1 MIG 7g.80gb 0 1/1 79.25 No 132 7 0 |
| 8 7 1 |
+-----------------------------------------------------------------------------+
| 2 MIG 1g.10gb 19 7/7 9.75 No 16 1 0 |
| 1 1 0 |
+-----------------------------------------------------------------------------+
| 2 MIG 1g.10gb+me 20 1/1 9.75 No 16 1 0 |
| 1 1 1 |
+-----------------------------------------------------------------------------+
| 2 MIG 1g.20gb 15 4/4 19.62 No 26 1 0 |
| 1 1 0 |
+-----------------------------------------------------------------------------+
| 2 MIG 2g.20gb 14 3/3 19.62 No 32 2 0 |
| 2 2 0 |
+-----------------------------------------------------------------------------+
| 2 MIG 3g.40gb 9 2/2 39.50 No 60 3 0 |
| 3 3 0 |
+-----------------------------------------------------------------------------+
| 2 MIG 4g.40gb 5 1/1 39.50 No 64 4 0 |
| 4 4 0 |
+-----------------------------------------------------------------------------+
| 2 MIG 7g.80gb 0 1/1 79.25 No 132 7 0 |
| 8 7 1 |
+-----------------------------------------------------------------------------+
| 3 MIG 1g.10gb 19 7/7 9.75 No 16 1 0 |
| 1 1 0 |
+-----------------------------------------------------------------------------+
| 3 MIG 1g.10gb+me 20 1/1 9.75 No 16 1 0 |
| 1 1 1 |
+-----------------------------------------------------------------------------+
| 3 MIG 1g.20gb 15 4/4 19.62 No 26 1 0 |
| 1 1 0 |
+-----------------------------------------------------------------------------+
| 3 MIG 2g.20gb 14 3/3 19.62 No 32 2 0 |
| 2 2 0 |
+-----------------------------------------------------------------------------+
| 3 MIG 3g.40gb 9 2/2 39.50 No 60 3 0 |
| 3 3 0 |
+-----------------------------------------------------------------------------+
| 3 MIG 4g.40gb 5 1/1 39.50 No 64 4 0 |
| 4 4 0 |
+-----------------------------------------------------------------------------+
| 3 MIG 7g.80gb 0 1/1 79.25 No 132 7 0 |
| 8 7 1 |
+-----------------------------------------------------------------------------+
```

## compute_init

Use the ``vgpu`` metadata option to enable creation of MIG devices on rebuild.

## GRES configuration

You should stop terraform templating out partitions.yml and specify `openhpc_nodegroups` manually. To do this,
set the `autogenerated_partitions_enabled` terraform variable to `false`. For example (`environments/production/tofu/main.tf`):

> **Review comment:** Requires: #665

```
module "cluster" {
  source = "../../site/tofu/"
  ...
  # We manually populate this to add GRES. See environments/site/inventory/group_vars/all/partitions-manual.yml.
  autogenerated_partitions_enabled = false
}
```

GPU types can be determined by deploying Slurm without any GRES configuration and then running
`sudo slurmd -G` on a compute node where GPU resources exist. An example is shown below:

```
[rocky@io-io-gpu-02 ~]$ sudo slurmd -G
slurmd-io-io-gpu-02: Gres Name=gpu Type=nvidia_h100_80gb_hbm3 Count=1 Index=0 ID=7696487 File=/dev/nvidia0 Links=(null) Flags=HAS_FILE,HAS_TYPE,ENV_NVML,ENV_RSMI,ENV_ONEAPI,ENV_OPENCL,ENV_DEFAULT
slurmd-io-io-gpu-02: Gres Name=gpu Type=nvidia_h100_80gb_hbm3 Count=1 Index=1 ID=7696487 File=/dev/nvidia1 Links=(null) Flags=HAS_FILE,HAS_TYPE,ENV_NVML,ENV_RSMI,ENV_ONEAPI,ENV_OPENCL,ENV_DEFAULT
slurmd-io-io-gpu-02: Gres Name=gpu Type=nvidia_h100_80gb_hbm3_4g.40gb Count=1 Index=291 ID=7696487 File=/dev/nvidia-caps/nvidia-cap291 Links=(null) Flags=HAS_FILE,HAS_TYPE,ENV_NVML,ENV_RSMI,ENV_ONEAPI,ENV_OPENCL,ENV_DEFAULT
slurmd-io-io-gpu-02: Gres Name=gpu Type=nvidia_h100_80gb_hbm3_4g.40gb Count=1 Index=417 ID=7696487 File=/dev/nvidia-caps/nvidia-cap417 Links=(null) Flags=HAS_FILE,HAS_TYPE,ENV_NVML,ENV_RSMI,ENV_ONEAPI,ENV_OPENCL,ENV_DEFAULT
slurmd-io-io-gpu-02: Gres Name=gpu Type=nvidia_h100_80gb_hbm3_1g.10gb Count=1 Index=336 ID=7696487 File=/dev/nvidia-caps/nvidia-cap336 Links=(null) Flags=HAS_FILE,HAS_TYPE,ENV_NVML,ENV_RSMI,ENV_ONEAPI,ENV_OPENCL,ENV_DEFAULT
slurmd-io-io-gpu-02: Gres Name=gpu Type=nvidia_h100_80gb_hbm3_1g.10gb Count=1 Index=345 ID=7696487 File=/dev/nvidia-caps/nvidia-cap345 Links=(null) Flags=HAS_FILE,HAS_TYPE,ENV_NVML,ENV_RSMI,ENV_ONEAPI,ENV_OPENCL,ENV_DEFAULT
slurmd-io-io-gpu-02: Gres Name=gpu Type=nvidia_h100_80gb_hbm3_1g.10gb Count=1 Index=354 ID=7696487 File=/dev/nvidia-caps/nvidia-cap354 Links=(null) Flags=HAS_FILE,HAS_TYPE,ENV_NVML,ENV_RSMI,ENV_ONEAPI,ENV_OPENCL,ENV_DEFAULT
slurmd-io-io-gpu-02: Gres Name=gpu Type=nvidia_h100_80gb_hbm3_1g.10gb Count=1 Index=507 ID=7696487 File=/dev/nvidia-caps/nvidia-cap507 Links=(null) Flags=HAS_FILE,HAS_TYPE,ENV_NVML,ENV_RSMI,ENV_ONEAPI,ENV_OPENCL,ENV_DEFAULT
slurmd-io-io-gpu-02: Gres Name=gpu Type=nvidia_h100_80gb_hbm3_1g.10gb Count=1 Index=516 ID=7696487 File=/dev/nvidia-caps/nvidia-cap516 Links=(null) Flags=HAS_FILE,HAS_TYPE,ENV_NVML,ENV_RSMI,ENV_ONEAPI,ENV_OPENCL,ENV_DEFAULT
slurmd-io-io-gpu-02: Gres Name=gpu Type=nvidia_h100_80gb_hbm3_1g.10gb Count=1 Index=525 ID=7696487 File=/dev/nvidia-caps/nvidia-cap525 Links=(null) Flags=HAS_FILE,HAS_TYPE,ENV_NVML,ENV_RSMI,ENV_ONEAPI,ENV_OPENCL,ENV_DEFAULT
```
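Each `Gres` line reports one device, so counting lines per `Type` gives the `gpu:<type>:<count>` strings used for manual GRES configuration. That bookkeeping can be sketched as follows (the sample lines are abbreviated from `slurmd -G` output like the above):

```python
from collections import Counter

# Sketch: derive GRES conf strings ("gpu:<type>:<count>") from
# `sudo slurmd -G` output by summing Count per Type. The sample lines
# are abbreviated from real slurmd -G output.
SAMPLE = """\
Gres Name=gpu Type=nvidia_h100_80gb_hbm3 Count=1 Index=0
Gres Name=gpu Type=nvidia_h100_80gb_hbm3 Count=1 Index=1
Gres Name=gpu Type=nvidia_h100_80gb_hbm3_4g.40gb Count=1 Index=291
Gres Name=gpu Type=nvidia_h100_80gb_hbm3_4g.40gb Count=1 Index=417
Gres Name=gpu Type=nvidia_h100_80gb_hbm3_1g.10gb Count=1 Index=336
Gres Name=gpu Type=nvidia_h100_80gb_hbm3_1g.10gb Count=1 Index=345
Gres Name=gpu Type=nvidia_h100_80gb_hbm3_1g.10gb Count=1 Index=354
Gres Name=gpu Type=nvidia_h100_80gb_hbm3_1g.10gb Count=1 Index=507
Gres Name=gpu Type=nvidia_h100_80gb_hbm3_1g.10gb Count=1 Index=516
Gres Name=gpu Type=nvidia_h100_80gb_hbm3_1g.10gb Count=1 Index=525
"""

def gres_conf(slurmd_g_output: str) -> list[str]:
    counts = Counter()
    for line in slurmd_g_output.splitlines():
        # Parse key=value fields from each whitespace-separated token
        fields = dict(f.split("=", 1) for f in line.split() if "=" in f)
        if fields.get("Name") == "gpu":
            counts[fields["Type"]] += int(fields["Count"])
    return [f"gpu:{gpu_type}:{n}" for gpu_type, n in counts.items()]

for conf in gres_conf(SAMPLE):
    print(conf)
```

For the sample above this yields counts of 2 whole GPUs, 2 `4g.40gb` and 6 `1g.10gb` MIG devices, matching the `gres` entries in the example configuration below.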

GRES resources can then be configured manually. An example is shown below
(`environments/<environment>/inventory/group_vars/all/partitions-manual.yml`):

```
openhpc_partitions:
  - name: cpu
  - name: gpu

openhpc_nodegroups:
  - name: cpu
  - name: gpu
    gres_autodetect: nvml
    gres:
      - conf: "gpu:nvidia_h100_80gb_hbm3:2"
      - conf: "gpu:nvidia_h100_80gb_hbm3_4g.40gb:2"
      - conf: "gpu:nvidia_h100_80gb_hbm3_1g.10gb:6"
```

> **Review comment (on the `gres` entries):** Add hint on how to work out what the autodetection-created gres name is?
>
> **Reply:** maybe slurmd -C or slurmd -G?
>
> **Reply:** Tried:
>
> **Reply:** journalctl -u slurmd does print this information:
>
> **Reply:** Turns out I needed sudo when doing slurmd -G:
>
> **Reply:** Added this to the docs.
```diff
@@ -0,0 +1,4 @@
+---
+
+# Nvidia driver is provided by cuda role.
+vgpu_nvidia_driver_install_enabled: false
```
> **Review comment:** There are a few other things which need fixing given bumping stackhpc.openhpc:
```diff
@@ -4,7 +4,7 @@ roles:
     version: v25.3.2
     name: stackhpc.nfs
   - src: https://github.com/stackhpc/ansible-role-openhpc.git
-    version: v0.28.0
+    version: feature/gres-autodetect
     name: stackhpc.openhpc
   - src: https://github.com/stackhpc/ansible-node-exporter.git
     version: stackhpc
```

> **Review comment (on `feature/gres-autodetect`):** Needs bumping to a release.

```diff
@@ -55,4 +55,7 @@ collections:
     version: 0.0.15
   - name: stackhpc.pulp
     version: 0.5.5
+  - name: https://github.com/stackhpc/ansible-collection-linux
+    type: git
+    version: feature/mig-only
 ...
```

> **Review comment:** Looking at the role docs, do we need idracadm7 changes to support SR-IOV and/or the iommu role?
>
> **Reply:** So they are BIOS settings. I'm actually unsure if we need those when not using vGPU.