From 02ba413b650e23006cb641515a81d9830202de34 Mon Sep 17 00:00:00 2001 From: Colin Hirsch Date: Thu, 11 Dec 2025 14:53:42 +0100 Subject: [PATCH 1/6] CI: Enable tests on DGX. --- .ci/docs/setup_nvidia_gpu_with_rdma_support_on_ubuntu.md | 3 +++ .ci/jenkins/lib/test-matrix.yaml | 2 +- .gitlab/test_cpp.sh | 3 ++- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.ci/docs/setup_nvidia_gpu_with_rdma_support_on_ubuntu.md b/.ci/docs/setup_nvidia_gpu_with_rdma_support_on_ubuntu.md index f3b59dc8a8..11d58da8ac 100644 --- a/.ci/docs/setup_nvidia_gpu_with_rdma_support_on_ubuntu.md +++ b/.ci/docs/setup_nvidia_gpu_with_rdma_support_on_ubuntu.md @@ -34,8 +34,11 @@ sudo reboot If running on nvlink hosts like DGX we should also install fabric manager ```bash sudo apt install nvidia-fabricmanager- # should be same as kernel version nvidia-fabricmanager-575 +sudo systemctl enable --now nvidia-fabricmanager ``` +**Important**: On NVSwitch-enabled systems (DGX A100, H100), `nvidia-fabricmanager` must be running before GPU initialization. Without it, CUDA will fail with "system not yet initialized" errors. Ensure the service is enabled at boot. + Verify with `nvidia-smi`. Driver compatibility is critical for RDMA support[^1_1][^1_3]. diff --git a/.ci/jenkins/lib/test-matrix.yaml b/.ci/jenkins/lib/test-matrix.yaml index 91869c4a4a..83f49ff047 100644 --- a/.ci/jenkins/lib/test-matrix.yaml +++ b/.ci/jenkins/lib/test-matrix.yaml @@ -25,7 +25,7 @@ timeout_minutes: 240 # label is defined at jenkins slave configuration, we want to run the job on a gpu agent and be able to esaly replace it without having to change this file runs_on_agents: - {nodeLabel: 'H100'} - # - {nodeLabel: 'DGX'} + - {nodeLabel: 'DGX'} matrix: axes: diff --git a/.gitlab/test_cpp.sh b/.gitlab/test_cpp.sh index ea69d63886..383dfd3e45 100755 --- a/.gitlab/test_cpp.sh +++ b/.gitlab/test_cpp.sh @@ -91,7 +91,8 @@ if $TEST_LIBFABRIC ; then ./bin/nixl_example LIBFABRIC fi ./bin/nixl_etcd_example -./bin/ucx_backend_test +# TODO: Remove UCX_GDR_COPY_SHARED_MD=n once NIXL uses a version of UCX withUCX#11049. +UCX_GDR_COPY_SHARED_MD=n ./bin/ucx_backend_test mkdir -p /tmp/telemetry_test NIXL_TELEMETRY_ENABLE=y NIXL_TELEMETRY_DIR=/tmp/telemetry_test ./bin/agent_example & sleep 1 From 49fa629c304e6dc3d8f7b6c1cff40078ab3307cb Mon Sep 17 00:00:00 2001 From: Colin Hirsch Date: Thu, 11 Dec 2025 17:00:41 +0100 Subject: [PATCH 2/6] CI: Enable tests on DGX. --- .gitlab/test_cpp.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab/test_cpp.sh b/.gitlab/test_cpp.sh index 383dfd3e45..b27146f34c 100755 --- a/.gitlab/test_cpp.sh +++ b/.gitlab/test_cpp.sh @@ -91,7 +91,7 @@ if $TEST_LIBFABRIC ; then ./bin/nixl_example LIBFABRIC fi ./bin/nixl_etcd_example -# TODO: Remove UCX_GDR_COPY_SHARED_MD=n once NIXL uses a version of UCX withUCX#11049. +# TODO: Remove UCX_GDR_COPY_SHARED_MD=n once UCX is fixed. UCX_GDR_COPY_SHARED_MD=n ./bin/ucx_backend_test mkdir -p /tmp/telemetry_test NIXL_TELEMETRY_ENABLE=y NIXL_TELEMETRY_DIR=/tmp/telemetry_test ./bin/agent_example & From 45d00ad97a1120ac9e2b8cf0ce05fa26318576f9 Mon Sep 17 00:00:00 2001 From: Colin Hirsch Date: Sat, 13 Dec 2025 10:09:16 +0100 Subject: [PATCH 3/6] Add workaround. --- .gitlab/test_nixlbench.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitlab/test_nixlbench.sh b/.gitlab/test_nixlbench.sh index 2c4ba4335f..169eb18110 100755 --- a/.gitlab/test_nixlbench.sh +++ b/.gitlab/test_nixlbench.sh @@ -59,6 +59,9 @@ sleep 5 echo "==== Running Nixlbench tests ====" cd ${INSTALL_DIR} +# TODO: Remove UCX_GDR_COPY_SHARED_MD=n once UCX is fixed. +export UCX_GDR_COPY_SHARED_MD=n + DEFAULT_NB_PARAMS="--filepath /tmp --total_buffer_size 80000000 --start_block_size 4096 --max_block_size 16384 --start_batch_size 1 --max_batch_size 4" run_nixlbench() { From f4803f709c5b0669d79079249c922b3caa4c15b1 Mon Sep 17 00:00:00 2001 From: Mikhail Date: Mon, 22 Dec 2025 13:48:07 +0100 Subject: [PATCH 4/6] CI: Increase GPU tests timeout --- .ci/jenkins/lib/test-matrix.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/jenkins/lib/test-matrix.yaml b/.ci/jenkins/lib/test-matrix.yaml index 84036c0948..ef5e6ef150 100644 --- a/.ci/jenkins/lib/test-matrix.yaml +++ b/.ci/jenkins/lib/test-matrix.yaml @@ -43,7 +43,7 @@ env: CONTAINER_WORKSPACE: /workspace INSTALL_DIR: ${CONTAINER_WORKSPACE}/nixl_install # Manual timeout - ci-demo doesn't handle docker exec - TEST_TIMEOUT: 30 + TEST_TIMEOUT: 60 # NPROC for bare-metal: containers see all host CPUs, need to limit parallelism NPROC: 16 From 67dcbdf15289cff5bb83671c007d72a8d9e3c580 Mon Sep 17 00:00:00 2001 From: Colin Hirsch Date: Fri, 9 Jan 2026 12:53:10 +0100 Subject: [PATCH 5/6] Print environment. --- .ci/jenkins/lib/test-matrix.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.ci/jenkins/lib/test-matrix.yaml b/.ci/jenkins/lib/test-matrix.yaml index ef5e6ef150..012605f165 100644 --- a/.ci/jenkins/lib/test-matrix.yaml +++ b/.ci/jenkins/lib/test-matrix.yaml @@ -92,7 +92,9 @@ steps: done fi fi - + # print environment + set + env - name: Build GPU Test Environment parallel: false From 2adbc0d24777d3c819783e299eb43d84c39756e8 Mon Sep 17 00:00:00 2001 From: Colin Hirsch Date: Wed, 14 Jan 2026 18:46:51 +0100 Subject: [PATCH 6/6] Test with single test. --- .ci/jenkins/lib/test-matrix.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.ci/jenkins/lib/test-matrix.yaml b/.ci/jenkins/lib/test-matrix.yaml index 012605f165..a0c671af09 100644 --- a/.ci/jenkins/lib/test-matrix.yaml +++ b/.ci/jenkins/lib/test-matrix.yaml @@ -35,7 +35,8 @@ matrix: - x86_64 ucx_version: - master - - v1.20.x + +# - v1.20.x taskName: "${name}/${arch}/ucx-${ucx_version}/${axis_index}"