# Patch summary (free-text preamble placed ahead of the first `diff --git` header).
#
# .ci/docs/setup_nvidia_gpu_with_rdma_support_on_ubuntu.md
#   - Adds `sudo systemctl enable --now nvidia-fabricmanager` to the install snippet.
#   - Adds an "Important" note that nvidia-fabricmanager must be running on
#     NVSwitch-enabled systems and should be enabled at boot.
# .ci/jenkins/lib/test-matrix.yaml
#   - Uncomments the 'DGX' nodeLabel entry under runs_on_agents.
#   - Comments out the v1.20.x entry under ucx_version (master remains).
#   - Changes TEST_TIMEOUT from 30 to 60.
#   - Appends `set` and `env` (under a "print environment" comment) to the step
#     script that precedes "Build GPU Test Environment".
# .gitlab/test_cpp.sh
#   - Runs ./bin/ucx_backend_test with UCX_GDR_COPY_SHARED_MD=n; a TODO marks the
#     override for removal once UCX is fixed.
# .gitlab/test_nixlbench.sh
#   - Exports UCX_GDR_COPY_SHARED_MD=n ahead of the DEFAULT_NB_PARAMS definition,
#     with the same TODO.
#
# NOTE(review): the hunks below appear with their line breaks collapsed; the
# original newlines presumably need to be restored before this patch can be
# applied — confirm against the source commit.
diff --git a/.ci/docs/setup_nvidia_gpu_with_rdma_support_on_ubuntu.md b/.ci/docs/setup_nvidia_gpu_with_rdma_support_on_ubuntu.md index f3b59dc8a8..11d58da8ac 100644 --- a/.ci/docs/setup_nvidia_gpu_with_rdma_support_on_ubuntu.md +++ b/.ci/docs/setup_nvidia_gpu_with_rdma_support_on_ubuntu.md @@ -34,8 +34,11 @@ sudo reboot If running on nvlink hosts like DGX we should also install fabric manager ```bash sudo apt install nvidia-fabricmanager- # should be same as kernel version nvidia-fabricmanager-575 +sudo systemctl enable --now nvidia-fabricmanager ``` +**Important**: On NVSwitch-enabled systems (DGX A100, H100), `nvidia-fabricmanager` must be running before GPU initialization. Without it, CUDA will fail with "system not yet initialized" errors. Ensure the service is enabled at boot. + Verify with `nvidia-smi`. Driver compatibility is critical for RDMA support[^1_1][^1_3]. diff --git a/.ci/jenkins/lib/test-matrix.yaml b/.ci/jenkins/lib/test-matrix.yaml index 2284b1724b..a0c671af09 100644 --- a/.ci/jenkins/lib/test-matrix.yaml +++ b/.ci/jenkins/lib/test-matrix.yaml @@ -25,7 +25,7 @@ timeout_minutes: 240 # label is defined at jenkins slave configuration, we want to run the job on a gpu agent and be able to esaly replace it without having to change this file runs_on_agents: - {nodeLabel: 'H100'} - # - {nodeLabel: 'DGX'} + - {nodeLabel: 'DGX'} matrix: axes: @@ -35,7 +35,8 @@ matrix: - x86_64 ucx_version: - master - - v1.20.x + +# - v1.20.x taskName: "${name}/${arch}/ucx-${ucx_version}/${axis_index}" @@ -43,7 +44,7 @@ env: CONTAINER_WORKSPACE: /workspace INSTALL_DIR: ${CONTAINER_WORKSPACE}/nixl_install # Manual timeout - ci-demo doesn't handle docker exec - TEST_TIMEOUT: 30 + TEST_TIMEOUT: 60 # NPROC for bare-metal: containers see all host CPUs, need to limit parallelism NPROC: 16 @@ -92,7 +93,9 @@ steps: done fi fi - + # print environment + set + env - name: Build GPU Test Environment parallel: false diff --git a/.gitlab/test_cpp.sh b/.gitlab/test_cpp.sh index 
8c911134ea..8ec97a04f2 100755 --- a/.gitlab/test_cpp.sh +++ b/.gitlab/test_cpp.sh @@ -92,7 +92,8 @@ if $TEST_LIBFABRIC ; then ./bin/nixl_example LIBFABRIC fi ./bin/nixl_etcd_example -./bin/ucx_backend_test +# TODO: Remove UCX_GDR_COPY_SHARED_MD=n once UCX is fixed. +UCX_GDR_COPY_SHARED_MD=n ./bin/ucx_backend_test mkdir -p /tmp/telemetry_test NIXL_TELEMETRY_ENABLE=y NIXL_TELEMETRY_DIR=/tmp/telemetry_test ./bin/agent_example & sleep 5 diff --git a/.gitlab/test_nixlbench.sh b/.gitlab/test_nixlbench.sh index 6585b8346a..8d84fed519 100755 --- a/.gitlab/test_nixlbench.sh +++ b/.gitlab/test_nixlbench.sh @@ -60,6 +60,9 @@ wait_for_etcd echo "==== Running Nixlbench tests ====" cd ${INSTALL_DIR} +# TODO: Remove UCX_GDR_COPY_SHARED_MD=n once UCX is fixed. +export UCX_GDR_COPY_SHARED_MD=n + DEFAULT_NB_PARAMS="--filepath /tmp --total_buffer_size 80000000 --start_block_size 4096 --max_block_size 16384 --start_batch_size 1 --max_batch_size 4" run_nixlbench() {