Skip to content

Commit 858d4c3

Browse files
committed
Merge branch 'main' into pylint-ci
2 parents e4990fa + 43ffb37 commit 858d4c3

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

46 files changed

+862
-254
lines changed

.ci/dockerfiles/Dockerfile.gpu_test

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
# _LOGIN: Username (default: svc-nixl)
2020
# _GROUP: Group name (default: hardware)
2121
# _HOME: Home directory path (default: /home/svc-nixl)
22+
# WORKSPACE: Workspace directory path
2223
#
2324

2425
ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:25.02-py3
@@ -31,6 +32,7 @@ ARG _GID=30
3132
ARG _LOGIN=svc-nixl
3233
ARG _GROUP=hardware
3334
ARG _HOME=/home/$_LOGIN
35+
ARG WORKSPACE
3436

3537
# Labels for documentation
3638
LABEL maintainer="NVIDIA NIXL Team"
@@ -57,6 +59,9 @@ RUN mkdir -p /etc/sudoers.d && \
5759
chmod 440 /etc/sudoers.d/${_LOGIN} && \
5860
chown root:root /etc/sudoers.d/${_LOGIN}
5961

62+
# Copy workspace into container (workaround for files disappearing from workspace)
63+
COPY --chown="${_UID}":"${_GID}" . ${WORKSPACE}
64+
6065
# Set working directory
6166
WORKDIR ${_HOME}
6267

.ci/jenkins/lib/test-matrix.yaml

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ steps:
6565
- name: Build GPU Test Environment
6666
parallel: false
6767
run: |
68-
docker build -t "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" -f .ci/dockerfiles/Dockerfile.gpu_test --build-arg BASE_IMAGE=${image} .
68+
docker build -t "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}" -f .ci/dockerfiles/Dockerfile.gpu_test --build-arg BASE_IMAGE=${image} --build-arg WORKSPACE=${WORKSPACE} .
6969
onfail: docker image rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
7070

7171
- name: Run GPU Test Environment
@@ -79,8 +79,6 @@ steps:
7979
--gpus all \
8080
--device=/dev/infiniband \
8181
--device=/dev/gdrdrv \
82-
-v ${WORKSPACE}:${WORKSPACE} \
83-
-w ${WORKSPACE} \
8482
"${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"
8583
onfail: |
8684
docker rm -f "${JOB_BASE_NAME}-${BUILD_ID}-${axis_index}"

.ci/jenkins/pipeline/proj-jjb.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -252,7 +252,7 @@
252252
- string:
253253
name: "NIXL_VERSION"
254254
default: "{jjb_branch}"
255-
description: "NIXL version to use (tag like 0.4.1, branch name, or commit hash)"
255+
description: "NIXL version to use (tag like 0.5.0, branch name, or commit hash)"
256256
- string:
257257
name: "UCX_VERSION"
258258
default: "v1.19.x"

.ci/scripts/common.sh

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
#!/bin/bash
2+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
# SPDX-License-Identifier: Apache-2.0
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
#
18+
# Common functions for CI scripts
19+
#
20+
21+
#
22+
# Set initial port number for client/server applications to be updated with
23+
# function below
24+
#
25+
tcp_port_range=1000
26+
min_port_number=10500
27+
max_port_number=65535
28+
29+
# GITLAB CI
30+
if [ -n "$CI_CONCURRENT_ID" ]; then
31+
nixl_concurrent_id=$CI_CONCURRENT_ID
32+
# Jenkins CI
33+
elif [ -n "$EXECUTOR_NUMBER" ]; then
34+
nixl_concurrent_id=$EXECUTOR_NUMBER
35+
else
36+
# Fallback to random number if both CI_CONCURRENT_ID and EXECUTOR_NUMBER are not set
37+
nixl_concurrent_id=$((RANDOM % $(((max_port_number - min_port_number) / tcp_port_range))))
38+
fi
39+
40+
echo nixl_concurrent_id="$nixl_concurrent_id"
41+
42+
# First half of the port range is used for shell script tests
43+
tcp_port_min=$((min_port_number + nixl_concurrent_id * tcp_port_range))
44+
tcp_port_max=$((tcp_port_min + tcp_port_range / 2))
45+
46+
#######################################
# Hand out the next free TCP port from this executor's shell-test range.
#
# The last port handed out is persisted in a per-executor state file so that
# consecutive calls (even from different scripts) rotate through the range
# instead of reusing the same port.
#
# Globals:
#   nixl_concurrent_id (read) - selects the state file
#   tcp_port_min       (read) - first port of the range (inclusive)
#   tcp_port_max       (read) - end of the range (exclusive)
# Outputs:
#   Writes the chosen port number to stdout; diagnostics go to stderr.
# Returns:
#   0 on success, 1 if every port in the range is reported as in use.
#######################################
get_next_tcp_port() {
    local port_file="/tmp/nixl_tcp_port_${nixl_concurrent_id}"

    if [ ! -f "$port_file" ]; then
        echo "$tcp_port_min" > "$port_file"
    fi

    local current_port
    current_port=$(cat "$port_file")
    # A truncated or corrupted state file must not break the arithmetic below.
    case "$current_port" in
        '' | *[!0-9]*) current_port=$tcp_port_min ;;
    esac

    # Snapshot the listening sockets once instead of re-running ss for every
    # candidate. If ss is unavailable the snapshot is empty and every port is
    # treated as free (same best-effort behaviour as before).
    local listening
    listening=$(ss -tuln) || true

    local span=$((tcp_port_max - tcp_port_min))
    local candidate=$((current_port + 1))
    local tries

    # Try each port of the range at most once, wrapping around at the end, so
    # the scan can neither leave the range nor return a port without checking it.
    for ((tries = 0; tries < span; tries++)); do
        if [ "$candidate" -ge "$tcp_port_max" ] || [ "$candidate" -lt "$tcp_port_min" ]; then
            candidate=$tcp_port_min
        fi

        # Match the port as a whole number: ":1050" must not match ":10500".
        if ! grep -qE ":${candidate}([^0-9]|\$)" <<< "$listening"; then
            echo "$candidate" > "$port_file"
            echo "$candidate"
            return 0
        fi

        candidate=$((candidate + 1))
    done

    echo "get_next_tcp_port: no free TCP port in [${tcp_port_min}, ${tcp_port_max})" >&2
    return 1
}
70+
71+
# Second half of the port range is used for gtest
72+
gtest_offset=$((tcp_port_range / 2))
73+
# shellcheck disable=SC2034
74+
min_gtest_port=$((tcp_port_min + gtest_offset))
75+
# shellcheck disable=SC2034
76+
max_gtest_port=$((tcp_port_max + gtest_offset))

.gitlab/test_cpp.sh

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,10 @@
1414
# See the License for the specific language governing permissions and
1515
# limitations under the License.
1616

17+
18+
# shellcheck disable=SC1091
19+
. "$(dirname "$0")/../.ci/scripts/common.sh"
20+
1721
set -e
1822
set -x
1923
TEXT_YELLOW="\033[1;33m"
@@ -57,8 +61,13 @@ ibv_devinfo || true
5761
uname -a || true
5862

5963
echo "==== Running ETCD server ===="
60-
export NIXL_ETCD_ENDPOINTS="http://127.0.0.1:2379"
61-
etcd --listen-client-urls ${NIXL_ETCD_ENDPOINTS} --advertise-client-urls ${NIXL_ETCD_ENDPOINTS} &
64+
etcd_port=$(get_next_tcp_port)
65+
etcd_peer_port=$(get_next_tcp_port)
66+
export NIXL_ETCD_ENDPOINTS="http://127.0.0.1:${etcd_port}"
67+
export NIXL_ETCD_PEER_URLS="http://127.0.0.1:${etcd_peer_port}"
68+
etcd --listen-client-urls ${NIXL_ETCD_ENDPOINTS} --advertise-client-urls ${NIXL_ETCD_ENDPOINTS} \
69+
--listen-peer-urls ${NIXL_ETCD_PEER_URLS} --initial-advertise-peer-urls ${NIXL_ETCD_PEER_URLS} \
70+
--initial-cluster default=${NIXL_ETCD_PEER_URLS} &
6271
sleep 5
6372

6473
echo "==== Running C++ tests ===="
@@ -76,13 +85,17 @@ cd ${INSTALL_DIR}
7685

7786
./bin/ucx_backend_multi
7887
./bin/serdes_test
79-
./bin/gtest
88+
89+
# shellcheck disable=SC2154
90+
./bin/gtest --min-tcp-port="$min_gtest_port" --max-tcp-port="$max_gtest_port"
8091
./bin/test_plugin
8192

8293
# Run NIXL client-server test
83-
./bin/nixl_test target 127.0.0.1 1234&
94+
nixl_test_port=$(get_next_tcp_port)
95+
96+
./bin/nixl_test target 127.0.0.1 "$nixl_test_port"&
8497
sleep 1
85-
./bin/nixl_test initiator 127.0.0.1 1234
98+
./bin/nixl_test initiator 127.0.0.1 "$nixl_test_port"
8699

87100
echo "${TEXT_YELLOW}==== Disabled tests==="
88101
echo "./bin/md_streamer disabled"

.gitlab/test_plugins.sh

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,9 @@
1414
# See the License for the specific language governing permissions and
1515
# limitations under the License.
1616

17+
# shellcheck disable=SC1091
18+
. "$(dirname "$0")/../.ci/scripts/common.sh"
19+
1720
set -e
1821
set -x
1922

@@ -45,4 +48,5 @@ uname -a || true
4548

4649
echo "==== Running Plugins Gtest tests ===="
4750
cd ${INSTALL_DIR}
48-
./bin/plugins_gtest
51+
# shellcheck disable=SC2154
52+
./bin/plugins_gtest --min-tcp-port="$min_gtest_port" --max-tcp-port="$max_gtest_port"

.gitlab/test_python.sh

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,9 @@
1414
# See the License for the specific language governing permissions and
1515
# limitations under the License.
1616

17+
# shellcheck disable=SC1091
18+
. "$(dirname "$0")/../.ci/scripts/common.sh"
19+
1720
set -e
1821
set -x
1922

@@ -52,8 +55,13 @@ pip3 install --break-system-packages pytest-timeout
5255
pip3 install --break-system-packages zmq
5356

5457
echo "==== Running ETCD server ===="
55-
export NIXL_ETCD_ENDPOINTS="http://127.0.0.1:2379"
56-
etcd --listen-client-urls ${NIXL_ETCD_ENDPOINTS} --advertise-client-urls ${NIXL_ETCD_ENDPOINTS} &
58+
etcd_port=$(get_next_tcp_port)
59+
etcd_peer_port=$(get_next_tcp_port)
60+
export NIXL_ETCD_ENDPOINTS="http://127.0.0.1:${etcd_port}"
61+
export NIXL_ETCD_PEER_URLS="http://127.0.0.1:${etcd_peer_port}"
62+
etcd --listen-client-urls ${NIXL_ETCD_ENDPOINTS} --advertise-client-urls ${NIXL_ETCD_ENDPOINTS} \
63+
--listen-peer-urls ${NIXL_ETCD_PEER_URLS} --initial-advertise-peer-urls ${NIXL_ETCD_PEER_URLS} \
64+
--initial-cluster default=${NIXL_ETCD_PEER_URLS} &
5765
sleep 5
5866

5967
echo "==== Running python tests ===="
@@ -66,10 +74,12 @@ python3 test/python/prep_xfer_perf.py list
6674
python3 test/python/prep_xfer_perf.py array
6775

6876
echo "==== Running python examples ===="
77+
blocking_send_recv_port=$(get_next_tcp_port)
78+
6979
cd examples/python
70-
python3 blocking_send_recv_example.py --mode="target" --ip=127.0.0.1 --port=1234&
80+
python3 blocking_send_recv_example.py --mode="target" --ip=127.0.0.1 --port="$blocking_send_recv_port"&
7181
sleep 5
72-
python3 blocking_send_recv_example.py --mode="initiator" --ip=127.0.0.1 --port=1234
82+
python3 blocking_send_recv_example.py --mode="initiator" --ip=127.0.0.1 --port="$blocking_send_recv_port"
7383

7484
python3 query_mem_example.py
7585

CODEOWNERS

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,17 +14,17 @@ CODEOWNERS @ai-dynamo/Devops @ai-dynamo/nixl-maintainers
1414
/SECURITY.md @ai-dynamo/Devops
1515

1616
# Bindings
17-
/src/bindings/rust @roiedanino @gleon99
17+
/src/bindings/rust @roiedanino @gleon99 @mkhazraee
1818

1919
# UCX Plugins
2020
/src/plugins/ucx* @brminich @yosefe @gleon99
2121
/src/utils/ucx @brminich @yosefe @gleon99
2222

2323
# Storage Plugins
24-
/src/plugins/posix @w1ldptr @barneuman @etoledano
25-
/src/plugins/hf3fs @w1ldptr @barneuman @etoledano
26-
/src/plugins/cuda_gds @w1ldptr @barneuman @etoledano
27-
/src/plugins/obj @w1ldptr @barneuman @etoledano
24+
/src/plugins/posix @w1ldptr @barneuman @etoledano @vvenkates27
25+
/src/plugins/hf3fs @w1ldptr @barneuman @etoledano @vvenkates27
26+
/src/plugins/cuda_gds @w1ldptr @barneuman @etoledano @vvenkates27
27+
/src/plugins/obj @w1ldptr @barneuman @etoledano @vvenkates27
2828

2929
# Benchmarks
30-
/benchmark/nixlbench @aranadive @ovidiusm @brminich
30+
/benchmark @aranadive @ovidiusm @brminich

Cargo.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

benchmark/nixlbench/README.md

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,3 +125,24 @@ Example:
125125
```
126126

127127
The workers automatically coordinate ranks through ETCD as they connect.
128+
129+
### Benchmarking the OBJ Plugin
130+
131+
For OBJ plugin benchmarking run etcd-server and a single nixlbench instance.
132+
133+
Example:
134+
```bash
135+
AWS_ACCESS_KEY_ID=<access_key> AWS_SECRET_ACCESS_KEY=<secret_key> AWS_DEFAULT_REGION=<region> /tmp/nixlbench/nixlbench --etcd-endpoints http://<etcd-server>:2379 --backend OBJ --obj_bucket_name <bucket_name>
136+
```
137+
Access key, secret access key, default region and bucket name are mandatory fields.
138+
Use your own valid credentials.
139+
140+
Transfer times are higher than local storage so it is advisable to use fewer iterations than the default values.
141+
142+
Example:
143+
```bash
144+
AWS_ACCESS_KEY_ID=<access_key> AWS_SECRET_ACCESS_KEY=<secret_key> AWS_DEFAULT_REGION=<region> /tmp/nixlbench/nixlbench --etcd-endpoints http://etcd-server:2379 --backend OBJ --obj_bucket_name nixl-ci-test --warmup_iter 32 --num_iter 32 --large_blk_iter_ftr 2
145+
```
146+
147+
The default benchmark command tests write. To test read ops add the flag: `--op_type READ`.
148+
To test transfer data validity add the flag: `--check_consistency true`.

0 commit comments

Comments
 (0)