Skip to content

Commit cb5e678

Browse files
committed
Unify docker images and build base in CI
Signed-off-by: Daniel Pressler <[email protected]>
1 parent e9182b8 commit cb5e678

File tree

2 files changed

+252
-23
lines changed

2 files changed

+252
-23
lines changed

.ci/jenkins/lib/build-matrix.yaml

Lines changed: 68 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -30,57 +30,102 @@ timeout_minutes: 240
3030
kubernetes:
3131
cloud: il-ipp-blossom-prod
3232
namespace: swx-media
33-
limits: "{memory: 8Gi, cpu: 8000m}"
34-
requests: "{memory: 8Gi, cpu: 8000m}"
33+
limits: '{memory: 10Gi, cpu: 10000m}'
34+
requests: '{memory: 10Gi, cpu: 10000m}'
35+
arch_table:
36+
x86_64:
37+
nodeSelector: 'kubernetes.io/arch=amd64'
38+
jnlpImage: 'harbor.mellanox.com/toolbox/c3po-jnlp:latest'
39+
aarch64:
40+
nodeSelector: 'kubernetes.io/arch=arm64'
41+
jnlpImage: 'harbor.mellanox.com/toolbox/c3po-jnlp:latest'
42+
43+
# =============================================================================
44+
# Container Registry Configuration
45+
# =============================================================================
46+
# Harbor registry for storing build artifacts and base images
47+
registry_host: harbor.mellanox.com
48+
registry_path: /swx-infra/nixl
49+
registry_auth: swx-infra_harbor_credentials
3550

3651
runs_on_dockers:
37-
- { name: "ubuntu24.04-pytorch", url: "nvcr.io/nvidia/pytorch:25.02-py3" }
38-
- { name: "ubuntu22.04-pytorch", url: "nvcr.io/nvidia/pytorch:24.10-py3" }
39-
- { name: "podman-v5.0.2", url: "quay.io/podman/stable:v5.0.2", category: 'tool', privileged: true }
52+
- {
53+
url: 'harbor.mellanox.com/swx-infra/nixl/ci/$arch/nixl-base-ubuntu24.04:20250701',
54+
# file: 'contrib/dockerfiles/Dockerfile',
55+
name: 'ubuntu24.04',
56+
uri: 'ci/$arch/nixl-base-$name',
57+
tag: '20250701',
58+
build_args: '--no-cache --target nixl-base --build-arg OS=ubuntu24 --build-arg BASE_IMAGE=nvcr.io/nvidia/cuda-dl-base --build-arg BASE_IMAGE_TAG=25.03-cuda12.8-devel-ubuntu24.04 --build-arg UCX_PREFIX=$UCX_PREFIX --build-arg NPROC=$NPROC --build-arg ARCH=$arch'
59+
}
60+
- {
61+
# url: 'harbor.mellanox.com/swx-infra/nixl/ci/$arch/nixl-base-ubuntu22.04:20250701',
62+
file: 'contrib/dockerfiles/Dockerfile',
63+
name: 'ubuntu22.04',
64+
uri: 'ci/$arch/nixl-base-$name',
65+
tag: '20250701',
66+
build_args: '--no-cache --target nixl-base --build-arg OS=ubuntu22 --build-arg BASE_IMAGE=nvcr.io/nvidia/cuda-dl-base --build-arg BASE_IMAGE_TAG=24.10-cuda12.6-devel-ubuntu22.04 --build-arg UCX_PREFIX=$UCX_PREFIX --build-arg NPROC=$NPROC --build-arg ARCH=$arch'
67+
}
4068

4169
matrix:
4270
axes:
4371
arch:
4472
- x86_64
45-
- aarch64
73+
# - aarch64
74+
75+
taskName: "${name}/${arch}"
4676

4777
env:
4878
NIXL_INSTALL_DIR: /opt/nixl
49-
NPROC: "16"
79+
UCX_PREFIX: /usr
80+
TEST_TIMEOUT: 30
81+
NPROC: 16
5082

5183
steps:
5284
- name: Build
5385
parallel: false
5486
run: |
55-
if [[ "${name}" == *"ubuntu22.04"* ]]; then
56-
# the distro's meson version is too old; the project requires >= 0.64.0
57-
pip3 install meson
58-
fi
59-
.gitlab/build.sh ${NIXL_INSTALL_DIR}
87+
export LIBRARY_PATH="$LIBRARY_PATH:/usr/local/cuda/lib64"
88+
export LD_LIBRARY_PATH="${NIXL_INSTALL_DIR}/lib:${NIXL_INSTALL_DIR}/lib/$ARCH-linux-gnu:${NIXL_INSTALL_DIR}/lib64:$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:${NIXL_INSTALL_DIR}/lib"
89+
export CPATH="${NIXL_INSTALL_DIR}/include:$CPATH"
90+
export PATH="${NIXL_INSTALL_DIR}/bin:$PATH"
91+
export PKG_CONFIG_PATH="${NIXL_INSTALL_DIR}/lib/pkgconfig:${NIXL_INSTALL_DIR}/lib64/pkgconfig:${NIXL_INSTALL_DIR}:$PKG_CONFIG_PATH"
92+
export NIXL_PLUGIN_DIR="${NIXL_INSTALL_DIR}/lib/$ARCH-linux-gnu/plugins"
93+
export CMAKE_PREFIX_PATH="${NIXL_INSTALL_DIR}:${CMAKE_PREFIX_PATH}"
94+
95+
# Disable the CUDA IPC transport so that NVLink is not used, as it slows down local
96+
# UCX transfers and can cause contention with local collectives.
97+
export UCX_TLS=^cuda_ipc
98+
99+
meson setup nixl_build --prefix=${NIXL_INSTALL_DIR} -Ducx_path=${UCX_PREFIX} -Drust=false
100+
ninja -C nixl_build -j${NPROC}
101+
ninja -C nixl_build install
60102
61103
- name: Test CPP
62104
parallel: false
105+
timeout: "${TEST_TIMEOUT}"
63106
run: |
64107
.gitlab/test_cpp.sh ${NIXL_INSTALL_DIR}
65108
66109
- name: Test Python
67110
parallel: false
111+
timeout: "${TEST_TIMEOUT}"
68112
run: |
69113
.gitlab/test_python.sh ${NIXL_INSTALL_DIR}
70114
71115
- name: Test Rust
72116
parallel: false
117+
timeout: "${TEST_TIMEOUT}"
73118
run: |
74119
.gitlab/test_rust.sh ${NIXL_INSTALL_DIR}
75120
76-
- name: Build Docker Image
77-
parallel: false
78-
containerSelector: "{ name: 'podman.*' }"
79-
run: |
80-
# change storage driver to improve build performance
81-
rm -f /etc/containers/storage.conf ; podman system reset -f || true
82-
# symlink podman to docker - the scripts work with docker commands
83-
ln -sfT $(type -p podman) /usr/bin/docker
84-
# install git for building container image
85-
yum install -y git
86-
contrib/build-container.sh --no-cache
121+
# - name: Build Docker Image
122+
# parallel: false
123+
# containerSelector: "{ name: 'podman.*' }"
124+
# run: |
125+
# # change storage driver to improve build performance
126+
# rm -f /etc/containers/storage.conf ; podman system reset -f || true
127+
# # symlink podman to docker - the scripts work with docker commands
128+
# ln -sfT $(type -p podman) /usr/bin/docker
129+
# # install git for building container image
130+
# yum install -y git
131+
# contrib/build-container.sh --no-cache

contrib/dockerfiles/Dockerfile

Lines changed: 184 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,184 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
ARG BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
17+
ARG BASE_IMAGE_TAG="25.03-cuda12.8-devel-ubuntu24.04"
18+
19+
FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} as nixl-base
20+
21+
# Build arguments with default values, used when not provided via --build-arg
22+
ARG OS="ubuntu24"
23+
ARG ARCH="x86_64"
24+
ARG DEFAULT_PYTHON_VERSION="3.12"
25+
ARG UCX_REF="v1.19.x"
26+
ARG UCX_PREFIX="/usr"
27+
ARG UCX_PLUGIN_DIR="$UCX_PREFIX/lib/ucx"
28+
ARG NIXL_PREFIX="/usr/local/nixl"
29+
ARG NIXL_PLUGIN_DIR="$NIXL_PREFIX/lib/$ARCH-linux-gnu/plugins"
30+
ARG NPROC="10"
31+
32+
RUN PY_VER=$(if [ "${OS}" = "ubuntu22" ]; then echo "3.10"; elif [ "${OS}" = "ubuntu24" ]; then echo "3.12"; else echo "3.12"; fi) && \
33+
apt-get update -y && \
34+
apt-get install -y ubuntu-keyring && \
35+
apt-get update -y && \
36+
DEBIAN_FRONTEND=noninteractive apt-get -y install \
37+
automake \
38+
autotools-dev \
39+
build-essential \
40+
clang \
41+
cmake \
42+
etcd-client \
43+
etcd-server \
44+
flex \
45+
libaio-dev \
46+
libclang-dev \
47+
libcpprest-dev \
48+
libcurl4-openssl-dev \
49+
libgflags-dev \
50+
libgtest-dev \
51+
libgrpc++-dev \
52+
libgrpc-dev \
53+
libprotobuf-dev \
54+
libssl-dev \
55+
libtool \
56+
liburing-dev \
57+
libz-dev \
58+
ninja-build \
59+
pybind11-dev \
60+
protobuf-compiler-grpc \
61+
python${PY_VER}-dev \
62+
python3-pip \
63+
uuid-dev \
64+
zlib1g-dev
65+
66+
RUN DEBIAN_FRONTEND=noninteractive apt-get -y install \
67+
--reinstall libibverbs-dev rdma-core ibverbs-utils libibumad-dev \
68+
libnuma-dev librdmacm-dev ibverbs-providers
69+
70+
WORKDIR /workspace
71+
RUN git clone --depth 1 https://github.com/etcd-cpp-apiv3/etcd-cpp-apiv3.git && \
72+
cd etcd-cpp-apiv3 && \
73+
sed -i '/^find_dependency(cpprestsdk)$/d' etcd-cpp-api-config.in.cmake && \
74+
mkdir build && cd build && \
75+
cmake .. -DBUILD_ETCD_CORE_ONLY=ON -DCMAKE_BUILD_TYPE=Release && make -j${NPROC:-$(nproc)} && make install
76+
77+
RUN git clone --recurse-submodules --depth 1 --shallow-submodules https://github.com/aws/aws-sdk-cpp.git --branch 1.11.581 && \
78+
mkdir aws_sdk_build && cd aws_sdk_build && \
79+
cmake ../aws-sdk-cpp/ -DCMAKE_BUILD_TYPE=Release -DBUILD_ONLY="s3" -DENABLE_TESTING=OFF -DCMAKE_INSTALL_PREFIX=/usr/local && \
80+
make -j${NPROC:-$(nproc)} && make install
81+
82+
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
83+
84+
ENV RUSTUP_HOME=/usr/local/rustup \
85+
CARGO_HOME=/usr/local/cargo \
86+
PATH=/usr/local/cargo/bin:$PATH \
87+
RUST_VERSION=1.86.0 \
88+
RUSTARCH=${ARCH}-unknown-linux-gnu
89+
90+
# Download rustup-init and its checksum for the target architecture, verify it,
# and install the pinned Rust toolchain
91+
RUN wget --tries=3 --waitretry=5 \
92+
"https://static.rust-lang.org/rustup/archive/1.28.1/${RUSTARCH}/rustup-init" \
93+
"https://static.rust-lang.org/rustup/archive/1.28.1/${RUSTARCH}/rustup-init.sha256" && \
94+
sha256sum -c rustup-init.sha256 && \
95+
chmod +x rustup-init && \
96+
./rustup-init -y --no-modify-path --profile minimal --default-toolchain $RUST_VERSION --default-host ${RUSTARCH} && \
97+
rm rustup-init* && \
98+
chmod -R a+w $RUSTUP_HOME $CARGO_HOME
99+
100+
# Add Mellanox repository and install packages
101+
RUN ARCH_SUFFIX=$(if [ "${ARCH}" = "aarch64" ]; then echo "arm64-sbsa"; else echo "${ARCH}"; fi) && \
102+
# Map OS names to Mellanox repository format
103+
MELLANOX_OS=$(if [ "${OS}" = "ubuntu22" ]; then echo "ubuntu22.04"; elif [ "${OS}" = "ubuntu24" ]; then echo "ubuntu24.04"; else echo "${OS}"; fi) && \
104+
export PKG_CONFIG_PATH="/opt/mellanox/doca/lib/${ARCH_SUFFIX}-linux-gnu/pkgconfig:/opt/mellanox/dpdk/lib/${ARCH_SUFFIX}-linux-gnu/pkgconfig:$PKG_CONFIG_PATH" && \
105+
curl -fsSL https://linux.mellanox.com/public/repo/doca/3.0.0/${MELLANOX_OS}/${ARCH_SUFFIX}/GPG-KEY-Mellanox.pub | \
106+
gpg --dearmor | tee /usr/share/keyrings/mellanox-archive-keyring.gpg && \
107+
echo "deb [signed-by=/usr/share/keyrings/mellanox-archive-keyring.gpg] https://linux.mellanox.com/public/repo/doca/3.0.0/${MELLANOX_OS}/${ARCH_SUFFIX} ./" | \
108+
tee /etc/apt/sources.list.d/mellanox.list && \
109+
apt-get update -y && \
110+
apt-get install -y --no-install-recommends \
111+
mlnx-dpdk mlnx-dpdk-dev \
112+
doca-sdk-common doca-sdk-dma doca-sdk-dpdk-bridge \
113+
doca-sdk-eth doca-sdk-flow doca-sdk-rdma doca-all
114+
115+
RUN if [ "$OS" = "ubuntu24" ]; then \
116+
apt-get install -y --no-install-recommends \
117+
doca-sdk-gpunetio libdoca-sdk-gpunetio-dev; \
118+
fi
119+
120+
RUN rm -rf /usr/lib/ucx /opt/hpcx/ucx && \
121+
cd /usr/local/src && \
122+
git clone https://github.com/openucx/ucx.git && \
123+
cd ucx && \
124+
git checkout $UCX_REF && \
125+
./autogen.sh && ./configure \
126+
--prefix=$UCX_PREFIX \
127+
--enable-shared \
128+
--disable-static \
129+
--disable-doxygen-doc \
130+
--enable-optimizations \
131+
--enable-cma \
132+
--enable-devel-headers \
133+
--with-cuda=/usr/local/cuda \
134+
--with-verbs \
135+
--with-dm \
136+
--with-gdrcopy=/usr/local \
137+
--with-efa \
138+
--enable-mt && \
139+
make -j${NPROC:-$(nproc)} && \
140+
make -j${NPROC:-$(nproc)} install-strip && \
141+
ldconfig
142+
143+
RUN if [ "$OS" = "ubuntu22" ]; then \
144+
pip --no-cache-dir install --upgrade pip "meson>=0.64.0"; \
145+
fi
146+
147+
FROM nixl-base as nixl-build
148+
149+
WORKDIR /workspace/nixl
150+
COPY . /workspace/nixl
151+
152+
ENV LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
153+
154+
ENV VIRTUAL_ENV=/workspace/nixl/.venv
155+
RUN rm -rf $VIRTUAL_ENV && uv venv $VIRTUAL_ENV --python $DEFAULT_PYTHON_VERSION && \
156+
# pybind11 pip install needed for ubuntu 22.04
157+
uv pip install --upgrade "meson>=0.64.0" pybind11 patchelf
158+
159+
# Install pybind11 via apt
160+
# RUN apt-get update && apt-get install -y --no-install-recommends pybind11-dev
161+
162+
ENV NIXL_PREFIX=$NIXL_PREFIX
163+
RUN rm -rf build && \
164+
mkdir build && \
165+
uv run meson setup build/ --prefix=$NIXL_PREFIX && \
166+
cd build && \
167+
ninja && \
168+
ninja install
169+
170+
RUN echo "$NIXL_PREFIX/lib/$ARCH-linux-gnu" > /etc/ld.so.conf.d/nixl.conf && \
171+
echo "$NIXL_PLUGIN_DIR" >> /etc/ld.so.conf.d/nixl.conf && \
172+
ldconfig
173+
174+
RUN cd src/bindings/rust && cargo build --release --locked
175+
176+
# Build wheel using the build-wheel.sh script for better UCX plugin bundling and library management
177+
RUN ./contrib/build-wheel.sh \
178+
--python-version $DEFAULT_PYTHON_VERSION \
179+
--platform manylinux_2_39_$ARCH \
180+
--ucx-plugins-dir $UCX_PLUGIN_DIR \
181+
--nixl-plugins-dir $NIXL_PLUGIN_DIR \
182+
--output-dir /workspace/nixl/dist
183+
184+
RUN uv pip install dist/nixl-*cp${DEFAULT_PYTHON_VERSION//./}*.whl

0 commit comments

Comments
 (0)