From 703bcc4cd5bfaf68197fabcd73c1d4e46ebf24fb Mon Sep 17 00:00:00 2001
From: Daniel Huang
Date: Tue, 9 Dec 2025 16:23:31 -0800
Subject: [PATCH 1/2] Add setup nixl ucx script

Signed-off-by: Daniel Huang
---
 setup_nixl_ucx.sh | 62 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)
 create mode 100755 setup_nixl_ucx.sh

diff --git a/setup_nixl_ucx.sh b/setup_nixl_ucx.sh
new file mode 100755
index 00000000000..f52a90e510c
--- /dev/null
+++ b/setup_nixl_ucx.sh
@@ -0,0 +1,62 @@
+#!/bin/bash
+
+set -e
+
+UCX_DIR=${UCX_DIR:-"/tmp/ucx_source"}
+NIXL_DIR=${NIXL_DIR:-"/tmp/nixl_source"}
+UCX_INSTALL_DIR=${UCX_INSTALL_DIR:-"/tmp/ucx_install"}
+
+UCX_REPO_URL="https://github.com/intel-staging/ucx.git"
+UCX_BRANCH="intel_gaudi_gdr_enabling_0"
+NIXL_REPO_URL="https://github.com/ai-dynamo/nixl.git"
+NIXL_BRANCH="0.7.0"
+
+# Device specific configuration
+if command -v nvidia-smi >/dev/null 2>&1; then
+    # CUDA configuration
+    with_gaudi=no
+    ucx_config_extra_kwargs="--with-cuda=/usr/local/cuda"
+elif command -v hl-smi >/dev/null 2>&1; then
+    # HPU configuration
+    with_gaudi=yes
+    ucx_config_extra_kwargs=
+else
+    echo "Unknown device, aborting install."
+    exit 1
+fi
+
+echo "UCX_DIR: $UCX_DIR"
+echo "NIXL_DIR: $NIXL_DIR"
+
+echo "Installing prerequisites"
+apt-get update
+yes | apt install build-essential cmake pkg-config meson ninja-build autoconf libtool libcjson-dev libaio-dev pybind11-dev
+
+echo "Installing UCX ($UCX_BRANCH) to $UCX_INSTALL_DIR"
+ucx_root=$(dirname "$UCX_DIR")
+mkdir -p "$ucx_root"
+[[ -d $UCX_DIR ]] || git clone -b "$UCX_BRANCH" "$UCX_REPO_URL" "$UCX_DIR"
+cd "$UCX_DIR"
+./autogen.sh
+./configure --prefix="$UCX_INSTALL_DIR" --with-mlx5=no --with-gaudi=$with_gaudi --enable-examples --enable-mt $ucx_config_extra_kwargs
+make -j8 && make -j install-strip && ldconfig
+
+echo "Installing NIXL ($NIXL_BRANCH) to $NIXL_DIR"
+nixl_root=$(dirname "$NIXL_DIR")
+mkdir -p "$nixl_root"
+[[ -d $NIXL_DIR ]] || git clone -b "$NIXL_BRANCH" "$NIXL_REPO_URL" "$NIXL_DIR"
+cd "$NIXL_DIR"
+meson setup --reconfigure build -Ducx_path="$UCX_INSTALL_DIR" -Dinstall_headers=true -Ddisable_gds_backend=false
+sed -i "s|\(option('ucx_path', type: 'string', value: \)'[^']*|\1'$UCX_INSTALL_DIR|" "$NIXL_DIR/meson_options.txt"
+cd build
+ninja && ninja install
+
+pip install "$NIXL_DIR"
+
+echo "Completed nixl install"
+echo ""
+echo "Set these env vars after installing: "
+echo 'export UCX_MEMTYPE_CACHE=0'
+echo 'export LD_LIBRARY_PATH="/opt/nvidia/nvda_nixl/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}"'
+echo 'export LD_LIBRARY_PATH="${UCX_INSTALL_DIR}/lib:${LD_LIBRARY_PATH}"'
+echo ' e.g. export LD_LIBRARY_PATH="/tmp/ucx_install/lib:${LD_LIBRARY_PATH}"'

From 77b4934399052e875cfe38a3a3b2f3eb508280c5 Mon Sep 17 00:00:00 2001
From: Daniel Huang
Date: Wed, 17 Dec 2025 14:20:06 -0800
Subject: [PATCH 2/2] Clearer structures, updated deps

Signed-off-by: Daniel Huang
---
Review amendments folded into this patch (the commit id above and the
post-image blob id on the "index" line predate them):
 - report the unknown-device error on stderr
 - use "apt-get install -y" instead of "apt install" (apt's command line
   is not meant for scripted use)
 - use [[ ]] for the DEVICE comparison, matching the rest of the script
 - split the "make && make install-strip && ldconfig" and
   "ninja && ninja install" lists into separate commands so that
   "set -e" aborts on the first failing step
 - bound install-strip parallelism (-j 8 instead of an unlimited -j)
 - print the resolved UCX library path in the closing hints

 setup_nixl_ucx.sh | 31 ++++++++++++++++++-------------
 1 file changed, 18 insertions(+), 13 deletions(-)

diff --git a/setup_nixl_ucx.sh b/setup_nixl_ucx.sh
index f52a90e510c..b228728b418 100755
--- a/setup_nixl_ucx.sh
+++ b/setup_nixl_ucx.sh
@@ -13,13 +13,9 @@ NIXL_BRANCH="0.7.0"
 
 # Device specific configuration
 if command -v nvidia-smi >/dev/null 2>&1; then
-    # CUDA configuration
-    with_gaudi=no
-    ucx_config_extra_kwargs="--with-cuda=/usr/local/cuda"
+    DEVICE="cuda"
 elif command -v hl-smi >/dev/null 2>&1; then
-    # HPU configuration
-    with_gaudi=yes
-    ucx_config_extra_kwargs=
+    DEVICE="hpu"
 else
-    echo "Unknown device, aborting install."
+    echo "Unknown device, aborting install." >&2
     exit 1
@@ -30,7 +26,8 @@ echo "NIXL_DIR: $NIXL_DIR"
 
 echo "Installing prerequisites"
 apt-get update
-yes | apt install build-essential cmake pkg-config meson ninja-build autoconf libtool libcjson-dev libaio-dev pybind11-dev
+apt-get install -y build-essential cmake libibverbs1 libibverbs-dev librdmacm1 librdmacm-dev rdma-core \
+    pkg-config meson ninja-build autoconf libtool libcjson-dev libaio-dev pybind11-dev
 
 echo "Installing UCX ($UCX_BRANCH) to $UCX_INSTALL_DIR"
 ucx_root=$(dirname "$UCX_DIR")
@@ -38,8 +35,16 @@ mkdir -p "$ucx_root"
 [[ -d $UCX_DIR ]] || git clone -b "$UCX_BRANCH" "$UCX_REPO_URL" "$UCX_DIR"
 cd "$UCX_DIR"
 ./autogen.sh
-./configure --prefix="$UCX_INSTALL_DIR" --with-mlx5=no --with-gaudi=$with_gaudi --enable-examples --enable-mt $ucx_config_extra_kwargs
-make -j8 && make -j install-strip && ldconfig
+if [[ "$DEVICE" == "hpu" ]]; then
+    ./configure --prefix="$UCX_INSTALL_DIR" --with-mlx5=no --with-gaudi=yes --enable-examples --enable-mt
+else
+    ./configure --prefix="$UCX_INSTALL_DIR" --with-mlx5=no --with-gaudi=no --enable-examples --enable-mt --with-cuda=/usr/local/cuda
+fi
+# Run each step as its own command: with `set -e`, a failure inside an
+# `a && b && c` list (other than the last command) does not abort the script.
+make -j 8
+make -j 8 install-strip
+ldconfig
 
 echo "Installing NIXL ($NIXL_BRANCH) to $NIXL_DIR"
 nixl_root=$(dirname "$NIXL_DIR")
@@ -49,7 +54,8 @@ cd "$NIXL_DIR"
 meson setup --reconfigure build -Ducx_path="$UCX_INSTALL_DIR" -Dinstall_headers=true -Ddisable_gds_backend=false
 sed -i "s|\(option('ucx_path', type: 'string', value: \)'[^']*|\1'$UCX_INSTALL_DIR|" "$NIXL_DIR/meson_options.txt"
 cd build
-ninja && ninja install
+ninja
+ninja install
 
 pip install "$NIXL_DIR"
 
@@ -58,5 +64,4 @@ echo ""
 echo "Set these env vars after installing: "
 echo 'export UCX_MEMTYPE_CACHE=0'
 echo 'export LD_LIBRARY_PATH="/opt/nvidia/nvda_nixl/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}"'
-echo 'export LD_LIBRARY_PATH="${UCX_INSTALL_DIR}/lib:${LD_LIBRARY_PATH}"'
-echo ' e.g. export LD_LIBRARY_PATH="/tmp/ucx_install/lib:${LD_LIBRARY_PATH}"'
+echo "export LD_LIBRARY_PATH=\"${UCX_INSTALL_DIR}/lib:\${LD_LIBRARY_PATH}\""