Skip to content

Commit 8d9c95e

Browse files
authored
nccl-tests/Dockerfile - Add support for custom aws-ofi-nccl & cleanup (#881)
* Custom aws-ofi-nccl support
* Clean up LD_LIBRARY_PATH and PATH in favor of /etc/ld.so.conf.d
1 parent d69470b commit 8d9c95e

File tree

1 file changed

+34
-9
lines changed

1 file changed

+34
-9
lines changed

micro-benchmarks/nccl-tests/nccl-tests.Dockerfile

Lines changed: 34 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -45,6 +45,7 @@ RUN DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
4545
openssh-server \
4646
pkg-config \
4747
python3-distutils \
48+
libhwloc-dev \
4849
vim
4950
RUN apt-get purge -y cuda-compat-*
5051

@@ -53,10 +54,6 @@ RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config &&
5354
echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \
5455
sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config
5556

56-
# Set paths for both aarch64 and x86_64
57-
ENV LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/amazon/ofi-nccl/lib/aarch64-linux-gnu:/opt/amazon/ofi-nccl/lib/x86_64-linux-gnu:/usr/local/lib:$LD_LIBRARY_PATH
58-
ENV PATH=/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin:/usr/bin:/usr/local/bin:$PATH
59-
6057
RUN curl https://bootstrap.pypa.io/get-pip.py -o /tmp/get-pip.py \
6158
&& python3 /tmp/get-pip.py \
6259
&& pip3 install awscli pynvml
@@ -68,10 +65,10 @@ RUN curl https://bootstrap.pypa.io/get-pip.py -o /tmp/get-pip.py \
6865
## that the cuda-compat-xx-x package is the latest.
6966
RUN git clone -b ${GDRCOPY_VERSION} https://github.com/NVIDIA/gdrcopy.git /tmp/gdrcopy \
7067
&& cd /tmp/gdrcopy \
71-
&& make prefix=/opt/gdrcopy install
68+
&& make prefix=/opt/gdrcopy install \
69+
&& echo "/opt/gdrcopy/lib" > /etc/ld.so.conf.d/000_gdrcopy.conf \
70+
&& ldconfig
7271

73-
ENV LD_LIBRARY_PATH=/opt/gdrcopy/lib:$LD_LIBRARY_PATH
74-
ENV LIBRARY_PATH=/opt/gdrcopy/lib:$LIBRARY_PATH
7572
ENV CPATH=/opt/gdrcopy/include:$CPATH
7673
ENV PATH=/opt/gdrcopy/bin:$PATH
7774

@@ -82,14 +79,23 @@ RUN cd $HOME \
8279
&& tar -xf $HOME/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
8380
&& cd aws-efa-installer \
8481
&& ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \
85-
&& rm -rf $HOME/aws-efa-installer
82+
&& rm -rf $HOME/aws-efa-installer \
83+
&& echo "/opt/amazon/openmpi/lib" > /etc/ld.so.conf.d/000_efa_ompi.conf \
84+
&& ldconfig
85+
86+
# For ofi-nccl set paths for both aarch64 and x86_64
87+
ENV LD_LIBRARY_PATH=/opt/amazon/ofi-nccl/lib/aarch64-linux-gnu:/opt/amazon/ofi-nccl/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH
88+
89+
ENV PATH=/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:$PATH
8690

8791
###################################################
8892
## Install NCCL
8993
RUN git clone -b ${NCCL_VERSION} https://github.com/NVIDIA/nccl.git /opt/nccl \
9094
&& cd /opt/nccl \
9195
&& make -j $(nproc) src.build CUDA_HOME=/usr/local/cuda \
92-
NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_89,code=sm_89 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_100,code=sm_100"
96+
NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_89,code=sm_89 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_100,code=sm_100" \
97+
&& echo "/opt/nccl/build/lib" > /etc/ld.so.conf.d/000_nccl.conf \
98+
&& ldconfig
9399

94100
###################################################
95101
## Install NCCL-tests
@@ -102,6 +108,25 @@ RUN git clone -b ${NCCL_TESTS_VERSION} https://github.com/NVIDIA/nccl-tests.git
102108
NCCL_HOME=/opt/nccl/build \
103109
NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_89,code=sm_89 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_100,code=sm_100"
104110

111+
###################################################
112+
## Install AWS OFI NCCL
113+
RUN git clone -b ${AWS_OFI_NCCL_VERSION} https://github.com/aws/aws-ofi-nccl.git /opt/aws-ofi-nccl && \
114+
cd /opt/aws-ofi-nccl && \
115+
./autogen.sh && \
116+
./configure \
117+
--with-libfabric=/opt/amazon/efa \
118+
--prefix=/opt/aws-ofi-nccl/build \
119+
--with-nccl=/opt/nccl/build \
120+
--with-mpi=/opt/amazon/openmpi \
121+
--enable-platform-aws \
122+
--with-cuda=/usr/local/cuda \
123+
--enable-cudart-dynamic \
124+
--disable-tests \
125+
--without-lttng \
126+
--without-valgrind \
127+
--disable-werror && \
128+
make -j $(nproc) && make install
129+
105130
RUN rm -rf /var/lib/apt/lists/*
106131

107132
## Set Open MPI variables to exclude network interface and conduit.

0 commit comments

Comments
 (0)