@@ -45,6 +45,7 @@ RUN DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
4545 openssh-server \
4646 pkg-config \
4747 python3-distutils \
48+ libhwloc-dev \
4849 vim
4950RUN apt-get purge -y cuda-compat-*
5051
@@ -53,10 +54,6 @@ RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config &&
5354 echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \
5455 sed -i 's/#\( StrictModes \) .*/\1 no/g' /etc/ssh/sshd_config
5556
56- # Set paths for both aarch64 and x86_64
57- ENV LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/amazon/ofi-nccl/lib/aarch64-linux-gnu:/opt/amazon/ofi-nccl/lib/x86_64-linux-gnu:/usr/local/lib:$LD_LIBRARY_PATH
58- ENV PATH=/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin:/usr/bin:/usr/local/bin:$PATH
59-
6057RUN curl https://bootstrap.pypa.io/get-pip.py -o /tmp/get-pip.py \
6158 && python3 /tmp/get-pip.py \
6259 && pip3 install awscli pynvml
@@ -68,10 +65,10 @@ RUN curl https://bootstrap.pypa.io/get-pip.py -o /tmp/get-pip.py \
6865# # that the cuda-compat-xx-x package is the latest.
6966RUN git clone -b ${GDRCOPY_VERSION} https://github.com/NVIDIA/gdrcopy.git /tmp/gdrcopy \
7067 && cd /tmp/gdrcopy \
71- && make prefix=/opt/gdrcopy install
68+ && make prefix=/opt/gdrcopy install \
69+ && echo "/opt/gdrcopy/lib" > /etc/ld.so.conf.d/000_gdrcopy.conf \
70+ && ldconfig
7271
73- ENV LD_LIBRARY_PATH=/opt/gdrcopy/lib:$LD_LIBRARY_PATH
74- ENV LIBRARY_PATH=/opt/gdrcopy/lib:$LIBRARY_PATH
7572ENV CPATH=/opt/gdrcopy/include:$CPATH
7673ENV PATH=/opt/gdrcopy/bin:$PATH
7774
@@ -82,14 +79,23 @@ RUN cd $HOME \
8279 && tar -xf $HOME/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
8380 && cd aws-efa-installer \
8481 && ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \
85- && rm -rf $HOME/aws-efa-installer
82+ && rm -rf $HOME/aws-efa-installer \
83+ && echo "/opt/amazon/openmpi/lib" > /etc/ld.so.conf.d/000_efa_ompi.conf \
84+ && ldconfig
85+
86+ # For ofi-nccl set paths for both aarch64 and x86_64
87+ ENV LD_LIBRARY_PATH=/opt/amazon/ofi-nccl/lib/aarch64-linux-gnu:/opt/amazon/ofi-nccl/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH
88+
89+ ENV PATH=/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:$PATH
8690
8791# ##################################################
8892# # Install NCCL
8993RUN git clone -b ${NCCL_VERSION} https://github.com/NVIDIA/nccl.git /opt/nccl \
9094 && cd /opt/nccl \
9195 && make -j $(nproc) src.build CUDA_HOME=/usr/local/cuda \
92- NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_89,code=sm_89 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_100,code=sm_100"
96+ NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_89,code=sm_89 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_100,code=sm_100" \
97+ && echo "/opt/nccl/build/lib" > /etc/ld.so.conf.d/000_nccl.conf \
98+ && ldconfig
9399
94100# ##################################################
95101# # Install NCCL-tests
@@ -102,6 +108,25 @@ RUN git clone -b ${NCCL_TESTS_VERSION} https://github.com/NVIDIA/nccl-tests.git
102108 NCCL_HOME=/opt/nccl/build \
103109 NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_89,code=sm_89 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_100,code=sm_100"
104110
111+ # ##################################################
112+ # # Install AWS OFI NCCL
113+ RUN git clone -b ${AWS_OFI_NCCL_VERSION} https://github.com/aws/aws-ofi-nccl.git /opt/aws-ofi-nccl && \
114+ cd /opt/aws-ofi-nccl && \
115+ ./autogen.sh && \
116+ ./configure \
117+ --with-libfabric=/opt/amazon/efa \
118+ --prefix=/opt/aws-ofi-nccl/build \
119+ --with-nccl=/opt/nccl/build \
120+ --with-mpi=/opt/amazon/openmpi \
121+ --enable-platform-aws \
122+ --with-cuda=/usr/local/cuda \
123+ --enable-cudart-dynamic \
124+ --disable-tests \
125+ --without-lttng \
126+ --without-valgrind \
127+ --disable-werror && \
128+ make -j $(nproc) && make install
129+
105130RUN rm -rf /var/lib/apt/lists/*
106131
107132# # Set Open MPI variables to exclude network interface and conduit.
0 commit comments