22# SPDX-License-Identifier: MIT-0
33FROM nvidia/cuda:12.2.0-devel-ubuntu22.04
44
5+ ARG GDRCOPY_VERSION=v2.4.1
56ARG EFA_INSTALLER_VERSION=1.31.0
67ARG AWS_OFI_NCCL_VERSION=v1.8.1-aws
7- ARG NCCL_TESTS_VERSION=2.13.9
8- ARG NCCL_VERSION=2.20.3
8+ ARG NCCL_VERSION=v2.20.3-1
9+ ARG NCCL_TESTS_VERSION=v2.13.9
910
1011RUN apt-get update -y
1112RUN apt-get remove -y --allow-change-held-packages \
12- libmlx5-1 ibverbs-utils libibverbs-dev libibverbs1 libnccl2 libnccl-dev
13+ ibverbs-utils \
14+ libibverbs-dev \
15+ libibverbs1 \
16+ libmlx5-1 \
17+ libnccl2 \
18+ libnccl-dev
1319
1420RUN rm -rf /opt/hpcx \
1521 && rm -rf /usr/local/mpi \
1622 && rm -f /etc/ld.so.conf.d/hpcx.conf \
1723 && ldconfig
24+
1825ENV OPAL_PREFIX=
1926
2027RUN DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
28+ apt-utils \
29+ autoconf \
30+ automake \
31+ build-essential \
32+ check \
33+ cmake \
34+ curl \
35+ debhelper \
36+ devscripts \
2137 git \
2238 gcc \
23- vim \
39+ gdb \
2440 kmod \
41+ libsubunit-dev \
42+ libtool \
2543 openssh-client \
2644 openssh-server \
27- build-essential \
28- curl \
29- autoconf \
30- libtool \
31- gdb \
32- automake \
45+ pkg-config \
3346 python3-distutils \
34- cmake \
35- apt-utils \
36- devscripts \
37- debhelper \
38- libsubunit-dev \
39- check \
40- pkg-config
47+ vim
4148
4249RUN mkdir -p /var/run/sshd
4350RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1 no/g' /etc/ssh/ssh_config && \
4451 echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \
4552 sed -i 's/#\(StrictModes \).*/\1 no/g' /etc/ssh/sshd_config
53+
4654ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/usr/local/lib:$LD_LIBRARY_PATH
4755ENV PATH /opt/amazon/openmpi/bin/:/opt/amazon/efa/bin:/usr/bin:/usr/local/bin:$PATH
4856
@@ -52,12 +60,14 @@ RUN curl https://bootstrap.pypa.io/get-pip.py -o /tmp/get-pip.py \
5260
5361# ################################################
5462# # Install NVIDIA GDRCopy
55- # RUN git clone https://github.com/NVIDIA/gdrcopy.git /opt/gdrcopy \
56- # && cd /opt/gdrcopy \
57- # && make lib_install install \
58- # && cd /opt/gdrcopy/tests \
59- # && make \
60- # && mv copylat copybw sanity apiperf /usr/bin/
63+ RUN git clone -b ${GDRCOPY_VERSION} https://github.com/NVIDIA/gdrcopy.git /tmp/gdrcopy \
64+ && cd /tmp/gdrcopy \
65+ && make prefix=/opt/gdrcopy install
66+
67+ ENV LD_LIBRARY_PATH /opt/gdrcopy/lib:/usr/local/cuda/compat:$LD_LIBRARY_PATH
68+ ENV LIBRARY_PATH /opt/gdrcopy/lib:/usr/local/cuda/compat/:$LIBRARY_PATH
69+ ENV CPATH /opt/gdrcopy/include:$CPATH
70+ ENV PATH /opt/gdrcopy/bin:$PATH
6171
6272# ################################################
6373# # Install EFA installer
@@ -70,36 +80,50 @@ RUN cd $HOME \
7080
7181# ##################################################
7282# # Install NCCL
73- RUN git clone -b v${NCCL_VERSION}-1 https://github.com/NVIDIA/nccl.git /opt/nccl \
83+ RUN git clone -b ${NCCL_VERSION} https://github.com/NVIDIA/nccl.git /opt/nccl \
7484 && cd /opt/nccl \
7585 && make -j $(nproc) src.build CUDA_HOME=/usr/local/cuda \
76- NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90"
86+ NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_89,code=sm_89 -gencode=arch=compute_90,code=sm_90"
7787
7888# ##################################################
7989# # Install AWS-OFI-NCCL plugin
80- RUN apt-get install libtool autoconf cmake nasm unzip pigz parallel nfs-common build-essential hwloc libhwloc-dev libjemalloc2 libnuma-dev numactl libjemalloc-dev preload htop iftop liblapack-dev libgfortran5 ipcalc wget curl devscripts debhelper check libsubunit-dev fakeroot pkg-config dkms -y
81- RUN export OPAL_PREFIX="" \
82- && git clone https://github.com/aws/aws-ofi-nccl.git /opt/aws-ofi-nccl \
83- && cd /opt/aws-ofi-nccl \
84- && git checkout ${AWS_OFI_NCCL_VERSION} \
85- && ./autogen.sh \
90+ RUN DEBIAN_FRONTEND=noninteractive apt-get install -y libhwloc-dev
91+ RUN curl -OL https://github.com/aws/aws-ofi-nccl/releases/download/${AWS_OFI_NCCL_VERSION}/aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v}.tar.gz \
92+ && tar -xf aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v}.tar.gz \
93+ && cd aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v} \
8694 && ./configure --prefix=/opt/aws-ofi-nccl/install \
8795 --with-mpi=/opt/amazon/openmpi \
8896 --with-libfabric=/opt/amazon/efa \
8997 --with-cuda=/usr/local/cuda \
9098 --enable-platform-aws \
91- && make -j $(nproc) && make install
99+ && make -j $(nproc) \
100+ && make install \
101+ && cd .. \
102+ && rm -rf aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v} \
103+ && rm aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v}.tar.gz
92104
93105# ##################################################
94106# # Install NCCL-tests
95- RUN git clone -b v${NCCL_TESTS_VERSION} https://github.com/NVIDIA/nccl-tests.git /opt/nccl-tests \
107+ RUN git clone -b ${NCCL_TESTS_VERSION} https://github.com/NVIDIA/nccl-tests.git /opt/nccl-tests \
96108 && cd /opt/nccl-tests \
97109 && make -j $(nproc) \
98110 MPI=1 \
99111 MPI_HOME=/opt/amazon/openmpi/ \
100112 CUDA_HOME=/usr/local/cuda \
101113 NCCL_HOME=/opt/nccl/build \
102- NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_90,code=sm_90"
114+ NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_89,code=sm_89 -gencode=arch=compute_90,code=sm_90"
103115
104116RUN rm -rf /var/lib/apt/lists/*
117+
118+ # # Set Open MPI variables to exclude network interface and conduit.
119+ ENV OMPI_MCA_pml=^cm,ucx \
120+ OMPI_MCA_btl=tcp,self \
121+ OMPI_MCA_btl_tcp_if_exclude=lo,docker0,veth_def_agent\
122+ OPAL_PREFIX=/opt/amazon/openmpi \
123+ NCCL_SOCKET_IFNAME=^docker,lo
124+
125+ # # Turn off PMIx Error https://github.com/open-mpi/ompi/issues/7516
126+ ENV PMIX_MCA_gds=hash
127+
128+ # # Set LD_PRELOAD for NCCL library
105129ENV LD_PRELOAD /opt/nccl/build/lib/libnccl.so
0 commit comments