diff --git a/micro-benchmarks/nccl-tests/README.md b/micro-benchmarks/nccl-tests/README.md index d50d5360f..c126d0623 100644 --- a/micro-benchmarks/nccl-tests/README.md +++ b/micro-benchmarks/nccl-tests/README.md @@ -68,6 +68,7 @@ If you wish to build the containar image by yourself, follow this section. Alter -t ${CONTAINER_IMAGE_NAME_TAG} \ . ``` + Note: If you are using an arm64 platform (like p6e-gb200.36xlarge, or any ultraserver comprised of that instance, for example), pass in `--platform=linux/arm64` into the `docker build` command above. Or, just build your image directly on an `arm64` based instance! 1. Once the container image is prepared, you can check if it is present with `docker images`. You should see an output similar to this one: ``` diff --git a/micro-benchmarks/nccl-tests/nccl-tests-gb200.Dockerfile b/micro-benchmarks/nccl-tests/nccl-tests-gb200.Dockerfile deleted file mode 100644 index 1fd2f4e1c..000000000 --- a/micro-benchmarks/nccl-tests/nccl-tests-gb200.Dockerfile +++ /dev/null @@ -1,138 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: MIT-0 -ARG CUDA_VERSION=12.8.1 -FROM --platform=linux/arm64 nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 - -ARG GDRCOPY_VERSION=v2.4.4 -ARG EFA_INSTALLER_VERSION=1.41.0 -ARG AWS_OFI_NCCL_VERSION=v1.14.2 -ARG NCCL_VERSION=v2.27.3-1 -ARG NCCL_TESTS_VERSION=v2.15.2 - -RUN apt-get update -y && apt-get upgrade -y -RUN apt-get remove -y --allow-change-held-packages \ - ibverbs-utils \ - libibverbs-dev \ - libibverbs1 \ - libmlx5-1 \ - libnccl2 \ - libnccl-dev - -RUN rm -rf /opt/hpcx \ - && rm -rf /usr/local/mpi \ - && rm -f /etc/ld.so.conf.d/hpcx.conf \ - && ldconfig - -ENV OPAL_PREFIX= - -RUN DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \ - apt-utils \ - autoconf \ - automake \ - build-essential \ - check \ - cmake \ - curl \ - debhelper \ - devscripts \ - git \ - gcc \ - gdb \ - kmod \ - libsubunit-dev \ - libtool \ - openssh-client \ - openssh-server \ - pkg-config \ - python3-distutils \ - vim -RUN apt-get purge -y cuda-compat-* - -RUN mkdir -p /var/run/sshd -RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \ - echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \ - sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config - -ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/usr/local/lib:$LD_LIBRARY_PATH -ENV PATH /opt/amazon/openmpi/bin/:/opt/amazon/efa/bin:/usr/bin:/usr/local/bin:$PATH - -RUN curl https://bootstrap.pypa.io/get-pip.py -o /tmp/get-pip.py \ - && python3 /tmp/get-pip.py \ - && pip3 install awscli pynvml - -################################################# -## Install NVIDIA GDRCopy -## -## NOTE: if `nccl-tests` or `/opt/gdrcopy/bin/sanity -v` crashes with incompatible version, ensure -## that the cuda-compat-xx-x package is the latest. -RUN git clone -b ${GDRCOPY_VERSION} https://github.com/NVIDIA/gdrcopy.git /tmp/gdrcopy \ - && cd /tmp/gdrcopy \ - && make prefix=/opt/gdrcopy install - -ENV LD_LIBRARY_PATH /opt/gdrcopy/lib:$LD_LIBRARY_PATH -ENV LIBRARY_PATH /opt/gdrcopy/lib:$LIBRARY_PATH -ENV CPATH /opt/gdrcopy/include:$CPATH -ENV PATH /opt/gdrcopy/bin:$PATH - -################################################# -## Install EFA installer -RUN cd $HOME \ - && curl -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \ - && tar -xf $HOME/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \ - && cd aws-efa-installer \ - && ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \ - && rm -rf $HOME/aws-efa-installer - -################################################### -## Install NCCL -RUN git clone -b ${NCCL_VERSION} https://github.com/NVIDIA/nccl.git /opt/nccl \ - && cd /opt/nccl \ - && make -j $(nproc) src.build CUDA_HOME=/usr/local/cuda \ - NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_89,code=sm_89 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_100,code=sm_100" - -################################################### -## Install AWS-OFI-NCCL plugin -RUN DEBIAN_FRONTEND=noninteractive apt-get install -y libhwloc-dev -#Switch from sh to bash to allow parameter expansion -SHELL ["/bin/bash", "-c"] -RUN curl -OL https://github.com/aws/aws-ofi-nccl/releases/download/${AWS_OFI_NCCL_VERSION}/aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v}.tar.gz \ - && tar -xf aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v}.tar.gz \ - && cd aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v} \ - && ./configure --prefix=/opt/aws-ofi-nccl/install \ - --with-mpi=/opt/amazon/openmpi \ - --with-libfabric=/opt/amazon/efa \ - --with-cuda=/usr/local/cuda \ - --enable-platform-aws \ - && make -j $(nproc) \ - && make install \ - && cd .. \ - && rm -rf aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v} \ - && rm aws-ofi-nccl-${AWS_OFI_NCCL_VERSION//v}.tar.gz - -SHELL ["/bin/sh", "-c"] - -################################################### -## Install NCCL-tests -RUN git clone -b ${NCCL_TESTS_VERSION} https://github.com/NVIDIA/nccl-tests.git /opt/nccl-tests \ - && cd /opt/nccl-tests \ - && make -j $(nproc) \ - MPI=1 \ - MPI_HOME=/opt/amazon/openmpi/ \ - CUDA_HOME=/usr/local/cuda \ - NCCL_HOME=/opt/nccl/build \ - NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_89,code=sm_89 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_100,code=sm_100" - -RUN rm -rf /var/lib/apt/lists/* - -## Set Open MPI variables to exclude network interface and conduit. -ENV OMPI_MCA_pml=^ucx \ - OMPI_MCA_btl=tcp,self \ - OMPI_MCA_btl_tcp_if_exclude=lo,docker0,veth_def_agent\ - OPAL_PREFIX=/opt/amazon/openmpi \ - NCCL_SOCKET_IFNAME=^docker,lo,veth - -## Turn off PMIx Error https://github.com/open-mpi/ompi/issues/7516 -ENV PMIX_MCA_gds=hash - -## Set LD_PRELOAD for NCCL library -ENV LD_PRELOAD /opt/nccl/build/lib/libnccl.so