|
19 | 19 | # # Load image to local docker registry -> on head node, or new compute/build node. |
20 | 20 | # docker load < /fsx/nvidia-pt-od__latest.tar |
21 | 21 | #################################################################################################### |
22 | | -FROM nvcr.io/nvidia/pytorch:23.12-py3 |
| 22 | +# Check https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/index.html for the base image contents |
| 23 | +# 24.06 comes with NCCL 2.21.5 |
| 24 | +FROM nvcr.io/nvidia/pytorch:24.06-py3 |
23 | 25 | ENV DEBIAN_FRONTEND=noninteractive |
24 | 26 |
|
25 | 27 | # The three must-be-built packages. |
26 | 28 | # Efa-installer>=1.29.1 required for nccl>=2.19.0 to avoid libfabric NCCL error. |
27 | | -ENV EFA_INSTALLER_VERSION=1.30.0 |
28 | | -ENV AWS_OFI_NCCL_VERSION=1.8.1-aws |
| 29 | +ENV EFA_INSTALLER_VERSION=1.33.0 |
| 30 | +ENV AWS_OFI_NCCL_VERSION=v1.9.2-aws |
| 31 | +ENV NCCL_VERSION=v2.21.5-1 |
29 | 32 | ENV NCCL_TESTS_VERSION=master |
30 | 33 |
|
31 | 34 | ## Uncomment below when this Dockerfile builds a container image with efa-installer<1.29.1 and |
@@ -111,16 +114,15 @@ ENV PATH=/opt/amazon/efa/bin:/opt/amazon/openmpi/bin:$PATH |
111 | 114 | # NCCL EFA plugin (aws-ofi-nccl) depends on mpi, hence we must rebuild openmpi before building the |
112 | 115 | # aws-ofi-nccl. |
113 | 116 | #################################################################################################### |
114 | | -ENV NCCL_VERSION=2.19.3-1 |
115 | | -RUN apt-get remove -y libnccl2 libnccl-dev \ |
116 | | - && cd /tmp \ |
117 | | - && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \ |
118 | | - && cd nccl \ |
119 | | - && make -j src.build BUILDDIR=/usr \ |
120 | | - # Build for p4 & p5. |
121 | | - NVCC_GENCODE="-gencode=arch=compute_90,code=sm_90, -gencode=arch=compute_80,code=sm_80" \ |
122 | | - && rm -rf /tmp/nccl \ |
123 | | - && echo NCCL_SOCKET_IFNAME=^docker0,lo >> /etc/nccl.conf |
| 117 | +#RUN apt-get remove -y libnccl2 libnccl-dev \ |
| 118 | +# && cd /tmp \ |
| 119 | +# && git clone https://github.com/NVIDIA/nccl.git -b ${NCCL_VERSION} \ |
| 120 | +# && cd nccl \ |
| 121 | +# && make -j src.build BUILDDIR=/usr \ |
| 122 | +# # Build for p4 & p5. |
| 123 | +# NVCC_GENCODE="-gencode=arch=compute_90,code=sm_90, -gencode=arch=compute_80,code=sm_80" \ |
| 124 | +# && rm -rf /tmp/nccl \ |
| 125 | +# && echo NCCL_SOCKET_IFNAME=^docker0,lo >> /etc/nccl.conf |
124 | 126 |
|
125 | 127 |
|
126 | 128 | #################################################################################################### |
@@ -180,7 +182,7 @@ RUN rm -fr ${OPEN_MPI_PATH} \ |
180 | 182 | # NCCL EFA Plugin |
181 | 183 | RUN mkdir -p /tmp && \ |
182 | 184 | cd /tmp && \ |
183 | | - curl -LO https://github.com/aws/aws-ofi-nccl/archive/refs/tags/v${AWS_OFI_NCCL_VERSION}.tar.gz && \ |
| 185 | + curl -LO https://github.com/aws/aws-ofi-nccl/archive/refs/tags/${AWS_OFI_NCCL_VERSION}.tar.gz && \ |
184 | 186 | tar -xzf /tmp/${AWS_OFI_NCCL_VERSION}.tar.gz && \ |
185 | 187 | rm /tmp/${AWS_OFI_NCCL_VERSION}.tar.gz && \ |
186 | 188 | mv aws-ofi-nccl-${AWS_OFI_NCCL_VERSION#v} aws-ofi-nccl && \ |
@@ -228,14 +230,14 @@ RUN git clone https://github.com/NVIDIA/nccl-tests.git /opt/nccl-tests \ |
228 | 230 | # its own pt + cuda. |
229 | 231 | # |
230 | 232 | # Pre-requisite: build node has enough memory to compile xformers. More info on the stanza. |
231 | | -RUN export TORCH_CUDA_ARCH_LIST="8.0;9.0+PTX" && \ |
232 | | - # On p4de.24xlarge: |
233 | | - # - MAX_JOBS=16 => 145GB memory |
234 | | - # - MAX_JOBS=32 => 241GB memory |
235 | | - # - MAX_JOBS=48 => 243GB memory, 542.5s |
236 | | - # |
237 | | - # NOTE: must export MAX_JOBS. For some reason, `MAX_JOBS=16 pip install ...` doesn't seem to |
238 | | - # work to prevent OOM. |
239 | | - export MAX_JOBS=32 && \ |
240 | | - export NVCC_PREPEND_FLAGS="-t 32" && \ |
241 | | - pip install -v -U git+https://github.com/facebookresearch/xformers.git@main#egg=xformers |
| 233 | +#RUN export TORCH_CUDA_ARCH_LIST="8.0;9.0+PTX" && \ |
| 234 | +# # On p4de.24xlarge: |
| 235 | +# # - MAX_JOBS=16 => 145GB memory |
| 236 | +# # - MAX_JOBS=32 => 241GB memory |
| 237 | +# # - MAX_JOBS=48 => 243GB memory, 542.5s |
| 238 | +# # |
| 239 | +# # NOTE: must export MAX_JOBS. For some reason, `MAX_JOBS=16 pip install ...` doesn't seem to |
| 240 | +# # work to prevent OOM. |
| 241 | +# export MAX_JOBS=32 && \ |
| 242 | +# export NVCC_PREPEND_FLAGS="-t 32" && \ |
| 243 | +# pip install -v -U git+https://github.com/facebookresearch/xformers.git@main#egg=xformers |
0 commit comments