|
| 1 | +# Name of the stage the final image is built from: "prod" (pinned Neuron versions) or "repo" (unpinned) |
| 2 | +ARG BUILD_STAGE=prod |
| 3 | +FROM public.ecr.aws/docker/library/ubuntu:24.04 AS base |
| 4 | + |
| 5 | +LABEL dlc_major_version="1" \ |
| 6 | + maintainer="Amazon AI" |
| 7 | + |
| 8 | +# Build-time settings. DEBIAN_FRONTEND keeps apt non-interactive so the tzdata region prompt on Ubuntu 24 cannot stall the build |
| 9 | +ARG DEBIAN_FRONTEND=noninteractive |
| 10 | +ARG PIP=pip3 |
| 11 | +ARG PYTHON=python3.12 |
| 12 | +ARG PYTHON_VERSION=3.12.11 |
| 13 | +ARG PYPI_SIMPLE_URL="https://pypi.org/simple/" |
| 14 | +ARG OMPI_VERSION=4.1.5 |
| 15 | + |
| 16 | +# Python runtime behaviour: do not write .pyc/.pyo files on import, keep stdin/stdout/stderr |
| 17 | +# unbuffered (good for logging), and use UTF-8 for Python I/O and the locale. |
| 18 | +ENV PYTHONDONTWRITEBYTECODE=1 |
| 19 | +ENV PYTHONUNBUFFERED=1 |
| 20 | +ENV PYTHONIOENCODING=UTF-8 |
| 21 | +ENV LANG=C.UTF-8 |
| 22 | +ENV LC_ALL=C.UTF-8 |
| 23 | +# LD_LIBRARY_PATH starts from a literal entry: the ubuntu base image does not define it, so prefixing |
| 24 | +# "${LD_LIBRARY_PATH}:" would leave an empty first element, which ld.so treats as the current directory. |
| 25 | +ENV LD_LIBRARY_PATH="/opt/aws/neuron/lib:/opt/amazon/efa/lib" |
| 26 | +ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/efa/lib64:/opt/amazon/openmpi/lib64" |
| 27 | +ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib" |
| 28 | +ENV PATH="/opt/aws/neuron/bin:${PATH}" |
| 29 | + |
| 30 | +# OS packages for the base stage: compiler toolchain, development headers, JDKs, editors and CLI utilities (alphabetical) |
| 31 | +RUN apt-get update && apt-get upgrade -y \ |
| 32 | + && apt-get install -y --no-install-recommends \ |
| 33 | + build-essential \ |
| 34 | + ca-certificates \ |
| 35 | + cmake \ |
| 36 | + curl \ |
| 37 | + emacs \ |
| 38 | + git \ |
| 39 | + gnupg2 \ |
| 40 | + gpg-agent \ |
| 41 | + jq \ |
| 42 | + libbz2-dev \ |
| 43 | + libc6-dev \ |
| 44 | + libcap-dev \ |
| 45 | + libffi-dev \ |
| 46 | + libgdbm-dev \ |
| 47 | + libgl1-mesa-dri \ |
| 48 | + libglib2.0-0 \ |
| 49 | + libhwloc-dev \ |
| 50 | + libncurses-dev \ |
| 51 | + libopencv-dev \ |
| 52 | + libsm6 \ |
| 53 | + libsqlite3-dev \ |
| 54 | + libssl-dev \ |
| 55 | + libxext6 \ |
| 56 | + libxrender-dev \ |
| 57 | + openjdk-11-jdk \ |
| 58 | + openjdk-8-jdk \ |
| 59 | + openjdk-8-jdk-headless \ |
| 60 | + openjdk-8-jre \ |
| 61 | + openssl \ |
| 62 | + software-properties-common \ |
| 63 | + tk-dev \ |
| 64 | + unzip \ |
| 65 | + vim \ |
| 66 | + wget \ |
| 67 | + zlib1g-dev \ |
| 68 | + && apt-get clean \ |
| 69 | + && rm -rf /var/lib/apt/lists/* \ |
| 70 | + && rm -rf /tmp/tmp* |
| 71 | + |
| 72 | +# Build Open MPI ${OMPI_VERSION} from the upstream tarball and install it, then drop the build tree. |
| 73 | +# The download URL is fixed to the v4.1 series directory, so OMPI_VERSION has to be a 4.1.x release. |
| 74 | +RUN mkdir -p /tmp/openmpi && cd /tmp/openmpi \ |
| 75 | + && wget -q https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OMPI_VERSION}.tar.gz \ |
| 76 | + && tar -xzf openmpi-${OMPI_VERSION}.tar.gz \ |
| 77 | + && cd openmpi-${OMPI_VERSION} \ |
| 78 | + && ./configure --enable-orterun-prefix-by-default \ |
| 79 | + && make -j $(nproc) all \ |
| 80 | + && make install \ |
| 81 | + && ldconfig \ |
| 82 | + && rm -rf /tmp/openmpi |
| 83 | + |
| 84 | +# SSH setup for the MPI operator in k8s: install openmpi-bin and openssh-server, make the ssh client |
| 85 | +# skip host-key checking and discard known hosts, and turn StrictModes off in sshd_config. |
| 86 | +RUN apt-get update \ |
| 87 | + && apt-get install -y openmpi-bin openssh-server \ |
| 88 | + && mkdir -p /var/run/sshd \ |
| 89 | + && printf ' UserKnownHostsFile /dev/null\n StrictHostKeyChecking no\n' >> /etc/ssh/ssh_config \ |
| 90 | + && sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config \ |
| 91 | + && apt-get clean \ |
| 92 | + && rm -rf /var/lib/apt/lists/* \ |
| 93 | + && rm -rf /tmp/tmp* |
| 94 | + |
| 95 | +# Build CPython ${PYTHON_VERSION} from source into /usr/local, add pip/python symlinks and upgrade the base pip tooling. |
| 96 | +# The source tree and tarball are removed relative to the download directory, so cleanup does not depend on the cwd being /. |
| 97 | +RUN wget -q https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz \ |
| 98 | + && tar -xzf Python-$PYTHON_VERSION.tgz \ |
| 99 | + && cd Python-$PYTHON_VERSION \ |
| 100 | + && ./configure --enable-shared --prefix=/usr/local \ |
| 101 | + && make -j $(nproc) && make install \ |
| 102 | + && cd .. && rm -rf Python-$PYTHON_VERSION* \ |
| 103 | + && ln -s /usr/local/bin/pip3 /usr/bin/pip && ln -s /usr/local/bin/$PYTHON /usr/local/bin/python \ |
| 104 | + && ${PIP} --no-cache-dir install --upgrade \ |
| 105 | + "awscli<2" \ |
| 106 | + pip \ |
| 107 | + requests \ |
| 108 | + setuptools \ |
| 109 | + && rm -rf ~/.cache/pip/* |
| 110 | + |
| 111 | +# Ubuntu 24.04 ships Python 3.12, where pip refuses to install into the interpreter's own |
| 112 | +# directories outside a virtual environment (PEP 668). Setting break-system-packages in the |
| 113 | +# global pip config lifts that restriction for the pip installs that follow. |
| 114 | +# https://peps.python.org/pep-0668/ |
| 115 | +RUN $PIP config set global.break-system-packages true |
| 116 | + |
| 117 | +# Install EFA: download the installer, check its GPG signature, run it, then delete the downloaded installer files so they do not stay in the layer |
| 118 | +RUN apt-get update \ |
| 119 | + && cd $HOME \ |
| 120 | + && curl -O https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz \ |
| 121 | + && wget https://efa-installer.amazonaws.com/aws-efa-installer.key && gpg --import aws-efa-installer.key \ |
| 122 | + && cat aws-efa-installer.key | gpg --fingerprint \ |
| 123 | + && wget https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz.sig && gpg --verify ./aws-efa-installer-latest.tar.gz.sig \ |
| 124 | + && tar -xf aws-efa-installer-latest.tar.gz \ |
| 125 | + && cd aws-efa-installer \ |
| 126 | + && ./efa_installer.sh -y -g --skip-kmod --skip-limit-conf --no-verify \ |
| 127 | + && cd $HOME && rm -rf aws-efa-installer* \ |
| 128 | + && rm -rf /var/lib/apt/lists/* \ |
| 129 | + && rm -rf /tmp/tmp* \ |
| 130 | + && apt-get clean |
| 131 | + |
| 132 | +WORKDIR / |
| 133 | + |
| 134 | +# These search-path additions are declared after the install steps on purpose: grouping them |
| 135 | +# with the ENV block at the top makes ompi_info fail (observed only in CPU containers). |
| 136 | +# The ompi_info | grep step fails the build unless the mpi_built_with_cuda_support key is reported. |
| 137 | +ENV PATH="${PATH}:/home/.openmpi/bin" \ |
| 138 | + LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/home/.openmpi/lib/" |
| 139 | +RUN ompi_info --parsable --all | grep mpi_built_with_cuda_support:value |
| 140 | + |
| 141 | +RUN mkdir -p /etc/pki/tls/certs \ |
| 142 | + && cp /etc/ssl/certs/ca-certificates.crt /etc/pki/tls/certs/ca-bundle.crt |
| 143 | +# Hostname workaround: C source goes to /; the launcher script and deep_learning_container.py go to /usr/local/bin with mode 755 |
| 144 | +COPY changehostname.c / |
| 145 | +COPY --chmod=755 start_with_right_hostname.sh deep_learning_container.py /usr/local/bin/ |
| 146 | + |
| 147 | +# Fetch the OSS compliance bundle into /root, install testOSSCompliance, run the generator script for ${PYTHON}, then delete the bundle |
| 148 | +RUN oss_home=/root \ |
| 149 | + && curl -o ${oss_home}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \ |
| 150 | + && unzip ${oss_home}/oss_compliance.zip -d ${oss_home}/ \ |
| 151 | + && cp ${oss_home}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \ |
| 152 | + && chmod +x /usr/local/bin/testOSSCompliance ${oss_home}/oss_compliance/generate_oss_compliance.sh \ |
| 153 | + && ${oss_home}/oss_compliance/generate_oss_compliance.sh ${oss_home} ${PYTHON} \ |
| 154 | + && rm -rf ${oss_home}/oss_compliance* \ |
| 155 | + && rm -rf /tmp/tmp* |
| 156 | + |
| 157 | +# Register the Neuron apt repo (URL gets a "<key>@" prefix and curl gets -u when NEURON_APT_REPO_KEY is set) and store its dearmored signing key. NOTE(review): the suite is "jammy" on a 24.04 base and the key is written into neuron.list - confirm both are intended |
| 158 | +ARG NEURON_APT_REPO=apt.repos.neuron.amazonaws.com |
| 159 | +ARG NEURON_APT_REPO_KEY |
| 160 | +ARG NEURON_PIP_REPO=pip.repos.neuron.amazonaws.com |
| 161 | +ARG NEURON_PIP_REPO_KEY |
| 162 | +RUN mkdir -p /etc/apt/keyrings \ |
| 163 | + && APT_REPO_PREFIX=$([ -n "${NEURON_APT_REPO_KEY}" ] && echo "${NEURON_APT_REPO_KEY}@" || echo "") \ |
| 164 | + && echo "deb [signed-by=/etc/apt/keyrings/neuron.gpg] https://${APT_REPO_PREFIX}${NEURON_APT_REPO} jammy main" > /etc/apt/sources.list.d/neuron.list \ |
| 165 | + && curl $([ -n "${NEURON_APT_REPO_KEY}" ] && echo "-u ${NEURON_APT_REPO_KEY}") --retry 3 --retry-delay 1 --retry-all-errors -fSL "https://${NEURON_APT_REPO}/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB" | gpg --dearmor > /etc/apt/keyrings/neuron.gpg |
| 166 | + |
| 167 | +# Neuron SDK component versions; the prod stage pins its apt and pip installs to these values (the repo stage does not use them) |
| 168 | +ARG NEURONX_RUNTIME_LIB_VERSION=2.28.23.0-dd5879008 |
| 169 | +ARG NEURONX_COLLECTIVES_LIB_VERSION=2.28.27.0-bc30ece58 |
| 170 | +ARG NEURONX_TOOLS_VERSION=2.26.14.0 |
| 171 | +ARG NEURONX_CC_VERSION=2.21.33363.0+82129205 |
| 172 | +ARG NEURONX_JAX_TRAINING_VERSION=0.6.2.1.0.6446+d8c0de77 |
| 173 | + |
| 174 | +FROM base AS repo |
| 175 | + |
| 176 | +# "repo" stage: unpinned Neuron apt packages, then unpinned neuronx-cc (>=2.0) and jax-neuronx from the Neuron pip index (URL carries NEURON_PIP_REPO_KEY when set) |
| 177 | +RUN apt-get update \ |
| 178 | + && apt-get install -y \ |
| 179 | + aws-neuronx-collectives \ |
| 180 | + aws-neuronx-runtime-lib \ |
| 181 | + aws-neuronx-tools \ |
| 182 | + && apt-get clean \ |
| 183 | + && rm -rf /var/lib/apt/lists/* \ |
| 184 | + && rm -rf /tmp/tmp* |
| 185 | + |
| 186 | +RUN neuron_index="https://${NEURON_PIP_REPO_KEY:+${NEURON_PIP_REPO_KEY}@}${NEURON_PIP_REPO}" \ |
| 187 | + && ${PIP} install --no-cache-dir --force-reinstall \ |
| 188 | + --index-url ${neuron_index} \ |
| 189 | + --extra-index-url ${PYPI_SIMPLE_URL} \ |
| 190 | + --trusted-host ${NEURON_PIP_REPO} \ |
| 191 | + "neuronx-cc>=2.0" \ |
| 192 | + jax-neuronx \ |
| 193 | + && rm -rf ~/.cache/pip/* |
| 194 | + |
| 195 | +FROM base AS prod |
| 196 | + |
| 197 | +# "prod" stage: Neuron apt packages (collectives, runtime library, tools) pinned to the |
| 198 | +# NEURONX_* versions declared in the base stage |
| 199 | +RUN apt-get update \ |
| 200 | + && apt-get install -y \ |
| 201 | + aws-neuronx-collectives=${NEURONX_COLLECTIVES_LIB_VERSION} \ |
| 202 | + aws-neuronx-runtime-lib=${NEURONX_RUNTIME_LIB_VERSION} \ |
| 203 | + aws-neuronx-tools=${NEURONX_TOOLS_VERSION} \ |
| 204 | + && apt-get clean \ |
| 205 | + && rm -rf /var/lib/apt/lists/* \ |
| 206 | + && rm -rf /tmp/tmp* |
| 207 | + |
| 208 | +# Pinned neuronx-cc and jax-neuronx; the Neuron index URL embeds NEURON_PIP_REPO_KEY when it is set |
| 209 | +RUN neuron_index="https://${NEURON_PIP_REPO_KEY:+${NEURON_PIP_REPO_KEY}@}${NEURON_PIP_REPO}" \ |
| 210 | + && ${PIP} install --no-cache-dir --force-reinstall \ |
| 211 | + --index-url ${neuron_index} \ |
| 212 | + --extra-index-url ${PYPI_SIMPLE_URL} \ |
| 213 | + --trusted-host ${NEURON_PIP_REPO} \ |
| 214 | + neuronx-cc==${NEURONX_CC_VERSION} \ |
| 215 | + jax-neuronx==${NEURONX_JAX_TRAINING_VERSION} \ |
| 216 | + && rm -rf ~/.cache/pip/* |
| 217 | + |
| 218 | +FROM ${BUILD_STAGE} AS final |
| 219 | + |
| 220 | +# Entry point: bash -m (monitor mode) runs start_with_right_hostname.sh, which the base stage copies to /usr/local/bin; CMD supplies the default argument |
| 221 | +ENTRYPOINT ["bash", "-m", "start_with_right_hostname.sh"] |
| 222 | +CMD ["/bin/bash"] |
| 223 | +# NOTE(review): the probe targets http://localhost:8080/ping, but nothing in this file starts a server on port 8080 - confirm it is meaningful for this image |
| 224 | +HEALTHCHECK CMD curl --fail http://localhost:8080/ping || exit 1 |
0 commit comments