Skip to content

Commit c5767a3

Browse files
committed
Updated Dockerfiles based on build artifacts uploaded to the Neuron DLC Builder prod account
1 parent d247f35 commit c5767a3

File tree

6 files changed

+1046
-31
lines changed

6 files changed

+1046
-31
lines changed

common/nxdt_requirements.txt

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ hydra-core>=1.3.0
22
omegaconf>=2.2,<2.3
33
pyyaml==6.0.1
44
torchmetrics>=0.4.1rc0,<=0.10.3
5-
transformers==4.52.4
5+
transformers==4.56.*
66
wandb
77
webdataset>=0.1.48,<=0.1.62
88
pandas
@@ -22,7 +22,7 @@ ftfy
2222
gdown
2323
inflect
2424
jieba
25-
opencc==1.1.6
25+
opencc==1.1.9
2626
pangu
2727
rapidfuzz
2828
pybind11
@@ -39,7 +39,6 @@ python-daemon
3939
huggingface_hub>=0.27.1
4040
multiprocess==0.70.16
4141
numba<=0.60.0
42-
numpy>=1.24.3,<=1.25.2
4342
rouge_score
4443
setuptools>=70.0
4544
lightning==2.5.0
Lines changed: 224 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,224 @@
1+
# Name of the stage that the `final` stage is built FROM (see `FROM ${BUILD_STAGE} AS final`).
ARG BUILD_STAGE=prod
2+
3+
FROM public.ecr.aws/docker/library/ubuntu:24.04 AS base
4+
5+
LABEL dlc_major_version="1"
6+
LABEL maintainer="Amazon AI"
7+
8+
# This ARG is required to stop docker build from waiting for region configuration while installing tzdata on Ubuntu 24
9+
ARG DEBIAN_FRONTEND=noninteractive
10+
# Name of the Python executable; symlinked to /usr/local/bin/python after the source build
ARG PYTHON=python3.12
11+
# CPython release that is downloaded from python.org and built from source
ARG PYTHON_VERSION=3.12.11
12+
# pip executable used for the pip commands in this file
ARG PIP=pip3
13+
# Open MPI release that is downloaded and built from source
ARG OMPI_VERSION=4.1.5
14+
# Passed to pip as --extra-index-url when installing the Neuron Python packages
ARG PYPI_SIMPLE_URL="https://pypi.org/simple/"
15+
16+
# PYTHONDONTWRITEBYTECODE: Python won't try to write .pyc or .pyo files on the import of source modules
17+
# PYTHONUNBUFFERED: force stdout and stderr to be unbuffered (stdin is not affected). Good for logging
18+
ENV PYTHONDONTWRITEBYTECODE=1
19+
ENV PYTHONUNBUFFERED=1
20+
ENV PYTHONIOENCODING=UTF-8
21+
ENV LANG=C.UTF-8
22+
ENV LC_ALL=C.UTF-8
23+
# Append the Neuron, EFA, Open MPI and /usr/local library directories to LD_LIBRARY_PATH, in this order
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/aws/neuron/lib"
24+
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/efa/lib"
25+
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/efa/lib64"
26+
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/openmpi/lib64"
27+
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib"
28+
# Put the Neuron binaries directory first on PATH
ENV PATH="/opt/aws/neuron/bin:${PATH}"
29+
30+
# Refresh the package index, upgrade the packages shipped with the base image, and install
# the build toolchain, development headers, JDKs and command-line utilities used by later steps.
# The apt lists and /tmp/tmp* leftovers are removed in the same layer.
RUN apt-get update \
31+
&& apt-get upgrade -y \
32+
&& apt-get install -y --no-install-recommends \
33+
build-essential \
34+
ca-certificates \
35+
cmake \
36+
curl \
37+
emacs \
38+
git \
39+
gnupg2 \
40+
gpg-agent \
41+
jq \
42+
libopencv-dev \
43+
libglib2.0-0 \
44+
libgl1-mesa-dri \
45+
libsm6 \
46+
libxext6 \
47+
libxrender-dev \
48+
libssl-dev \
49+
libsqlite3-dev \
50+
libgdbm-dev \
51+
libc6-dev \
52+
libbz2-dev \
53+
libncurses-dev \
54+
libffi-dev \
55+
libcap-dev \
56+
libhwloc-dev \
57+
openjdk-8-jdk-headless \
58+
openjdk-8-jdk \
59+
openjdk-8-jre \
60+
openjdk-11-jdk \
61+
openssl \
62+
software-properties-common \
63+
tk-dev \
64+
unzip \
65+
wget \
66+
vim \
67+
zlib1g-dev \
68+
&& rm -rf /var/lib/apt/lists/* \
69+
&& rm -rf /tmp/tmp* \
70+
&& apt-get clean
71+
72+
# Build and install Open MPI ${OMPI_VERSION} from the upstream source tarball.
# The build happens in /tmp/openmpi, which is removed at the end of this layer;
# ldconfig refreshes the shared-library cache after `make install`.
# (SSH for the MPI operator is configured in a separate RUN step.)
73+
RUN mkdir -p /tmp/openmpi \
74+
&& cd /tmp/openmpi \
75+
&& wget --quiet https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OMPI_VERSION}.tar.gz \
76+
&& tar zxf openmpi-${OMPI_VERSION}.tar.gz \
77+
&& cd openmpi-${OMPI_VERSION} \
78+
&& ./configure --enable-orterun-prefix-by-default \
79+
&& make -j $(nproc) all \
80+
&& make install \
81+
&& ldconfig \
82+
&& rm -rf /tmp/openmpi
83+
84+
# Install packages and configure SSH for MPI operator in k8s:
#   - installs openmpi-bin and openssh-server and creates /var/run/sshd
#   - appends "UserKnownHostsFile /dev/null" and "StrictHostKeyChecking no" to the SSH client config
#   - rewrites the commented-out StrictModes line in sshd_config to "StrictModes no"
85+
RUN apt-get update \
86+
&& apt-get install -y openmpi-bin openssh-server \
87+
&& mkdir -p /var/run/sshd \
88+
&& echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config \
89+
&& echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config \
90+
&& sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config \
91+
&& rm -rf /var/lib/apt/lists/* \
92+
&& rm -rf /tmp/tmp* \
93+
&& apt-get clean
94+
95+
# Install Python: download CPython ${PYTHON_VERSION} from python.org, build it as a shared
# library under /usr/local, then create the `pip` and `python` symlinks and upgrade the
# base Python tooling (awscli v1, pip, requests, setuptools).
# The tarball and source tree are removed relative to the directory they were downloaded
# into (after `cd ..`), not relative to its parent, so cleanup does not depend on the
# build directory being `/`.
96+
RUN wget -q https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz \
97+
&& tar -xzf Python-$PYTHON_VERSION.tgz \
98+
&& cd Python-$PYTHON_VERSION \
99+
&& ./configure --enable-shared --prefix=/usr/local \
100+
&& make -j $(nproc) && make install \
101+
&& cd .. && rm -rf Python-$PYTHON_VERSION* \
102+
&& ln -s /usr/local/bin/pip3 /usr/bin/pip \
103+
&& ln -s /usr/local/bin/$PYTHON /usr/local/bin/python \
104+
&& ${PIP} --no-cache-dir install --upgrade \
105+
"awscli<2" \
106+
pip \
107+
requests \
108+
setuptools \
109+
&& rm -rf ~/.cache/pip/*
110+
111+
# U24 will not allow installation of pip packages outside of a venv without this flag.
112+
# This is because U24 ships with Python 3.12 by default, and installation into the Python
113+
# interpreter's directory is disabled outside of a virtual environment.
114+
# https://peps.python.org/pep-0668/
115+
RUN ${PIP} config set global.break-system-packages true
116+
117+
# Install EFA: download the latest EFA installer into $HOME, import the installer's public
# key, verify the tarball's detached signature, then run the installer with kernel-module
# and limits configuration skipped.
# The downloaded tarball, signature, key and extracted installer tree are deleted in this
# same layer so they are not baked into the image.
118+
RUN apt-get update \
119+
&& cd $HOME \
120+
&& curl -O https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz \
121+
&& wget https://efa-installer.amazonaws.com/aws-efa-installer.key && gpg --import aws-efa-installer.key \
122+
&& cat aws-efa-installer.key | gpg --fingerprint \
123+
&& wget https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz.sig && gpg --verify ./aws-efa-installer-latest.tar.gz.sig \
124+
&& tar -xf aws-efa-installer-latest.tar.gz \
125+
&& cd aws-efa-installer \
126+
&& ./efa_installer.sh -y -g --skip-kmod --skip-limit-conf --no-verify \
127+
&& cd $HOME \
&& rm -rf $HOME/aws-efa-installer* \
128+
&& rm -rf /var/lib/apt/lists/* \
129+
&& rm -rf /tmp/tmp* \
130+
&& apt-get clean
131+
132+
WORKDIR /
133+
134+
# The ENV variables declared below are changed in the previous section.
135+
# Grouping these ENV variables in the first section causes
136+
# ompi_info to fail. This is only observed in CPU containers.
137+
ENV PATH="$PATH:/home/.openmpi/bin"
138+
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/.openmpi/lib/"
139+
# Print the mpi_built_with_cuda_support value reported by ompi_info. The step's exit status
# is grep's, so the build fails here if ompi_info produces no such line.
RUN ompi_info --parsable --all | grep mpi_built_with_cuda_support:value
140+
141+
# Make the system CA bundle also available at /etc/pki/tls/certs/ca-bundle.crt
# (presumably for tools that look for certificates at that path — confirm with consumers).
RUN mkdir -p /etc/pki/tls/certs && cp /etc/ssl/certs/ca-certificates.crt /etc/pki/tls/certs/ca-bundle.crt
142+
143+
# Copy workaround script for incorrect hostname:
# changehostname.c goes to /, while start_with_right_hostname.sh and
# deep_learning_container.py go to /usr/local/bin with mode 755.
144+
COPY changehostname.c /
145+
COPY --chmod=755 start_with_right_hostname.sh deep_learning_container.py /usr/local/bin/
146+
147+
# OSS compliance: download and unpack the oss_compliance bundle into /root, install
# testOSSCompliance into /usr/local/bin, run generate_oss_compliance.sh with the home
# directory and the Python executable name, then delete the bundle in the same layer.
RUN HOME_DIR=/root \
148+
&& curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
149+
&& unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
150+
&& cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
151+
&& chmod +x /usr/local/bin/testOSSCompliance \
152+
&& chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
153+
&& ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
154+
&& rm -rf ${HOME_DIR}/oss_compliance* \
155+
&& rm -rf /tmp/tmp*
156+
157+
# Set up the APT and PIP repos for Neuron artifacts.
# NEURON_APT_REPO_KEY / NEURON_PIP_REPO_KEY are optional; when set, they are embedded as
# "<key>@" in the repo URL (and passed to curl via -u for the APT key download).
# The RUN step writes /etc/apt/sources.list.d/neuron.list (signed-by the Neuron keyring)
# and stores the dearmored Neuron public key in /etc/apt/keyrings/neuron.gpg.
# NOTE(review): the APT suite is hard-coded to "jammy" although the base image is
# ubuntu:24.04 — confirm this is intentional.
# NOTE(review): the key download is a curl | gpg pipe without pipefail, so the step's exit
# status is gpg's — confirm a failed download cannot go unnoticed.
158+
ARG NEURON_APT_REPO=apt.repos.neuron.amazonaws.com
159+
ARG NEURON_APT_REPO_KEY
160+
ARG NEURON_PIP_REPO=pip.repos.neuron.amazonaws.com
161+
ARG NEURON_PIP_REPO_KEY
162+
RUN mkdir -p /etc/apt/keyrings \
163+
&& APT_REPO_PREFIX=$([ -n "${NEURON_APT_REPO_KEY}" ] && echo "${NEURON_APT_REPO_KEY}@" || echo "") \
164+
&& echo "deb [signed-by=/etc/apt/keyrings/neuron.gpg] https://${APT_REPO_PREFIX}${NEURON_APT_REPO} jammy main" > /etc/apt/sources.list.d/neuron.list \
165+
&& curl $([ -n "${NEURON_APT_REPO_KEY}" ] && echo "-u ${NEURON_APT_REPO_KEY}") --retry 3 --retry-delay 1 --retry-all-errors -fSL "https://${NEURON_APT_REPO}/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB" | gpg --dearmor > /etc/apt/keyrings/neuron.gpg
166+
167+
# Neuron SDK component version numbers.
# These are declared in the `base` stage and referenced by the pinned installs in the
# `prod` stage; the `repo` stage installs the same packages without version pins.
168+
ARG NEURONX_RUNTIME_LIB_VERSION=2.28.23.0-dd5879008
169+
ARG NEURONX_COLLECTIVES_LIB_VERSION=2.28.27.0-bc30ece58
170+
ARG NEURONX_TOOLS_VERSION=2.26.14.0
171+
ARG NEURONX_CC_VERSION=2.21.33363.0+82129205
172+
ARG NEURONX_JAX_TRAINING_VERSION=0.6.2.1.0.6446+d8c0de77
173+
174+
# `repo` stage: built on `base`, installs the Neuron components without version pins.
FROM base AS repo
175+
176+
# Install Neuron components from the apt and pip repos (latest versions)
177+
RUN apt-get update \
178+
&& apt-get install -y \
179+
aws-neuronx-tools \
180+
aws-neuronx-collectives \
181+
aws-neuronx-runtime-lib \
182+
&& rm -rf /var/lib/apt/lists/* \
183+
&& rm -rf /tmp/tmp* \
184+
&& apt-get clean
185+
186+
# Install neuronx-cc (>=2.0) and jax-neuronx with the Neuron pip repo as the primary index
# and PYPI_SIMPLE_URL as the extra index. PIP_REPO_URL embeds NEURON_PIP_REPO_KEY when it is set.
RUN PIP_REPO_URL=$([ -n "${NEURON_PIP_REPO_KEY}" ] && echo "https://${NEURON_PIP_REPO_KEY}@${NEURON_PIP_REPO}" || echo "https://${NEURON_PIP_REPO}") \
187+
&& ${PIP} install --no-cache-dir --force-reinstall \
188+
--index-url ${PIP_REPO_URL} \
189+
--extra-index-url ${PYPI_SIMPLE_URL} \
190+
--trusted-host ${NEURON_PIP_REPO} \
191+
"neuronx-cc>=2.0" \
192+
jax-neuronx \
193+
&& rm -rf ~/.cache/pip/*
194+
195+
# `prod` stage: built on `base`, installs the Neuron components at the pinned NEURONX_* versions.
FROM base AS prod
196+
197+
# Install Neuron components
198+
# Install Neuron Tools, Collectives and Runtime library (pinned versions)
199+
RUN apt-get update \
200+
&& apt-get install -y \
201+
aws-neuronx-tools=$NEURONX_TOOLS_VERSION \
202+
aws-neuronx-collectives=$NEURONX_COLLECTIVES_LIB_VERSION \
203+
aws-neuronx-runtime-lib=$NEURONX_RUNTIME_LIB_VERSION \
204+
&& rm -rf /var/lib/apt/lists/* \
205+
&& rm -rf /tmp/tmp* \
206+
&& apt-get clean
207+
208+
# Install JAX & Neuron CC (pinned versions) with the Neuron pip repo as the primary index
# and PYPI_SIMPLE_URL as the extra index. PIP_REPO_URL embeds NEURON_PIP_REPO_KEY when it is set.
209+
RUN PIP_REPO_URL=$([ -n "${NEURON_PIP_REPO_KEY}" ] && echo "https://${NEURON_PIP_REPO_KEY}@${NEURON_PIP_REPO}" || echo "https://${NEURON_PIP_REPO}") \
210+
&& ${PIP} install --no-cache-dir --force-reinstall \
211+
--index-url ${PIP_REPO_URL} \
212+
--trusted-host ${NEURON_PIP_REPO} \
213+
--extra-index-url ${PYPI_SIMPLE_URL} \
214+
neuronx-cc==$NEURONX_CC_VERSION \
215+
jax-neuronx==$NEURONX_JAX_TRAINING_VERSION \
216+
&& rm -rf ~/.cache/pip/*
217+
218+
# `final` stage: built on whichever stage BUILD_STAGE names (default: prod).
FROM ${BUILD_STAGE} AS final
219+
220+
# Starts framework: the entrypoint runs start_with_right_hostname.sh through `bash -m`;
# CMD supplies /bin/bash as the default argument.
221+
ENTRYPOINT ["bash", "-m", "start_with_right_hostname.sh"]
222+
CMD ["/bin/bash"]
223+
224+
# Health probe: succeeds only if http://localhost:8080/ping responds without an HTTP error.
HEALTHCHECK CMD curl --fail http://localhost:8080/ping || exit 1

0 commit comments

Comments
 (0)