From 6deb6af5c87610bd7b602a8720315f6ce8531af3 Mon Sep 17 00:00:00 2001 From: Sally Seok Date: Mon, 28 Apr 2025 21:23:15 -0700 Subject: [PATCH 1/3] initial changes --- dlc_developer_config.toml | 12 +- pytorch/training/buildspec-arm64-2-7-ec2.yml | 72 ++++ pytorch/training/buildspec-arm64.yml | 2 +- .../docker/2.7/py3/cu128/Dockerfile.arm64.gpu | 308 ++++++++++++++++++ 4 files changed, 387 insertions(+), 7 deletions(-) create mode 100644 pytorch/training/buildspec-arm64-2-7-ec2.yml create mode 100644 pytorch/training/docker/2.7/py3/cu128/Dockerfile.arm64.gpu diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 20663bd025d4..02a62f194a61 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -15,7 +15,7 @@ neuronx_mode = false graviton_mode = false # Please only set it to true if you are preparing an ARM64 related PR # Do remember to revert it back to false before merging any PR (including ARM64 dedicated PR) -arm64_mode = false +arm64_mode = true # Please only set it to True if you are preparing a HABANA related PR # Do remember to revert it back to False before merging any PR (including HABANA dedicated PR) habana_mode = false @@ -37,12 +37,12 @@ deep_canary_mode = false [build] # Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image. # available frameworks - ["autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"] -build_frameworks = [] +build_frameworks = ["pytorch"] # By default we build both training and inference containers. Set true/false values to determine which to build. build_training = true -build_inference = true +build_inference = false # Set do_build to "false" to skip builds and test the latest image built by this PR # Note: at least one build is required to set do_build to "false" @@ -65,7 +65,7 @@ ecs_tests = true eks_tests = true ec2_tests = true # Set it to true if you are preparing a Benchmark related PR -ec2_benchmark_tests = false +ec2_benchmark_tests = true ### Set ec2_tests_on_heavy_instances = true to be able to run any EC2 tests that use large/expensive instance types by ### default. If false, these types of tests will be skipped while other tests will run as usual.
@@ -74,10 +74,10 @@ ec2_benchmark_tests = false ec2_tests_on_heavy_instances = false ### SM specific tests ### On by default -sagemaker_local_tests = true +sagemaker_local_tests = false # run standard sagemaker remote tests from test/sagemaker_tests -sagemaker_remote_tests = true +sagemaker_remote_tests = false # run efa sagemaker tests sagemaker_efa_tests = false # run release_candidate_integration tests diff --git a/pytorch/training/buildspec-arm64-2-7-ec2.yml b/pytorch/training/buildspec-arm64-2-7-ec2.yml new file mode 100644 index 000000000000..de5f2bce0568 --- /dev/null +++ b/pytorch/training/buildspec-arm64-2-7-ec2.yml @@ -0,0 +1,72 @@ +account_id: &ACCOUNT_ID +prod_account_id: &PROD_ACCOUNT_ID 763104351884 +region: &REGION +framework: &FRAMEWORK pytorch +version: &VERSION 2.7.0 +short_version: &SHORT_VERSION "2.7" +arch_type: arm64 +# autopatch_build: "True" + +repository_info: + training_repository: &TRAINING_REPOSITORY + image_type: &TRAINING_IMAGE_TYPE training + root: !join [ *FRAMEWORK, "/", *TRAINING_IMAGE_TYPE ] + repository_name: &REPOSITORY_NAME !join [ pr, "-", *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE, "-", arm64 ] + repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ] + release_repository_name: &RELEASE_REPOSITORY_NAME !join [ *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE, "-", arm64 ] + release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *RELEASE_REPOSITORY_NAME ] + +context: + training_context: &TRAINING_CONTEXT + start_cuda_compat: + source: docker/build_artifacts/start_cuda_compat.sh + target: start_cuda_compat.sh + dockerd_entrypoint: + source: docker/build_artifacts/dockerd_entrypoint.sh + target: dockerd_entrypoint.sh + changehostname: + source: docker/build_artifacts/changehostname.c + target: changehostname.c + start_with_right_hostname: + source: docker/build_artifacts/start_with_right_hostname.sh + target: start_with_right_hostname.sh + example_mnist_file: + source: docker/build_artifacts/mnist.py + target: mnist.py + deep_learning_container: + source: ../../src/deep_learning_container.py + target: deep_learning_container.py + +images: + # BuildEC2Arm64CPUPTTrainPy3DockerImage: + # <<: *TRAINING_REPOSITORY + # build: &PYTORCH_CPU_TRAINING_PY3 false + # image_size_baseline: 6500 + # device_type: &DEVICE_TYPE cpu + # python_version: &DOCKER_PYTHON_VERSION py3 + # tag_python_version: &TAG_PYTHON_VERSION py312 + # os_version: &OS_VERSION ubuntu22.04 + # tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ] + # latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ] + # # build_tag_override: "beta:2.6.0-cpu-py311-ubuntu22.04-ec2" + # docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile.arm64., *DEVICE_TYPE ] + # target: ec2 + # context: + # <<: *TRAINING_CONTEXT + BuildEC2Arm64GPUPTTrainPy3cu128DockerImage: + <<: *TRAINING_REPOSITORY + build: &PYTORCH_GPU_TRAINING_PY3 false + image_size_baseline: 19700 + device_type: &DEVICE_TYPE gpu + python_version: &DOCKER_PYTHON_VERSION py3 + tag_python_version: &TAG_PYTHON_VERSION py312 + cuda_version: &CUDA_VERSION cu128 + os_version: &OS_VERSION ubuntu22.04 + tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ] + latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ] + # 
build_tag_override: "beta:2.6.0-gpu-py312-cu126-ubuntu22.04-ec2" + docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile.arm64., + *DEVICE_TYPE ] + target: ec2 + context: + <<: *TRAINING_CONTEXT diff --git a/pytorch/training/buildspec-arm64.yml b/pytorch/training/buildspec-arm64.yml index 73f01391a427..9d949d737ea0 100644 --- a/pytorch/training/buildspec-arm64.yml +++ b/pytorch/training/buildspec-arm64.yml @@ -1 +1 @@ -buildspec_pointer: buildspec-arm64-2-5-ec2.yml +buildspec_pointer: buildspec-arm64-2-7-ec2.yml diff --git a/pytorch/training/docker/2.7/py3/cu128/Dockerfile.arm64.gpu b/pytorch/training/docker/2.7/py3/cu128/Dockerfile.arm64.gpu new file mode 100644 index 000000000000..6b44f6207642 --- /dev/null +++ b/pytorch/training/docker/2.7/py3/cu128/Dockerfile.arm64.gpu @@ -0,0 +1,308 @@ +ARG PYTHON=python3 +ARG PYTHON_VERSION=3.12.8 +ARG PYTHON_SHORT_VERSION=3.12 + +ARG CUDA_VERSION=12.8.0 +ARG CUDNN_VERSION=9.8.0.87 +ARG NCCL_VERSION=2.26.2 +ARG EFA_VERSION=1.40.0 +ARG GDRCOPY_VERSION=2.4.4 +ARG TE_VERSION=2.0 +ARG FLASH_ATTN_VERSION=2.7.3 + +# PyTorch Binaries +ARG TORCH_URL=https://framework-binaries.s3.us-west-2.amazonaws.com/pytorch/v2.7.0/arm64/cu128/torch-2.7.0%2Bcu128-cp312-cp312-manylinux_2_28_aarch64.whl +ARG TORCHVISION_URL=https://framework-binaries.s3.us-west-2.amazonaws.com/pytorch/v2.7.0/arm64/cu128/torchvision-0.22.0%2Bcu128-cp312-cp312-linux_aarch64.whl +ARG TORCHAUDIO_URL=https://framework-binaries.s3.us-west-2.amazonaws.com/pytorch/v2.7.0/arm64/cu128/torchaudio-2.7.0%2Bcu128-cp312-cp312-linux_aarch64.whl +ARG TORCHTEXT_URL=https://framework-binaries.s3.us-west-2.amazonaws.com/pytorch/v2.7.0/arm64/cu128/torchtext-0.18.0%2Bcu128-cp312-cp312-linux_aarch64.whl +ARG TORCHDATA_URL=https://framework-binaries.s3.us-west-2.amazonaws.com/pytorch/v2.7.0/arm64/cu128/torchdata-0.11.0%2Bcu128-py3-none-any.whl + +######################################################## +# _____ ____ ____ ___ +# | ____/ ___|___ \ |_ _|_ __ ___ __ _ __ _ ___ +# | _|| | __) | | || '_ ` _ \ / _` |/ _` |/ _ \ +# | |__| |___ / __/ | || | | | | | (_| | (_| | __/ +# |_____\____|_____| |___|_| |_| |_|\__,_|\__, |\___| +# |___/ +# ____ _ +# | _ \ ___ ___(_)_ __ ___ +# | |_) / _ \/ __| | '_ \ / _ \ +# | _ < __/ (__| | |_) | __/ +# |_| \_\___|\___|_| .__/ \___| +# |_| +######################################################## +FROM --platform=linux/arm64 nvidia/cuda:12.8.0-base-ubuntu22.04 AS ec2 + +LABEL maintainer="Amazon AI" +LABEL dlc_major_version="1" + +ARG PYTHON +ARG PYTHON_VERSION +ARG PYTHON_SHORT_VERSION + +ARG CUDA_VERSION +ARG CUDNN_VERSION +ARG NCCL_VERSION +ARG EFA_VERSION +ARG GDRCOPY_VERSION +ARG TE_VERSION +ARG FLASH_ATTN_VERSION + +ARG TORCH_URL +ARG TORCHVISION_URL +ARG TORCHAUDIO_URL +ARG TORCHTEXT_URL +ARG TORCHDATA_URL + +ENV DEBIAN_FRONTEND=noninteractive +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONUNBUFFERED=1 +ENV PYTHONIOENCODING=UTF-8 +ENV LANG=C.UTF-8 +ENV LC_ALL=C.UTF-8 + +ENV CUDA_HOME="/usr/local/cuda" +ENV LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" +ENV LD_LIBRARY_PATH="/lib/aarch64-linux-gnu:${LD_LIBRARY_PATH}" +ENV PATH="${CUDA_HOME}/bin:${PATH}" +ENV EFA_PATH="/opt/amazon/efa" +ENV OPEN_MPI_PATH="/opt/amazon/openmpi" +ENV TORCH_NVCC_FLAGS="-Xfatbin -compress-all" + +# Graviton Optimization +ENV LRU_CACHE_CAPACITY=1024 \ + THP_MEM_ALLOC_ENABLE=1 \ + DNNL_DEFAULT_FPMATH_MODE=BF16 + +ENV DLC_CONTAINER_TYPE=training +WORKDIR / + +RUN apt-get update \ + && apt-get -y upgrade --only-upgrade systemd \ + && apt-get install -y 
--allow-change-held-packages --no-install-recommends \ + automake \ + build-essential \ + ca-certificates \ + cmake \ + curl \ + wget \ + scons \ + unzip \ + emacs \ + vim \ + git \ + jq \ + cuda-toolkit-12=${CUDA_VERSION}-1 \ + libcudnn9-cuda-12=${CUDNN_VERSION}-1 \ + libcudnn9-dev-cuda-12=${CUDNN_VERSION}-1 \ + libgl1-mesa-glx \ + libglib2.0-0 \ + libsm6 \ + libxext6 \ + libxrender-dev \ + openjdk-17-jdk \ + openssl \ + libssl-dev \ + libbz2-dev \ + libreadline-dev \ + libsqlite3-dev \ + llvm \ + libncurses5-dev \ + libncursesw5-dev \ + xz-utils \ + tk-dev \ + liblzma-dev \ + zlib1g-dev \ + libjpeg-dev \ + libpng-dev \ + libffi-dev \ + && rm -rf /var/lib/apt/lists/* \ + && apt-get clean + +# Install EFA +RUN mkdir /tmp/efa \ +&& cd /tmp/efa \ +&& curl -O https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-${EFA_VERSION}.tar.gz \ +&& tar -xf aws-efa-installer-${EFA_VERSION}.tar.gz \ +&& cd aws-efa-installer \ +&& apt-get update \ +&& ./efa_installer.sh -y --skip-kmod --skip-limit-conf --no-verify \ +&& rm -rf /tmp/efa \ +&& rm -rf /var/lib/apt/lists/* \ +&& apt-get clean + +ENV PATH="${OPEN_MPI_PATH}/bin:${EFA_PATH}/bin:${PATH}" +ENV LD_LIBRARY_PATH="${OPEN_MPI_PATH}/lib:${EFA_PATH}/lib:${LD_LIBRARY_PATH}" + +# Configure Open MPI and configure NCCL parameters +RUN mv ${OPEN_MPI_PATH}/bin/mpirun ${OPEN_MPI_PATH}/bin/mpirun.real \ + && echo '#!/bin/bash' > ${OPEN_MPI_PATH}/bin/mpirun \ + && echo "${OPEN_MPI_PATH}/bin/mpirun.real --allow-run-as-root \"\$@\"" >> ${OPEN_MPI_PATH}/bin/mpirun \ + && chmod a+x ${OPEN_MPI_PATH}/bin/mpirun \ + && echo "hwloc_base_binding_policy = none" >> ${OPEN_MPI_PATH}/etc/openmpi-mca-params.conf \ + && echo "rmaps_base_mapping_policy = slot" >> ${OPEN_MPI_PATH}/etc/openmpi-mca-params.conf \ + && echo NCCL_DEBUG=INFO >> /etc/nccl.conf \ + && echo NCCL_SOCKET_IFNAME=^docker0 >> /etc/nccl.conf + +# Install OpenSSH for MPI to communicate between containers, allow OpenSSH to talk to containers without asking for confirmation +RUN apt-get update \ + && apt-get install -y --no-install-recommends openssh-client openssh-server \ + && mkdir -p /var/run/sshd \ + && cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new \ + && echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new \ + && mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config \ + && rm -rf /var/lib/apt/lists/* \ + && apt-get clean + +# Configure OpenSSH so that nodes can communicate with each other +RUN mkdir -p /var/run/sshd \ + && sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd + +RUN rm -rf /root/.ssh/ \ + && mkdir -p /root/.ssh/ \ + && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \ + && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \ + && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config + +# install python +RUN cd /tmp/ \ +&& wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz \ +&& tar xzf Python-${PYTHON_VERSION}.tgz \ +&& cd Python-${PYTHON_VERSION} \ +&& ./configure --enable-optimizations --with-lto --with-computed-gotos --with-system-ffi \ +&& make -j "$(nproc)" \ +&& make altinstall \ +&& cd .. 
\ +&& rm -rf Python-${PYTHON_VERSION} \ +&& rm Python-${PYTHON_VERSION}.tgz \ +&& ln -s /usr/local/bin/python${PYTHON_SHORT_VERSION} /usr/local/bin/python \ +&& ln -s /usr/local/bin/python${PYTHON_SHORT_VERSION} /usr/local/bin/python3 \ +# This installation generates a .python_history file in the root directory, which causes the sanity check to fail +&& rm -f /root/.python_history + +# Python Path +ENV PATH="/usr/local/bin:${PATH}" + +# this will add a pip symlink for pip${PYTHON_SHORT_VERSION} +RUN python -m pip install --upgrade pip --trusted-host pypi.org --trusted-host files.pythonhosted.org + +# Install pip packages +RUN pip install --no-cache-dir \ + cython \ + boto3 \ + scipy \ + opencv-python \ + numpy \ + pyopenssl \ + cryptography \ + ipython \ + parso \ + awscli \ + urllib3 \ + idna \ + tqdm \ + requests \ + mpi4py \ + packaging \ + ninja \ + pybind11 + +# Install PyTorch +RUN pip install --no-cache-dir -U \ + ${TORCH_URL} \ + ${TORCHVISION_URL} \ + ${TORCHAUDIO_URL} \ + ${TORCHTEXT_URL} \ + ${TORCHDATA_URL} \ + torchtnt \ + s3torchconnector \ + accelerate + +# Install GDRCopy +RUN cd /tmp \ +&& git clone https://github.com/NVIDIA/gdrcopy.git -b v${GDRCOPY_VERSION} \ +&& cd gdrcopy \ +&& sed -ie '12s@$@ -L $(CUDA)/lib64/stubs@' tests/Makefile \ +&& CUDA=${CUDA_HOME} make install \ +&& rm -rf /tmp/gdrcopy + +# Install NCCL +RUN cd /tmp \ + && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION}-1 \ + && cd nccl \ + && make -j64 src.build BUILDDIR=/usr/local \ + && rm -rf /tmp/nccl +# Preload the system NCCL for PyTorch to use if it is dynamically linking NCCL +ENV LD_PRELOAD="/usr/local/lib/libnccl.so" + +# Install flash-attn and NVIDIA Transformer Engine. +# Optionally set NVTE_FRAMEWORK to avoid bringing in additional frameworks during TE install +ENV NVTE_FRAMEWORK=pytorch +# Install flash-attn using instructions from https://github.com/Dao-AILab/flash-attention#installation-and-features +# Set MAX_JOBS=4 to avoid OOM issues during the installation process +RUN MAX_JOBS=4 pip install --no-cache-dir flash-attn==${FLASH_ATTN_VERSION} --no-build-isolation --verbose +# Install TE using instructions from https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/installation.html +RUN pip install --no-cache-dir git+https://github.com/NVIDIA/TransformerEngine.git@release_v${TE_VERSION} --no-build-isolation + +# OSS compliance +RUN HOME_DIR=/root \ + && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \ + && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \ + && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \ + && chmod +x /usr/local/bin/testOSSCompliance \ + && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \ + && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \ + && cp ${HOME_DIR}/oss_compliance/build_from_source_packages/BUILD_FROM_SOURCE_PACKAGES_LICENCES_AARCH64_IMAGES ${HOME_DIR} \ + && rm -rf ${HOME_DIR}/oss_compliance* + +# add license +RUN curl -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-2.7/license.txt + +# add telemetry +COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py +COPY sitecustomize.py /usr/local/lib/${PYTHON_SHORT_VERSION}/sitecustomize.py +RUN chmod +x /usr/local/bin/deep_learning_container.py + +COPY start_cuda_compat.sh /usr/local/bin/start_cuda_compat.sh +RUN chmod +x /usr/local/bin/start_cuda_compat.sh + +COPY dockerd_entrypoint.sh /usr/local/bin/dockerd_entrypoint.sh +RUN chmod +x /usr/local/bin/dockerd_entrypoint.sh + +# Cleanup +RUN pip cache purge \ + && rm -rf /tmp/tmp* \ + && rm -rf /root/.cache + +ENTRYPOINT ["bash", "-m", "dockerd_entrypoint.sh"] +CMD ["/bin/bash"] + +################################################################# +# ____ __ __ _ +# / ___| __ _ __ _ ___| \/ | __ _| | _____ _ __ +# \___ \ / _` |/ _` |/ _ \ |\/| |/ _` | |/ / _ \ '__| +# ___) | (_| | (_| | __/ | | | (_| | < __/ | +# |____/ \__,_|\__, |\___|_| |_|\__,_|_|\_\___|_| +# |___/ +# ___ ____ _ +# |_ _|_ __ ___ __ _ __ _ ___ | _ \ ___ ___(_)_ __ ___ +# | || '_ ` _ \ / _` |/ _` |/ _ \ | |_) / _ \/ __| | '_ \ / _ \ +# | || | | | | | (_| | (_| | __/ | _ < __/ (__| | |_) | __/ +# |___|_| |_| |_|\__,_|\__, |\___| |_| \_\___|\___|_| .__/ \___| +# |___/ |_| +# +################################################################# + +# FROM ec2 AS sagemaker + +# LABEL maintainer="Amazon AI" +# LABEL dlc_major_version="1" + +# ENV SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main + +# ARG PYTHON + +# # Cleanup +# RUN pip cache purge \ +# && rm -rf /tmp/tmp* \ +# && rm -rf /root/.cache
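For local iteration on the image this patch adds, something like the following approximates what the buildspec drives. This is a sketch, not the pipeline's actual invocation: it assumes an arm64/Graviton host with docker buildx, a hypothetical image tag, and that the context artifacts the Dockerfile COPYs (deep_learning_container.py, sitecustomize.py, start_cuda_compat.sh, dockerd_entrypoint.sh) have already been staged into pytorch/training/ as described by the buildspec's training_context section:

    # Hypothetical manual smoke-build of the ec2 target; the real DLC pipeline
    # assembles the build context and tag from buildspec-arm64-2-7-ec2.yml.
    docker buildx build --platform linux/arm64 \
        --target ec2 \
        -f pytorch/training/docker/2.7/py3/cu128/Dockerfile.arm64.gpu \
        -t pytorch-training-arm64:2.7.0-gpu-py312-cu128-ubuntu22.04-ec2 \
        pytorch/training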
From caa1cabd3fac7e315f07f5853410e342218e7a75 Mon Sep 17 00:00:00 2001 From: Sally Seok Date: Tue, 29 Apr 2025 16:22:09 -0700 Subject: [PATCH 2/3] add ec2 tests --- .../docker/2.7/py3/cu128/Dockerfile.arm64.gpu | 3 +- .../ec2/pytorch/training/common_cases.py | 4 + .../pytorch/training/test_pytorch_training.py | 21 ++++ .../test_pytorch_training_arm64_2_7.py | 100 ++++++++++++++++++ test/test_utils/__init__.py | 13 ++- 5 files changed, 139 insertions(+), 2 deletions(-) create mode 100644 test/dlc_tests/ec2/pytorch/training/test_pytorch_training_arm64_2_7.py diff --git a/pytorch/training/docker/2.7/py3/cu128/Dockerfile.arm64.gpu b/pytorch/training/docker/2.7/py3/cu128/Dockerfile.arm64.gpu index 6b44f6207642..56334767f761 100644 --- a/pytorch/training/docker/2.7/py3/cu128/Dockerfile.arm64.gpu +++ b/pytorch/training/docker/2.7/py3/cu128/Dockerfile.arm64.gpu @@ -5,7 +5,8 @@ ARG PYTHON_SHORT_VERSION=3.12 ARG CUDA_VERSION=12.8.0 ARG CUDNN_VERSION=9.8.0.87 ARG NCCL_VERSION=2.26.2 -ARG EFA_VERSION=1.40.0 +### update to 1.40 later +ARG EFA_VERSION=1.38.0 ARG GDRCOPY_VERSION=2.4.4 ARG TE_VERSION=2.0 ARG FLASH_ATTN_VERSION=2.7.3 diff --git a/test/dlc_tests/ec2/pytorch/training/common_cases.py b/test/dlc_tests/ec2/pytorch/training/common_cases.py index 0207805cee32..8841b113e78d 100644 --- a/test/dlc_tests/ec2/pytorch/training/common_cases.py +++ b/test/dlc_tests/ec2/pytorch/training/common_cases.py @@ -57,6 +57,10 @@ filter_function=ec2_utils.filter_efa_instance_type, ) +PT_EC2_GPU_ARM64_INSTANCE_TYPE = get_ec2_instance_type( + default="g5g.16xlarge", processor="gpu", arch_type="arm64" +) + def pytorch_standalone(pytorch_training, ec2_connection): """ diff --git a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training.py b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training.py index bd61396585be..1cd7a8bba47c 100644 --- a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training.py +++ b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training.py @@ -52,6 +52,9 @@ PT_INDUCTOR_TEST_INSTANCE_TYPE = get_ec2_instance_type(default="g4dn.12xlarge", processor="gpu") PT_EC2_GPU_INSTANCE_TYPE = get_ec2_instance_type(default="g4dn.8xlarge", processor="gpu") +PT_EC2_GPU_ARM64_INSTANCE_TYPE = get_ec2_instance_type( + default="g5g.8xlarge", processor="gpu", arch_type="arm64" +) PT_EC2_MULTI_GPU_NO_G_INSTANCE_TYPE = get_ec2_instance_type( default="g5.12xlarge", processor="gpu", @@ -114,6 +117,24 @@ def 
test_pytorch_train_mnist_cpu_deep_canary(pytorch_training, ec2_connection, c execute_ec2_training_test(ec2_connection, pytorch_training, PT_MNIST_CMD) + +@pytest.mark.usefixtures("sagemaker") +@pytest.mark.skipif( + not test_utils.is_deep_canary_context() or not os.getenv("REGION") == "us-west-2", + reason="This test only needs to run in deep-canary context in us-west-2", +) +@pytest.mark.deep_canary("Reason: This test is a simple pytorch training mnist test") +@pytest.mark.model("mnist") +@pytest.mark.parametrize("ec2_instance_type", PT_EC2_GPU_ARM64_INSTANCE_TYPE, indirect=True) +@pytest.mark.parametrize( + "ec2_instance_ami", [test_utils.AL2023_BASE_DLAMI_ARM64_US_WEST_2], indirect=True +) +@pytest.mark.team("conda") +def test_pytorch_train_mnist_arm64_gpu_deep_canary( + pytorch_training_arm64, ec2_connection, gpu_only, ec2_instance_type +): + execute_ec2_training_test(ec2_connection, pytorch_training_arm64, PT_MNIST_CMD) + + @pytest.mark.parametrize("ec2_instance_ami", [test_utils.UL22_BASE_NEURON_US_WEST_2], indirect=True) @pytest.mark.parametrize("ec2_instance_type", PT_EC2_NEURON_TRN1_INSTANCE_TYPE, indirect=True) @pytest.mark.integration("pytorch_neuronx_sanity_test") diff --git a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_arm64_2_7.py b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_arm64_2_7.py new file mode 100644 index 000000000000..619ee8c09619 --- /dev/null +++ b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_arm64_2_7.py @@ -0,0 +1,100 @@ +import pytest + +import test.test_utils as test_utils + +from test.test_utils import ec2 + +from test.dlc_tests.ec2.pytorch.training import common_cases +from test.dlc_tests.ec2 import smclarify_cases + + +@pytest.mark.usefixtures("sagemaker") +@pytest.mark.integration("pytorch_gpu_tests") +@pytest.mark.model("N/A") +@pytest.mark.team("conda") +@pytest.mark.parametrize( + "ec2_instance_type", common_cases.PT_EC2_GPU_ARM64_INSTANCE_TYPE, indirect=True +) +@pytest.mark.parametrize( + "ec2_instance_ami", [test_utils.AL2023_BASE_DLAMI_ARM64_US_WEST_2], indirect=True +) +def test_pytorch_2_7_gpu( + pytorch_training_arm64___2__7, ec2_connection, region, gpu_only, ec2_instance_type +): + pytorch_training = pytorch_training_arm64___2__7 + + test_cases = [ + (common_cases.pytorch_standalone, (pytorch_training, ec2_connection)), + (common_cases.pytorch_training_mnist, (pytorch_training, ec2_connection)), + (common_cases.pytorch_linear_regression_gpu, (pytorch_training, ec2_connection)), + (common_cases.pytorch_gloo, (pytorch_training, ec2_connection)), + (common_cases.pytorch_nccl, (pytorch_training, ec2_connection)), + (common_cases.pytorch_mpi, (pytorch_training, ec2_connection)), + (common_cases.pytorch_training_torchaudio, (pytorch_training, ec2_connection)), + (common_cases.pytorch_training_torchdata, (pytorch_training, ec2_connection)), + (common_cases.pytorch_cudnn_match_gpu, (pytorch_training, ec2_connection, region)), + (common_cases.pytorch_curand_gpu, (pytorch_training, ec2_connection)), + ] + + if "sagemaker" in pytorch_training: + test_cases.append( + (smclarify_cases.smclarify_metrics_gpu, (pytorch_training, ec2_connection)), + ) + + # AMP must be run on a multi-GPU instance + if ec2.is_instance_multi_gpu(ec2_instance_type): + test_cases.append((common_cases.pytorch_amp, (pytorch_training, ec2_connection))) + + test_utils.execute_serial_test_cases(test_cases, test_description="PT 2.7 GPU") + + +# @pytest.mark.usefixtures("sagemaker") +# @pytest.mark.integration("pytorch_gpu_heavy_tests") +# 
@pytest.mark.model("N/A") +# @pytest.mark.team("conda") +# @pytest.mark.parametrize( +# "ec2_instance_type", common_cases.PT_EC2_HEAVY_GPU_ARM64_INSTANCE_TYPE, indirect=True +# ) +# @pytest.mark.parametrize( +# "ec2_instance_ami", [test_utils.AL2023_BASE_DLAMI_ARM64_US_WEST_2], indirect=True +# ) +# @pytest.mark.skipif( +# test_utils.is_pr_context() and not ec2.are_heavy_instance_ec2_tests_enabled(), +# reason="Skip GPU Heavy tests in PR context unless explicitly enabled", +# ) +# def test_pytorch_2_7_gpu_heavy( +# pytorch_training_arm64___2__7, ec2_connection, region, gpu_only, ec2_instance_type +# ): +# pytorch_training = pytorch_training_arm64___2__7 + +# test_cases = [ +# (common_cases.pytorch_gdrcopy, (pytorch_training, ec2_connection)), +# (common_cases.pytorch_transformer_engine, (pytorch_training, ec2_connection)), +# ] + +# test_utils.execute_serial_test_cases(test_cases, test_description="PT 2.6 GPU Heavy") + + +@pytest.mark.usefixtures("sagemaker") +@pytest.mark.integration("inductor") +@pytest.mark.model("N/A") +@pytest.mark.team("training-compiler") +@pytest.mark.parametrize( + "ec2_instance_type", common_cases.PT_EC2_GPU_ARM64_INSTANCE_TYPE, indirect=True +) +@pytest.mark.parametrize( + "ec2_instance_ami", [test_utils.AL2023_BASE_DLAMI_ARM64_US_WEST_2], indirect=True +) +def test_pytorch_2_7_gpu_inductor( + pytorch_training_arm64___2__7, ec2_connection, region, gpu_only, ec2_instance_type +): + pytorch_training = pytorch_training_arm64___2__7 + + test_cases = [ + (common_cases.pytorch_gloo_inductor_gpu, (pytorch_training, ec2_connection)), + (common_cases.pytorch_mpi_inductor_gpu, (pytorch_training, ec2_connection)), + (common_cases.pytorch_nccl_inductor, (pytorch_training, ec2_connection)), + (common_cases.pytorch_amp_inductor, (pytorch_training, ec2_connection)), + ] + + test_utils.execute_serial_test_cases(test_cases, test_description="PT 2.6 GPU Inductor") diff --git a/test/test_utils/__init__.py b/test/test_utils/__init__.py index 2787440d758e..907322225337 100644 --- a/test/test_utils/__init__.py +++ b/test/test_utils/__init__.py @@ -84,9 +84,20 @@ def get_ami_id_ssm(region_name, parameter_path): config=Config(retries={"max_attempts": 10, "mode": "standard"}), ) ami = ssm_client.get_parameter(Name=parameter_path) - ami_id = eval(ami["Parameter"]["Value"])["image_id"] + + # Special case for NVIDIA driver AMI paths + if "base-oss-nvidia-driver-gpu-amazon-linux-2023" in parameter_path: + ami_id = ami["Parameter"]["Value"] + else: + ami_id = eval(ami["Parameter"]["Value"])["image_id"] + return ami_id +### temp +AL2023_BASE_DLAMI_ARM64_US_WEST_2 = get_ami_id_ssm( + region_name="us-west-2", + parameter_path="/aws/service/deeplearning/ami/arm64/base-oss-nvidia-driver-gpu-amazon-linux-2023/latest/ami-id ", +) # DLAMI Base is split between OSS Nvidia Driver and Propietary Nvidia Driver. 
From 69523b765f5392adf384a1b72328daf525a5fb50 Mon Sep 17 00:00:00 2001 From: Sally Seok Date: Tue, 29 Apr 2025 23:00:42 -0700 Subject: [PATCH 3/3] add fixture, register mark, only run ec2 test --- dlc_developer_config.toml | 12 ++++++------ test/dlc_tests/conftest.py | 4 ++++ test/dlc_tests/ec2/test_curand.py | 4 ++-- test/test_utils/__init__.py | 5 +++-- 4 files changed, 15 insertions(+), 10 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 02a62f194a61..ca137bad7c16 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -46,7 +46,7 @@ build_inference = false # Set do_build to "false" to skip builds and test the latest image built by this PR # Note: at least one build is required to set do_build to "false" -do_build = true +do_build = false [notify] ### Notify on test failures @@ -57,15 +57,15 @@ notify_test_failures = false [test] ### On by default -sanity_tests = true -security_tests = true +sanity_tests = false +security_tests = false safety_check_test = false ecr_scan_allowlist_feature = false -ecs_tests = true -eks_tests = true +ecs_tests = false +eks_tests = false ec2_tests = true # Set it to true if you are preparing a Benchmark related PR -ec2_benchmark_tests = true +ec2_benchmark_tests = false ### Set ec2_tests_on_heavy_instances = true to be able to run any EC2 tests that use large/expensive instance types by ### default. If false, these types of tests will be skipped while other tests will run as usual. diff --git a/test/dlc_tests/conftest.py b/test/dlc_tests/conftest.py index c49c3623ed45..a05eafa654c8 100644 --- a/test/dlc_tests/conftest.py +++ b/test/dlc_tests/conftest.py @@ -62,6 +62,7 @@ "pytorch_training___1__13", "pytorch_training_habana", "pytorch_training_arm64", + "pytorch_training_arm64___2__7", "pytorch_inference", "pytorch_inference_eia", "pytorch_inference_neuron", @@ -1494,6 +1495,9 @@ def pytest_configure(config): config.addinivalue_line("markers", "skip_trcomp_containers(): mark test to skip on trcomp dlcs") config.addinivalue_line("markers", "deep_canary(): explicitly mark to run as deep canary test") config.addinivalue_line("markers", "team(team_name): mark tests that belong to a team") + config.addinivalue_line( + "markers", "skip_serialized_release_pt_test(): mark to skip a test during serialized release testing" + ) def pytest_runtest_setup(item): diff --git a/test/dlc_tests/ec2/test_curand.py b/test/dlc_tests/ec2/test_curand.py index f9e54100b92d..703497462f4e 100644 --- a/test/dlc_tests/ec2/test_curand.py +++ b/test/dlc_tests/ec2/test_curand.py @@ -24,6 +24,6 @@ def test_curand_gpu(training, ec2_connection, gpu_only, ec2_instance_type): if test_utils.is_image_incompatible_with_instance_type(training, ec2_instance_type): pytest.skip(f"Image {training} is incompatible with instance type {ec2_instance_type}") - if is_tf_version("1", training) or "mxnet" in training: - pytest.skip("Test is not configured for TF1 and MXNet") + if is_tf_version("1", training) or "mxnet" in training or "arm64" in training: + pytest.skip("Test is not configured for TF1, MXNet, or ARM64") execute_ec2_training_test(ec2_connection, training, CURAND_CMD) diff --git a/test/test_utils/__init__.py b/test/test_utils/__init__.py index 907322225337..2e4a707fecec 100644 --- a/test/test_utils/__init__.py +++ b/test/test_utils/__init__.py @@ -90,9 +90,10 @@ def get_ami_id_ssm(region_name, parameter_path): ami_id = ami["Parameter"]["Value"] else: ami_id = 
eval(ami["Parameter"]["Value"])["image_id"] - + return ami_id + ### temp AL2023_BASE_DLAMI_ARM64_US_WEST_2 = get_ami_id_ssm( region_name="us-west-2", @@ -1661,7 +1662,7 @@ def setup_sm_benchmark_tf_train_env(resources_location, setup_tf1_env, setup_tf2 ).stdout.strip("\n") system = ctx.run("uname -s").stdout.strip("\n") sed_input_arg = "'' " if system == "Darwin" else "" - ctx.run(f"sed -i {sed_input_arg}'s/\[2, 1, 0\]/\[2, 1, 1\]/g' {estimator_location}") + ctx.run(rf"sed -i {sed_input_arg}'s/\[2, 1, 0\]/\[2, 1, 1\]/g' {estimator_location}") return venv_dir