From 6deb6af5c87610bd7b602a8720315f6ce8531af3 Mon Sep 17 00:00:00 2001 From: Sally Seok Date: Mon, 28 Apr 2025 21:23:15 -0700 Subject: [PATCH 1/3] initial changes --- dlc_developer_config.toml | 12 +- pytorch/training/buildspec-arm64-2-7-ec2.yml | 72 ++++ pytorch/training/buildspec-arm64.yml | 2 +- .../docker/2.7/py3/cu128/Dockerfile.arm64.gpu | 308 ++++++++++++++++++ 4 files changed, 387 insertions(+), 7 deletions(-) create mode 100644 pytorch/training/buildspec-arm64-2-7-ec2.yml create mode 100644 pytorch/training/docker/2.7/py3/cu128/Dockerfile.arm64.gpu diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 20663bd025d4..02a62f194a61 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -15,7 +15,7 @@ neuronx_mode = false graviton_mode = false # Please only set it to true if you are preparing an ARM64 related PR # Do remember to revert it back to false before merging any PR (including ARM64 dedicated PR) -arm64_mode = false +arm64_mode = true # Please only set it to True if you are preparing a HABANA related PR # Do remember to revert it back to False before merging any PR (including HABANA dedicated PR) habana_mode = false @@ -37,12 +37,12 @@ deep_canary_mode = false [build] # Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image. # available frameworks - ["autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"] -build_frameworks = [] +build_frameworks = ["pytorch"] # By default we build both training and inference containers. Set true/false values to determine which to build. build_training = true -build_inference = true +build_inference = false # Set do_build to "false" to skip builds and test the latest image built by this PR # Note: at least one build is required to set do_build to "false" @@ -65,7 +65,7 @@ ecs_tests = true eks_tests = true ec2_tests = true # Set it to true if you are preparing a Benchmark related PR -ec2_benchmark_tests = false +ec2_benchmark_tests = true ### Set ec2_tests_on_heavy_instances = true to be able to run any EC2 tests that use large/expensive instance types by ### default. If false, these types of tests will be skipped while other tests will run as usual.
@@ -74,10 +74,10 @@ ec2_benchmark_tests = false ec2_tests_on_heavy_instances = false ### SM specific tests ### On by default -sagemaker_local_tests = true +sagemaker_local_tests = false # run standard sagemaker remote tests from test/sagemaker_tests -sagemaker_remote_tests = true +sagemaker_remote_tests = false # run efa sagemaker tests sagemaker_efa_tests = false # run release_candidate_integration tests diff --git a/pytorch/training/buildspec-arm64-2-7-ec2.yml b/pytorch/training/buildspec-arm64-2-7-ec2.yml new file mode 100644 index 000000000000..de5f2bce0568 --- /dev/null +++ b/pytorch/training/buildspec-arm64-2-7-ec2.yml @@ -0,0 +1,72 @@ +account_id: &ACCOUNT_ID +prod_account_id: &PROD_ACCOUNT_ID 763104351884 +region: &REGION +framework: &FRAMEWORK pytorch +version: &VERSION 2.7.0 +short_version: &SHORT_VERSION "2.7" +arch_type: arm64 +# autopatch_build: "True" + +repository_info: + training_repository: &TRAINING_REPOSITORY + image_type: &TRAINING_IMAGE_TYPE training + root: !join [ *FRAMEWORK, "/", *TRAINING_IMAGE_TYPE ] + repository_name: &REPOSITORY_NAME !join [ pr, "-", *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE, "-", arm64 ] + repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ] + release_repository_name: &RELEASE_REPOSITORY_NAME !join [ *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE, "-", arm64 ] + release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *RELEASE_REPOSITORY_NAME ] + +context: + training_context: &TRAINING_CONTEXT + start_cuda_compat: + source: docker/build_artifacts/start_cuda_compat.sh + target: start_cuda_compat.sh + dockerd_entrypoint: + source: docker/build_artifacts/dockerd_entrypoint.sh + target: dockerd_entrypoint.sh + changehostname: + source: docker/build_artifacts/changehostname.c + target: changehostname.c + start_with_right_hostname: + source: docker/build_artifacts/start_with_right_hostname.sh + target: start_with_right_hostname.sh + example_mnist_file: + source: docker/build_artifacts/mnist.py + target: mnist.py + deep_learning_container: + source: ../../src/deep_learning_container.py + target: deep_learning_container.py + +images: + # BuildEC2Arm64CPUPTTrainPy3DockerImage: + # <<: *TRAINING_REPOSITORY + # build: &PYTORCH_CPU_TRAINING_PY3 false + # image_size_baseline: 6500 + # device_type: &DEVICE_TYPE cpu + # python_version: &DOCKER_PYTHON_VERSION py3 + # tag_python_version: &TAG_PYTHON_VERSION py312 + # os_version: &OS_VERSION ubuntu22.04 + # tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ] + # latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ] + # # build_tag_override: "beta:2.6.0-cpu-py311-ubuntu22.04-ec2" + # docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile.arm64., *DEVICE_TYPE ] + # target: ec2 + # context: + # <<: *TRAINING_CONTEXT + BuildEC2Arm64GPUPTTrainPy3cu128DockerImage: + <<: *TRAINING_REPOSITORY + build: &PYTORCH_GPU_TRAINING_PY3 false + image_size_baseline: 19700 + device_type: &DEVICE_TYPE gpu + python_version: &DOCKER_PYTHON_VERSION py3 + tag_python_version: &TAG_PYTHON_VERSION py312 + cuda_version: &CUDA_VERSION cu128 + os_version: &OS_VERSION ubuntu22.04 + tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ] + latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ] + # 
build_tag_override: "beta:2.6.0-gpu-py312-cu126-ubuntu22.04-ec2" + docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile.arm64., + *DEVICE_TYPE ] + target: ec2 + context: + <<: *TRAINING_CONTEXT diff --git a/pytorch/training/buildspec-arm64.yml b/pytorch/training/buildspec-arm64.yml index 73f01391a427..9d949d737ea0 100644 --- a/pytorch/training/buildspec-arm64.yml +++ b/pytorch/training/buildspec-arm64.yml @@ -1 +1 @@ -buildspec_pointer: buildspec-arm64-2-5-ec2.yml +buildspec_pointer: buildspec-arm64-2-7-ec2.yml diff --git a/pytorch/training/docker/2.7/py3/cu128/Dockerfile.arm64.gpu b/pytorch/training/docker/2.7/py3/cu128/Dockerfile.arm64.gpu new file mode 100644 index 000000000000..6b44f6207642 --- /dev/null +++ b/pytorch/training/docker/2.7/py3/cu128/Dockerfile.arm64.gpu @@ -0,0 +1,308 @@ +ARG PYTHON=python3 +ARG PYTHON_VERSION=3.12.8 +ARG PYTHON_SHORT_VERSION=3.12 + +ARG CUDA_VERSION=12.8.0 +ARG CUDNN_VERSION=9.8.0.87 +ARG NCCL_VERSION=2.26.2 +ARG EFA_VERSION=1.40.0 +ARG GDRCOPY_VERSION=2.4.4 +ARG TE_VERSION=2.0 +ARG FLASH_ATTN_VERSION=2.7.3 + +# PyTorch Binaries +ARG TORCH_URL=https://framework-binaries.s3.us-west-2.amazonaws.com/pytorch/v2.7.0/arm64/cu128/torch-2.7.0%2Bcu128-cp312-cp312-manylinux_2_28_aarch64.whl +ARG TORCHVISION_URL=https://framework-binaries.s3.us-west-2.amazonaws.com/pytorch/v2.7.0/arm64/cu128/torchvision-0.22.0%2Bcu128-cp312-cp312-linux_aarch64.whl +ARG TORCHAUDIO_URL=https://framework-binaries.s3.us-west-2.amazonaws.com/pytorch/v2.7.0/arm64/cu128/torchaudio-2.7.0%2Bcu128-cp312-cp312-linux_aarch64.whl +ARG TORCHTEXT_URL=https://framework-binaries.s3.us-west-2.amazonaws.com/pytorch/v2.7.0/arm64/cu128/torchtext-0.18.0%2Bcu128-cp312-cp312-linux_aarch64.whl +ARG TORCHDATA_URL=https://framework-binaries.s3.us-west-2.amazonaws.com/pytorch/v2.7.0/arm64/cu128/torchdata-0.11.0%2Bcu128-py3-none-any.whl + +######################################################## +# _____ ____ ____ ___ +# | ____/ ___|___ \ |_ _|_ __ ___ __ _ __ _ ___ +# | _|| | __) | | || '_ ` _ \ / _` |/ _` |/ _ \ +# | |__| |___ / __/ | || | | | | | (_| | (_| | __/ +# |_____\____|_____| |___|_| |_| |_|\__,_|\__, |\___| +# |___/ +# ____ _ +# | _ \ ___ ___(_)_ __ ___ +# | |_) / _ \/ __| | '_ \ / _ \ +# | _ < __/ (__| | |_) | __/ +# |_| \_\___|\___|_| .__/ \___| +# |_| +######################################################## +FROM --platform=linux/arm64 nvidia/cuda:12.8.0-base-ubuntu22.04 AS ec2 + +LABEL maintainer="Amazon AI" +LABEL dlc_major_version="1" + +ARG PYTHON +ARG PYTHON_VERSION +ARG PYTHON_SHORT_VERSION + +ARG CUDA_VERSION +ARG CUDNN_VERSION +ARG NCCL_VERSION +ARG EFA_VERSION +ARG GDRCOPY_VERSION +ARG TE_VERSION +ARG FLASH_ATTN_VERSION + +ARG TORCH_URL +ARG TORCHVISION_URL +ARG TORCHAUDIO_URL +ARG TORCHTEXT_URL +ARG TORCHDATA_URL + +ENV DEBIAN_FRONTEND=noninteractive +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONUNBUFFERED=1 +ENV PYTHONIOENCODING=UTF-8 +ENV LANG=C.UTF-8 +ENV LC_ALL=C.UTF-8 + +ENV CUDA_HOME="/usr/local/cuda" +ENV LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" +ENV LD_LIBRARY_PATH="/lib/aarch64-linux-gnu:${LD_LIBRARY_PATH}" +ENV PATH="${CUDA_HOME}/bin:${PATH}" +ENV EFA_PATH="/opt/amazon/efa" +ENV OPEN_MPI_PATH="/opt/amazon/openmpi" +ENV TORCH_NVCC_FLAGS="-Xfatbin -compress-all" + +# Graviton Optimization +ENV LRU_CACHE_CAPACITY=1024 \ + THP_MEM_ALLOC_ENABLE=1 \ + DNNL_DEFAULT_FPMATH_MODE=BF16 + +ENV DLC_CONTAINER_TYPE=training +WORKDIR / + +RUN apt-get update \ + && apt-get -y upgrade --only-upgrade systemd \ + && apt-get install -y 
--allow-change-held-packages --no-install-recommends \ + automake \ + build-essential \ + ca-certificates \ + cmake \ + curl \ + wget \ + scons \ + unzip \ + emacs \ + vim \ + git \ + jq \ + cuda-toolkit-12=${CUDA_VERSION}-1 \ + libcudnn9-cuda-12=${CUDNN_VERSION}-1 \ + libcudnn9-dev-cuda-12=${CUDNN_VERSION}-1 \ + libgl1-mesa-glx \ + libglib2.0-0 \ + libsm6 \ + libxext6 \ + libxrender-dev \ + openjdk-17-jdk \ + openssl \ + libssl-dev \ + libbz2-dev \ + libreadline-dev \ + libsqlite3-dev \ + llvm \ + libncurses5-dev \ + libncursesw5-dev \ + xz-utils \ + tk-dev \ + liblzma-dev \ + zlib1g-dev \ + libjpeg-dev \ + libpng-dev \ + libffi-dev \ + && rm -rf /var/lib/apt/lists/* \ + && apt-get clean + +# Install EFA +RUN mkdir /tmp/efa \ +&& cd /tmp/efa \ +&& curl -O https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-${EFA_VERSION}.tar.gz \ +&& tar -xf aws-efa-installer-${EFA_VERSION}.tar.gz \ +&& cd aws-efa-installer \ +&& apt-get update \ +&& ./efa_installer.sh -y --skip-kmod --skip-limit-conf --no-verify \ +&& rm -rf /tmp/efa \ +&& rm -rf /var/lib/apt/lists/* \ +&& apt-get clean + +ENV PATH="${OPEN_MPI_PATH}/bin:${EFA_PATH}/bin:${PATH}" +ENV LD_LIBRARY_PATH="${OPEN_MPI_PATH}/lib:${EFA_PATH}/lib:${LD_LIBRARY_PATH}" + +# Configure Open MPI and configure NCCL parameters +RUN mv ${OPEN_MPI_PATH}/bin/mpirun ${OPEN_MPI_PATH}/bin/mpirun.real \ + && echo '#!/bin/bash' > ${OPEN_MPI_PATH}/bin/mpirun \ + && echo "${OPEN_MPI_PATH}/bin/mpirun.real --allow-run-as-root \"\$@\"" >> ${OPEN_MPI_PATH}/bin/mpirun \ + && chmod a+x ${OPEN_MPI_PATH}/bin/mpirun \ + && echo "hwloc_base_binding_policy = none" >> ${OPEN_MPI_PATH}/etc/openmpi-mca-params.conf \ + && echo "rmaps_base_mapping_policy = slot" >> ${OPEN_MPI_PATH}/etc/openmpi-mca-params.conf \ + && echo NCCL_DEBUG=INFO >> /etc/nccl.conf \ + && echo NCCL_SOCKET_IFNAME=^docker0 >> /etc/nccl.conf + +# Install OpenSSH for MPI to communicate between containers, allow OpenSSH to talk to containers without asking for confirmation +RUN apt-get update \ + && apt-get install -y --no-install-recommends openssh-client openssh-server \ + && mkdir -p /var/run/sshd \ + && cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new \ + && echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new \ + && mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config \ + && rm -rf /var/lib/apt/lists/* \ + && apt-get clean + +# Configure OpenSSH so that nodes can communicate with each other +RUN mkdir -p /var/run/sshd \ + && sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd + +RUN rm -rf /root/.ssh/ \ + && mkdir -p /root/.ssh/ \ + && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \ + && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \ + && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config + +# install python +RUN cd /tmp/ \ +&& wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz \ +&& tar xzf Python-${PYTHON_VERSION}.tgz \ +&& cd Python-${PYTHON_VERSION} \ +&& ./configure --enable-optimizations --with-lto --with-computed-gotos --with-system-ffi \ +&& make -j "$(nproc)" \ +&& make altinstall \ +&& cd .. 
\ +&& rm -rf Python-${PYTHON_VERSION} \ +&& rm Python-${PYTHON_VERSION}.tgz \ +&& ln -s /usr/local/bin/python${PYTHON_SHORT_VERSION} /usr/local/bin/python \ +&& ln -s /usr/local/bin/python${PYTHON_SHORT_VERSION} /usr/local/bin/python3 \ +# This installation generates a .python_history file in the root directory, which causes the sanity check to fail +&& rm -f /root/.python_history + +# Python Path +ENV PATH="/usr/local/bin:${PATH}" + +# this will add a pip symlink for pip${PYTHON_SHORT_VERSION} +RUN python -m pip install --upgrade pip --trusted-host pypi.org --trusted-host files.pythonhosted.org + +# Install pip packages +RUN pip install --no-cache-dir \ + cython \ + boto3 \ + scipy \ + opencv-python \ + numpy \ + pyopenssl \ + cryptography \ + ipython \ + parso \ + awscli \ + urllib3 \ + idna \ + tqdm \ + requests \ + mpi4py \ + packaging \ + ninja \ + pybind11 + +# Install PyTorch +RUN pip install --no-cache-dir -U \ + ${TORCH_URL} \ + ${TORCHVISION_URL} \ + ${TORCHAUDIO_URL} \ + ${TORCHTEXT_URL} \ + ${TORCHDATA_URL} \ + torchtnt \ + s3torchconnector \ + accelerate + +# Install GDRCopy +RUN cd /tmp \ +&& git clone https://github.com/NVIDIA/gdrcopy.git -b v${GDRCOPY_VERSION} \ +&& cd gdrcopy \ +&& sed -ie '12s@$@ -L $(CUDA)/lib64/stubs@' tests/Makefile \ +&& CUDA=${CUDA_HOME} make install \ +&& rm -rf /tmp/gdrcopy + +# Install NCCL +RUN cd /tmp \ + && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION}-1 \ + && cd nccl \ + && make -j64 src.build BUILDDIR=/usr/local \ + && rm -rf /tmp/nccl +# Preload the system NCCL for PyTorch to use if it is dynamically linking NCCL +ENV LD_PRELOAD="/usr/local/lib/libnccl.so" + +# Install flash-attn and NVIDIA Transformer Engine. +# Optionally set NVTE_FRAMEWORK to avoid bringing in additional frameworks during TE install +ENV NVTE_FRAMEWORK=pytorch +# Install flash-attn using instructions from https://github.com/Dao-AILab/flash-attention#installation-and-features +# Set MAX_JOBS=4 to avoid OOM issues during the installation process +RUN MAX_JOBS=4 pip install --no-cache-dir flash-attn==${FLASH_ATTN_VERSION} --no-build-isolation --verbose +# Install TE using instructions from https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/installation.html +RUN pip install --no-cache-dir git+https://github.com/NVIDIA/TransformerEngine.git@release_v${TE_VERSION} --no-build-isolation + +# OSS compliance +RUN HOME_DIR=/root \ + && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \ + && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \ + && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \ + && chmod +x /usr/local/bin/testOSSCompliance \ + && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \ + && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \ + && cp ${HOME_DIR}/oss_compliance/build_from_source_packages/BUILD_FROM_SOURCE_PACKAGES_LICENCES_AARCH64_IMAGES ${HOME_DIR} \ + && rm -rf ${HOME_DIR}/oss_compliance* + +# add license +RUN curl -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-2.7/license.txt + +# add telemetry +COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py +COPY sitecustomize.py /usr/local/lib/${PYTHON_SHORT_VERSION}/sitecustomize.py +RUN chmod +x /usr/local/bin/deep_learning_container.py + +COPY start_cuda_compat.sh /usr/local/bin/start_cuda_compat.sh +RUN chmod +x /usr/local/bin/start_cuda_compat.sh + +COPY dockerd_entrypoint.sh /usr/local/bin/dockerd_entrypoint.sh +RUN chmod +x /usr/local/bin/dockerd_entrypoint.sh + +# Cleanup +RUN pip cache purge \ + && rm -rf /tmp/tmp* \ + && rm -rf /root/.cache + +ENTRYPOINT ["bash", "-m", "dockerd_entrypoint.sh"] +CMD ["/bin/bash"] + +################################################################# +# ____ __ __ _ +# / ___| __ _ __ _ ___| \/ | __ _| | _____ _ __ +# \___ \ / _` |/ _` |/ _ \ |\/| |/ _` | |/ / _ \ '__| +# ___) | (_| | (_| | __/ | | | (_| | < __/ | +# |____/ \__,_|\__, |\___|_| |_|\__,_|_|\_\___|_| +# |___/ +# ___ ____ _ +# |_ _|_ __ ___ __ _ __ _ ___ | _ \ ___ ___(_)_ __ ___ +# | || '_ ` _ \ / _` |/ _` |/ _ \ | |_) / _ \/ __| | '_ \ / _ \ +# | || | | | | | (_| | (_| | __/ | _ < __/ (__| | |_) | __/ +# |___|_| |_| |_|\__,_|\__, |\___| |_| \_\___|\___|_| .__/ \___| +# |___/ |_| +# +################################################################# + +# FROM ec2 AS sagemaker + +# LABEL maintainer="Amazon AI" +# LABEL dlc_major_version="1" + +# ENV SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main + +# ARG PYTHON + +# # Cleanup +# RUN pip cache purge \ +# && rm -rf /tmp/tmp* \ +# && rm -rf /root/.cache
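For local iteration on the image this patch adds, something like the following approximates what the buildspec drives. This is a sketch, not the pipeline's actual invocation: it assumes an arm64/Graviton host with docker buildx, a hypothetical image tag, and that the context artifacts the Dockerfile COPYs (deep_learning_container.py, sitecustomize.py, start_cuda_compat.sh, dockerd_entrypoint.sh) have already been staged into pytorch/training/ as described by the buildspec's training_context section:

    # Hypothetical manual smoke-build of the ec2 target; the real DLC pipeline
    # assembles the build context and tag from buildspec-arm64-2-7-ec2.yml.
    docker buildx build --platform linux/arm64 \
        --target ec2 \
        -f pytorch/training/docker/2.7/py3/cu128/Dockerfile.arm64.gpu \
        -t pytorch-training-arm64:2.7.0-gpu-py312-cu128-ubuntu22.04-ec2 \
        pytorch/training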
From caa1cabd3fac7e315f07f5853410e342218e7a75 Mon Sep 17 00:00:00 2001 From: Sally Seok Date: Tue, 29 Apr 2025 16:22:09 -0700 Subject: [PATCH 2/3] add ec2 tests --- .../docker/2.7/py3/cu128/Dockerfile.arm64.gpu | 3 +- .../ec2/pytorch/training/common_cases.py | 4 + .../pytorch/training/test_pytorch_training.py | 21 ++++ .../test_pytorch_training_arm64_2_7.py | 100 ++++++++++++++++++ test/test_utils/__init__.py | 13 ++- 5 files changed, 139 insertions(+), 2 deletions(-) create mode 100644 test/dlc_tests/ec2/pytorch/training/test_pytorch_training_arm64_2_7.py diff --git a/pytorch/training/docker/2.7/py3/cu128/Dockerfile.arm64.gpu b/pytorch/training/docker/2.7/py3/cu128/Dockerfile.arm64.gpu index 6b44f6207642..56334767f761 100644 --- a/pytorch/training/docker/2.7/py3/cu128/Dockerfile.arm64.gpu +++ b/pytorch/training/docker/2.7/py3/cu128/Dockerfile.arm64.gpu @@ -5,7 +5,8 @@ ARG PYTHON_SHORT_VERSION=3.12 ARG CUDA_VERSION=12.8.0 ARG CUDNN_VERSION=9.8.0.87 ARG NCCL_VERSION=2.26.2 -ARG EFA_VERSION=1.40.0 +### update to 1.40 later +ARG EFA_VERSION=1.38.0 ARG GDRCOPY_VERSION=2.4.4 ARG TE_VERSION=2.0 ARG FLASH_ATTN_VERSION=2.7.3 diff --git a/test/dlc_tests/ec2/pytorch/training/common_cases.py b/test/dlc_tests/ec2/pytorch/training/common_cases.py index 0207805cee32..8841b113e78d 100644 --- a/test/dlc_tests/ec2/pytorch/training/common_cases.py +++ b/test/dlc_tests/ec2/pytorch/training/common_cases.py @@ -57,6 +57,10 @@ filter_function=ec2_utils.filter_efa_instance_type, ) +PT_EC2_GPU_ARM64_INSTANCE_TYPE = get_ec2_instance_type( + default="g5g.16xlarge", processor="gpu", arch_type="arm64" +) + def pytorch_standalone(pytorch_training, ec2_connection): """ diff --git a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training.py b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training.py index bd61396585be..1cd7a8bba47c 100644 --- a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training.py +++ b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training.py @@ -52,6 +52,9 @@ PT_INDUCTOR_TEST_INSTANCE_TYPE = get_ec2_instance_type(default="g4dn.12xlarge", processor="gpu") PT_EC2_GPU_INSTANCE_TYPE = get_ec2_instance_type(default="g4dn.8xlarge", processor="gpu") +PT_EC2_GPU_ARM64_INSTANCE_TYPE = get_ec2_instance_type( + default="g5g.8xlarge", processor="gpu", arch_type="arm64" +) PT_EC2_MULTI_GPU_NO_G_INSTANCE_TYPE = get_ec2_instance_type( default="g5.12xlarge", processor="gpu", @@ -114,6 +117,24 @@ def 
test_pytorch_train_mnist_cpu_deep_canary(pytorch_training, ec2_connection, c execute_ec2_training_test(ec2_connection, pytorch_training, PT_MNIST_CMD) + +@pytest.mark.usefixtures("sagemaker") +@pytest.mark.skipif( + not test_utils.is_deep_canary_context() or not os.getenv("REGION") == "us-west-2", + reason="This test only needs to run in deep-canary context in us-west-2", +) +@pytest.mark.deep_canary("Reason: This test is a simple pytorch training mnist test") +@pytest.mark.model("mnist") +@pytest.mark.parametrize("ec2_instance_type", PT_EC2_GPU_ARM64_INSTANCE_TYPE, indirect=True) +@pytest.mark.parametrize( + "ec2_instance_ami", [test_utils.AL2023_BASE_DLAMI_ARM64_US_WEST_2], indirect=True +) +@pytest.mark.team("conda") +def test_pytorch_train_mnist_arm64_gpu_deep_canary( + pytorch_training_arm64, ec2_connection, gpu_only, ec2_instance_type +): + execute_ec2_training_test(ec2_connection, pytorch_training_arm64, PT_MNIST_CMD) + + @pytest.mark.parametrize("ec2_instance_ami", [test_utils.UL22_BASE_NEURON_US_WEST_2], indirect=True) @pytest.mark.parametrize("ec2_instance_type", PT_EC2_NEURON_TRN1_INSTANCE_TYPE, indirect=True) @pytest.mark.integration("pytorch_neuronx_sanity_test") diff --git a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_arm64_2_7.py b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_arm64_2_7.py new file mode 100644 index 000000000000..619ee8c09619 --- /dev/null +++ b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_arm64_2_7.py @@ -0,0 +1,100 @@ +import pytest + +import test.test_utils as test_utils + +from test.test_utils import ec2 + +from test.dlc_tests.ec2.pytorch.training import common_cases +from test.dlc_tests.ec2 import smclarify_cases + + +@pytest.mark.usefixtures("sagemaker") +@pytest.mark.integration("pytorch_gpu_tests") +@pytest.mark.model("N/A") +@pytest.mark.team("conda") +@pytest.mark.parametrize( + "ec2_instance_type", common_cases.PT_EC2_GPU_ARM64_INSTANCE_TYPE, indirect=True +) +@pytest.mark.parametrize( + "ec2_instance_ami", [test_utils.AL2023_BASE_DLAMI_ARM64_US_WEST_2], indirect=True +) +def test_pytorch_2_7_gpu( + pytorch_training_arm64___2__7, ec2_connection, region, gpu_only, ec2_instance_type +): + pytorch_training = pytorch_training_arm64___2__7 + + test_cases = [ + (common_cases.pytorch_standalone, (pytorch_training, ec2_connection)), + (common_cases.pytorch_training_mnist, (pytorch_training, ec2_connection)), + (common_cases.pytorch_linear_regression_gpu, (pytorch_training, ec2_connection)), + (common_cases.pytorch_gloo, (pytorch_training, ec2_connection)), + (common_cases.pytorch_nccl, (pytorch_training, ec2_connection)), + (common_cases.pytorch_mpi, (pytorch_training, ec2_connection)), + (common_cases.pytorch_training_torchaudio, (pytorch_training, ec2_connection)), + (common_cases.pytorch_training_torchdata, (pytorch_training, ec2_connection)), + (common_cases.pytorch_cudnn_match_gpu, (pytorch_training, ec2_connection, region)), + (common_cases.pytorch_curand_gpu, (pytorch_training, ec2_connection)), + ] + + if "sagemaker" in pytorch_training: + test_cases.append( + (smclarify_cases.smclarify_metrics_gpu, (pytorch_training, ec2_connection)), + ) + + # AMP must be run on a multi-GPU instance + if ec2.is_instance_multi_gpu(ec2_instance_type): + test_cases.append((common_cases.pytorch_amp, (pytorch_training, ec2_connection))) + + test_utils.execute_serial_test_cases(test_cases, test_description="PT 2.7 GPU") + + +# @pytest.mark.usefixtures("sagemaker") +# @pytest.mark.integration("pytorch_gpu_heavy_tests") +# 
@pytest.mark.model("N/A") +# @pytest.mark.team("conda") +# @pytest.mark.parametrize( +# "ec2_instance_type", common_cases.PT_EC2_HEAVY_GPU_ARM64_INSTANCE_TYPE, indirect=True +# ) +# @pytest.mark.parametrize( +# "ec2_instance_ami", [test_utils.AL2023_BASE_DLAMI_ARM64_US_WEST_2], indirect=True +# ) +# @pytest.mark.skipif( +# test_utils.is_pr_context() and not ec2.are_heavy_instance_ec2_tests_enabled(), +# reason="Skip GPU Heavy tests in PR context unless explicitly enabled", +# ) +# def test_pytorch_2_7_gpu_heavy( +# pytorch_training_arm64___2__7, ec2_connection, region, gpu_only, ec2_instance_type +# ): +# pytorch_training = pytorch_training_arm64___2__7 + +# test_cases = [ +# (common_cases.pytorch_gdrcopy, (pytorch_training, ec2_connection)), +# (common_cases.pytorch_transformer_engine, (pytorch_training, ec2_connection)), +# ] + +# test_utils.execute_serial_test_cases(test_cases, test_description="PT 2.6 GPU Heavy") + + +@pytest.mark.usefixtures("sagemaker") +@pytest.mark.integration("inductor") +@pytest.mark.model("N/A") +@pytest.mark.team("training-compiler") +@pytest.mark.parametrize( + "ec2_instance_type", common_cases.PT_EC2_GPU_ARM64_INSTANCE_TYPE, indirect=True +) +@pytest.mark.parametrize( + "ec2_instance_ami", [test_utils.AL2023_BASE_DLAMI_ARM64_US_WEST_2], indirect=True +) +def test_pytorch_2_7_gpu_inductor( + pytorch_training_arm64___2__7, ec2_connection, region, gpu_only, ec2_instance_type +): + pytorch_training = pytorch_training_arm64___2__7 + + test_cases = [ + (common_cases.pytorch_gloo_inductor_gpu, (pytorch_training, ec2_connection)), + (common_cases.pytorch_mpi_inductor_gpu, (pytorch_training, ec2_connection)), + (common_cases.pytorch_nccl_inductor, (pytorch_training, ec2_connection)), + (common_cases.pytorch_amp_inductor, (pytorch_training, ec2_connection)), + ] + + test_utils.execute_serial_test_cases(test_cases, test_description="PT 2.6 GPU Inductor") diff --git a/test/test_utils/__init__.py b/test/test_utils/__init__.py index 2787440d758e..907322225337 100644 --- a/test/test_utils/__init__.py +++ b/test/test_utils/__init__.py @@ -84,9 +84,20 @@ def get_ami_id_ssm(region_name, parameter_path): config=Config(retries={"max_attempts": 10, "mode": "standard"}), ) ami = ssm_client.get_parameter(Name=parameter_path) - ami_id = eval(ami["Parameter"]["Value"])["image_id"] + + # Special case for NVIDIA driver AMI paths + if "base-oss-nvidia-driver-gpu-amazon-linux-2023" in parameter_path: + ami_id = ami["Parameter"]["Value"] + else: + ami_id = eval(ami["Parameter"]["Value"])["image_id"] + return ami_id +### temp +AL2023_BASE_DLAMI_ARM64_US_WEST_2 = get_ami_id_ssm( + region_name="us-west-2", + parameter_path="/aws/service/deeplearning/ami/arm64/base-oss-nvidia-driver-gpu-amazon-linux-2023/latest/ami-id ", +) # DLAMI Base is split between OSS Nvidia Driver and Propietary Nvidia Driver. 
From 69523b765f5392adf384a1b72328daf525a5fb50 Mon Sep 17 00:00:00 2001 From: Sally Seok Date: Tue, 29 Apr 2025 23:00:42 -0700 Subject: [PATCH 3/3] add fixture, register mark, only run ec2 test --- dlc_developer_config.toml | 12 ++++++------ test/dlc_tests/conftest.py | 4 ++++ test/dlc_tests/ec2/test_curand.py | 4 ++-- test/test_utils/__init__.py | 5 +++-- 4 files changed, 15 insertions(+), 10 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 02a62f194a61..ca137bad7c16 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -46,7 +46,7 @@ build_inference = false # Set do_build to "false" to skip builds and test the latest image built by this PR # Note: at least one build is required to set do_build to "false" -do_build = true +do_build = false [notify] ### Notify on test failures @@ -57,15 +57,15 @@ notify_test_failures = false [test] ### On by default -sanity_tests = true -security_tests = true +sanity_tests = false +security_tests = false safety_check_test = false ecr_scan_allowlist_feature = false -ecs_tests = true -eks_tests = true +ecs_tests = false +eks_tests = false ec2_tests = true # Set it to true if you are preparing a Benchmark related PR -ec2_benchmark_tests = true +ec2_benchmark_tests = false ### Set ec2_tests_on_heavy_instances = true to be able to run any EC2 tests that use large/expensive instance types by ### default. If false, these types of tests will be skipped while other tests will run as usual. diff --git a/test/dlc_tests/conftest.py b/test/dlc_tests/conftest.py index c49c3623ed45..a05eafa654c8 100644 --- a/test/dlc_tests/conftest.py +++ b/test/dlc_tests/conftest.py @@ -62,6 +62,7 @@ "pytorch_training___1__13", "pytorch_training_habana", "pytorch_training_arm64", + "pytorch_training_arm64___2__7", "pytorch_inference", "pytorch_inference_eia", "pytorch_inference_neuron", @@ -1494,6 +1495,9 @@ def pytest_configure(config): config.addinivalue_line("markers", "skip_trcomp_containers(): mark test to skip on trcomp dlcs") config.addinivalue_line("markers", "deep_canary(): explicitly mark to run as deep canary test") config.addinivalue_line("markers", "team(team_name): mark tests that belong to a team") + config.addinivalue_line( + "markers", "skip_serialized_release_pt_test(): mark to skip a test during serialized release testing" + ) def pytest_runtest_setup(item): diff --git a/test/dlc_tests/ec2/test_curand.py b/test/dlc_tests/ec2/test_curand.py index f9e54100b92d..703497462f4e 100644 --- a/test/dlc_tests/ec2/test_curand.py +++ b/test/dlc_tests/ec2/test_curand.py @@ -24,6 +24,6 @@ def test_curand_gpu(training, ec2_connection, gpu_only, ec2_instance_type): if test_utils.is_image_incompatible_with_instance_type(training, ec2_instance_type): pytest.skip(f"Image {training} is incompatible with instance type {ec2_instance_type}") - if is_tf_version("1", training) or "mxnet" in training: - pytest.skip("Test is not configured for TF1 and MXNet") + if is_tf_version("1", training) or "mxnet" in training or "arm64" in training: + pytest.skip("Test is not configured for TF1, MXNet, or ARM64") execute_ec2_training_test(ec2_connection, training, CURAND_CMD) diff --git a/test/test_utils/__init__.py b/test/test_utils/__init__.py index 907322225337..2e4a707fecec 100644 --- a/test/test_utils/__init__.py +++ b/test/test_utils/__init__.py @@ -90,9 +90,10 @@ def get_ami_id_ssm(region_name, parameter_path): ami_id = ami["Parameter"]["Value"] else: ami_id = 
eval(ami["Parameter"]["Value"])["image_id"] - + return ami_id + ### temp AL2023_BASE_DLAMI_ARM64_US_WEST_2 = get_ami_id_ssm( region_name="us-west-2", @@ -1661,7 +1662,7 @@ def setup_sm_benchmark_tf_train_env(resources_location, setup_tf1_env, setup_tf2 ).stdout.strip("\n") system = ctx.run("uname -s").stdout.strip("\n") sed_input_arg = "'' " if system == "Darwin" else "" - ctx.run(f"sed -i {sed_input_arg}'s/\[2, 1, 0\]/\[2, 1, 1\]/g' {estimator_location}") + ctx.run(rf"sed -i {sed_input_arg}'s/\[2, 1, 0\]/\[2, 1, 1\]/g' {estimator_location}") return venv_dir