diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 20663bd025d4..ca137bad7c16 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -15,7 +15,7 @@ neuronx_mode = false graviton_mode = false # Please only set it to true if you are preparing a ARM64 related PR # Do remember to revert it back to false before merging any PR (including ARM64 dedicated PR) -arm64_mode = false +arm64_mode = true # Please only set it to True if you are preparing a HABANA related PR # Do remember to revert it back to False before merging any PR (including HABANA dedicated PR) habana_mode = false @@ -37,16 +37,16 @@ deep_canary_mode = false [build] # Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image. # available frameworks - ["autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"] -build_frameworks = [] +build_frameworks = ["pytorch"] # By default we build both training and inference containers. Set true/false values to determine which to build. build_training = true -build_inference = true +build_inference = false # Set do_build to "false" to skip builds and test the latest image built by this PR # Note: at least one build is required to set do_build to "false" -do_build = true +do_build = false [notify] ### Notify on test failures @@ -57,12 +57,12 @@ notify_test_failures = false [test] ### On by default -sanity_tests = true -security_tests = true +sanity_tests = false +security_tests = false safety_check_test = false ecr_scan_allowlist_feature = false -ecs_tests = true -eks_tests = true +ecs_tests = false +eks_tests = false ec2_tests = true # Set it to true if you are preparing a Benchmark related PR ec2_benchmark_tests = false @@ -74,10 +74,10 @@ ec2_benchmark_tests = false ec2_tests_on_heavy_instances = false ### SM specific tests ### On by default -sagemaker_local_tests = true +sagemaker_local_tests = false # run standard sagemaker remote tests from test/sagemaker_tests -sagemaker_remote_tests = true +sagemaker_remote_tests = false # run efa sagemaker tests sagemaker_efa_tests = false # run release_candidate_integration tests diff --git a/pytorch/training/buildspec-arm64-2-7-ec2.yml b/pytorch/training/buildspec-arm64-2-7-ec2.yml new file mode 100644 index 000000000000..de5f2bce0568 --- /dev/null +++ b/pytorch/training/buildspec-arm64-2-7-ec2.yml @@ -0,0 +1,72 @@ +account_id: &ACCOUNT_ID +prod_account_id: &PROD_ACCOUNT_ID 763104351884 +region: ®ION +framework: &FRAMEWORK pytorch +version: &VERSION 2.7.0 +short_version: &SHORT_VERSION "2.7" +arch_type: arm64 +# autopatch_build: "True" + +repository_info: + training_repository: &TRAINING_REPOSITORY + image_type: &TRAINING_IMAGE_TYPE training + root: !join [ *FRAMEWORK, "/", *TRAINING_IMAGE_TYPE ] + repository_name: &REPOSITORY_NAME !join [ pr, "-", *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE, "-", arm64 ] + repository: &REPOSITORY !join [ *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ] + release_repository_name: &RELEASE_REPOSITORY_NAME !join [ *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE, "-", arm64 ] + release_repository: &RELEASE_REPOSITORY !join [ *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *RELEASE_REPOSITORY_NAME ] + +context: + training_context: &TRAINING_CONTEXT + start_cuda_compat: + source: docker/build_artifacts/start_cuda_compat.sh + target: start_cuda_compat.sh + dockerd_entrypoint: + source: 
docker/build_artifacts/dockerd_entrypoint.sh
+      target: dockerd_entrypoint.sh
+    changehostname:
+      source: docker/build_artifacts/changehostname.c
+      target: changehostname.c
+    start_with_right_hostname:
+      source: docker/build_artifacts/start_with_right_hostname.sh
+      target: start_with_right_hostname.sh
+    example_mnist_file:
+      source: docker/build_artifacts/mnist.py
+      target: mnist.py
+    deep_learning_container:
+      source: ../../src/deep_learning_container.py
+      target: deep_learning_container.py
+
+images:
+  # BuildEC2Arm64CPUPTTrainPy3DockerImage:
+  #   <<: *TRAINING_REPOSITORY
+  #   build: &PYTORCH_CPU_TRAINING_PY3 false
+  #   image_size_baseline: 6500
+  #   device_type: &DEVICE_TYPE cpu
+  #   python_version: &DOCKER_PYTHON_VERSION py3
+  #   tag_python_version: &TAG_PYTHON_VERSION py312
+  #   os_version: &OS_VERSION ubuntu22.04
+  #   tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ]
+  #   latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ]
+  #   # build_tag_override: "beta:2.6.0-cpu-py311-ubuntu22.04-ec2"
+  #   docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile.arm64., *DEVICE_TYPE ]
+  #   target: ec2
+  #   context:
+  #     <<: *TRAINING_CONTEXT
+  BuildEC2Arm64GPUPTTrainPy3cu128DockerImage:
+    <<: *TRAINING_REPOSITORY
+    build: &PYTORCH_GPU_TRAINING_PY3 false
+    image_size_baseline: 19700
+    device_type: &DEVICE_TYPE gpu
+    python_version: &DOCKER_PYTHON_VERSION py3
+    tag_python_version: &TAG_PYTHON_VERSION py312
+    cuda_version: &CUDA_VERSION cu128
+    os_version: &OS_VERSION ubuntu22.04
+    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
+    latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
+    # build_tag_override: "beta:2.6.0-gpu-py312-cu126-ubuntu22.04-ec2"
+    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile.arm64.,
+                         *DEVICE_TYPE ]
+    target: ec2
+    context:
+      <<: *TRAINING_CONTEXT
diff --git a/pytorch/training/buildspec-arm64.yml b/pytorch/training/buildspec-arm64.yml
index 73f01391a427..9d949d737ea0 100644
--- a/pytorch/training/buildspec-arm64.yml
+++ b/pytorch/training/buildspec-arm64.yml
@@ -1 +1 @@
-buildspec_pointer: buildspec-arm64-2-5-ec2.yml
+buildspec_pointer: buildspec-arm64-2-7-ec2.yml
diff --git a/pytorch/training/docker/2.7/py3/cu128/Dockerfile.arm64.gpu b/pytorch/training/docker/2.7/py3/cu128/Dockerfile.arm64.gpu
new file mode 100644
index 000000000000..56334767f761
--- /dev/null
+++ b/pytorch/training/docker/2.7/py3/cu128/Dockerfile.arm64.gpu
@@ -0,0 +1,309 @@
+ARG PYTHON=python3
+ARG PYTHON_VERSION=3.12.8
+ARG PYTHON_SHORT_VERSION=3.12
+
+ARG CUDA_VERSION=12.8.0
+ARG CUDNN_VERSION=9.8.0.87
+ARG NCCL_VERSION=2.26.2
+### update to 1.40 later
+ARG EFA_VERSION=1.38.0
+ARG GDRCOPY_VERSION=2.4.4
+ARG TE_VERSION=2.0
+ARG FLASH_ATTN_VERSION=2.7.3
+
+# PyTorch Binaries
+ARG TORCH_URL=https://framework-binaries.s3.us-west-2.amazonaws.com/pytorch/v2.7.0/arm64/cu128/torch-2.7.0%2Bcu128-cp312-cp312-manylinux_2_28_aarch64.whl
+ARG TORCHVISION_URL=https://framework-binaries.s3.us-west-2.amazonaws.com/pytorch/v2.7.0/arm64/cu128/torchvision-0.22.0%2Bcu128-cp312-cp312-linux_aarch64.whl
+ARG TORCHAUDIO_URL=https://framework-binaries.s3.us-west-2.amazonaws.com/pytorch/v2.7.0/arm64/cu128/torchaudio-2.7.0%2Bcu128-cp312-cp312-linux_aarch64.whl
+ARG 
TORCHTEXT_URL=https://framework-binaries.s3.us-west-2.amazonaws.com/pytorch/v2.7.0/arm64/cu128/torchtext-0.18.0%2Bcu128-cp312-cp312-linux_aarch64.whl +ARG TORCHDATA_URL=https://framework-binaries.s3.us-west-2.amazonaws.com/pytorch/v2.7.0/arm64/cu128/torchdata-0.11.0%2Bcu128-py3-none-any.whl + +######################################################## +# _____ ____ ____ ___ +# | ____/ ___|___ \ |_ _|_ __ ___ __ _ __ _ ___ +# | _|| | __) | | || '_ ` _ \ / _` |/ _` |/ _ \ +# | |__| |___ / __/ | || | | | | | (_| | (_| | __/ +# |_____\____|_____| |___|_| |_| |_|\__,_|\__, |\___| +# |___/ +# ____ _ +# | _ \ ___ ___(_)_ __ ___ +# | |_) / _ \/ __| | '_ \ / _ \ +# | _ < __/ (__| | |_) | __/ +# |_| \_\___|\___|_| .__/ \___| +# |_| +######################################################## +FROM --platform=linux/arm64 nvidia/cuda:12.8.0-base-ubuntu22.04 AS ec2 + +LABEL maintainer="Amazon AI" +LABEL dlc_major_version="1" + +ARG PYTHON +ARG PYTHON_VERSION +ARG PYTHON_SHORT_VERSION + +ARG CUDA_VERSION +ARG CUDNN_VERSION +ARG NCCL_VERSION +ARG EFA_VERSION +ARG GDRCOPY_VERSION +ARG TE_VERSION +ARG FLASH_ATTN_VERSION + +ARG TORCH_URL +ARG TORCHVISION_URL +ARG TORCHAUDIO_URL +ARG TORCHTEXT_URL +ARG TORCHDATA_URL + +ENV DEBIAN_FRONTEND=noninteractive +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONUNBUFFERED=1 +ENV PYTHONIOENCODING=UTF-8 +ENV LANG=C.UTF-8 +ENV LC_ALL=C.UTF-8 + +ENV CUDA_HOME="/usr/local/cuda" +ENV LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" +ENV LD_LIBRARY_PATH="/lib/aarch64-linux-gnu:${LD_LIBRARY_PATH}" +ENV PATH="${CUDA_HOME}/bin:${PATH}" +ENV EFA_PATH="/opt/amazon/efa" +ENV OPEN_MPI_PATH="/opt/amazon/openmpi" +ENV TORCH_NVCC_FLAGS="-Xfatbin -compress-all" + +# Graviton Optimization +ENV LRU_CACHE_CAPACITY=1024 \ + THP_MEM_ALLOC_ENABLE=1 \ + DNNL_DEFAULT_FPMATH_MODE=BF16 + +ENV DLC_CONTAINER_TYPE=training +WORKDIR / + +RUN apt-get update \ + && apt-get -y upgrade --only-upgrade systemd \ + && apt-get install -y --allow-change-held-packages --no-install-recommends \ + automake \ + build-essential \ + ca-certificates \ + cmake \ + curl \ + wget \ + scons \ + unzip \ + emacs \ + vim \ + git \ + jq \ + cuda-toolkit-12=${CUDA_VERSION}-1 \ + libcudnn9-cuda-12=${CUDNN_VERSION}-1 \ + libcudnn9-dev-cuda-12=${CUDNN_VERSION}-1 \ + libgl1-mesa-glx \ + libglib2.0-0 \ + libsm6 \ + libxext6 \ + libxrender-dev \ + openjdk-17-jdk \ + openssl \ + libssl-dev \ + libbz2-dev \ + libreadline-dev \ + libsqlite3-dev \ + llvm \ + libncurses5-dev \ + libncursesw5-dev \ + xz-utils \ + tk-dev \ + liblzma-dev \ + zlib1g-dev \ + libjpeg-dev \ + libpng-dev \ + libffi-dev \ + && rm -rf /var/lib/apt/lists/* \ + && apt-get clean + +# Install EFA +RUN mkdir /tmp/efa \ +&& cd /tmp/efa \ +&& curl -O https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-${EFA_VERSION}.tar.gz \ +&& tar -xf aws-efa-installer-${EFA_VERSION}.tar.gz \ +&& cd aws-efa-installer \ +&& apt-get update \ +&& ./efa_installer.sh -y --skip-kmod --skip-limit-conf --no-verify \ +&& rm -rf /tmp/efa \ +&& rm -rf /var/lib/apt/lists/* \ +&& apt-get clean + +ENV PATH="${OPEN_MPI_PATH}/bin:${EFA_PATH}/bin:${PATH}" +ENV LD_LIBRARY_PATH="${OPEN_MPI_PATH}/lib:${EFA_PATH}/lib:${LD_LIBRARY_PATH}" + +# Configure Open MPI and configure NCCL parameters +RUN mv ${OPEN_MPI_PATH}/bin/mpirun ${OPEN_MPI_PATH}/bin/mpirun.real \ + && echo '#!/bin/bash' > ${OPEN_MPI_PATH}/bin/mpirun \ + && echo "${OPEN_MPI_PATH}/bin/mpirun.real --allow-run-as-root \"\$@\"" >> ${OPEN_MPI_PATH}/bin/mpirun \ + && chmod a+x ${OPEN_MPI_PATH}/bin/mpirun \ + && echo 
"hwloc_base_binding_policy = none" >> ${OPEN_MPI_PATH}/etc/openmpi-mca-params.conf \ + && echo "rmaps_base_mapping_policy = slot" >> ${OPEN_MPI_PATH}/etc/openmpi-mca-params.conf \ + && echo NCCL_DEBUG=INFO >> /etc/nccl.conf \ + && echo NCCL_SOCKET_IFNAME=^docker0 >> /etc/nccl.conf + +# Install OpenSSH for MPI to communicate between containers, allow OpenSSH to talk to containers without asking for confirmation +RUN apt-get update \ + && apt-get install -y --no-install-recommends openssh-client openssh-server \ + && mkdir -p /var/run/sshd \ + && cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new \ + && echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new \ + && mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config \ + && rm -rf /var/lib/apt/lists/* \ + && apt-get clean + +# Configure OpenSSH so that nodes can communicate with each other +RUN mkdir -p /var/run/sshd \ + && sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd + +RUN rm -rf /root/.ssh/ \ + && mkdir -p /root/.ssh/ \ + && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \ + && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \ + && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config + +# install python +RUN cd /tmp/ \ +&& wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz \ +&& tar xzf Python-${PYTHON_VERSION}.tgz \ +&& cd Python-${PYTHON_VERSION} \ +&& ./configure --enable-optimizations --with-lto --with-computed-gotos --with-system-ffi \ +&& make -j "$(nproc)" \ +&& make altinstall \ +&& cd .. \ +&& rm -rf Python-${PYTHON_VERSION} \ +&& rm Python-${PYTHON_VERSION}.tgz \ +&& ln -s /usr/local/bin/python${PYTHON_SHORT_VERSION} /usr/local/bin/python \ +&& ln -s /usr/local/bin/python${PYTHON_SHORT_VERSION} /usr/local/bin/python3 \ +# This installation generate a .python_history file in the root directory leads sanity check to fail +&& rm -f /root/.python_history + +# Python Path +ENV PATH="/usr/local/bin:${PATH}" + +# this will add pip systemlink to pip${PYTHON_SHORT_VERSION} +RUN python -m pip install --upgrade pip --trusted-host pypi.org --trusted-host files.pythonhosted.org + +# Install pip packages +RUN pip install --no-cache-dir \ + cython \ + boto3 \ + scipy \ + opencv-python \ + numpy \ + pyopenssl \ + cryptography \ + ipython \ + parso \ + awscli \ + urllib3 \ + idna \ + tqdm \ + requests \ + mpi4py \ + packaging \ + ninja \ + pybind11 + +# Install PyTorch +RUN pip install --no-cache-dir -U \ + ${TORCH_URL} \ + ${TORCHVISION_URL} \ + ${TORCHAUDIO_URL} \ + ${TORCHTEXT_URL} \ + ${TORCHDATA_URL} \ + torchtnt \ + s3torchconnector \ + accelerate + +# Install GDRCopy +RUN cd /tmp \ +&& git clone https://github.com/NVIDIA/gdrcopy.git -b v${GDRCOPY_VERSION} \ +&& cd gdrcopy \ +&& sed -ie '12s@$@ -L $(CUDA)/lib64/stubs@' tests/Makefile \ +&& CUDA=${CUDA_HOME} make install \ +&& rm -rf /tmp/gdrcopy + +# Install NCCL +RUN cd /tmp \ + && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION}-1 \ + && cd nccl \ + && make -j64 src.build BUILDDIR=/usr/local \ + && rm -rf /tmp/nccl +# preload system nccl for PyTorch to use if it is dynamically linking NCCL +ENV LD_PRELOAD="/usr/local/lib/libnccl.so" + +# Install flash attn and NVIDIA transformer engine. 
+# Optionally set NVTE_FRAMEWORK to avoid bringing in additional frameworks during TE install +ENV NVTE_FRAMEWORK=pytorch +# Install flash-attn using instructions from https://github.com/Dao-AILab/flash-attention#installation-and-features +# Set MAX_JOBS=4 to avoid OOM issues in installation process +RUN MAX_JOBS=4 pip install --no-cache-dir flash-attn==${FLASH_ATTN_VERSION} --no-build-isolation --verbose +# Install TE using instructions from https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/installation.html +RUN pip install --no-cache-dir git+https://github.com/NVIDIA/TransformerEngine.git@release_v${TE_VERSION} --no-build-isolation + +# OSS compliance +RUN HOME_DIR=/root \ + && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \ + && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \ + && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \ + && chmod +x /usr/local/bin/testOSSCompliance \ + && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \ + && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \ + && cp ${HOME_DIR}/oss_compliance/build_from_source_packages/BUILD_FROM_SOURCE_PACKAGES_LICENCES_AARCH64_IMAGES ${HOME_DIR} \ + && rm -rf ${HOME_DIR}/oss_compliance* + +# add license +RUN curl -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-2.6/license.txt + +# add telemetry +COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py +COPY sitecustomize.py /usr/local/lib/${PYTHON_SHORT_VERSION}/sitecustomize.py +RUN chmod +x /usr/local/bin/deep_learning_container.py + +COPY start_cuda_compat.sh /usr/local/bin/start_cuda_compat.sh +RUN chmod +x /usr/local/bin/start_cuda_compat.sh + +COPY dockerd_entrypoint.sh /usr/local/bin/dockerd_entrypoint.sh +RUN chmod +x /usr/local/bin/dockerd_entrypoint.sh + +# Cleanup +RUN pip cache purge \ + && rm -rf /tmp/tmp* \ + && rm -iRf /root/.cache + +ENTRYPOINT ["bash", "-m", "dockerd_entrypoint.sh"] +CMD ["/bin/bash"] + +################################################################# +# ____ __ __ _ +# / ___| __ _ __ _ ___| \/ | __ _| | _____ _ __ +# \___ \ / _` |/ _` |/ _ \ |\/| |/ _` | |/ / _ \ '__| +# ___) | (_| | (_| | __/ | | | (_| | < __/ | +# |____/ \__,_|\__, |\___|_| |_|\__,_|_|\_\___|_| +# |___/ +# ___ ____ _ +# |_ _|_ __ ___ __ _ __ _ ___ | _ \ ___ ___(_)_ __ ___ +# | || '_ ` _ \ / _` |/ _` |/ _ \ | |_) / _ \/ __| | '_ \ / _ \ +# | || | | | | | (_| | (_| | __/ | _ < __/ (__| | |_) | __/ +# |___|_| |_| |_|\__,_|\__, |\___| |_| \_\___|\___|_| .__/ \___| +# |___/ |_| +# +################################################################# + +# FROM ec2 AS sagemaker + +# LABEL maintainer="Amazon AI" +# LABEL dlc_major_version="1" + +# ENV SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main + +# ARG PYTHON + +# # Cleanup +# RUN pip cache purge +# && rm -rf /tmp/tmp* \ +# && rm -iRf /root/.cache diff --git a/test/dlc_tests/conftest.py b/test/dlc_tests/conftest.py index 03b98e54c2df..d3118761d852 100644 --- a/test/dlc_tests/conftest.py +++ b/test/dlc_tests/conftest.py @@ -62,6 +62,7 @@ "pytorch_training___1__13", "pytorch_training_habana", "pytorch_training_arm64", + "pytorch_training_arm64___2__7", "pytorch_inference", "pytorch_inference_eia", "pytorch_inference_neuron", @@ -348,11 +349,7 @@ def ec2_instance_role_name(request): @pytest.fixture(scope="function") def ec2_instance_ami(request, region): - return ( - request.param - if hasattr(request, "param") - 
else test_utils.get_instance_type_base_dlami(region) - ) + return request.param if hasattr(request, "param") else test_utils.get_dlami_id(region) @pytest.fixture(scope="function") @@ -1494,6 +1491,9 @@ def pytest_configure(config): config.addinivalue_line("markers", "skip_trcomp_containers(): mark test to skip on trcomp dlcs") config.addinivalue_line("markers", "deep_canary(): explicitly mark to run as deep canary test") config.addinivalue_line("markers", "team(team_name): mark tests that belong to a team") + config.addinivalue_line( + "markers", "skip_serialized_release_pt_test(): mark to skip test included in serial testing" + ) def pytest_runtest_setup(item): diff --git a/test/dlc_tests/ec2/pytorch/training/common_cases.py b/test/dlc_tests/ec2/pytorch/training/common_cases.py index 0207805cee32..8841b113e78d 100644 --- a/test/dlc_tests/ec2/pytorch/training/common_cases.py +++ b/test/dlc_tests/ec2/pytorch/training/common_cases.py @@ -57,6 +57,10 @@ filter_function=ec2_utils.filter_efa_instance_type, ) +PT_EC2_GPU_ARM64_INSTANCE_TYPE = get_ec2_instance_type( + default="g5g.16xlarge", processor="gpu", arch_type="arm64" +) + def pytorch_standalone(pytorch_training, ec2_connection): """ diff --git a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training.py b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training.py index bd61396585be..1cd7a8bba47c 100644 --- a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training.py +++ b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training.py @@ -52,6 +52,9 @@ PT_INDUCTOR_TEST_INSTANCE_TYPE = get_ec2_instance_type(default="g4dn.12xlarge", processor="gpu") PT_EC2_GPU_INSTANCE_TYPE = get_ec2_instance_type(default="g4dn.8xlarge", processor="gpu") +PT_EC2_GPU_ARM64_INSTANCE_TYPE = get_ec2_instance_type( + default="g5g.8xlarge", processor="gpu", arch_type="arm64" +) PT_EC2_MULTI_GPU_NO_G_INSTANCE_TYPE = get_ec2_instance_type( default="g5.12xlarge", processor="gpu", @@ -114,6 +117,24 @@ def test_pytorch_train_mnist_cpu_deep_canary(pytorch_training, ec2_connection, c execute_ec2_training_test(ec2_connection, pytorch_training, PT_MNIST_CMD) +@pytest.mark.usefixtures("sagemaker") +@pytest.mark.skipif( + not test_utils.is_deep_canary_context() or not os.getenv("REGION") == "us-west-2", + reason="This test only needs to run in deep-canary context in us-west-2", +) +@pytest.mark.deep_canary("Reason: This test is a simple pytorch training mnist test") +@pytest.mark.model("mnist") +@pytest.mark.parametrize("ec2_instance_type", PT_EC2_GPU_ARM64_INSTANCE_TYPE, indirect=True) +@pytest.mark.parametrize( + "ec2_instance_ami", [test_utils.AL2023_BASE_DLAMI_ARM64_US_WEST_2], indirect=True +) +@pytest.mark.team("conda") +def test_pytorch_train_mnist_arm64_gpu_deep_canary( + pytorch_training_arm64, ec2_connection, gpu_only, ec2_instance_type +): + execute_ec2_training_test(ec2_connection, pytorch_training_arm64, PT_MNIST_CMD) + + @pytest.mark.parametrize("ec2_instance_ami", [test_utils.UL22_BASE_NEURON_US_WEST_2], indirect=True) @pytest.mark.parametrize("ec2_instance_type", PT_EC2_NEURON_TRN1_INSTANCE_TYPE, indirect=True) @pytest.mark.integration("pytorch_neuronx_sanity_test") diff --git a/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_arm64_2_7.py b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_arm64_2_7.py new file mode 100644 index 000000000000..b9c88aa29398 --- /dev/null +++ b/test/dlc_tests/ec2/pytorch/training/test_pytorch_training_arm64_2_7.py @@ -0,0 +1,98 @@ +import pytest + +import test.test_utils as test_utils + +from 
test.test_utils import ec2 + +from test.dlc_tests.ec2.pytorch.training import common_cases +from test.dlc_tests.ec2 import smclarify_cases + + +@pytest.mark.usefixtures("sagemaker") +@pytest.mark.integration("pytorch_gpu_tests") +@pytest.mark.model("N/A") +@pytest.mark.team("conda") +@pytest.mark.parametrize( + "ec2_instance_type", common_cases.PT_EC2_GPU_ARM64_INSTANCE_TYPE, indirect=True +) +@pytest.mark.parametrize( + "ec2_instance_ami", [test_utils.AL2023_BASE_DLAMI_ARM64_US_WEST_2], indirect=True +) +def test_pytorch_2_7_gpu( + pytorch_training_arm64___2__7, ec2_connection, region, gpu_only, ec2_instance_type +): + pytorch_training = pytorch_training_arm64___2__7 + + test_cases = [ + (common_cases.pytorch_standalone, (pytorch_training, ec2_connection)), + (common_cases.pytorch_training_mnist, (pytorch_training, ec2_connection)), + (common_cases.pytorch_linear_regression_gpu, (pytorch_training, ec2_connection)), + (common_cases.pytorch_nccl, (pytorch_training, ec2_connection)), + # (common_cases.pytorch_mpi, (pytorch_training, ec2_connection)), + (common_cases.pytorch_training_torchaudio, (pytorch_training, ec2_connection)), + (common_cases.pytorch_training_torchdata, (pytorch_training, ec2_connection)), + (common_cases.pytorch_cudnn_match_gpu, (pytorch_training, ec2_connection, region)), + (common_cases.pytorch_curand_gpu, (pytorch_training, ec2_connection)), + ] + + if "sagemaker" in pytorch_training: + test_cases.append( + (smclarify_cases.smclarify_metrics_gpu, (pytorch_training, ec2_connection)), + ) + + # AMP must be run on multi_gpu + if ec2.is_instance_multi_gpu(ec2_instance_type): + test_cases.append((common_cases.pytorch_amp, (pytorch_training, ec2_connection))) + + test_utils.execute_serial_test_cases(test_cases, test_description="PT 2.7 GPU") + + +# @pytest.mark.usefixtures("sagemaker") +# @pytest.mark.integration("pytorch_gpu_heavy_tests") +# @pytest.mark.model("N/A") +# @pytest.mark.team("conda") +# @pytest.mark.parametrize( +# "ec2_instance_type", common_cases.PT_EC2_HEAVY_GPU_ARM64_INSTANCE_TYPE, indirect=True +# ) +# @pytest.mark.parametrize( +# "ec2_instance_ami", [test_utils.AL2023_BASE_DLAMI_ARM64_US_WEST_2], indirect=True +# ) +# @pytest.mark.skipif( +# test_utils.is_pr_context() and not ec2.are_heavy_instance_ec2_tests_enabled(), +# reason="Skip GPU Heavy tests in PR context unless explicitly enabled", +# ) +# def test_pytorch_2_7_gpu_heavy( +# pytorch_training_arm64___2__7, ec2_connection, region, gpu_only, ec2_instance_type +# ): +# pytorch_training = pytorch_training_arm64___2__7 + +# test_cases = [ +# (common_cases.pytorch_gdrcopy, (pytorch_training, ec2_connection)), +# (common_cases.pytorch_transformer_engine, (pytorch_training, ec2_connection)), +# ] + +# test_utils.execute_serial_test_cases(test_cases, test_description="PT 2.7 GPU Heavy") + + +@pytest.mark.usefixtures("sagemaker") +@pytest.mark.integration("inductor") +@pytest.mark.model("N/A") +@pytest.mark.team("training-compiler") +@pytest.mark.parametrize( + "ec2_instance_type", common_cases.PT_EC2_GPU_ARM64_INSTANCE_TYPE, indirect=True +) +@pytest.mark.parametrize( + "ec2_instance_ami", [test_utils.AL2023_BASE_DLAMI_ARM64_US_WEST_2], indirect=True +) +def test_pytorch_2_7_gpu_inductor( + pytorch_training_arm64___2__7, ec2_connection, region, gpu_only, ec2_instance_type +): + pytorch_training = pytorch_training_arm64___2__7 + + test_cases = [ + # (common_cases.pytorch_mpi_inductor_gpu, (pytorch_training, ec2_connection)), + (common_cases.pytorch_nccl_inductor, (pytorch_training, ec2_connection)), + 
(common_cases.pytorch_amp_inductor, (pytorch_training, ec2_connection)),
+    ]
+
+    test_utils.execute_serial_test_cases(test_cases, test_description="PT 2.7 GPU Inductor")
diff --git a/test/dlc_tests/ec2/test_curand.py b/test/dlc_tests/ec2/test_curand.py
index f9e54100b92d..17d3b2be03ee 100644
--- a/test/dlc_tests/ec2/test_curand.py
+++ b/test/dlc_tests/ec2/test_curand.py
@@ -24,6 +24,6 @@ def test_curand_gpu(training, ec2_connection, gpu_only, ec2_instance_type):
     if test_utils.is_image_incompatible_with_instance_type(training, ec2_instance_type):
         pytest.skip(f"Image {training} is incompatible with instance type {ec2_instance_type}")
 
-    if is_tf_version("1", training) or "mxnet" in training:
-        pytest.skip("Test is not configured for TF1 and MXNet")
+    if is_tf_version("1", training) or "mxnet" in training or "arm64" in training:
+        pytest.skip("Test is not configured for TF1, MXNet, or ARM64")
     execute_ec2_training_test(ec2_connection, training, CURAND_CMD)
diff --git a/test/test_utils/__init__.py b/test/test_utils/__init__.py
index 1df85a79850c..61751c935994 100644
--- a/test/test_utils/__init__.py
+++ b/test/test_utils/__init__.py
@@ -119,7 +119,6 @@ def get_ami_id_ssm(region_name, parameter_path):
     region_name="us-east-1",
     parameter_path="/aws/service/deeplearning/ami/x86_64/base-oss-nvidia-driver-gpu-amazon-linux-2023/latest/ami-id",
 )
-# We use the following DLAMI for MXNet and TensorFlow tests as well, but this is ok since we use custom DLC Graviton containers on top. We just need an ARM base DLAMI.
 AL2023_BASE_DLAMI_ARM64_US_WEST_2 = get_ami_id_ssm(
     region_name="us-west-2",
     parameter_path="/aws/service/deeplearning/ami/arm64/base-oss-nvidia-driver-gpu-amazon-linux-2023/latest/ami-id",
@@ -1648,7 +1647,7 @@ def setup_sm_benchmark_tf_train_env(resources_location, setup_tf1_env, setup_tf2
     ).stdout.strip("\n")
     system = ctx.run("uname -s").stdout.strip("\n")
     sed_input_arg = "'' " if system == "Darwin" else ""
-    ctx.run(f"sed -i {sed_input_arg}'s/\[2, 1, 0\]/\[2, 1, 1\]/g' {estimator_location}")
+    ctx.run(rf"sed -i {sed_input_arg}'s/\[2, 1, 0\]/\[2, 1, 1\]/g' {estimator_location}")
     return venv_dir
 
 
@@ -2494,7 +2493,7 @@ def get_image_spec_from_buildspec(image_uri, dlc_folder_path):
     return matched_image_spec
 
 
-def get_instance_type_base_dlami(region):
+def get_dlami_id(region):
     """
     Returns the appropriate base DLAMI based on region.
     Args:
diff --git a/test/test_utils/sagemaker.py b/test/test_utils/sagemaker.py
index cdf653854420..6dadcfff0aca 100644
--- a/test/test_utils/sagemaker.py
+++ b/test/test_utils/sagemaker.py
@@ -32,7 +32,7 @@
     AL2023_HOME_DIR,
     DEFAULT_REGION,
     is_nightly_context,
-    get_instance_type_base_dlami,
+    get_dlami_id,
     login_to_ecr_registry,
 )
 from test_utils.pytest_cache import PytestCache
@@ -101,7 +101,7 @@ def assign_sagemaker_local_test_ami(image, region):
         else:
             return AL2023_BASE_DLAMI_ARM64_US_WEST_2
     else:
-        return get_instance_type_base_dlami(region)
+        return get_dlami_id(region)
 
 
 def launch_sagemaker_local_ec2_instance(image, ec2_key_name, region):