Skip to content

[PyTorch][ARM64][Training][EC2] PT 2.7.0 Release #4757

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 10 additions & 10 deletions dlc_developer_config.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ neuronx_mode = false
graviton_mode = false
# Please only set it to true if you are preparing an ARM64-related PR
# Do remember to revert it back to false before merging any PR (including ARM64 dedicated PR)
arm64_mode = false
arm64_mode = true
# Please only set it to true if you are preparing a HABANA-related PR
# Do remember to revert it back to False before merging any PR (including HABANA dedicated PR)
habana_mode = false
Expand All @@ -37,16 +37,16 @@ deep_canary_mode = false
[build]
# Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image.
# available frameworks - ["autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"]
build_frameworks = []
build_frameworks = ["pytorch"]


# By default we build both training and inference containers. Set true/false values to determine which to build.
build_training = true
build_inference = true
build_inference = false

# Set do_build to "false" to skip builds and test the latest image built by this PR
# Note: at least one build is required to set do_build to "false"
do_build = true
do_build = false

[notify]
### Notify on test failures
Expand All @@ -57,12 +57,12 @@ notify_test_failures = false

[test]
### On by default
sanity_tests = true
security_tests = true
sanity_tests = false
security_tests = false
safety_check_test = false
ecr_scan_allowlist_feature = false
ecs_tests = true
eks_tests = true
ecs_tests = false
eks_tests = false
ec2_tests = true
# Set it to true if you are preparing a Benchmark related PR
ec2_benchmark_tests = false
Expand All @@ -74,10 +74,10 @@ ec2_benchmark_tests = false
ec2_tests_on_heavy_instances = false
### SM specific tests
### On by default
sagemaker_local_tests = true
sagemaker_local_tests = false

# run standard sagemaker remote tests from test/sagemaker_tests
sagemaker_remote_tests = true
sagemaker_remote_tests = false
# run efa sagemaker tests
sagemaker_efa_tests = false
# run release_candidate_integration tests
Expand Down
72 changes: 72 additions & 0 deletions pytorch/training/buildspec-arm64-2-7-ec2.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# Shared scalar values for this buildspec. Every anchor (&NAME) declared here
# is reused through aliases (*NAME) in the repository_info and images sections.

# Placeholder value; presumably substituted from the environment by the build
# tooling before use -- TODO confirm against the loader.
account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
# Account used to build the release_repository URI.
prod_account_id: &PROD_ACCOUNT_ID 763104351884
# Placeholder value, same convention as account_id.
region: &REGION <set-$REGION-in-environment>
# Framework name; reused in the repository root and repository names.
framework: &FRAMEWORK pytorch
# Full framework version; reused as the leading component of image tags.
version: &VERSION 2.7.0
# Quoted so it stays the string "2.7" instead of being read as a float;
# reused as a path component of docker_file.
short_version: &SHORT_VERSION "2.7"
arch_type: arm64
# autopatch_build: "True"

# Repository coordinates for the training image. The TRAINING_REPOSITORY anchor
# is merged into each entry under `images` via `<<: *TRAINING_REPOSITORY`, so
# it must anchor the mapping of fields below (not an empty node).
#
# NOTE(review): `!join` is a custom tag resolved by the consuming loader;
# presumably it concatenates the listed items into one string -- confirm.
repository_info:
  training_repository: &TRAINING_REPOSITORY
    image_type: &TRAINING_IMAGE_TYPE training
    root: !join [ *FRAMEWORK, "/", *TRAINING_IMAGE_TYPE ]
    # Repository name used for PR builds (note the leading "pr" component).
    repository_name: &REPOSITORY_NAME !join [
      pr, "-", *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE, "-", arm64 ]
    # PR repository URI, built from the account_id and region values.
    repository: &REPOSITORY !join [
      *ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/, *REPOSITORY_NAME ]
    # Release repository name: same components without the "pr" prefix.
    release_repository_name: &RELEASE_REPOSITORY_NAME !join [
      *FRAMEWORK, "-", *TRAINING_IMAGE_TYPE, "-", arm64 ]
    # Release repository URI, built from prod_account_id instead of account_id.
    release_repository: &RELEASE_REPOSITORY !join [
      *PROD_ACCOUNT_ID, .dkr.ecr., *REGION, .amazonaws.com/,
      *RELEASE_REPOSITORY_NAME ]

# Build-context artifacts shared by the image definitions. The TRAINING_CONTEXT
# anchor is merged into each image's own `context` mapping via
# `<<: *TRAINING_CONTEXT`, so every artifact must be nested beneath it.
#
# Each artifact entry pairs a `source` path with a `target` file name.
# NOTE(review): presumably `source` is resolved relative to this buildspec's
# directory and `target` is the name inside the Docker build context -- confirm.
context:
  training_context: &TRAINING_CONTEXT
    start_cuda_compat:
      source: docker/build_artifacts/start_cuda_compat.sh
      target: start_cuda_compat.sh
    dockerd_entrypoint:
      source: docker/build_artifacts/dockerd_entrypoint.sh
      target: dockerd_entrypoint.sh
    changehostname:
      source: docker/build_artifacts/changehostname.c
      target: changehostname.c
    start_with_right_hostname:
      source: docker/build_artifacts/start_with_right_hostname.sh
      target: start_with_right_hostname.sh
    example_mnist_file:
      source: docker/build_artifacts/mnist.py
      target: mnist.py
    # The only artifact sourced from outside this framework directory.
    deep_learning_container:
      source: ../../src/deep_learning_container.py
      target: deep_learning_container.py

# Image definitions. Each entry merges the shared repository fields
# (TRAINING_REPOSITORY) and adds its own build parameters; its nested `context`
# merges the shared artifact list (TRAINING_CONTEXT).
images:
  # CPU image definition, currently disabled (kept for reference).
  # BuildEC2Arm64CPUPTTrainPy3DockerImage:
  #   <<: *TRAINING_REPOSITORY
  #   build: &PYTORCH_CPU_TRAINING_PY3 false
  #   image_size_baseline: 6500
  #   device_type: &DEVICE_TYPE cpu
  #   python_version: &DOCKER_PYTHON_VERSION py3
  #   tag_python_version: &TAG_PYTHON_VERSION py312
  #   os_version: &OS_VERSION ubuntu22.04
  #   tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ]
  #   latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ]
  #   # build_tag_override: "beta:2.6.0-cpu-py311-ubuntu22.04-ec2"
  #   docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile.arm64., *DEVICE_TYPE ]
  #   target: ec2
  #   context:
  #     <<: *TRAINING_CONTEXT

  # NOTE(review): the key below still says "cu126" while cuda_version is cu128
  # (so the tag and docker_file path use cu128). Looks like a leftover from an
  # earlier buildspec; rename to ...cu128... once it is confirmed that nothing
  # depends on the key name.
  BuildEC2Arm64GPUPTTrainPy3cu126DockerImage:
    <<: *TRAINING_REPOSITORY
    build: &PYTORCH_GPU_TRAINING_PY3 false
    image_size_baseline: 19700
    device_type: &DEVICE_TYPE gpu
    python_version: &DOCKER_PYTHON_VERSION py3
    tag_python_version: &TAG_PYTHON_VERSION py312
    cuda_version: &CUDA_VERSION cu128
    os_version: &OS_VERSION ubuntu22.04
    # tag and latest_release_tag are assembled from the same components:
    # version, device type, Python tag, CUDA version, OS version, "-ec2".
    tag: !join [
      *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-",
      *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
    latest_release_tag: !join [
      *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-",
      *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
    # build_tag_override: "beta:2.6.0-gpu-py312-cu126-ubuntu22.04-ec2"
    # Dockerfile path is assembled from short version, Python version, CUDA
    # version and device type.
    docker_file: !join [
      docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION,
      /Dockerfile.arm64., *DEVICE_TYPE ]
    target: ec2
    context:
      <<: *TRAINING_CONTEXT
2 changes: 1 addition & 1 deletion pytorch/training/buildspec-arm64.yml
Original file line number Diff line number Diff line change
@@ -1 +1 @@
buildspec_pointer: buildspec-arm64-2-5-ec2.yml
buildspec_pointer: buildspec-arm64-2-7-ec2.yml
Loading