1+ #! /bin/bash
2+
3+ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
4+ # SPDX-License-Identifier: MIT-0
5+ #
6+ # Permission is hereby granted, free of charge, to any person obtaining a copy of this
7+ # software and associated documentation files (the "Software"), to deal in the Software
8+ # without restriction, including without limitation the rights to use, copy, modify,
9+ # merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
10+ # permit persons to whom the Software is furnished to do so.
11+ #
12+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
13+ # INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
14+ # PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
15+ # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
16+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
17+ # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# Abort on the first failing command (-e), trace every command (-x) and make a
# pipeline fail when any of its stages fails (pipefail).
set -exo pipefail

# Name of the CloudFormation stack whose outputs are queried further down.
# Keeps a value already present in the environment, otherwise falls back to
# the default. No whitespace may surround the default: it would become part
# of the stack name.
: "${STACK_ID_VPC:=sagemaker-hyperpod}"
20+
# Usage fragments printed by -h/--help. Keep this list in sync with the
# options handled in parse_args (which also accepts -c/--instance-count).
declare -a HELP=(
  "[-h|--help]"
  "[-r|--region]"
  "[-p|--profile]"
  "[-s|--stack-id-vpc]"
  "[-i|--instance-type]"
  "[-c|--instance-count]"
  "[-a|--availability-zone]"
  "[-d|--dry-run]"
  "CLUSTER_NAME"
)

# Global AWS CLI options (--region / --profile) collected by parse_args.
declare -a aws_cli_args=()

# Set to 1 by -d/--dry-run; the final create-cluster call is skipped then.
DRY_RUN=0
33+
34+
#######################################
# Parse the command line into the script's global settings.
# Globals:
#   HELP          (read)    usage fragments shown by -h/--help
#   aws_cli_args  (written) receives "--region X" / "--profile Y" pairs
#   AWS_REGION, STACK_ID_VPC, INSTANCE, INSTANCE_COUNT, AZ, DRY_RUN,
#   CLUSTER_NAME  (written) one per recognised option / positional argument
# Arguments:
#   The script's positional parameters ("$@").
# Outputs:
#   Usage text on stdout for -h/--help; error messages on stderr.
# Returns:
#   0 when all arguments were consumed. Exits 0 after printing help and
#   exits 1 on an option without its value or on an unknown option.
#######################################
parse_args() {
  local key
  while [[ $# -gt 0 ]]; do
    key="$1"

    # Value-taking options: verify the value exists up front. A bare
    # `shift 2` with a single remaining argument fails without shifting,
    # so the loop would never advance.
    case "${key}" in
      -r|--region|-p|--profile|-s|--stack-id-vpc|-i|--instance-type|-c|--instance-count|-a|--availability-zone)
        if [[ $# -lt 2 ]]; then
          echo "[ERROR] option ${key} requires a value" >&2
          exit 1
        fi
        ;;
    esac

    case "${key}" in
      -h|--help)
        echo "Create a HyperPod Cluster with single partition."
        echo "It requires sagemaker-hyperpod CloudFormation stack to be deployed."
        echo "Usage: $(basename "${BASH_SOURCE[0]}") ${HELP[*]}"
        exit 0
        ;;
      -r|--region)
        aws_cli_args+=(--region "$2")
        AWS_REGION="$2"
        shift 2
        ;;
      -p|--profile)
        aws_cli_args+=(--profile "$2")
        shift 2
        ;;
      -s|--stack-id-vpc)
        STACK_ID_VPC="$2"
        shift 2
        ;;
      -i|--instance-type)
        INSTANCE="$2"
        shift 2
        ;;
      -c|--instance-count)
        # The rest of the script reads INSTANCE_COUNT (singular).
        INSTANCE_COUNT="$2"
        shift 2
        ;;
      -a|--availability-zone)
        # -a matches the usage text; -i belongs to --instance-type.
        AZ="$2"
        shift 2
        ;;
      -d|--dry-run)
        DRY_RUN=1
        shift
        ;;
      -*)
        # Reject typos instead of silently using them as the cluster name.
        echo "[ERROR] unknown option: ${key}" >&2
        exit 1
        ;;
      *)
        CLUSTER_NAME="${key}"
        shift
        ;;
    esac
  done
}
82+
# Quote "$@" so arguments containing spaces reach parse_args intact.
parse_args "$@"

# Check for AWS CLI
if ! command -v aws >/dev/null 2>&1; then
  echo "please install aws..." >&2
  echo "see https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html for the installation guide" >&2
  exit 1
fi

# Check for JQ
if ! command -v jq >/dev/null 2>&1; then
  printf 'please install jq...\nsudo yum install -y jq or brew install jq\n' >&2
  exit 1
fi

# Define cluster name
if [[ -z "${CLUSTER_NAME:-}" ]]; then
  echo "[WARNING] CLUSTER_NAME environment variable is not set, automatically set to ml-cluster"
  CLUSTER_NAME=ml-cluster
fi

# All generated files (env_vars, JSON configs, the git checkout) live in a
# per-cluster directory. This runs only after CLUSTER_NAME has its final
# value, and -p lets the script be re-run against an existing directory.
mkdir -p -- "${CLUSTER_NAME}" || exit 1
cd -- "${CLUSTER_NAME}" || exit 1
107+
# Define stack name
if [[ -z "${STACK_ID_VPC:-}" ]]; then
  echo "[WARNING] STACK_ID_VPC environment variable is not set, automatically set to sagemaker-hyperpod"
  STACK_ID_VPC=sagemaker-hyperpod
fi

# Define AWS Region
if [[ -z "${AWS_REGION:-}" ]]; then
  echo "[WARNING] AWS_REGION environment variable is not set, automatically set depending on aws cli default region."
  # Look the region up in the selected profile (aws_cli_args carries
  # --profile). `|| true` keeps `set -e` from aborting before the explicit
  # check below can print a useful message when no region is configured.
  AWS_REGION="$(aws configure get region "${aws_cli_args[@]}" || true)"
fi
if [[ -z "${AWS_REGION}" ]]; then
  echo "[ERROR] could not determine the AWS region; pass -r/--region or set AWS_REGION" >&2
  exit 1
fi
export AWS_REGION
echo "export AWS_REGION=${AWS_REGION}" >> env_vars
echo "[INFO] AWS_REGION = ${AWS_REGION}"

# Define the worker instance type, without the "ml." prefix (it is added when
# cluster-config.json is written).
if [[ -z "${INSTANCE:-}" ]]; then
  echo "[WARNING] INSTANCE environment variable is not set, automatically set to g5.12xlarge."
  INSTANCE=g5.12xlarge
fi
export INSTANCE
echo "export INSTANCE=${INSTANCE}" >> env_vars
echo "[INFO] INSTANCE = ${INSTANCE}"

# Define the worker instance count.
# -c/--instance-count has historically stored its value in INSTANCE_COUNTS;
# honour that spelling when INSTANCE_COUNT itself is not set.
if [[ -z "${INSTANCE_COUNT:-}" && -n "${INSTANCE_COUNTS:-}" ]]; then
  INSTANCE_COUNT="${INSTANCE_COUNTS}"
fi
if [[ -z "${INSTANCE_COUNT:-}" ]]; then
  echo "[WARNING] INSTANCE_COUNT environment variable is not set, automatically set to 2."
  INSTANCE_COUNT=2
fi
# The count is written unquoted into cluster-config.json, so it must be a
# plain non-negative integer.
if [[ ! "${INSTANCE_COUNT}" =~ ^[0-9]+$ ]]; then
  echo "[ERROR] INSTANCE_COUNT must be a non-negative integer, got '${INSTANCE_COUNT}'" >&2
  exit 1
fi
export INSTANCE_COUNT
echo "export INSTANCE_COUNT=${INSTANCE_COUNT}" >> env_vars
echo "[INFO] INSTANCE_COUNT = ${INSTANCE_COUNT}"
135+
#######################################
# Print the value of one output of the ${STACK_ID_VPC} CloudFormation stack.
# Globals:   STACK_ID_VPC, AWS_REGION, aws_cli_args (read)
# Arguments: $1 - CloudFormation OutputKey to look up
# Outputs:   the OutputValue on stdout (text output of the AWS CLI)
# Returns:   exit status of `aws cloudformation describe-stacks`
#######################################
get_stack_output() {
  local output_key="$1"
  # aws_cli_args carries --profile (and possibly --region). A repeated
  # --region is harmless because both carry the same AWS_REGION value.
  aws cloudformation describe-stacks \
    "${aws_cli_args[@]}" \
    --stack-name "${STACK_ID_VPC}" \
    --query "Stacks[0].Outputs[?OutputKey=='${output_key}'].OutputValue" \
    --region "${AWS_REGION}" \
    --output text
}

#######################################
# Export a variable, append it to ./env_vars and log it.
# Arguments: $1 - variable name, $2 - value
# Outputs:   an "[INFO]" line on stdout; one "export" line appended to env_vars
#######################################
record_env_var() {
  local var_name="$1" value="$2"
  export "${var_name}=${value}"
  echo "export ${var_name}=${value}" >> env_vars
  echo "[INFO] ${var_name} = ${value}"
}

#######################################
# Resolve a stack output into an exported variable, or stop the script.
# Arguments: $1 - variable name to export
#            $2 - CloudFormation OutputKey
#            $3 - human-readable label used in the error message
# Returns:   0 on success; exits 1 when the output is missing or the lookup
#            fails (this script is executed, so `return` at top level would
#            not be valid here).
#######################################
export_stack_output() {
  local var_name="$1" output_key="$2" label="$3" value
  # Swallow the CLI's exit status so a failed lookup reaches the error
  # message below instead of tripping `set -e` silently.
  value="$(get_stack_output "${output_key}")" || value=""
  # The CLI's text output renders a null result as the literal "None".
  if [[ -z "${value}" || "${value}" == "None" ]]; then
    echo "[ERROR] failed to retrieve ${label}" >&2
    exit 1
  fi
  record_env_var "${var_name}" "${value}"
}

# Retrieve VPC ID
export_stack_output VPC_ID VPC "VPC ID"

# Grab the private subnet id
export_stack_output SUBNET_ID PrimaryPrivateSubnet "SUBNET ID"

# Grab the public subnet id
export_stack_output PUBLIC_SUBNET_ID PublicSubnet "Public SUBNET ID"

# Get FSx Filesystem id from CloudFormation
export_stack_output FSX_ID FSxLustreFilesystemId "FSX ID"

# Get FSx Filesystem Mountname from CloudFormation
export_stack_output FSX_MOUNTNAME FSxLustreFilesystemMountname "FSX Mountname"

# Get FSx Security Group from CloudFormation
export_stack_output SECURITY_GROUP SecurityGroup "FSX Security Group"

# Get sagemaker role ARN
export_stack_output ROLE AmazonSagemakerClusterExecutionRoleArn "Role ARN"

# Get sagemaker role ROLENAME: the last path component of the role ARN
ROLENAME="$(basename "${ROLE}")"
if [[ -z "${ROLENAME}" ]]; then
  echo "[ERROR] failed to retrieve Role NAME" >&2
  exit 1
fi
record_env_var ROLENAME "${ROLENAME}"

# Get s3 bucket name
export_stack_output BUCKET AmazonS3BucketName "Bucket Name"
266+
267+
# Fetch the lifecycle scripts. Reuse an existing checkout so that re-running
# the script in the same cluster directory does not fail on `git clone`.
if [[ ! -d awsome-distributed-training ]]; then
  git clone --depth=1 https://github.com/aws-samples/awsome-distributed-training/
fi
# Use pushd and popd to navigate directories https://en.wikipedia.org/wiki/Pushd_and_popd
pushd awsome-distributed-training/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/
# upload data; the destination must be one word (s3://<bucket>/src), and
# aws_cli_args forwards the --profile/--region given on the command line
aws s3 cp --recursive base-config/ "s3://${BUCKET}/src" "${aws_cli_args[@]}"
# move back to the previous directory
popd
275+
# Write the provisioning parameters that are uploaded next to the lifecycle
# scripts. Every interpolated value is a JSON string and must be quoted;
# an unquoted partition name makes the document invalid JSON.
cat > provisioning_parameters.json << EOL
{
  "version": "1.0.0",
  "workload_manager": "slurm",
  "controller_group": "controller-machine",
  "worker_groups": [
    {
      "instance_group_name": "worker-group-1",
      "partition_name": "${INSTANCE}"
    }
  ],
  "fsx_dns_name": "${FSX_ID}.fsx.${AWS_REGION}.amazonaws.com",
  "fsx_mountname": "${FSX_MOUNTNAME}"
}
EOL

# copy to the S3 Bucket (aws_cli_args forwards --profile/--region)
aws s3 cp provisioning_parameters.json "s3://${BUCKET}/src/" "${aws_cli_args[@]}"
294+
# Write the request body for `aws sagemaker create-cluster`: one controller
# group and one worker group, both bootstrapped from s3://<bucket>/src.
# No whitespace may sit between an expansion and the surrounding JSON quotes:
# it would end up inside the cluster name, ARNs, URIs and IDs.
cat > cluster-config.json << EOL
{
  "ClusterName": "${CLUSTER_NAME}",
  "InstanceGroups": [
    {
      "InstanceGroupName": "controller-machine",
      "InstanceType": "ml.m5.12xlarge",
      "InstanceCount": 1,
      "LifeCycleConfig": {
        "SourceS3Uri": "s3://${BUCKET}/src",
        "OnCreate": "on_create.sh"
      },
      "ExecutionRole": "${ROLE}",
      "ThreadsPerCore": 1
    },
    {
      "InstanceGroupName": "worker-group-1",
      "InstanceType": "ml.${INSTANCE}",
      "InstanceCount": ${INSTANCE_COUNT},
      "LifeCycleConfig": {
        "SourceS3Uri": "s3://${BUCKET}/src",
        "OnCreate": "on_create.sh"
      },
      "ExecutionRole": "${ROLE}",
      "ThreadsPerCore": 1
    }
  ],
  "VpcConfig": {
    "SecurityGroupIds": ["${SECURITY_GROUP}"],
    "Subnets": ["${SUBNET_ID}"]
  }
}
EOL
328+
# Validate Cluster configuration
# -O overwrites an earlier download; plain wget would save the new copy as
# validate-config.py.1 and the stale script would be the one executed.
wget -O validate-config.py https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/main/1.architectures/5.sagemaker-hyperpod/validate-config.py
# install boto3
pip3 install boto3
# check config for known issues
python3 validate-config.py --cluster-config cluster-config.json --provisioning-parameters provisioning_parameters.json

# Build the command once so the echoed line and the executed command cannot
# drift apart. The region lives in AWS_REGION (no REGION variable is ever
# set), and aws_cli_args forwards --profile/--region from the command line.
create_cluster_cmd=(
  aws sagemaker create-cluster
  --cli-input-json file://cluster-config.json
  --region "${AWS_REGION}"
  "${aws_cli_args[@]}"
)
echo "${create_cluster_cmd[*]}"

# Dry run: everything up to this point has been prepared and validated;
# only the cluster creation itself is skipped.
if (( DRY_RUN == 1 )); then
  exit 0
fi
"${create_cluster_cmd[@]}"