Skip to content

Commit 0ef9be6

Browse files
authored
Merge pull request #282 from aws-samples/smhp-easy-setup
add a script for easy cluster setup
2 parents 2960173 + ed3c0af commit 0ef9be6

File tree

1 file changed

+338
-0
lines changed

1 file changed

+338
-0
lines changed
Lines changed: 338 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,338 @@
1+
#!/bin/bash
2+
3+
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
4+
# SPDX-License-Identifier: MIT-0
5+
#
6+
# Permission is hereby granted, free of charge, to any person obtaining a copy of this
7+
# software and associated documentation files (the "Software"), to deal in the Software
8+
# without restriction, including without limitation the rights to use, copy, modify,
9+
# merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
10+
# permit persons to whom the Software is furnished to do so.
11+
#
12+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
13+
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
14+
# PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
15+
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
16+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
17+
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
18+
# -e: abort on the first failing command, -x: trace every command,
# -o pipefail: a pipeline fails as soon as any of its stages fails.
set -exo pipefail
# Name of the CloudFormation stack whose outputs are queried further down.
# A value already present in the environment wins; -s/--stack-id-vpc
# overrides both.
: "${STACK_ID_VPC:=sagemaker-hyperpod}"

# Usage tokens printed by -h/--help. Keep this list in sync with the options
# handled in parse_args.
declare -a HELP=(
    "[-h|--help]"
    "[-r|--region]"
    "[-p|--profile]"
    "[-s|--stack-id-vpc]"
    "[-i|--instance-type]"
    "[-c|--instance-count]"
    "[-a|--availability-zone]"
    "[-d|--dry-run]"
    "CLUSTER_NAME"
)
# --region / --profile arguments collected by parse_args.
declare -a aws_cli_args=()
# Set to 1 by -d/--dry-run: the script then stops right before create-cluster.
DRY_RUN=0
33+
34+
35+
#######################################
# Parse the command line into the globals used by the rest of the script.
# Globals (read):    HELP
# Globals (written): aws_cli_args, AWS_REGION, AWS_PROFILE (exported),
#                    STACK_ID_VPC, INSTANCE, INSTANCE_COUNT, INSTANCE_COUNTS,
#                    AZ, DRY_RUN, CLUSTER_NAME
# Arguments: the script's command-line arguments
# Outputs:   usage text on stdout for -h/--help, diagnostics on stderr
# Returns:   exits 0 after printing the usage text; exits 2 when an option is
#            unknown or is missing its value
#######################################
parse_args() {
    local key
    while [[ $# -gt 0 ]]; do
        key="$1"

        # Options that consume a value: refuse to read past the end of the
        # argument list instead of failing inside `shift 2`.
        case "$key" in
            -r|--region|-p|--profile|-s|--stack-id-vpc|-i|--instance-type|-c|--instance-count|-a|--availability-zone)
                if [[ $# -lt 2 ]]; then
                    echo "[ERROR] option ${key} requires a value" >&2
                    exit 2
                fi
                ;;
        esac

        case "$key" in
            -h|--help)
                echo "Create a HyperPod Cluster with single partition."
                echo "It requires sagemaker-hyperpod CloudFormation stack to be deployed."
                echo "Usage: $(basename "${BASH_SOURCE[0]}") ${HELP[*]}"
                exit 0
                ;;
            -r|--region)
                aws_cli_args+=(--region "$2")
                AWS_REGION="$2"
                shift 2
                ;;
            -p|--profile)
                aws_cli_args+=(--profile "$2")
                # The aws invocations in this script do not pass aws_cli_args,
                # so publish the profile through the environment variable the
                # AWS CLI reads; otherwise --profile would have no effect.
                export AWS_PROFILE="$2"
                shift 2
                ;;
            -s|--stack-id-vpc)
                STACK_ID_VPC="$2"
                shift 2
                ;;
            -i|--instance-type)
                INSTANCE="$2"
                shift 2
                ;;
            -c|--instance-count)
                # INSTANCE_COUNT is the name the rest of the script reads;
                # INSTANCE_COUNTS is kept for anything relying on the old name.
                INSTANCE_COUNT="$2"
                INSTANCE_COUNTS="$2"
                shift 2
                ;;
            -a|--availability-zone)
                AZ="$2"
                shift 2
                ;;
            -d|--dry-run)
                DRY_RUN=1
                shift
                ;;
            -*)
                # A misspelled flag must not silently become the cluster name.
                echo "[ERROR] unknown option: ${key}" >&2
                exit 2
                ;;
            *)
                CLUSTER_NAME="$key"
                shift
                ;;
        esac
    done
}
82+
83+
# Quote "$@" so that arguments containing whitespace reach parse_args intact.
parse_args "$@"

# Check for AWS CLI
if ! command -v aws &> /dev/null; then
    echo -e "please install aws..."
    echo -e "see https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html for the installation guide"
    exit 1
fi

# Check for JQ
if ! command -v jq &> /dev/null; then
    echo -e "please install jq...\nsudo yum install -y jq or brew install jq"
    exit 1
fi

# Define cluster name
# The default has to be settled before the working directory is created,
# otherwise an empty name leaves the script running in the caller's directory.
if [[ -z "${CLUSTER_NAME:-}" ]]; then
    echo "[WARNING] CLUSTER_NAME environment variable is not set, automatically set to ml-cluster"
    CLUSTER_NAME=ml-cluster
fi

# Everything written below (env_vars, the JSON configs, the git checkout)
# uses relative paths, so work inside a per-cluster directory.
# mkdir and cd are separate statements on purpose: in `mkdir X && cd X` a
# failing mkdir is exempt from `set -e` and the script would carry on in the
# wrong directory.
mkdir -p -- "${CLUSTER_NAME}"
cd -- "${CLUSTER_NAME}"
107+
108+
# Define stack name
if [[ -z "${STACK_ID_VPC:-}" ]]; then
    echo "[WARNING] STACK_ID_VPC environment variable is not set, automatically set to sagemaker-hyperpod"
    STACK_ID_VPC=sagemaker-hyperpod
fi

# Define AWS Region
if [[ -z "${AWS_REGION:-}" ]]; then
    echo "[WARNING] AWS_REGION environment variable is not set, automatically set depending on aws cli default region."
    # Assign first and export afterwards: `export VAR=$(cmd)` reports the
    # status of export and would hide a failing lookup.
    AWS_REGION=$(aws configure get region) || AWS_REGION=""
    if [[ -z "${AWS_REGION}" ]]; then
        echo "[ERROR] no default region configured for the aws cli, pass one with -r|--region" >&2
        exit 1
    fi
fi
export AWS_REGION
echo "export AWS_REGION=${AWS_REGION}" >> env_vars
echo "[INFO] AWS_REGION = ${AWS_REGION}"

# Define the worker instance type, without the "ml." prefix (the cluster
# configuration adds it).
if [[ -z "${INSTANCE:-}" ]]; then
    echo "[WARNING] INSTANCE environment variable is not set, automatically set to g5.12xlarge."
    INSTANCE=g5.12xlarge
fi
export INSTANCE
echo "export INSTANCE=${INSTANCE}" >> env_vars
echo "[INFO] INSTANCE = ${INSTANCE}"

# Define the worker instance count.
# INSTANCE_COUNTS is honoured as a fallback because the warning text used to
# advertise that spelling.
if [[ -z "${INSTANCE_COUNT:-}" ]]; then
    INSTANCE_COUNT="${INSTANCE_COUNTS:-}"
fi
if [[ -z "${INSTANCE_COUNT}" ]]; then
    echo "[WARNING] INSTANCE_COUNT environment variable is not set, automatically set to 2."
    INSTANCE_COUNT=2
fi
# The value is written unquoted into cluster-config.json, so it must be a
# plain positive integer.
if ! [[ "${INSTANCE_COUNT}" =~ ^[1-9][0-9]*$ ]]; then
    echo "[ERROR] INSTANCE_COUNT must be a positive integer, got '${INSTANCE_COUNT}'" >&2
    exit 1
fi
export INSTANCE_COUNT
echo "export INSTANCE_COUNT=${INSTANCE_COUNT}" >> env_vars
echo "[INFO] INSTANCE_COUNT = ${INSTANCE_COUNT}"
135+
136+
#######################################
# Look up one output of the ${STACK_ID_VPC} CloudFormation stack, export it
# under the given variable name, append an `export` line to ./env_vars and
# log the value.
# Globals:   STACK_ID_VPC, AWS_REGION (read); the variable named by $1 (written)
# Arguments: $1 - name of the shell variable to export
#            $2 - CloudFormation OutputKey to look up
#            $3 - label used in the error message
# Outputs:   "[INFO] NAME = value" on stdout, errors on stderr
# Returns:   exits the script with status 1 when the lookup fails or the
#            output is empty
#######################################
export_stack_output() {
    local var_name=$1 output_key=$2 label=$3 value
    # JMESPath expects the literal wrapped in backticks, hence the escaped \`.
    value=$(aws cloudformation describe-stacks \
        --stack-name "${STACK_ID_VPC}" \
        --query "Stacks[0].Outputs[?OutputKey==\`${output_key}\`].OutputValue" \
        --region "${AWS_REGION}" \
        --output text) || value=""

    if [[ -z "${value}" ]]; then
        echo "[ERROR] failed to retrieve ${label}" >&2
        # exit, not return: this script is executed, not sourced.
        exit 1
    fi

    export "${var_name}=${value}"
    echo "export ${var_name}=${value}" >> env_vars
    echo "[INFO] ${var_name} = ${value}"
}

# Retrieve VPC ID
export_stack_output VPC_ID VPC "VPC ID"

# Grab the private subnet id
export_stack_output SUBNET_ID PrimaryPrivateSubnet "SUBNET ID"

# Grab the public subnet id
export_stack_output PUBLIC_SUBNET_ID PublicSubnet "Public SUBNET ID"

# Get FSx Filesystem id from CloudFormation
export_stack_output FSX_ID FSxLustreFilesystemId "FSX ID"

# Get FSx Filesystem Mountname from CloudFormation
export_stack_output FSX_MOUNTNAME FSxLustreFilesystemMountname "FSX Mountname"

# Get FSx Security Group from CloudFormation
export_stack_output SECURITY_GROUP SecurityGroup "FSX Security Group"

# Get sagemaker role ARN
export_stack_output ROLE AmazonSagemakerClusterExecutionRoleArn "Role ARN"

# Get sagemaker role ROLENAME: whatever follows the last "/" of the ARN.
ROLENAME=${ROLE##*/}
if [[ -z "${ROLENAME}" ]]; then
    echo "[ERROR] failed to retrieve Role NAME" >&2
    exit 1
fi
export ROLENAME
echo "export ROLENAME=${ROLENAME}" >> env_vars
echo "[INFO] ROLENAME = ${ROLENAME}"

# Get s3 bucket name
export_stack_output BUCKET AmazonS3BucketName "Bucket Name"
266+
267+
268+
# Fetch the repository that ships the lifecycle scripts. An existing checkout
# (left behind by a previous run in this directory) is reused, because
# `git clone` refuses to write into a non-empty directory and would abort the
# whole script under `set -e`.
if [[ ! -d awsome-distributed-training ]]; then
    git clone --depth=1 https://github.com/aws-samples/awsome-distributed-training/
fi
# Use pushd and popd to navigate directories https://en.wikipedia.org/wiki/Pushd_and_popd
pushd awsome-distributed-training/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/
# upload data
aws s3 cp --recursive base-config/ "s3://${BUCKET}/src"
# move back to the previous directory
popd
275+
276+
# Write provisioning_parameters.json. The unquoted heredoc delimiter lets the
# shell substitute the ${...} references. Every substituted value is a JSON
# string and therefore has to sit inside double quotes, partition_name
# included.
cat > provisioning_parameters.json << EOL
{
  "version": "1.0.0",
  "workload_manager": "slurm",
  "controller_group": "controller-machine",
  "worker_groups": [
    {
      "instance_group_name": "worker-group-1",
      "partition_name": "${INSTANCE}"
    }
  ],
  "fsx_dns_name": "${FSX_ID}.fsx.${AWS_REGION}.amazonaws.com",
  "fsx_mountname": "${FSX_MOUNTNAME}"
}
EOL

# copy to the S3 Bucket
aws s3 cp provisioning_parameters.json "s3://${BUCKET}/src/"
294+
295+
# Render cluster-config.json, the request body later handed to
# `aws sagemaker create-cluster`. It declares two instance groups, the fixed
# controller and the worker group sized by INSTANCE / INSTANCE_COUNT, plus the
# VPC settings. The heredoc delimiter is unquoted so the shell fills in the
# ${...} references.
{
    cat << EOL
{
"ClusterName": "${CLUSTER_NAME}",
"InstanceGroups": [
{
"InstanceGroupName": "controller-machine",
"InstanceType": "ml.m5.12xlarge",
"InstanceCount": 1,
"LifeCycleConfig": {
"SourceS3Uri": "s3://${BUCKET}/src",
"OnCreate": "on_create.sh"
},
"ExecutionRole": "${ROLE}",
"ThreadsPerCore": 1
},
{
"InstanceGroupName": "worker-group-1",
"InstanceType": "ml.${INSTANCE}",
"InstanceCount": ${INSTANCE_COUNT},
"LifeCycleConfig": {
"SourceS3Uri": "s3://${BUCKET}/src",
"OnCreate": "on_create.sh"
},
"ExecutionRole": "${ROLE}",
"ThreadsPerCore": 1
}
],
"VpcConfig": {
"SecurityGroupIds": ["${SECURITY_GROUP}"],
"Subnets":["${SUBNET_ID}"]
}
}
EOL
} > cluster-config.json
328+
329+
# Validate Cluster configuration
# -O pins the file name: without it a re-run saves validate-config.py.1 and
# the stale copy from the previous run is the one that gets executed.
wget -O validate-config.py https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/main/1.architectures/5.sagemaker-hyperpod/validate-config.py
# install boto3
pip3 install boto3
# check config for known issues
python3 validate-config.py --cluster-config cluster-config.json --provisioning-parameters provisioning_parameters.json

# Show the command that creates the cluster. AWS_REGION is the region
# variable used by the rest of the script; REGION is never assigned.
echo "aws sagemaker create-cluster --cli-input-json file://cluster-config.json --region ${AWS_REGION}"
# With -d/--dry-run, stop after generating and validating the configuration.
if (( DRY_RUN == 1 )); then
    exit 0
fi
aws sagemaker create-cluster --cli-input-json "file://cluster-config.json" --region "${AWS_REGION}"

0 commit comments

Comments
 (0)