#!/bin/bash
# This script will run a toy Spark example on Amazon EMR as described here:
# https://aws.amazon.com/blogs/aws/new-apache-spark-on-amazon-emr/
# The sample data sits on S3 in us-east-1 so run this script there.
# what to do if a step fails: TERMINATE_CLUSTER, CANCEL_AND_WAIT
STEP_FAILURE_ACTION=TERMINATE_CLUSTER
# comment out to disable termination protection
TERMINATION_PROTECTED=--termination-protected
# uncomment to auto-terminate the cluster once all steps finish
#AUTO_TERMINATE=--auto-terminate
# number of workers
WORKERS=4
# subnet id
SUBNET_ID=subnet-6713b04d
# master & slave security group id
MASTER_SG_ID=sg-e571c99d
SLAVE_SG_ID=sg-6d73cb15
# other security groups
JPL_SG_ID=sg-0c76ce74
DEV_TEAM_SG_ID=sg-fb26a283
# S3 URI for logs
LOG_URI=s3n://aws-logs-052078438257-us-east-1/elasticmapreduce/
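# Optional sanity check (assumes the caller has ec2:DescribeSubnets and
# ec2:DescribeSecurityGroups permissions); uncomment to verify the IDs above
# resolve in this account/region before launching:
#aws ec2 describe-subnets --subnet-ids $SUBNET_ID --region us-east-1 > /dev/null
#aws ec2 describe-security-groups --group-ids $MASTER_SG_ID $SLAVE_SG_ID $JPL_SG_ID $DEV_TEAM_SG_ID --region us-east-1 > /dev/null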
# create cluster
json=$(aws emr create-cluster $TERMINATION_PROTECTED $AUTO_TERMINATE \
--applications Name=Hadoop Name=Hive Name=Spark Name=Zeppelin-Sandbox \
--bootstrap-actions '[{"Path":"s3://scispark-bootstrap-scripts/emr-bootstrap-no-jplsec.sh",
"Name":"Custom action"}]' \
--ec2-attributes "{\"KeyName\":\"scispark\",
\"InstanceProfile\":\"EMR_EC2_DefaultRole\",
\"SubnetId\":\"$SUBNET_ID\",
\"EmrManagedMasterSecurityGroup\":\"$MASTER_SG_ID\",
\"EmrManagedSlaveSecurityGroup\":\"$SLAVE_SG_ID\",
\"AdditionalMasterSecurityGroups\":[\"$JPL_SG_ID\", \"$DEV_TEAM_SG_ID\"],
\"AdditionalSlaveSecurityGroups\":[\"$JPL_SG_ID\", \"$DEV_TEAM_SG_ID\"]}" \
--service-role EMR_DefaultRole \
--enable-debugging \
--release-label emr-4.4.0 \
--log-uri $LOG_URI \
--steps "[{\"Args\":[\"/usr/bin/hdfs\",\"dfs\",\"-get\",
\"s3://scispark-test-code/SciSparkTestExperiments.jar\",
\"/mnt/\"],
\"Type\":\"CUSTOM_JAR\",
\"ActionOnFailure\":\"$STEP_FAILURE_ACTION\",
\"Jar\":\"s3://elasticmapreduce/libs/script-runner/script-runner.jar\",
\"Properties\":\"\",
\"Name\":\"Custom JAR\"},
{\"Name\":\"S3DistCp step\",
\"Args\":[\"s3-dist-cp\",\"--s3Endpoint=s3.amazonaws.com\",
\"--src=s3://scispark-test-data/48hrs/\",
\"--dest=hdfs:///mnt/48hrs\"],
\"ActionOnFailure\":\"CONTINUE\",
\"Type\":\"CUSTOM_JAR\",
\"Jar\":\"command-runner.jar\"
},
{\"Args\":[\"spark-submit\",
\"--master\", \"yarn\",
\"--deploy-mode\", \"client\",
\"--class\", \"org.dia.algorithms.mcc.MainNetcdfDFSMCC\",
\"/mnt/SciSparkTestExperiments.jar\",
\"yarn-client\",
\"2\",
\"20\",
\"ch4\",
\"/mnt/48hrs\"],
\"Type\":\"CUSTOM_JAR\",
\"ActionOnFailure\":\"$STEP_FAILURE_ACTION\",
\"Jar\":\"command-runner.jar\",
\"Properties\":\"\",
\"Name\":\"Spark application\"}]" \
--name 'My SciSpark cluster - SciSparkTestExperiments' \
--instance-groups "[{\"InstanceCount\":1,
\"BidPrice\":\".266\",
\"InstanceGroupType\":\"MASTER\",
\"InstanceType\":\"m3.xlarge\",
\"Name\":\"Master instance group - 1\"},
{\"InstanceCount\":$WORKERS,
\"BidPrice\":\".266\",
\"InstanceGroupType\":\"CORE\",
\"InstanceType\":\"m3.xlarge\",
\"Name\":\"Core instance group - $WORKERS\"}]" \
--region us-east-1
)
# check for errors; if none, extract the cluster id
STATUS=$?
if [ $STATUS -ne 0 ]; then
echo "Failed to create cluster." 1>&2
exit $STATUS
fi
cluster_id=$(echo "$json" | grep '"ClusterId":' | cut -d'"' -f4)
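# Alternative parse (sketch; assumes jq is installed on the machine running
# this script):
#cluster_id=$(echo "$json" | jq -r .ClusterId)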
# wait for it to run
date
echo -n "Waiting for cluster $cluster_id to run ... "
aws emr wait cluster-running --cluster-id $cluster_id
echo "done."
date
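# Once the cluster is running, look up the master node's DNS name for
# ssh/monitoring (sketch; assumes the standard describe-cluster output, which
# reports Cluster.MasterPublicDnsName once the master is up):
master_dns=$(aws emr describe-cluster --cluster-id $cluster_id \
  --query Cluster.MasterPublicDnsName --output text --region us-east-1)
echo "Master node: $master_dns"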