@@ -1,4 +1,4 @@
- # This is a sample Python program that trains a simple scikit-learn model on the Iris dataset.
+ # This is a sample Python program that trains a simple scikit-learn model on the California Housing dataset.
# This implementation will work on your *local computer* or in the *AWS Cloud*.
#
# Prerequisites:
@@ -16,40 +16,54 @@
import os

from sagemaker.sklearn import SKLearn
+ import sagemaker
+ import boto3
from sklearn import datasets
+ from sklearn.model_selection import train_test_split
+ from sklearn.metrics import mean_squared_error

- DUMMY_IAM_ROLE = 'arn:aws:iam::111111111111:role/service-role/AmazonSageMaker-ExecutionRole-20200101T000001'
+ local_mode = True
+
+ if local_mode:
+     instance_type = "local"
+     IAM_ROLE = 'arn:aws:iam::111111111111:role/service-role/AmazonSageMaker-ExecutionRole-20200101T000001'
+ else:
+     instance_type = "ml.m5.xlarge"
+     IAM_ROLE = 'arn:aws:iam::<ACCOUNT>:role/service-role/AmazonSageMaker-ExecutionRole-XXX'
+
+ sess = sagemaker.Session()
+ bucket = sess.default_bucket()  # Use the SageMaker session's default S3 bucket
+ prefix = 'DEMO-local-and-managed-infrastructure'

def download_training_and_eval_data():
-     if os.path.isfile('./data/iris.csv'):
-         print('Training and dataset exist. Skipping Download')
-     else:
-         print('Downloading training dataset')
+     print('Downloading training dataset')

-     # Load Iris dataset, then join labels and features
-     iris = datasets.load_iris()
-     joined_iris = np.insert(iris.data, 0, iris.target, axis=1)
+     # Load California Housing dataset, then join labels and features
+     california = datasets.fetch_california_housing()
+     dataset = np.insert(california.data, 0, california.target, axis=1)
+     # Create directories for the train/validation/test splits
+     os.makedirs("./data/train", exist_ok=True)
+     os.makedirs("./data/validation", exist_ok=True)
+     os.makedirs("./data/test", exist_ok=True)

-     # Create directory and write csv
-     os.makedirs("./data", exist_ok=True)
-     np.savetxt("./data/iris.csv", joined_iris, delimiter=",", fmt="%1.1f, %1.3f, %1.3f, %1.3f, %1.3f")
+     train, other = train_test_split(dataset, test_size=0.3)
+     validation, test = train_test_split(other, test_size=0.5)

-     print('Downloading completed')
+     np.savetxt("./data/train/california_train.csv", train, delimiter=",")
+     np.savetxt("./data/validation/california_validation.csv", validation, delimiter=",")
+     np.savetxt("./data/test/california_test.csv", test, delimiter=",")
+
+     print('Downloading completed')

def do_inference_on_local_endpoint(predictor):
    print(f'\nStarting Inference on endpoint (local).')
-     shape = pd.read_csv("data/iris.csv", header=None)
-
-     a = [50 * i for i in range(3)]
-     b = [40 + i for i in range(10)]
-     indices = [i + j for i, j in itertools.product(a, b)]
-
-     test_data = shape.iloc[indices[:-1]]
+     test_data = pd.read_csv("data/test/california_test.csv", header=None)
    test_X = test_data.iloc[:, 1:]
    test_y = test_data.iloc[:, 0]
-     print("Predictions: {}".format(predictor.predict(test_X.values)))
+     predictions = predictor.predict(test_X.values)
+     print("Predictions: {}".format(predictions))
    print("Actual: {}".format(test_y.values))
-
+     print(f"RMSE: {mean_squared_error(test_y.values, predictions, squared=False)}")

def main():
    download_training_and_eval_data()
@@ -58,21 +72,35 @@ def main():
    print('Note: if launching for the first time in local mode, container image download might take a few minutes to complete.')

    sklearn = SKLearn(
-         entry_point="scikit_learn_iris.py",
+         entry_point="scikit_learn_california.py",
        source_dir='code',
        framework_version="1.0-1",
-         instance_type="local",
-         role=DUMMY_IAM_ROLE,
+         instance_type=instance_type,
+         role=IAM_ROLE,
        hyperparameters={"max_leaf_nodes": 30},
    )

-     train_input = "file://./data/iris.csv"
+     if local_mode:
+         train_input = "file://./data/train/california_train.csv"
+         validation_input = "file://./data/validation/california_validation.csv"
+     else:
+         # Upload data to S3
+         boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'data/train/california_train.csv')).upload_file('data/train/california_train.csv')
+         boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'data/validation/california_validation.csv')).upload_file('data/validation/california_validation.csv')
+         boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'data/test/california_test.csv')).upload_file('data/test/california_test.csv')

-     sklearn.fit({"train": train_input})
+         train_input = f"s3://{bucket}/{prefix}/data/train/california_train.csv"
+         validation_input = f"s3://{bucket}/{prefix}/data/validation/california_validation.csv"
+         test_input = f"s3://{bucket}/{prefix}/data/test/california_test.csv"
+
+     sklearn.fit({"train": train_input, "validation": validation_input})
    print('Completed model training')

-     print('Deploying endpoint in local mode')
-     predictor = sklearn.deploy(initial_instance_count=1, instance_type='local')
+     if local_mode:
+         print('Deploying endpoint in local mode')
+     else:
+         print(f"Deploying on the SageMaker managed infrastructure using a {instance_type} instance type")
+     predictor = sklearn.deploy(initial_instance_count=1, instance_type=instance_type)

    do_inference_on_local_endpoint(predictor)
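
The renamed training script `scikit_learn_california.py` lives in `source_dir='code'` and is not part of this diff. For reviewers who want the full picture, here is a minimal sketch of what such a script-mode entry point could look like; the `DecisionTreeRegressor` choice and the exact file names are assumptions inferred from the `max_leaf_nodes` hyperparameter and the CSV layout above, not the repository's actual code:

```python
# Hypothetical sketch of code/scikit_learn_california.py (not shown in this diff).
# Assumes the standard SageMaker script-mode contract: hyperparameters arrive as
# CLI arguments, data channels and the model directory arrive as SM_* env vars.
import argparse
import os

import joblib
import numpy as np
from sklearn.tree import DecisionTreeRegressor


def model_fn(model_dir):
    # Loaded by the SKLearn serving container when the endpoint starts.
    return joblib.load(os.path.join(model_dir, "model.joblib"))


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--max_leaf_nodes", type=int, default=30)
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    # The "train" and "validation" channels passed to fit() surface here.
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--validation", type=str, default=os.environ.get("SM_CHANNEL_VALIDATION"))
    args = parser.parse_args()

    # Column 0 holds the label, as written by np.insert in the launcher above.
    train = np.loadtxt(os.path.join(args.train, "california_train.csv"), delimiter=",")
    train_y, train_X = train[:, 0], train[:, 1:]

    model = DecisionTreeRegressor(max_leaf_nodes=args.max_leaf_nodes)
    model.fit(train_X, train_y)

    # Everything saved under model_dir is packaged into model.tar.gz by SageMaker.
    joblib.dump(model, os.path.join(args.model_dir, "model.joblib"))
```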
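
One more observation: neither branch tears the endpoint down after `do_inference_on_local_endpoint(predictor)` returns, and outside local mode a deployed endpoint keeps accruing charges. A one-line cleanup sketch, assuming the standard `sagemaker` Predictor API:

```python
# Hypothetical cleanup, not part of this diff: deletes the endpoint and,
# by default, its endpoint configuration once inference is done.
predictor.delete_endpoint()
```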