From 7387dc5f946c723aeea1d1e8b36b7734bc5bfec7 Mon Sep 17 00:00:00 2001 From: Sai Parthasarathy Miduthuri Date: Tue, 7 Dec 2021 07:40:30 -0800 Subject: [PATCH 1/6] fix: Use default model name when model name is None --- docker/build_artifacts/sagemaker/tfs_utils.py | 2 +- test/integration/sagemaker/test_tfs.py | 76 +++++++++++++++++ test/integration/sagemaker/timeout.py | 83 +++++++++++++++++++ .../models/resnet50_v1/code/inference.py | 44 ++++++++++ .../models/resnet50_v1/code/requirements.txt | 3 + tox.ini | 3 + 6 files changed, 210 insertions(+), 1 deletion(-) create mode 100644 test/integration/sagemaker/timeout.py create mode 100644 test/resources/models/resnet50_v1/code/inference.py create mode 100644 test/resources/models/resnet50_v1/code/requirements.txt diff --git a/docker/build_artifacts/sagemaker/tfs_utils.py b/docker/build_artifacts/sagemaker/tfs_utils.py index d2d9f07a..6cfb48d6 100644 --- a/docker/build_artifacts/sagemaker/tfs_utils.py +++ b/docker/build_artifacts/sagemaker/tfs_utils.py @@ -41,7 +41,7 @@ def parse_request(req, rest_port, grpc_port, default_model_name, model_name=None tfs_uri = make_tfs_uri(rest_port, tfs_attributes, default_model_name, model_name) if not model_name: - model_name = tfs_attributes.get("tfs-model-name") + model_name = tfs_attributes.get("tfs-model-name") or default_model_name context = Context(model_name, tfs_attributes.get("tfs-model-version"), diff --git a/test/integration/sagemaker/test_tfs.py b/test/integration/sagemaker/test_tfs.py index f73ce35c..07544dc0 100644 --- a/test/integration/sagemaker/test_tfs.py +++ b/test/integration/sagemaker/test_tfs.py @@ -10,15 +10,29 @@ # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific # language governing permissions and limitations under the License. +import json import os +import tarfile +import boto3 import pytest +import sagemaker +import urllib.request import util +from packaging.version import Version + +from sagemaker.tensorflow import TensorFlowModel +from sagemaker.utils import name_from_base + +from timeout import timeout_and_delete_endpoint + NON_P3_REGIONS = ["ap-southeast-1", "ap-southeast-2", "ap-south-1", "ca-central-1", "eu-central-1", "eu-west-2", "us-west-1"] +RESOURCES_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "resources")) + @pytest.fixture(params=os.environ["TEST_VERSIONS"].split(",")) def version(request): @@ -75,6 +89,28 @@ def python_model_with_lib(region, boto_session): "test/data/python-with-lib.tar.gz") +@pytest.fixture(scope="session") +def resnet_model_tar_path(): + model_path = os.path.join(RESOURCES_PATH, "models", "resnet50_v1") + model_tar_path = os.path.join(model_path, "model.tar.gz") + if os.path.exists(model_tar_path): + os.remove(model_tar_path) + s3_resource = boto3.resource("s3") + models_bucket = s3_resource.Bucket("aws-dlc-sample-models") + model_s3_location = "tensorflow/resnet50_v1/model" + for obj in models_bucket.objects.filter(Prefix=model_s3_location): + local_file = os.path.join(model_path, "model", os.path.relpath(obj.key, model_s3_location)) + if not os.path.isdir(os.path.dirname(local_file)): + os.makedirs(os.path.dirname(local_file)) + if obj.key.endswith("/"): + continue + models_bucket.download_file(obj.key, local_file) + with tarfile.open(model_tar_path, "w:gz") as model_tar: + model_tar.add(os.path.join(model_path, "code"), arcname="code") + model_tar.add(os.path.join(model_path, "model"), arcname="model") + return model_tar_path + + def test_tfs_model(boto_session, sagemaker_client, sagemaker_runtime_client, model_name, tfs_model, image_uri, instance_type, accelerator_type): @@ -135,3 +171,43 @@ def test_python_model_with_lib(boto_session, sagemaker_client, # python service adds this to tfs response assert output_data["python"] is True assert output_data["dummy_module"] == "0.1" + + +def test_resnet_with_inference_handler( + boto_session, image_uri, instance_type, resnet_model_tar_path, framework_version +): + if Version(framework_version) >= Version("2.6"): + pytest.skip( + "The inference script based test currently uses v1 compat features, making it incompatible with TF>=2.6" + ) + sagemaker_session = sagemaker.Session(boto_session=boto_session) + model_data = sagemaker_session.upload_data( + path=resnet_model_tar_path, key_prefix=os.path.join("tensorflow-inference", "resnet") + ) + endpoint_name = name_from_base("tensorflow-inference") + + tensorflow_model = TensorFlowModel( + model_data=model_data, + role="SageMakerRole", + entry_point="inference.py", + image_uri=image_uri, + sagemaker_session=sagemaker_session, + ) + + with timeout_and_delete_endpoint(endpoint_name, sagemaker_session, minutes=30): + tensorflow_predictor = tensorflow_model.deploy( + initial_instance_count=1, instance_type=instance_type, endpoint_name=endpoint_name + ) + kitten_url = "https://s3.amazonaws.com/model-server/inputs/kitten.jpg" + kitten_local_path = "kitten.jpg" + urllib.request.urlretrieve(kitten_url, kitten_local_path) + with open(kitten_local_path, "rb") as f: + kitten_image = f.read() + + runtime_client = sagemaker_session.sagemaker_runtime_client + response = runtime_client.invoke_endpoint( + EndpointName=endpoint_name, ContentType='application/x-image', Body=kitten_image + ) + result = json.loads(response['Body'].read().decode('ascii')) + + assert len(result["outputs"]["probabilities"]["floatVal"]) == 3 diff --git a/test/integration/sagemaker/timeout.py b/test/integration/sagemaker/timeout.py new file mode 100644 index 00000000..b9f032ac --- /dev/null +++ b/test/integration/sagemaker/timeout.py @@ -0,0 +1,83 @@ +# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +# TODO: this is used in all containers and sdk. We should move it to container support or sdk test utils. +from __future__ import absolute_import +import signal +from contextlib import contextmanager +import logging + +from botocore.exceptions import ClientError + +LOGGER = logging.getLogger('timeout') + + +class TimeoutError(Exception): + pass + + +@contextmanager +def timeout(seconds=0, minutes=0, hours=0): + """Add a signal-based timeout to any block of code. + If multiple time units are specified, they will be added together to determine time limit. + Usage: + with timeout(seconds=5): + my_slow_function(...) + Args: + - seconds: The time limit, in seconds. + - minutes: The time limit, in minutes. + - hours: The time limit, in hours. + """ + + limit = seconds + 60 * minutes + 3600 * hours + + def handler(signum, frame): + raise TimeoutError('timed out after {} seconds'.format(limit)) + + try: + signal.signal(signal.SIGALRM, handler) + signal.alarm(limit) + + yield + finally: + signal.alarm(0) + + +@contextmanager +def timeout_and_delete_endpoint(endpoint_name, sagemaker_session, + seconds=0, minutes=0, hours=0): + with timeout(seconds=seconds, minutes=minutes, hours=hours) as t: + try: + yield [t] + finally: + try: + sagemaker_session.delete_endpoint(endpoint_name) + LOGGER.info("deleted endpoint {}".format(endpoint_name)) + except ClientError as ce: + if ce.response['Error']['Code'] == 'ValidationException': + # avoids the inner exception to be overwritten + pass + + +@contextmanager +def timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, seconds=0, minutes=0, hours=0): + with timeout(seconds=seconds, minutes=minutes, hours=hours) as t: + try: + yield [t] + finally: + try: + sagemaker_session.delete_endpoint(endpoint_name) + LOGGER.info('deleted endpoint {}'.format(endpoint_name)) + except ClientError as ce: + if ce.response['Error']['Code'] == 'ValidationException': + # avoids the inner exception to be overwritten + pass diff --git a/test/resources/models/resnet50_v1/code/inference.py b/test/resources/models/resnet50_v1/code/inference.py new file mode 100644 index 00000000..ddd80ccb --- /dev/null +++ b/test/resources/models/resnet50_v1/code/inference.py @@ -0,0 +1,44 @@ +import io + +import grpc +import gzip +import numpy as np +import tensorflow as tf + +from google.protobuf.json_format import MessageToJson +from PIL import Image +from tensorflow_serving.apis import predict_pb2, prediction_service_pb2_grpc + +prediction_services = {} +compression_algo = gzip + + +def handler(data, context): + f = data.read() + f = io.BytesIO(f) + image = Image.open(f).convert('RGB') + batch_size = 1 + image = np.asarray(image.resize((224, 224))) + image = np.concatenate([image[np.newaxis, :, :]] * batch_size) + + request = predict_pb2.PredictRequest() + request.model_spec.name = context.model_name + request.model_spec.signature_name = 'serving_default' + request.inputs['images'].CopyFrom( + tf.compat.v1.make_tensor_proto(image, shape=image.shape, dtype=tf.float32)) + + # Call Predict gRPC service + result = get_prediction_service(context).Predict(request, 60.0) + print("Returning the response for grpc port: {}".format(context.grpc_port)) + + # Return response + json_obj = MessageToJson(result) + return json_obj, "application/json" + + +def get_prediction_service(context): + # global prediction_service + if context.grpc_port not in prediction_services: + channel = grpc.insecure_channel("localhost:{}".format(context.grpc_port)) + prediction_services[context.grpc_port] = prediction_service_pb2_grpc.PredictionServiceStub(channel) + return prediction_services[context.grpc_port] diff --git a/test/resources/models/resnet50_v1/code/requirements.txt b/test/resources/models/resnet50_v1/code/requirements.txt new file mode 100644 index 00000000..29b61b11 --- /dev/null +++ b/test/resources/models/resnet50_v1/code/requirements.txt @@ -0,0 +1,3 @@ +numpy +Pillow +tensorflow==1.15.5 diff --git a/tox.ini b/tox.ini index 7be28d28..409c4171 100644 --- a/tox.ini +++ b/tox.ini @@ -60,7 +60,10 @@ deps = pytest pytest-xdist boto3 + packaging requests + sagemaker + urllib3 [testenv:flake8] deps = From c3b0c5636f2027271e36b838a3de872d67b2863a Mon Sep 17 00:00:00 2001 From: Sai Parthasarathy Miduthuri Date: Tue, 7 Dec 2021 08:08:52 -0800 Subject: [PATCH 2/6] Try to fix linting test errors --- test/integration/sagemaker/test_tfs.py | 5 ++--- tox.ini | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/test/integration/sagemaker/test_tfs.py b/test/integration/sagemaker/test_tfs.py index 07544dc0..482a8cfa 100644 --- a/test/integration/sagemaker/test_tfs.py +++ b/test/integration/sagemaker/test_tfs.py @@ -177,9 +177,8 @@ def test_resnet_with_inference_handler( boto_session, image_uri, instance_type, resnet_model_tar_path, framework_version ): if Version(framework_version) >= Version("2.6"): - pytest.skip( - "The inference script based test currently uses v1 compat features, making it incompatible with TF>=2.6" - ) + pytest.skip("The inference script currently uses v1 compat features, making it incompatible with TF>=2.6") + sagemaker_session = sagemaker.Session(boto_session=boto_session) model_data = sagemaker_session.upload_data( path=resnet_model_tar_path, key_prefix=os.path.join("tensorflow-inference", "resnet") diff --git a/tox.ini b/tox.ini index 409c4171..f0d497a2 100644 --- a/tox.ini +++ b/tox.ini @@ -6,7 +6,7 @@ [tox] skipsdist = True skip_missing_interpreters = False -envlist = jshint,flake8,pylint,py36,py37 +envlist = jshint,flake8,pylint,py36 [flake8] max-line-length = 100 From b35ce3f820cee6a28a242945661ac6393b9dad5f Mon Sep 17 00:00:00 2001 From: Sai Parthasarathy Miduthuri Date: Tue, 7 Dec 2021 08:34:56 -0800 Subject: [PATCH 3/6] Correct encoding in file read/writes --- docker/build_artifacts/sagemaker/multi_model_utils.py | 2 +- docker/build_artifacts/sagemaker/python_service.py | 2 +- docker/build_artifacts/sagemaker/serve.py | 6 +++--- docker/build_artifacts/sagemaker/tfs_utils.py | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docker/build_artifacts/sagemaker/multi_model_utils.py b/docker/build_artifacts/sagemaker/multi_model_utils.py index 5d2c47f4..dc3d19b3 100644 --- a/docker/build_artifacts/sagemaker/multi_model_utils.py +++ b/docker/build_artifacts/sagemaker/multi_model_utils.py @@ -21,7 +21,7 @@ @contextmanager def lock(path=DEFAULT_LOCK_FILE): - f = open(path, "w") + f = open(path, "w", encoding="utf-8") fd = f.fileno() fcntl.lockf(fd, fcntl.LOCK_EX) diff --git a/docker/build_artifacts/sagemaker/python_service.py b/docker/build_artifacts/sagemaker/python_service.py index e294e5fc..23af0b2c 100644 --- a/docker/build_artifacts/sagemaker/python_service.py +++ b/docker/build_artifacts/sagemaker/python_service.py @@ -150,7 +150,7 @@ def _handle_load_model_post(self, res, data): # noqa: C901 tfs_config_file = "/sagemaker/tfs-config/{}/model-config.cfg".format(model_name) log.info("tensorflow serving model config: \n%s\n", tfs_config) os.makedirs(os.path.dirname(tfs_config_file)) - with open(tfs_config_file, "w") as f: + with open(tfs_config_file, "w", encoding="utf-8") as f: f.write(tfs_config) batching_config_file = "/sagemaker/batching/{}/batching-config.cfg".format( diff --git a/docker/build_artifacts/sagemaker/serve.py b/docker/build_artifacts/sagemaker/serve.py index f8b87614..ab1062e1 100644 --- a/docker/build_artifacts/sagemaker/serve.py +++ b/docker/build_artifacts/sagemaker/serve.py @@ -159,7 +159,7 @@ def _create_tfs_config(self): log.info("tensorflow serving model config: \n%s\n", config) - with open(self._tfs_config_path, "w") as f: + with open(self._tfs_config_path, "w", encoding="utf-8") as f: f.write(config) def _setup_gunicorn(self): @@ -258,11 +258,11 @@ def _create_nginx_config(self): config = pattern.sub(lambda x: template_values[x.group(1)], template) log.info("nginx config: \n%s\n", config) - with open("/sagemaker/nginx.conf", "w") as f: + with open("/sagemaker/nginx.conf", "w", encoding="utf-8") as f: f.write(config) def _read_nginx_template(self): - with open("/sagemaker/nginx.conf.template", "r") as f: + with open("/sagemaker/nginx.conf.template", "r", encoding="utf-8") as f: template = f.read() if not template: raise ValueError("failed to read nginx.conf.template") diff --git a/docker/build_artifacts/sagemaker/tfs_utils.py b/docker/build_artifacts/sagemaker/tfs_utils.py index 6cfb48d6..991b3999 100644 --- a/docker/build_artifacts/sagemaker/tfs_utils.py +++ b/docker/build_artifacts/sagemaker/tfs_utils.py @@ -220,7 +220,7 @@ def __init__(self, key, env_var, value, defaulted_message): config += "%s { value: %s }\n" % (batching_parameter.key, batching_parameter.value) log.info("batching config: \n%s\n", config) - with open(batching_config_file, "w") as f: + with open(batching_config_file, "w", encoding="utf-8") as f: f.write(config) From 7bdfd11cd7eae00b35b20564407f1daa3c875869 Mon Sep 17 00:00:00 2001 From: Sai Parthasarathy Miduthuri Date: Tue, 7 Dec 2021 09:59:39 -0800 Subject: [PATCH 4/6] Fix test fixture usage --- test/integration/sagemaker/test_tfs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/integration/sagemaker/test_tfs.py b/test/integration/sagemaker/test_tfs.py index 482a8cfa..5ad80ceb 100644 --- a/test/integration/sagemaker/test_tfs.py +++ b/test/integration/sagemaker/test_tfs.py @@ -174,9 +174,9 @@ def test_python_model_with_lib(boto_session, sagemaker_client, def test_resnet_with_inference_handler( - boto_session, image_uri, instance_type, resnet_model_tar_path, framework_version + boto_session, image_uri, instance_type, resnet_model_tar_path, version ): - if Version(framework_version) >= Version("2.6"): + if Version(version) >= Version("2.6"): pytest.skip("The inference script currently uses v1 compat features, making it incompatible with TF>=2.6") sagemaker_session = sagemaker.Session(boto_session=boto_session) From 2b75e86c6f6f53d72a91c4ceedcd17ede316b5f0 Mon Sep 17 00:00:00 2001 From: Sai Parthasarathy Miduthuri Date: Wed, 8 Dec 2021 11:31:59 -0800 Subject: [PATCH 5/6] Change docker image sources --- docker/1.15/Dockerfile.cpu | 2 +- docker/1.15/Dockerfile.eia | 2 +- docker/2.1/Dockerfile.cpu | 2 +- scripts/build-all.sh | 6 +++--- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docker/1.15/Dockerfile.cpu b/docker/1.15/Dockerfile.cpu index 0b304a75..abbfdb85 100644 --- a/docker/1.15/Dockerfile.cpu +++ b/docker/1.15/Dockerfile.cpu @@ -1,4 +1,4 @@ -FROM ubuntu:18.04 +FROM public.ecr.aws/ubuntu/ubuntu:18.04 LABEL maintainer="Amazon AI" # Specify LABEL for inference pipelines to use SAGEMAKER_BIND_TO_PORT diff --git a/docker/1.15/Dockerfile.eia b/docker/1.15/Dockerfile.eia index b1d753de..602a4978 100644 --- a/docker/1.15/Dockerfile.eia +++ b/docker/1.15/Dockerfile.eia @@ -1,4 +1,4 @@ -FROM ubuntu:18.04 +FROM public.ecr.aws/ubuntu/ubuntu:18.04 LABEL maintainer="Amazon AI" # Specify LABEL for inference pipelines to use SAGEMAKER_BIND_TO_PORT diff --git a/docker/2.1/Dockerfile.cpu b/docker/2.1/Dockerfile.cpu index b85c6e29..36b3c3f3 100644 --- a/docker/2.1/Dockerfile.cpu +++ b/docker/2.1/Dockerfile.cpu @@ -1,4 +1,4 @@ -FROM ubuntu:18.04 +FROM public.ecr.aws/ubuntu/ubuntu:18.04 LABEL maintainer="Amazon AI" LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port=true diff --git a/scripts/build-all.sh b/scripts/build-all.sh index 5a32e3ab..2eb83a47 100755 --- a/scripts/build-all.sh +++ b/scripts/build-all.sh @@ -6,8 +6,8 @@ set -euo pipefail DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -${DIR}/build.sh --version 1.14.0 --arch eia +${DIR}/build.sh --version 1.15.0 --arch eia ${DIR}/build.sh --version 1.15.0 --arch cpu -${DIR}/build.sh --version 1.15.0 --arch gpu +#${DIR}/build.sh --version 1.15.0 --arch gpu ${DIR}/build.sh --version 2.1.0 --arch cpu -${DIR}/build.sh --version 2.1.0 --arch gpu +#${DIR}/build.sh --version 2.1.0 --arch gpu From 530925790880ad5c927844a806cd6ce0bb1a90ab Mon Sep 17 00:00:00 2001 From: Sai Parthasarathy Miduthuri Date: Wed, 8 Dec 2021 12:05:09 -0800 Subject: [PATCH 6/6] Remove EIA --- scripts/build-all.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/build-all.sh b/scripts/build-all.sh index 2eb83a47..18a6a107 100755 --- a/scripts/build-all.sh +++ b/scripts/build-all.sh @@ -6,7 +6,7 @@ set -euo pipefail DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -${DIR}/build.sh --version 1.15.0 --arch eia +#${DIR}/build.sh --version 1.15.0 --arch eia ${DIR}/build.sh --version 1.15.0 --arch cpu #${DIR}/build.sh --version 1.15.0 --arch gpu ${DIR}/build.sh --version 2.1.0 --arch cpu