This repository was archived by the owner on May 23, 2024. It is now read-only.

fix: Use default model name when model name is None #214

Open · wants to merge 6 commits into master

2 changes: 1 addition & 1 deletion docker/1.15/Dockerfile.cpu
@@ -1,4 +1,4 @@
-FROM ubuntu:18.04
+FROM public.ecr.aws/ubuntu/ubuntu:18.04

LABEL maintainer="Amazon AI"
# Specify LABEL for inference pipelines to use SAGEMAKER_BIND_TO_PORT
2 changes: 1 addition & 1 deletion docker/1.15/Dockerfile.eia
@@ -1,4 +1,4 @@
-FROM ubuntu:18.04
+FROM public.ecr.aws/ubuntu/ubuntu:18.04

LABEL maintainer="Amazon AI"
# Specify LABEL for inference pipelines to use SAGEMAKER_BIND_TO_PORT
2 changes: 1 addition & 1 deletion docker/2.1/Dockerfile.cpu
@@ -1,4 +1,4 @@
-FROM ubuntu:18.04
+FROM public.ecr.aws/ubuntu/ubuntu:18.04

LABEL maintainer="Amazon AI"
LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port=true
2 changes: 1 addition & 1 deletion docker/build_artifacts/sagemaker/multi_model_utils.py
@@ -21,7 +21,7 @@

@contextmanager
def lock(path=DEFAULT_LOCK_FILE):
-    f = open(path, "w")
+    f = open(path, "w", encoding="utf-8")
    fd = f.fileno()
    fcntl.lockf(fd, fcntl.LOCK_EX)

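Note: the lock helper touched above is a small fcntl-based advisory lock exposed as a context manager. A minimal usage sketch (hypothetical; it assumes the module is importable as multi_model_utils inside the container and that the default lock path is writable):

from multi_model_utils import lock

# Hypothetical usage: hold the exclusive lock for the duration of the block,
# e.g. to serialize concurrent model load/unload requests.
with lock():
    pass  # critical section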
2 changes: 1 addition & 1 deletion docker/build_artifacts/sagemaker/python_service.py
@@ -150,7 +150,7 @@ def _handle_load_model_post(self, res, data): # noqa: C901
tfs_config_file = "/sagemaker/tfs-config/{}/model-config.cfg".format(model_name)
log.info("tensorflow serving model config: \n%s\n", tfs_config)
os.makedirs(os.path.dirname(tfs_config_file))
-with open(tfs_config_file, "w") as f:
+with open(tfs_config_file, "w", encoding="utf-8") as f:
    f.write(tfs_config)

batching_config_file = "/sagemaker/batching/{}/batching-config.cfg".format(
6 changes: 3 additions & 3 deletions docker/build_artifacts/sagemaker/serve.py
@@ -159,7 +159,7 @@ def _create_tfs_config(self):

log.info("tensorflow serving model config: \n%s\n", config)

-with open(self._tfs_config_path, "w") as f:
+with open(self._tfs_config_path, "w", encoding="utf-8") as f:
    f.write(config)

def _setup_gunicorn(self):
@@ -258,11 +258,11 @@ def _create_nginx_config(self):
config = pattern.sub(lambda x: template_values[x.group(1)], template)
log.info("nginx config: \n%s\n", config)

with open("/sagemaker/nginx.conf", "w") as f:
with open("/sagemaker/nginx.conf", "w", encoding="utf-8") as f:
f.write(config)

def _read_nginx_template(self):
-    with open("/sagemaker/nginx.conf.template", "r") as f:
+    with open("/sagemaker/nginx.conf.template", "r", encoding="utf-8") as f:
        template = f.read()
        if not template:
            raise ValueError("failed to read nginx.conf.template")
4 changes: 2 additions & 2 deletions docker/build_artifacts/sagemaker/tfs_utils.py
@@ -41,7 +41,7 @@ def parse_request(req, rest_port, grpc_port, default_model_name, model_name=None
tfs_uri = make_tfs_uri(rest_port, tfs_attributes, default_model_name, model_name)

if not model_name:
-    model_name = tfs_attributes.get("tfs-model-name")
+    model_name = tfs_attributes.get("tfs-model-name") or default_model_name

context = Context(model_name,
                  tfs_attributes.get("tfs-model-version"),
@@ -220,7 +220,7 @@ def __init__(self, key, env_var, value, defaulted_message):
config += "%s { value: %s }\n" % (batching_parameter.key, batching_parameter.value)

log.info("batching config: \n%s\n", config)
-with open(batching_config_file, "w") as f:
+with open(batching_config_file, "w", encoding="utf-8") as f:
    f.write(config)


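Note: the parse_request hunk above is the substance of this PR. When the caller passes no model name and the request carries no tfs-model-name attribute, the parsed context now falls back to the container's default model instead of ending up with model_name set to None. A minimal sketch of the fallback (the helper name and sample values below are illustrative, not part of the diff):

def resolve_model_name(tfs_attributes, default_model_name, model_name=None):
    # Mirrors the new behavior: explicit name first, then the "tfs-model-name"
    # request attribute, then the serving container's default model.
    if not model_name:
        model_name = tfs_attributes.get("tfs-model-name") or default_model_name
    return model_name

# Illustrative values: with no attribute present, the default now wins (previously None).
assert resolve_model_name({}, "half_plus_three") == "half_plus_three"
assert resolve_model_name({"tfs-model-name": "resnet"}, "half_plus_three") == "resnet"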
6 changes: 3 additions & 3 deletions scripts/build-all.sh
@@ -6,8 +6,8 @@ set -euo pipefail

DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"

-${DIR}/build.sh --version 1.14.0 --arch eia
+#${DIR}/build.sh --version 1.15.0 --arch eia
${DIR}/build.sh --version 1.15.0 --arch cpu
-${DIR}/build.sh --version 1.15.0 --arch gpu
+#${DIR}/build.sh --version 1.15.0 --arch gpu
${DIR}/build.sh --version 2.1.0 --arch cpu
-${DIR}/build.sh --version 2.1.0 --arch gpu
+#${DIR}/build.sh --version 2.1.0 --arch gpu
75 changes: 75 additions & 0 deletions test/integration/sagemaker/test_tfs.py
@@ -10,15 +10,29 @@
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
import json
import os
import tarfile

import boto3
import pytest
import sagemaker
import urllib.request

import util

from packaging.version import Version

from sagemaker.tensorflow import TensorFlowModel
from sagemaker.utils import name_from_base

from timeout import timeout_and_delete_endpoint

NON_P3_REGIONS = ["ap-southeast-1", "ap-southeast-2", "ap-south-1",
                  "ca-central-1", "eu-central-1", "eu-west-2", "us-west-1"]

RESOURCES_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "resources"))


@pytest.fixture(params=os.environ["TEST_VERSIONS"].split(","))
def version(request):
@@ -75,6 +89,28 @@ def python_model_with_lib(region, boto_session):
"test/data/python-with-lib.tar.gz")


@pytest.fixture(scope="session")
def resnet_model_tar_path():
    model_path = os.path.join(RESOURCES_PATH, "models", "resnet50_v1")
    model_tar_path = os.path.join(model_path, "model.tar.gz")
    if os.path.exists(model_tar_path):
        os.remove(model_tar_path)
    s3_resource = boto3.resource("s3")
    models_bucket = s3_resource.Bucket("aws-dlc-sample-models")
    model_s3_location = "tensorflow/resnet50_v1/model"
    for obj in models_bucket.objects.filter(Prefix=model_s3_location):
        local_file = os.path.join(model_path, "model", os.path.relpath(obj.key, model_s3_location))
        if not os.path.isdir(os.path.dirname(local_file)):
            os.makedirs(os.path.dirname(local_file))
        if obj.key.endswith("/"):
            continue
        models_bucket.download_file(obj.key, local_file)
    with tarfile.open(model_tar_path, "w:gz") as model_tar:
        model_tar.add(os.path.join(model_path, "code"), arcname="code")
        model_tar.add(os.path.join(model_path, "model"), arcname="model")
    return model_tar_path


def test_tfs_model(boto_session, sagemaker_client,
                   sagemaker_runtime_client, model_name, tfs_model,
                   image_uri, instance_type, accelerator_type):
@@ -135,3 +171,42 @@ def test_python_model_with_lib(boto_session, sagemaker_client,
    # python service adds this to tfs response
    assert output_data["python"] is True
    assert output_data["dummy_module"] == "0.1"


def test_resnet_with_inference_handler(
    boto_session, image_uri, instance_type, resnet_model_tar_path, version
):
    if Version(version) >= Version("2.6"):
        pytest.skip("The inference script currently uses v1 compat features, making it incompatible with TF>=2.6")

    sagemaker_session = sagemaker.Session(boto_session=boto_session)
    model_data = sagemaker_session.upload_data(
        path=resnet_model_tar_path, key_prefix=os.path.join("tensorflow-inference", "resnet")
    )
    endpoint_name = name_from_base("tensorflow-inference")

    tensorflow_model = TensorFlowModel(
        model_data=model_data,
        role="SageMakerRole",
        entry_point="inference.py",
        image_uri=image_uri,
        sagemaker_session=sagemaker_session,
    )

    with timeout_and_delete_endpoint(endpoint_name, sagemaker_session, minutes=30):
        tensorflow_predictor = tensorflow_model.deploy(
            initial_instance_count=1, instance_type=instance_type, endpoint_name=endpoint_name
        )
        kitten_url = "https://s3.amazonaws.com/model-server/inputs/kitten.jpg"
        kitten_local_path = "kitten.jpg"
        urllib.request.urlretrieve(kitten_url, kitten_local_path)
        with open(kitten_local_path, "rb") as f:
            kitten_image = f.read()

        runtime_client = sagemaker_session.sagemaker_runtime_client
        response = runtime_client.invoke_endpoint(
            EndpointName=endpoint_name, ContentType='application/x-image', Body=kitten_image
        )
        result = json.loads(response['Body'].read().decode('ascii'))

        assert len(result["outputs"]["probabilities"]["floatVal"]) == 3
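Note: the resnet_model_tar_path fixture added above packs the downloaded SavedModel and the inference script into a single archive with top-level code/ and model/ directories. A hypothetical check of that layout (not part of the diff):

import tarfile

# Inspect the archive produced by the fixture: expect code/inference.py,
# code/requirements.txt and the SavedModel under model/.
with tarfile.open("model.tar.gz", "r:gz") as tar:
    names = tar.getnames()
    assert any(n == "code" or n.startswith("code/") for n in names)
    assert any(n == "model" or n.startswith("model/") for n in names)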
83 changes: 83 additions & 0 deletions test/integration/sagemaker/timeout.py
@@ -0,0 +1,83 @@
# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
# TODO: this is used in all containers and sdk. We should move it to container support or sdk test utils.
from __future__ import absolute_import
import signal
from contextlib import contextmanager
import logging

from botocore.exceptions import ClientError

LOGGER = logging.getLogger('timeout')


class TimeoutError(Exception):
    pass


@contextmanager
def timeout(seconds=0, minutes=0, hours=0):
    """Add a signal-based timeout to any block of code.
    If multiple time units are specified, they will be added together to determine time limit.
    Usage:
    with timeout(seconds=5):
        my_slow_function(...)
    Args:
        - seconds: The time limit, in seconds.
        - minutes: The time limit, in minutes.
        - hours: The time limit, in hours.
    """

    limit = seconds + 60 * minutes + 3600 * hours

    def handler(signum, frame):
        raise TimeoutError('timed out after {} seconds'.format(limit))

    try:
        signal.signal(signal.SIGALRM, handler)
        signal.alarm(limit)

        yield
    finally:
        signal.alarm(0)


@contextmanager
def timeout_and_delete_endpoint(endpoint_name, sagemaker_session,
                                seconds=0, minutes=0, hours=0):
    with timeout(seconds=seconds, minutes=minutes, hours=hours) as t:
        try:
            yield [t]
        finally:
            try:
                sagemaker_session.delete_endpoint(endpoint_name)
                LOGGER.info("deleted endpoint {}".format(endpoint_name))
            except ClientError as ce:
                if ce.response['Error']['Code'] == 'ValidationException':
                    # avoids the inner exception to be overwritten
                    pass


@contextmanager
def timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, seconds=0, minutes=0, hours=0):
    with timeout(seconds=seconds, minutes=minutes, hours=hours) as t:
        try:
            yield [t]
        finally:
            try:
                sagemaker_session.delete_endpoint(endpoint_name)
                LOGGER.info('deleted endpoint {}'.format(endpoint_name))
            except ClientError as ce:
                if ce.response['Error']['Code'] == 'ValidationException':
                    # avoids the inner exception to be overwritten
                    pass
44 changes: 44 additions & 0 deletions test/resources/models/resnet50_v1/code/inference.py
@@ -0,0 +1,44 @@
import io

import grpc
import gzip
import numpy as np
import tensorflow as tf

from google.protobuf.json_format import MessageToJson
from PIL import Image
from tensorflow_serving.apis import predict_pb2, prediction_service_pb2_grpc

prediction_services = {}
compression_algo = gzip


def handler(data, context):
    f = data.read()
    f = io.BytesIO(f)
    image = Image.open(f).convert('RGB')
    batch_size = 1
    image = np.asarray(image.resize((224, 224)))
    image = np.concatenate([image[np.newaxis, :, :]] * batch_size)

    request = predict_pb2.PredictRequest()
    request.model_spec.name = context.model_name
    request.model_spec.signature_name = 'serving_default'
    request.inputs['images'].CopyFrom(
        tf.compat.v1.make_tensor_proto(image, shape=image.shape, dtype=tf.float32))

    # Call Predict gRPC service
    result = get_prediction_service(context).Predict(request, 60.0)
    print("Returning the response for grpc port: {}".format(context.grpc_port))

    # Return response
    json_obj = MessageToJson(result)
    return json_obj, "application/json"


def get_prediction_service(context):
    # global prediction_service
    if context.grpc_port not in prediction_services:
        channel = grpc.insecure_channel("localhost:{}".format(context.grpc_port))
        prediction_services[context.grpc_port] = prediction_service_pb2_grpc.PredictionServiceStub(channel)
    return prediction_services[context.grpc_port]
3 changes: 3 additions & 0 deletions test/resources/models/resnet50_v1/code/requirements.txt
@@ -0,0 +1,3 @@
numpy
Pillow
tensorflow==1.15.5
5 changes: 4 additions & 1 deletion tox.ini
@@ -6,7 +6,7 @@
[tox]
skipsdist = True
skip_missing_interpreters = False
-envlist = jshint,flake8,pylint,py36,py37
+envlist = jshint,flake8,pylint,py36

[flake8]
max-line-length = 100
@@ -60,7 +60,10 @@ deps =
    pytest
    pytest-xdist
    boto3
    packaging
    requests
    sagemaker
    urllib3

[testenv:flake8]
deps =