diff --git a/qa/L0_backend_python/model_readiness/test.sh b/qa/L0_backend_python/model_readiness/test.sh
new file mode 100755
index 0000000000..cc87aeacd8
--- /dev/null
+++ b/qa/L0_backend_python/model_readiness/test.sh
@@ -0,0 +1,123 @@
+#!/bin/bash
+# Copyright 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# Verifies that a Python backend model is no longer reported as ready after
+# its stub process has been killed.
+
+TEST_RESULT_FILE='test_results.txt'
+source ../common.sh
+source ../../common/util.sh
+
+SERVER_ARGS="--model-repository=${MODELDIR}/model_readiness/models --backend-directory=${BACKEND_DIR} --log-verbose=1"
+
+RET=0
+rm -fr *.log ./models
+
+MODEL_NAME="identity_fp32"
+mkdir -p "models/${MODEL_NAME}/1/"
+cp "../../python_models/${MODEL_NAME}/model.py" "./models/${MODEL_NAME}/1/model.py"
+cp "../../python_models/${MODEL_NAME}/config.pbtxt" "./models/${MODEL_NAME}/config.pbtxt"
+
+#
+# Test Model Readiness (TRITONBACKEND_ModelInstanceReady)
+# Test with different signals to simulate various crash/exit scenarios
+#   11 (SIGSEGV) - Segmentation fault / crash
+#    9 (SIGKILL) - Force kill
+for SIGNAL in 11 9; do
+    echo -e "\n***\n*** Testing model_readiness with Signal $SIGNAL\n***"
+    SERVER_LOG="./model_readiness_signal_${SIGNAL}_server.log"
+    CLIENT_LOG="./model_readiness_${SIGNAL}_client.log"
+
+    run_server
+    if [ "$SERVER_PID" == "0" ]; then
+        cat "$SERVER_LOG"
+        echo -e "\n***\n*** Failed to start $SERVER\n***"
+        exit 1
+    fi
+
+    set +e
+
+    # Verify model is initially ready
+    echo "Checking Initial Readiness..."
+    python3 -m unittest test_model_readiness.TestModelReadiness.test_model_ready >> "${CLIENT_LOG}" 2>&1
+    if [ $? -ne 0 ]; then
+        echo -e "\n***\n*** Test model_readiness Failed (Signal $SIGNAL): Initial readiness check failed \n***"
+        RET=1
+        kill_server
+        exit 1
+    fi
+
+    # Find the stub process PID
+    stub_pid=$(pgrep -f "triton_python_backend_stub")
+
+    if [ -z "$stub_pid" ]; then
+        echo -e "\n***\n*** Test model_readiness Failed (Signal $SIGNAL): Could not find stub process \n***"
+        # The server is stopped once, by the kill_server call that ends every
+        # iteration; stopping it here as well would make that second call run
+        # against an already stopped server while 'set -e' is active.
+        RET=1
+    else
+        echo "Found stub process: $stub_pid"
+
+        # Kill the stub process. $stub_pid is deliberately left unquoted:
+        # pgrep prints one PID per line when several processes match.
+        echo "Killing stub with signal $SIGNAL..."
+        kill -$SIGNAL $stub_pid
+        sleep 1
+
+        # Verify model is now NOT ready
+        echo "Checking Not Ready Status..."
+        python3 -m unittest test_model_readiness.TestModelReadiness.test_model_not_ready >> "${CLIENT_LOG}" 2>&1
+        if [ $? -ne 0 ]; then
+            echo -e "\n***\n*** Test model_readiness Failed (Signal $SIGNAL): Model reported ready after kill \n***"
+            RET=1
+        else
+            # Verify correct error message in logs
+            # Expect 2 occurrences: HTTP and gRPC checks
+            # -F matches the message literally, so its '.' is not a wildcard.
+            error_count=$(grep -c -F -- "Model '${MODEL_NAME}' version 1 is not ready: Stub process '${MODEL_NAME}_0_0' is not healthy." "$SERVER_LOG")
+            if [ "${error_count:-0}" -eq 2 ]; then
+                echo -e "\n***\n Test model_readiness Passed for Signal $SIGNAL \n***"
+            else
+                echo -e "\n***\n*** Test model_readiness Failed (Signal $SIGNAL): Expected 2 error messages, found ${error_count:-0} \n***"
+                cat "$SERVER_LOG"
+                RET=1
+            fi
+        fi
+    fi
+
+    set -e
+    kill_server
+done
+
+if [ $RET -eq 0 ]; then
+    echo -e "\n***\n*** Test Passed\n***"
+else
+    echo -e "\n***\n*** Test FAILED\n***"
+fi
+
+exit $RET
diff --git a/qa/L0_backend_python/model_readiness/test_model_readiness.py b/qa/L0_backend_python/model_readiness/test_model_readiness.py
new file mode 100644
index 0000000000..65d1c81d8a
--- /dev/null
+++ b/qa/L0_backend_python/model_readiness/test_model_readiness.py
@@ -0,0 +1,93 @@
+# Copyright 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import unittest
+
+import tritonclient.grpc as grpcclient
+import tritonclient.http as httpclient
+
+
+class TestModelReadiness(unittest.TestCase):
+    """Checks the readiness a model reports over HTTP and gRPC.
+
+    test.sh runs test_model_ready while the model's stub process is alive
+    and test_model_not_ready after it has killed that process.
+    """
+
+    def setUp(self):
+        self.model_name = "identity_fp32"
+        self.url_http = "localhost:8000"
+        self.url_grpc = "localhost:8001"
+        self.client_http = httpclient.InferenceServerClient(url=self.url_http)
+        # Close the connections when the test ends, whatever its outcome.
+        self.addCleanup(self.client_http.close)
+        self.client_grpc = grpcclient.InferenceServerClient(url=self.url_grpc)
+        self.addCleanup(self.client_grpc.close)
+
+    def _query_readiness(self, protocol, client):
+        """Return the readiness of the model as reported through `client`.
+
+        Only the client call is guarded: an assertion made on the returned
+        value must surface as itself, not as an "Unexpected error".
+        """
+        try:
+            return client.is_model_ready(self.model_name)
+        except Exception as e:
+            self.fail(f"[{protocol}] Unexpected error: {str(e)}")
+
+    def test_model_ready(self):
+        print(f"\nTesting if model '{self.model_name}' is READY ...")
+
+        # Check HTTP
+        self.assertTrue(
+            self._query_readiness("HTTP", self.client_http),
+            f"[HTTP] Model {self.model_name} should be READY but is NOT",
+        )
+
+        # Check gRPC
+        self.assertTrue(
+            self._query_readiness("gRPC", self.client_grpc),
+            f"[gRPC] Model {self.model_name} should be READY but is NOT",
+        )
+
+    def test_model_not_ready(self):
+        print(f"\nTesting if model '{self.model_name}' is NOT READY ...")
+
+        # Check HTTP
+        self.assertFalse(
+            self._query_readiness("HTTP", self.client_http),
+            f"[HTTP] Model {self.model_name} should be NOT READY but is READY",
+        )
+
+        # Check gRPC
+        self.assertFalse(
+            self._query_readiness("gRPC", self.client_grpc),
+            f"[gRPC] Model {self.model_name} should be NOT READY but is READY.",
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/qa/L0_backend_python/test.sh b/qa/L0_backend_python/test.sh
index 58b92590e1..73e825ebf2 100755
--- a/qa/L0_backend_python/test.sh
+++ b/qa/L0_backend_python/test.sh
@@ -522,7 +522,7 @@ SUBTESTS="lifecycle argument_validation logging custom_metrics parameters"
 # [DLIS-6123] Disable examples test for Windows since it requires updates to the example clients
 if [[ ${TEST_WINDOWS} == 0 ]]; then
     # TODO: Reimplement restart on decoupled data pipeline and enable restart.
-    SUBTESTS+=" model_control examples request_rescheduling"
+    SUBTESTS+=" model_control examples request_rescheduling model_readiness"
 fi
 for TEST in ${SUBTESTS}; do
     # Run each subtest in a separate virtual environment to avoid conflicts