Skip to content

Commit 97f2f46

Browse files
authored
ci: Enable detection of unresponsive or crashed Python backend stub process (#8552)
1 parent 08ffffe commit 97f2f46

File tree

3 files changed

+205
-1
lines changed

3 files changed

+205
-1
lines changed
Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
#!/bin/bash
2+
# Copyright 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
#
4+
# Redistribution and use in source and binary forms, with or without
5+
# modification, are permitted provided that the following conditions
6+
# are met:
7+
# * Redistributions of source code must retain the above copyright
8+
# notice, this list of conditions and the following disclaimer.
9+
# * Redistributions in binary form must reproduce the above copyright
10+
# notice, this list of conditions and the following disclaimer in the
11+
# documentation and/or other materials provided with the distribution.
12+
# * Neither the name of NVIDIA CORPORATION nor the names of its
13+
# contributors may be used to endorse or promote products derived
14+
# from this software without specific prior written permission.
15+
#
16+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
17+
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
20+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
21+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23+
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
24+
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27+
28+
# Name of the results file; set before the helper scripts are sourced.
# NOTE(review): presumably consumed by the sourced helpers - confirm.
TEST_RESULT_FILE='test_results.txt'
source ../common.sh
source ../../common/util.sh

# Command-line arguments for the server: model repository location, backend
# directory, and verbose logging.
SERVER_ARGS="--model-repository=${MODELDIR}/model_readiness/models --backend-directory=${BACKEND_DIR} --log-verbose=1"

# Overall test status; set to 1 by any failing check below.
RET=0

# Start from a clean slate: drop logs and the model repository of earlier runs.
rm -fr *.log ./models

# Stage the Python model (version 1) and its configuration into ./models.
MODEL_NAME="identity_fp32"
mkdir -p "models/${MODEL_NAME}/1/"
cp "../../python_models/${MODEL_NAME}/model.py" "./models/${MODEL_NAME}/1/model.py"
cp "../../python_models/${MODEL_NAME}/config.pbtxt" "./models/${MODEL_NAME}/config.pbtxt"
41+
42+
#
# Test Model Readiness (TRITONBACKEND_ModelInstanceReady)
#
# For every signal below: start the server, confirm the model reports READY,
# deliver the signal to the Python backend stub process, then confirm the
# model reports NOT READY and that the server log carries the expected
# "not healthy" message once per protocol check.
#
# Test with different signals to simulate various crash/exit scenarios
#   11 (SIGSEGV) - Segmentation fault / crash
#    9 (SIGKILL) - Force kill
for SIGNAL in 11 9; do
    echo -e "\n***\n*** Testing model_readiness with Signal $SIGNAL\n***"
    SERVER_LOG="./model_readiness_signal_${SIGNAL}_server.log"
    CLIENT_LOG="./model_readiness_${SIGNAL}_client.log"

    run_server
    if [ "$SERVER_PID" == "0" ]; then
        cat "$SERVER_LOG"
        echo -e "\n***\n*** Failed to start $SERVER\n***"
        exit 1
    fi

    # The checks below are expected to be able to fail; handle their status
    # explicitly instead of letting the shell abort.
    set +e

    # Verify model is initially ready
    echo "Checking Initial Readiness..."
    if ! python3 -m unittest test_model_readiness.TestModelReadiness.test_model_ready >> "${CLIENT_LOG}" 2>&1; then
        echo -e "\n***\n*** Test model_readiness Failed (Signal $SIGNAL): Initial readiness check failed \n***"
        RET=1
        kill_server
        exit 1
    fi

    # Find the stub process PID(s). An array is used so that several matching
    # processes are handled without relying on unquoted word splitting.
    mapfile -t stub_pids < <(pgrep -f "triton_python_backend_stub")

    if [ "${#stub_pids[@]}" -eq 0 ]; then
        echo -e "\n***\n*** Test model_readiness Failed (Signal $SIGNAL): Could not find stub process \n***"
        RET=1
        # The server is intentionally NOT stopped here: the kill_server call
        # at the end of this iteration does it. Stopping it twice is redundant
        # and, with 'set -e' re-enabled below, would presumably abort the
        # script if kill_server fails on an already-stopped PID.
    else
        echo "Found stub process: ${stub_pids[*]}"

        # Kill the stub process
        echo "Killing stub with signal $SIGNAL..."
        kill "-${SIGNAL}" "${stub_pids[@]}"
        sleep 1

        # Verify model is now NOT ready. This check is run exactly once
        # because the log verification below counts the resulting messages.
        echo "Checking Not Ready Status..."
        if ! python3 -m unittest test_model_readiness.TestModelReadiness.test_model_not_ready >> "${CLIENT_LOG}" 2>&1; then
            echo -e "\n***\n*** Test model_readiness Failed (Signal $SIGNAL): Model reported ready after kill \n***"
            RET=1
        else
            # Verify correct error message in logs
            # Expect 2 occurrences: HTTP and gRPC checks
            # -F: the message is a literal string, not a regular expression.
            error_count=$(grep -F -c -- "Model '${MODEL_NAME}' version 1 is not ready: Stub process '${MODEL_NAME}_0_0' is not healthy." "$SERVER_LOG")
            # grep prints nothing if the log cannot be read; treat as zero.
            error_count=${error_count:-0}
            if [ "$error_count" -eq 2 ]; then
                echo -e "\n***\n Test model_readiness Passed for Signal $SIGNAL \n***"
            else
                echo -e "\n***\n*** Test model_readiness Failed (Signal $SIGNAL): Expected 2 error messages, found $error_count \n***"
                cat "$SERVER_LOG"
                RET=1
            fi
        fi
    fi

    set -e
    kill_server
done
109+
110+
# Print the overall verdict and use RET as the script's exit status.
summary="\n***\n*** Test FAILED\n***"
if [ "$RET" -eq 0 ]; then
    summary="\n***\n*** Test Passed\n***"
fi
echo -e "$summary"

exit $RET
117+
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
# Copyright 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
#
3+
# Redistribution and use in source and binary forms, with or without
4+
# modification, are permitted provided that the following conditions
5+
# are met:
6+
# * Redistributions of source code must retain the above copyright
7+
# notice, this list of conditions and the following disclaimer.
8+
# * Redistributions in binary form must reproduce the above copyright
9+
# notice, this list of conditions and the following disclaimer in the
10+
# documentation and/or other materials provided with the distribution.
11+
# * Neither the name of NVIDIA CORPORATION nor the names of its
12+
# contributors may be used to endorse or promote products derived
13+
# from this software without specific prior written permission.
14+
#
15+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16+
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22+
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23+
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24+
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
27+
import unittest
28+
29+
import tritonclient.grpc as grpcclient
30+
import tritonclient.http as httpclient
31+
32+
33+
class TestModelReadiness(unittest.TestCase):
34+
def setUp(self):
35+
self.model_name = "identity_fp32"
36+
self.url_http = "localhost:8000"
37+
self.url_grpc = "localhost:8001"
38+
self.client_http = httpclient.InferenceServerClient(url=self.url_http)
39+
self.client_grpc = grpcclient.InferenceServerClient(url=self.url_grpc)
40+
41+
def test_model_ready(self):
42+
print(f"\nTesting if model '{self.model_name}' is READY ...")
43+
44+
# Check HTTP
45+
try:
46+
is_ready = self.client_http.is_model_ready(self.model_name)
47+
self.assertTrue(
48+
is_ready, f"[HTTP] Model {self.model_name} should be READY but is NOT"
49+
)
50+
except Exception as e:
51+
self.fail(f"[HTTP] Unexpected error: {str(e)}")
52+
53+
# Check gRPC
54+
try:
55+
is_ready = self.client_grpc.is_model_ready(self.model_name)
56+
self.assertTrue(
57+
is_ready, f"[gRPC] Model {self.model_name} should be READY but is NOT"
58+
)
59+
except Exception as e:
60+
self.fail(f"[gRPC] Unexpected error: {str(e)}")
61+
62+
def test_model_not_ready(self):
63+
print(f"\nTesting if model '{self.model_name}' is NOT READY ...")
64+
65+
# Check HTTP
66+
try:
67+
is_ready = self.client_http.is_model_ready(self.model_name)
68+
self.assertFalse(
69+
is_ready,
70+
f"[HTTP] Model {self.model_name} should be NOT READY but is READY",
71+
)
72+
except Exception as e:
73+
self.fail(f"[HTTP] Unexpected error: {str(e)}")
74+
75+
# Check gRPC
76+
try:
77+
is_ready = self.client_grpc.is_model_ready(self.model_name)
78+
self.assertFalse(
79+
is_ready,
80+
f"[gRPC] Model {self.model_name} should be NOT READY but is READY.",
81+
)
82+
except Exception as e:
83+
self.fail(f"[gRPC] Unexpected error: {str(e)}")
84+
85+
86+
if __name__ == "__main__":
87+
unittest.main()

qa/L0_backend_python/test.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -522,7 +522,7 @@ SUBTESTS="lifecycle argument_validation logging custom_metrics parameters"
522522
# [DLIS-6123] Disable examples test for Windows since it requires updates to the example clients
523523
if [[ ${TEST_WINDOWS} == 0 ]]; then
524524
# TODO: Reimplement restart on decoupled data pipeline and enable restart.
525-
SUBTESTS+=" model_control examples request_rescheduling"
525+
SUBTESTS+=" model_control examples request_rescheduling model_readiness"
526526
fi
527527
for TEST in ${SUBTESTS}; do
528528
# Run each subtest in a separate virtual environment to avoid conflicts

0 commit comments

Comments
 (0)