Skip to content

Commit c461731

Browse files
committed
Add safe iterator wrapper to handle
UnicodeDecodeError in SSH output - Added _safe_iterator() method to gracefully handle invalid UTF-8 bytes in stdout/stderr - Method uses iter() to handle both lists (from tests) and iterators (from pssh library) - Skips malformed lines with warning message and continues processing - Includes comprehensive unit tests with actual non-UTF-8 bytes (b'\x96', b'\xff\xfe') - Fixes test failures caused by tqdm progress bars containing invalid UTF-8
1 parent 277697a commit c461731

6 files changed

Lines changed: 216 additions & 83 deletions

File tree

cvs/input/config_file/inference/vllm/mi355x_singlenode_vllm.json

Lines changed: 28 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -77,8 +77,8 @@
7777
"result_dict": {
7878
"ISL=1024,OSL=1024,TP=1,CONC=16": {
7979
"total_throughput_per_sec": "4651",
80-
"mean_ttft_ms": "200",
81-
"mean_tpot_ms": "10"
80+
"mean_ttft_ms": "70",
81+
"mean_tpot_ms": "8"
8282
},
8383
"ISL=1024,OSL=1024,TP=1,CONC=32": {
8484
"total_throughput_per_sec": "7043",
@@ -87,28 +87,28 @@
8787
},
8888
"ISL=1024,OSL=1024,TP=1,CONC=64": {
8989
"total_throughput_per_sec": "10677",
90-
"mean_ttft_ms": "150",
91-
"mean_tpot_ms": "8"
90+
"mean_ttft_ms": "76",
91+
"mean_tpot_ms": "13"
9292
},
9393
"ISL=1024,OSL=8192,TP=1,CONC=16": {
9494
"total_throughput_per_sec": "2735",
95-
"mean_ttft_ms": "250",
96-
"mean_tpot_ms": "15"
95+
"mean_ttft_ms": "57",
96+
"mean_tpot_ms": "7"
9797
},
9898
"ISL=1024,OSL=8192,TP=1,CONC=32": {
9999
"total_throughput_per_sec": "4038",
100-
"mean_ttft_ms": "230",
101-
"mean_tpot_ms": "13"
100+
"mean_ttft_ms": "67",
101+
"mean_tpot_ms": "10"
102102
},
103103
"ISL=1024,OSL=8192,TP=1,CONC=64": {
104104
"total_throughput_per_sec": "6140",
105-
"mean_ttft_ms": "210",
106-
"mean_tpot_ms": "11"
105+
"mean_ttft_ms": "93",
106+
"mean_tpot_ms": "13"
107107
},
108108
"ISL=8192,OSL=1024,TP=1,CONC=16": {
109109
"total_throughput_per_sec": "16509",
110-
"mean_ttft_ms": "350",
111-
"mean_tpot_ms": "20"
110+
"mean_ttft_ms": "335",
111+
"mean_tpot_ms": "24"
112112
},
113113
"ISL=8192,OSL=1024,TP=1,CONC=32": {
114114
"total_throughput_per_sec": "22072",
@@ -117,8 +117,8 @@
117117
},
118118
"ISL=8192,OSL=1024,TP=1,CONC=64": {
119119
"total_throughput_per_sec": "28863",
120-
"mean_ttft_ms": "300",
121-
"mean_tpot_ms": "19"
120+
"mean_ttft_ms": "280",
121+
"mean_tpot_ms": "22"
122122
}
123123
}
124124
},
@@ -174,13 +174,13 @@
174174
"result_dict": {
175175
"ISL=1024,OSL=1024,TP=8,CONC=16": {
176176
"total_throughput_per_sec": "2000",
177-
"mean_ttft_ms": "300",
178-
"mean_tpot_ms": "12"
177+
"mean_ttft_ms": "850",
178+
"mean_tpot_ms": "18"
179179
},
180180
"ISL=1024,OSL=1024,TP=8,CONC=32": {
181181
"total_throughput_per_sec": "3435",
182-
"mean_ttft_ms": "280",
183-
"mean_tpot_ms": "11"
182+
"mean_ttft_ms": "80",
183+
"mean_tpot_ms": "10"
184184
},
185185
"ISL=1024,OSL=1024,TP=8,CONC=64": {
186186
"total_throughput_per_sec": "5840",
@@ -189,13 +189,13 @@
189189
},
190190
"ISL=1024,OSL=8192,TP=8,CONC=16": {
191191
"total_throughput_per_sec": "1119",
192-
"mean_ttft_ms": "350",
193-
"mean_tpot_ms": "18"
192+
"mean_ttft_ms": "415",
193+
"mean_tpot_ms": "25"
194194
},
195195
"ISL=1024,OSL=8192,TP=8,CONC=32": {
196196
"total_throughput_per_sec": "1876",
197-
"mean_ttft_ms": "330",
198-
"mean_tpot_ms": "16"
197+
"mean_ttft_ms": "70",
198+
"mean_tpot_ms": "10"
199199
},
200200
"ISL=1024,OSL=8192,TP=8,CONC=64": {
201201
"total_throughput_per_sec": "3139",
@@ -204,18 +204,18 @@
204204
},
205205
"ISL=8192,OSL=1024,TP=8,CONC=16": {
206206
"total_throughput_per_sec": "7476",
207-
"mean_ttft_ms": "400",
208-
"mean_tpot_ms": "25"
207+
"mean_ttft_ms": "300",
208+
"mean_tpot_ms": "21"
209209
},
210210
"ISL=8192,OSL=1024,TP=8,CONC=32": {
211211
"total_throughput_per_sec": "11312",
212-
"mean_ttft_ms": "380",
213-
"mean_tpot_ms": "23"
212+
"mean_ttft_ms": "355",
213+
"mean_tpot_ms": "27"
214214
},
215215
"ISL=8192,OSL=1024,TP=8,CONC=64": {
216216
"total_throughput_per_sec": "16082",
217-
"mean_ttft_ms": "360",
218-
"mean_tpot_ms": "21"
217+
"mean_ttft_ms": "450",
218+
"mean_tpot_ms": "39"
219219
}
220220
}
221221
},

cvs/lib/inference/base.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ def __init__(
4949
hf_token,
5050
gpu_type='mi300',
5151
distributed_inference=False,
52-
server_launch_poll_count=20,
52+
server_launch_poll_count=30,
5353
):
5454
# Client instance phdl
5555
self.c_phdl = c_phdl
@@ -133,7 +133,7 @@ def __init__(
133133
# Allow derived classes to override server launch wait duration
134134
self.default_server_precheck_wait_time = 30
135135
self.default_server_wait_time = 330
136-
self.default_server_poll_wait_time = 60
136+
self.default_server_poll_wait_time = 120
137137
self.default_server_poll_count = server_launch_poll_count
138138
self.default_server_precheck_error_pattern = re.compile(
139139
'no such file or directory|command not found|cannot access|permission denied|error:|exception:|traceback|failed to start',
@@ -749,9 +749,9 @@ def verify_inference_results(
749749
print(f"✓ All validations passed for {config_key}")
750750
print(self.inference_results_dict)
751751
# Auto-store results
752-
self.collect_test_result("success")
752+
self.collect_test_result()
753753
else:
754754
print(f"✗ Validations failed for {config_key}")
755755
print(self.inference_results_dict)
756756
# Auto-store results even on failure
757-
self.collect_test_result("failed")
757+
self.collect_test_result()

cvs/lib/inference/unittests/test_vllm.py

Lines changed: 23 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -179,25 +179,20 @@ def test_print_all_results_with_single_result(self, mock_print, mock_update):
179179
"""Test printing with a single test result."""
180180
InferenceBaseJob.all_test_results = {
181181
('gpt-oss-120b', 'mi355x', '1024', '8192', 'long_generation', 32): {
182-
'status': 'success',
183-
'results': {
184-
'node1': {
185-
'successful_requests': '640',
186-
'total_throughput_per_sec': '4038',
187-
'mean_ttft_ms': '230',
188-
'mean_tpot_ms': '13',
189-
'p99_itl_ms': '150',
190-
}
191-
},
182+
'node1': {
183+
'successful_requests': '640',
184+
'total_throughput_per_sec': '4038',
185+
'mean_ttft_ms': '230',
186+
'mean_tpot_ms': '13',
187+
'p99_itl_ms': '150',
188+
}
192189
}
193190
}
194191

195192
VllmJob.print_all_results()
196193

197194
# Check that table was printed
198-
table_printed = any(
199-
'success' in str(call) and 'gpt-oss-120b' in str(call) for call in mock_print.call_args_list
200-
)
195+
table_printed = any('gpt-oss-120b' in str(call) for call in mock_print.call_args_list)
201196
self.assertTrue(table_printed or len(mock_print.call_args_list) > 0)
202197

203198
@patch('cvs.lib.inference.vllm.update_test_result')
@@ -206,28 +201,22 @@ def test_print_all_results_with_multiple_results(self, mock_print, mock_update):
206201
"""Test printing with multiple test results."""
207202
InferenceBaseJob.all_test_results = {
208203
('gpt-oss-120b', 'mi355x', '1024', '8192', 'long_generation', 32): {
209-
'status': 'success',
210-
'results': {
211-
'node1': {
212-
'successful_requests': '640',
213-
'total_throughput_per_sec': '4038',
214-
'mean_ttft_ms': '230',
215-
'mean_tpot_ms': '13',
216-
'p99_itl_ms': '150',
217-
}
218-
},
204+
'node1': {
205+
'successful_requests': '640',
206+
'total_throughput_per_sec': '4038',
207+
'mean_ttft_ms': '230',
208+
'mean_tpot_ms': '13',
209+
'p99_itl_ms': '150',
210+
}
219211
},
220212
('gpt-oss-120b', 'mi355x', '8192', '1024', 'long_context', 16): {
221-
'status': 'success',
222-
'results': {
223-
'node1': {
224-
'successful_requests': '800',
225-
'total_throughput_per_sec': '16509',
226-
'mean_ttft_ms': '350',
227-
'mean_tpot_ms': '20',
228-
'p99_itl_ms': '200',
229-
}
230-
},
213+
'node1': {
214+
'successful_requests': '800',
215+
'total_throughput_per_sec': '16509',
216+
'mean_ttft_ms': '350',
217+
'mean_tpot_ms': '20',
218+
'p99_itl_ms': '200',
219+
}
231220
},
232221
}
233222

@@ -240,9 +229,7 @@ def test_print_all_results_with_multiple_results(self, mock_print, mock_update):
240229
class TestClearAllResults(unittest.TestCase):
241230
def test_clear_all_results(self):
242231
"""Test that clear_all_results empties the class variable."""
243-
InferenceBaseJob.all_test_results = {
244-
('test', 'gpu', '1024', '1024', 'balanced', 16): {'status': 'success', 'results': {}}
245-
}
232+
InferenceBaseJob.all_test_results = {('test', 'gpu', '1024', '1024', 'balanced', 16): {}}
246233

247234
VllmJob.clear_all_results()
248235

cvs/lib/inference/vllm.py

Lines changed: 10 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ def restart_server(self):
5353
self.build_server_inference_job_cmd()
5454
self.start_inference_server_job()
5555

56-
def collect_test_result(self, status="success"):
56+
def collect_test_result(self):
5757
"""
5858
Collect test results from the last poll_for_inference_completion call.
5959
@@ -75,8 +75,8 @@ def collect_test_result(self, status="success"):
7575
break
7676

7777
res_index = (self.model_name, self.gpu_type, isl, osl, seq_name, conc)
78-
# Store with the same structure as poll_for_inference_completion returns
79-
InferenceBaseJob.all_test_results[res_index] = {"status": status, "results": self.inference_results_dict}
78+
# Store results without status field
79+
InferenceBaseJob.all_test_results[res_index] = self.inference_results_dict
8080
else:
8181
print("WARNING: Cannot collect test results - inference_results_dict is empty or not populated")
8282

@@ -95,7 +95,6 @@ def print_all_results(cls):
9595

9696
rows = []
9797
headers = [
98-
"Status",
9998
"Model",
10099
"GPU",
101100
"ISL",
@@ -110,24 +109,22 @@ def print_all_results(cls):
110109
"P99 ITL (ms)",
111110
]
112111

113-
for (model, gpu, isl, osl, policy, conc), entry in cls.all_test_results.items():
114-
status = entry["status"]
115-
for host, m in entry["results"].items():
112+
for (model, gpu, isl, osl, policy, conc), results in cls.all_test_results.items():
113+
for host, m in results.items():
116114
rows.append(
117115
[
118-
status,
119116
model,
120117
gpu,
121118
isl,
122119
osl,
123120
policy,
124121
conc,
125122
host,
126-
m["successful_requests"],
127-
m["total_throughput_per_sec"],
128-
m["mean_ttft_ms"],
129-
m["mean_tpot_ms"],
130-
m["p99_itl_ms"],
123+
m.get("successful_requests", "N/A"),
124+
m.get("total_throughput_per_sec", "N/A"),
125+
m.get("mean_ttft_ms", "N/A"),
126+
m.get("mean_tpot_ms", "N/A"),
127+
m.get("p99_itl_ms", "N/A"),
131128
]
132129
)
133130

cvs/lib/parallel_ssh_lib.py

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,25 @@ def inform_unreachability(self, cmd_output):
105105
for host in self.unreachable_hosts:
106106
cmd_output[host] = cmd_output.get(host, "") + "\nABORT: Host Unreachable Error"
107107

108+
def _safe_iterator(self, iterator):
109+
"""
110+
Wrapper for iterators that may contain invalid UTF-8 bytes.
111+
Yields valid lines and skips malformed ones with a warning.
112+
"""
113+
# Convert to iterator (handles both lists and existing iterators safely)
114+
iterator = iter(iterator)
115+
while True:
116+
try:
117+
line = next(iterator)
118+
yield line
119+
except UnicodeDecodeError as e:
120+
print(f"Warning: Skipping malformed line due to UnicodeDecodeError: {e}")
121+
# Continue to next line
122+
continue
123+
except StopIteration:
124+
# End of iterator
125+
break
126+
108127
def _process_output(self, output, cmd=None, cmd_list=None, print_console=True):
109128
"""
110129
Helper method to process output from run_command, collect results, and handle pruning.
@@ -122,11 +141,11 @@ def _process_output(self, output, cmd=None, cmd_list=None, print_console=True):
122141
else:
123142
print(cmd)
124143
try:
125-
for line in item.stdout or []:
144+
for line in self._safe_iterator(item.stdout or []):
126145
if print_console:
127146
print(line)
128147
cmd_out_str += line.replace('\t', ' ') + '\n'
129-
for line in item.stderr or []:
148+
for line in self._safe_iterator(item.stderr or []):
130149
if print_console:
131150
print(line)
132151
cmd_out_str += line.replace('\t', ' ') + '\n'

0 commit comments

Comments
 (0)