@@ -104,12 +104,41 @@ def get_gpu_node_count():
104104 return None
105105
106106
def load_json_file(filepath):
    """
    Load the first JSON document stored in ``filepath``.

    The file may hold a single JSON document or several documents written
    back to back (for example when multiple MPI ranks wrote to the same
    file). In both cases only the first document is decoded and returned;
    anything after it is ignored.

    Args:
        filepath: Path of the file to read. It is read as UTF-8 text.

    Returns:
        The decoded first JSON value, or None when the file cannot be read,
        is not valid UTF-8, or does not start with a valid JSON document.
    """
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            content = f.read()
    except (OSError, UnicodeDecodeError):
        # Unreadable or undecodable files are reported the same way as
        # files without valid JSON so callers only have to check for None.
        return None

    # raw_decode() parses exactly one document and tolerates trailing data,
    # which covers both the single-document and the concatenated case.
    # Unlike json.loads() it does not skip leading whitespace, so strip the
    # JSON whitespace characters first.
    try:
        obj, _ = json.JSONDecoder().raw_decode(content.lstrip(" \t\n\r"))
    except json.JSONDecodeError:
        # Even the first document is malformed (or the file is empty).
        return None
    return obj
135+
107136def get_sdk_data (data ):
108137 """
109138 Extract rocprofiler-sdk-tool data from JSON, handling both dict and list structures.
110139 Some JSON files have rocprofiler-sdk-tool as a list, others as a dict.
111140 """
112- if "rocprofiler-sdk-tool" not in data :
141+ if data is None or "rocprofiler-sdk-tool" not in data :
113142 return None
114143
115144 sdk_data = data ["rocprofiler-sdk-tool" ]
@@ -135,28 +164,37 @@ def test_mpi_ranks_feature(output_dir, test_mode):
135164 - without-mpi: Non-MPI run, should generate output regardless
136165 """
137166
138- # Find all JSON output files in the output directory
139- json_files = glob .glob (os .path .join (output_dir , "**/out_results.json" ), recursive = True )
140-
141167 # Detect the number of GPU nodes in the system
142168 gpu_node_count = get_gpu_node_count ()
143169 is_single_node = gpu_node_count is not None and gpu_node_count <= 1
144170
171+ # Find JSON output files
172+ # For MPI tests: Look only in rank.* subdirectories (to avoid stale files)
173+ # For non-MPI tests: Look everywhere in the output directory
174+ if test_mode in ["with-mpi-single" , "with-mpi-multiple" ]:
175+ # MPI test - only look in rank.* subdirectories
176+ json_files = []
177+ for rank_dir in glob .glob (os .path .join (output_dir , "rank.*" )):
178+ json_files .extend (glob .glob (os .path .join (rank_dir , "**/out_results.json" ), recursive = True ))
179+ else :
180+ # Non-MPI test - look everywhere
181+ json_files = glob .glob (os .path .join (output_dir , "**/out_results.json" ), recursive = True )
182+
145183 if test_mode == "with-mpi-single" :
146- # With --mpi-ranks 0 and 4 MPI ranks, only rank 0 should generate output
184+ # With --mpi-ranks 0 and 4 MPI ranks, only rank 0 should generate output in rank.0/
147185 # So we should have exactly 1 JSON file
148186 assert len (json_files ) == 1 , (
149187 f"Expected 1 JSON file for rank 0 only, but found { len (json_files )} : { json_files } "
150188 )
151189
152190 # Verify the file is from rank 0
153191 json_file = json_files [0 ]
154- with open (json_file , 'r' ) as f :
155- data = json . load ( f )
192+ data = load_json_file (json_file )
193+ assert data is not None , f"Failed to load JSON from { json_file } "
156194
157195 # Check that we have valid profiling data
158196 sdk_data = get_sdk_data (data )
159- assert sdk_data is not None , "Missing rocprofiler-sdk-tool data"
197+ assert sdk_data is not None , f "Missing rocprofiler-sdk-tool data in { json_file } "
160198 buffer_records = sdk_data .get ("buffer_records" , {})
161199
162200 # Should have some kernel or HIP API data
@@ -168,38 +206,23 @@ def test_mpi_ranks_feature(output_dir, test_mode):
168206
169207 elif test_mode == "with-mpi-multiple" :
170208 # With --mpi-ranks 0-1,3 and 4 MPI ranks, ranks 0, 1, and 3 should generate output
209+ # Each rank uses a separate output directory (via %env{OMPI_COMM_WORLD_RANK}%)
210+ # so we should always get exactly 3 files
171211 expected_files = 3
172212
173- if is_single_node :
174- # On single-node systems, MPI ranks share the same output directory and may overwrite
175- # each other's files, resulting in only 1 file (the last rank to write)
176- print (f"INFO: Single GPU node detected (GPU count: { gpu_node_count } )" )
177- print ("INFO: On single-node systems, MPI ranks may share output directory" )
213+ if gpu_node_count is not None :
214+ print (f"INFO: Detected { gpu_node_count } GPU node(s)" )
178215
179- # Accept 1 file (ranks overwriting each other) on single node systems
180- assert len (json_files ) >= 1 , (
181- f"Expected at least 1 JSON file on single-node system, but found { len (json_files )} : { json_files } "
182- )
183-
184- if len (json_files ) < expected_files :
185- print (f"INFO: Found { len (json_files )} file(s) instead of { expected_files } - expected on single-node setup" )
186- else :
187- # On multi-node systems, each rank should have its own output directory
188- if gpu_node_count is not None :
189- print (f"INFO: Multiple GPU nodes detected (GPU count: { gpu_node_count } )" )
190- else :
191- print ("INFO: Could not detect GPU count, assuming multi-node system" )
192-
193- # Require exactly the expected number of files on multi-node systems
194- assert len (json_files ) == expected_files , (
195- f"Expected { expected_files } JSON files for ranks 0, 1, and 3 on multi-node system, "
196- f"but found { len (json_files )} : { json_files } "
197- )
216+ # With separate output directories per rank, we should have exactly the expected number
217+ assert len (json_files ) == expected_files , (
218+ f"Expected { expected_files } JSON files for ranks 0, 1, and 3, "
219+ f"but found { len (json_files )} : { json_files } "
220+ )
198221
199222 # Verify each file has valid profiling data
200223 for json_file in json_files :
201- with open (json_file , 'r' ) as f :
202- data = json . load ( f )
224+ data = load_json_file (json_file )
225+ assert data is not None , f"Failed to load JSON from { json_file } "
203226
204227 sdk_data = get_sdk_data (data )
205228 assert sdk_data is not None , f"Missing rocprofiler-sdk-tool data in { json_file } "
@@ -221,11 +244,11 @@ def test_mpi_ranks_feature(output_dir, test_mode):
221244
222245 # Verify the file has valid profiling data
223246 json_file = json_files [0 ]
224- with open (json_file , 'r' ) as f :
225- data = json . load ( f )
247+ data = load_json_file (json_file )
248+ assert data is not None , f"Failed to load JSON from { json_file } "
226249
227250 sdk_data = get_sdk_data (data )
228- assert sdk_data is not None , "Missing rocprofiler-sdk-tool data"
251+ assert sdk_data is not None , f "Missing rocprofiler-sdk-tool data in { json_file } "
229252 buffer_records = sdk_data .get ("buffer_records" , {})
230253
231254 # Should have some kernel or HIP API data
@@ -244,12 +267,17 @@ def test_csv_output_consistency(output_dir, test_mode):
244267 Verify that CSV files are also correctly generated/not generated based on rank filtering.
245268 """
246269
247- # Find all kernel trace CSV files
248- csv_files = glob .glob (os .path .join (output_dir , "**/out_kernel_trace.csv" ), recursive = True )
249-
250- # Detect the number of GPU nodes in the system
251- gpu_node_count = get_gpu_node_count ()
252- is_single_node = gpu_node_count is not None and gpu_node_count <= 1
270+ # Find CSV files
271+ # For MPI tests: Look only in rank.* subdirectories
272+ # For non-MPI tests: Look everywhere in the output directory
273+ if test_mode in ["with-mpi-single" , "with-mpi-multiple" ]:
274+ # MPI test - only look in rank.* subdirectories
275+ csv_files = []
276+ for rank_dir in glob .glob (os .path .join (output_dir , "rank.*" )):
277+ csv_files .extend (glob .glob (os .path .join (rank_dir , "**/out_kernel_trace.csv" ), recursive = True ))
278+ else :
279+ # Non-MPI test - look everywhere
280+ csv_files = glob .glob (os .path .join (output_dir , "**/out_kernel_trace.csv" ), recursive = True )
253281
254282 if test_mode == "with-mpi-single" :
255283 # Only rank 0 should have CSV output
@@ -258,19 +286,12 @@ def test_csv_output_consistency(output_dir, test_mode):
258286 )
259287
260288 elif test_mode == "with-mpi-multiple" :
289+ # Each rank has separate output directory, so expect exactly 3 CSV files
261290 expected_files = 3
262-
263- if is_single_node :
264- # On single-node systems, accept 1 or more files
265- assert len (csv_files ) >= 1 , (
266- f"Expected at least 1 CSV file on single-node system, but found { len (csv_files )} : { csv_files } "
267- )
268- else :
269- # On multi-node systems, require exactly the expected number
270- assert len (csv_files ) == expected_files , (
271- f"Expected { expected_files } CSV files for ranks 0, 1, and 3 on multi-node system, "
272- f"but found { len (csv_files )} : { csv_files } "
273- )
291+ assert len (csv_files ) == expected_files , (
292+ f"Expected { expected_files } CSV files for ranks 0, 1, and 3, "
293+ f"but found { len (csv_files )} : { csv_files } "
294+ )
274295
275296 elif test_mode == "without-mpi" :
276297 # Non-MPI run should have CSV output
@@ -282,26 +303,29 @@ def test_csv_output_consistency(output_dir, test_mode):
def test_no_output_for_filtered_ranks(output_dir, test_mode):
    """
    Verify that ranks not in the --mpi-ranks list do not generate output.

    Only applies to the "with-mpi-multiple" mode; other modes are skipped.
    Output is looked up per rank under ``<output_dir>/rank.<N>``:

    - rank 2 must not have any ``out_results.json`` below ``rank.2``
      (the directory itself may or may not exist);
    - ranks 0, 1 and 3 must each have a ``rank.<N>`` directory containing at
      least one ``out_results.json``.
    """

    if test_mode != "with-mpi-multiple":
        pytest.skip("This test only applies to with-mpi-multiple mode")

    # glob returns an empty list when rank.2 does not exist, so a missing
    # directory and a directory without JSON output are handled alike.
    rank_2_dir = os.path.join(output_dir, "rank.2")
    json_files_in_rank_2 = glob.glob(
        os.path.join(rank_2_dir, "**/out_results.json"), recursive=True
    )
    assert not json_files_in_rank_2, (
        f"Rank 2 should not generate output, but found {len(json_files_in_rank_2)} files: {json_files_in_rank_2}"
    )

    # Verify that ranks 0, 1, and 3 directories exist with output
    for rank in (0, 1, 3):
        rank_dir = os.path.join(output_dir, f"rank.{rank}")
        # isdir rather than exists: a plain file named rank.N is not a
        # valid per-rank output directory.
        assert os.path.isdir(rank_dir), f"Expected directory for rank {rank} at {rank_dir}"
        json_files = glob.glob(
            os.path.join(rank_dir, "**/out_results.json"), recursive=True
        )
        assert json_files, f"Expected output files for rank {rank} in {rank_dir}"
306330
307331if __name__ == "__main__" :
0 commit comments