Skip to content

Commit 7915959

Browse files
committed
Fixing tests
1 parent e6f97d9 commit 7915959

File tree

3 files changed

+96
-70
lines changed

3 files changed

+96
-70
lines changed

projects/rocprofiler-sdk/tests/rocprofv3/mpi-ranks/CMakeLists.txt

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,13 +28,14 @@ if(MPIRUN_EXECUTABLE)
2828
OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1)
2929

3030
# Test with MPI - profile only rank 0 out of 4 ranks
31+
# Use %env{OMPI_COMM_WORLD_RANK}% to create separate output directories per rank
3132
rocprofiler_add_integration_execute_test(
3233
rocprofv3-test-mpi-ranks-with-mpi
3334
COMMAND
3435
${CMAKE_COMMAND} -E env ${MPI_ENV_VARS}
3536
${MPIRUN_EXECUTABLE} -n 4
3637
$<TARGET_FILE:rocprofiler-sdk::rocprofv3> --hip-trace --kernel-trace
37-
--output-format csv json -d ${CMAKE_CURRENT_BINARY_DIR}/mpi-ranks-trace
38+
--output-format csv json -d ${CMAKE_CURRENT_BINARY_DIR}/mpi-ranks-trace/rank.%env{OMPI_COMM_WORLD_RANK}%
3839
-o out --mpi-ranks 0 ${PRELOAD_ARGS} -- $<TARGET_FILE:simple-transpose>
3940
DEPENDS simple-transpose
4041
TIMEOUT 120
@@ -43,13 +44,14 @@ if(MPIRUN_EXECUTABLE)
4344
FIXTURES_SETUP rocprofv3-test-mpi-ranks-with-mpi)
4445

4546
# Test with MPI - profile ranks 0-1,3 out of 4 ranks
47+
# Use %env{OMPI_COMM_WORLD_RANK}% to create separate output directories per rank
4648
rocprofiler_add_integration_execute_test(
4749
rocprofv3-test-mpi-ranks-with-mpi-multiple
4850
COMMAND
4951
${CMAKE_COMMAND} -E env ${MPI_ENV_VARS}
5052
${MPIRUN_EXECUTABLE} -n 4
5153
$<TARGET_FILE:rocprofiler-sdk::rocprofv3> --hip-trace --kernel-trace
52-
--output-format csv json -d ${CMAKE_CURRENT_BINARY_DIR}/mpi-ranks-multiple-trace
54+
--output-format csv json -d ${CMAKE_CURRENT_BINARY_DIR}/mpi-ranks-multiple-trace/rank.%env{OMPI_COMM_WORLD_RANK}%
5355
-o out --mpi-ranks 0-1,3 ${PRELOAD_ARGS} -- $<TARGET_FILE:simple-transpose>
5456
DEPENDS simple-transpose
5557
TIMEOUT 120
Binary file not shown.

projects/rocprofiler-sdk/tests/rocprofv3/mpi-ranks/validate.py

Lines changed: 92 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -104,12 +104,41 @@ def get_gpu_node_count():
104104
return None
105105

106106

107+
def load_json_file(filepath):
    """
    Load the first JSON value stored in a file.

    Multiple MPI ranks may have written to the same file, leaving several JSON
    documents concatenated back to back. In that case only the first document
    is returned and the remainder of the file is ignored.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The first decoded JSON value, or None if the file cannot be opened/read
        or does not begin with a valid JSON value.
    """
    # Keep the try body limited to the I/O so only read failures map to None here.
    try:
        with open(filepath, 'r') as f:
            content = f.read()
    except (IOError, OSError):
        return None

    # raw_decode() parses exactly one JSON value and reports where it ended, so
    # it covers both a single document and concatenated documents in one pass.
    # Unlike json.loads(), it does NOT skip leading whitespace, so strip the
    # JSON whitespace characters first; otherwise a file that starts with a
    # newline followed by concatenated documents would be rejected.
    try:
        obj, _ = json.JSONDecoder().raw_decode(content.lstrip(" \t\r\n"))
    except json.JSONDecodeError:
        # Even the first value is malformed (or the file is empty).
        return None
    return obj
134+
135+
107136
def get_sdk_data(data):
108137
"""
109138
Extract rocprofiler-sdk-tool data from JSON, handling both dict and list structures.
110139
Some JSON files have rocprofiler-sdk-tool as a list, others as a dict.
111140
"""
112-
if "rocprofiler-sdk-tool" not in data:
141+
if data is None or "rocprofiler-sdk-tool" not in data:
113142
return None
114143

115144
sdk_data = data["rocprofiler-sdk-tool"]
@@ -135,28 +164,37 @@ def test_mpi_ranks_feature(output_dir, test_mode):
135164
- without-mpi: Non-MPI run, should generate output regardless
136165
"""
137166

138-
# Find all JSON output files in the output directory
139-
json_files = glob.glob(os.path.join(output_dir, "**/out_results.json"), recursive=True)
140-
141167
# Detect the number of GPU nodes in the system
142168
gpu_node_count = get_gpu_node_count()
143169
is_single_node = gpu_node_count is not None and gpu_node_count <= 1
144170

171+
# Find JSON output files
172+
# For MPI tests: Look only in rank.* subdirectories (to avoid stale files)
173+
# For non-MPI tests: Look everywhere in the output directory
174+
if test_mode in ["with-mpi-single", "with-mpi-multiple"]:
175+
# MPI test - only look in rank.* subdirectories
176+
json_files = []
177+
for rank_dir in glob.glob(os.path.join(output_dir, "rank.*")):
178+
json_files.extend(glob.glob(os.path.join(rank_dir, "**/out_results.json"), recursive=True))
179+
else:
180+
# Non-MPI test - look everywhere
181+
json_files = glob.glob(os.path.join(output_dir, "**/out_results.json"), recursive=True)
182+
145183
if test_mode == "with-mpi-single":
146-
# With --mpi-ranks 0 and 4 MPI ranks, only rank 0 should generate output
184+
# With --mpi-ranks 0 and 4 MPI ranks, only rank 0 should generate output in rank.0/
147185
# So we should have exactly 1 JSON file
148186
assert len(json_files) == 1, (
149187
f"Expected 1 JSON file for rank 0 only, but found {len(json_files)}: {json_files}"
150188
)
151189

152190
# Verify the file is from rank 0
153191
json_file = json_files[0]
154-
with open(json_file, 'r') as f:
155-
data = json.load(f)
192+
data = load_json_file(json_file)
193+
assert data is not None, f"Failed to load JSON from {json_file}"
156194

157195
# Check that we have valid profiling data
158196
sdk_data = get_sdk_data(data)
159-
assert sdk_data is not None, "Missing rocprofiler-sdk-tool data"
197+
assert sdk_data is not None, f"Missing rocprofiler-sdk-tool data in {json_file}"
160198
buffer_records = sdk_data.get("buffer_records", {})
161199

162200
# Should have some kernel or HIP API data
@@ -168,38 +206,23 @@ def test_mpi_ranks_feature(output_dir, test_mode):
168206

169207
elif test_mode == "with-mpi-multiple":
170208
# With --mpi-ranks 0-1,3 and 4 MPI ranks, ranks 0, 1, and 3 should generate output
209+
# Each rank uses a separate output directory (via %env{OMPI_COMM_WORLD_RANK}%)
210+
# so we should always get exactly 3 files
171211
expected_files = 3
172212

173-
if is_single_node:
174-
# On single-node systems, MPI ranks share the same output directory and may overwrite
175-
# each other's files, resulting in only 1 file (the last rank to write)
176-
print(f"INFO: Single GPU node detected (GPU count: {gpu_node_count})")
177-
print("INFO: On single-node systems, MPI ranks may share output directory")
213+
if gpu_node_count is not None:
214+
print(f"INFO: Detected {gpu_node_count} GPU node(s)")
178215

179-
# Accept 1 file (ranks overwriting each other) on single node systems
180-
assert len(json_files) >= 1, (
181-
f"Expected at least 1 JSON file on single-node system, but found {len(json_files)}: {json_files}"
182-
)
183-
184-
if len(json_files) < expected_files:
185-
print(f"INFO: Found {len(json_files)} file(s) instead of {expected_files} - expected on single-node setup")
186-
else:
187-
# On multi-node systems, each rank should have its own output directory
188-
if gpu_node_count is not None:
189-
print(f"INFO: Multiple GPU nodes detected (GPU count: {gpu_node_count})")
190-
else:
191-
print("INFO: Could not detect GPU count, assuming multi-node system")
192-
193-
# Require exactly the expected number of files on multi-node systems
194-
assert len(json_files) == expected_files, (
195-
f"Expected {expected_files} JSON files for ranks 0, 1, and 3 on multi-node system, "
196-
f"but found {len(json_files)}: {json_files}"
197-
)
216+
# With separate output directories per rank, we should have exactly the expected number
217+
assert len(json_files) == expected_files, (
218+
f"Expected {expected_files} JSON files for ranks 0, 1, and 3, "
219+
f"but found {len(json_files)}: {json_files}"
220+
)
198221

199222
# Verify each file has valid profiling data
200223
for json_file in json_files:
201-
with open(json_file, 'r') as f:
202-
data = json.load(f)
224+
data = load_json_file(json_file)
225+
assert data is not None, f"Failed to load JSON from {json_file}"
203226

204227
sdk_data = get_sdk_data(data)
205228
assert sdk_data is not None, f"Missing rocprofiler-sdk-tool data in {json_file}"
@@ -221,11 +244,11 @@ def test_mpi_ranks_feature(output_dir, test_mode):
221244

222245
# Verify the file has valid profiling data
223246
json_file = json_files[0]
224-
with open(json_file, 'r') as f:
225-
data = json.load(f)
247+
data = load_json_file(json_file)
248+
assert data is not None, f"Failed to load JSON from {json_file}"
226249

227250
sdk_data = get_sdk_data(data)
228-
assert sdk_data is not None, "Missing rocprofiler-sdk-tool data"
251+
assert sdk_data is not None, f"Missing rocprofiler-sdk-tool data in {json_file}"
229252
buffer_records = sdk_data.get("buffer_records", {})
230253

231254
# Should have some kernel or HIP API data
@@ -244,12 +267,17 @@ def test_csv_output_consistency(output_dir, test_mode):
244267
Verify that CSV files are also correctly generated/not generated based on rank filtering.
245268
"""
246269

247-
# Find all kernel trace CSV files
248-
csv_files = glob.glob(os.path.join(output_dir, "**/out_kernel_trace.csv"), recursive=True)
249-
250-
# Detect the number of GPU nodes in the system
251-
gpu_node_count = get_gpu_node_count()
252-
is_single_node = gpu_node_count is not None and gpu_node_count <= 1
270+
# Find CSV files
271+
# For MPI tests: Look only in rank.* subdirectories
272+
# For non-MPI tests: Look everywhere in the output directory
273+
if test_mode in ["with-mpi-single", "with-mpi-multiple"]:
274+
# MPI test - only look in rank.* subdirectories
275+
csv_files = []
276+
for rank_dir in glob.glob(os.path.join(output_dir, "rank.*")):
277+
csv_files.extend(glob.glob(os.path.join(rank_dir, "**/out_kernel_trace.csv"), recursive=True))
278+
else:
279+
# Non-MPI test - look everywhere
280+
csv_files = glob.glob(os.path.join(output_dir, "**/out_kernel_trace.csv"), recursive=True)
253281

254282
if test_mode == "with-mpi-single":
255283
# Only rank 0 should have CSV output
@@ -258,19 +286,12 @@ def test_csv_output_consistency(output_dir, test_mode):
258286
)
259287

260288
elif test_mode == "with-mpi-multiple":
289+
# Each rank has separate output directory, so expect exactly 3 CSV files
261290
expected_files = 3
262-
263-
if is_single_node:
264-
# On single-node systems, accept 1 or more files
265-
assert len(csv_files) >= 1, (
266-
f"Expected at least 1 CSV file on single-node system, but found {len(csv_files)}: {csv_files}"
267-
)
268-
else:
269-
# On multi-node systems, require exactly the expected number
270-
assert len(csv_files) == expected_files, (
271-
f"Expected {expected_files} CSV files for ranks 0, 1, and 3 on multi-node system, "
272-
f"but found {len(csv_files)}: {csv_files}"
273-
)
291+
assert len(csv_files) == expected_files, (
292+
f"Expected {expected_files} CSV files for ranks 0, 1, and 3, "
293+
f"but found {len(csv_files)}: {csv_files}"
294+
)
274295

275296
elif test_mode == "without-mpi":
276297
# Non-MPI run should have CSV output
@@ -282,26 +303,29 @@ def test_csv_output_consistency(output_dir, test_mode):
282303
def test_no_output_for_filtered_ranks(output_dir, test_mode):
283304
"""
284305
Verify that ranks not in the --mpi-ranks list do not generate output.
285-
This test is skipped on single-node systems where ranks may share output directories.
306+
Since each rank has a separate output directory (via %env{OMPI_COMM_WORLD_RANK}%),
307+
we can check that rank 2's directory doesn't exist or is empty.
286308
"""
287309

288310
if test_mode != "with-mpi-multiple":
289311
pytest.skip("This test only applies to with-mpi-multiple mode")
290312

291-
# Detect if we're on a single-node system
292-
gpu_node_count = get_gpu_node_count()
293-
is_single_node = gpu_node_count is not None and gpu_node_count <= 1
294-
295-
if is_single_node:
296-
pytest.skip("Skipping filtered ranks test on single-node system (ranks share output directory)")
313+
# Check for rank.2 directory (which should NOT have been created or should be empty)
314+
rank_2_dir = os.path.join(output_dir, "rank.2")
297315

298-
# In with-mpi-multiple mode with --mpi-ranks 0-1,3, rank 2 should NOT generate output
299-
json_files = glob.glob(os.path.join(output_dir, "**/out_results.json"), recursive=True)
316+
if os.path.exists(rank_2_dir):
317+
# Directory exists - check if it has any JSON files
318+
json_files_in_rank_2 = glob.glob(os.path.join(rank_2_dir, "**/out_results.json"), recursive=True)
319+
assert len(json_files_in_rank_2) == 0, (
320+
f"Rank 2 should not generate output, but found {len(json_files_in_rank_2)} files: {json_files_in_rank_2}"
321+
)
300322

301-
# On multi-node systems, we should have exactly 3 files (no output from rank 2)
302-
assert len(json_files) == 3, (
303-
f"Expected exactly 3 output files (ranks 0,1,3) on multi-node system, got {len(json_files)}"
304-
)
323+
# Verify that ranks 0, 1, and 3 directories exist with output
324+
for rank in [0, 1, 3]:
325+
rank_dir = os.path.join(output_dir, f"rank.{rank}")
326+
assert os.path.exists(rank_dir), f"Expected directory for rank {rank} at {rank_dir}"
327+
json_files = glob.glob(os.path.join(rank_dir, "**/out_results.json"), recursive=True)
328+
assert len(json_files) >= 1, f"Expected output files for rank {rank} in {rank_dir}"
305329

306330

307331
if __name__ == "__main__":

0 commit comments

Comments
 (0)