Skip to content

Commit a72c0d6

Browse files
committed
Added support for topological ordering of hostnames in mpi run, made the slurm sbatch scripts more generic and added convenience
scripts to convert nccl output to excel
1 parent 7bac3df commit a72c0d6

12 files changed

+1423
-3
lines changed

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,3 +153,7 @@ wandb/
153153

154154
# Enroot container image
155155
*.sqsh
156+
157+
*.csv
158+
submitted_jobs*.txt
159+
topo_sorted_hostnames.txt

micro-benchmarks/nccl-tests/README.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,10 @@ You can skip this part if you use pre-built image on `public.ecr.aws/hpc-cloud/n
128128

129129
## 2. Running the NCCL Tests
130130

131+
Note: For a newer version of the NCCL tests, with features like export to CSV,
132+
passing in a topologically sorted hostfile to mpirun, and configurable AWS region support,
133+
look in `slurm/v2/`.
134+
131135
### Slurm with container
132136

133137
Copy the file `slurm/nccl-tests.sbatch` or its content on your cluster then submit a preprocessing job with the command below:
@@ -317,4 +321,4 @@ The formula defines the maximum theoretical bandwidth that can be achieved on di
317321
* `n` : number of ranks participating in the operation. (similar to nranks for Algbw and Busbw)
318322
* `t` : time to complete the operation. (similar to sec for Algbw and Busbw)
319323
* `S` : number of elements being communicated (similar to count for Algbw and Busbw)
320-
* `B` : theoretical peak bandwidth.
324+
* `B` : theoretical peak bandwidth.
Lines changed: 172 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,172 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Complete NCCL to CSV Converter
4+
Parses NCCL output and creates CSV files with results and summary
5+
"""
6+
7+
import re
8+
import csv
9+
import sys
10+
from pathlib import Path
11+
12+
def parse_nccl_output(file_path):
    """Parse NCCL test output and extract performance data.

    Each result row is expected to carry 13 whitespace-separated columns:
    size, count, type, redop, root, then (time, algbw, busbw, #wrong) for
    the out-of-place run and the same four columns for the in-place run.
    An error column of ``N/A`` is recorded as 0 errors.

    Args:
        file_path: Path of the NCCL test output file to read.

    Returns:
        A ``(data, avg_bandwidth)`` tuple. ``data`` is a list with one dict
        per result row; ``avg_bandwidth`` is the value of the last
        ``# Avg bus bandwidth`` line as a float, or ``None`` if no such line
        exists. ``(None, None)`` is returned (after printing a message) when
        the file cannot be read or contains no result rows.
    """
    # The type and redop columns are matched generically (\w+) so that rows
    # for types such as int8/uint8/bfloat16 and ops such as avg are not
    # silently dropped; the numeric columns still pin the row shape.
    row_re = re.compile(
        r'^\s*(\d+)\s+(\d+)\s+(\w+)\s+(\w+)\s+(-?\d+)'
        r'\s+(\d+\.?\d*)\s+(\d+\.?\d*)\s+(\d+\.?\d*)\s+(\d+|N/A)'
        r'\s+(\d+\.?\d*)\s+(\d+\.?\d*)\s+(\d+\.?\d*)\s+(\d+|N/A)'
    )
    avg_re = re.compile(r'# Avg bus bandwidth\s*:\s*(\d+\.?\d*)')

    def _errors(token):
        # N/A appears when no error count is reported; treat it as 0.
        return 0 if token == 'N/A' else int(token)

    data = []
    avg_bandwidth = None

    try:
        # Job logs can contain non-ASCII bytes (e.g. symbols echoed by the
        # launch script); decode leniently so one stray byte cannot abort
        # the whole parse.
        with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
            for line in f:
                match = row_re.match(line.strip())
                if match:
                    size_bytes = int(match.group(1))
                    data.append({
                        'Size_Bytes': size_bytes,
                        'Size_KB': round(size_bytes / 1024, 2),
                        'Size_MB': round(size_bytes / (1024 * 1024), 2),
                        'Count': int(match.group(2)),
                        'Data_Type': match.group(3),
                        'Operation': match.group(4),
                        'Root': int(match.group(5)),
                        # Out-of-place metrics
                        'OOP_Time_us': float(match.group(6)),
                        'OOP_AlgBW_GBps': float(match.group(7)),
                        'OOP_BusBW_GBps': float(match.group(8)),
                        'OOP_Errors': _errors(match.group(9)),
                        # In-place metrics
                        'IP_Time_us': float(match.group(10)),
                        'IP_AlgBW_GBps': float(match.group(11)),
                        'IP_BusBW_GBps': float(match.group(12)),
                        'IP_Errors': _errors(match.group(13)),
                    })

                avg_match = avg_re.search(line)
                if avg_match:
                    avg_bandwidth = float(avg_match.group(1))

    except FileNotFoundError:
        print(f"Error: File {file_path} not found")
        return None, None
    except OSError as e:
        print(f"Error reading file: {e}")
        return None, None

    if not data:
        print("No NCCL performance data found in the file")
        return None, None

    return data, avg_bandwidth
85+
86+
def write_csv(data, filename):
    """Write a list of row dicts to a CSV file.

    The header is taken from the keys of the first row, in their existing
    order.

    Args:
        data: List of dicts, one per CSV row.
        filename: Destination path; an existing file is overwritten.

    Returns:
        True if the file was written, False if ``data`` is empty or writing
        failed (the failure is printed, not raised).
    """
    if not data:
        return False

    try:
        # newline='' lets the csv module control line endings itself.
        with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
            fieldnames = list(data[0].keys())
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(data)
        return True
    except Exception as e:
        # Deliberate best-effort: callers only branch on the boolean result.
        print(f"Error writing CSV file {filename}: {e}")
        return False
102+
103+
def create_summary_data(data, avg_bandwidth=None):
    """Create summary statistics from performance data.

    Args:
        data: List of row dicts; each row must provide ``Size_Bytes``,
            ``OOP_BusBW_GBps``, ``IP_BusBW_GBps``, ``OOP_Errors`` and
            ``IP_Errors``.
        avg_bandwidth: Optional average bus bandwidth to append as an extra
            summary row; skipped when ``None``.

    Returns:
        A list of ``{'Metric': ..., 'Value': ...}`` dicts, or ``None`` when
        ``data`` is empty.
    """
    if not data:
        return None

    sizes = [row['Size_Bytes'] for row in data]
    oop_busbw_values = [row['OOP_BusBW_GBps'] for row in data]
    ip_busbw_values = [row['IP_BusBW_GBps'] for row in data]

    summary_data = [
        {'Metric': 'Total Test Points', 'Value': len(data)},
        {'Metric': 'Min Message Size (Bytes)', 'Value': min(sizes)},
        {'Metric': 'Max Message Size (Bytes)', 'Value': max(sizes)},
        {'Metric': 'Peak OOP Bus BW (GB/s)', 'Value': round(max(oop_busbw_values), 2)},
        {'Metric': 'Peak IP Bus BW (GB/s)', 'Value': round(max(ip_busbw_values), 2)},
        {'Metric': 'Avg OOP Bus BW (GB/s)',
         'Value': round(sum(oop_busbw_values) / len(oop_busbw_values), 2)},
        {'Metric': 'Avg IP Bus BW (GB/s)',
         'Value': round(sum(ip_busbw_values) / len(ip_busbw_values), 2)},
        {'Metric': 'Total Errors',
         'Value': sum(row['OOP_Errors'] + row['IP_Errors'] for row in data)},
    ]

    # Explicit None check so that a reported average of 0.0 is still kept.
    if avg_bandwidth is not None:
        summary_data.append({'Metric': 'NCCL Reported Avg Bus BW (GB/s)', 'Value': avg_bandwidth})

    return summary_data
127+
128+
def main():
    """Command-line entry point: convert one NCCL output file to CSV files.

    Expects exactly one argument, the NCCL output file. Writes
    ``<stem>_results.csv`` and ``<stem>_summary.csv`` into the current
    working directory, where ``<stem>`` is the input file name without its
    extension. Exits with status 1 on bad usage, when parsing yields no
    data, or when the results file cannot be written.
    """
    if len(sys.argv) != 2:
        print("Usage: python nccl_to_excel.py <nccl_output_file>")
        print("Example: python nccl_to_excel.py nccl-tests-container_3480.out")
        sys.exit(1)

    input_file = sys.argv[1]
    base_name = Path(input_file).stem

    print(f"Parsing NCCL output from: {input_file}")

    # Parse the NCCL output
    data, avg_bandwidth = parse_nccl_output(input_file)

    if data is None:
        sys.exit(1)

    print(f"Found {len(data)} performance data points")
    # Explicit None check so that a reported average of 0.0 is still shown.
    if avg_bandwidth is not None:
        print(f"Average bus bandwidth: {avg_bandwidth} GB/s")

    # Only files that were actually written are listed at the end.
    created = []

    # Create main results CSV file
    results_file = f"{base_name}_results.csv"
    if write_csv(data, results_file):
        print(f"Results exported to: {results_file}")
        created.append(f"- {results_file} (detailed performance data)")
    else:
        print("Error writing results file")
        sys.exit(1)

    # Create summary CSV file; a failure here is reported but not fatal.
    summary_data = create_summary_data(data, avg_bandwidth)
    if summary_data:
        summary_file = f"{base_name}_summary.csv"
        if write_csv(summary_data, summary_file):
            print(f"Summary exported to: {summary_file}")
            created.append(f"- {summary_file} (summary statistics)")
        else:
            print("Error writing summary file")

    print("\nFiles created:")
    for entry in created:
        print(entry)
    print("\nYou can open these CSV files in Excel, LibreOffice Calc, or any spreadsheet application")
170+
171+
# Run the converter only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

micro-benchmarks/nccl-tests/slurm/nccl-tests-ami.sbatch

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,8 @@ set -ex
1313

1414
# This script is designed to run by default on the Deep Learning AMI, Ubuntu 20.04
1515
# See https://aws.amazon.com/releasenotes/aws-deep-learning-base-gpu-ami-ubuntu-20-04/
16-
ALL_REDUCE_BINARY=${1:-/usr/local/cuda-12.4/efa/test-cuda-12.4/all_reduce_perf}
17-
ADDITIONAL_LD_LIBRARY_PATH=${2:-/usr/local/cuda-12.4/lib}
16+
ALL_REDUCE_BINARY=${1:-/opt/nccl-tests/build/all_reduce_perf}
17+
ADDITIONAL_LD_LIBRARY_PATH=${2:-/usr/local/cuda-12.9/lib64:/opt/amazon/ofi-nccl/lib/x86_64-linux-gnu/:/opt/nccl/build/lib}
1818

1919
# Get Hostname to Instance ID mapping
2020
mpirun -N 1 bash -c 'echo $(hostname) ➡️ $(cat /sys/devices/virtual/dmi/id/board_asset_tag | tr -d " ")'

0 commit comments

Comments
 (0)