1
+ #!/usr/bin/env python3
2
+ """
3
+ Complete NCCL to CSV Converter
4
+ Parses NCCL output and creates CSV files with results and summary
5
+ """
6
+
7
+ import re
8
+ import csv
9
+ import sys
10
+ from pathlib import Path
11
+
12
def parse_nccl_output(file_path):
    """Parse NCCL test output and extract performance data.

    Every line of the file is checked against two patterns: a result row
    (size, count, type, redop, root, followed by time/algbw/busbw/#wrong for
    the out-of-place run and again for the in-place run) and the
    "# Avg bus bandwidth" line.

    The type and reduction-op columns accept any identifier, so rows such as
    ``int32``, ``bfloat16`` or ``avg`` are kept instead of being skipped.

    Args:
        file_path: Path of the NCCL test output file to read.

    Returns:
        A ``(data, avg_bandwidth)`` tuple. ``data`` is a list with one dict
        per result row, in file order. ``avg_bandwidth`` is the float taken
        from the last "# Avg bus bandwidth" line, or ``None`` when the file
        has no such line. ``(None, None)`` is returned, after printing a
        message, when the file cannot be read or holds no result rows.
    """
    # Compiled once so the per-line loop does not repeat the pattern lookup.
    # An error column may read "N/A" (alltoall reports that for in-place).
    row_pattern = re.compile(
        r'^\s*(\d+)\s+(\d+)\s+([A-Za-z]\w*)\s+([A-Za-z]\w*)\s+(-?\d+)'
        r'\s+(\d+\.?\d*)\s+(\d+\.?\d*)\s+(\d+\.?\d*)\s+(\d+|N/A)'
        r'\s+(\d+\.?\d*)\s+(\d+\.?\d*)\s+(\d+\.?\d*)\s+(\d+|N/A)'
    )
    avg_pattern = re.compile(r'# Avg bus bandwidth\s*:\s*(\d+\.?\d*)')

    def error_count(field):
        # "N/A" is recorded as zero so the column stays numeric.
        return 0 if field == 'N/A' else int(field)

    data = []
    avg_bandwidth = None

    try:
        # errors='replace' keeps one stray undecodable byte in a log from
        # aborting the whole parse.
        with open(file_path, 'r', errors='replace') as f:
            for line in f:
                match = row_pattern.match(line.strip())
                if match:
                    (size, count, data_type, operation, root,
                     oop_time, oop_algbw, oop_busbw, oop_wrong,
                     ip_time, ip_algbw, ip_busbw, ip_wrong) = match.groups()
                    size_bytes = int(size)

                    data.append({
                        'Size_Bytes': size_bytes,
                        'Size_KB': round(size_bytes / 1024, 2),
                        'Size_MB': round(size_bytes / (1024 * 1024), 2),
                        'Count': int(count),
                        'Data_Type': data_type,
                        'Operation': operation,
                        'Root': int(root),
                        # Out-of-place metrics
                        'OOP_Time_us': float(oop_time),
                        'OOP_AlgBW_GBps': float(oop_algbw),
                        'OOP_BusBW_GBps': float(oop_busbw),
                        'OOP_Errors': error_count(oop_wrong),
                        # In-place metrics
                        'IP_Time_us': float(ip_time),
                        'IP_AlgBW_GBps': float(ip_algbw),
                        'IP_BusBW_GBps': float(ip_busbw),
                        'IP_Errors': error_count(ip_wrong),
                    })

                # Checked on every line; a later match overrides an earlier one.
                avg_match = avg_pattern.search(line)
                if avg_match:
                    avg_bandwidth = float(avg_match.group(1))

    except FileNotFoundError:
        print(f"Error: File { file_path } not found")
        return None, None
    except (OSError, ValueError) as e:
        print(f"Error reading file: { e } ")
        return None, None

    if not data:
        print("No NCCL performance data found in the file")
        return None, None

    return data, avg_bandwidth
85
+
86
def write_csv(data, filename):
    """Dump a list of row dictionaries to *filename* as CSV.

    The header row is taken from the keys of the first dictionary.

    Args:
        data: Sequence of dictionaries, one per CSV row.
        filename: Destination path; the file is opened for writing.

    Returns:
        True once every row has been written. False when ``data`` is empty
        (no file is opened) or when writing raises; the error is printed.
    """
    # Nothing to export: report failure without touching the filesystem.
    if not data:
        return False

    try:
        with open(filename, 'w', newline='') as out:
            header = [*data[0].keys()]
            sheet = csv.DictWriter(out, fieldnames=header)
            sheet.writeheader()
            for record in data:
                sheet.writerow(record)
    except Exception as err:
        print(f"Error writing CSV file { filename } : { err } ")
        return False
    else:
        return True
102
+
103
def create_summary_data(data, avg_bandwidth=None):
    """Condense parsed result rows into a list of metric/value records.

    Args:
        data: List of row dictionaries carrying at least 'Size_Bytes',
            'OOP_BusBW_GBps', 'IP_BusBW_GBps', 'OOP_Errors' and 'IP_Errors'.
        avg_bandwidth: Optional average reported by the tool itself; when it
            is not None it is appended as an extra record.

    Returns:
        A list of ``{'Metric': ..., 'Value': ...}`` dictionaries, or None
        when ``data`` is empty.
    """
    if not data:
        return None

    def column(key):
        # Lazy view over a single field of every row.
        return (entry[key] for entry in data)

    out_of_place = list(column('OOP_BusBW_GBps'))
    in_place = list(column('IP_BusBW_GBps'))

    # Label/value pairs in the order they appear in the summary.
    metrics = [
        ('Total Test Points', len(data)),
        ('Min Message Size (Bytes)', min(column('Size_Bytes'))),
        ('Max Message Size (Bytes)', max(column('Size_Bytes'))),
        ('Peak OOP Bus BW (GB/s)', round(max(out_of_place), 2)),
        ('Peak IP Bus BW (GB/s)', round(max(in_place), 2)),
        ('Avg OOP Bus BW (GB/s)', round(sum(out_of_place) / len(out_of_place), 2)),
        ('Avg IP Bus BW (GB/s)', round(sum(in_place) / len(in_place), 2)),
        ('Total Errors', sum(entry['OOP_Errors'] + entry['IP_Errors'] for entry in data)),
    ]

    if avg_bandwidth is not None:
        metrics.append(('NCCL Reported Avg Bus BW (GB/s)', avg_bandwidth))

    return [{'Metric': label, 'Value': value} for label, value in metrics]
127
+
128
def main():
    """Command-line entry point: convert one NCCL output file to CSV files.

    Expects exactly one argument, the path of the NCCL output file. The
    parsed rows go to a results CSV and the derived statistics to a summary
    CSV; both names are built from the input file's stem and are relative to
    the current working directory.

    Exits with status 1 on wrong usage, when parsing yields no data, or when
    the results file cannot be written. A failed summary write is reported
    but does not change the exit status.
    """
    if len(sys.argv) != 2:
        print("Usage: python nccl_to_excel.py <nccl_output_file>")
        print("Example: python nccl_to_excel.py nccl-tests-container_3480.out")
        sys.exit(1)

    input_file = sys.argv[1]
    base_name = Path(input_file).stem

    print(f"Parsing NCCL output from: { input_file } ")

    # Parse the NCCL output
    data, avg_bandwidth = parse_nccl_output(input_file)

    if data is None:
        sys.exit(1)

    print(f"Found { len(data) } performance data points")
    # Compare against None so that a reported average of 0.0 is still shown.
    if avg_bandwidth is not None:
        print(f"Average bus bandwidth: { avg_bandwidth } GB/s")

    # (file name, description) for every file that was actually written.
    created = []

    # Create main results CSV file
    results_file = f"{ base_name } _results.csv"
    if write_csv(data, results_file):
        print(f"Results exported to: { results_file } ")
        created.append((results_file, "detailed performance data"))
    else:
        print("Error writing results file")
        sys.exit(1)

    # Create summary CSV file
    summary_data = create_summary_data(data, avg_bandwidth)
    if summary_data:
        summary_file = f"{ base_name } _summary.csv"
        if write_csv(summary_data, summary_file):
            print(f"Summary exported to: { summary_file } ")
            created.append((summary_file, "summary statistics"))
        else:
            print("Error writing summary file")

    # List only the files that exist, so a failed summary write is not
    # advertised as a created file.
    print("\n Files created:")
    for created_name, description in created:
        print(f"- { created_name } ({ description })")
    print("\n You can open these CSV files in Excel, LibreOffice Calc, or any spreadsheet application")
170
+
171
# Run the converter only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
0 commit comments