Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -153,3 +153,11 @@ wandb/

# Enroot container image
*.sqsh

*.csv
submitted_jobs*.txt
topo_sorted_hostnames.txt
micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/hostnames.txt
*.patch
micro-benchmarks/nccl-tests/slurm/find_bad_nodes/logs/analysis_summary_*.txt
micro-benchmarks/nccl-tests/slurm/find_bad_nodes/logs/node_combinations_*.txt
5 changes: 4 additions & 1 deletion micro-benchmarks/nccl-tests/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,9 @@ You can skip this part if you use pre-built image on `public.ecr.aws/hpc-cloud/n

## 2. Running the NCCL Tests

Note: For topology-aware NCCL tests, which add features such as exporting results to CSV and
passing a topologically sorted hostfile to `mpirun`, see `slurm/topology-aware-nccl-tests`.

### Slurm with container

Copy the file `slurm/nccl-tests.sbatch` (or its contents) to your cluster, then submit a preprocessing job with the command below:
Expand Down Expand Up @@ -317,4 +320,4 @@ The formula defines the maximum theoretical bandwidth that can be achieved on di
* `n` : number of ranks participating in the operation. (similar to nranks for Algbw and Busbw)
* `t` : time to complete the operation. (similar to sec for Algbw and Busbw)
* `S` : number of elements being communicated (similar to count for Algbw and Busbw)
* `B` : theoretical peak bandwidth.
* `B` : theoretical peak bandwidth.
174 changes: 174 additions & 0 deletions micro-benchmarks/nccl-tests/nccl_to_csv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
#!/usr/bin/env python3
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0
"""
Complete NCCL to CSV Converter
Parses NCCL output and creates CSV files with results and summary
"""

import re
import csv
import sys
from pathlib import Path

def parse_nccl_output(file_path):
    """Parse NCCL test output and extract performance data.

    Each result row is expected to hold thirteen whitespace-separated
    columns: size, count, data type, reduction op, root, followed by
    (time, algbw, busbw, #wrong) for the out-of-place run and the same four
    columns for the in-place run.

    Args:
        file_path: Path of the NCCL test log to read.

    Returns:
        A ``(data, avg_bandwidth)`` tuple. ``data`` is a list with one dict
        per result row; ``avg_bandwidth`` is the value of the last
        ``# Avg bus bandwidth`` line as a float, or ``None`` when the log has
        no such line. ``(None, None)`` is returned, after printing a message,
        when the file cannot be read or contains no result rows.
    """
    # The data-type and reduction-op columns are matched as generic words so
    # that rows using names beyond float/double/int/half and
    # sum/prod/max/min/none (e.g. bfloat16, int8, avg) are not dropped.
    # The "#wrong" columns may be an integer or the literal N/A.
    row_pattern = re.compile(
        r'^\s*(\d+)\s+(\d+)\s+(\w+)\s+(\w+)\s+(-?\d+)'
        r'\s+(\d+\.?\d*)\s+(\d+\.?\d*)\s+(\d+\.?\d*)\s+(\d+|N/A)'
        r'\s+(\d+\.?\d*)\s+(\d+\.?\d*)\s+(\d+\.?\d*)\s+(\d+|N/A)'
    )

    # Pattern to match the average bandwidth line.
    avg_pattern = re.compile(r'# Avg bus bandwidth\s*:\s*(\d+\.?\d*)')

    data = []
    avg_bandwidth = None

    try:
        with open(file_path, 'r') as f:
            for line in f:
                match = row_pattern.match(line)
                if match:
                    (size, count, data_type, operation, root,
                     oop_time, oop_algbw, oop_busbw, oop_wrong,
                     ip_time, ip_algbw, ip_busbw, ip_wrong) = match.groups()
                    size_bytes = int(size)

                    data.append({
                        'Size_Bytes': size_bytes,
                        'Size_KB': round(size_bytes / 1024, 2),
                        'Size_MB': round(size_bytes / (1024 * 1024), 2),
                        'Count': int(count),
                        'Data_Type': data_type,
                        'Operation': operation,
                        'Root': int(root),
                        # Out-of-place metrics
                        'OOP_Time_us': float(oop_time),
                        'OOP_AlgBW_GBps': float(oop_algbw),
                        'OOP_BusBW_GBps': float(oop_busbw),
                        # An N/A error count is stored as 0.
                        'OOP_Errors': 0 if oop_wrong == 'N/A' else int(oop_wrong),
                        # In-place metrics
                        'IP_Time_us': float(ip_time),
                        'IP_AlgBW_GBps': float(ip_algbw),
                        'IP_BusBW_GBps': float(ip_busbw),
                        'IP_Errors': 0 if ip_wrong == 'N/A' else int(ip_wrong),
                    })

                # Checked on every line; a later match overwrites an earlier one.
                avg_match = avg_pattern.search(line)
                if avg_match:
                    avg_bandwidth = float(avg_match.group(1))

    except FileNotFoundError:
        print(f"Error: File {file_path} not found")
        return None, None
    except (OSError, UnicodeDecodeError) as e:
        print(f"Error reading file: {e}")
        return None, None

    if not data:
        print("No NCCL performance data found in the file")
        return None, None

    return data, avg_bandwidth

def write_csv(data, filename):
    """Write a list of row dicts to ``filename`` as CSV.

    The header is taken from the keys of the first row. Returns True when
    the file was written, False when ``data`` is empty or when any error
    occurred (the error is printed).
    """
    if not data:
        return False

    try:
        with open(filename, 'w', newline='') as handle:
            # Column order follows the key order of the first row.
            columns = list(data[0].keys())
            out = csv.DictWriter(handle, fieldnames=columns)
            out.writeheader()
            out.writerows(data)
    except Exception as err:
        print(f"Error writing CSV file {filename}: {err}")
        return False

    return True

def create_summary_data(data, avg_bandwidth=None):
    """Build summary rows (``Metric`` / ``Value`` dicts) from parsed result rows.

    Returns None when ``data`` is empty. When ``avg_bandwidth`` is not None,
    it is appended as a final row.
    """
    if not data:
        return None

    sizes = [entry['Size_Bytes'] for entry in data]
    out_of_place_bw = [entry['OOP_BusBW_GBps'] for entry in data]
    in_place_bw = [entry['IP_BusBW_GBps'] for entry in data]
    error_total = sum(entry['OOP_Errors'] + entry['IP_Errors'] for entry in data)

    metrics = [
        ('Total Test Points', len(data)),
        ('Min Message Size (Bytes)', min(sizes)),
        ('Max Message Size (Bytes)', max(sizes)),
        ('Peak OOP Bus BW (GB/s)', round(max(out_of_place_bw), 2)),
        ('Peak IP Bus BW (GB/s)', round(max(in_place_bw), 2)),
        ('Avg OOP Bus BW (GB/s)', round(sum(out_of_place_bw) / len(out_of_place_bw), 2)),
        ('Avg IP Bus BW (GB/s)', round(sum(in_place_bw) / len(in_place_bw), 2)),
        ('Total Errors', error_total),
    ]

    if avg_bandwidth is not None:
        metrics.append(('NCCL Reported Avg Bus BW (GB/s)', avg_bandwidth))

    return [{'Metric': label, 'Value': value} for label, value in metrics]

def main():
    """Command-line entry point: convert one NCCL log into CSV files.

    Expects exactly one argument, the NCCL output file. Writes
    ``<stem>_results.csv`` and ``<stem>_summary.csv`` (``<stem>`` being the
    input file name without its directory or extension) as relative paths,
    i.e. into the current working directory.

    Exits with status 1 on a usage error, when the input cannot be parsed,
    or when the results file cannot be written. A failure to write the
    summary file is reported but is not fatal.
    """
    if len(sys.argv) != 2:
        print("Usage: python nccl_to_csv.py <nccl_output_file>")
        print("Example: python nccl_to_csv.py nccl-tests-container_3480.out")
        sys.exit(1)

    input_file = sys.argv[1]
    base_name = Path(input_file).stem

    print(f"Parsing NCCL output from: {input_file}")

    # Parse the NCCL output
    data, avg_bandwidth = parse_nccl_output(input_file)

    if data is None:
        sys.exit(1)

    print(f"Found {len(data)} performance data points")
    # Compare against None so that a reported average of 0.0 is still shown.
    if avg_bandwidth is not None:
        print(f"Average bus bandwidth: {avg_bandwidth} GB/s")

    # Only files that were actually written are listed at the end.
    created_files = []

    # Create main results CSV file
    results_file = f"{base_name}_results.csv"
    if write_csv(data, results_file):
        print(f"Results exported to: {results_file}")
        created_files.append((results_file, "detailed performance data"))
    else:
        print("Error writing results file")
        sys.exit(1)

    # Create summary CSV file
    summary_data = create_summary_data(data, avg_bandwidth)
    if summary_data:
        summary_file = f"{base_name}_summary.csv"
        if write_csv(summary_data, summary_file):
            print(f"Summary exported to: {summary_file}")
            created_files.append((summary_file, "summary statistics"))
        else:
            print("Error writing summary file")

    print("\nFiles created:")
    for created_name, description in created_files:
        print(f"- {created_name} ({description})")
    print("\nYou can open these CSV files in Excel, LibreOffice Calc, or any spreadsheet application")

# Run the converter only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Loading