From c5168b8efd081ff0143be8dc2b001f411634290c Mon Sep 17 00:00:00 2001 From: Harish Rao Date: Thu, 4 Sep 2025 13:19:46 -0700 Subject: [PATCH] Added support for topographical ordering of hostnames in mpi run, made the slurm sbatch scripts more generic and added convenience scripts to convert nccl output to excel --- .gitignore | 8 + micro-benchmarks/nccl-tests/README.md | 5 +- micro-benchmarks/nccl-tests/nccl_to_csv.py | 174 +++++++++ .../slurm/topology-aware-nccl-tests/README.md | 337 ++++++++++++++++++ .../generate_hostfile.sh | 17 + .../hostfile-topologify.py | 161 +++++++++ .../nccl-tests-ami.sbatch | 146 ++++++++ .../nccl-tests-container.sbatch | 116 ++++++ .../process_nccl_results.sh | 275 ++++++++++++++ .../submit_nccl_test_ami.sh | 113 ++++++ .../submit_nccl_test_container.sh | 106 ++++++ 11 files changed, 1457 insertions(+), 1 deletion(-) create mode 100644 micro-benchmarks/nccl-tests/nccl_to_csv.py create mode 100644 micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/README.md create mode 100755 micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/generate_hostfile.sh create mode 100644 micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/hostfile-topologify.py create mode 100644 micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/nccl-tests-ami.sbatch create mode 100644 micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/nccl-tests-container.sbatch create mode 100755 micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/process_nccl_results.sh create mode 100755 micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/submit_nccl_test_ami.sh create mode 100755 micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/submit_nccl_test_container.sh diff --git a/.gitignore b/.gitignore index daf3b51dd..3625babd8 100644 --- a/.gitignore +++ b/.gitignore @@ -153,3 +153,11 @@ wandb/ # Enroot container image *.sqsh + +*.csv +submitted_jobs*.txt +topo_sorted_hostnames.txt +micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/hostnames.txt +*.patch +micro-benchmarks/nccl-tests/slurm/find_bad_nodes/logs/analysis_summary_*.txt +micro-benchmarks/nccl-tests/slurm/find_bad_nodes/logs/node_combinations_*.txt diff --git a/micro-benchmarks/nccl-tests/README.md b/micro-benchmarks/nccl-tests/README.md index d50d5360f..6b1d3d87d 100644 --- a/micro-benchmarks/nccl-tests/README.md +++ b/micro-benchmarks/nccl-tests/README.md @@ -128,6 +128,9 @@ You can skip this part if you use pre-built image on `public.ecr.aws/hpc-cloud/n ## 2. Running the NCCL Tests +Note: For topology aware NCCL tests, with features like export to csv, +passing in a topologically sorted hostfile to mpirun, look in slurm/topology-aware-nccl-tests + ### Slurm with container Copy the file `slurm/nccl-tests.sbatch` or its content on your cluster then submit a preprocessing jobs with the command below: @@ -317,4 +320,4 @@ The formula defines the maximum theoretical bandwidth that can be achieved on di * `n` : number of ranks participating to the operation. (similar to nranks for Algbw and Busbw) * `t` : time to complete the operation. (similar to sec for Algbw and Busbw) * `S` : number of elements being communicated (similar to count for Algbw and Busbw) -* `B` : theoretical peak bandwidth. +* `B` : theoretical peak bandwidth. 
\ No newline at end of file diff --git a/micro-benchmarks/nccl-tests/nccl_to_csv.py b/micro-benchmarks/nccl-tests/nccl_to_csv.py new file mode 100644 index 000000000..59d6e4439 --- /dev/null +++ b/micro-benchmarks/nccl-tests/nccl_to_csv.py @@ -0,0 +1,174 @@ +#!/usr/bin/env python3 +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 +""" +Complete NCCL to CSV Converter +Parses NCCL output and creates CSV files with results and summary +""" + +import re +import csv +import sys +from pathlib import Path + +def parse_nccl_output(file_path): + """Parse NCCL test output and extract performance data""" + + data = [] + avg_bandwidth = None + + # Pattern to match NCCL performance lines (flexible for different test types) + # Handles both allreduce/reducescatter format and allgather/alltoall format + # Note: alltoall uses N/A for in-place errors, so we handle that case + pattern = r'^\s*(\d+)\s+(\d+)\s+(float|double|int|half)\s+(sum|prod|max|min|none)\s+(-?\d+)\s+(\d+\.?\d*)\s+(\d+\.?\d*)\s+(\d+\.?\d*)\s+(\d+|N/A)\s+(\d+\.?\d*)\s+(\d+\.?\d*)\s+(\d+\.?\d*)\s+(\d+|N/A)' + + # Pattern to match average bandwidth line + avg_pattern = r'# Avg bus bandwidth\s*:\s*(\d+\.?\d*)' + + try: + with open(file_path, 'r') as f: + for line_num, line in enumerate(f, 1): + # Check for performance data + match = re.match(pattern, line.strip()) + if match: + size_bytes = int(match.group(1)) + count = int(match.group(2)) + data_type = match.group(3) + operation = match.group(4) + root = int(match.group(5)) + + # Out-of-place metrics + oop_time_us = float(match.group(6)) + oop_algbw = float(match.group(7)) + oop_busbw = float(match.group(8)) + oop_error = 0 if match.group(9) == 'N/A' else int(match.group(9)) + + # In-place metrics + ip_time_us = float(match.group(10)) + ip_algbw = float(match.group(11)) + ip_busbw = float(match.group(12)) + ip_error = 0 if match.group(13) == 'N/A' else int(match.group(13)) + + data.append({ + 'Size_Bytes': size_bytes, + 'Size_KB': round(size_bytes / 1024, 2), + 'Size_MB': round(size_bytes / (1024 * 1024), 2), + 'Count': count, + 'Data_Type': data_type, + 'Operation': operation, + 'Root': root, + 'OOP_Time_us': oop_time_us, + 'OOP_AlgBW_GBps': oop_algbw, + 'OOP_BusBW_GBps': oop_busbw, + 'OOP_Errors': oop_error, + 'IP_Time_us': ip_time_us, + 'IP_AlgBW_GBps': ip_algbw, + 'IP_BusBW_GBps': ip_busbw, + 'IP_Errors': ip_error + }) + + # Check for average bandwidth + avg_match = re.search(avg_pattern, line) + if avg_match: + avg_bandwidth = float(avg_match.group(1)) + + except FileNotFoundError: + print(f"Error: File {file_path} not found") + return None, None + except Exception as e: + print(f"Error reading file: {e}") + return None, None + + if not data: + print("No NCCL performance data found in the file") + return None, None + + return data, avg_bandwidth + +def write_csv(data, filename): + """Write data to CSV file""" + + if not data: + return False + + try: + with open(filename, 'w', newline='') as csvfile: + fieldnames = list(data[0].keys()) + writer = csv.DictWriter(csvfile, fieldnames=fieldnames) + writer.writeheader() + writer.writerows(data) + return True + except Exception as e: + print(f"Error writing CSV file {filename}: {e}") + return False + +def create_summary_data(data, avg_bandwidth=None): + """Create summary statistics from performance data""" + + if not data: + return None + + oop_busbw_values = [row['OOP_BusBW_GBps'] for row in data] + ip_busbw_values = [row['IP_BusBW_GBps'] for row in data] + + summary_data = [ + {'Metric': 
'Total Test Points', 'Value': len(data)}, + {'Metric': 'Min Message Size (Bytes)', 'Value': min(row['Size_Bytes'] for row in data)}, + {'Metric': 'Max Message Size (Bytes)', 'Value': max(row['Size_Bytes'] for row in data)}, + {'Metric': 'Peak OOP Bus BW (GB/s)', 'Value': round(max(oop_busbw_values), 2)}, + {'Metric': 'Peak IP Bus BW (GB/s)', 'Value': round(max(ip_busbw_values), 2)}, + {'Metric': 'Avg OOP Bus BW (GB/s)', 'Value': round(sum(oop_busbw_values) / len(oop_busbw_values), 2)}, + {'Metric': 'Avg IP Bus BW (GB/s)', 'Value': round(sum(ip_busbw_values) / len(ip_busbw_values), 2)}, + {'Metric': 'Total Errors', 'Value': sum(row['OOP_Errors'] + row['IP_Errors'] for row in data)} + ] + + if avg_bandwidth is not None: + summary_data.append({'Metric': 'NCCL Reported Avg Bus BW (GB/s)', 'Value': avg_bandwidth}) + + return summary_data + +def main(): + if len(sys.argv) != 2: + print("Usage: python nccl_to_excel.py ") + print("Example: python nccl_to_excel.py nccl-tests-container_3480.out") + sys.exit(1) + + input_file = sys.argv[1] + base_name = Path(input_file).stem + + print(f"Parsing NCCL output from: {input_file}") + + # Parse the NCCL output + data, avg_bandwidth = parse_nccl_output(input_file) + + if data is None: + sys.exit(1) + + print(f"Found {len(data)} performance data points") + if avg_bandwidth: + print(f"Average bus bandwidth: {avg_bandwidth} GB/s") + + # Create main results CSV file + results_file = f"{base_name}_results.csv" + if write_csv(data, results_file): + print(f"Results exported to: {results_file}") + else: + print("Error writing results file") + sys.exit(1) + + # Create summary CSV file + summary_data = create_summary_data(data, avg_bandwidth) + if summary_data: + summary_file = f"{base_name}_summary.csv" + if write_csv(summary_data, summary_file): + print(f"Summary exported to: {summary_file}") + else: + print("Error writing summary file") + + print("\nFiles created:") + print(f"- {results_file} (detailed performance data)") + print(f"- {summary_file} (summary statistics)") + print("\nYou can open these CSV files in Excel, LibreOffice Calc, or any spreadsheet application") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/README.md b/micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/README.md new file mode 100644 index 000000000..4393b200d --- /dev/null +++ b/micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/README.md @@ -0,0 +1,337 @@ +# NCCL Tests + +[NCCL Tests](https://github.com/NVIDIA/nccl-tests) enable you to evaluate the performance of the network using the Nvidia Collective Communication Library. This test case contains a Docker file and scripts to submit NCCL tests on Slurm. Please refer to the relevant instructions below, depending on your environment. + +**This is a newer version of slurm tests with additional features** +- Run in container mode or AMI mode +- Batch submission of multiple test combinations +- Configurable test parameters in the script +- Conversion of nccl test result summary to csv +- Support for topology-aware scheduling + +## 0. Prepare the runtime environment + +### Slurm +If you are using Slurm, this guide assumes that you have the following: + +- A functional Slurm cluster on AWS. +- Docker, [Pyxis](https://github.com/NVIDIA/pyxis) and [Enroot](https://github.com/NVIDIA/enroot) installed. +- Enroot requires libmd to compile and squashfs-tools to execute. 
+- A shared directory mounted on `/fsxl` + +It is recommended that you use the templates in the architectures [directory](../../../../1.architectures) + +## 1. Prepare the container image and other artifacts + +The NCCL tests are packaged in a container. + +> You can set versions and the branch for NCCL and EFA by editing the variables below in the Dockerfile. + +> | Variable | Default | Repository | +> |-----------------------|-------------|---------------------------------------------------------------------------------------------| +> |`CUDA_VERSION` | `12.8.1` | | +> |`GDRCOPY_VERSION` | `v2.5.1` | [link](https://github.com/NVIDIA/gdrcopy) | +> |`EFA_INSTALLER_VERSION`| `1.43.2` | [link](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa-start.html#efa-start-enable) | +> |`AWS_OFI_NCCL_VERSION` | `v1.16.3` | [link](https://github.com/aws/aws-ofi-nccl) | +> |`NCCL_VERSION` | `v2.27.7-1` | [link](https://github.com/NVIDIA/nccl) | +> |`NCCL_TESTS_VERSION` | `v2.16.9` | [link](https://github.com/NVIDIA/nccl-tests) | + +You must pick each version of the library and set them as variables before proceed: + +```bash +GDRCOPY_VERSION=v2.5.1 +EFA_INSTALLER_VERSION=1.43.2 +AWS_OFI_NCCL_VERSION=v1.16.3 +NCCL_VERSION=v2.27.7-1 +NCCL_TESTS_VERSION=v2.16.9 +TAG="efa${EFA_INSTALLER_VERSION}-ofi${AWS_OFI_NCCL_VERSION}-nccl${NCCL_VERSION}-tests${NCCL_TESTS_VERSION}" +CONTAINER_IMAGE_NAME_TAG="nccl-tests:${TAG}" +``` + +### Build the container + +If you wish to build the container image by yourself, follow this section. Alternatively, you can use a prebuilt image on a public ECR repository `public.ecr.aws/hpc-cloud/nccl-tests`. If you wish to do so, skip this section. + +1. Build the container image with the command below: + ```bash + #Navigate to the slurm directory: + cd micro-benchmarks/nccl-tests/slurm/ + + docker build -f nccl-tests.Dockerfile \ + --build-arg="EFA_INSTALLER_VERSION=${EFA_INSTALLER_VERSION}" \ + --build-arg="AWS_OFI_NCCL_VERSION=${AWS_OFI_NCCL_VERSION}" \ + --build-arg="NCCL_VERSION=${NCCL_VERSION}" \ + --build-arg="NCCL_TESTS_VERSION=${NCCL_TESTS_VERSION}" \ + -t ${CONTAINER_IMAGE_NAME_TAG} \ + . + + ``` + +1. Once the container image is prepared, you can check if it is present with `docker images`. You should see an output similar to this one: + ``` + REPOSITORY TAG IMAGE ID CREATED SIZE + nccl latest 6e981e5cf6a5 5 hours ago 8.61GB + ... + nvidia/cuda 12.8.1-devel-ubuntu22.04 a86c511c87e1 2 weeks ago 6.56GB + ``` + +### Slurm + +To run the NCCL tests on Slurm, you will need to convert the container into a Squash file using Enroot. + +Convert the container image to a squash file via Enroot. If you have the built image locally use the following command: + + ```bash + enroot import -o /fsxl/nccl-tests.sqsh dockerd://${CONTAINER_IMAGE_NAME_TAG} + ``` + +If you want to pull the image from the public ECR use the following command: + + ```bash + enroot import -o /fsxl/nccl.sqsh dockerd://public.ecr.aws/hpc-cloud/${CONTAINER_IMAGE_NAME_TAG} + ``` + +The file will be stored in the `/fsxl` directory. + +## 2. 
Running the NCCL Tests + +### Slurm with container + +clone the awesome-distributed-training repo on your head node +`git clone https://github.com/aws-samples/awesome-distributed-training.git` + + +Navigate to the topology-aware-nccl-tests directory: +```bash +cd topology-aware-nccl-tests +``` + + +### Supported Operations + +| Operation | Description | +|-----------|-------------| +| `allreduce` | Combines values from all ranks and distributes result to all ranks | +| `allgather` | Gathers data from all ranks and distributes to all ranks | +| `reducescatter` | Combines values and scatters results across ranks | +| `alltoall` | Each rank sends different data to every other rank | +| `gather` | Gathers data from all ranks to a single root rank | +| `reduce` | Combines values from all ranks to a single root rank | +| `scatter` | Scatters data from root rank to all other ranks | +| `broadcast` | Broadcasts data from root rank to all other ranks | +| `hypercube` | Hypercube communication pattern test | +| `sendrecv` | Point-to-point send/receive operations | + +### Running multiple operations in parallel + +Here are the two common masks in use to partition the set of GPUs into smaller sets, each executing the same operation in parallel while measuring nccl performance. + +| Mask | Description | Use Case | +|---------|-------------|----------| +| `0x0` | All zeros | This is equivalent to NCCL_TESTS_SPLIT="AND 0x0" . This disables the gpu split: all GPUs participate together in a single operation, maximizing intra-group communication and measuring full payload bandwidth for the entire set. Use 0x0 to aggregate all GPUs, focusing on overall system communication performance| +| `0x7` | Bit pattern 0111 | This is equivalent to NCCL_TESTS_SPLIT="AND 0x7" or NCCL_TESTS_SPLIT="MOD 8": On systems with 8 GPUs, run 8 parallel operations, each with 1 GPU per node (purely communicating over the inter-node network). Use this to split large clusters into many single-GPU groups for measuring individual inter-node or isolated bandwidths | + +Refer to [nccl-tests] (https://github.com/NVIDIA/nccl-tests?tab=readme-ov-file#running-multiple-operations-in-parallel) for more information + +### Advanced Features + +#### Topology-Aware Scheduling +Enable topology optimization by providing a sorted hostfile to mpirun: + +In November 2023, AWS announced the [Instance Topology API](https://aws.amazon.com/about-aws/whats-new/2023/11/instance-topology-api-ml-hpc-workloads/). +It provides customers a unique per account hierarchical view of the relative proximity between Amazon EC2 instances. +To learn more, please visit the [EC2 User Guide](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-instance-topology.html). + +There are two way to use the topology API to maximize nccl performance. + +1. Minimize the number of switch levels that need to be crossed in a single job. With Slurm, you can achieve this by using the slurm topology plugin. To enable topology plugin in slurm, refer to this repo [ec2-topology-aware-for-slurm](https://github.com/aws-samples/ec2-topology-aware-for-slurm?tab=readme-ov-file.) . Slurm will then attempt to allocate job resources based on topology. + +2. Once a job's resources are allocated, you want to have the NCCL communicator ranks organized so that (on average) communicator ranks that are close together are physically close together. Slurm natively does not have a way to do this. 
It assumes that hostnames are assigned so that a plain sort of the hostnames in the job (after the allocation in step 1) places nearby instances next to each other. Because EC2 hostnames are not assigned based on topology, that assumption does not hold. This is why you should also pass the sorted hostfile when launching the mpirun job that runs your NCCL test.
+
+Follow the steps below to generate a topologically sorted hostfile which you can pass to mpirun:
+
+```bash
+# First, generate the hostfile. Run this on your cluster head node
+./generate_hostfile.sh
+# This generates a file named hostnames.txt that contains all the nodes in your cluster
+
+# Second, sort it by passing it to hostfile-topologify.py
+# Replace us-east-1 with your actual AWS region
+python3 hostfile-topologify.py --input hostnames.txt --output topo_sorted_hostnames.txt --region us-east-1
+
+# Edit the submit script to use the topology file
+# Set TOPO_SORTED_FILE="topo_sorted_hostnames.txt" in submit_nccl_test_ami.sh
+```
+
+**hostfile-topologify.py Usage:**
+```bash
+# Basic usage with default region (us-east-1)
+python3 hostfile-topologify.py --input hostnames.txt --output sorted_hostnames.txt
+
+# Specify a custom AWS region
+python3 hostfile-topologify.py --input hostnames.txt --output sorted_hostnames.txt --region ap-northeast-1
+
+# Output to stdout (default)
+python3 hostfile-topologify.py --input hostnames.txt --region eu-west-1
+```
+
+**Parameters:**
+- `--input`: Input hostfile containing node hostnames (required)
+- `--output`: Output file for sorted hostnames (optional, defaults to stdout)
+- `--region`: AWS region where your cluster is deployed (optional, defaults to us-east-1)
+
+#### Container Mode
+```bash
+# Single test, defaults to the 0x0 test split mask
+sbatch nccl-tests-container.sbatch allreduce
+
+# Run all supported collectives (allreduce, allgather, reducescatter, alltoall) and test split masks (0x0, 0x7)
+./submit_nccl_test_container.sh
+```
+
+#### AMI Mode
+```bash
+# Single test, defaults to the 0x0 test split mask
+sbatch nccl-tests-ami.sbatch allreduce
+
+# Run all supported collectives (allreduce, allgather, reducescatter, alltoall) and test split masks (0x0, 0x7)
+./submit_nccl_test_ami.sh
+```
+
+#### Custom Parameters
+
+**Container Mode**: Modify `submit_nccl_test_container.sh`:
+```bash
+# Edit configuration variables in your script
+NODE_COUNTS=(8 16 32)        # Test different scales
+TEST_TYPES=("allreduce")     # Focus on specific operations
+SPLIT_MASK=("0x0" "0x7")     # NCCL_TESTS_SPLIT_MASK values; see "Running multiple operations in parallel" for more info
+APPS_PATH="/fsxl"            # Container location
+```
+
+**AMI Mode**: Modify `submit_nccl_test_ami.sh`:
+```bash
+# Edit configuration variables
+NODE_COUNTS=(8 16 32)        # Test different scales
+TEST_TYPES=("allreduce")     # Focus on specific operations
+SPLIT_MASK=("0x0" "0x7")     # NCCL_TESTS_SPLIT_MASK values; see "Running multiple operations in parallel" for more info
+TOPO_SORTED_FILE="topo_sorted_hostnames.txt"   # Set to "" if you don't have a topologically sorted hostfile; see "Topology-Aware Scheduling" for more info
+```
+
+## 3. Result Processing and Analysis
+
+### Automated Result Processing
+
+The `process_nccl_results.sh` script provides automated result processing.
+
+**Features:**
+- Automatic detection of Container vs AMI output formats
+- CSV conversion with descriptive filenames
+- Topology-aware result naming (adds a "_topo" suffix when topology sorting is used)
+- Comprehensive job status reporting
+- Organized result storage in the `nccl_results/` directory
+
+```bash
+# Process results from container tests (manual job tracking)
+./process_nccl_results.sh your_job_ids_file.txt
+
+# Process results from your submit_nccl_test_ami.sh output
+./process_nccl_results.sh logs/submitted_jobs_ami_20250907_001101.txt
+```
+
+### Result File Naming Convention
+
+Generated CSV files with NCCL test results are automatically organized with descriptive names:
+```
+nccl_results/
+├── nccl_16_ami_allreduce_0x0_20250907_001101_results.csv
+├── nccl_16_ami_allreduce_0x0_topo_20250907_001101_results.csv   # With topology
+└── nccl_16_container_allgather_0x7_20250907_001101_results.csv
+```
+
+Format: `nccl_{nodes}_{container/ami}_{operation}_{pattern}[_topo]_{timestamp}_{type}.csv`
+
+### Performance Output Format
+
+NCCL test performance data, covering message sizes from 8 B up to 16 GiB (run here on p5en.48xlarge instances), is written to the logs directory:
+
+```txt
+#       size         count    type   redop    root     time   algbw   busbw  #wrong     time   algbw   busbw  #wrong
+#        (B)    (elements)                              (us)  (GB/s)  (GB/s)            (us)  (GB/s)  (GB/s)
+           8             2   float     sum      -1    983.2    0.00    0.00       0    166.1    0.00    0.00       0
+          16             4   float     sum      -1    167.3    0.00    0.00       0    171.2    0.00    0.00       0
+         ...
+ 17179869184    4294967296   float     sum      -1    92173  186.39  369.86       0    92284  186.16  369.42       0
+# Out of bounds values : 0 OK
+# Avg bus bandwidth    : 84.0569
+```
+
+### Monitoring Jobs
+
+```bash
+# Monitor specific job output (container mode)
+tail -f logs/nccl-tests-container_<job_id>.out
+
+# View all submitted jobs
+cat logs/submitted_jobs_ami_<timestamp>.txt
+```
+
+## 4. File Reference
+
+### Core Scripts
+
+| File | Purpose |
+|------|---------|
+| `slurm/topology-aware-nccl-tests/nccl-tests-ami.sbatch` | SLURM batch script for AMI-based execution |
+| `slurm/topology-aware-nccl-tests/nccl-tests-container.sbatch` | SLURM batch script for container-based execution |
+| `slurm/topology-aware-nccl-tests/submit_nccl_test_ami.sh` | Automated test suite submission script (AMI mode) |
+| `slurm/topology-aware-nccl-tests/submit_nccl_test_container.sh` | Automated test suite submission script (container mode) |
+| `slurm/topology-aware-nccl-tests/process_nccl_results.sh` | Automated result processing and CSV conversion |
+| `slurm/topology-aware-nccl-tests/generate_hostfile.sh` | Generate a file with all the hosts in a cluster |
+| `slurm/topology-aware-nccl-tests/hostfile-topologify.py` | Generate a sorted hostfile for topology optimization (supports the --region parameter) |
+
+### Output Directories
+
+| Directory | Contents |
+|-----------|----------|
+| `slurm/topology-aware-nccl-tests/logs/` | Job output files and submission tracking |
+| `slurm/topology-aware-nccl-tests/nccl_results/` | Processed CSV results and summaries |
+
+## 5. Understanding NCCL Bandwidth
+
+The NCCL tests report the time to execute a given collective communication operation, the algorithmic bandwidth, and the bus bandwidth.
+
+The algorithm bandwidth is computed as data_size / time, where data_size is the size of the data exchanged through the collective operation and time is the time taken by the operation. The bus bandwidth is derived using a formula specific to each collective operation to reflect the speed of inter-GPU communication; this metric can be compared to the hardware peak bandwidth independently of the number of ranks used (as shared here).
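+
+As a quick illustration of how these formulas are applied, here is a minimal Python sketch of the AllReduce row from the table below. The `count`, `typesize`, `sec`, and `nranks` names follow the notes under the table; the 128-rank figure is an assumption chosen to match the 16-node x 8-GPU sample output above, and the snippet is illustrative only (it is not part of nccl-tests).
+
+```python
+def allreduce_bandwidth(count, typesize, sec, nranks):
+    """Algorithm and bus bandwidth for AllReduce (see the AllReduce row below)."""
+    algbw = (count * typesize) / 1.0e9 / sec        # baseBw in GB/s
+    busbw = algbw * (2 * (nranks - 1) / nranks)     # AllReduce correction factor 2*(n-1)/n
+    return algbw, busbw
+
+# 4294967296 floats (4 bytes each) reduced in 92173 us across 128 ranks
+# prints approximately (186.39, 369.86), matching the algbw/busbw columns in the sample output
+print(allreduce_bandwidth(4294967296, 4, 92173e-6, 128))
+```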
+ +| API | Algbw | Busbw | Theoretical Max BW | source | +|---------------|----------------------------------------------------|------------------------------------------|-----------------------|-------------------------------------| +| AllReduce | baseBw = (count * typesize) / 1.0E9 / sec | busBw = baseBw * (2*(nranks - 1)/nranks) | B = S/t * (2*(n-1)/n) | https://tinyurl.com/all-reduce | +| ReduceScatter | baseBw = (count * nranks * typesize) / 1.0E9 / sec | busBw = baseBw * ((nranks - 1)/nranks) | B = S/t * (n-1)/n | https://tinyurl.com/reduce-scatter | +| AllGather | baseBw = (count * typesize) / 1.0E9 / sec | busBw = baseBw * ((nranks - 1)/nranks) | B = S/t * (n-1)/n | https://tinyurl.com/all-gather | +| Broadcast | baseBw = (count * typesize) / 1.0E9 / sec | busBw = baseBw | B = S/t | https://tinyurl.com/nccl-broadcast | +| Gather | baseBw = (count * nranks * typesize) / 1.0E9 / sec | busBw = baseBw * ((nranks - 1)/nranks) | B = S/t * (n-1)/n | https://tinyurl.com/nccl-gather | +| Reduce | baseBw = (count * typesize) / 1.0E9 / sec | busBw = baseBw | B = S/t | https://tinyurl.com/nccl-reduce | +| Scatter | baseBw = (count * nranks * typesize) / 1.0E9 / sec | busBw = baseBw * ((nranks - 1)/nranks) | B = S/t * (n-1)/n | https://tinyurl.com/nccl-scatter | +| AlltoAll | baseBw = (count * nranks * typesize) / 1.0E9 / sec | busBw = baseBw * ((nranks - 1)/nranks) | B = S/t * (n-1)/n | https://tinyurl.com/nccl-all-to-all | +| SendRecv | baseBw = (count * typesize) / 1.0E9 / sec | busBw = baseBw | B = S/t | https://tinyurl.com/sendrcv | + + + +#### Notes for Algbw & Busbw** + +* `typesize` : size of the data type transferred in bytes (2 bytes for half-precision, 4 for single precision....). +* `count` : number of elements transferred through the collective communication operation. +* `nranks` : number of ranks participating to the collective communication operation. +* `sec` : time in seconds to execute the collective communication operation. + +#### Notes for the Theoretical Max BW + +The formula defines the maximum theoretical bandwidth that can be achieved on different communication collectives in the ideal case. + +* `n` : number of ranks participating to the operation. (similar to nranks for Algbw and Busbw) +* `t` : time to complete the operation. (similar to sec for Algbw and Busbw) +* `S` : number of elements being communicated (similar to count for Algbw and Busbw) +* `B` : theoretical peak bandwidth. \ No newline at end of file diff --git a/micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/generate_hostfile.sh b/micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/generate_hostfile.sh new file mode 100755 index 000000000..f6e5b85eb --- /dev/null +++ b/micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/generate_hostfile.sh @@ -0,0 +1,17 @@ +#!/bin/bash +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: MIT-0 + +# Script to get hostnames +set -e + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color +hostname_file="hostnames.txt" + +sinfo -N -h -o "%N" | sort -u | tee "$hostname_file" + + diff --git a/micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/hostfile-topologify.py b/micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/hostfile-topologify.py new file mode 100644 index 000000000..cf1e1b27f --- /dev/null +++ b/micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/hostfile-topologify.py @@ -0,0 +1,161 @@ +#!/usr/bin/env python3 +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 +# +# Take a hostfile (like one generated from the output of +# ./list_compute_nodes in SimpleCluster or a file generated from +# `/opt/slurm/bin/scontrol show hostname $SLURM_NODELIST`, and sort it +# so that adjoining ranks are as close as possible in the network +# topology. Default is to print to stdout, although an output file +# can be specified. + +import botocore +import boto3 +import argparse +import sys +import socket +import time + +# To avoid overwhelming the EC2 APIs with large requests, process only +# pagination_count entries through the search loops at a time. +pagination_count = 64 + + +def generate_topology_csv(input_file, output_file, region): + ec2_client = boto3.client('ec2', region) + + done = False + + network_to_hostname = {} + + while not done: + hostname_to_ip = {} + ip_to_hostname = {} + instanceid_to_hostname = {} + + # translate hostname to private ip, since PCluster uses custom + # hostnames that the EC2 control plane doesn't see. + for i in range(pagination_count): + hostname = input_file.readline() + if not hostname: + done = True + break + hostname = hostname.strip() + + ip = None + for i in range(5): + try: + ip = socket.gethostbyname(socket.getfqdn(hostname)) + except: + time.sleep(1) + else: + break + if ip == None: + print("Error getting ip address for %s" % (hostname)) + sys.exit(1) + + hostname_to_ip[hostname] = ip + ip_to_hostname[ip] = hostname + + if len(ip_to_hostname.keys()) == 0: + break + + # build instanceid -> hostname map by describing all the ips + # and matching ip to instance id, then translating through + # hostname_to_ip. + # + # The network-interface.addresses filter happens *after* + # pagination, so we need to properly handle pagination here. 
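+        # Note: the filter values are the private IPs collected above; describe_instances
+        # returns results in pages capped by MaxResults=pagination_count, so the loop
+        # below keeps following NextToken until the last page has been read.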
+ pagination_done = False + next_token = "" + while not pagination_done: + response = ec2_client.describe_instances( + Filters=[ + { + 'Name': 'network-interface.addresses.private-ip-address', + 'Values': list(ip_to_hostname.keys()) + } + ], + MaxResults=pagination_count, + NextToken=next_token) + + if 'NextToken' in response: + next_token = response['NextToken'] + else: + pagination_done = True + + for reservation in response['Reservations']: + for instance in reservation['Instances']: + instanceid = instance['InstanceId'] + for network_interface in instance['NetworkInterfaces']: + private_ip = network_interface['PrivateIpAddress'] + if private_ip in ip_to_hostname: + instanceid_to_hostname[instanceid] = ip_to_hostname[private_ip] + + pagination_done = False + next_token = "" + while not pagination_done: + response = ec2_client.describe_instance_topology( + InstanceIds=list(instanceid_to_hostname.keys()), + NextToken=next_token) + + if 'NextToken' in response: + next_token = response['NextToken'] + else: + pagination_done = True + + for instance in response['Instances']: + instanceid = instance['InstanceId'] + + + t2_node = instance['NetworkNodes'][1] + t1_node = instance['NetworkNodes'][2] + + if network_to_hostname.get(t2_node) == None: + network_to_hostname[t2_node] = {} + if network_to_hostname[t2_node].get(t1_node) == None: + network_to_hostname[t2_node][t1_node] = [] + network_to_hostname[t2_node][t1_node].append( + instanceid_to_hostname[instanceid]) + + for t2 in network_to_hostname: + for t1 in network_to_hostname[t2]: + for hostname in network_to_hostname[t2][t1]: + output_file.write("%s\n" % (hostname)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Generate placement information in CSV formation", + ) + parser.add_argument( + "--output", + help="Output file to write (default: stdout)", + default=None + ) + parser.add_argument( + "--input", + help="input hostfile", + required=True, + default=None + ) + parser.add_argument( + "--region", + help="AWS region (default: us-east-1)", + default="us-east-1" + ) + + args = parser.parse_args() + + if args.output != None: + output_file_handle = open(args.output, "w") + else: + output_file_handle = sys.stdout + + input_file_handle = open(args.input, "r") + + generate_topology_csv(input_file_handle, output_file_handle, args.region) + + input_file_handle.close() + if args.output != None: + output_file_handle.close() \ No newline at end of file diff --git a/micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/nccl-tests-ami.sbatch b/micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/nccl-tests-ami.sbatch new file mode 100644 index 000000000..a301c12e4 --- /dev/null +++ b/micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/nccl-tests-ami.sbatch @@ -0,0 +1,146 @@ +#!/bin/bash +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: MIT-0 + +#SBATCH --job-name=nccl-tests-ami # name of your job +#SBATCH --nodes=2 # number of nodes to use, +#SBATCH --ntasks-per-node 8 # Number of GPU per node (e.g 8 H200 for p5en.48xlarge) +#SBATCH --output logs/%x_%j.out +#SBATCH --error logs/%x_%j.err +#SBATCH --exclusive +#SBATCH --wait-all-nodes=1 + +set -ex + +# Create logs directory if it doesn't exist +mkdir -p logs + +### Disable hyperthreading by setting the tasks per core to 1 +#SBATCH --ntasks-per-core=1 + +# This script is designed to run by default on the Deep Learning AMI, Ubuntu 20.04 +# See https://aws.amazon.com/releasenotes/aws-deep-learning-base-gpu-ami-ubuntu-20-04/ + +# Supported NCCL collective operations: +# - allreduce : AllReduce collective (default) +# - allgather : AllGather collective +# - reducescatter : ReduceScatter collective +# - alltoall : AllToAll collective +# - gather : Gather collective +# - reduce : Reduce collective +# - scatter : Scatter collective +# - broadcast : Broadcast collective +# - hypercube : Hypercube collective +# - sendrecv : SendRecv point-to-point + +TEST_TYPE=${1:-allreduce} +ADDITIONAL_LD_LIBRARY_PATH=${2:-/usr/local/cuda-12.9/lib64} +SPLIT_MASK=${3:-0x0} +TOPO_SORTED_FILE=${4:-""} +ENABLE_NCCL_DEBUG=${5:-false} + +# Set binary path based on test type +CUDA_TEST_DIR="/opt/nccl-tests/build" +case ${TEST_TYPE} in + allreduce) + TEST_BINARY="${CUDA_TEST_DIR}/all_reduce_perf" + ;; + allgather) + TEST_BINARY="${CUDA_TEST_DIR}/all_gather_perf" + ;; + reducescatter) + TEST_BINARY="${CUDA_TEST_DIR}/reduce_scatter_perf" + ;; + alltoall) + TEST_BINARY="${CUDA_TEST_DIR}/alltoall_perf" + ;; + gather) + TEST_BINARY="${CUDA_TEST_DIR}/gather_perf" + ;; + reduce) + TEST_BINARY="${CUDA_TEST_DIR}/reduce_perf" + ;; + scatter) + TEST_BINARY="${CUDA_TEST_DIR}/scatter_perf" + ;; + broadcast) + TEST_BINARY="${CUDA_TEST_DIR}/broadcast_perf" + ;; + hypercube) + TEST_BINARY="${CUDA_TEST_DIR}/hypercube_perf" + ;; + sendrecv) + TEST_BINARY="${CUDA_TEST_DIR}/sendrecv_perf" + ;; + *) + echo "Error: Unsupported test type '${TEST_TYPE}'" + echo "Supported types: allreduce, allgather, reducescatter, alltoall, gather, reduce, scatter, broadcast, hypercube, sendrecv" + exit 1 + ;; +esac + +echo "Running NCCL ${TEST_TYPE} test in ami with split mask ${SPLIT_MASK}" +echo "$(date '+%Y-%m-%d %H:%M:%S') - Starting NCCL ${TEST_TYPE} test" + + +# Get Hostname to Instance ID mapping +mpirun -N 1 bash -c ' + HOSTNAME=$(hostname) + ASSET_TAG=$(cat /sys/devices/virtual/dmi/id/board_asset_tag | tr -d " ") + NCCL_VER=$(strings /opt/nccl/build/lib/libnccl.so | grep "NCCL version" | head -1 | awk "{print \$3}" 2>/dev/null || echo "N/A") + AWS_OFI_NCCL_VER=$(strings /opt/amazon/ofi-nccl/lib/libnccl-net.so | grep "NET/OFI Initializing aws-ofi-nccl" | awk "{print \$4}" 2>/dev/null || echo "N/A") + EFA_VER=$(grep -E "(version|installer)" /opt/amazon/efa_installed_packages 2>/dev/null | head -1 || echo "N/A") + echo "hostname=$HOSTNAME ➡️ $ASSET_TAG | NCCL: $NCCL_VER | AWS-OFI-NCCL: $AWS_OFI_NCCL_VER | EFA: $EFA_VER" +' +# In the echo statement above , don't remove the string hostname=, it is used in converting the nccl output to csv in process_nccl_results.sh + + +time_stamp=$(date +%Y%m%d_%H%M%S) + +# Expand the compact node list into a full list of hostnames, +# and save it to a file. 
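+# For example, a compact nodelist such as "p5en-compute-[1-4]" (a hypothetical name)
+# expands to one hostname per line, which is the format the topology filtering below expects.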
+scontrol show hostnames $SLURM_JOB_NODELIST > /tmp/nccl_test_ami_slurm_hostfile_${time_stamp}.txt + + +# Set up hostfile options based on topology sorting +if [ -n "$TOPO_SORTED_FILE" ]; then + HOSTFILE="/tmp/nccl_test_ami_seq_hostfile_${time_stamp}.txt" + rm -f "$HOSTFILE" + + # Filter TOPO_SORTED_FILE to only include hosts that exist in the slurm hostfile + # and repeat each hostname SLURM_NTASKS_PER_NODE times for processes per node + while read -r hostname; do + if grep -q "^${hostname}$" /tmp/nccl_test_ami_slurm_hostfile_${time_stamp}.txt; then + for i in $(seq 1 $SLURM_NTASKS_PER_NODE); do + echo "$hostname" >> "$HOSTFILE" + done + fi + done < $TOPO_SORTED_FILE + + echo "Created sequential hostfile with repeated hostnames (filtered to match SLURM allocation):" + echo "Total lines: $(wc -l < "$HOSTFILE")" + + HOSTFILE_OPTS="--hostfile $HOSTFILE --mca rmaps seq" +else + HOSTFILE_OPTS="" +fi + + + +# Set NCCL debug flag conditionally +if [ "$ENABLE_NCCL_DEBUG" = "true" ]; then + NCCL_DEBUG_FLAG="-x NCCL_DEBUG=INFO" +else + NCCL_DEBUG_FLAG="" +fi + +mpirun -n $((SLURM_NTASKS_PER_NODE * SLURM_JOB_NUM_NODES)) -N $SLURM_NTASKS_PER_NODE \ + -x LD_LIBRARY_PATH=/opt/nccl/build/lib:$ADDITIONAL_LD_LIBRARY_PATH:/opt/amazon/efa/lib:/opt/amazon/openmpi/lib:/opt/amazon/ofi-nccl/lib:/usr/local/lib:/usr/lib:$LD_LIBRARY_PATH \ + -x NCCL_SOCKET_IFNAME=^docker,lo,veth \ + -x NCCL_TESTS_SPLIT_MASK=${SPLIT_MASK} \ + ${NCCL_DEBUG_FLAG} \ + ${HOSTFILE_OPTS} \ + --mca pml ^ucx \ + --mca btl tcp,self \ + --mca btl_tcp_if_exclude lo,docker0,veth_def_agent \ + --bind-to none ${TEST_BINARY} -b 8 -e 16G -f 2 -g 1 -c 1 -n 100 diff --git a/micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/nccl-tests-container.sbatch b/micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/nccl-tests-container.sbatch new file mode 100644 index 000000000..8ef76a5ff --- /dev/null +++ b/micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/nccl-tests-container.sbatch @@ -0,0 +1,116 @@ +#!/bin/bash + +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 + +#SBATCH --job-name=nccl-tests-container # name of your job +#SBATCH --nodes=16 # number of nodes to use, +#SBATCH --ntasks-per-node 8 # Number of GPU per node (e.g 8 H200 for p5en.48xlarge) +###SBATCH --gpus-per-node=8 # number of GPU we reserve. 
Uncomment for AWS ParallelCluster +#SBATCH --output logs/%x_%j.out +#SBATCH --error logs/%x_%j.err +#SBATCH --exclusive +#SBATCH --wait-all-nodes=1 + +### Disable hyperthreading by setting the tasks per core to 1 +#SBATCH --ntasks-per-core=1 + +# Create logs directory if it doesn't exist +mkdir -p logs + +########################### +###### User Variables ##### +########################### + +# Supported NCCL collective operations: +# - allreduce : AllReduce collective (default) +# - allgather : AllGather collective +# - reducescatter : ReduceScatter collective +# - alltoall : AllToAll collective +# - gather : Gather collective +# - reduce : Reduce collective +# - scatter : Scatter collective +# - broadcast : Broadcast collective +# - hypercube : Hypercube collective +# - sendrecv : SendRecv point-to-point + +TEST_TYPE=${1:-allreduce} +APPS_PATH=${2:-/fsx} +SPLIT_MASK=${3:-0x0} +ENABLE_NCCL_DEBUG=${4:-false} + +# default variables for Enroot +: "${NCCL_TESTS_PATH:=/opt/nccl-tests/build}" +: "${IMAGE:=$APPS_PATH/nccl-tests.sqsh}" + +# Set binary path based on test type +case ${TEST_TYPE} in + allreduce) + TEST_BINARY="${NCCL_TESTS_PATH}/all_reduce_perf" + ;; + allgather) + TEST_BINARY="${NCCL_TESTS_PATH}/all_gather_perf" + ;; + reducescatter) + TEST_BINARY="${NCCL_TESTS_PATH}/reduce_scatter_perf" + ;; + alltoall) + TEST_BINARY="${NCCL_TESTS_PATH}/alltoall_perf" + ;; + gather) + TEST_BINARY="${NCCL_TESTS_PATH}/gather_perf" + ;; + reduce) + TEST_BINARY="${NCCL_TESTS_PATH}/reduce_perf" + ;; + scatter) + TEST_BINARY="${NCCL_TESTS_PATH}/scatter_perf" + ;; + broadcast) + TEST_BINARY="${NCCL_TESTS_PATH}/broadcast_perf" + ;; + hypercube) + TEST_BINARY="${NCCL_TESTS_PATH}/hypercube_perf" + ;; + sendrecv) + TEST_BINARY="${NCCL_TESTS_PATH}/sendrecv_perf" + ;; + *) + echo "Error: Unsupported test type '${TEST_TYPE}'" + echo "Supported types: allreduce, allgather, reducescatter, alltoall, gather, reduce, scatter, broadcast, hypercube, sendrecv" + exit 1 + ;; +esac + +echo "Running NCCL ${TEST_TYPE} test in container with split mask ${SPLIT_MASK}" +echo "$(date '+%Y-%m-%d %H:%M:%S') - Starting NCCL ${TEST_TYPE} test" + +export NCCL_TESTS_SPLIT_MASK=${SPLIT_MASK} + +## Set this flag for debugging EFA +#export FI_LOG_LEVEL=warn + +## NCCL Environment variables +if [[ "${ENABLE_NCCL_DEBUG}" == "true" ]]; then + export NCCL_DEBUG=INFO + echo "NCCL debug enabled" +fi + +declare -a ARGS=( + --container-image $IMAGE +) + +#Get Hostname and Instance IDs +mpirun -N 1 bash -c ' + HOSTNAME=$(hostname) + ASSET_TAG=$(cat /sys/devices/virtual/dmi/id/board_asset_tag | tr -d " ") + NCCL_VER=$(strings /opt/nccl/build/lib/libnccl.so | grep "NCCL version" | head -1 | awk "{print \$3}" 2>/dev/null || echo "N/A") + AWS_OFI_NCCL_VER=$(strings /opt/amazon/ofi-nccl/lib/libnccl-net.so | grep "NET/OFI Initializing aws-ofi-nccl" | awk "{print \$4}" 2>/dev/null || echo "N/A") + EFA_VER=$(grep -E "(version|installer)" /opt/amazon/efa_installed_packages 2>/dev/null | head -1 || echo "N/A") + echo "hostname=$HOSTNAME ➡️ $ASSET_TAG | NCCL: $NCCL_VER | OFI-NCCL: $OFI_NCCL_VER | AWS-OFI-NCCL: $AWS_OFI_NCCL_VER | EFA: $EFA_VER" +' +# In the echo statement above , don't remove the string hostname=, it is used in converting the nccl output to csv in process_nccl_results.sh + + +# Run NCCL test with configurable split masks +srun "${ARGS[@]}" --mpi=pmix --cpu-bind=none $TEST_BINARY -b 8 -e 16G -f 2 -g 1 -c 1 -n 100 diff --git a/micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/process_nccl_results.sh 
b/micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/process_nccl_results.sh new file mode 100755 index 000000000..d90c5d488 --- /dev/null +++ b/micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/process_nccl_results.sh @@ -0,0 +1,275 @@ +#!/bin/bash +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 +# Script to convert nccl test outputs to Excel after they complete +# Usage: ./process_nccl_results.sh +# Example: ./process_nccl_results.sh submitted_jobs_20250905_052718.txt + +set -e + +# Check arguments +if [ $# -ne 1 ]; then + echo "Usage: $0 " + echo "Example: $0 submitted_jobs_20250905_052718.txt" + echo "" + echo "Available job files:" + ls -1 submitted_jobs_*.txt 2>/dev/null || echo " No submitted_jobs_*.txt files found" + exit 1 +fi + +JOBS_FILE="$1" + +# Validate input file +if [[ ! -f "$JOBS_FILE" ]]; then + echo "Error: Job file '$JOBS_FILE' not found" + exit 1 +fi + +# Colors for output +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +RED='\033[0;31m' +NC='\033[0m' # No Color + +# Configuration +RESULTS_DIR="nccl_results" +CSV_CONVERTER="../../nccl_to_csv.py" +TIMESTAMP=$(date +"%Y%m%d_%H%M%S") + +# Create results directory if it doesn't exist +mkdir -p "$RESULTS_DIR" + +# Read job IDs from file +mapfile -t JOB_IDS < "$JOBS_FILE" + +echo -e "${GREEN}NCCL Results Processor${NC}" +echo -e "${BLUE}Processing jobs from: $JOBS_FILE${NC}" +echo -e "${BLUE}Job IDs to monitor: ${JOB_IDS[*]}${NC}" +echo "" + +# Function to extract job parameters from output file content only +parse_job_details() { + local output_file=$1 + local nodes test_type data_pattern topo_suffix run_type + + if [[ ! -f "$output_file" ]]; then + echo "unknown_unknown_unknown_unknown" + return + fi + + # Determine run type (AMI or container) from filename or output content + if [[ "$output_file" == *"ami"* ]]; then + run_type="ami" + elif [[ "$output_file" == *"container"* ]]; then + run_type="container" + elif grep -q "Running NCCL.*test in ami" "$output_file"; then + run_type="ami" + elif grep -q "Running NCCL.*test in container" "$output_file"; then + run_type="container" + else + run_type="unknown" + fi + + # Extract test type from output - handles both AMI and container formats + if grep -q "Running NCCL.*test" "$output_file"; then + # Handle both "Running NCCL allreduce test in ami" and "Running NCCL allreduce test in container" + test_type=$(grep "Running NCCL.*test" "$output_file" | sed -n 's/.*Running NCCL \([a-z]*\) test.*/\1/p' | head -1) + fi + + # Extract split mask from output - handles both AMI and container formats + if grep -q "split mask" "$output_file"; then + data_pattern=$(grep "split mask" "$output_file" | sed -n 's/.*split mask \(0x[0-9a-fA-F]*\).*/\1/p' | head -1) + fi + + # Count unique hostnames to determine nodes + # Extract hostnames from SLURM output - look for patterns like "hostname: " or "Running on hostname" + if grep -q "hostname=" "$output_file"; then + # Extract hostnames from the hostname= pattern added to sbatch files + nodes=$(grep -oE "hostname=[a-zA-Z0-9.-]+" "$output_file" | \ + sed 's/hostname=//' | \ + sort -u | wc -l) + fi + + # Check if topology sorting was used + if grep -q "Created sequential hostfile with repeated hostnames" "$output_file"; then + topo_suffix="_topo" + else + topo_suffix="" + fi + + echo "${nodes:-unknown}_${run_type}_${test_type:-unknown}_${data_pattern:-unknown}${topo_suffix}" +} + +# Function to convert output to CSV +convert_to_csv() { + local output_file=$1 + local 
job_details=$2 + + echo -e "${YELLOW}Converting $output_file to CSV...${NC}" + + # Check if converter exists + if [[ ! -f "$CSV_CONVERTER" ]]; then + echo -e "${RED}Error: CSV converter not found at $CSV_CONVERTER${NC}" + return 1 + fi + + # Run converter + if python3 "$CSV_CONVERTER" "$output_file"; then + # Move generated files to results directory with descriptive names + local base_name=$(basename "$output_file" .out) + local moved_files=0 + + # Handle results CSV file + if [[ -f "${base_name}_results.csv" ]]; then + mv "${base_name}_results.csv" "$RESULTS_DIR/nccl_${job_details}_${TIMESTAMP}_results.csv" + echo -e "${GREEN} → Results: $RESULTS_DIR/nccl_${job_details}_${TIMESTAMP}_results.csv${NC}" + moved_files=$((moved_files + 1)) + fi + + + + # Clean up any remaining summary CSV files that match the pattern + for leftover_file in "${base_name}"*summary*.csv "${base_name}"*_summary.csv; do + if [[ -f "$leftover_file" ]]; then + echo -e "${YELLOW} → Cleaning up leftover summary file: $leftover_file${NC}" + rm -f "$leftover_file" + fi + done + + if [[ $moved_files -gt 0 ]]; then + return 0 + else + echo -e "${RED} → No CSV files were generated or found${NC}" + return 1 + fi + else + echo -e "${RED} → Conversion failed${NC}" + return 1 + fi +} + +# Function to check if output file has performance data +has_performance_data() { + local output_file=$1 + + if [[ ! -f "$output_file" ]]; then + return 1 + fi + + # Check for NCCL performance table + if grep -q "out-of-place.*in-place" "$output_file" && \ + grep -q "size.*count.*type.*redop" "$output_file" && \ + grep -q "Avg bus bandwidth" "$output_file"; then + return 0 + fi + + return 1 +} + +# Removed job status checking - assuming all jobs are complete + +# Function to get expected output filename for job ID +get_output_filename() { + local job_id=$1 + + # Check for both AMI and container output file patterns in logs/ directory first + if [[ -f "logs/nccl-tests-ami_${job_id}.out" ]]; then + echo "logs/nccl-tests-ami_${job_id}.out" + elif [[ -f "logs/nccl-tests-container_${job_id}.out" ]]; then + echo "logs/nccl-tests-container_${job_id}.out" + elif [[ -f "nccl-tests-ami_${job_id}.out" ]]; then + echo "nccl-tests-ami_${job_id}.out" + elif [[ -f "nccl-tests-container_${job_id}.out" ]]; then + echo "nccl-tests-container_${job_id}.out" + else + # Default to logs/container pattern for backwards compatibility + echo "logs/nccl-tests-container_${job_id}.out" + fi +} + +# Main monitoring loop +processed_files=() +completed_jobs=() +failed_jobs=() + +echo -e "${BLUE}Processing ${#JOB_IDS[@]} completed jobs...${NC}" +echo -e "${BLUE}Timestamp for this run: ${TIMESTAMP}${NC}" +echo "" + +# Process all jobs assuming they are complete +for job_id in "${JOB_IDS[@]}"; do + output_file=$(get_output_filename "$job_id") + + echo -e "${YELLOW}Processing job $job_id...${NC}" + + if [[ -f "$output_file" ]] && has_performance_data "$output_file"; then + job_details=$(parse_job_details "$output_file") + echo -e "${BLUE} → Job details: $job_details${NC}" + + if convert_to_csv "$output_file" "$job_details"; then + processed_files+=("$output_file") + completed_jobs+=("$job_id") + echo -e "${GREEN} → Successfully processed job $job_id${NC}" + else + failed_jobs+=("$job_id") + echo -e "${RED} → Processing failed for job $job_id${NC}" + fi + else + echo -e "${YELLOW} → Output file missing or incomplete for job $job_id${NC}" + failed_jobs+=("$job_id") + fi + echo "" +done + +echo "" +echo -e "${GREEN}Processing complete!${NC}" +echo -e "${BLUE}Results saved in: 
$RESULTS_DIR/${NC}" + +echo "" +echo -e "${GREEN}Summary:${NC}" +echo " Successfully processed: ${#completed_jobs[@]} jobs" +echo " Failed/Missing: ${#failed_jobs[@]} jobs" +echo " Total jobs: ${#JOB_IDS[@]}" + +if [[ ${#completed_jobs[@]} -gt 0 ]]; then + echo "" + echo -e "${GREEN}Successfully processed jobs:${NC}" + for job_id in "${completed_jobs[@]}"; do + echo " - Job $job_id" + done +fi + +if [[ ${#failed_jobs[@]} -gt 0 ]]; then + echo "" + echo -e "${RED}Failed/Missing jobs:${NC}" + for job_id in "${failed_jobs[@]}"; do + echo " - Job $job_id" + done +fi + +if [[ ${#processed_files[@]} -gt 0 ]]; then + echo "" + echo -e "${BLUE}Generated CSV files:${NC}" + ls -la "$RESULTS_DIR"/*.csv 2>/dev/null || echo "No CSV files found" +else + echo -e "${YELLOW}No CSV files were generated${NC}" +fi + +# Final cleanup: remove any remaining CSV files in current directory +echo "" +echo -e "${YELLOW}Performing final cleanup...${NC}" +cleanup_count=0 +for leftover_file in nccl-tests-*_*.csv; do + if [[ -f "$leftover_file" ]]; then + echo -e "${YELLOW} → Removing leftover file: $leftover_file${NC}" + rm -f "$leftover_file" + cleanup_count=$((cleanup_count + 1)) + fi +done + +if [[ $cleanup_count -gt 0 ]]; then + echo -e "${GREEN}Cleaned up $cleanup_count leftover CSV files${NC}" +else + echo -e "${GREEN}No leftover files to clean up${NC}" +fi \ No newline at end of file diff --git a/micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/submit_nccl_test_ami.sh b/micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/submit_nccl_test_ami.sh new file mode 100755 index 000000000..ff9c5fc5a --- /dev/null +++ b/micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/submit_nccl_test_ami.sh @@ -0,0 +1,113 @@ +#!/bin/bash +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: MIT-0 +# Script to submit comprehensive NCCL tests with AMI-based jobs +# Tests all collective operations with different Split mask + +set -e + +# Create logs directory if it doesn't exist +mkdir -p logs + +# Configuration +NODE_COUNTS=(2 4 8 16) +# NODE_COUNTS=(16) +ADDITIONAL_LD_LIBRARY_PATH="/usr/local/cuda-12.9/lib64:/opt/nccl/build/lib/:/opt/amazon/ofi-nccl/lib/x86_64-linux-gnu/" +TEST_TYPES=("allreduce" "allgather" "reducescatter" "alltoall") +# TEST_TYPES=("allreduce") +SPLIT_MASK=("0x0" "0x7") +# SPLIT_MASK=("0x0") +TOPO_SORTED_FILE="topo_sorted_hostnames.txt" +# TOPO_SORTED_FILE="" +ENABLE_NCCL_DEBUG="false" + +# Colors for output +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +echo -e "${GREEN}Starting comprehensive NCCL test submission (AMI version)...${NC}" +echo -e "${BLUE}Configuration:${NC}" +echo " Node counts: ${NODE_COUNTS[*]}" +echo " LD Library path: $ADDITIONAL_LD_LIBRARY_PATH" +echo " Test types: ${TEST_TYPES[*]}" +echo " Split mask: ${SPLIT_MASK[*]}" +echo " NCCL Debug: $ENABLE_NCCL_DEBUG" +echo "" + +# Counter for submitted jobs +job_count=0 +submitted_jobs=() + +# Create job tracking files +timestamp=$(date +"%Y%m%d_%H%M%S") +job_ids_file="logs/submitted_jobs_ami_${timestamp}.txt" +job_details_file="logs/job_details_ami_${timestamp}.csv" + +# Initialize CSV file with headers +echo "JobID,Nodes,TestType,SplitMask,TotalGPUs,SubmissionTime" > "$job_details_file" + +# Submit all test combinations +for nodes in "${NODE_COUNTS[@]}"; do + total_gpus=$((nodes * 8)) + + echo -e "${YELLOW}=== Submitting AMI tests for $nodes nodes ($total_gpus GPUs) ===${NC}" + + for test_type in "${TEST_TYPES[@]}"; do + for split_mask in "${SPLIT_MASK[@]}"; do + echo "Submitting: $test_type with pattern $split_mask on $nodes nodes" + + # Submit the job and capture job ID + job_output=$(sbatch --nodes=$nodes nccl-tests-ami.sbatch "$test_type" "$ADDITIONAL_LD_LIBRARY_PATH" "$split_mask" "$TOPO_SORTED_FILE" "$ENABLE_NCCL_DEBUG") + job_id=$(echo "$job_output" | grep -o '[0-9]\+') + + if [ -n "$job_id" ]; then + submitted_jobs+=("$job_id") + job_count=$((job_count + 1)) + echo " → Job ID: $job_id" + + # Save job ID to file + echo "$job_id" >> "$job_ids_file" + + # Save job details to CSV + submission_time=$(date +"%Y-%m-%d %H:%M:%S") + echo "$job_id,$nodes,$test_type,$split_mask,$total_gpus,$submission_time" >> "$job_details_file" + echo "tail -f logs/nccl-tests-ami_$job_id.out" + else + echo " → Error: Failed to get job ID" + fi + + # Small delay to avoid overwhelming the scheduler + sleep 1 + done + done + echo "" +done + +echo -e "${GREEN}Summary:${NC}" +echo "Total jobs submitted: $job_count" +echo "Job IDs: ${submitted_jobs[*]}" +echo "" + +# Save summary information +echo -e "${BLUE}Job tracking files created:${NC}" +echo " Job IDs: $job_ids_file" +echo " Job details: $job_details_file" +echo "" + +# Show queue status +echo -e "${YELLOW}Current queue status:${NC}" +squeue -u $USER + +echo "" +echo -e "${GREEN}All jobs submitted successfully!${NC}" +echo -e "${BLUE}Monitor progress with: squeue -u $USER${NC}" +echo -e "${BLUE}Check job details with: scontrol show job ${NC}" +echo -e "${BLUE}Monitor specific jobs: squeue -j $(IFS=,; echo "${submitted_jobs[*]}")${NC}" +echo "" +echo -e "${YELLOW}To automatically process results as jobs complete, run:${NC}" +echo -e "${BLUE}./process_nccl_results.sh $job_ids_file${NC}" +echo "" +echo -e "${YELLOW}To cancel all submitted jobs if needed:${NC}" +echo -e "${BLUE}scancel $(IFS=' '; 
echo "${submitted_jobs[*]}")${NC}" diff --git a/micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/submit_nccl_test_container.sh b/micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/submit_nccl_test_container.sh new file mode 100755 index 000000000..956df5869 --- /dev/null +++ b/micro-benchmarks/nccl-tests/slurm/topology-aware-nccl-tests/submit_nccl_test_container.sh @@ -0,0 +1,106 @@ +#!/bin/bash +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 +# Script to submit comprehensive NCCL tests with 2 and 4 nodes +# Tests all collective operations with different split masks + +set -e + +# Configuration +NODE_COUNTS=(2 4 8 16) +APPS_PATH="/fsxl" +TEST_TYPES=("allreduce" "allgather" "reducescatter" "alltoall") +# TEST_TYPES=("allreduce" ) +SPLIT_MASK=("0x0" "0x7") +# SPLIT_MASK=("0x0") +ENABLE_NCCL_DEBUG="false" + +# Colors for output +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +echo -e "${GREEN}Starting comprehensive NCCL test submission...${NC}" +echo -e "${BLUE}Configuration:${NC}" +echo " Node counts: ${NODE_COUNTS[*]}" +echo " Apps path: $APPS_PATH" +echo " Test types: ${TEST_TYPES[*]}" +echo " split masks: ${SPLIT_MASK[*]}" +echo " NCCL Debug: $ENABLE_NCCL_DEBUG" +echo "" + +# Counter for submitted jobs +job_count=0 +submitted_jobs=() + +# Create job tracking files +timestamp=$(date +"%Y%m%d_%H%M%S") +job_ids_file="logs/submitted_jobs_${timestamp}.txt" +job_details_file="logs/job_details_${timestamp}.csv" + +# Initialize CSV file with headers +echo "JobID,Nodes,TestType,SplitMask,TotalGPUs,SubmissionTime" > "$job_details_file" + +# Submit all test combinations +for nodes in "${NODE_COUNTS[@]}"; do + total_gpus=$((nodes * 8)) + + echo -e "${YELLOW}=== Submitting tests for $nodes nodes ($total_gpus GPUs) ===${NC}" + + for test_type in "${TEST_TYPES[@]}"; do + for split_mask in "${SPLIT_MASK[@]}"; do + echo "Submitting: $test_type with pattern $split_mask on $nodes nodes" + + # Submit the job and capture job ID + job_output=$(sbatch --nodes=$nodes nccl-tests-container.sbatch "$test_type" "$APPS_PATH" "$split_mask" "$ENABLE_NCCL_DEBUG") + job_id=$(echo "$job_output" | grep -o '[0-9]\+') + + if [ -n "$job_id" ]; then + submitted_jobs+=("$job_id") + job_count=$((job_count + 1)) + echo " → Job ID: $job_id" + + # Save job ID to file + echo "$job_id" >> "$job_ids_file" + + # Save job details to CSV + submission_time=$(date +"%Y-%m-%d %H:%M:%S") + echo "$job_id,$nodes,$test_type,$split_mask,$total_gpus,$submission_time" >> "$job_details_file" + else + echo " → Error: Failed to get job ID" + fi + + # Small delay to avoid overwhelming the scheduler + sleep 1 + done + done + echo "" +done + +echo -e "${GREEN}Summary:${NC}" +echo "Total jobs submitted: $job_count" +echo "Job IDs: ${submitted_jobs[*]}" +echo "" + +# Save summary information +echo -e "${BLUE}Job tracking files created:${NC}" +echo " Job IDs: $job_ids_file" +echo " Job details: $job_details_file" +echo "" + +# Show queue status +echo -e "${YELLOW}Current queue status:${NC}" +squeue -u $USER + +echo "" +echo -e "${GREEN}All jobs submitted successfully!${NC}" +echo -e "${BLUE}Monitor progress with: squeue -u $USER${NC}" +echo -e "${BLUE}Check job details with: scontrol show job ${NC}" +echo -e "${BLUE}Monitor specific jobs: squeue -j $(IFS=,; echo "${submitted_jobs[*]}")${NC}" +echo "" +echo -e "${YELLOW}To automatically process results as jobs complete, run:${NC}" +echo -e "${BLUE}./process_nccl_results.sh $job_ids_file${NC}" 
+echo "" +echo -e "${YELLOW}To cancel all submitted jobs if needed:${NC}" +echo -e "${BLUE}scancel $(IFS=' '; echo "${submitted_jobs[*]}")${NC}" \ No newline at end of file