ROCm
diff --git a/‎examples/ops/dispatch_combine/test_dispatch_combine_internode.py‎
Lines changed: 149 additions & 31 deletions b/‎examples/ops/dispatch_combine/test_dispatch_combine_internode.py‎
Lines changed: 149 additions & 31 deletions
diff --git a/‎include/mori/ops/dispatch_combine/dispatch_combine.hpp‎
Lines changed: 1 addition & 0 deletions b/‎include/mori/ops/dispatch_combine/dispatch_combine.hpp‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎include/mori/utils/hip_helper.hpp‎
Lines changed: 38 additions & 0 deletions b/‎include/mori/utils/hip_helper.hpp‎
Lines changed: 38 additions & 0 deletions
@@ -58,7 +58,7 @@ def __init__(
             hidden_dim=7168,
             scale_dim=32,
             scale_type_size=4,
-            max_num_inp_token_per_rank=max_tokens,
+            max_num_inp_token_per_rank=(max_tokens + 63) // 64 * 64,
             num_experts_per_rank=16,
             num_experts_per_token=8,
             warp_num_per_block=8,
@@ -129,16 +129,16 @@ def _allgather_with_token_num_padding(self, input, max_token_num):
         dist.all_gather(output, padded_input)
         return output
 
-    def gen_test_data(self, use_max_token_num=False):
+    def gen_test_data(self, max_num_token, use_max_token_num=False):
         # gen num_tokens
         if use_max_token_num:
             num_token = torch.tensor(
-                [self.config.max_num_inp_token_per_rank for i in range(self.world_size)]
+                [max_num_token for i in range(self.world_size)]
             ).to(self.device)
         else:
             num_token = torch.randint(
                 1,
-                self.config.max_num_inp_token_per_rank + 1,
+                max_num_token + 1,
                 [self.world_size],
                 generator=self.rng,
                 device=self.device,
@@ -158,19 +158,21 @@ def gen_test_data(self, use_max_token_num=False):
                 device=self.device,
             )
             # argsort gives us a random permutation, take first K columns
-            indices = torch.argsort(random_vals, dim=1)[:, : self.config.num_experts_per_token]
+            indices = torch.argsort(random_vals, dim=1)[
+                :, : self.config.num_experts_per_token
+            ]
             all_rank_indices.append(indices.to(torch.int32))
 
         # num_total_experts = self.config.num_experts_per_rank * self.config.world_size
         # num_nodes = self.config.world_size // self.config.gpu_per_node
 
         # even_indices = (
         #     torch.arange(
-        #         self.config.max_num_inp_token_per_rank
+        #         max_num_token
         #         * self.config.num_experts_per_token,
         #         device="cuda",
         #     ).view(
-        #         self.config.max_num_inp_token_per_rank,
+        #         max_num_token,
         #         self.config.num_experts_per_token,
         #     )
         #     % 256
@@ -420,7 +422,10 @@ def test_dispatch_combine(self):
         for i in range(5000):
             if self.rank == 0:
                 print(f"Round {i} begin")
-            test_data = self.gen_test_data(use_max_token_num=False)
+            test_data = self.gen_test_data(
+                max_num_token=self.config.max_num_inp_token_per_rank,
+                use_max_token_num=False,
+            )
             if self.rank == 0:
                 print(f"Round {i} gen test_data done")
             self.run_test_once(op, test_data, error_round, i)
@@ -443,7 +448,11 @@ def stress_dispatch_combine(self):
         if self.rank == 0:
             print("Stress Test")
         test_data_list = [
-            self.gen_test_data(use_max_token_num=False) for i in range(num_test_data)
+            self.gen_test_data(
+                max_num_token=self.config.max_num_inp_token_per_rank,
+                use_max_token_num=False,
+            )
+            for i in range(num_test_data)
         ]
         for i in tqdm(range(5000)):
             (
@@ -480,7 +489,10 @@ def stress_dispatch_combine(self):
 
         if self.rank == 0:
             print("Stress Test with CUDA Graph")
-        test_data = self.gen_test_data(use_max_token_num=False)
+        test_data = self.gen_test_data(
+            max_num_token=self.config.max_num_inp_token_per_rank,
+            use_max_token_num=False,
+        )
         (
             all_rank_num_token,
             all_rank_indices,
@@ -520,7 +532,7 @@ def stress_dispatch_combine(self):
 
         del op
 
-    def run_bench_once(self, op, test_data, repeat=10):
+    def run_bench_once(self, max_num_token, op, test_data, repeat=10):
         num_events = 2 * repeat + 1
         events = [torch.cuda.Event(enable_timing=True) for i in range(num_events)]
 
@@ -559,9 +571,7 @@ def run_bench_once(self, op, test_data, repeat=10):
             )
             torch.cuda.synchronize()
 
-        total_rdma_recv_num_token = (
-            self.config.max_num_inp_token_per_rank * self.config.world_size // 8
-        )
+        total_rdma_recv_num_token = max_num_token * self.config.world_size // 8
         print(
             f"rank {self.rank} recv {total_recv_num_token} tokens {total_rdma_recv_num_token} rdma tokens"
         )
@@ -598,7 +608,7 @@ def run_bench_once(self, op, test_data, repeat=10):
         element_size = all_rank_input[self.rank].element_size()
         total_bytes = total_recv_num_token * self.config.hidden_dim * element_size
         ll_mode_scale = (
-            self.config.max_num_inp_token_per_rank
+            max_num_token
             * self.config.num_experts_per_token
             / (total_recv_num_token + 1)  # avoid division by zero
         )
@@ -635,9 +645,11 @@ def run_bench_once(self, op, test_data, repeat=10):
             ll_mode_scale,
         )
 
-    def bench_dispatch_combine(self):
+    def bench_dispatch_combine(self, max_num_token):
         op = mori.ops.EpDispatchCombineOp(self.config)
-        test_data = self.gen_test_data(use_max_token_num=True)
+        test_data = self.gen_test_data(
+            max_num_token=max_num_token, use_max_token_num=True
+        )
 
         repeat = 50
         disp_duration_us_list = []
@@ -664,7 +676,7 @@ def bench_dispatch_combine(self):
             comb_rdma_bandwidth,
             comb_bandwidth,
             ll_mode_scale,
-        ) = self.run_bench_once(op, test_data, repeat)
+        ) = self.run_bench_once(max_num_token, op, test_data, repeat)
 
         for i in range(repeat):
             disp_duration_output = [torch.zeros(1) for _ in range(self.world_size)]
@@ -821,14 +833,29 @@ def collect_metrics(per_round_data):
 
         del op
 
+        return (disp_bw, disp_rdma_bw, disp_ll_bw, disp_lat), (
+            comb_bw,
+            comb_rdma_bw,
+            comb_ll_bw,
+            comb_lat,
+        )
 
-def test_dispatch_combine(
-    local_rank, num_node, gpu_per_node, max_tokens, kernel_type, num_qp, cmd="test"
+
+def sweep_bench_dispatch_combine(
+    local_rank,
+    num_node,
+    gpu_per_node,
+    max_tokens,
+    kernel_type,
+    num_qp,
+    sweep_token_interval,
 ):
     world_size = num_node * gpu_per_node
     node_rank = int(os.environ["RANK"])
     global_rank = node_rank * gpu_per_node + local_rank
-
+    sweep_token_interval = int(sweep_token_interval)
+    if sweep_token_interval <= 0:
+        raise ValueError(f"sweep_token_interval must >= 1, got {sweep_token_interval}")
     test_case = EpDispatchCombineTestCase(
         global_rank,
         gpu_per_node,
@@ -840,32 +867,122 @@ def test_dispatch_combine(
         # torch.float8_e4m3fnuz,
     )
     test_case.setup()
-    if cmd == "test":
-        test_case.test_dispatch_combine()
-    elif cmd == "bench":
-        test_case.bench_dispatch_combine()
-    elif cmd == "stress":
-        test_case.stress_dispatch_combine()
+
+    # Sweep points are 0, interval, 2 * interval, ... (all below max_tokens).
+    # A zero-token point is benchmarked with one token instead, and that
+    # substituted value is also what the plot uses on its x axis.
+    num_iters = (max_tokens + sweep_token_interval - 1) // sweep_token_interval
+    max_token_list = [max(1, i * sweep_token_interval) for i in range(num_iters)]
+
+    disp_lat_min_list = []
+    disp_lat_max_list = []
+    comb_lat_min_list = []
+    comb_lat_max_list = []
+    for max_token in max_token_list:
+        disp_stats, comb_stats = test_case.bench_dispatch_combine(max_token)
+        # Each stats tuple has four entries; only the last (latency) is used.
+        _, _, _, disp_lat = disp_stats
+        _, _, _, comb_lat = comb_stats
+
+        # Entry 0 feeds the "min" series and entry 1 the "max" series.
+        disp_lat_min_list.append(disp_lat[0])
+        comb_lat_min_list.append(comb_lat[0])
+        disp_lat_max_list.append(disp_lat[1])
+        comb_lat_max_list.append(comb_lat[1])
+
+    try:
+        if local_rank == 0:
+            import matplotlib.pyplot as plt
+
+            # Per-point spread between the max and min latency series.
+            disp_lat_gap = [
+                hi - lo for hi, lo in zip(disp_lat_max_list, disp_lat_min_list)
+            ]
+            comb_lat_gap = [
+                hi - lo for hi, lo in zip(comb_lat_max_list, comb_lat_min_list)
+            ]
+
+            plt.figure()
+            # plt.plot(max_token_list, disp_lat_min_list, label='Dispatch Min')
+            # plt.plot(max_token_list, comb_lat_min_list, label='Combine Min')
+            # plt.plot(max_token_list, disp_lat_max_list, label='Dispatch Max')
+            # plt.plot(max_token_list, comb_lat_max_list, label='Combine Max')
+            plt.plot(max_token_list, disp_lat_gap, label="Dispatch Max-Min")
+            plt.plot(max_token_list, comb_lat_gap, label="Combine Max-Min")
+            plt.xticks([i * 16 for i in range(max_tokens // 16)])
+            plt.title("Dispatch / Combine Max-Min Latency (us)")
+            plt.xlabel("# of Tokens")
+            plt.ylabel("Latency (us)")
+            plt.grid(True)
+            plt.legend()
+            plt.tight_layout()
+            plt.savefig(
+                "dispatch_combine_perf_maxmin.png", dpi=300, bbox_inches="tight"
+            )
+    finally:
+        # Every rank created and set up its own test_case above, so every
+        # rank releases it -- not only the rank that draws the plot, and
+        # also when plotting fails (e.g. matplotlib is not installed).
+        test_case.cleanup()
+
+
+def test_dispatch_combine(
+    local_rank,
+    num_node,
+    gpu_per_node,
+    max_tokens,
+    kernel_type,
+    num_qp,
+    cmd="test",
+    sweep_token_interval=64,
+):
+    """Per-process entry point: run the sub-command named by ``cmd``.
+
+    ``test``, ``bench`` and ``stress`` build one ``EpDispatchCombineTestCase``,
+    set it up, run the matching method and clean it up afterwards.
+    ``sweep_bench`` forwards its arguments (including
+    ``sweep_token_interval``) to ``sweep_bench_dispatch_combine``.
+
+    The node rank is read from the ``RANK`` environment variable; the global
+    rank is ``node_rank * gpu_per_node + local_rank``.
+
+    Raises:
+        KeyError: if ``RANK`` is not set in the environment.
+        ValueError: if ``cmd`` is not one of the supported commands.
+    """
+    node_rank = int(os.environ["RANK"])
+    world_size = num_node * gpu_per_node
+    global_rank = node_rank * gpu_per_node + local_rank
+
+    if cmd == "sweep_bench":
+        sweep_bench_dispatch_combine(
+            local_rank,
+            num_node,
+            gpu_per_node,
+            max_tokens,
+            kernel_type,
+            num_qp,
+            sweep_token_interval,
+        )
+    elif cmd in ("test", "bench", "stress"):
+        test_case = EpDispatchCombineTestCase(
+            global_rank,
+            gpu_per_node,
+            world_size,
+            max_tokens,
+            kernel_type,
+            num_qp,
+            torch.bfloat16,
+            # torch.float8_e4m3fnuz,
+        )
+        test_case.setup()
+        # Map each command name to the call that runs it.
+        runners = {
+            "test": test_case.test_dispatch_combine,
+            "bench": lambda: test_case.bench_dispatch_combine(max_tokens),
+            "stress": test_case.stress_dispatch_combine,
+        }
+        runners[cmd]()
+        test_case.cleanup()
     else:
         raise ValueError(f"unsupported command: {cmd}")
 
-    test_case.cleanup()
-
 
 parser = argparse.ArgumentParser(description="dispatch/combine internode test")
 parser.add_argument(
     "--cmd",
     type=str,
     default="test",
-    choices=["test", "bench", "stress"],
-    help="Available subcommands: test, bench, stress",
+    choices=["test", "bench", "stress", "sweep_bench"],
+    help="Available subcommands: test, bench, stress, sweep_bench",
 )
 parser.add_argument(
     "--max-tokens",
     type=int,
     default=4096,
     help="Maximum number of input tokens per rank (default: 4096)",
 )
+parser.add_argument(
+    "--sweep-token-interval",
+    type=int,
+    default=2,
+    help="Number of token interval when sweep bench",
+)
 parser.add_argument(
     "--kernel-type",
     type=str,
@@ -896,6 +1013,7 @@ def test_dispatch_combine(
             args_cli.kernel_type,
             args_cli.num_qp,
             args_cli.cmd,
+            args_cli.sweep_token_interval,
         ),
         nprocs=gpu_per_node,
         join=True,
 
@@ -165,6 +165,7 @@ class EpDispatchCombineHandle {
  public:
   // Number of tokens on this rank and size of scale data type, updated at each round of inference
   index_t curRankNumToken{0};
+  index_t multiProcessorCount{0};
 
  public:
   // Config
 
@@ -0,0 +1,38 @@
+// Copyright © Advanced Micro Devices, Inc. All rights reserved.
+//
+// MIT License
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+#pragma once
+
+#include <hip/hip_runtime.h>
+
+namespace mori {
+// NOTE(review): HIP_RUNTIME_CHECK is used below but this header only includes
+// <hip/hip_runtime.h>; presumably the macro comes from another project header
+// that callers must include first -- confirm, and include it here if so.
+
+// Returns the multiprocessor count the HIP runtime reports for `device`.
+// A single attribute is queried instead of filling a whole hipDeviceProp_t,
+// since only this one value is needed.
+inline int GetMultiProcessorCount(int device) {
+  int count = 0;
+  HIP_RUNTIME_CHECK(
+      hipDeviceGetAttribute(&count, hipDeviceAttributeMultiprocessorCount, device));
+  return count;
+}
+
+// Returns the multiprocessor count of the device that hipGetDevice reports as
+// current for the calling thread.
+inline int GetCurDeviceMultiProcessorCount() {
+  int device = 0;
+  HIP_RUNTIME_CHECK(hipGetDevice(&device));
+  return GetMultiProcessorCount(device);
+}
+}  // namespace mori