From 63e03f1b4c8e4f9d0b62b33d9659a10f0eee22df Mon Sep 17 00:00:00 2001
From: littledgg <1658565283@qq.com>
Date: Mon, 17 Nov 2025 15:39:10 +0800
Subject: [PATCH 1/5] fix bug about CubKeyValueSorter::run

---
 custom_ops/gpu_ops/moe/fused_moe_imp_op.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/custom_ops/gpu_ops/moe/fused_moe_imp_op.h b/custom_ops/gpu_ops/moe/fused_moe_imp_op.h
index 254f80e670d..7d437402741 100644
--- a/custom_ops/gpu_ops/moe/fused_moe_imp_op.h
+++ b/custom_ops/gpu_ops/moe/fused_moe_imp_op.h
@@ -45,7 +45,7 @@ class CubKeyValueSorter {
   size_t getWorkspaceSize(const size_t num_key_value_pairs,
                           bool descending = false) {
     num_key_value_pairs_ = num_key_value_pairs;
-    size_t required_storage = 0;
+    size_t required_storage = 1;
     int* null_int = nullptr;
     if (descending) {
       cub::DeviceRadixSort::SortPairsDescending(NULL,

From 876006e4b2796311f23102b93f6605d5419eaeeb Mon Sep 17 00:00:00 2001
From: littledgg <1658565283@qq.com>
Date: Wed, 19 Nov 2025 17:21:28 +0800
Subject: [PATCH 2/5] pre-commit and add comment

---
 custom_ops/gpu_ops/moe/moe_dispatch.cu | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/custom_ops/gpu_ops/moe/moe_dispatch.cu b/custom_ops/gpu_ops/moe/moe_dispatch.cu
index 6bdb4c73e8f..7ec887a3091 100644
--- a/custom_ops/gpu_ops/moe/moe_dispatch.cu
+++ b/custom_ops/gpu_ops/moe/moe_dispatch.cu
@@ -87,6 +87,13 @@ void MoeDispatchKernel(
   int8_t *sorter_ws_ptr = reinterpret_cast<int8_t *>(ws_ptr + bytes);
   int *permuted_experts_ =
       reinterpret_cast<int *>(sorter_ws_ptr + sorter_ws_size_bytes);
+  // If expected_ws_size > workspace_size ever occurs in sorter_.run (which
+  // should be practically impossible), there is a contiguous, currently unused
+  // region (permuted_experts_) right after sorter_ws_ptr. In practice, this
+  // region is larger than what cub::DeviceRadixSort::SortPairs requires.
+  // However, relying on this to “work” after canceling the assertion is unsafe:
+  // it constitutes undefined behavior, and there is no guarantee it will remain
+  // correct across inputs, CUDA/CUB versions, or architectures.
   int *permuted_rows_ = permuted_experts_ + num_moe_inputs;
 
   int *topk_idx_ptr = topk_idx->data<int>();

From ce956922d9a34919305a5194b2fe47932f61915a Mon Sep 17 00:00:00 2001
From: littledgg <1658565283@qq.com>
Date: Wed, 19 Nov 2025 17:23:14 +0800
Subject: [PATCH 3/5] pre-commit

---
 custom_ops/gpu_ops/moe/fused_moe_imp_op.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/custom_ops/gpu_ops/moe/fused_moe_imp_op.h b/custom_ops/gpu_ops/moe/fused_moe_imp_op.h
index 7d437402741..c3dbdca3114 100644
--- a/custom_ops/gpu_ops/moe/fused_moe_imp_op.h
+++ b/custom_ops/gpu_ops/moe/fused_moe_imp_op.h
@@ -16,8 +16,8 @@
  */
 
 #pragma once
-#include <string>
 #include <sstream>
+#include <string>
 #include "cub/cub.cuh"
 
 namespace phi {

From b691f5f70e442ddc248773aa013b0b05ded64ce6 Mon Sep 17 00:00:00 2001
From: Jundong Liu <61149469+littledgg@users.noreply.github.com>
Date: Thu, 20 Nov 2025 10:56:58 +0800
Subject: [PATCH 4/5] Apply suggestion from @Copilot

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 custom_ops/gpu_ops/moe/fused_moe_imp_op.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/custom_ops/gpu_ops/moe/fused_moe_imp_op.h b/custom_ops/gpu_ops/moe/fused_moe_imp_op.h
index c3dbdca3114..cdbaead7e30 100644
--- a/custom_ops/gpu_ops/moe/fused_moe_imp_op.h
+++ b/custom_ops/gpu_ops/moe/fused_moe_imp_op.h
@@ -45,6 +45,8 @@ class CubKeyValueSorter {
   size_t getWorkspaceSize(const size_t num_key_value_pairs,
                           bool descending = false) {
     num_key_value_pairs_ = num_key_value_pairs;
+    // Initialize to 1 as workaround: under CUDA Graph capture, CUB may not write 
+    // to required_storage, and 1 is the minimum expected size in that scenario.
     size_t required_storage = 1;
     int* null_int = nullptr;
     if (descending) {

From 79e7ec86aa700d2e0c220a5c13381d0323f7bec8 Mon Sep 17 00:00:00 2001
From: littledgg <1658565283@qq.com>
Date: Thu, 20 Nov 2025 11:03:32 +0800
Subject: [PATCH 5/5] fix precommit

---
 custom_ops/gpu_ops/moe/fused_moe_imp_op.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/custom_ops/gpu_ops/moe/fused_moe_imp_op.h b/custom_ops/gpu_ops/moe/fused_moe_imp_op.h
index cdbaead7e30..aec98d2592d 100644
--- a/custom_ops/gpu_ops/moe/fused_moe_imp_op.h
+++ b/custom_ops/gpu_ops/moe/fused_moe_imp_op.h
@@ -45,8 +45,9 @@ class CubKeyValueSorter {
   size_t getWorkspaceSize(const size_t num_key_value_pairs,
                           bool descending = false) {
     num_key_value_pairs_ = num_key_value_pairs;
-    // Initialize to 1 as workaround: under CUDA Graph capture, CUB may not write 
-    // to required_storage, and 1 is the minimum expected size in that scenario.
+    // Initialize to 1 as workaround: under CUDA Graph capture, CUB may not
+    // write to required_storage, and 1 is the minimum expected size in that
+    // scenario.
     size_t required_storage = 1;
     int* null_int = nullptr;
     if (descending) {