From 63e03f1b4c8e4f9d0b62b33d9659a10f0eee22df Mon Sep 17 00:00:00 2001 From: littledgg <1658565283@qq.com> Date: Mon, 17 Nov 2025 15:39:10 +0800 Subject: [PATCH 1/5] fix bug about CubKeyValueSorter::run --- custom_ops/gpu_ops/moe/fused_moe_imp_op.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/custom_ops/gpu_ops/moe/fused_moe_imp_op.h b/custom_ops/gpu_ops/moe/fused_moe_imp_op.h index 254f80e670d..7d437402741 100644 --- a/custom_ops/gpu_ops/moe/fused_moe_imp_op.h +++ b/custom_ops/gpu_ops/moe/fused_moe_imp_op.h @@ -45,7 +45,7 @@ class CubKeyValueSorter { size_t getWorkspaceSize(const size_t num_key_value_pairs, bool descending = false) { num_key_value_pairs_ = num_key_value_pairs; - size_t required_storage = 0; + size_t required_storage = 1; int* null_int = nullptr; if (descending) { cub::DeviceRadixSort::SortPairsDescending(NULL, From 876006e4b2796311f23102b93f6605d5419eaeeb Mon Sep 17 00:00:00 2001 From: littledgg <1658565283@qq.com> Date: Wed, 19 Nov 2025 17:21:28 +0800 Subject: [PATCH 2/5] pre-commit and add comment --- custom_ops/gpu_ops/moe/moe_dispatch.cu | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/custom_ops/gpu_ops/moe/moe_dispatch.cu b/custom_ops/gpu_ops/moe/moe_dispatch.cu index 6bdb4c73e8f..7ec887a3091 100644 --- a/custom_ops/gpu_ops/moe/moe_dispatch.cu +++ b/custom_ops/gpu_ops/moe/moe_dispatch.cu @@ -87,6 +87,13 @@ void MoeDispatchKernel( int8_t *sorter_ws_ptr = reinterpret_cast(ws_ptr + bytes); int *permuted_experts_ = reinterpret_cast(sorter_ws_ptr + sorter_ws_size_bytes); + // If expected_ws_size > workspace_size ever occurs in sorter_.run (which + // should be practically impossible), there is a contiguous, currently unused + // region (permuted_experts_) right after sorter_ws_ptr. In practice, this + // region is larger than what cub::DeviceRadixSort::SortPairs requires. + // However, relying on this to “work” after canceling the assertion is unsafe: + // it constitutes undefined behavior, and there is no guarantee it will remain + // correct across inputs, CUDA/CUB versions, or architectures. int *permuted_rows_ = permuted_experts_ + num_moe_inputs; int *topk_idx_ptr = topk_idx->data(); From ce956922d9a34919305a5194b2fe47932f61915a Mon Sep 17 00:00:00 2001 From: littledgg <1658565283@qq.com> Date: Wed, 19 Nov 2025 17:23:14 +0800 Subject: [PATCH 3/5] pre-commit --- custom_ops/gpu_ops/moe/fused_moe_imp_op.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/custom_ops/gpu_ops/moe/fused_moe_imp_op.h b/custom_ops/gpu_ops/moe/fused_moe_imp_op.h index 7d437402741..c3dbdca3114 100644 --- a/custom_ops/gpu_ops/moe/fused_moe_imp_op.h +++ b/custom_ops/gpu_ops/moe/fused_moe_imp_op.h @@ -16,8 +16,8 @@ */ #pragma once -#include #include +#include #include "cub/cub.cuh" namespace phi { From b691f5f70e442ddc248773aa013b0b05ded64ce6 Mon Sep 17 00:00:00 2001 From: Jundong Liu <61149469+littledgg@users.noreply.github.com> Date: Thu, 20 Nov 2025 10:56:58 +0800 Subject: [PATCH 4/5] Apply suggestion from @Copilot Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- custom_ops/gpu_ops/moe/fused_moe_imp_op.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/custom_ops/gpu_ops/moe/fused_moe_imp_op.h b/custom_ops/gpu_ops/moe/fused_moe_imp_op.h index c3dbdca3114..cdbaead7e30 100644 --- a/custom_ops/gpu_ops/moe/fused_moe_imp_op.h +++ b/custom_ops/gpu_ops/moe/fused_moe_imp_op.h @@ -45,6 +45,8 @@ class CubKeyValueSorter { size_t getWorkspaceSize(const size_t num_key_value_pairs, bool descending = false) { num_key_value_pairs_ = num_key_value_pairs; + // Initialize to 1 as workaround: under CUDA Graph capture, CUB may not write + // to required_storage, and 1 is the minimum expected size in that scenario. size_t required_storage = 1; int* null_int = nullptr; if (descending) { From 79e7ec86aa700d2e0c220a5c13381d0323f7bec8 Mon Sep 17 00:00:00 2001 From: littledgg <1658565283@qq.com> Date: Thu, 20 Nov 2025 11:03:32 +0800 Subject: [PATCH 5/5] fix precommit --- custom_ops/gpu_ops/moe/fused_moe_imp_op.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/custom_ops/gpu_ops/moe/fused_moe_imp_op.h b/custom_ops/gpu_ops/moe/fused_moe_imp_op.h index cdbaead7e30..aec98d2592d 100644 --- a/custom_ops/gpu_ops/moe/fused_moe_imp_op.h +++ b/custom_ops/gpu_ops/moe/fused_moe_imp_op.h @@ -45,8 +45,9 @@ class CubKeyValueSorter { size_t getWorkspaceSize(const size_t num_key_value_pairs, bool descending = false) { num_key_value_pairs_ = num_key_value_pairs; - // Initialize to 1 as workaround: under CUDA Graph capture, CUB may not write - // to required_storage, and 1 is the minimum expected size in that scenario. + // Initialize to 1 as workaround: under CUDA Graph capture, CUB may not + // write to required_storage, and 1 is the minimum expected size in that + // scenario. size_t required_storage = 1; int* null_int = nullptr; if (descending) {