Skip to content
11 changes: 11 additions & 0 deletions custom_ops/gpu_ops/cpp_extensions.cc
Original file line number Diff line number Diff line change
Expand Up @@ -691,6 +691,10 @@ std::vector<paddle::Tensor> NoauxTc(paddle::Tensor& scores,
bool renormalize,
float routed_scaling_factor);

std::vector<paddle::Tensor> FusedCastSigmoidBias(const paddle::Tensor& input,
const paddle::Tensor& bias,
std::string cast_type);

std::vector<paddle::Tensor> NoauxTcRedundant(
paddle::Tensor& scores,
paddle::Tensor& scores_with_bias,
Expand Down Expand Up @@ -1687,6 +1691,13 @@ PYBIND11_MODULE(fastdeploy_ops, m) {

m.def("noaux_tc", &NoauxTc, "noaux_tc for Deepseekv3 MoE compute");

m.def("fused_cast_sigmoid_bias",
&FusedCastSigmoidBias,
"Fused cast+sigmoid+bias for MoE gating scores",
py::arg("input"),
py::arg("bias"),
py::arg("cast_type") = std::string("float32"));

m.def("noaux_tc_redundant",
&NoauxTcRedundant,
"noaux_tc_redundant for MoE compute");
Expand Down
185 changes: 185 additions & 0 deletions custom_ops/gpu_ops/fused_cast_sigmoid_bias.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <limits>

#include "helper.h"

// Fused kernel: cast(input, cast_type) -> sigmoid -> scores, scores + bias ->
// scores_with_bias
//
// For each element (token i, expert j):
// scores[i][j] = OutT(sigmoid(float(input[i][j])))
// scores_with_bias[i][j] = OutT(sigmoid(float(input[i][j])) + bias[j])
//
// Input: input [num_tokens, num_experts] bf16/fp16/fp32
// bias [num_experts] or [1, num_experts] fp32
// Output: scores [num_tokens, num_experts] cast_type (fp32/fp16/bf16)
// scores_with_bias [num_tokens, num_experts] cast_type (fp32/fp16/bf16)

// Scalar fallback kernel. Launch layout: one thread block per token row
// (gridDim.x == num_tokens); each thread strides over the expert dimension.
//
// For each element (token i, expert j):
//   scores[i][j]           = OutT(sigmoid(float(input[i][j])))
//   scores_with_bias[i][j] = OutT(sigmoid(float(input[i][j])) + bias[j])
template <typename InT, typename OutT>
__global__ void fused_cast_sigmoid_bias_kernel(
    const InT* __restrict__ input,
    const float* __restrict__ bias,
    OutT* __restrict__ scores,
    OutT* __restrict__ scores_with_bias,
    const int num_experts) {
  // 64-bit row base so token_idx * num_experts cannot overflow 32 bits.
  const int64_t row_base = static_cast<int64_t>(blockIdx.x) * num_experts;
  const InT* row_in = input + row_base;
  OutT* row_scores = scores + row_base;
  OutT* row_scores_bias = scores_with_bias + row_base;

  for (int e = threadIdx.x; e < num_experts; e += blockDim.x) {
    const float x = static_cast<float>(row_in[e]);
    const float sig = 1.0f / (1.0f + expf(-x));  // sigmoid(x)
    row_scores[e] = static_cast<OutT>(sig);
    row_scores_bias[e] = static_cast<OutT>(sig + bias[e]);
  }
}

// Vectorized version for better memory throughput
template <typename InT, typename OutT, int kVecSize>
__global__ void fused_cast_sigmoid_bias_vec_kernel(
const InT* __restrict__ input,
const float* __restrict__ bias,
OutT* __restrict__ scores,
OutT* __restrict__ scores_with_bias,
const int num_experts) {
const int64_t token_idx = blockIdx.x;
const int64_t offset = token_idx * num_experts;

using in_vec_t = AlignedVector<InT, kVecSize>;
using out_vec_t = AlignedVector<OutT, kVecSize>;
using bias_vec_t = AlignedVector<float, kVecSize>;

const int vec_count = num_experts / kVecSize;
for (int idx = threadIdx.x; idx < vec_count; idx += blockDim.x) {
const int base = idx * kVecSize;
in_vec_t in_vec;
bias_vec_t bias_vec;
Load(input + offset + base, &in_vec);
Load(bias + base, &bias_vec);

out_vec_t s_vec, sb_vec;
#pragma unroll
for (int i = 0; i < kVecSize; ++i) {
float val = static_cast<float>(in_vec[i]);
float s = 1.0f / (1.0f + expf(-val));
s_vec[i] = static_cast<OutT>(s);
sb_vec[i] = static_cast<OutT>(s + bias_vec[i]);
}

Store(s_vec, scores + offset + base);
Store(sb_vec, scores_with_bias + offset + base);
}

// Handle remaining elements
const int remaining_start = vec_count * kVecSize;
for (int j = remaining_start + threadIdx.x; j < num_experts;
j += blockDim.x) {
float val = static_cast<float>(input[offset + j]);
float s = 1.0f / (1.0f + expf(-val));
scores[offset + j] = static_cast<OutT>(s);
scores_with_bias[offset + j] = static_cast<OutT>(s + bias[j]);
}
}

// Map a dtype name ("float32" / "float16" / "bfloat16") to the matching
// paddle::DataType; raises via PD_THROW for any other value.
static paddle::DataType ParseCastType(const std::string& cast_type) {
  if (cast_type == "bfloat16") {
    return paddle::DataType::BFLOAT16;
  } else if (cast_type == "float16") {
    return paddle::DataType::FLOAT16;
  } else if (cast_type == "float32") {
    return paddle::DataType::FLOAT32;
  }
  PD_THROW("Unsupported cast_type: " + cast_type +
           ". Only float32, float16, bfloat16 are supported.");
}

// Host launcher for the fused cast+sigmoid+bias op.
//
//   scores           = cast_type(sigmoid(float(input)))
//   scores_with_bias = cast_type(sigmoid(float(input)) + bias)
//
// Args:
//   input: [num_tokens, num_experts], bf16/fp16/fp32.
//   bias:  [num_experts] or [1, num_experts], must be fp32.
//   cast_type: "float32" | "float16" | "bfloat16" (output dtype).
// Returns: {scores, scores_with_bias}, both [num_tokens, num_experts]
//   in cast_type, allocated on input's place and launched on input's stream.
std::vector<paddle::Tensor> FusedCastSigmoidBias(const paddle::Tensor& input,
                                                 const paddle::Tensor& bias,
                                                 std::string cast_type) {
  auto input_shape = input.shape();
  PD_CHECK(input_shape.size() == 2,
           "input must be 2D [num_tokens, num_experts]");
  auto bias_shape = bias.shape();
  // Support both [num_experts] and [1, num_experts] bias shapes
  PD_CHECK(
      bias_shape.size() == 1 || (bias_shape.size() == 2 && bias_shape[0] == 1),
      "bias must be 1D [num_experts] or 2D [1, num_experts]");

  int64_t num_tokens = input_shape[0];
  int64_t num_experts = input_shape[1];
  int64_t bias_numel = (bias_shape.size() == 1) ? bias_shape[0] : bias_shape[1];
  PD_CHECK(bias_numel == num_experts, "bias size must match num_experts");
  PD_CHECK(bias.dtype() == paddle::DataType::FLOAT32,
           "bias must be float32, got ",
           bias.dtype());
  // The kernels take num_experts as `int` and grid.x carries num_tokens;
  // reject sizes that would otherwise be silently truncated at the launch.
  PD_CHECK(num_experts <= std::numeric_limits<int>::max(),
           "num_experts is too large to fit in int");
  PD_CHECK(num_tokens <= std::numeric_limits<int>::max(),
           "num_tokens is too large for the launch grid");

  auto place = input.place();
  auto stream = input.stream();
  auto out_dtype = ParseCastType(cast_type);

  auto scores = paddle::empty({num_tokens, num_experts}, out_dtype, place);
  auto scores_with_bias =
      paddle::empty({num_tokens, num_experts}, out_dtype, place);

  // Nothing to compute for empty inputs. Checking num_experts too avoids
  // launching with a zero-sized thread block when num_experts == 0.
  if (num_tokens == 0 || num_experts == 0) {
    return {scores, scores_with_bias};
  }

  const int num_experts_i32 = static_cast<int>(num_experts);
  dim3 grid(num_tokens);
  int block_size = std::min(static_cast<int64_t>(1024), num_experts);
  // Round up to warp size so the block is a whole number of warps
  block_size = ((block_size + WARP_SIZE - 1) / WARP_SIZE) * WARP_SIZE;
  dim3 block(block_size);

  DISPATCH_FLOAT_FP6_DTYPE(input.dtype(), in_scalar_t, {
    DISPATCH_FLOAT_FP6_DTYPE(out_dtype, out_scalar_t, {
      // 16-byte vectors of the input type; use the scalar kernel when the
      // expert dimension is not a whole number of vectors.
      constexpr int kVecSize = 16 / sizeof(in_scalar_t);
      if (num_experts % kVecSize == 0 && num_experts >= kVecSize) {
        fused_cast_sigmoid_bias_vec_kernel<in_scalar_t, out_scalar_t, kVecSize>
            <<<grid, block, 0, stream>>>(input.data<in_scalar_t>(),
                                         bias.data<float>(),
                                         scores.data<out_scalar_t>(),
                                         scores_with_bias.data<out_scalar_t>(),
                                         num_experts_i32);
      } else {
        fused_cast_sigmoid_bias_kernel<in_scalar_t, out_scalar_t>
            <<<grid, block, 0, stream>>>(input.data<in_scalar_t>(),
                                         bias.data<float>(),
                                         scores.data<out_scalar_t>(),
                                         scores_with_bias.data<out_scalar_t>(),
                                         num_experts_i32);
      }
    });
  });

  return {scores, scores_with_bias};
}

// Infer output dtypes: both outputs (scores, scores_with_bias) carry the
// dtype named by cast_type; input/bias dtypes do not influence the result.
std::vector<paddle::DataType> FusedCastSigmoidBiasInferDtype(
    const paddle::DataType& input_dtype,
    const paddle::DataType& bias_dtype,
    std::string cast_type) {
  const paddle::DataType resolved = ParseCastType(cast_type);
  return {resolved, resolved};
}

// Infer output shapes: both outputs mirror input's [num_tokens, num_experts];
// the bias shape never affects the output shapes.
std::vector<std::vector<int64_t>> FusedCastSigmoidBiasInferShape(
    const std::vector<int64_t>& input_shape,
    const std::vector<int64_t>& bias_shape) {
  std::vector<std::vector<int64_t>> output_shapes;
  output_shapes.push_back(input_shape);
  output_shapes.push_back(input_shape);
  return output_shapes;
}

// Register the custom op with Paddle's static graph: two tensor inputs,
// two tensor outputs, and one string attribute selecting the output dtype,
// wired to the kernel / shape-inference / dtype-inference functions above.
PD_BUILD_STATIC_OP(fused_cast_sigmoid_bias)
    .Inputs({"input", "bias"})
    .Outputs({"scores", "scores_with_bias"})
    .Attrs({"cast_type: std::string"})
    .SetKernelFn(PD_KERNEL(FusedCastSigmoidBias))
    .SetInferShapeFn(PD_INFER_SHAPE(FusedCastSigmoidBiasInferShape))
    .SetInferDtypeFn(PD_INFER_DTYPE(FusedCastSigmoidBiasInferDtype));
2 changes: 2 additions & 0 deletions custom_ops/setup_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -330,6 +330,7 @@ def find_end_files(directory, end_str):
"gpu_ops/fused_rotary_position_encoding.cu",
"gpu_ops/noaux_tc.cu",
"gpu_ops/noaux_tc_redundant.cu",
"gpu_ops/fused_cast_sigmoid_bias.cu",
"gpu_ops/custom_all_reduce/all_reduce.cu",
"gpu_ops/merge_prefill_decode_output.cu",
"gpu_ops/limit_thinking_content_length.cu",
Expand Down Expand Up @@ -682,6 +683,7 @@ def find_end_files(directory, end_str):
"gpu_ops/recover_decode_task.cu",
"gpu_ops/noaux_tc.cu",
"gpu_ops/noaux_tc_redundant.cu",
"gpu_ops/fused_cast_sigmoid_bias.cu",
"gpu_ops/fused_rotary_position_encoding.cu",
"gpu_ops/text_image_gather_scatter.cu",
"gpu_ops/text_image_index_out.cu",
Expand Down
46 changes: 46 additions & 0 deletions fastdeploy/model_executor/layers/moe/fused_cast_sigmoid_bias.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

import paddle

from fastdeploy.model_executor.ops.gpu import (
fused_cast_sigmoid_bias as _fused_cast_sigmoid_bias_cuda,
)


def fused_cast_sigmoid_bias(
    gate_out: paddle.Tensor,
    e_score_correction_bias: paddle.Tensor,
    cast_type: str = "float32",
) -> tuple:
    """
    Fused cast + sigmoid + bias-add over MoE gating scores.

    Replaces three separate framework ops with one CUDA kernel:
        1. gate_out = gate_out.cast(cast_type)
        2. scores = sigmoid(gate_out)
        3. scores_with_bias = scores + e_score_correction_bias

    Args:
        gate_out: [num_tokens, num_experts] tensor (bf16/fp16/fp32) — raw gate output.
        e_score_correction_bias: [num_experts] fp32 tensor — correction bias.
        cast_type: output dtype name; one of "float32", "float16", "bfloat16".

    Returns:
        scores: [num_tokens, num_experts] in cast_type — sigmoid(gate_out).
        scores_with_bias: [num_tokens, num_experts] in cast_type — scores plus bias.
    """
    result = _fused_cast_sigmoid_bias_cuda(gate_out, e_score_correction_bias, cast_type)
    return result
Original file line number Diff line number Diff line change
Expand Up @@ -275,7 +275,6 @@ def apply_tp(
Paddle Cutlass compute Fused MoE.
"""
gate_out = gate(x)
gate_out = gate_out.cast("float32")
if layer.topk_method == "noaux_tc":
gate_out, topk_weights, topk_idx = get_moe_scores(
gate_out,
Expand All @@ -285,6 +284,7 @@ def apply_tp(
layer.routed_scaling_factor,
layer.gate_correction_bias,
getattr(layer, "renormalize", True),
use_fused_cast=True,
)
(
permute_input,
Expand All @@ -308,6 +308,7 @@ def apply_tp(
topk_only_mode=True,
)
else:
gate_out = gate_out.cast("float32")
(
permute_input,
token_nums_per_expert,
Expand Down
18 changes: 16 additions & 2 deletions fastdeploy/model_executor/layers/moe/moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,16 @@
from fastdeploy.model_executor.ops.gpu import noaux_tc, noaux_tc_redundant
except:
logger.warning("import noaux_tc Failed!")

try:
from fastdeploy.model_executor.layers.moe.fused_cast_sigmoid_bias import (
fused_cast_sigmoid_bias,
)

_FUSED_CAST_SIGMOID_BIAS_AVAILABLE = True
except Exception:
_FUSED_CAST_SIGMOID_BIAS_AVAILABLE = False

import numpy as np


Expand Down Expand Up @@ -90,13 +100,17 @@ def get_moe_scores(
expert_in_rank_num_list: paddle.Tensor = None,
tokens_per_expert_stats_list: paddle.Tensor = None,
redundant_ep_rank_num_plus_one: int = 1,
use_fused_cast: bool = False,
) -> paddle.Tensor:
"""
compute moe scores using e_score_correction_bias.
"""
scores = paddle.nn.functional.sigmoid(gating_output)
assert e_score_correction_bias is not None, "e_score_correction_bias is none!"
scores_with_bias = scores + e_score_correction_bias
if use_fused_cast and _FUSED_CAST_SIGMOID_BIAS_AVAILABLE:
scores, scores_with_bias = fused_cast_sigmoid_bias(gating_output, e_score_correction_bias)
else:
scores = paddle.nn.functional.sigmoid(gating_output)
scores_with_bias = scores + e_score_correction_bias
if expert_id_to_ep_rank_array is None:
scores, topk_values, topk_idx = noaux_tc(
scores,
Expand Down
Loading
Loading