Skip to content
11 changes: 11 additions & 0 deletions custom_ops/gpu_ops/cpp_extensions.cc
Original file line number Diff line number Diff line change
Expand Up @@ -691,6 +691,10 @@ std::vector<paddle::Tensor> NoauxTc(paddle::Tensor& scores,
bool renormalize,
float routed_scaling_factor);

std::vector<paddle::Tensor> FusedCastSigmoidBias(const paddle::Tensor& input,
const paddle::Tensor& bias,
std::string cast_type);

std::vector<paddle::Tensor> NoauxTcRedundant(
paddle::Tensor& scores,
paddle::Tensor& scores_with_bias,
Expand Down Expand Up @@ -1687,6 +1691,13 @@ PYBIND11_MODULE(fastdeploy_ops, m) {

m.def("noaux_tc", &NoauxTc, "noaux_tc for Deepseekv3 MoE compute");

m.def("fused_cast_sigmoid_bias",
&FusedCastSigmoidBias,
"Fused cast+sigmoid+bias for MoE gating scores",
py::arg("input"),
py::arg("bias"),
py::arg("cast_type") = std::string("float32"));

m.def("noaux_tc_redundant",
&NoauxTcRedundant,
"noaux_tc_redundant for MoE compute");
Expand Down
185 changes: 185 additions & 0 deletions custom_ops/gpu_ops/fused_cast_sigmoid_bias.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <limits>

#include "helper.h"

// Fused kernel: cast(input, cast_type) -> sigmoid -> scores, scores + bias ->
// scores_with_bias
//
// For each element (token i, expert j):
// scores[i][j] = OutT(sigmoid(float(input[i][j])))
// scores_with_bias[i][j] = OutT(sigmoid(float(input[i][j])) + bias[j])
//
// Input: input [num_tokens, num_experts] bf16/fp16/fp32
// bias [num_experts] or [1, num_experts] fp32
// Output: scores [num_tokens, num_experts] cast_type (fp32/fp16/bf16)
// scores_with_bias [num_tokens, num_experts] cast_type (fp32/fp16/bf16)

// Scalar fallback kernel. Launch layout: one thread block per token row
// (gridDim.x == num_tokens); each thread strides over the expert dimension.
//
// For each element (token i, expert j):
//   scores[i][j]           = OutT(sigmoid(float(input[i][j])))
//   scores_with_bias[i][j] = OutT(sigmoid(float(input[i][j])) + bias[j])
template <typename InT, typename OutT>
__global__ void fused_cast_sigmoid_bias_kernel(
    const InT* __restrict__ input,
    const float* __restrict__ bias,
    OutT* __restrict__ scores,
    OutT* __restrict__ scores_with_bias,
    const int num_experts) {
  // 64-bit row base so token_idx * num_experts cannot overflow 32 bits.
  const int64_t row_base = static_cast<int64_t>(blockIdx.x) * num_experts;
  const InT* row_in = input + row_base;
  OutT* row_scores = scores + row_base;
  OutT* row_scores_bias = scores_with_bias + row_base;

  for (int e = threadIdx.x; e < num_experts; e += blockDim.x) {
    const float x = static_cast<float>(row_in[e]);
    const float sig = 1.0f / (1.0f + expf(-x));  // sigmoid(x)
    row_scores[e] = static_cast<OutT>(sig);
    row_scores_bias[e] = static_cast<OutT>(sig + bias[e]);
  }
}

// Vectorized version for better memory throughput
template <typename InT, typename OutT, int kVecSize>
__global__ void fused_cast_sigmoid_bias_vec_kernel(
const InT* __restrict__ input,
const float* __restrict__ bias,
OutT* __restrict__ scores,
OutT* __restrict__ scores_with_bias,
const int num_experts) {
const int64_t token_idx = blockIdx.x;
const int64_t offset = token_idx * num_experts;

using in_vec_t = AlignedVector<InT, kVecSize>;
using out_vec_t = AlignedVector<OutT, kVecSize>;
using bias_vec_t = AlignedVector<float, kVecSize>;

const int vec_count = num_experts / kVecSize;
for (int idx = threadIdx.x; idx < vec_count; idx += blockDim.x) {
const int base = idx * kVecSize;
in_vec_t in_vec;
bias_vec_t bias_vec;
Load(input + offset + base, &in_vec);
Load(bias + base, &bias_vec);

out_vec_t s_vec, sb_vec;
#pragma unroll
for (int i = 0; i < kVecSize; ++i) {
float val = static_cast<float>(in_vec[i]);
float s = 1.0f / (1.0f + expf(-val));
s_vec[i] = static_cast<OutT>(s);
sb_vec[i] = static_cast<OutT>(s + bias_vec[i]);
}

Store(s_vec, scores + offset + base);
Store(sb_vec, scores_with_bias + offset + base);
}

// Handle remaining elements
const int remaining_start = vec_count * kVecSize;
for (int j = remaining_start + threadIdx.x; j < num_experts;
j += blockDim.x) {
float val = static_cast<float>(input[offset + j]);
float s = 1.0f / (1.0f + expf(-val));
scores[offset + j] = static_cast<OutT>(s);
scores_with_bias[offset + j] = static_cast<OutT>(s + bias[j]);
}
}

// Map a dtype name ("float32" / "float16" / "bfloat16") to the matching
// paddle::DataType; raises via PD_THROW for any other value.
static paddle::DataType ParseCastType(const std::string& cast_type) {
  if (cast_type == "bfloat16") {
    return paddle::DataType::BFLOAT16;
  } else if (cast_type == "float16") {
    return paddle::DataType::FLOAT16;
  } else if (cast_type == "float32") {
    return paddle::DataType::FLOAT32;
  }
  PD_THROW("Unsupported cast_type: " + cast_type +
           ". Only float32, float16, bfloat16 are supported.");
}

// Host launcher for the fused cast+sigmoid+bias op.
//
//   scores           = cast_type(sigmoid(float(input)))
//   scores_with_bias = cast_type(sigmoid(float(input)) + bias)
//
// Args:
//   input: [num_tokens, num_experts], bf16/fp16/fp32.
//   bias:  [num_experts] or [1, num_experts], must be fp32.
//   cast_type: "float32" | "float16" | "bfloat16" (output dtype).
// Returns: {scores, scores_with_bias}, both [num_tokens, num_experts]
//   in cast_type, allocated on input's place and launched on input's stream.
std::vector<paddle::Tensor> FusedCastSigmoidBias(const paddle::Tensor& input,
                                                 const paddle::Tensor& bias,
                                                 std::string cast_type) {
  auto input_shape = input.shape();
  PD_CHECK(input_shape.size() == 2,
           "input must be 2D [num_tokens, num_experts]");
  auto bias_shape = bias.shape();
  // Support both [num_experts] and [1, num_experts] bias shapes
  PD_CHECK(
      bias_shape.size() == 1 || (bias_shape.size() == 2 && bias_shape[0] == 1),
      "bias must be 1D [num_experts] or 2D [1, num_experts]");

  int64_t num_tokens = input_shape[0];
  int64_t num_experts = input_shape[1];
  int64_t bias_numel = (bias_shape.size() == 1) ? bias_shape[0] : bias_shape[1];
  PD_CHECK(bias_numel == num_experts, "bias size must match num_experts");
  PD_CHECK(bias.dtype() == paddle::DataType::FLOAT32,
           "bias must be float32, got ",
           bias.dtype());
  // The kernels take num_experts as `int` and grid.x carries num_tokens;
  // reject sizes that would otherwise be silently truncated at the launch.
  PD_CHECK(num_experts <= std::numeric_limits<int>::max(),
           "num_experts is too large to fit in int");
  PD_CHECK(num_tokens <= std::numeric_limits<int>::max(),
           "num_tokens is too large for the launch grid");

  auto place = input.place();
  auto stream = input.stream();
  auto out_dtype = ParseCastType(cast_type);

  auto scores = paddle::empty({num_tokens, num_experts}, out_dtype, place);
  auto scores_with_bias =
      paddle::empty({num_tokens, num_experts}, out_dtype, place);

  // Nothing to compute for empty inputs. Checking num_experts too avoids
  // launching with a zero-sized thread block when num_experts == 0.
  if (num_tokens == 0 || num_experts == 0) {
    return {scores, scores_with_bias};
  }

  const int num_experts_i32 = static_cast<int>(num_experts);
  dim3 grid(num_tokens);
  int block_size = std::min(static_cast<int64_t>(1024), num_experts);
  // Round up to warp size so the block is a whole number of warps
  block_size = ((block_size + WARP_SIZE - 1) / WARP_SIZE) * WARP_SIZE;
  dim3 block(block_size);

  DISPATCH_FLOAT_FP6_DTYPE(input.dtype(), in_scalar_t, {
    DISPATCH_FLOAT_FP6_DTYPE(out_dtype, out_scalar_t, {
      // 16-byte vectors of the input type; use the scalar kernel when the
      // expert dimension is not a whole number of vectors.
      constexpr int kVecSize = 16 / sizeof(in_scalar_t);
      if (num_experts % kVecSize == 0 && num_experts >= kVecSize) {
        fused_cast_sigmoid_bias_vec_kernel<in_scalar_t, out_scalar_t, kVecSize>
            <<<grid, block, 0, stream>>>(input.data<in_scalar_t>(),
                                         bias.data<float>(),
                                         scores.data<out_scalar_t>(),
                                         scores_with_bias.data<out_scalar_t>(),
                                         num_experts_i32);
      } else {
        fused_cast_sigmoid_bias_kernel<in_scalar_t, out_scalar_t>
            <<<grid, block, 0, stream>>>(input.data<in_scalar_t>(),
                                         bias.data<float>(),
                                         scores.data<out_scalar_t>(),
                                         scores_with_bias.data<out_scalar_t>(),
                                         num_experts_i32);
      }
    });
  });

  return {scores, scores_with_bias};
}

// Infer output dtypes: both outputs (scores, scores_with_bias) carry the
// dtype named by cast_type; input/bias dtypes do not influence the result.
std::vector<paddle::DataType> FusedCastSigmoidBiasInferDtype(
    const paddle::DataType& input_dtype,
    const paddle::DataType& bias_dtype,
    std::string cast_type) {
  const paddle::DataType resolved = ParseCastType(cast_type);
  return {resolved, resolved};
}

// Infer output shapes: both outputs mirror input's [num_tokens, num_experts];
// the bias shape never affects the output shapes.
std::vector<std::vector<int64_t>> FusedCastSigmoidBiasInferShape(
    const std::vector<int64_t>& input_shape,
    const std::vector<int64_t>& bias_shape) {
  std::vector<std::vector<int64_t>> output_shapes;
  output_shapes.push_back(input_shape);
  output_shapes.push_back(input_shape);
  return output_shapes;
}

// Register the custom op with Paddle's static graph: two tensor inputs,
// two tensor outputs, and one string attribute selecting the output dtype,
// wired to the kernel / shape-inference / dtype-inference functions above.
PD_BUILD_STATIC_OP(fused_cast_sigmoid_bias)
    .Inputs({"input", "bias"})
    .Outputs({"scores", "scores_with_bias"})
    .Attrs({"cast_type: std::string"})
    .SetKernelFn(PD_KERNEL(FusedCastSigmoidBias))
    .SetInferShapeFn(PD_INFER_SHAPE(FusedCastSigmoidBiasInferShape))
    .SetInferDtypeFn(PD_INFER_DTYPE(FusedCastSigmoidBiasInferDtype));
2 changes: 2 additions & 0 deletions custom_ops/setup_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -330,6 +330,7 @@ def find_end_files(directory, end_str):
"gpu_ops/fused_rotary_position_encoding.cu",
"gpu_ops/noaux_tc.cu",
"gpu_ops/noaux_tc_redundant.cu",
"gpu_ops/fused_cast_sigmoid_bias.cu",
"gpu_ops/custom_all_reduce/all_reduce.cu",
"gpu_ops/merge_prefill_decode_output.cu",
"gpu_ops/limit_thinking_content_length.cu",
Expand Down Expand Up @@ -682,6 +683,7 @@ def find_end_files(directory, end_str):
"gpu_ops/recover_decode_task.cu",
"gpu_ops/noaux_tc.cu",
"gpu_ops/noaux_tc_redundant.cu",
"gpu_ops/fused_cast_sigmoid_bias.cu",
"gpu_ops/fused_rotary_position_encoding.cu",
"gpu_ops/text_image_gather_scatter.cu",
"gpu_ops/text_image_index_out.cu",
Expand Down
46 changes: 46 additions & 0 deletions fastdeploy/model_executor/layers/moe/fused_cast_sigmoid_bias.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

import paddle

from fastdeploy.model_executor.ops.gpu import (
fused_cast_sigmoid_bias as _fused_cast_sigmoid_bias_cuda,
)


def fused_cast_sigmoid_bias(
    gate_out: paddle.Tensor,
    e_score_correction_bias: paddle.Tensor,
    cast_type: str = "float32",
) -> tuple:
    """
    Fused cast + sigmoid + bias-add over MoE gating scores.

    Replaces three separate framework ops with one CUDA kernel:
        1. gate_out = gate_out.cast(cast_type)
        2. scores = sigmoid(gate_out)
        3. scores_with_bias = scores + e_score_correction_bias

    Args:
        gate_out: [num_tokens, num_experts] tensor (bf16/fp16/fp32) — raw gate output.
        e_score_correction_bias: [num_experts] fp32 tensor — correction bias.
        cast_type: output dtype name; one of "float32", "float16", "bfloat16".

    Returns:
        scores: [num_tokens, num_experts] in cast_type — sigmoid(gate_out).
        scores_with_bias: [num_tokens, num_experts] in cast_type — scores plus bias.
    """
    result = _fused_cast_sigmoid_bias_cuda(gate_out, e_score_correction_bias, cast_type)
    return result
Original file line number Diff line number Diff line change
Expand Up @@ -275,7 +275,6 @@ def apply_tp(
Paddle Cutlass compute Fused MoE.
"""
gate_out = gate(x)
gate_out = gate_out.cast("float32")
if layer.topk_method == "noaux_tc":
gate_out, topk_weights, topk_idx = get_moe_scores(
gate_out,
Expand All @@ -285,6 +284,7 @@ def apply_tp(
layer.routed_scaling_factor,
layer.gate_correction_bias,
getattr(layer, "renormalize", True),
use_fused_cast=True,
)
(
permute_input,
Expand All @@ -308,6 +308,7 @@ def apply_tp(
topk_only_mode=True,
)
else:
gate_out = gate_out.cast("float32")
(
permute_input,
token_nums_per_expert,
Expand Down
18 changes: 16 additions & 2 deletions fastdeploy/model_executor/layers/moe/moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,16 @@
from fastdeploy.model_executor.ops.gpu import noaux_tc, noaux_tc_redundant
except:
logger.warning("import noaux_tc Failed!")

try:
from fastdeploy.model_executor.layers.moe.fused_cast_sigmoid_bias import (
fused_cast_sigmoid_bias,
)

_FUSED_CAST_SIGMOID_BIAS_AVAILABLE = True
except Exception:
_FUSED_CAST_SIGMOID_BIAS_AVAILABLE = False

import numpy as np


Expand Down Expand Up @@ -90,13 +100,17 @@ def get_moe_scores(
expert_in_rank_num_list: paddle.Tensor = None,
tokens_per_expert_stats_list: paddle.Tensor = None,
redundant_ep_rank_num_plus_one: int = 1,
use_fused_cast: bool = False,
) -> paddle.Tensor:
"""
compute moe scores using e_score_correction_bias.
"""
scores = paddle.nn.functional.sigmoid(gating_output)
assert e_score_correction_bias is not None, "e_score_correction_bias is none!"
scores_with_bias = scores + e_score_correction_bias
if use_fused_cast and _FUSED_CAST_SIGMOID_BIAS_AVAILABLE:
scores, scores_with_bias = fused_cast_sigmoid_bias(gating_output, e_score_correction_bias)
else:
scores = paddle.nn.functional.sigmoid(gating_output)
scores_with_bias = scores + e_score_correction_bias
if expert_id_to_ep_rank_array is None:
scores, topk_values, topk_idx = noaux_tc(
scores,
Expand Down
Loading
Loading