diff --git a/custom_ops/xpu_ops/src/ops/block_attn.cc b/custom_ops/xpu_ops/src/ops/block_attn.cc index c055bfb873d..073c9990a3d 100644 --- a/custom_ops/xpu_ops/src/ops/block_attn.cc +++ b/custom_ops/xpu_ops/src/ops/block_attn.cc @@ -158,7 +158,7 @@ std::vector BlockAttnKernel( rope_head_dim = rotary_embs.dims()[4]; } std::string pos_emb_type; - if (use_neox_rotary_style == true) { + if (use_neox_rotary_style) { pos_emb_type = "NEOX"; } else if (rope_head_dim == head_dim / 2) { pos_emb_type = "HALF_HEAD_DIM"; @@ -344,12 +344,14 @@ std::vector BlockAttnKernel( value_cache.data())), vsl.usual_lod_vp, // seq_lod vsl.slot_mapping_vp, // real_batch + prefix_lens_vp, // start_tokens param.batch_size, // batch_size 1, // emb_batch_size rope_max_seqlen, // max_seqlen param.head_num, param.kv_head_num, param.head_dim, + rope_head_dim, param.max_batch_size, block_size, max_block_per_seq, @@ -600,14 +602,16 @@ std::vector BlockAttnKernel( key_cache.data())), const_cast(reinterpret_cast( value_cache.data())), - decoder_seq_lod_vp, // seq_lod - decoder_batch_map_vp, // real_batch - param.batch_size, // batch_size - 1, // emb_batch_size - rope_max_seqlen, // max_seqlen + decoder_seq_lod_vp, // seq_lod + decoder_batch_map_vp, // real_batch + decoder_context_len_cache_vp, // start_tokens + param.batch_size, // batch_size + 1, // emb_batch_size + rope_max_seqlen, // max_seqlen param.head_num, param.kv_head_num, param.head_dim, + rope_head_dim, param.max_batch_size, block_size, max_block_per_seq, @@ -808,6 +812,7 @@ std::vector BlockAttnKernel( param.head_num, param.kv_head_num, param.head_dim, + rope_head_dim, param.max_batch_size, block_size, max_block_per_seq, diff --git a/custom_ops/xpu_ops/src/ops/fused_noaux_tc.cc b/custom_ops/xpu_ops/src/ops/fused_noaux_tc.cc index 66dbe7d3f9d..5e107571072 100644 --- a/custom_ops/xpu_ops/src/ops/fused_noaux_tc.cc +++ b/custom_ops/xpu_ops/src/ops/fused_noaux_tc.cc @@ -76,19 +76,19 @@ std::vector> FusedNoAuxTcInferShape( const float routed_scaling_factor) { std::vector topk_ids_shape = {gating_logits_shape[0], top_k}; std::vector topk_weights_shape = {gating_logits_shape[0], top_k}; - return {gating_logits_shape, topk_ids_shape, topk_weights_shape}; + return {gating_logits_shape, topk_weights_shape, topk_ids_shape}; } std::vector FusedNoAuxTcInferDtype( const paddle::DataType& gating_logits_dtype, const paddle::DataType& bias_dtype) { return { - gating_logits_dtype, paddle::DataType::INT64, paddle::DataType::FLOAT32}; + gating_logits_dtype, paddle::DataType::FLOAT32, paddle::DataType::INT32}; } PD_BUILD_STATIC_OP(fused_noaux_tc) .Inputs({"gating_logits", "bias"}) - .Outputs({"gating_logits_out", "topk_ids", "topk_weights"}) + .Outputs({"gating_logits_out", "topk_weights", "topk_ids"}) .Attrs({"n_group: int", "topk_group: int", "top_k: int", diff --git a/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py b/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py index 085c202c9a2..f2c37452ca4 100644 --- a/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py +++ b/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py @@ -313,7 +313,7 @@ def apply_tp( """ gate_out = gate(x.cast("float32")) if layer.topk_method == "noaux_tc": - _, topk_idx, topk_weights = get_moe_scores( + _, topk_weights, topk_idx = get_moe_scores( gate_out, layer.n_group, layer.topk_group, diff --git a/fastdeploy/model_executor/layers/linear.py b/fastdeploy/model_executor/layers/linear.py index 2bee885ff43..3d0377900f4 100644 --- a/fastdeploy/model_executor/layers/linear.py +++ b/fastdeploy/model_executor/layers/linear.py @@ -61,7 +61,8 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs): ) if self.model_format == "torch" and "output_dim" in extra_weight_attrs: - extra_weight_attrs["output_dim"] = not extra_weight_attrs["output_dim"] + if extra_weight_attrs["output_dim"] is not None: + extra_weight_attrs["output_dim"] = not extra_weight_attrs["output_dim"] set_weight_attrs( layer.weight, diff --git a/fastdeploy/model_executor/layers/rotary_embedding.py b/fastdeploy/model_executor/layers/rotary_embedding.py index af7203ed6f1..9aa73f87c57 100644 --- a/fastdeploy/model_executor/layers/rotary_embedding.py +++ b/fastdeploy/model_executor/layers/rotary_embedding.py @@ -43,7 +43,7 @@ def __call__(self, position_ids): inv_freq = self.base ** (-paddle.arange(0, self.rotary_dim, 2, dtype="float32") / self.rotary_dim) partial_rotary_position_ids = position_ids / self.partial_rotary_factor freqs = paddle.einsum("ij,k->ijk", partial_rotary_position_ids.cast("float32"), inv_freq) - if paddle.is_compiled_with_xpu() or paddle.is_compiled_with_custom_device("iluvatar_gpu"): + if current_platform.is_xpu() or paddle.is_compiled_with_custom_device("iluvatar_gpu"): # shape: [B, S, D] rot_emb = paddle.zeros((2, bsz, max_seq_len, 1, self.rotary_dim), dtype="float32") emb = paddle.stack([freqs, freqs], axis=-1).reshape((bsz, max_seq_len, self.rotary_dim)) @@ -89,9 +89,14 @@ def __call__(self, position_ids): bsz, max_seq_len = position_ids.shape[:2] inv_freq = self.base ** (-paddle.arange(0, self.rotary_dim, 2, dtype="float32") / self.rotary_dim) freqs = paddle.einsum("ij,k->ijk", position_ids.cast("float32"), inv_freq) - # shape: [B, S, D/2] - rot_emb = paddle.zeros((2, bsz, max_seq_len, 1, self.rotary_dim // 2), dtype="float32") - emb = paddle.stack([freqs], axis=-1).reshape((bsz, max_seq_len, self.rotary_dim // 2)) + if current_platform.is_xpu(): + # shape: [B, S, D] + rot_emb = paddle.zeros((2, bsz, max_seq_len, 1, self.rotary_dim), dtype="float32") + emb = paddle.concat([freqs, freqs], axis=-1).reshape((bsz, max_seq_len, self.rotary_dim)) + else: + # shape: [B, S, D/2] + rot_emb = paddle.zeros((2, bsz, max_seq_len, 1, self.rotary_dim // 2), dtype="float32") + emb = paddle.stack([freqs], axis=-1).reshape((bsz, max_seq_len, self.rotary_dim // 2)) # shape: [B, S, 1, D] emb = paddle.unsqueeze(emb, 2) rot_emb[0] = paddle.cos(emb) diff --git a/fastdeploy/model_executor/models/glm4_moe.py b/fastdeploy/model_executor/models/glm4_moe.py index 3f45e9df614..e49fe6d59f1 100644 --- a/fastdeploy/model_executor/models/glm4_moe.py +++ b/fastdeploy/model_executor/models/glm4_moe.py @@ -71,7 +71,7 @@ def __init__( fd_config=fd_config, prefix=f"{prefix}.up_gate_proj", input_size=fd_config.model_config.hidden_size, - output_size=[intermediate_size, intermediate_size], + output_sizes=[intermediate_size, intermediate_size], with_bias=False, ) diff --git a/fastdeploy/worker/xpu_model_runner.py b/fastdeploy/worker/xpu_model_runner.py index a984e8788c4..fa04e2b7983 100644 --- a/fastdeploy/worker/xpu_model_runner.py +++ b/fastdeploy/worker/xpu_model_runner.py @@ -990,6 +990,7 @@ def _init_share_inputs(self, max_num_seqs: int): position_ids=tmp_position_ids, base=self.model_config.rope_theta, model_config=self.model_config, + partial_rotary_factor=self.model_config.partial_rotary_factor, ) # Set block tables