Skip to content

Commit 0e1d9fc

Browse files
committed
[XPU] glm-4.5-air
1 parent 5f612a3 commit 0e1d9fc

File tree

7 files changed

+28
-16
lines changed

7 files changed

+28
-16
lines changed

custom_ops/xpu_ops/src/ops/block_attn.cc

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,7 @@ std::vector<paddle::Tensor> BlockAttnKernel(
156156
rope_head_dim = rotary_embs.dims()[4];
157157
}
158158
std::string pos_emb_type;
159-
if (use_neox_rotary_style == true) {
159+
if (use_neox_rotary_style) {
160160
pos_emb_type = "NEOX";
161161
} else if (rope_head_dim == head_dim / 2) {
162162
pos_emb_type = "HALF_HEAD_DIM";
@@ -342,12 +342,14 @@ std::vector<paddle::Tensor> BlockAttnKernel(
342342
value_cache.data<cdata_t>())),
343343
vsl.usual_lod_vp, // seq_lod
344344
vsl.slot_mapping_vp, // real_batch
345+
prefix_lens_vp, // start_tokens
345346
param.batch_size, // batch_size
346347
1, // emb_batch_size
347348
rope_max_seqlen, // max_seqlen
348349
param.head_num,
349350
param.kv_head_num,
350351
param.head_dim,
352+
rope_head_dim,
351353
param.max_batch_size,
352354
block_size,
353355
max_block_per_seq,
@@ -598,14 +600,16 @@ std::vector<paddle::Tensor> BlockAttnKernel(
598600
key_cache.data<cdata_t>())),
599601
const_cast<XPU_CType*>(reinterpret_cast<const XPU_CType*>(
600602
value_cache.data<cdata_t>())),
601-
decoder_seq_lod_vp, // seq_lod
602-
decoder_batch_map_vp, // real_batch
603-
param.batch_size, // batch_size
604-
1, // emb_batch_size
605-
rope_max_seqlen, // max_seqlen
603+
decoder_seq_lod_vp, // seq_lod
604+
decoder_batch_map_vp, // real_batch
605+
decoder_context_len_cache_vp, // start_tokens
606+
param.batch_size, // batch_size
607+
1, // emb_batch_size
608+
rope_max_seqlen, // max_seqlen
606609
param.head_num,
607610
param.kv_head_num,
608611
param.head_dim,
612+
rope_head_dim,
609613
param.max_batch_size,
610614
block_size,
611615
max_block_per_seq,
@@ -806,6 +810,7 @@ std::vector<paddle::Tensor> BlockAttnKernel(
806810
param.head_num,
807811
param.kv_head_num,
808812
param.head_dim,
813+
rope_head_dim,
809814
param.max_batch_size,
810815
block_size,
811816
max_block_per_seq,

custom_ops/xpu_ops/src/ops/fused_noaux_tc.cc

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -76,19 +76,19 @@ std::vector<std::vector<int64_t>> FusedNoAuxTcInferShape(
7676
const float routed_scaling_factor) {
7777
std::vector<int64_t> topk_ids_shape = {gating_logits_shape[0], top_k};
7878
std::vector<int64_t> topk_weights_shape = {gating_logits_shape[0], top_k};
79-
return {gating_logits_shape, topk_ids_shape, topk_weights_shape};
79+
return {gating_logits_shape, topk_weights_shape, topk_ids_shape};
8080
}
8181

8282
std::vector<paddle::DataType> FusedNoAuxTcInferDtype(
8383
const paddle::DataType& gating_logits_dtype,
8484
const paddle::DataType& bias_dtype) {
8585
return {
86-
gating_logits_dtype, paddle::DataType::INT64, paddle::DataType::FLOAT32};
86+
gating_logits_dtype, paddle::DataType::FLOAT32, paddle::DataType::INT32};
8787
}
8888

8989
PD_BUILD_STATIC_OP(fused_noaux_tc)
9090
.Inputs({"gating_logits", "bias"})
91-
.Outputs({"gating_logits_out", "topk_ids", "topk_weights"})
91+
.Outputs({"gating_logits_out", "topk_weights", "topk_ids"})
9292
.Attrs({"n_group: int",
9393
"topk_group: int",
9494
"top_k: int",

fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -313,7 +313,7 @@ def apply_tp(
313313
"""
314314
gate_out = gate(x.cast("float32"))
315315
if layer.topk_method == "noaux_tc":
316-
_, topk_idx, topk_weights = get_moe_scores(
316+
_, topk_weights, topk_idx = get_moe_scores(
317317
gate_out,
318318
layer.n_group,
319319
layer.topk_group,

fastdeploy/model_executor/layers/linear.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,8 @@ def create_weights(self, layer: nn.Layer, **extra_weight_attrs):
6161
)
6262

6363
if self.model_format == "torch" and "output_dim" in extra_weight_attrs:
64-
extra_weight_attrs["output_dim"] = not extra_weight_attrs["output_dim"]
64+
if extra_weight_attrs["output_dim"] is not None:
65+
extra_weight_attrs["output_dim"] = not extra_weight_attrs["output_dim"]
6566

6667
set_weight_attrs(
6768
layer.weight,

fastdeploy/model_executor/layers/rotary_embedding.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ def __call__(self, position_ids):
4343
inv_freq = self.base ** (-paddle.arange(0, self.rotary_dim, 2, dtype="float32") / self.rotary_dim)
4444
partial_rotary_position_ids = position_ids / self.partial_rotary_factor
4545
freqs = paddle.einsum("ij,k->ijk", partial_rotary_position_ids.cast("float32"), inv_freq)
46-
if paddle.is_compiled_with_xpu() or paddle.is_compiled_with_custom_device("iluvatar_gpu"):
46+
if current_platform.is_xpu() or paddle.is_compiled_with_custom_device("iluvatar_gpu"):
4747
# shape: [B, S, D]
4848
rot_emb = paddle.zeros((2, bsz, max_seq_len, 1, self.rotary_dim), dtype="float32")
4949
emb = paddle.stack([freqs, freqs], axis=-1).reshape((bsz, max_seq_len, self.rotary_dim))
@@ -89,9 +89,14 @@ def __call__(self, position_ids):
8989
bsz, max_seq_len = position_ids.shape[:2]
9090
inv_freq = self.base ** (-paddle.arange(0, self.rotary_dim, 2, dtype="float32") / self.rotary_dim)
9191
freqs = paddle.einsum("ij,k->ijk", position_ids.cast("float32"), inv_freq)
92-
# shape: [B, S, D/2]
93-
rot_emb = paddle.zeros((2, bsz, max_seq_len, 1, self.rotary_dim // 2), dtype="float32")
94-
emb = paddle.stack([freqs], axis=-1).reshape((bsz, max_seq_len, self.rotary_dim // 2))
92+
if current_platform.is_xpu():
93+
# shape: [B, S, D]
94+
rot_emb = paddle.zeros((2, bsz, max_seq_len, 1, self.rotary_dim), dtype="float32")
95+
emb = paddle.concat([freqs, freqs], axis=-1).reshape((bsz, max_seq_len, self.rotary_dim))
96+
else:
97+
# shape: [B, S, D/2]
98+
rot_emb = paddle.zeros((2, bsz, max_seq_len, 1, self.rotary_dim // 2), dtype="float32")
99+
emb = paddle.stack([freqs], axis=-1).reshape((bsz, max_seq_len, self.rotary_dim // 2))
95100
# shape: [B, S, 1, D]
96101
emb = paddle.unsqueeze(emb, 2)
97102
rot_emb[0] = paddle.cos(emb)

fastdeploy/model_executor/models/glm4_moe.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ def __init__(
7272
fd_config=fd_config,
7373
prefix=f"{prefix}.up_gate_proj",
7474
input_size=fd_config.model_config.hidden_size,
75-
output_size=[intermediate_size, intermediate_size],
75+
output_sizes=[intermediate_size, intermediate_size],
7676
with_bias=False,
7777
)
7878

fastdeploy/worker/xpu_model_runner.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -990,6 +990,7 @@ def _init_share_inputs(self, max_num_seqs: int):
990990
position_ids=tmp_position_ids,
991991
base=self.model_config.rope_theta,
992992
model_config=self.model_config,
993+
partial_rotary_factor=self.model_config.partial_rotary_factor,
993994
)
994995

995996
# Set block tables

0 commit comments

Comments (0)