@@ -245,16 +245,17 @@ def moe_topk_select(
         probs_for_choice.reshape([seq_length, n_group, -1]).topk(2, axis=-1)[0].sum(axis=-1)
     )  # [seq_len, n_group]
     group_idx = paddle.topk(group_scores, k=topk_group, axis=-1, sorted=True)[1]  # [seq_len, topk_group]
-    group_mask = paddle.zeros_like(group_scores).put_along_axis(
-        group_idx, paddle.to_tensor(1.0, dtype=group_scores.dtype), axis=-1
+    group_mask = paddle.sum(
+        paddle.nn.functional.one_hot(group_idx, num_classes=n_group).cast(group_scores.dtype),
+        axis=1,  # Sum over topk_group dimension -> [seq_len, n_group]
     )
Comment on lines +248 to 251
Copilot AI Mar 30, 2026

Building group_mask via one_hot(group_idx, num_classes=n_group) materializes a dense tensor of shape [seq_len, topk_group, n_group], which can significantly increase memory traffic (and potentially hurt latency) for large seq_len/n_group. Consider using a scatter/put-along-axis style update that writes ones directly into a [seq_len, n_group] buffer (e.g., initialize zeros then scatter indices with a ones tensor shaped like group_idx), which avoids the large intermediate.

Suggested change
-    group_mask = paddle.sum(
-        paddle.nn.functional.one_hot(group_idx, num_classes=n_group).cast(group_scores.dtype),
-        axis=1,  # Sum over topk_group dimension -> [seq_len, n_group]
-    )
+    # Build group_mask of shape [seq_len, n_group] without materializing a large one-hot tensor
+    group_mask = paddle.zeros([seq_length, n_group], dtype=group_scores.dtype)
+    updates = paddle.ones_like(group_idx, dtype=group_scores.dtype)
+    group_mask = paddle.put_along_axis(group_mask, group_idx, updates, axis=1, reduce="assign")
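
As a standalone sanity check on this suggestion (not part of the diff; the toy sizes below are invented for illustration), both constructions should produce the same 0/1 mask, since paddle.topk returns distinct indices within each row:

import paddle

seq_length, n_group, topk_group = 4, 8, 2  # hypothetical toy sizes
group_scores = paddle.rand([seq_length, n_group])
group_idx = paddle.topk(group_scores, k=topk_group, axis=-1, sorted=True)[1]

# one_hot + sum (PR version): materializes a [seq_len, topk_group, n_group] intermediate
mask_onehot = paddle.sum(
    paddle.nn.functional.one_hot(group_idx, num_classes=n_group).cast(group_scores.dtype),
    axis=1,
)

# scatter-style update (suggested version): writes ones directly into [seq_len, n_group]
mask_scatter = paddle.put_along_axis(
    paddle.zeros([seq_length, n_group], dtype=group_scores.dtype),
    group_idx,
    paddle.ones_like(group_idx, dtype=group_scores.dtype),
    axis=1,
    reduce="assign",
)

assert paddle.allclose(mask_onehot, mask_scatter)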

     score_mask = (
         group_mask.unsqueeze(-1).expand([seq_length, n_group, n_experts // n_group]).reshape([seq_length, -1])
     )  # [seq_len, n_experts]
     probs_for_choice = probs_for_choice.masked_fill(~score_mask.astype(paddle.bool), float("-inf"))

     _, topk_ids = paddle.topk(probs_for_choice, top_k, axis=-1)
-    topk_weights = paddle.take_along_axis(gate_probs, topk_ids, axis=-1)
+    topk_weights = paddle.index_sample(gate_probs, topk_ids)
Copilot AI Mar 30, 2026

paddle.index_sample has a narrower contract than take_along_axis: it samples along a fixed dimension (commonly axis=1) and (in many Paddle versions) assumes x is 2-D [N, M] and index is [N, K]. If gate_probs can ever be higher-rank or if the intended gather axis is not the second dimension, this will return incorrect results or error at runtime. If you need general axis=-1 semantics, consider switching to an index construction using gather_nd (batch indices + topk_ids) or explicitly reshape/validate gate_probs to 2-D before calling index_sample.

Suggested change
-    topk_weights = paddle.index_sample(gate_probs, topk_ids)
+    # Use paddle.index_sample with its 2-D [N, M] / [N, K] contract by flattening
+    # all leading dimensions into a single batch dimension, and gather along the
+    # last axis. This is equivalent to take_along_axis(..., axis=-1) but robust
+    # to higher-rank gate_probs/topk_ids.
+    last_expert_dim = gate_probs.shape[-1]
+    flat_batch = paddle.numel(gate_probs) // last_expert_dim
+    gate_probs_2d = gate_probs.reshape([flat_batch, last_expert_dim])
+    topk_last_dim = topk_ids.shape[-1]
+    topk_ids_2d = topk_ids.reshape([flat_batch, topk_last_dim])
+    topk_weights_2d = paddle.index_sample(gate_probs_2d, topk_ids_2d)
+    topk_weights = topk_weights_2d.reshape(topk_ids.shape)
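
Again as a standalone check (outside the diff, with toy shapes made up for illustration): on a rank-3 gate_probs, flattening the leading dimensions before paddle.index_sample should reproduce paddle.take_along_axis(..., axis=-1):

import paddle

batch, seq_len, n_experts, top_k = 2, 3, 8, 2  # hypothetical toy sizes
gate_probs = paddle.nn.functional.softmax(paddle.randn([batch, seq_len, n_experts]), axis=-1)
topk_ids = paddle.topk(gate_probs, top_k, axis=-1)[1]

expected = paddle.take_along_axis(gate_probs, topk_ids, axis=-1)

# flatten all leading dims into one batch dim, gather, then restore the shape
flat_batch = batch * seq_len
gathered = paddle.index_sample(
    gate_probs.reshape([flat_batch, n_experts]),
    topk_ids.reshape([flat_batch, top_k]),
)
assert paddle.allclose(expected, gathered.reshape(topk_ids.shape))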


     # normalize combine weights
     if renormalize: