Skip to content

Commit 4f5a294

Browse files
authored
feat(archon): add moe_router_dtype config for FP32 router gate GEMM (#1009)
Add configurable FP32 precision for MoE router gate GEMM to improve numerical stability with large expert counts, using a Megatron-Core-style custom torch.autograd.Function. Key changes: - Add moe_router_dtype field to ArchonEngineConfig (default "fp32") - Add router_dtype field to MoEArgs dataclass - Implement RouterGatingLinearFunction with FP32 forward/backward - Thread config from ArchonEngineConfig through to TokenChoiceTopKRouter - None means no override (use model dtype), "fp32" runs gate GEMM in float32 - Consolidate test_moe_args.py and test_router_fp32.py into test_moe_common.py
1 parent eb494de commit 4f5a294

10 files changed

Lines changed: 478 additions & 120 deletions

File tree

areal/api/cli_args.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -542,6 +542,17 @@ class ArchonEngineConfig:
542542
},
543543
)
544544

545+
# MoE
546+
moe_router_dtype: str | None = field(
547+
default="fp32",
548+
metadata={
549+
"help": "Data type for MoE router gate GEMM computation. "
550+
"'fp32' runs gate linear in float32 for numerical stability. "
551+
"None uses model dtype (no override).",
552+
"choices": ["fp32", None],
553+
},
554+
)
555+
545556
def __post_init__(self):
546557
if self.pp_layers_per_stage is not None and self.pp_layers_per_stage < 1:
547558
raise ValueError(
@@ -563,6 +574,12 @@ def __post_init__(self):
563574
f"reshard_after_forward_policy must be one of {valid_reshard_policies}, "
564575
f"got '{self.reshard_after_forward_policy}'"
565576
)
577+
valid_router_dtypes = ("fp32", None)
578+
if self.moe_router_dtype not in valid_router_dtypes:
579+
raise ValueError(
580+
f"moe_router_dtype must be one of {valid_router_dtypes}, "
581+
f"got '{self.moe_router_dtype}'"
582+
)
566583

567584

568585
# These configurations are used by Megatron Bridge to build Megatron models.

areal/experimental/engine/archon_engine.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -976,10 +976,15 @@ def _create_model_structure(self) -> nn.Module:
976976
)
977977
attn_type = "varlen"
978978

979+
# Map moe_router_dtype string config to torch.dtype; None means no override
980+
router_dtype = (
981+
torch.float32 if self.config.archon.moe_router_dtype == "fp32" else None
982+
)
979983
model_args = self.spec.model_args_class.from_hf_config(
980984
self.model_config,
981985
is_critic=self.config.is_critic,
982986
attn_type=attn_type,
987+
router_dtype=router_dtype,
983988
)
984989
return self.spec.model_class(model_args)
985990

areal/experimental/models/archon/moe/args.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
from dataclasses import dataclass
66
from typing import TYPE_CHECKING, Literal
77

8+
import torch
9+
810
if TYPE_CHECKING:
911
from transformers import PretrainedConfig
1012

@@ -25,6 +27,8 @@ class MoEArgs:
2527
route_norm: Whether to normalize routing scores.
2628
route_scale: Scale factor for routing scores.
2729
score_before_experts: Whether to apply scores before or after expert computation.
30+
router_dtype: Data type for router gate GEMM computation.
31+
If None, the model's default dtype is used (no override).
2832
2933
num_expert_groups: Number of expert groups for node-limited routing.
3034
If None, standard top-k routing is used.
@@ -51,6 +55,7 @@ class MoEArgs:
5155
route_norm: bool = False
5256
route_scale: float = 1.0
5357
score_before_experts: bool = False
58+
router_dtype: torch.dtype | None = None
5459

5560
# Node-limited routing (optional)
5661
num_expert_groups: int | None = None

areal/experimental/models/archon/moe/moe.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ def __init__(self, moe_args: MoEArgs, dim: int, hidden_dim: int):
7676
score_func=moe_args.score_func,
7777
route_norm=moe_args.route_norm,
7878
route_scale=moe_args.route_scale,
79+
router_dtype=moe_args.router_dtype,
7980
num_expert_groups=moe_args.num_expert_groups,
8081
num_limited_groups=moe_args.num_limited_groups,
8182
_debug_force_load_balance=moe_args._debug_force_load_balance,

areal/experimental/models/archon/moe/router.py

Lines changed: 67 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,75 @@
22

33
from __future__ import annotations
44

5-
from typing import Literal
5+
from typing import Any, Literal, cast
66

77
import torch
88
import torch.nn.functional as F
99
from torch import nn
1010

1111

12+
class RouterGatingLinearFunction(torch.autograd.Function):
13+
"""Custom autograd function for MoE router gate GEMM in higher precision.
14+
15+
Performs the gate linear layer (input @ weight.T) in the specified dtype
16+
while saving tensors in the original dtype for memory efficiency.
17+
18+
This is adapted from Megatron-Core's RouterGatingLinearFunction.
19+
"""
20+
21+
@staticmethod
22+
@torch.amp.custom_fwd(device_type="cuda")
23+
def forward(
24+
ctx: torch.autograd.function.FunctionCtx,
25+
input: torch.Tensor,
26+
weight: torch.Tensor,
27+
router_dtype: torch.dtype,
28+
) -> torch.Tensor:
29+
"""Forward pass: compute input @ weight.T in router_dtype.
30+
31+
Saves input and weight in their original dtype (BF16) for memory efficiency.
32+
"""
33+
ctx.save_for_backward(input, weight)
34+
cast(Any, ctx).router_dtype = router_dtype
35+
return torch.mm(input.to(router_dtype), weight.to(router_dtype).t())
36+
37+
@staticmethod
38+
@torch.amp.custom_bwd(device_type="cuda")
39+
def backward(
40+
ctx: torch.autograd.function.FunctionCtx,
41+
*grad_outputs: torch.Tensor,
42+
) -> tuple[torch.Tensor, torch.Tensor, None]:
43+
"""Backward pass: compute gradients in router_dtype, return in original dtype."""
44+
grad_output = grad_outputs[0]
45+
input, weight = cast(Any, ctx).saved_tensors
46+
router_dtype = cast(Any, ctx).router_dtype
47+
grad_output_fp = grad_output.to(router_dtype)
48+
grad_input = grad_output_fp.mm(weight.to(router_dtype)).to(input.dtype)
49+
grad_weight = grad_output_fp.t().mm(input.to(router_dtype)).to(weight.dtype)
50+
return grad_input, grad_weight, None
51+
52+
53+
def router_gating_linear(
54+
input: torch.Tensor, weight: torch.Tensor, router_dtype: torch.dtype | None
55+
) -> torch.Tensor:
56+
"""Apply router gate linear with optional dtype casting for numerical stability.
57+
58+
Args:
59+
input: Input tensor (num_tokens, dim).
60+
weight: Gate weight tensor (num_experts, dim).
61+
router_dtype: Dtype to use for GEMM. If None, uses standard F.linear.
62+
63+
Returns:
64+
Output tensor (num_tokens, num_experts).
65+
"""
66+
if router_dtype is not None:
67+
return cast(
68+
torch.Tensor,
69+
RouterGatingLinearFunction.apply(input, weight, router_dtype),
70+
)
71+
return F.linear(input, weight)
72+
73+
1274
class TokenChoiceTopKRouter(nn.Module):
1375
"""Token-choice routing with top-k expert selection.
1476
@@ -23,6 +85,7 @@ class TokenChoiceTopKRouter(nn.Module):
2385
score_func: Scoring function, either "softmax" or "sigmoid".
2486
route_norm: Whether to normalize routing scores after top-k selection.
2587
route_scale: Scale factor applied to routing scores.
88+
router_dtype: Data type for gate GEMM. If None, uses model dtype (no override).
2689
num_expert_groups: Number of expert groups for node-limited routing.
2790
If None, standard top-k routing is used.
2891
num_limited_groups: Number of groups to select in node-limited routing.
@@ -41,6 +104,7 @@ def __init__(
41104
score_func: Literal["softmax", "sigmoid"] = "sigmoid",
42105
route_norm: bool = False,
43106
route_scale: float = 1.0,
107+
router_dtype: torch.dtype | None = None,
44108
num_expert_groups: int | None = None,
45109
num_limited_groups: int | None = None,
46110
_debug_force_load_balance: bool = False,
@@ -52,6 +116,7 @@ def __init__(
52116
self.score_func = score_func
53117
self.route_norm = route_norm
54118
self.route_scale = route_scale
119+
self.router_dtype = router_dtype
55120
self.num_expert_groups = num_expert_groups
56121
self.num_limited_groups = num_limited_groups
57122
self._debug_force_load_balance = _debug_force_load_balance
@@ -147,7 +212,7 @@ def forward(
147212
- num_tokens_per_expert: Token count per expert, shape (num_experts,).
148213
"""
149214
# Compute gate scores: (num_tokens, num_experts)
150-
scores = self.gate(x)
215+
scores = router_gating_linear(x, self.gate.weight, self.router_dtype)
151216

152217
# Apply scoring function in float32 to avoid loss explosion
153218
if self.score_func == "sigmoid":

areal/experimental/models/archon/qwen3/model/args.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,9 @@ def from_hf_config(
7070
# Override with additional fields from HF config
7171
if hasattr(hf_config, "num_shared_experts"):
7272
moe_args.num_shared_experts = hf_config.num_shared_experts
73+
router_dtype = kwargs.get("router_dtype", None)
74+
if router_dtype is not None:
75+
moe_args.router_dtype = router_dtype
7376

7477
# Get decoder_sparse_step (default to 1 = all MoE layers)
7578
decoder_sparse_step = getattr(hf_config, "decoder_sparse_step", 1)

docs/en/cli_reference.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -808,6 +808,7 @@ Configuration for Archon Engine training backend.
808808
| `pp_last_stage_less_layers` | integer | `1` | Number of layers to reduce in the last pipeline stage. Accounts for output layer overhead. |
809809
| `reshard_after_forward_policy` | string | `"default"` | FSDP reshard policy after forward pass. 'default': reshard when pipeline parallelism is off; keep unsharded when on to avoid repeated all-gather per microbatch. 'always': always reshard after forward (saves memory). 'never': never reshard after forward. **Choices:** `default`, `always`, `never` |
810810
| `use_deterministic_algorithms` | boolean | `False` | Enable deterministic algorithms for training reproducibility. Sets torch.use_deterministic_algorithms(True, warn_only=True), CUBLAS_WORKSPACE_CONFIG, NCCL_ALGO, and TORCH_COMPILE_DETERMINISTIC. May reduce performance. |
811+
| `moe_router_dtype` | string \| None | `"fp32"` | Data type for MoE router gate GEMM computation. 'fp32' runs gate linear in float32 for numerical stability. None uses model dtype (no override). **Choices:** `fp32`, `None` |
811812

812813
(section-distributed-data-parallel)=
813814

docs/zh/cli_reference.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -806,6 +806,7 @@ Configuration for Archon Engine training backend.
806806
| `pp_last_stage_less_layers` | integer | `1` | Number of layers to reduce in the last pipeline stage. Accounts for output layer overhead. |
807807
| `reshard_after_forward_policy` | string | `"default"` | FSDP reshard policy after forward pass. 'default': reshard when pipeline parallelism is off; keep unsharded when on to avoid repeated all-gather per microbatch. 'always': always reshard after forward (saves memory). 'never': never reshard after forward. **Choices:** `default`, `always`, `never` |
808808
| `use_deterministic_algorithms` | boolean | `False` | Enable deterministic algorithms for training reproducibility. Sets torch.use_deterministic_algorithms(True, warn_only=True), CUBLAS_WORKSPACE_CONFIG, NCCL_ALGO, and TORCH_COMPILE_DETERMINISTIC. May reduce performance. |
809+
| `moe_router_dtype` | string \| None | `"fp32"` | Data type for MoE router gate GEMM computation. 'fp32' runs gate linear in float32 for numerical stability. None uses model dtype (no override). **Choices:** `fp32`, `None` |
809810

810811
(section-distributed-data-parallel)=
811812

tests/experimental/archon/test_moe_args.py

Lines changed: 0 additions & 118 deletions
This file was deleted.

0 commit comments

Comments (0)