[OMNIML-5003] Restrict non-gated detection to single up_proj (review)

jenchen13 · claude · jenchen13 · commit 4ebc3bf83426 · 2026-06-16T14:47:29.000-07:00
Address review feedback:
- _fused_experts_wrapper_class now claims _QuantNonGatedFusedExperts only for a
  3-D up_proj with no gate_proj and no gate_up_proj. A split-gated container
  (separate 3-D gate_proj/up_proj/down_proj, three F.linear calls per expert)
  falls through to None/unsupported instead of being mis-wrapped, since the
  two-call toggle and up_proj-storage index recovery assume exactly two calls.
- Add test_split_gated_layout_not_claimed_as_nongated and
  test_get_quant_config_resolves_nongated_experts (down_proj anchors format /
  has_quantizers detection, so the produced quant config is correct).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Signed-off-by: Jennifer Chen <jennifchen@nvidia.com>
diff --git a/modelopt/torch/quantization/plugins/huggingface.py b/modelopt/torch/quantization/plugins/huggingface.py
@@ -1502,8 +1502,8 @@ def _fused_experts_wrapper_class(module):
       ``MixtralExperts``, ``Qwen2MoeExperts``, ``Qwen3MoeExperts``,
       ``Qwen3_5MoeExperts``, ``DeepseekV3NaiveMoe``, ``JambaExperts``,
       ``OlmoeExperts``, etc.
-    * non-gated (``_QuantNonGatedFusedExperts``): a 3-D ``up_proj`` and no
-      ``gate_up_proj``. Matches NemotronH ``NemotronHExperts``.
+    * non-gated (``_QuantNonGatedFusedExperts``): a 3-D ``up_proj`` with no
+      ``gate_proj`` and no ``gate_up_proj``. Matches NemotronH ``NemotronHExperts``.
 
     Returns ``None`` for non-standard layouts (DBRX, GptOss, GraniteMoE,
     Llama4TextExperts) which have their own explicit registrations.
@@ -1518,7 +1518,14 @@ def _fused_experts_wrapper_class(module):
         return _QuantFusedExperts
     up = getattr(module, "up_proj", None)
     if isinstance(up, (nn.Parameter, Tensor)) and up.dim() == 3:
-        return _QuantNonGatedFusedExperts
+        # Strictly single up_proj/down_proj only. A split-gated container with a
+        # separate gate projection (3-D gate_proj or gate_up_proj) makes three
+        # F.linear calls per expert, which would break _QuantNonGatedFusedExperts'
+        # two-call toggle and its up_proj-storage expert-index recovery. Such a
+        # layout is unsupported here (falls through to None) rather than silently
+        # mis-quantizing the wrong projection.
+        if getattr(module, "gate_proj", None) is None and gate_up is None:
+            return _QuantNonGatedFusedExperts
     return None
 
 
diff --git a/tests/unit/torch/quantization/plugins/test_fused_experts.py b/tests/unit/torch/quantization/plugins/test_fused_experts.py
@@ -24,7 +24,7 @@
 
 import modelopt.torch.quantization as mtq
 from modelopt.torch.export.moe_utils import _export_fused_experts
-from modelopt.torch.export.quant_utils import get_quant_config
+from modelopt.torch.export.quant_utils import get_quant_config, get_quantization_format
 from modelopt.torch.quantization.conversion import _normalize_fused_experts_quantizer_name
 from modelopt.torch.quantization.model_calib import local_hessian_calibrate
 from modelopt.torch.quantization.nn import QuantModuleRegistry, TensorQuantizer
@@ -1191,3 +1191,53 @@ def test_enumeration_yields_up_and_down_proj(self):
             assert set(weight_attr_names(converted)) == {"up_proj", "down_proj"}
         finally:
             self._cleanup_registry(expert_type)
+
+    def test_split_gated_layout_not_claimed_as_nongated(self):
+        """A fused container with a separate 3-D gate_proj (split-gated: three
+        F.linear calls per expert) must NOT be claimed by the non-gated wrapper,
+        whose two-call toggle and up_proj-storage index recovery assume exactly
+        two projections. It is left unsupported (None) rather than mis-quantized."""
+
+        class _SplitGatedExperts(nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.num_experts = NUM_EXPERTS
+                self.gate_proj = nn.Parameter(
+                    torch.randn(NUM_EXPERTS, INTERMEDIATE_DIM, HIDDEN_DIM) * 0.02
+                )
+                self.up_proj = nn.Parameter(
+                    torch.randn(NUM_EXPERTS, INTERMEDIATE_DIM, HIDDEN_DIM) * 0.02
+                )
+                self.down_proj = nn.Parameter(
+                    torch.randn(NUM_EXPERTS, HIDDEN_DIM, INTERMEDIATE_DIM) * 0.02
+                )
+                self.act_fn = nn.SiLU()
+
+        module = _SplitGatedExperts()
+        assert _fused_experts_wrapper_class(module) is None
+        assert _is_fused_experts_module(module) is False
+
+    def test_get_quant_config_resolves_nongated_experts(self):
+        """get_quant_config must detect the non-gated experts as quantized. The
+        up_proj weight name does not resolve to its quantizers (they live on the
+        gate_up_proj sentinel list), but down_proj anchors both has_quantizers
+        (down_proj_input_quantizer) and format detection (down_proj_weight_quantizers),
+        so the produced config is correct."""
+        model = _TinyNonGatedMoEModel()
+        expert_type = type(model.moe.experts)
+        self._cleanup_registry(expert_type)
+
+        def forward_loop(m):
+            torch.manual_seed(0)
+            for _ in range(2):
+                m(torch.randn(1, 4, HIDDEN_DIM))
+
+        try:
+            mtq.quantize(model, self._nongated_fp8_cfg(), forward_loop=forward_loop)
+            # Format resolves (via down_proj) instead of QUANTIZATION_NONE (None).
+            assert get_quantization_format(model.moe.experts) is not None
+            # The non-gated experts are reflected in the produced quant config.
+            quant = get_quant_config(model)["quantization"]
+            assert quant.get("quant_algo") is not None
+        finally:
+            self._cleanup_registry(expert_type)