refactor: relocate tied-weight helpers per Edward's review

juhi10071998 · claude · juhi10071998 · commit b5f181ec429d · 2026-06-17T01:45:27.000Z
Move three private helpers out of unified_export_hf.py into the more
specific util modules suggested in the round-2 review:

- sync_tied_input_amax → modelopt/torch/export/quant_utils.py
  Quantizer-amax merge helper, fits the file's "Utils for quantization
  including scaling factors adjustments" charter (sibling to
  adjust_attn_amax_values, get_*_scaling_factor, etc.). Uses the
  file's existing `from warnings import warn` rather than a new
  warnings import.

- _collect_canonical_tied_patterns → modelopt/torch/export/model_utils.py
- _reorder_canonical_first       → modelopt/torch/export/model_utils.py
  Both are model-introspection helpers that walk the model graph and
  read HF's _tied_weights_keys declaration, which fits the file's
  "model type detection and classification" charter (sibling to
  get_model_type, is_multimodal_model). Added `import re` to support
  the regex compilation in _collect_canonical_tied_patterns.

unified_export_hf.py now imports the three helpers from their new
homes and uses them unchanged. No behavioral change.

tests/unit/torch/export/test_unified_export_hf.py updates its imports
to pull the three helpers from the new locations.

Co-Authored-By: Claude Opus 4.7 (1M context) &lt;noreply@anthropic.com&gt;
Signed-off-by: Juhi Mittal &lt;juhim@nvidia.com&gt;
diff --git a/modelopt/torch/export/model_utils.py b/modelopt/torch/export/model_utils.py
@@ -14,6 +14,8 @@
 # limitations under the License.
 """Utility functions for model type detection and classification."""
 
+import re
+
 import torch.nn as nn
 
 MODEL_NAME_TO_TYPE = {
@@ -160,3 +162,84 @@ def get_language_model_from_vl(model) -> list[nn.Module] | None:
 
     # Pattern 4: No language_model found
     return None
+
+
+def _collect_canonical_tied_patterns(
+    model: nn.Module,
+) -> tuple[list[re.Pattern], list[str]]:
+    """Walk the model and collect canonical-side tied-weight matchers.
+
+    Patterns are submodule-prefixed regexes from each module's
+    ``_tied_weights_keys`` dict-style declaration (the prefix matters
+    for nested models where the dict lives on an inner submodule).
+    Side substrings are dot-separated tokens that appear only on the
+    canonical side of those declarations — needed because modelopt's
+    per-expert unpacking creates post-export keys (e.g.
+    ``…experts.Y.gate_proj.input_scale``) that HF's regexes never knew
+    about. List-style (legacy) declarations are skipped.
+    """
+    patterns: list[re.Pattern] = []
+    alias_token_set: set[str] = set()
+    canonical_token_set: set[str] = set()
+
+    def _tokens(s: str) -> set[str]:
+        """Identifiers in a regex string, with regex specials as separators."""
+        return {tok for tok in re.split(r"[^A-Za-z0-9_]+", s) if tok}
+
+    for name, submodule in model.named_modules():
+        tied = getattr(submodule, "_tied_weights_keys", None)
+        if not isinstance(tied, dict) or not tied:
+            continue
+        prefix = f"{name}." if name else ""
+        for alias_pat, canonical_pat in tied.items():
+            patterns.append(re.compile(prefix + canonical_pat))
+            alias_token_set.update(_tokens(prefix + alias_pat))
+            canonical_token_set.update(_tokens(prefix + canonical_pat))
+
+    # Tokens unique to the canonical side become substring matchers.
+    side_substrings = sorted(canonical_token_set - alias_token_set)
+    return patterns, side_substrings
+
+
+def _reorder_canonical_first(state_dict: dict, model: nn.Module) -> dict:
+    r"""Reorder ``state_dict`` so canonical-side tied keys iterate first.
+
+    Lets the downstream first-wins data_ptr dedup keep canonical names.
+    Uses both regex patterns and substring matchers from
+    :func:`_collect_canonical_tied_patterns`. Gated on the model class
+    name to scope the reorder to DiffusionGemma; other tied
+    encoder-decoder models that ship dict-style ``_tied_weights_keys``
+    can be added to the allowlist here. Mirrors the ``model_type``
+    dispatch used for the Whisper and Nemotron-VL branches elsewhere
+    in ``unified_export_hf.py``.
+    """
+    model_type = type(model).__name__.lower()
+    if "diffusiongemma" not in model_type and "diffusion_gemma" not in model_type:
+        return state_dict
+
+    canonical_patterns, side_substrings = _collect_canonical_tied_patterns(model)
+    if not canonical_patterns and not side_substrings:
+        return state_dict
+
+    def _has_side_substring(key: str) -> bool:
+        # Require the token to appear as a proper dot-separated path
+        # component, not just as a substring of an unrelated identifier.
+        for tok in side_substrings:
+            if (
+                f".{tok}." in key
+                or key.startswith(f"{tok}.")
+                or key.endswith(f".{tok}")
+                or key == tok
+            ):
+                return True
+        return False
+
+    head: dict = {}
+    tail: dict = {}
+    for k, v in state_dict.items():
+        if any(p.search(k) for p in canonical_patterns) or _has_side_substring(k):
+            head[k] = v
+        else:
+            tail[k] = v
+    head.update(tail)
+    return head
diff --git a/modelopt/torch/export/quant_utils.py b/modelopt/torch/export/quant_utils.py
@@ -1479,3 +1479,79 @@ def has_quantized_modules(model: nn.Module) -> bool:
         get_quantization_format(sub_module) != QUANTIZATION_NONE
         for _, sub_module in model.named_modules()
     )
+
+
+def sync_tied_input_amax(model: nn.Module) -> int:
+    """Max-merge input_quantizer amaxes across modules sharing a weight ``data_ptr``.
+
+    Mutates ``model`` in place: overwrites the ``.amax`` buffer on every
+    affected ``input_quantizer`` with the per-group maximum. Intended to
+    run as part of an export pipeline that already replaces weights with
+    packed bytes downstream — i.e. the model is not expected to be reused
+    after this helper runs.
+
+    Closes the loop on ``input_scale`` for HF-tied modules whose forward
+    paths see different activation distributions (encoder vs decoder in
+    YOCO-style models). Must run BEFORE per-module export so the merged
+    amax flows into ``input_scale`` derivation. Handles both dense
+    Linears (keyed by ``weight.data_ptr()``) and fused MoE (keyed by
+    ``(gate_up_proj, down_proj)`` data_ptr tuple). Returns the number of
+    tied groups merged.
+    """
+    from collections import defaultdict
+
+    by_dp: dict = defaultdict(list)
+    for _, m in model.named_modules():
+        # Fused MoE: 3-D source tensors with shared input quantizers
+        if (
+            hasattr(m, "gate_up_proj_input_quantizer")
+            and hasattr(m, "gate_up_proj")
+            and hasattr(m, "down_proj")
+            and m.gate_up_proj.dim() == 3
+        ):
+            key = ("moe", m.gate_up_proj.data_ptr(), m.down_proj.data_ptr())
+            by_dp[key].append(m)
+        # Dense quantized Linear with an input_quantizer
+        elif (
+            hasattr(m, "input_quantizer")
+            and hasattr(m, "weight")
+            and isinstance(m.weight, torch.nn.Parameter)
+        ):
+            by_dp[("dense", m.weight.data_ptr())].append(m)
+
+    def _merge(quantizers: list) -> bool:
+        """Max-merge amaxes across the quantizer list. Returns True on merge."""
+        valid = [
+            q
+            for q in quantizers
+            if q is not None
+            and getattr(q, "is_enabled", False)
+            and getattr(q, "_amax", None) is not None
+            and not q._amax.is_meta
+        ]
+        if len(valid) < 2:
+            return False
+        # Require scalar (per-tensor) amax — matches preprocess_linear_fusion.
+        if any(q._amax.numel() != 1 for q in valid):
+            warn(
+                "sync_tied_input_amax: non-scalar input_quantizer amax encountered "
+                "in a tied group; skipping. Only per-tensor input quantizers are "
+                "supported for tied-modules merging."
+            )
+            return False
+        merged = torch.max(torch.stack([q.amax for q in valid]))
+        for q in valid:
+            q.amax = merged.clone()
+        return True
+
+    synced = 0
+    for key, modules in by_dp.items():
+        if len(modules) < 2:
+            continue
+        if key[0] == "moe":
+            for q_name in ("gate_up_proj_input_quantizer", "down_proj_input_quantizer"):
+                if _merge([getattr(m, q_name, None) for m in modules]):
+                    synced += 1
+        elif _merge([m.input_quantizer for m in modules]):
+            synced += 1
+    return synced
diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py
@@ -88,7 +88,7 @@
     QUANTIZATION_W4A8_NVFP4_FP8,
     QUANTIZATION_W4A16_NVFP4,
 )
-from .model_utils import get_language_model_from_vl, is_multimodal_model
+from .model_utils import _reorder_canonical_first, get_language_model_from_vl, is_multimodal_model
 from .moe_utils import _export_fused_experts
 from .plugins import SpeculativeDecodingExporter, has_spec_opt
 from .quant_utils import (
@@ -104,6 +104,7 @@
     maybe_transpose_expert_weight_dimensions,
     postprocess_state_dict,
     preprocess_linear_fusion,
+    sync_tied_input_amax,
     to_quantized_weight,
 )
 
@@ -754,163 +755,6 @@ def _export_quantized_weight(
     torch.cuda.empty_cache()
 
 
-def _collect_canonical_tied_patterns(
-    model: nn.Module,
-) -> tuple[list[re.Pattern], list[str]]:
-    """Walk the model and collect canonical-side tied-weight matchers.
-
-    Patterns are submodule-prefixed regexes from each module's
-    ``_tied_weights_keys`` dict-style declaration (the prefix matters
-    for nested models where the dict lives on an inner submodule).
-    Side substrings are dot-separated tokens that appear only on the
-    canonical side of those declarations — needed because modelopt's
-    per-expert unpacking creates post-export keys (e.g.
-    ``…experts.Y.gate_proj.input_scale``) that HF's regexes never knew
-    about. List-style (legacy) declarations are skipped.
-    """
-    patterns: list[re.Pattern] = []
-    alias_token_set: set[str] = set()
-    canonical_token_set: set[str] = set()
-
-    def _tokens(s: str) -> set[str]:
-        """Identifiers in a regex string, with regex specials as separators."""
-        return {tok for tok in re.split(r"[^A-Za-z0-9_]+", s) if tok}
-
-    for name, submodule in model.named_modules():
-        tied = getattr(submodule, "_tied_weights_keys", None)
-        if not isinstance(tied, dict) or not tied:
-            continue
-        prefix = f"{name}." if name else ""
-        for alias_pat, canonical_pat in tied.items():
-            patterns.append(re.compile(prefix + canonical_pat))
-            alias_token_set.update(_tokens(prefix + alias_pat))
-            canonical_token_set.update(_tokens(prefix + canonical_pat))
-
-    # Tokens unique to the canonical side become substring matchers.
-    side_substrings = sorted(canonical_token_set - alias_token_set)
-    return patterns, side_substrings
-
-
-def _reorder_canonical_first(state_dict: dict, model: nn.Module) -> dict:
-    r"""Reorder ``state_dict`` so canonical-side tied keys iterate first.
-
-    Lets the downstream first-wins data_ptr dedup keep canonical names.
-    Uses both regex patterns and substring matchers from
-    :func:`_collect_canonical_tied_patterns`. Gated on the model class
-    name to scope the reorder to DiffusionGemma; other tied
-    encoder-decoder models that ship dict-style ``_tied_weights_keys``
-    can be added to the allowlist here. Mirrors the ``model_type``
-    dispatch used for the Whisper and Nemotron-VL branches elsewhere
-    in this file.
-    """
-    model_type = type(model).__name__.lower()
-    if "diffusiongemma" not in model_type and "diffusion_gemma" not in model_type:
-        return state_dict
-
-    canonical_patterns, side_substrings = _collect_canonical_tied_patterns(model)
-    if not canonical_patterns and not side_substrings:
-        return state_dict
-
-    def _has_side_substring(key: str) -> bool:
-        # Require the token to appear as a proper dot-separated path
-        # component, not just as a substring of an unrelated identifier.
-        for tok in side_substrings:
-            if (
-                f".{tok}." in key
-                or key.startswith(f"{tok}.")
-                or key.endswith(f".{tok}")
-                or key == tok
-            ):
-                return True
-        return False
-
-    head: dict = {}
-    tail: dict = {}
-    for k, v in state_dict.items():
-        if any(p.search(k) for p in canonical_patterns) or _has_side_substring(k):
-            head[k] = v
-        else:
-            tail[k] = v
-    head.update(tail)
-    return head
-
-
-def sync_tied_input_amax(model: nn.Module) -> int:
-    """Max-merge input_quantizer amaxes across modules sharing a weight ``data_ptr``.
-
-    Mutates ``model`` in place: overwrites the ``.amax`` buffer on every
-    affected ``input_quantizer`` with the per-group maximum. Intended to
-    run as part of an export pipeline that already replaces weights with
-    packed bytes downstream — i.e. the model is not expected to be reused
-    after this helper runs.
-
-    Closes the loop on ``input_scale`` for HF-tied modules whose forward
-    paths see different activation distributions (encoder vs decoder in
-    YOCO-style models). Must run BEFORE per-module export so the merged
-    amax flows into ``input_scale`` derivation. Handles both dense
-    Linears (keyed by ``weight.data_ptr()``) and fused MoE (keyed by
-    ``(gate_up_proj, down_proj)`` data_ptr tuple). Returns the number of
-    tied groups merged.
-    """
-    from collections import defaultdict
-
-    by_dp: dict = defaultdict(list)
-    for _, m in model.named_modules():
-        # Fused MoE: 3-D source tensors with shared input quantizers
-        if (
-            hasattr(m, "gate_up_proj_input_quantizer")
-            and hasattr(m, "gate_up_proj")
-            and hasattr(m, "down_proj")
-            and m.gate_up_proj.dim() == 3
-        ):
-            key = ("moe", m.gate_up_proj.data_ptr(), m.down_proj.data_ptr())
-            by_dp[key].append(m)
-        # Dense quantized Linear with an input_quantizer
-        elif (
-            hasattr(m, "input_quantizer")
-            and hasattr(m, "weight")
-            and isinstance(m.weight, torch.nn.Parameter)
-        ):
-            by_dp[("dense", m.weight.data_ptr())].append(m)
-
-    def _merge(quantizers: list) -> bool:
-        """Max-merge amaxes across the quantizer list. Returns True on merge."""
-        valid = [
-            q
-            for q in quantizers
-            if q is not None
-            and getattr(q, "is_enabled", False)
-            and getattr(q, "_amax", None) is not None
-            and not q._amax.is_meta
-        ]
-        if len(valid) < 2:
-            return False
-        # Require scalar (per-tensor) amax — matches preprocess_linear_fusion.
-        if any(q._amax.numel() != 1 for q in valid):
-            warnings.warn(
-                "sync_tied_input_amax: non-scalar input_quantizer amax encountered "
-                "in a tied group; skipping. Only per-tensor input quantizers are "
-                "supported for tied-modules merging."
-            )
-            return False
-        merged = torch.max(torch.stack([q.amax for q in valid]))
-        for q in valid:
-            q.amax = merged.clone()
-        return True
-
-    synced = 0
-    for key, modules in by_dp.items():
-        if len(modules) < 2:
-            continue
-        if key[0] == "moe":
-            for q_name in ("gate_up_proj_input_quantizer", "down_proj_input_quantizer"):
-                if _merge([getattr(m, q_name, None) for m in modules]):
-                    synced += 1
-        elif _merge([m.input_quantizer for m in modules]):
-            synced += 1
-    return synced
-
-
 def _process_quantized_modules(
     model: nn.Module,
     dtype: torch.dtype,
diff --git a/tests/unit/torch/export/test_unified_export_hf.py b/tests/unit/torch/export/test_unified_export_hf.py
@@ -24,12 +24,12 @@
 )
 
 import modelopt.torch.quantization as mtq
-from modelopt.torch.export.unified_export_hf import (
+from modelopt.torch.export.model_utils import (
     _collect_canonical_tied_patterns,
-    _export_quantized_weight,
     _reorder_canonical_first,
-    sync_tied_input_amax,
 )
+from modelopt.torch.export.quant_utils import sync_tied_input_amax
+from modelopt.torch.export.unified_export_hf import _export_quantized_weight
 
 
 def test_collect_canonical_tied_patterns_dict_style():

Original file line number	Diff line number	Diff line change
`@@ -24,12 +24,12 @@`
`24`	`24`	`)`
`25`	`25`
`26`	`26`	`import modelopt.torch.quantization as mtq`
`27`		`-from modelopt.torch.export.unified_export_hf import (`
	`27`	`+from modelopt.torch.export.model_utils import (`
`28`	`28`	`_collect_canonical_tied_patterns,`
`29`		`- _export_quantized_weight,`
`30`	`29`	`_reorder_canonical_first,`
`31`		`- sync_tied_input_amax,`
`32`	`30`	`)`
	`31`	`+from modelopt.torch.export.quant_utils import sync_tied_input_amax`
	`32`	`+from modelopt.torch.export.unified_export_hf import _export_quantized_weight`
`33`	`33`
`34`	`34`
`35`	`35`	`def test_collect_canonical_tied_patterns_dict_style():`