NVIDIA
diff --git a/examples/vllm_serve/vllm_reload_utils.py b/examples/vllm_serve/vllm_reload_utils.py
Lines changed: 2 additions & 2 deletions
diff --git a/modelopt/torch/kernels/common/attention/__init__.py b/modelopt/torch/kernels/common/attention/__init__.py
Lines changed: 14 additions & 18 deletions
diff --git a/modelopt/torch/kernels/common/attention/hf_triton_attention.py b/modelopt/torch/kernels/common/attention/hf_triton_attention.py
Lines changed: 116 additions & 1 deletion
diff --git a/modelopt/torch/kernels/common/attention/triton_fa.py b/modelopt/torch/kernels/common/attention/triton_fa.py
Lines changed: 69 additions & 0 deletions
diff --git a/modelopt/torch/kernels/quantization/attention/__init__.py b/modelopt/torch/kernels/quantization/attention/__init__.py
Lines changed: 9 additions & 1 deletion
@@ -89,8 +89,8 @@ def _convert_key_for_vllm(key: str, value: Any) -> tuple[str, str | None, Any]:
     if "quantizer" not in key:
         return ("copy", key, value)
 
-    # Skip softmax_quantizer and lm_head quantizers (not needed in vLLM).
-    if "softmax_quantizer" in key or (key.startswith("lm_head.") and "quantizer" in key):
+    # Skip p_bmm_quantizer (softmax-P) and lm_head quantizers (not needed in vLLM).
+    if "p_bmm_quantizer" in key or (key.startswith("lm_head.") and "quantizer" in key):
         return ("skip", None, None)
 
     # Check if this is a q/k/v projection that needs merging
 
@@ -15,14 +15,17 @@
 
 """Shared Triton kernels for modelopt (attention, quantization, etc.)."""
 
+from collections.abc import Callable
+
 import torch
 
 from modelopt.torch.utils import import_plugin
 
 IS_AVAILABLE = False
-attention = None
-attention_calibrate = None
-register_triton_attention = None
+attention: Callable | None = None
+register_triton_attention: Callable | None = None
+triton_attention_forward: Callable | None = None
+validate_triton_attention_envelope: Callable | None = None
 
 if torch.cuda.is_available():
     with import_plugin(
@@ -32,26 +35,19 @@
             "kernel. Try to install triton with `pip install triton`."
         ),
     ):
-        from .triton_fa import attention as _attention
-
-        attention = _attention
-        IS_AVAILABLE = True
-        from .hf_triton_attention import register_triton_attention as _register_triton_attention
-
-        register_triton_attention = _register_triton_attention
-
-        # Calibration lives in the sparsity subpackage (skip-softmax specific).
-        # Imported here so ``from modelopt.torch.kernels.common.attention import
-        # attention_calibrate`` keeps working.
-        from modelopt.torch.kernels.sparsity.attention.calibrate import (
-            attention_calibrate as _attention_calibrate,
+        from .hf_triton_attention import (
+            register_triton_attention,
+            triton_attention_forward,
+            validate_triton_attention_envelope,
         )
+        from .triton_fa import attention
 
-        attention_calibrate = _attention_calibrate
+        IS_AVAILABLE = True
 
 __all__ = [
     "IS_AVAILABLE",
     "attention",
-    "attention_calibrate",
     "register_triton_attention",
+    "triton_attention_forward",
+    "validate_triton_attention_envelope",
 ]
@@ -50,6 +50,107 @@ def _seq_lens_from_mask(
     return None, False
 
 
+def _check_mask_supported(attention_mask: torch.Tensor | None, seq_q: int) -> None:
+    """Reject attention masks this wrapper would silently misread.
+
+    The wrapper only derives right-padded per-sequence lengths from 2D
+    ``[batch, q_len]`` masks; anything else either loses padding info (4D
+    masks) or corrupts the varlen metadata (FA2-style ``[batch, kv_len]``
+    masks during cached decode).
+
+    Args:
+        attention_mask: Mask to validate. ``None`` is accepted as-is. A 2D
+            mask is checked for width, left padding and holes; a mask of any
+            other rank goes through the last-query-row check at the end.
+        seq_q: Query length that the second dimension of a 2D mask must equal.
+
+    Raises:
+        NotImplementedError: If the mask fails any of the checks below.
+    """
+
+    # Builds (does not raise) the exception, so every call site reads
+    # ``raise _unsupported(...)`` and shares one message prefix and hint.
+    def _unsupported(reason):
+        return NotImplementedError(
+            f"The ModelOpt Triton attention kernel does not support {reason}. "
+            "Use unpadded (or uniform-length) right-padded inputs."
+        )
+
+    if attention_mask is None:
+        return
+    if attention_mask.dim() == 2:
+        if attention_mask.shape[1] != seq_q:
+            # FA2-style [batch, kv_len] mask during cached decode: the wrapper
+            # would misread KV lengths as query lengths (out-of-bounds access).
+            raise _unsupported("padded batches during cached decode")
+        mask_bool = attention_mask.to(torch.bool)
+        # A row whose first position is masked out starts with padding.
+        if not mask_bool[:, 0].all():
+            raise _unsupported("left-padded inputs")
+        # ``_seq_lens_from_mask`` derives lengths via ``sum(dim=1)``, which is only
+        # correct when each row is a contiguous run of valid tokens followed by
+        # padding. A hole (e.g. ``[1, 0, 1]``) would sum to the right count but
+        # place the valid tokens at the wrong positions, so reject non-right-padded
+        # masks (any valid token after a pad == row not monotonically non-increasing).
+        if not (mask_bool[:, :-1].int() >= mask_bool[:, 1:].int()).all():
+            raise _unsupported("non-contiguously padded inputs")
+        return
+    # 4D [batch, 1, q, kv] masks are ignored by the wrapper, which is safe only
+    # when they encode pure causal structure (the kernel masks causally itself).
+    # In a causal mask the newest query row sees every position; any masked
+    # entry there means padding, windowing, or a non-causal/bias pattern.
+    # Every mask whose rank is not 2 reaches this point, not only 4D ones.
+    last_row = attention_mask[..., -1, :]
+    # Boolean masks: a False entry is hidden. Other dtypes: any nonzero entry
+    # counts as hidden (presumably an additive mask where 0 means "visible").
+    hidden = ~last_row if attention_mask.dtype == torch.bool else last_row != 0
+    if hidden.any():
+        raise _unsupported("masks carrying padding or non-causal structure")
+
+
+def validate_triton_attention_envelope(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    attention_mask: torch.Tensor | None,
+    **kwargs,
+) -> None:
+    """Raise ``NotImplementedError`` for inputs outside this wrapper/kernel envelope.
+
+    These limits do not come from the quantization or sparsity features layered
+    on top — they document what the ``triton_fa`` kernel (causal or single-token
+    decode only; no sliding window, attention sinks, logit softcapping, or
+    dropout; head_dim >= 16) and this wrapper's varlen-metadata derivation
+    (right-padded 2D masks only; no multi-token forwards over a longer KV cache)
+    support. Callers that route arbitrary HF models onto the kernel dynamically
+    (e.g. the quantization plugin's p_bmm_quantizer dispatch) should call this
+    before dispatching, so unsupported models fail loudly instead of silently
+    computing wrong attention. The sparse-attention path predates these checks
+    and does not yet enforce them.
+
+    Args:
+        module: Attention module; its ``sliding_window`` and ``is_causal``
+            attributes are read when present.
+        query: Query tensor. ``shape[2]`` is used as the query length and
+            ``shape[-1]`` as ``head_dim``.
+        key: Key tensor. ``shape[2]`` is used as the key/value length.
+        attention_mask: Mask handed to ``_check_mask_supported`` together with
+            the query length.
+        **kwargs: Attention-interface keyword arguments. Only
+            ``sliding_window``, ``s_aux``, ``softcap``, ``is_causal`` and
+            ``dropout`` are inspected; everything else is ignored here.
+
+    Raises:
+        NotImplementedError: On the first unsupported condition found, either
+            directly or from ``_check_mask_supported``.
+    """
+    # Mistral-style models pass sliding_window as an interface kwarg instead of
+    # setting it on the attention module, so check both.
+    # Truthiness check: ``None`` and ``0`` both count as "no sliding window".
+    if getattr(module, "sliding_window", None) or kwargs.get("sliding_window"):
+        raise NotImplementedError(
+            "The ModelOpt Triton attention kernel does not support sliding-window attention layers."
+        )
+    # Semantic attention arguments the kernel does not implement: dropping them
+    # would change the attention math.
+    for name, reason in (("s_aux", "attention sinks"), ("softcap", "logit softcapping")):
+        if kwargs.get(name) is not None:
+            raise NotImplementedError(
+                f"The ModelOpt Triton attention kernel does not support {reason} ('{name}')."
+            )
+    # Only an explicit ``False`` is rejected; a missing or ``None`` ``is_causal``
+    # on either the kwargs or the module passes.
+    if kwargs.get("is_causal") is False or getattr(module, "is_causal", True) is False:
+        raise NotImplementedError(
+            "The ModelOpt Triton attention kernel does not support non-causal attention."
+        )
+    # Truthiness check: a missing, ``None`` or ``0.0`` dropout passes.
+    if kwargs.get("dropout"):
+        raise NotImplementedError(
+            "The ModelOpt Triton attention kernel does not support attention dropout; "
+            "set attention_dropout=0 for training."
+        )
+    if query.shape[-1] < 16:
+        raise NotImplementedError(
+            f"The ModelOpt Triton attention kernel requires head_dim >= 16, got {query.shape[-1]}."
+        )
+    seq_q, seq_k = query.shape[2], key.shape[2]
+    # Single-token decode (seq_q == 1) may see any seq_k; longer query chunks
+    # must match the key length exactly.
+    if seq_q > 1 and seq_k != seq_q:
+        # The wrapper only passes K-side varlen metadata for single-token decode;
+        # multi-token forwards over a longer KV cache would mis-index K/V.
+        raise NotImplementedError(
+            "The ModelOpt Triton attention kernel does not support multi-token "
+            "forwards over a longer KV cache (chunked prefill or "
+            "assisted/speculative decoding)."
+        )
+    _check_mask_supported(attention_mask, seq_q)
+
+
 def triton_attention_forward(
     module: nn.Module,
     query: torch.Tensor,
@@ -58,6 +159,8 @@ def triton_attention_forward(
     attention_mask: torch.Tensor | None,
     scaling: float,
     dropout: float = 0.0,
+    p_qdq: str | None = None,
+    p_qdq_scale: float | None = None,
     **kwargs,
 ) -> tuple[torch.Tensor, None]:
     """Attention forward compatible with HF AttentionInterface.
@@ -75,6 +178,12 @@ def triton_attention_forward(
             Other formats (e.g. 4D causal masks) are ignored.
         scaling: Softmax scale (e.g. 1/sqrt(head_dim)).
         dropout: Ignored (kernel has no dropout); use 0 for eval.
+        p_qdq: Optional softmax fake quant-dequant mode ("fp8" or
+            "nvfp4") forwarded to the kernel. Not passed by HF dispatch;
+            used by direct callers such as the quantization plugin.
+        p_qdq_scale: Optional per-tensor quantization scale for the
+            softmax qdq; None uses the kernel default of 1.0 (an effective
+            amax of 448 for FP8 / 6 * 448 for NVFP4).
         **kwargs: Reserved for future extensions.
 
     Returns:
@@ -121,7 +230,7 @@ def triton_attention_forward(
         trials = getattr(method, "_threshold_trials", None)
         # Deferred: the package __init__ imports this module, so importing
         # attention_calibrate at module top would be circular.
-        from modelopt.torch.kernels.common.attention import attention_calibrate
+        from modelopt.torch.kernels.sparsity.attention.calibrate import attention_calibrate
 
         if trials and attention_calibrate is not None:
             o, counters = attention_calibrate(q, k, v, **kw, threshold_trials=trials)
@@ -153,6 +262,11 @@ def triton_attention_forward(
         if threshold:
             kw["skip_softmax_threshold"] = threshold
 
+    if p_qdq is not None:
+        kw["p_qdq"] = p_qdq
+        if p_qdq_scale is not None:
+            kw["p_qdq_scale"] = p_qdq_scale
+
     o = attention(q, k, v, **kw)
 
     attn_output = o.view(batch, seq_len, num_heads, head_dim)
@@ -188,4 +302,5 @@ def register_triton_attention() -> bool:
 __all__ = [
     "register_triton_attention",
     "triton_attention_forward",
+    "validate_triton_attention_envelope",
 ]
@@ -42,6 +42,8 @@
 _apply_sparse_nm_to_qk_tile: Any = None
 _is_dense_region: Any = None
 _skip_softmax_decision: Any = None
+_p_qdq_fp8: Any = None
+_p_qdq_nvfp4: Any = None
 
 
 def _load_sparsity_helpers() -> None:
@@ -62,6 +64,20 @@ def _load_sparsity_helpers() -> None:
         _skip_softmax_decision = _skip
 
 
+def _load_p_qdq_helpers() -> None:
+    """Bind the softmax-P quant-dequant helpers to module globals on first call.
+
+    Imports ``_p_qdq_nvfp4`` and ``fp8_scalar_qdq`` at call time and stores
+    them in the module-level ``_p_qdq_nvfp4`` / ``_p_qdq_fp8`` placeholders,
+    which start out as ``None``. Once ``_p_qdq_fp8`` is set, later calls skip
+    the imports.
+
+    NOTE(review): the imports are presumably deferred to keep the quantization
+    package out of this module's import-time dependencies — confirm.
+    """
+    global _p_qdq_fp8, _p_qdq_nvfp4
+    # Only the FP8 slot is tested; both slots are assigned together below.
+    if _p_qdq_fp8 is None:
+        from modelopt.torch.kernels.quantization.attention.p_qdq import _p_qdq_nvfp4 as _nvfp4
+        from modelopt.torch.kernels.quantization.common.fp8_quant import fp8_scalar_qdq as _fp8
+
+        _p_qdq_fp8 = _fp8
+        _p_qdq_nvfp4 = _nvfp4
+
+
+# Maps the public p_qdq option to the kernel's P_QDQ constexpr.
+_P_QDQ_MODES = {None: 0, "fp8": 1, "nvfp4": 2}
+
+
 LOG2E: float = 1.44269504088896
 
 # ---------------------------------------------------------------------------
@@ -246,6 +262,8 @@ def _attn_fwd(
     DENSE_RECENT_TOKENS: tl.constexpr = 64,  # Recent KV tokens kept dense (BLOCK_N-independent)
     APPLY_SKIP_SOFTMAX: tl.constexpr = False,  # Skip KV tiles with negligible scores
     SKIP_THRESHOLD_LOG2: tl.constexpr = 0.0,  # log2(lambda) in the kernel's scaled log2 score space
+    P_QDQ: tl.constexpr = 0,  # Fake quant-dequant of softmax P: 0=off, 1=FP8 E4M3, 2=NVFP4
+    p_qdq_scale=1.0,  # Per-tensor scale for softmax qdq (runtime scalar; amax/448 or amax/(6*448))
     Sparsity_total=None,  # Optional int64 scalar for counting total tiles (atomic)
     Sparsity_skipped=None,  # Optional int64 scalar for counting skipped tiles (atomic)
     MEASURE_SPARSITY: tl.constexpr = False,  # When True, count total/skipped tiles via atomic adds
@@ -383,6 +401,14 @@ def _attn_fwd(
             row_sum = row_sum * correction + l_new
             acc = acc * correction[:, None]
 
+            # --- Optional softmax quant-dequant (emulates quantized P @ V) ---
+            # row_sum keeps the unquantized p: deployment kernels compute the
+            # softmax denominator in fp32 and only feed quantized P to BMM2.
+            if P_QDQ == 1:
+                p = _p_qdq_fp8(p, p_qdq_scale)
+            elif P_QDQ == 2:
+                p = _p_qdq_nvfp4(p, p_qdq_scale, BLOCK_M, BLOCK_N)
+
             # Load V and accumulate
             if IS_PAGED:
                 v = _load_paged_v_tile(
@@ -806,6 +832,8 @@ def forward(
         dense_recent_tokens,
         skip_softmax_threshold,
         measure_sparsity,
+        p_qdq_mode,
+        p_qdq_scale,
         k_cache,
         v_cache,
         block_table,
@@ -903,6 +931,8 @@ def forward(
             "DENSE_RECENT_TOKENS": dense_recent_tokens,
             "APPLY_SKIP_SOFTMAX": apply_skip,
             "SKIP_THRESHOLD_LOG2": skip_threshold_log2,
+            "P_QDQ": p_qdq_mode,
+            "p_qdq_scale": p_qdq_scale,
             "Sparsity_total": sparsity_total,
             "Sparsity_skipped": sparsity_skipped,
             "MEASURE_SPARSITY": do_measure,
@@ -1106,6 +1136,8 @@ def backward(ctx, grad_output):
             None,  # dense_recent_tokens
             None,  # skip_softmax_threshold
             None,  # measure_sparsity
+            None,  # p_qdq_mode
+            None,  # p_qdq_scale
             None,  # k_cache
             None,  # v_cache
             None,  # block_table
@@ -1132,6 +1164,8 @@ def attention(
     dense_recent_tokens: int = 64,
     skip_softmax_threshold: float | None = None,
     measure_sparsity: bool = False,
+    p_qdq: str | None = None,
+    p_qdq_scale: float = 1.0,
     k_cache: torch.Tensor | None = None,
     v_cache: torch.Tensor | None = None,
     block_table: torch.Tensor | None = None,
@@ -1169,6 +1203,26 @@ def attention(
             and skipped tiles via atomic counters. The counts are stored as
             ``_sparsity_total`` and ``_sparsity_skipped`` attributes on the
             returned output tensor.
+        p_qdq: Fake quant-dequant of the softmax probabilities ``P``
+            before the ``P @ V`` matmul (BMM2), emulating quantized attention.
+            ``"fp8"`` round-trips P through FP8 E4M3 with a static per-tensor
+            scale given by ``p_qdq_scale`` (the default of 1.0 is a direct
+            cast of the kernel's unnormalized P, which lies in [0, 1]).
+            ``"nvfp4"`` applies the two-level NVFP4 recipe: E2M1
+            elements with one FP8 E4M3 scale per 16 elements along the key
+            dimension (the BMM2 contraction axis; every autotuned BLOCK_N is
+            a multiple of 16). The softmax denominator stays unquantized, as
+            in deployment kernels. The backward pass uses the straight-through
+            estimator: gradients are computed from the unquantized P, matching
+            QAT references that keep the backward dots in high precision.
+            Set to ``None`` to disable.
+        p_qdq_scale: Per-tensor quantization scale for the softmax qdq
+            (standard convention ``q = cast(p / scale) * scale``). For FP8
+            this is ``amax / 448``; for NVFP4 it is the global scale
+            ``amax / (6 * 448)``. The default of 1.0 corresponds to an
+            effective amax of 448 (FP8) or 6 * 448 (NVFP4) — a direct cast
+            of the kernel's unnormalized P in [0, 1]. A runtime scalar —
+            user-set or calibrated values do not recompile the kernel.
+            Out-of-range values saturate.
         k_cache: Paged K cache [num_blocks, page_size, num_kv_heads, head_dim].
             When provided, K/V are read from paged cache via block_table
             instead of from contiguous k/v tensors.
@@ -1186,7 +1240,20 @@ def attention(
         require grad, because the saved ``k``/``v`` are dummy tensors in paged
         mode and dK/dV would be silently incorrect.
     """
+    # Both loaders must run unconditionally: Triton computes a kernel's
+    # dependency hash once, on the first call, walking the full AST. If the
+    # qdq helpers were still None at that point, their source would be
+    # permanently excluded from the cache key and later edits to them would
+    # silently reuse stale compiled kernels from the on-disk cache.
     _load_sparsity_helpers()
+    _load_p_qdq_helpers()
+    if p_qdq not in _P_QDQ_MODES:
+        raise ValueError(
+            f"p_qdq must be one of {sorted(k for k in _P_QDQ_MODES if k)} or None, got {p_qdq!r}"
+        )
+    p_qdq_mode = _P_QDQ_MODES[p_qdq]
+    if p_qdq_mode and not (math.isfinite(p_qdq_scale) and p_qdq_scale > 0):
+        raise ValueError(f"p_qdq_scale must be a finite positive value, got {p_qdq_scale}")
     sm_scale = 1.0 / (q.shape[2] ** 0.5) if softmax_scale is None else softmax_scale
     return _Attention.apply(
         q,
@@ -1206,6 +1273,8 @@ def attention(
         dense_recent_tokens,
         skip_softmax_threshold,
         measure_sparsity,
+        p_qdq_mode,
+        p_qdq_scale,
         k_cache,
         v_cache,
         block_table,
 
@@ -13,4 +13,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""Quantization-specific attention kernel pieces (placeholder for combined sparse+quant path)."""
+"""Quantization-specific attention kernel pieces.
+
+``p_qdq.py`` holds the softmax-P (``p_bmm_quantizer``) quant-dequant
+``@triton.jit`` helpers invoked by the unified flash-attention kernel in
+``common/attention/triton_fa.py`` under its ``P_QDQ`` constexpr guard.
+Only NVFP4 needs a P-specific helper (tiling and block-amax policy on top of
+``quantization/gemm/nvfp4_quant.py``); the FP8 mode uses
+``quantization/common/fp8_quant.fp8_scalar_qdq`` directly.
+"""