
Commit fee314b

mxtensor: refactor activation quant to use direct logic (#2806)
* Update [ghstack-poisoned]
* Update [ghstack-poisoned]
* Update [ghstack-poisoned]
* Update [ghstack-poisoned]
* Update [ghstack-poisoned]
1 parent 1a20585 commit fee314b
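
In short: instead of wrapping the MX weight in `to_linear_activation_quantized` with a separate `_input_activation_quant_func_mxfp` callback, the activation-quantization recipe is now stored directly on the weight `MXTensor` as a `QuantizeTensorToMXKwargs` dataclass, and the `mm`/`addmm` dispatch quantizes a high-precision activation on the fly. A conceptual sketch of the new flow (the helper name below is hypothetical, not part of torchao; the logic mirrors the `mx_ops.py` hunk further down):

```python
# Conceptual sketch only; `_quantize_activation_like_weight` is a made-up name.
from torchao.prototype.mx_formats.mx_tensor import MXTensor


def _quantize_activation_like_weight(a, weight_mx):
    if isinstance(a, MXTensor):
        return a  # caller already quantized the activation
    k = weight_mx.act_quant_kwargs  # recipe attached when the weight was quantized
    assert k is not None, "weight-only quant not yet supported"
    return MXTensor.to_mx(
        a,
        k.elem_dtype,
        k.block_size,
        k.scaling_mode,
        k.use_fp4_custom_triton_dequant_kernel,
        k.gemm_kernel_choice,
        k.pack_fp6,
    )
```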

5 files changed: 56 additions, 47 deletions

test/prototype/mx_formats/test_mx_tensor.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -390,6 +390,7 @@ def test_exponent_nan_out(elem_dtype, pack_fp6):
         use_fp4_custom_triton_dequant_kernel,
         MXGemmKernelChoice.EMULATED,
         pack_fp6,
+        None,
     )
     tensor_hp = tensor_mx.to_dtype(torch.float)
     assert torch.all(torch.isnan(tensor_hp.flatten()[0:4]))
```

torchao/prototype/mx_formats/inference_workflow.py

Lines changed: 9 additions & 38 deletions
```diff
@@ -6,7 +6,6 @@

 import types
 from dataclasses import dataclass
-from typing import Optional

 import torch

@@ -18,13 +17,12 @@
     _validate_elem_dtype,
     _validate_gemm_kernel_choice,
 )
-from torchao.prototype.mx_formats.mx_tensor import MXTensor
+from torchao.prototype.mx_formats.mx_tensor import MXTensor, QuantizeTensorToMXKwargs
 from torchao.prototype.mx_formats.nvfp4_tensor import (
     NVFP4MMConfig,
     NVFP4Tensor,
     QuantizeTensorToNVFP4Kwargs,
 )
-from torchao.quantization.quant_api import to_linear_activation_quantized
 from torchao.quantization.transform_module import (
     register_quantize_module_handler,
 )
@@ -93,26 +91,6 @@ def _linear_extra_repr(self):
     return f"in_features={self.weight.shape[1]}, out_features={self.weight.shape[0]}, weight={repr(self.weight)}"


-def _input_activation_quant_func_mxfp(
-    x: torch.Tensor,
-    activation_dtype: torch.dtype,
-    block_size: int,
-    scale: Optional[torch.Tensor] = None,
-):
-    """ """
-
-    # TODO scale for static quant
-
-    activation = MXTensor.to_mx(
-        x,
-        activation_dtype,
-        block_size=block_size,
-        gemm_kernel_choice=None,  # Get from weight
-        pack_fp6=False,  # TODO
-    )
-    return activation
-
-
 @register_quantize_module_handler(MXFPInferenceConfig)
 def _mx_inference_linear_transform(
     module: torch.nn.Module, config: MXFPInferenceConfig
@@ -121,32 +99,26 @@ def _mx_inference_linear_transform(
     # TODO handle AMD
     assert is_sm_at_least_100(), "MXFP is only supported on sm100 machiens for now"

-    activation_dtype = config.activation_dtype
-    weight_dtype = config.weight_dtype
     weight = module.weight

     assert weight.dtype == torch.bfloat16, (
         f"Only supporting bf16 out dtype for now, got {weight.dtype}"
     )
+    act_quant_kwargs = QuantizeTensorToMXKwargs(
+        elem_dtype=config.activation_dtype,
+        block_size=config.block_size,
+        gemm_kernel_choice=config.gemm_kernel_choice,
+        pack_fp6=False,
+    )

     # Convert weight to MX Tensor
     quantized_weight = MXTensor.to_mx(
         weight,
-        weight_dtype,
+        config.weight_dtype,
         block_size=config.block_size,
         gemm_kernel_choice=config.gemm_kernel_choice,
         pack_fp6=False,  # TODO
-    )
-
-    input_quant_func = _input_activation_quant_func_mxfp
-    input_quant_kwargs = {
-        "block_size": config.block_size,
-        "activation_dtype": activation_dtype,
-        "scale": None,
-    }
-
-    quantized_weight = to_linear_activation_quantized(
-        quantized_weight, input_quant_func, quant_kwargs=input_quant_kwargs
+        act_quant_kwargs=act_quant_kwargs,
     )

     module.weight = torch.nn.Parameter(quantized_weight, requires_grad=False)
@@ -226,7 +198,6 @@ def _nvfp4_inference_linear_transform(
         NVFP4Tensor,
         NVFP4MMConfig,
         MXGemmKernelChoice,
-        _input_activation_quant_func_mxfp,
     ]
 )

```
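
With the wrapper gone, the module-level flow is just `quantize_` plus the config; the recipe then rides along on `module.weight`. A rough usage sketch, assuming the import paths above, usable defaults on `MXFPInferenceConfig`, and hardware that passes the sm100 assert (model and shapes are illustrative):

```python
import torch
from torchao.quantization import quantize_
from torchao.prototype.mx_formats.inference_workflow import MXFPInferenceConfig

# Illustrative model; bf16 weights are required by the assert in
# _mx_inference_linear_transform.
model = torch.nn.Sequential(torch.nn.Linear(512, 256)).to(torch.bfloat16).cuda()

# Each Linear weight becomes an MXTensor carrying QuantizeTensorToMXKwargs.
quantize_(model, MXFPInferenceConfig())

x = torch.randn(4, 512, dtype=torch.bfloat16, device="cuda")
y = model(x)  # the activation is quantized at dispatch time from the stored recipe
```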
torchao/prototype/mx_formats/mx_linear.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -68,6 +68,7 @@ def _to_mxfp8_dim1_kernel_wrapper(
         False,
         gemm_kernel_choice,
         False,
+        None,
     )
     mx_tensor = DTensor.from_local(
         inner,
@@ -87,6 +88,7 @@ def _to_mxfp8_dim1_kernel_wrapper(
         False,
         gemm_kernel_choice,
         False,
+        None,
     )
     return mx_tensor

```

torchao/prototype/mx_formats/mx_ops.py

Lines changed: 21 additions & 7 deletions
```diff
@@ -80,12 +80,26 @@ def _get_gemm_choice(


 def _addmm_mx_dispatch(
-    a: MXTensor, b: MXTensor, aten_op, bias: Optional[torch.Tensor] = None
+    a: torch.Tensor, b: MXTensor, aten_op, bias: Optional[torch.Tensor] = None
 ) -> torch.Tensor:
     """
     Core implementation shared between mx_mm and mx_addmm.
     The only difference is whether bias is None or not.
     """
+
+    if not isinstance(a, MXTensor):
+        assert b.act_quant_kwargs is not None, "weight-only quant not yet supported"
+        k = b.act_quant_kwargs
+        a = MXTensor.to_mx(
+            a,
+            k.elem_dtype,
+            k.block_size,
+            k.scaling_mode,
+            k.use_fp4_custom_triton_dequant_kernel,
+            k.gemm_kernel_choice,
+            k.pack_fp6,
+        )
+
     gemm_choice = _get_gemm_choice(a._gemm_kernel_choice, b._gemm_kernel_choice)

     if gemm_choice in (MXGemmKernelChoice.CUBLAS, MXGemmKernelChoice.CUTLASS):
@@ -148,18 +162,14 @@ def _addmm_mx_dispatch(
 def mx_mm(func, types, args, kwargs):
     a = args[0]
     b = args[1]
-    assert isinstance(a, MXTensor) and isinstance(b, MXTensor)
+    assert isinstance(b, MXTensor)

     return _addmm_mx_dispatch(a, b, func)


 @implements([aten.addmm.default])
 def mx_addmm(func, types, args, kwargs):
-    assert (
-        isinstance(args[0], torch.Tensor)
-        and isinstance(args[1], MXTensor)
-        and isinstance(args[2], MXTensor)
-    )
+    assert isinstance(args[0], torch.Tensor) and isinstance(args[2], MXTensor)
     bias = args[0]
     a = args[1]
     b = args[2]
@@ -179,6 +189,7 @@ def mx_t(func, types, args, kwargs):
         old._use_fp4_custom_triton_dequant_kernel,
         old._gemm_kernel_choice,
         old._pack_fp6,
+        old.act_quant_kwargs,
     )
     return new

@@ -223,6 +234,7 @@ def mx_view_op(func, types, args, kwargs):
         args[0]._use_fp4_custom_triton_dequant_kernel,
         args[0]._gemm_kernel_choice,
         args[0]._pack_fp6,
+        args[0].act_quant_kwargs,
     )


@@ -284,6 +296,7 @@ def mx_slice(func, types, args, kwargs):
             x._use_fp4_custom_triton_dequant_kernel,
             x._gemm_kernel_choice,
             x._pack_fp6,
+            x.act_quant_kwargs,
         ),
     )

@@ -338,6 +351,7 @@ def autocast_to_copy(func, types, args, kwargs):
         tensor._use_fp4_custom_triton_dequant_kernel,
         tensor._gemm_kernel_choice,
         tensor._pack_fp6,
+        tensor.act_quant_kwargs,
     )
     return res

```
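
The net effect of the `mx_ops.py` changes: `a` may now be a plain high-precision tensor, and `_addmm_mx_dispatch` converts it with the recipe found on the MX weight; if the weight carries no recipe (`act_quant_kwargs is None`), the assert rejects the weight-only case. A small sketch exercising that path directly, assuming the default emulated gemm (shapes and dtypes are illustrative, and the prototype may still require a recent GPU):

```python
import torch
import torch.nn.functional as F
from torchao.prototype.mx_formats.mx_tensor import MXTensor, QuantizeTensorToMXKwargs

w_hp = torch.randn(256, 512, dtype=torch.bfloat16)
w_mx = MXTensor.to_mx(
    w_hp,
    torch.float8_e4m3fn,
    block_size=32,
    act_quant_kwargs=QuantizeTensorToMXKwargs(),  # defaults: fp8 e4m3, block_size=32
)

x = torch.randn(8, 512, dtype=torch.bfloat16)
# `x` is a plain bf16 tensor; the new dispatch branch quantizes it on the fly
# from w_mx.act_quant_kwargs before running the (emulated) MX gemm.
y = F.linear(x, w_mx)
```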
torchao/prototype/mx_formats/mx_tensor.py

Lines changed: 23 additions & 2 deletions
```diff
@@ -17,7 +17,8 @@
 * Zeros: N/A
 """

-from typing import Union
+from dataclasses import dataclass
+from typing import Optional, Union

 import torch
 from torch.distributed._tensor import DTensor
@@ -57,6 +58,9 @@
     triton_f6_e3m2_to_scaled_bf16,
     unpack_uint4,
 )
+from torchao.quantization.quantize_.common import (
+    QuantizeTensorKwargs,
+)
 from torchao.utils import TorchAOBaseTensor

 # TODO(later): read from somewhere else?
@@ -68,6 +72,16 @@
 EBITS_F8_E5M2, MBITS_F8_E5M2 = 5, 2


+@dataclass
+class QuantizeTensorToMXKwargs(QuantizeTensorKwargs):
+    elem_dtype: Union[torch.dtype, str] = torch.float8_e4m3fn
+    block_size: int = 32
+    scaling_mode: ScaleCalculationMode = ScaleCalculationMode.FLOOR
+    use_fp4_custom_triton_dequant_kernel: bool = False
+    gemm_kernel_choice: MXGemmKernelChoice = MXGemmKernelChoice.EMULATED
+    pack_fp6: bool = False
+
+
 def _to_mx_rceil(
     data_hp: torch.Tensor,
     max_abs: torch.Tensor,
@@ -458,6 +472,7 @@ class MXTensor(TorchAOBaseTensor):
         "_use_fp4_custom_triton_dequant_kernel",
         "_gemm_kernel_choice",
         "_pack_fp6",
+        "act_quant_kwargs",
     ]

     def __new__(
@@ -470,6 +485,7 @@ def __new__(
         use_fp4_custom_triton_dequant_kernel,
         gemm_kernel_choice,
         pack_fp6,
+        act_quant_kwargs,
     ):
         new_size = qdata.size()
         if elem_dtype == torch.float4_e2m1fn_x2:
@@ -540,11 +556,12 @@ def __new__(
         )
         self._gemm_kernel_choice = gemm_kernel_choice
         self._pack_fp6 = pack_fp6
+        self.act_quant_kwargs = act_quant_kwargs
        return self

    def __repr__(self):
        # TODO better elem dtype print for fp4
-        return f"MXTensor: elem_dtype: {self._elem_dtype}, s_e8m0: {self._scale_e8m0}, d: {self.qdata}, d_hp: {self.to_dtype(self._orig_dtype)}"  # noqa: E501
+        return f"MXTensor: elem_dtype: {self._elem_dtype}, s_e8m0: {self._scale_e8m0}, d: {self.qdata}, act_quant_kwargs: {self.act_quant_kwargs}"  # noqa: E501

    @classmethod
    def __torch_dispatch__(cls, func, types, args, kwargs=None):
@@ -582,8 +599,10 @@ def to_mx(
        block_size: int = BLOCK_SIZE_DEFAULT,
        scaling_mode: ScaleCalculationMode = ScaleCalculationMode.FLOOR,
        use_fp4_custom_triton_dequant_kernel: bool = False,
+        # TODO(future PR): switch default gemm to cublas
        gemm_kernel_choice: MXGemmKernelChoice = MXGemmKernelChoice.EMULATED,
        pack_fp6: bool = False,
+        act_quant_kwargs: Optional[QuantizeTensorToMXKwargs] = None,
    ):
        scale_e8m0_biased, data_lp = to_mx(
            data_hp, elem_dtype, block_size, scaling_mode, pack_fp6
@@ -601,6 +620,7 @@
            use_fp4_custom_triton_dequant_kernel,
            gemm_kernel_choice,
            pack_fp6,
+            act_quant_kwargs,
        )
        return DTensor.from_local(
            inner_mx_tensor,
@@ -619,6 +639,7 @@ def to_mx(
            use_fp4_custom_triton_dequant_kernel,
            gemm_kernel_choice,
            pack_fp6,
+            act_quant_kwargs,
        )

        # Do not force the MXTensor type on the returned tensor
```
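
`QuantizeTensorToMXKwargs` simply mirrors `MXTensor.to_mx`'s parameters so a quantization recipe can be recorded on the tensor and replayed later; its defaults (fp8 e4m3 elements, block size 32, FLOOR scaling, emulated gemm) match the tensor-level defaults, and a tensor created without a recipe keeps `act_quant_kwargs=None`. A minimal sketch, assuming those defaults (values illustrative):

```python
import torch
from torchao.prototype.mx_formats.mx_tensor import MXTensor, QuantizeTensorToMXKwargs

k = QuantizeTensorToMXKwargs()     # all defaults, per the dataclass definition above
print(k.elem_dtype, k.block_size)  # torch.float8_e4m3fn 32

t = MXTensor.to_mx(
    torch.randn(32, 32, dtype=torch.bfloat16),
    torch.float8_e4m3fn,
    act_quant_kwargs=k,
)
print(t.act_quant_kwargs is k)  # True; the recipe also appears in repr(t)
```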
