pytorch · lucylq · Nov 22, 2025
@@ -20,9 +20,15 @@
 )
 from executorch.backends.xnnpack.test.tester import Quantize as XNNPackQuantize, Tester
 from executorch.backends.xnnpack.test.tester.tester import ToEdgeTransformAndLower
+
+from executorch.exir import ExecutorchProgramManager
+from executorch.exir._serialize import _deserialize_pte_binary
 from executorch.exir.passes.external_constants_pass import (
     delegate_external_constants_pass_unlifted,
 )
+from executorch.extension.flat_tensor.serialize.serialize import (
+    _deserialize_to_flat_tensor,
+)
 
 from torchao.quantization.granularity import PerGroup
 from torchao.quantization.quant_api import Int8DynamicActivationIntxWeightConfig
@@ -87,7 +93,7 @@ def _test_linear(
         self,
         partitioner: XnnpackPartitioner,
         quantization_stage: Union[BaseStages.Quantize, BaseStages.Quantize_],
-    ):
+    ) -> ExecutorchProgramManager:
         eager_model = self.ModuleLinear(
             in_size=1,
             input_channels=32,
@@ -106,7 +112,7 @@ def _test_linear(
         exec = tester.get_artifact()
         program_buffer = exec.buffer
         self.assertEqual(len(exec._tensor_data), 1)
-        data_buffer = bytes(exec._tensor_data.pop("model"))
+        data_buffer = bytes(exec._tensor_data["model"])
         self.assertTrue(len(data_buffer) > 200)
         from executorch.extension.pybindings import portable_lib as runtime
 
@@ -122,6 +128,8 @@ def _test_linear(
         #         test_inputs
         #     )
 
+        return exec
+
     def test_quantize_(self):
         # Quantize with torchao quantize_ API.
         DynamicallyQuantizedPartitioner = XnnpackPartitioner(
@@ -132,9 +140,16 @@ def test_quantize_(self):
             weight_dtype=torch.int4,
             weight_granularity=PerGroup(32),
         )
-        self._test_linear(
+        exec = self._test_linear(
             DynamicallyQuantizedPartitioner, BaseStages.Quantize_(config=linear_config)
         )
+        # PTE file has no named data.
+        pte_file = _deserialize_pte_binary(exec.buffer)
+        self.assertEqual(pte_file.named_data, None)
+
+        # PTD file contains quantized weight and scale.
+        ptd_file = _deserialize_to_flat_tensor(bytes(exec._tensor_data["model"]))
+        self.assertEqual(len(ptd_file.named_data), 2)
 
     def test_pt2e_quantize(self):
         # Quantize with pt2e quantize.
@@ -156,6 +171,15 @@ def test_pt2e_quantize(self):
                 partitioner = XnnpackPartitioner(
                     config_precisions=precision, per_op_mode=per_op_mode
                 )
-                self._test_linear(
+                exec = self._test_linear(
                     partitioner, XNNPackQuantize(quantization_config=quant_config)
                 )
+                # PTE file has no named data.
+                pte_file = _deserialize_pte_binary(exec.buffer)
+                self.assertEqual(pte_file.named_data, None)
+
+                # PTD file contains the quantized weight and, potentially, the scale.
+                ptd_file = _deserialize_to_flat_tensor(
+                    bytes(exec._tensor_data["model"])
+                )
+                self.assertTrue(len(ptd_file.named_data) >= 1)
@@ -194,9 +194,6 @@ def filter_fn(m, fqn):
             ),
             filter_fn=filter_fn,
         )
-
-        model = unwrap_tensor_subclass(model)
-
         # TODO: deal with checkpoint / computation dtype decoupling.
 
         if verbose:

@@ -38,7 +38,6 @@
 from torch.nn.attention import SDPBackend
 from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e
 from torchao.quantization.pt2e.quantizer import ComposableQuantizer, Quantizer
-from torchao.utils import unwrap_tensor_subclass
 
 FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s"
 logging.basicConfig(level=logging.INFO, format=FORMAT)
@@ -203,11 +202,6 @@ def _get_edge_config(self) -> EdgeCompileConfig:
         return edge_config
 
     def _export(self, module: Optional[torch.nn.Module] = None) -> ExportedProgram:
-        if module is not None:
-            unwrap_tensor_subclass(module)
-        else:
-            unwrap_tensor_subclass(self.model)
-
         dynamic_shape = self._get_dynamic_shape()
         # 1. torch.nn.attention.sdpa_kernel([SDPBackend.MATH]) is for bypassing the dynamo error when tracing
         # 2. torch.no_grad() is for getting rid of the dropout (not sure why training ops will show up)
@@ -226,6 +220,8 @@ def _export(self, module: Optional[torch.nn.Module] = None) -> ExportedProgram:
                 dynamic_shapes=dynamic_shape,
                 strict=True,
             )
+        # Functionalize the graph and decompose torchao's quantized tensor subclasses.
+        exported_module = exported_module.run_decompositions({})
         return exported_module
 
     def export(self) -> "LLMEdgeManager":