
Commit 90136b3

szyszyzys authored and facebook-github-bot committed
Move codebook (LUT) generation methods into common utils. Update functions to be more compatible with coreml. (#2772)
Summary: Pull Request resolved: #2772
Reviewed By: metascroy
Differential Revision: D79595460
1 parent e6b38bb commit 90136b3


10 files changed, +1043 -665 lines changed

Lines changed: 83 additions & 0 deletions
@@ -0,0 +1,83 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import unittest

import torch
import torch.nn.functional as F
from parameterized import param, parameterized

from torchao.prototype.quantization.codebook_coreml.codebook_ops import (
    choose_qparams_and_quantize_codebook_coreml as choose_qparams_and_quantize_codebook_coreml_original,
)
from torchao.prototype.quantization.codebook_coreml.codebook_ops import (
    choose_qparams_and_quantize_codebook_coreml_refactored,
    dequantize_codebook,
)
from torchao.quantization.quant_primitives import (
    _DTYPE_TO_BIT_WIDTH,
)


class TestCoreMLQuantCompatibility(unittest.TestCase):
    TEST_CASES = [
        param(grouping_type="column", group_size=128, tensor_shape=(16, 1024)),
    ]

    @parameterized.expand(TEST_CASES)
    def test_functional_equivalence(self, grouping_type, group_size, tensor_shape):
        input_tensor = torch.randn(tensor_shape, dtype=torch.float32)
        code_dtype = torch.uint4
        nbits = _DTYPE_TO_BIT_WIDTH[code_dtype]
        torch.manual_seed(42)

        # --- Get results from reference implementations ---
        block_size = [-1, group_size]
        expected_luts, expected_codes = (
            choose_qparams_and_quantize_codebook_coreml_original(
                input_tensor, code_dtype, block_size.copy()
            )
        )

        actual_luts, actual_codes = (
            choose_qparams_and_quantize_codebook_coreml_refactored(
                input_tensor, code_dtype, block_size.copy()
            )
        )

        # Ensure codes are long for dequantize op compatibility
        expected_codes = expected_codes.to(torch.long)
        actual_codes = actual_codes.to(torch.long)

        self.assertEqual(
            actual_luts.shape,
            expected_luts.shape,
            "LUT shapes do not match after processing",
        )
        self.assertEqual(
            actual_codes.shape, expected_codes.shape, "Code shapes do not match"
        )

        dequant_expected = dequantize_codebook(
            expected_codes, expected_luts, nbits, block_size
        )
        dequant_actual = dequantize_codebook(
            actual_codes, actual_luts, nbits, block_size
        )

        expected_error = torch.mean((input_tensor - dequant_expected) ** 2).item()
        actual_error = torch.mean((input_tensor - dequant_actual) ** 2).item()

        self.assertAlmostEqual(
            actual_error,
            expected_error,
            delta=1e-5,
            msg="Dequantization error differs significantly between implementations",
        )


if __name__ == "__main__":
    unittest.main()
Lines changed: 170 additions & 0 deletions
@@ -0,0 +1,170 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import copy
import tempfile
import unittest

import torch
import torch.nn as nn
from parameterized import param, parameterized
from torch import uint1, uint2, uint3, uint4

from torchao.prototype.quantization.codebook_groupwise.api import (
    GroupwiseLutWeightConfig,
)
from torchao.prototype.quantization.codebook_utils.codebook_utils import (
    group_size_to_block_shapes,
)
from torchao.quantization.quant_api import quantize_


class TestGroupwiseLowbitWeightLut(unittest.TestCase):
    """
    Test suite for the GroupwiseLutWeight quantization scheme, updated for the
    new simplified API.
    """

    TEST_CASES = [
        param(
            code_dtype=code_dtype,
            lut_group_size=lut_group_size,
            weight_dtype=weight_dtype,
            has_bias=has_bias,
        )
        for code_dtype in [uint1, uint2, uint3, uint4]
        for lut_group_size in [256, 512]
        for weight_dtype in [torch.float32]
        for has_bias in [True, False]
    ]

    # --------------------------------------------------------------------------
    # Test 1: End-to-End Model Accuracy
    # --------------------------------------------------------------------------
    @parameterized.expand(TEST_CASES)
    def test_e2e_accuracy_vs_reference(
        self,
        code_dtype,
        lut_group_size,
        weight_dtype,
        has_bias,
    ):
        """
        Tests the numerical accuracy of the full quantized model against a reference.
        This now uses the `use_qdq_reference` flag instead of layout objects.
        """
        m, k, n = 3, 64, 32
        activations = torch.randn(m, k, dtype=weight_dtype)
        model = nn.Sequential(nn.Linear(k, n, bias=has_bias)).to(dtype=weight_dtype)

        # --- 2. Update tensor_shape to reflect the new (k, n) layout ---
        lut_block_shape = group_size_to_block_shapes(
            lut_group_size=lut_group_size, tensor_shape=(n, k)
        )

        # --- Quantize using C++ ops ---
        quantized_model = copy.deepcopy(model)
        perf_config = GroupwiseLutWeightConfig(
            code_dtype=code_dtype,
            weight_dtype=weight_dtype,
            lut_block_shape=lut_block_shape,
            use_qdq_reference=False,
        )
        quantize_(quantized_model, perf_config)
        with torch.no_grad():
            actual_result = quantized_model(activations)

        # --- Quantize for Reference (using Python ops) ---
        reference_model = copy.deepcopy(model)
        ref_config = GroupwiseLutWeightConfig(
            code_dtype=code_dtype,
            weight_dtype=weight_dtype,
            lut_block_shape=lut_block_shape,
            use_qdq_reference=True,
        )
        quantize_(reference_model, ref_config)
        with torch.no_grad():
            expected_result = reference_model(activations)
        # Compare results
        self.assertTrue(
            torch.allclose(actual_result, expected_result, atol=1e-2, rtol=1e-2)
        )

    def tearDown(self):
        """
        Clear the TorchDynamo cache after each test case to prevent
        recompilation errors in parameterized tests.
        """
        super().tearDown()
        torch._dynamo.reset()

    # --------------------------------------------------------------------------
    # Test 2: Deployment Readiness (Updated for new API)
    # --------------------------------------------------------------------------
    @parameterized.expand(TEST_CASES)
    def test_export_compile_aoti(
        self,
        code_dtype,
        lut_group_size,
        weight_dtype,
        has_bias,
    ):
        """
        Tests that the quantized model can be exported and compiled.
        """
        k, n = 64, 32
        activations = torch.randn(2, k, dtype=weight_dtype)
        model = (
            nn.Sequential(nn.Linear(k, n, bias=has_bias)).to(dtype=weight_dtype).eval()
        )
        lut_block_shape = group_size_to_block_shapes(
            lut_group_size=lut_group_size,
            tensor_shape=(n, k),
        )

        # Configure the quantization using the new API
        config = GroupwiseLutWeightConfig(
            code_dtype=code_dtype,
            weight_dtype=weight_dtype,
            lut_block_shape=lut_block_shape,
            use_qdq_reference=False,
        )
        quantize_(model, config)

        with torch.no_grad():
            eager_results = model(activations)

        # Export and Compile
        exported_model = torch.export.export(model, (activations,))
        compiled_model = torch.compile(model, fullgraph=True)

        with tempfile.TemporaryDirectory() as tmpdir, torch.no_grad():
            # Check exported model
            exported_results = exported_model.module()(activations)
            self.assertTrue(
                torch.allclose(eager_results, exported_results, atol=1e-3, rtol=1e-3)
            )

            # Check compiled model
            compiled_results = compiled_model(activations)
            self.assertTrue(
                torch.allclose(eager_results, compiled_results, atol=1e-3, rtol=1e-3)
            )

            # Check AOTI compiled model using the packaging API
            package_path = f"{tmpdir}/model.pt2"
            torch._inductor.aoti_compile_and_package(
                exported_model, package_path=package_path
            )
            aoti_model = torch._inductor.aoti_load_package(package_path)
            aoti_results = aoti_model(activations)
            self.assertTrue(
                torch.allclose(eager_results, aoti_results, atol=1e-3, rtol=1e-3)
            )


if __name__ == "__main__":
    unittest.main()

torchao/prototype/quantization/codebook_coreml/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -1,6 +1,7 @@
 from .api import CodebookWeightOnlyConfig
 from .codebook_ops import (
     choose_qparams_and_quantize_codebook_coreml,
+    choose_qparams_and_quantize_codebook_coreml_refactored,
     dequantize_codebook,
 )
 from .codebook_quantized_tensor import CodebookQuantizedTensor
@@ -9,5 +10,6 @@
     "CodebookQuantizedTensor",
     "CodebookWeightOnlyConfig",
     "choose_qparams_and_quantize_codebook_coreml",
+    "choose_qparams_and_quantize_codebook_coreml_refactored",
     "dequantize_codebook",
 ]
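With this re-export, the refactored op is reachable from the package root. Below is a minimal round-trip sketch modeled on the new compatibility test above; the weight shape and group size are illustrative assumptions rather than part of this commit, and coremltools must be installed for the k-means step.

import torch

from torchao.prototype.quantization.codebook_coreml import (
    choose_qparams_and_quantize_codebook_coreml_refactored,
    dequantize_codebook,
)
from torchao.quantization.quant_primitives import _DTYPE_TO_BIT_WIDTH

# Illustrative 16 x 1024 weight with one 4-bit LUT per group of 128 columns.
weight = torch.randn(16, 1024)
code_dtype = torch.uint4
block_size = [-1, 128]  # -1 spans the whole dimension (all rows share each LUT)

luts, codes = choose_qparams_and_quantize_codebook_coreml_refactored(
    weight, code_dtype, block_size.copy()
)
# Codes are cast to long before dequantization, as in the test above.
dequant = dequantize_codebook(
    codes.to(torch.long), luts, _DTYPE_TO_BIT_WIDTH[code_dtype], block_size
)
print(torch.mean((weight - dequant) ** 2).item())  # reconstruction MSE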

torchao/prototype/quantization/codebook_coreml/codebook_ops.py

Lines changed: 106 additions & 0 deletions
@@ -117,6 +117,112 @@ def choose_qparams_and_quantize_codebook_coreml(
 
     return res_lut, res_w
 
+def choose_qparams_and_quantize_codebook_coreml_refactored(
+    input_tensor: torch.Tensor,
+    code_dtype: torch.dtype,
+    block_size: List[int],
+    force_kmeans1d: bool = False,
+    cluster_dim: int = 1,
+    vector_axis: Optional[int] = None,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Initialize the codebook using k-means clustering on blocks of the input tensor.
+
+    Args:
+        input_tensor (torch.Tensor): The input tensor to be quantized.
+        code_dtype (torch.dtype): The dtype for the codes. [torch.uint1, ..., torch.uint8]
+        block_size (List[int]): Block sizes for how many elements in each dimension share
+            the same lookup table (len(block_size) == input_tensor.dim()).
+            Each dimension of input_tensor must be divisible by the corresponding element of block_size.
+            Lookup tables are indexed by {(di // bi) for i in input_tensor.dim()}.
+            For example, if the input tensor has shape (N, K) and block_size is (N, group_size),
+            there is one lookup table per group_size columns, i.e., (K // group_size) lookup tables in total.
+        force_kmeans1d (bool): Use kmeans1d regardless of the number of weights.
+        cluster_dim (int): The vector size for vector lookup table quantization.
+            E.g., when cluster_dim is 4, instead of quantizing each scalar value one by one, the tensor
+            is quantized in units of 4-element vectors; each vector of the original tensor is mapped
+            to a vector in the codebook (lookup table) based on the indices.
+        vector_axis (Optional[int]): Used in vector quantization; see
+            https://github.com/apple/coremltools/blob/1c0e5cb1c1e3ab759af107b54f2be18b7c03f8aa/coremltools/optimize/_utils.py#L371
+
+    Returns:
+        Tuple[torch.Tensor, torch.Tensor]: The codebook (lookup table) tensor and the quantized tensor (codes, torch.uint8).
+        The LUT has dimension (g0, ..., g(N-1), 2**nbits, vec_dim), where:
+        * The first N dimensions index over the different tables (gi = input_tensor.shape[i] // block_size[i] in each dimension)
+        * Dimension N + 1 indexes over the nbit indices (2**nbits)
+        * Dimension N + 2 indexes over the lookup values (shape = 1 for scalar)
+    """
+    assert code_dtype in list(_SUB_BYTE_UINT_BOUNDS.keys()) + [torch.uint8]
+    nbits = _DTYPE_TO_BIT_WIDTH[code_dtype]
+    assert nbits >= 1 and nbits <= 8, f"nbits must be in [1, 8], got {nbits}"
+    assert input_tensor.dim() == 2, "Currently only rank 2 tensors are supported"
+    assert cluster_dim == 1, f"only cluster_dim == 1 is supported right now, got {cluster_dim}"
+
+    original_shape = input_tensor.shape
+    N, K = original_shape
+    input_tensor = input_tensor.detach()
+
+    # --- Process block_size ---
+    assert len(block_size) == 2
+    processed_block_size = block_size.copy()
+    if processed_block_size[0] == -1:
+        processed_block_size[0] = N
+    if processed_block_size[1] == -1:
+        processed_block_size[1] = K
+
+    row_block_size, col_block_size = processed_block_size
+    assert N % row_block_size == 0, f"Tensor rows ({N}) not divisible by row block size ({row_block_size})"
+    assert K % col_block_size == 0, f"Tensor cols ({K}) not divisible by col block size ({col_block_size})"
+
+    # --- Determine and execute grouping strategy ---
+    is_col_grouping = col_block_size < K and row_block_size == N
+    is_row_grouping = row_block_size < N and col_block_size == K
+    assert is_col_grouping or is_row_grouping, "Invalid block_size. Must group by either rows or columns, not both or neither."
+
+    res_lut_list = []
+    from coremltools.models.neural_network.quantization_utils import (
+        _get_kmeans_lookup_table_and_weight,
+    )
+    if is_col_grouping:
+        # STRATEGY 1: Group by COLUMNS (original behavior)
+        num_luts = K // col_block_size
+        reshaped_tensor = input_tensor.reshape(N, num_luts, col_block_size)
+        res_codes = torch.zeros_like(reshaped_tensor, dtype=torch.uint8)
+
+        for i in range(num_luts):
+            block_to_quantize = reshaped_tensor[:, i, :]
+            lut, w = _get_kmeans_lookup_table_and_weight(
+                nbits, block_to_quantize, force_kmeans1d, cluster_dim, vector_axis
+            )
+            res_lut_list.append(torch.from_numpy(lut))
+            res_codes[:, i, :] = torch.from_numpy(w.reshape(N, col_block_size))
+
+        # Shape to match the CoreML spec: (1, num_luts, 2**nbits, 1)
+        final_luts = torch.stack(res_lut_list, dim=0).reshape(1, num_luts, 2**nbits, 1)
+
+    else:  # is_row_grouping
+        # STRATEGY 2: Group by ROWS (the groupwise wrapper's behavior)
+        num_luts = N // row_block_size
+        reshaped_tensor = input_tensor.reshape(num_luts, row_block_size, K)
+        res_codes = torch.zeros_like(reshaped_tensor, dtype=torch.uint8)
+
+        for i in range(num_luts):
+            block_to_quantize = reshaped_tensor[i, :, :]
+            lut, w = _get_kmeans_lookup_table_and_weight(
+                nbits, block_to_quantize, force_kmeans1d, cluster_dim, vector_axis
+            )
+            res_lut_list.append(torch.from_numpy(lut))
+            res_codes[i, :, :] = torch.from_numpy(w.reshape(row_block_size, K))
+
+        final_luts_stacked = torch.stack(res_lut_list, dim=0)  # Shape: (num_luts, 2**nbits, 1)
+
+        # Reshape to the consistent 4D format: (num_row_groups, 1, 2**nbits, 1)
+        final_luts = final_luts_stacked.reshape(num_luts, 1, 2**nbits, 1)
+
+    # Reshape codes back to the original tensor shape
+    final_codes = res_codes.reshape(*original_shape)
+
+    return final_luts, final_codes
 
 @register_custom_op
 def dequantize_codebook(
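To make the documented LUT layout concrete, here is a hedged shape walk-through of the two grouping strategies. The tensor sizes are illustrative assumptions (not from this commit), and coremltools is required at runtime because the op delegates to _get_kmeans_lookup_table_and_weight.

import torch

from torchao.prototype.quantization.codebook_coreml.codebook_ops import (
    choose_qparams_and_quantize_codebook_coreml_refactored,
)

# Illustrative 16 x 1024 weight (assumed shape, matching the new compatibility test).
w = torch.randn(16, 1024)

# Column grouping: all 16 rows share one 4-bit LUT per block of 128 columns,
# giving 1024 // 128 = 8 tables.
luts, codes = choose_qparams_and_quantize_codebook_coreml_refactored(
    w, torch.uint4, [-1, 128]
)
assert luts.shape == (1, 8, 16, 1)  # (row_groups, col_groups, 2**nbits, vec_dim)
assert codes.shape == (16, 1024) and codes.dtype == torch.uint8

# Row grouping: each block of 4 rows gets its own LUT spanning all 1024 columns,
# giving 16 // 4 = 4 tables.
luts, codes = choose_qparams_and_quantize_codebook_coreml_refactored(
    w, torch.uint4, [4, -1]
)
assert luts.shape == (4, 1, 16, 1)
assert codes.shape == (16, 1024)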
