
Add 4-bit channel-wise quantization capability for the MatMulNBits op #631

Closed · wants to merge 2 commits
20 changes: 20 additions & 0 deletions onnxruntime/core/mlas/lib/q4_dq.cpp
@@ -1614,6 +1614,26 @@ MlasQuantizeBlockwise(
}
break;

case 3072:
if (columnwise) {
BlockwiseQuantizer<T, 3072, qbits, true>::quantizeAndTranspose(
dst, scales, zero_points, src, rows, columns, leading_dimension, thread_pool);
} else {
BlockwiseQuantizer<T, 3072, qbits, false>::quantizeAndTranspose(
dst, scales, zero_points, src, rows, columns, leading_dimension, thread_pool);
}
break;

case 8192:
if (columnwise) {
BlockwiseQuantizer<T, 8192, qbits, true>::quantizeAndTranspose(
dst, scales, zero_points, src, rows, columns, leading_dimension, thread_pool);
} else {
BlockwiseQuantizer<T, 8192, qbits, false>::quantizeAndTranspose(
dst, scales, zero_points, src, rows, columns, leading_dimension, thread_pool);
}
break;

default:
// Only block sizes 16, 32, 64, 128, 256, 3072, and 8192 are supported.
break;
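For intuition, here is a minimal NumPy sketch (not the MLAS implementation; the function name and the simplified scale formula are illustrative only) of what a block size equal to K means: when one quantization block spans the entire K dimension, each output column gets a single scale, which is the channel-wise case the new 3072 and 8192 dispatch entries make possible.

```python
import numpy as np

def channelwise_int4_quant_sketch(w: np.ndarray):
    """Symmetric int4 quantization with one scale per column of a (K, N) weight.

    Conceptually equivalent to blockwise quantization with block_size == K
    (e.g. K == 3072 or 8192, the cases added above); the scale formula is a
    simplified illustration, not the exact MLAS algorithm.
    """
    k, n = w.shape
    scales = np.abs(w).max(axis=0) / 7.0                  # one scale per column, shape (n,)
    scales = np.maximum(scales, np.finfo(w.dtype).tiny)   # avoid division by zero
    q = np.clip(np.round(w / scales), -8, 7).astype(np.int8)  # int4 range [-8, 7]
    return q, scales

w = np.random.randn(3072, 16).astype(np.float32)
q, scales = channelwise_int4_quant_sketch(w)
assert q.shape == w.shape and scales.shape == (16,)
```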
28 changes: 25 additions & 3 deletions onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py
@@ -191,6 +191,7 @@ def __init__(
quant_format=QuantFormat.QOperator,
op_types_to_quantize: tuple[str, ...] | None = None,
quant_axes: tuple[tuple[str, int], ...] | None = None,
channel_wised_quantize: bool = False,
):
"""
This is a class for weight only affine quantization configuration.
@@ -212,6 +213,8 @@ def __init__(
set of operator types to quantize.
quant_axes (dict[str, int], optional):
op:axis, which axis to quantize for an op. Default {MatMul: 0, Gather: 1}
channel_wised_quantize (bool, optional):
whether to use K (rows) as the block size, i.e. channel-wise quantization. Defaults to False.
"""
super().__init__(
algorithm="DEFAULT",
Expand All @@ -223,6 +226,7 @@ def __init__(
self.is_symmetric = is_symmetric
self.bits = 4
self.accuracy_level = accuracy_level
self.channel_wised_quantize = channel_wised_quantize


class NVAWQWeightOnlyQuantConfig(WeightOnlyQuantConfig):
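A hypothetical usage sketch of the new flag: the `channel_wised_quantize` argument comes from this diff, while the surrounding `MatMul4BitsQuantizer` calls, the save helper, and the model paths are assumptions about the existing tool rather than part of this change.

```python
import onnx
from onnxruntime.quantization.matmul_4bits_quantizer import (
    DefaultWeightOnlyQuantConfig,
    MatMul4BitsQuantizer,
)

model = onnx.load("model.onnx")  # placeholder path
config = DefaultWeightOnlyQuantConfig(
    block_size=128,               # ignored for MatMul weights when channel_wised_quantize is True
    is_symmetric=True,            # needed for the int4 [-8, 7] repacking path below
    channel_wised_quantize=True,  # new flag: use K (rows) as the block size
)
quantizer = MatMul4BitsQuantizer(model, algo_config=config)
quantizer.process()
quantizer.model.save_model_to_file("model_int4.onnx")  # assumed save helper on the wrapped model
```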
@@ -728,7 +732,8 @@ def int4_block_quant(self, fp32weight: npt.ArrayLike) -> tuple[np.ndarray, np.nd
raise ValueError("Current int4 block quantization only supports 2D tensors!")
rows, cols = fp32weight.shape

block_size = self.config.block_size
# block size equals rows (K) when channel-wise quantization is enabled
block_size = rows if self.config.channel_wised_quantize else self.config.block_size
k_blocks = (rows + block_size - 1) // block_size

if self.config.quant_format == QuantFormat.QOperator:
@@ -745,6 +750,22 @@ def int4_block_quant(self, fp32weight: npt.ArrayLike) -> tuple[np.ndarray, np.nd
quantize_matmul_4bits(
packed, fp32weight, scales, zero_point, block_size, cols, rows, self.config.is_symmetric
)

# Re-quantize to int4 [-8, 7] when channel-wise and symmetric quantization are both enabled.
# The packed uint4 data is symmetrically quantized and offset by +8 into uint4 [0, 15]; shift it back to int4 [-8, 7].
# This saves a Sub op at inference time and matches the optimization pattern Intel NPUs use for better performance.
# Ref: https://github.com/openvinotoolkit/openvino.genai/tree/master/samples/python/text_generation#npu-support
keep_int4 = self.config.channel_wised_quantize and self.config.is_symmetric
if keep_int4:
# Unpack the uint4 quantized data, convert to int4 by subtracting 8, and repack as uint8
high_4bit_u = (packed >> 4) & 0x0F
low_4bit_u = packed & 0x0F
high_4bit_i = high_4bit_u.astype(np.int8) - 8
low_4bit_i = low_4bit_u.astype(np.int8) - 8
high_4bit_requantized = np.clip(high_4bit_i, -8, 7) & 0x0F
low_4bit_requantized = np.clip(low_4bit_i, -8, 7) & 0x0F
packed = (high_4bit_requantized << 4) | low_4bit_requantized
packed = packed.astype(np.uint8)
else:
packed = np.zeros((rows * cols + 1) // 2, dtype="uint8")
zero_point = np.zeros((cols * k_blocks + 1) // 2, dtype="uint8")
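A standalone NumPy illustration of the repacking above (the byte values are arbitrary examples): each uint8 holds two uint4 nibbles, and subtracting 8 from each nibble maps the offset-binary uint4 range [0, 15] back to two's-complement int4 [-8, 7] stored in the same nibble positions.

```python
import numpy as np

packed = np.array([0x8F, 0x07], dtype=np.uint8)    # nibble pairs (8, 15) and (0, 7)

high_u = (packed >> 4) & 0x0F                      # [ 8,  0]
low_u = packed & 0x0F                              # [15,  7]
high_i = high_u.astype(np.int8) - 8                # [ 0, -8]
low_i = low_u.astype(np.int8) - 8                  # [ 7, -1]

# Re-pack the int4 values into uint8 storage (two's-complement nibbles).
repacked = ((high_i & 0x0F).astype(np.uint8) << 4) | (low_i & 0x0F).astype(np.uint8)
print([hex(b) for b in repacked])                  # ['0x7', '0x8f']
```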
@@ -801,7 +822,7 @@ def quantize_matmul(self, node: NodeProto, graph_stack: list[GraphProto]) -> lis
kwargs["K"] = rows
kwargs["N"] = cols
kwargs["bits"] = 4
kwargs["block_size"] = self.config.block_size
kwargs["block_size"] = rows if self.config.channel_wised_quantize else self.config.block_size
if self.config.accuracy_level is not None:
kwargs["accuracy_level"] = self.config.accuracy_level

@@ -826,7 +847,8 @@ def quantize_matmul(self, node: NodeProto, graph_stack: list[GraphProto]) -> lis
)
dq_input_names.append(zp_tensor.name)
b_graph.initializer.extend([zp_tensor])
dq_kwargs = {"axis": 0, "block_size": self.config.block_size}
rows, cols = b_ndarray.shape
dq_kwargs = {"axis": 0, "block_size": rows if self.config.channel_wised_quantize else self.config.block_size}
dq_node = onnx.helper.make_node(
"DequantizeLinear",
inputs=dq_input_names,
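For the QDQ path above, a small sketch of the DequantizeLinear configuration produced when channel-wise quantization is enabled; the tensor names and shapes are illustrative assumptions for a (K, N) MatMul weight, where block_size becomes K so the scales initializer carries one entry per output column.

```python
import onnx

K, N = 3072, 768  # example weight shape; 3072 is also one of the block sizes added in q4_dq.cpp
dq_node = onnx.helper.make_node(
    "DequantizeLinear",
    inputs=["B_quantized", "B_scales"],  # hypothetical tensor names
    outputs=["B_dequantized"],
    name="B_DequantizeLinear",
    axis=0,           # quantize along K, as in dq_kwargs above
    block_size=K,     # one block spans the whole K dimension (channel-wise)
)
# With axis=0 and block_size=K, the scales tensor has shape (1, N):
# a single scale per output channel of the (K, N) weight.
```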