5 changes: 4 additions & 1 deletion docs/source/Instruction/导出与推送.md
@@ -27,12 +27,15 @@ pip install autoawq -U
# The versions of auto_gptq and CUDA are correlated; please choose the version according to `https://github.com/PanQiWei/AutoGPTQ#quick-installation`.
pip install auto_gptq optimum -U

# For GPTQ v2 quantization:
pip install gptqmodel optimum -U

# For BNB quantization:
pip install bitsandbytes -U
```

We provide a series of scripts to demonstrate SWIFT's quantization export capabilities:
-- Supports [AWQ](https://github.com/modelscope/ms-swift/blob/main/examples/export/quantize/awq.sh)/[GPTQ](https://github.com/modelscope/ms-swift/blob/main/examples/export/quantize/gptq.sh)/[BNB](https://github.com/modelscope/ms-swift/blob/main/examples/export/quantize/bnb.sh) quantization exports.
+- Supports [AWQ](https://github.com/modelscope/ms-swift/blob/main/examples/export/quantize/awq.sh)/[GPTQ](https://github.com/modelscope/ms-swift/blob/main/examples/export/quantize/gptq.sh)/[GPTQ v2](https://github.com/modelscope/ms-swift/blob/main/examples/export/quantize/gptq_v2.sh)/[BNB](https://github.com/modelscope/ms-swift/blob/main/examples/export/quantize/bnb.sh) quantization exports.
- Multimodal quantization: Supports quantizing multimodal models with GPTQ and AWQ, though AWQ supports only a limited set of multimodal models. Refer to [here](https://github.com/modelscope/ms-swift/tree/main/examples/export/quantize/mllm).
- Support for more model series: Supports quantization exports for [Bert](https://github.com/modelscope/ms-swift/tree/main/examples/export/quantize/bert) and [Reward Model](https://github.com/modelscope/ms-swift/tree/main/examples/export/quantize/reward_model).
- Models quantized and exported with SWIFT support accelerated inference with vllm/sglang/lmdeploy, and also support continued SFT/RLHF with QLoRA.
5 changes: 4 additions & 1 deletion docs/source_en/Instruction/Export-and-push.md
@@ -26,13 +26,16 @@ pip install autoawq -U
# The versions of auto_gptq and CUDA are correlated; please choose the version according to `https://github.com/PanQiWei/AutoGPTQ#quick-installation`.
pip install auto_gptq optimum -U

# For GPTQ v2 quantization:
pip install gptqmodel optimum -U

# For BNB quantization:
pip install bitsandbytes -U
```

We provide a series of scripts to demonstrate SWIFT's quantization export capabilities:

-- Supports [AWQ](https://github.com/modelscope/ms-swift/blob/main/examples/export/quantize/awq.sh)/[GPTQ](https://github.com/modelscope/ms-swift/blob/main/examples/export/quantize/gptq.sh)/[BNB](https://github.com/modelscope/ms-swift/blob/main/examples/export/quantize/bnb.sh) quantization exports.
+- Supports [AWQ](https://github.com/modelscope/ms-swift/blob/main/examples/export/quantize/awq.sh)/[GPTQ](https://github.com/modelscope/ms-swift/blob/main/examples/export/quantize/gptq.sh)/[GPTQ v2](https://github.com/modelscope/ms-swift/blob/main/examples/export/quantize/gptq_v2.sh)/[BNB](https://github.com/modelscope/ms-swift/blob/main/examples/export/quantize/bnb.sh) quantization exports.
- Multimodal quantization: Supports quantizing multimodal models with GPTQ and AWQ, though AWQ supports only a limited set of multimodal models. Refer to [here](https://github.com/modelscope/ms-swift/tree/main/examples/export/quantize/mllm).
- Support for more model series: Supports quantization exports for [BERT](https://github.com/modelscope/ms-swift/tree/main/examples/export/quantize/bert) and [Reward Model](https://github.com/modelscope/ms-swift/tree/main/examples/export/quantize/reward_model).
- Models quantized and exported with SWIFT support accelerated inference with vllm/sglang/lmdeploy, and also support continued SFT/RLHF with QLoRA (see the sketch below).
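For example, a minimal sketch of accelerated inference on an exported checkpoint (the model path is hypothetical; substitute the `--output_dir` of your own export step):

```bash
# Sketch: serving the quantized export with the vLLM backend.
CUDA_VISIBLE_DEVICES=0 \
swift infer \
    --model Qwen2.5-1.5B-Instruct-GPTQ-Int4 \
    --infer_backend vllm
```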
13 changes: 13 additions & 0 deletions examples/export/quantize/gptq_v2.sh
@@ -0,0 +1,13 @@
# OMP_NUM_THREADS=14: see issue https://github.com/AutoGPTQ/AutoGPTQ/issues/439
OMP_NUM_THREADS=14 \
CUDA_VISIBLE_DEVICES=0 \
swift export \
    --model Qwen/Qwen2.5-1.5B-Instruct \
    --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \
              'AI-ModelScope/alpaca-gpt4-data-en#500' \
Review comment (Contributor, medium):

For better readability and to follow common shell scripting conventions, it's better to indent continued lines consistently (e.g., with 4 or 8 spaces) rather than aligning them with the argument value from the previous line.

Suggested change:
-              'AI-ModelScope/alpaca-gpt4-data-en#500' \
+    'AI-ModelScope/alpaca-gpt4-data-en#500' \
    --quant_n_samples 256 \
    --quant_batch_size 1 \
    --max_length 2048 \
    --quant_method gptq_v2 \
    --quant_bits 4 \
    --output_dir Qwen2.5-1.5B-Instruct-GPTQ-Int4
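Once the export finishes, the quantized checkpoint supports continued training. A minimal sketch, assuming the output path above and a LoRA setup as in the repo's other examples (the dataset and output_dir here are illustrative):

```bash
# Sketch: QLoRA-style SFT continuing from the GPTQ v2 export above.
CUDA_VISIBLE_DEVICES=0 \
swift sft \
    --model Qwen2.5-1.5B-Instruct-GPTQ-Int4 \
    --train_type lora \
    --dataset 'AI-ModelScope/alpaca-gpt4-data-en#500' \
    --output_dir output
```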
2 changes: 1 addition & 1 deletion swift/llm/argument/export_args.py
@@ -35,7 +35,7 @@ class ExportArguments(MergeArguments, BaseArguments):
     output_dir: Optional[str] = None
 
     # awq/gptq
-    quant_method: Literal['awq', 'gptq', 'bnb', 'fp8'] = None
+    quant_method: Literal['awq', 'gptq', 'bnb', 'fp8', 'gptq_v2'] = None
     quant_n_samples: int = 256
     max_length: int = 2048
     quant_batch_size: int = 1
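For reference, a hedged sketch of the new value in use, assuming `ExportArguments` is importable from `swift.llm` and that constructing it directly runs the usual argument post-processing:

```python
# Sketch: the dataclass now accepts 'gptq_v2' as a quant_method.
from swift.llm import ExportArguments

args = ExportArguments(
    model='Qwen/Qwen2.5-1.5B-Instruct',
    quant_method='gptq_v2',  # newly allowed by the Literal above
    quant_bits=4,
    output_dir='Qwen2.5-1.5B-Instruct-GPTQ-Int4',
)
```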
9 changes: 5 additions & 4 deletions swift/llm/export/quant.py
@@ -38,9 +38,9 @@ def quantize(self):
             self.awq_model_quantize()
             self.model.save_quantized(
                 args.output_dir, safetensors=args.safe_serialization, shard_size=args.max_shard_size)
-        elif args.quant_method == 'gptq':
+        elif args.quant_method in {'gptq', 'gptq_v2'}:
             self.template.model = self.model
-            gptq_quantizer = self.gptq_model_quantize()
+            gptq_quantizer = self.gptq_model_quantize(v2=(args.quant_method == 'gptq_v2'))
             gptq_quantizer.save(
                 self.model,
                 args.output_dir,
@@ -226,7 +226,7 @@ def get_modules_in_block_to_quantize(model, block_name: str):
         res[experts_idx:experts_idx] = experts.values()
         return res
 
-    def gptq_model_quantize(self):
+    def gptq_model_quantize(self, v2: bool = False):
         from optimum.gptq import GPTQQuantizer
         args = self.args
         logger.info(f'Quantization dataset: {args.dataset}')
@@ -241,7 +241,8 @@ def gptq_model_quantize(self):
             dataset=','.join(args.dataset),
             batch_size=args.quant_batch_size,
             block_name_to_quantize=block_name_to_quantize,
-            modules_in_block_to_quantize=modules_in_block_to_quantize)
+            modules_in_block_to_quantize=modules_in_block_to_quantize,
+            checkpoint_format='gptq_v2' if v2 else 'gptq')
         gptq_quantizer.serialization_keys.append('block_name_to_quantize')
         logger.info('Start quantizing the model...')
         logger.warning('The process of packing the model takes a long time and there is no progress bar. '
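In this diff, v2 changes only the serialization format passed through to optimum. A minimal sketch of the resulting constructor call (`'c4'` stands in for the joined `args.dataset` string; other kwargs omitted):

```python
# Sketch: GPTQQuantizer writing v2-format checkpoints, as wired up above.
from optimum.gptq import GPTQQuantizer

quantizer = GPTQQuantizer(
    bits=4,
    dataset='c4',                 # stock calibration set; the diff joins args.dataset
    batch_size=1,
    checkpoint_format='gptq_v2',  # 'gptq' keeps the v1 on-disk format
)
```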