@@ -383,11 +383,12 @@ def _import_exllamav2_kernels():
383383 """Attempts to import ExLlamaV2 kernels for performance optimization."""
384384 try :
385385 from exllamav2_kernels import gemm_half_q_half , make_q_matrix # pylint: disable=E0611, E0401
386- except ImportError :
387- raise ImportError (
386+ except :
387+ logger . warning_once (
388388 "AutoGPTQ ExLlamaV2 has not been installed, Please install it using the following command: "
389389 "`pip install git+https://github.com/AutoGPTQ/AutoGPTQ.git@b8b4127`"
390390 )
391+ logger .warning_once ("try to fallback to other autogptq backends for now" )
391392
392393
393394def _create_quant_layer (layer , layer_backend , config , in_features , out_features ):
@@ -520,19 +521,19 @@ def convert_hf_model(model: nn.Module, target_device="cpu"):
     else:
         backend = "auto"

-
     ##target_backend could be None
     _, backend = parse_target_device_and_backend(backend)

-    if hasattr(quantization_config, "packing_format"):  # pragma: no cover
+    if hasattr(quantization_config,
+               "packing_format") and "auto-round" in quantization_config.quant_method:  # pragma: no cover
         packing_format = quantization_config.packing_format
     elif 'gptq' in quantization_config.quant_method:  # pragma: no cover
         packing_format = "auto_gptq"
     elif "awq" in quantization_config.quant_method:
         packing_format = "auto_awq"
     else:  # pragma: no cover
         packing_format = "auto_gptq"
-        logger.warning("Quantization backend must be specified. Set it to 'auto_gptq' by default.")
+        logger.warning("quantization backend must be specified. Set it to 'auto_gptq' by default.")
     if packing_format == "auto":
         packing_format = "auto_gptq"
