diff --git a/QEfficient/base/modeling_qeff.py b/QEfficient/base/modeling_qeff.py index d9d6823ae..193b78b69 100644 --- a/QEfficient/base/modeling_qeff.py +++ b/QEfficient/base/modeling_qeff.py @@ -224,6 +224,7 @@ def _compile( custom_io: Optional[Dict[str, str]] = None, mdp_ts_num_devices: int = 1, num_speculative_tokens: Optional[int] = None, + mxfp6_matmul: bool = constants.DEFAULT_AIC_MXPF6_MATMUL, enable_qnn: Optional[bool] = False, qnn_config: Optional[str] = None, **compiler_options, @@ -239,6 +240,7 @@ def _compile( :custom_io (dict): Custom IO to specify the input and outputs in different formats than default :mdp_ts_num_devices (int): Number of devices to partition to use Multi-Device Partitioning with tensor-slicing. :num_speculative_tokens (int, optional): Number of speculative tokens to take as input for Speculative Decoding Target Language Model. + :mxfp6_matmul (bool): Use MXFP6 to compress weights for MatMul nodes to run faster on device. ``Defaults to False``. :enable_qnn (bool): Enables QNN Compilation. ``Defaults to False.`` :qnn_config (str): Path of QNN Config parameters file. Any extra parameters for QNN compilation can be passed via this file. ``Defaults to None.`` :compiler_options: Pass any compiler option as input. @@ -269,7 +271,7 @@ def _compile( custom_io=custom_io, device_group=list(range(mdp_ts_num_devices)), num_cores=compiler_options.get("aic_num_cores", constants.DEFAULT_AIC_NUM_CORES), - mxfp6=compiler_options.get("mxfp6_matmul", constants.DEFAULT_AIC_MXPF6_MATMUL), + mxfp6=mxfp6_matmul, mxint8=mxint8_kv_cache, qnn_config=qnn_config, ) @@ -281,6 +283,9 @@ def _compile( if mdp_ts_json_path := compiler_options.pop("mdp_load_partition_config", None): command.append(f"-mdp-load-partition-config={mdp_ts_json_path}") + if mxfp6_matmul: + command.append("-mxfp6-matmul") + for key, value in compiler_options.items(): option = "-" + key.replace("_", "-") if isinstance(value, bool): diff --git a/QEfficient/compile/qnn_compiler.py b/QEfficient/compile/qnn_compiler.py index 0f862b972..6b0eeb8c1 100644 --- a/QEfficient/compile/qnn_compiler.py +++ b/QEfficient/compile/qnn_compiler.py @@ -106,8 +106,17 @@ def parse_qnn_config(self): for key, value in config_data.items(): if key == QnnConstants.CONVERTER_ARGS_EXTENSION_STR: self.check_extension_arg(key, value, QnnConstants.IMMUTABLE_CONVERTER_ARGS) - if key == QnnConstants.CONTEXT_BIN_ARGS_EXTENSION_STR: + elif key == QnnConstants.CONTEXT_BIN_ARGS_EXTENSION_STR: self.check_extension_arg(key, value, QnnConstants.IMMUTABLE_CONTEXT_BIN_GEN_ARGS) + elif key == QnnConstants.QNN_COMPILATION_BACKEND_STR: + immutable_param = [ + sub_key for sub_key in value.keys() if sub_key in QnnConstants.IMMUTABLE_COMPILATION_BACKEND_ARGS + ] + if immutable_param: + raise AttributeError( + f"Immutable Parameters {immutable_param} found in {QnnConstants.QNN_COMPILATION_BACKEND_STR}. Please remove them from QNN Configuration file." + ) + self.qnn_config[key] = value def create_qnn_tensor_slicing_json(self) -> str: diff --git a/QEfficient/utils/constants.py b/QEfficient/utils/constants.py index 5e855094c..8e24c83ae 100644 --- a/QEfficient/utils/constants.py +++ b/QEfficient/utils/constants.py @@ -190,6 +190,10 @@ class QnnConstants: "--config_file ", ] + IMMUTABLE_COMPILATION_BACKEND_ARGS = [ + "compiler_mxfp6_matmul_weights", + ] + QNN_SAMPLE_CONFIG = { "converter_args_extension": "--onnx_defer_loading", "context_binary_generator_args_extension": "--log_level debug",