diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 82b005e84a8d3..5817fb4e123ee 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -7997,7 +7997,6 @@ def repack_mxfp4(self, new_name: str, blocks: Tensor, scales: Tensor):
     def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
         blocks0: Tensor = torch.zeros(1)
         blocks1: Tensor = torch.zeros(1)
-        found_mxfp4_tensors = False
         # we assume that tensors are loaded in the correct order
         for name, data_torch in self.get_tensors():
             if "mlp.experts.down_proj_blocks" in name:
@@ -8005,7 +8004,6 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
             elif "mlp.experts.down_proj_scales" in name:
                 new_name = self.map_tensor_name(name.replace("_scales", ".weight"))
                 self.repack_mxfp4(new_name, blocks0, data_torch)
-                found_mxfp4_tensors = True
             elif "mlp.experts.gate_up_proj_blocks" in name:
                 blocks0, blocks1 = data_torch[:, ::2, :, :], data_torch[:, 1::2, :, :]
             elif "mlp.experts.gate_up_proj_scales" in name:
@@ -8014,9 +8012,6 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
                 new_name_up = self.map_tensor_name(name.replace("gate_up_proj_scales", "up_proj.weight"))
                 self.repack_mxfp4(new_name_gate, blocks0, scales0)
                 self.repack_mxfp4(new_name_up, blocks1, scales1)
-                found_mxfp4_tensors = True
-        if not found_mxfp4_tensors:
-            raise ValueError("No MXFP4 tensors found in the model. Please make sure you are using MXFP4 model.")
         return []
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
@@ -8029,7 +8024,12 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         if "down_proj" in name:
             if name.endswith("_bias"):
                 name = name.replace("down_proj_bias", "down_proj.bias")
+            elif "_blocks" not in name and "_scales" not in name:
+                logger.warning(f"{name} is not in MXFP4, performance may be degraded")
+                name = name.replace("down_proj", "down_proj.weight")
+                data_torch = data_torch.transpose(-1, -2)
             else:
+                # otherwise, it should already be repacked to ggml MXFP4 format
                 return []
 
         # split the gate_up into gate and up
@@ -8042,7 +8042,18 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
                     (self.map_tensor_name(name_gate), gate_proj_bias),
                     (self.map_tensor_name(name_up), up_proj_bias)
                 ]
+            elif "_blocks" not in name and "_scales" not in name:
+                logger.warning(f"{name} is not in MXFP4, performance may be degraded")
+                name_up = name.replace("gate_up_proj", "up_proj.weight")
+                name_gate = name.replace("gate_up_proj", "gate_proj.weight")
+                data_torch = data_torch.transpose(-1, -2)
+                gate_proj_weight, up_proj_weight = data_torch[:, ::2, :], data_torch[:, 1::2, :]
+                return [
+                    (self.map_tensor_name(name_gate), gate_proj_weight),
+                    (self.map_tensor_name(name_up), up_proj_weight)
+                ]
             else:
+                # otherwise, it should already be repacked to ggml MXFP4 format
                 return []
 
         return [(self.map_tensor_name(name), data_torch)]
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 9c0ffde27a1e3..1d0361cc16659 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -999,7 +999,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
             new_size += llama_tensor_quantize_impl(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
 
             // TODO: temporary sanity check that the F16 -> MXFP4 is lossless
-#if 1
+#if 0
             if (new_type == GGML_TYPE_MXFP4) {
                 auto * x = f32_data_03;
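
For context, here is a minimal standalone sketch (not part of the converter) of what the new non-MXFP4 fallback branch does with a dense `gate_up_proj` expert tensor: swap the last two dimensions and then de-interleave even/odd rows into gate and up weights. The tensor names and shapes (`n_expert`, `n_embd`, `n_ff`) are illustrative assumptions, not values taken from the model.

```python
# Sketch only: mimics the fallback path for a non-MXFP4 (e.g. BF16) checkpoint,
# assuming gate/up are interleaved along the last dimension of gate_up_proj.
import torch

n_expert, n_embd, n_ff = 4, 8, 6  # hypothetical sizes

# assumed HF layout: [n_expert, n_embd, 2 * n_ff]
gate_up_proj = torch.randn(n_expert, n_embd, 2 * n_ff, dtype=torch.bfloat16)

# swap the last two dims so the interleaved gate/up axis becomes dim 1
data = gate_up_proj.transpose(-1, -2)

# de-interleave: even rows -> gate, odd rows -> up
gate_proj_weight, up_proj_weight = data[:, ::2, :], data[:, 1::2, :]

assert gate_proj_weight.shape == (n_expert, n_ff, n_embd)
assert up_proj_weight.shape == (n_expert, n_ff, n_embd)
```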