
convert : support non-mxfp4 HF model #15153

Merged 3 commits on Aug 7, 2025
21 changes: 16 additions & 5 deletions convert_hf_to_gguf.py
@@ -7997,15 +7997,13 @@ def repack_mxfp4(self, new_name: str, blocks: Tensor, scales: Tensor):
     def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
         blocks0: Tensor = torch.zeros(1)
         blocks1: Tensor = torch.zeros(1)
-        found_mxfp4_tensors = False
         # we assume that tensors are loaded in the correct order
         for name, data_torch in self.get_tensors():
             if "mlp.experts.down_proj_blocks" in name:
                 blocks0 = data_torch
             elif "mlp.experts.down_proj_scales" in name:
                 new_name = self.map_tensor_name(name.replace("_scales", ".weight"))
                 self.repack_mxfp4(new_name, blocks0, data_torch)
-                found_mxfp4_tensors = True
             elif "mlp.experts.gate_up_proj_blocks" in name:
                 blocks0, blocks1 = data_torch[:, ::2, :, :], data_torch[:, 1::2, :, :]
             elif "mlp.experts.gate_up_proj_scales" in name:
@@ -8014,9 +8012,6 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
                 new_name_up = self.map_tensor_name(name.replace("gate_up_proj_scales", "up_proj.weight"))
                 self.repack_mxfp4(new_name_gate, blocks0, scales0)
                 self.repack_mxfp4(new_name_up, blocks1, scales1)
-                found_mxfp4_tensors = True
-        if not found_mxfp4_tensors:
-            raise ValueError("No MXFP4 tensors found in the model. Please make sure you are using MXFP4 model.")
         return []

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
@@ -8029,7 +8024,12 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         if "down_proj" in name:
             if name.endswith("_bias"):
                 name = name.replace("down_proj_bias", "down_proj.bias")
+            elif "_blocks" not in name and "_scales" not in name:
+                logger.warning(f"{name} is not in MXFP4, performance may be degraded")
+                name = name.replace("down_proj", "down_proj.weight")
+                data_torch = data_torch.transpose(-1, -2)
             else:
+                # otherwise, it should already be repacked to ggml MXFP4 format
                 return []

         # split the gate_up into gate and up
@@ -8042,7 +8042,18 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
                     (self.map_tensor_name(name_gate), gate_proj_bias),
                     (self.map_tensor_name(name_up), up_proj_bias)
                 ]
+            elif "_blocks" not in name and "_scales" not in name:
+                logger.warning(f"{name} is not in MXFP4, performance may be degraded")
+                name_up = name.replace("gate_up_proj", "up_proj.weight")
+                name_gate = name.replace("gate_up_proj", "gate_proj.weight")
+                data_torch = data_torch.transpose(-1, -2)
+                gate_proj_weight, up_proj_weight = data_torch[:, ::2, :], data_torch[:, 1::2, :]
+                return [
+                    (self.map_tensor_name(name_gate), gate_proj_weight),
+                    (self.map_tensor_name(name_up), up_proj_weight)
+                ]
             else:
+                # otherwise, it should already be repacked to ggml MXFP4 format
                 return []

         return [(self.map_tensor_name(name), data_torch)]
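The fallback path added above handles HF checkpoints whose expert weights are plain tensors (e.g. BF16 fine-tunes) instead of MXFP4 blocks/scales: it transposes each expert matrix into the usual (rows = output, cols = input) orientation of a torch Linear weight and de-interleaves the fused gate_up projection into separate gate and up weights. Below is a minimal sketch of that reshaping; the sizes are toy values, and the (n_expert, n_embd, 2 * n_ff) layout with gate/up interleaved along the last dimension is the assumption the code above relies on.

import torch

# Toy sizes for illustration only, not the real model's dimensions.
n_expert, n_embd, n_ff = 4, 16, 8
gate_up = torch.randn(n_expert, n_embd, 2 * n_ff)   # assumed HF layout of gate_up_proj

x = gate_up.transpose(-1, -2)                       # -> (n_expert, 2 * n_ff, n_embd)
gate = x[:, ::2, :]                                 # even rows -> gate_proj.weight
up   = x[:, 1::2, :]                                # odd rows  -> up_proj.weight

assert gate.shape == up.shape == (n_expert, n_ff, n_embd)
# down_proj on the same fallback path only needs the transpose, no de-interleaving.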
2 changes: 1 addition & 1 deletion src/llama-quant.cpp
@@ -999,7 +999,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
                 new_size += llama_tensor_quantize_impl(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);

                 // TODO: temporary sanity check that the F16 -> MXFP4 is lossless
-#if 1
+#if 0
                 if (new_type == GGML_TYPE_MXFP4) {
                     auto * x = f32_data_03;

Comment on lines 1001 to 1002 (Collaborator, PR author):

For vis @ggerganov, I disabled this check because most users will now be using this code path to convert fine-tuned models to MXFP4, which will no longer be lossless.

Although, I'm a bit doubtful whether fine-tuned models like the abliterated version should be quantized to something other than MXFP4.

@gabriellarson Could you also try converting it to Q4_K_M to see if it impacts the quality?

Collaborator (PR author):

Ah, never mind: it's not possible to quantize to Q4_K since the tensor shape is not divisible by 256.
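To make the disabled sanity check concrete: MXFP4 quantizes 32-value blocks with one shared power-of-two scale and 4-bit e2m1 magnitudes, so weights that are already MXFP4 survive a round trip exactly, while arbitrary fine-tuned BF16 weights do not. The sketch below is a rough illustration under those assumptions, not ggml's actual quantizer.

import torch

E2M1 = torch.tensor([0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0])  # representable |values|

def mxfp4_roundtrip(x: torch.Tensor) -> torch.Tensor:
    # Illustrative only: quantize/dequantize one 32-value block.
    assert x.numel() == 32, "MXFP4 operates on blocks of 32 values"
    amax = x.abs().max()
    # shared power-of-two scale chosen so the block fits the e2m1 range [-6, 6]
    e = torch.ceil(torch.log2(amax / 6.0)) if amax > 0 else torch.zeros(())
    d = 2.0 ** e
    # snap each magnitude to the nearest representable level, keep the sign
    idx = ((x / d).abs().unsqueeze(-1) - E2M1).abs().argmin(dim=-1)
    return torch.sign(x) * E2M1[idx] * d

block = torch.randn(32)                          # stand-in for fine-tuned BF16 weights
once = mxfp4_roundtrip(block)
print(torch.equal(block, once))                  # almost certainly False: lossy
print(torch.equal(mxfp4_roundtrip(once), once))  # True: already-MXFP4 data round-trips

On the Q4_K point in the final comment: Q4_K packs each row into 256-value super-blocks, so a row length that is not a multiple of 256 cannot be quantized to it, while MXFP4's 32-value blocks still fit.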