Skip to content

Commit 1cb8661

Browse files
author
cloudforge1
committed
add CPU-side unit tests for MiniCPM4 μP scaling, weight mapping, and registration
1 parent 59758a5 commit 1cb8661

File tree

1 file changed

+320
-0
lines changed

1 file changed

+320
-0
lines changed
Lines changed: 320 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,320 @@
1+
"""
2+
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
"""
16+
17+
import math
18+
19+
import numpy as np
20+
import paddle
21+
import pytest
22+
23+
# ── μP scaling math tests (pure computation, no FD imports needed) ──────────
24+
25+
26+
class TestMuPScaling:
27+
"""Test μP (Maximal Update Parametrization) scaling factors.
28+
29+
MiniCPM4 applies three scaling sites:
30+
1. Embedding: output *= scale_emb
31+
2. Residual: hidden_states *= scale_depth / sqrt(num_hidden_layers)
32+
3. LM head: hidden_states /= (hidden_size / dim_model_base)
33+
"""
34+
35+
# Reference config values from openbmb/MiniCPM4.1-8B
36+
SCALE_EMB = 12
37+
SCALE_DEPTH = 1.4
38+
NUM_HIDDEN_LAYERS = 32
39+
HIDDEN_SIZE = 4096
40+
DIM_MODEL_BASE = 256
41+
42+
def test_embedding_scaling(self):
43+
"""Embedding output scaled by scale_emb."""
44+
x = paddle.ones([2, 8, self.HIDDEN_SIZE], dtype="float32")
45+
scaled = x * self.SCALE_EMB
46+
np.testing.assert_allclose(
47+
scaled.numpy(),
48+
np.full([2, 8, self.HIDDEN_SIZE], 12.0, dtype="float32"),
49+
)
50+
51+
def test_residual_scaling_value(self):
52+
"""Residual scale = scale_depth / sqrt(num_hidden_layers)."""
53+
expected = self.SCALE_DEPTH / math.sqrt(self.NUM_HIDDEN_LAYERS)
54+
assert abs(expected - 0.24748737341529164) < 1e-10
55+
56+
def test_residual_scaling_applied(self):
57+
"""Hidden states scaled by residual_scale before residual add."""
58+
residual_scale = self.SCALE_DEPTH / math.sqrt(self.NUM_HIDDEN_LAYERS)
59+
x = paddle.full([4, self.HIDDEN_SIZE], 2.0, dtype="float32")
60+
scaled = x * residual_scale
61+
np.testing.assert_allclose(
62+
scaled.numpy(),
63+
np.full([4, self.HIDDEN_SIZE], 2.0 * residual_scale, dtype="float32"),
64+
rtol=1e-6,
65+
)
66+
67+
def test_lm_head_scaling(self):
68+
"""LM head input divided by hidden_size / dim_model_base."""
69+
lm_head_scale = self.HIDDEN_SIZE / self.DIM_MODEL_BASE
70+
assert lm_head_scale == 16.0
71+
72+
x = paddle.full([4, self.HIDDEN_SIZE], 32.0, dtype="float32")
73+
scaled = x / lm_head_scale
74+
np.testing.assert_allclose(
75+
scaled.numpy(),
76+
np.full([4, self.HIDDEN_SIZE], 2.0, dtype="float32"),
77+
)
78+
79+
def test_lm_head_scale_fallback(self):
80+
"""When dim_model_base is None or 0, lm_head_scale defaults to 1.0."""
81+
for dim_model_base in [None, 0]:
82+
if dim_model_base is not None and dim_model_base > 0:
83+
scale = self.HIDDEN_SIZE / dim_model_base
84+
else:
85+
scale = 1.0
86+
assert scale == 1.0
87+
88+
def test_residual_scale_depth_default(self):
89+
"""When scale_depth not in config, defaults to 1.0 → no scaling."""
90+
scale_depth = 1.0 # default
91+
residual_scale = scale_depth / math.sqrt(self.NUM_HIDDEN_LAYERS)
92+
x = paddle.full([4, self.HIDDEN_SIZE], 1.0, dtype="float32")
93+
scaled = x * residual_scale
94+
expected = 1.0 / math.sqrt(32)
95+
np.testing.assert_allclose(scaled.numpy().mean(), expected, rtol=1e-6)
96+
97+
98+
# ── Weight mapping tests ────────────────────────────────────────────────────
99+
100+
101+
class TestWeightMapping:
    """Check the HuggingFace → FastDeploy weight-name mapping rules."""

    # (fd_param_name, hf_weight_name, shard_id); shard_id None means plain rename.
    STACKED_PARAMS = [
        ("qkv_proj", "q_proj", "q"),
        ("qkv_proj", "k_proj", "k"),
        ("qkv_proj", "v_proj", "v"),
        ("up_gate_proj", "gate_proj", "gate"),
        ("up_gate_proj", "up_proj", "up"),
        ("embed_tokens.embeddings", "embed_tokens", None),
        ("lm_head.linear", "lm_head", None),
    ]

    def test_hf_prefix_rename(self):
        """The HF 'model.' prefix becomes the FD 'minicpm4.' prefix."""
        cases = [
            "model.layers.0.self_attn.q_proj.weight",
            "model.embed_tokens.weight",
            "model.norm.weight",
            "lm_head.weight",  # carries no model. prefix
        ]
        for hf_name in cases:
            mapped = hf_name.replace("model.", "minicpm4.")
            if hf_name.startswith("model."):
                assert mapped.startswith("minicpm4.")
            else:
                # Names without the prefix (e.g. lm_head) pass through untouched.
                assert mapped == hf_name

    def test_qkv_stacking(self):
        """q/k/v projections all stack into qkv_proj with matching shard ids."""
        mapping = {}
        for param, weight, shard in self.STACKED_PARAMS:
            if "proj" in weight and shard in ("q", "k", "v"):
                mapping[weight] = (param, shard)
        for weight, shard in (("q_proj", "q"), ("k_proj", "k"), ("v_proj", "v")):
            assert mapping[weight] == ("qkv_proj", shard)

    def test_gate_up_stacking(self):
        """gate_proj and up_proj stack into up_gate_proj."""
        mapping = {
            weight: (param, shard)
            for param, weight, shard in self.STACKED_PARAMS
            if shard in ("gate", "up")
        }
        assert mapping["gate_proj"] == ("up_gate_proj", "gate")
        assert mapping["up_proj"] == ("up_gate_proj", "up")

    def test_embed_and_lm_head_rename(self):
        """Shard-less entries are plain renames for embeddings and lm_head."""
        renames = {weight: param for param, weight, shard in self.STACKED_PARAMS if shard is None}
        assert renames["embed_tokens"] == "embed_tokens.embeddings"
        assert renames["lm_head"] == "lm_head.linear"

    def test_weight_name_replacement(self):
        """End-to-end: HF name → prefix rename → stacked-param rename."""
        fd_name = "model.layers.5.self_attn.q_proj.weight".replace("model.", "minicpm4.")
        assert fd_name == "minicpm4.layers.5.self_attn.q_proj.weight"

        # First matching stacked-param entry wins, exactly like the loader loop.
        for param_name, weight_name, shard_id in self.STACKED_PARAMS:
            if weight_name not in fd_name:
                continue
            assert fd_name.replace(weight_name, param_name) == (
                "minicpm4.layers.5.self_attn.qkv_proj.weight"
            )
            assert shard_id == "q"
            break
162+
163+
164+
# ── Registration & config tests ─────────────────────────────────────────────
165+
166+
167+
class TestRegistration:
    """Static (AST-based) checks on the MiniCPM4 model source file.

    These tests parse fastdeploy/model_executor/models/minicpm4.py directly
    rather than importing it, so they stay CPU-only and import-side-effect free.
    """

    @staticmethod
    def _model_file():
        """Path to the minicpm4.py model source, relative to this test file."""
        import os

        return os.path.join(
            os.path.dirname(__file__),
            "..",
            "..",
            "fastdeploy",
            "model_executor",
            "models",
            "minicpm4.py",
        )

    @classmethod
    def _model_tree(cls):
        """Parse the model source into an AST (reparsed per call; files are small)."""
        import ast

        with open(cls._model_file()) as f:
            return ast.parse(f.read())

    @classmethod
    def _find_call_keyword(cls, kw_name):
        """Return the first constant value bound to keyword ``kw_name`` in any call.

        Fixes the original scan, whose ``break`` only exited the inner keywords
        loop: the outer ``ast.walk`` kept going, so a later call's keyword could
        silently overwrite the match. Returns None when no such keyword exists.
        """
        import ast

        for node in ast.walk(cls._model_tree()):
            if not isinstance(node, ast.Call):
                continue
            for kw in node.keywords:
                if kw.arg == kw_name and isinstance(kw.value, ast.Constant):
                    return kw.value.value
        return None

    def test_architecture_string(self):
        """MiniCPM4 registers as 'MiniCPMForCausalLM' (matching the HF config)."""
        # The decorator uses architecture="MiniCPMForCausalLM".
        assert self._find_call_keyword("architecture") == "MiniCPMForCausalLM"

    def test_module_name_is_minicpm4(self):
        """The module_name in registration is 'minicpm4'."""
        assert self._find_call_keyword("module_name") == "minicpm4"

    def test_model_classes_exist(self):
        """Source file defines all 6 expected classes."""
        import ast

        class_names = {
            node.name for node in ast.walk(self._model_tree()) if isinstance(node, ast.ClassDef)
        }
        expected = [
            "MiniCPM4MLP",
            "MiniCPM4Attention",
            "MiniCPM4DecoderLayer",
            "MiniCPM4Model",
            "MiniCPM4ForCausalLM",
            "MiniCPM4PretrainedModel",
        ]
        for name in expected:
            assert name in class_names, f"Missing class: {name}"

    def test_no_qkv_bias(self):
        """MiniCPM4Attention uses with_bias=False (unlike Qwen2)."""
        import ast

        # Find the first with_bias keyword inside the MiniCPM4Attention class
        # (expected on its QKVParallelLinear construction).
        for node in ast.walk(self._model_tree()):
            if not (isinstance(node, ast.ClassDef) and node.name == "MiniCPM4Attention"):
                continue
            for child in ast.walk(node):
                if not isinstance(child, ast.Call):
                    continue
                for kw in child.keywords:
                    if kw.arg == "with_bias" and isinstance(kw.value, ast.Constant):
                        assert kw.value.value is False, "QKV should have with_bias=False"
                        return
        pytest.fail("with_bias keyword not found in MiniCPM4Attention.QKVParallelLinear")
282+
283+
284+
# ── compute_logits logic test ───────────────────────────────────────────────
285+
286+
287+
class TestComputeLogits:
    """Exercise the compute_logits μP scaling and extended-vocab masking steps."""

    def test_lm_head_scaling_and_vocab_mask(self):
        """Hidden states are divided by lm_head_scale; extended vocab is masked to -inf."""
        hidden_size, ori_vocab_size = 128, 100
        vocab_size = 128  # padded/extended vocab
        lm_head_scale = 16.0

        hidden = paddle.full([4, hidden_size], 32.0, dtype="float32")

        # Step 1: μP scaling of the LM-head input.
        scaled = hidden / lm_head_scale
        np.testing.assert_allclose(scaled.numpy().mean(), 2.0, rtol=1e-6)

        # Step 2: dummy lm_head projection (all-ones linear, hidden → vocab).
        proj = paddle.ones([vocab_size, hidden_size], dtype="float32")
        logits = paddle.matmul(scaled, proj.T).astype(paddle.float32)

        # Step 3: positions beyond the original vocab are masked out.
        logits[:, ori_vocab_size:] = -float("inf")

        assert logits.shape == [4, vocab_size]
        # Real vocab slots stay finite; padded slots are exactly -inf.
        assert paddle.isfinite(logits[:, :ori_vocab_size]).all()
        assert (logits[:, ori_vocab_size:] == -float("inf")).all()
317+
318+
319+
# Allow running this test module directly (python <file>) instead of via the
# pytest CLI; delegates to pytest with verbose output for this file only.
if __name__ == "__main__":
    pytest.main([__file__, "-v"])

0 commit comments

Comments
 (0)