|
27 | 27 |
|
28 | 28 |
|
29 | 29 | class _GpuOpsStub(types.ModuleType): |
30 | | - """Catchall module: any attribute access returns None.""" |
| 30 | + """Catchall module: returns registered sub-modules or None for unknown attrs.""" |
31 | 31 |
|
32 | 32 | __path__ = [] # marks as package so `import X.Y.Z` can traverse |
33 | 33 |
|
34 | 34 | def __getattr__(self, name): |
| 35 | + # Return registered sub-modules from sys.modules so `from X import Y` works |
| 36 | + fqn = f"{self.__name__}.{name}" |
| 37 | + sub = sys.modules.get(fqn) |
| 38 | + if sub is not None: |
| 39 | + return sub |
35 | 40 | return None |
36 | 41 |
|
37 | 42 |
|
38 | 43 | sys.modules["fastdeploy.model_executor.ops.gpu"] = _GpuOpsStub("fastdeploy.model_executor.ops.gpu") |
39 | 44 | # fp8_utils.py:52 uses `import ...ops.gpu.deep_gemm as deep_gemm` |
40 | | -sys.modules["fastdeploy.model_executor.ops.gpu.deep_gemm"] = types.ModuleType( |
41 | | - "fastdeploy.model_executor.ops.gpu.deep_gemm" |
42 | | -) |
| 45 | +_deep_gemm_stub = types.ModuleType("fastdeploy.model_executor.ops.gpu.deep_gemm") |
| 46 | +# Provide placeholder attributes (None, not real callables) so `deep_gemm.m_grouped_*` attribute access succeeds |
| 47 | +_deep_gemm_stub.m_grouped_fp8_gemm_nt_contiguous = None |
| 48 | +_deep_gemm_stub.m_grouped_fp8_gemm_nt_masked = None |
| 49 | +_deep_gemm_stub.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous = None |
| 50 | +_deep_gemm_stub.m_grouped_gemm_fp8_fp8_bf16_nt_masked = None |
| 51 | +sys.modules["fastdeploy.model_executor.ops.gpu.deep_gemm"] = _deep_gemm_stub |
43 | 52 | _gpu = sys.modules["fastdeploy.model_executor.ops.gpu"] |
44 | 53 |
|
45 | 54 | _ep_mod = types.ModuleType("fastdeploy.model_executor.layers.moe.ep") |
@@ -89,9 +98,9 @@ def __init__(self, experts=1, hidden=4, inter=2): |
89 | 98 | model="test", |
90 | 99 | moe_phase=SimpleNamespace(phase="prefill"), |
91 | 100 | ), |
92 | | - scheduler_config=SimpleNamespace(splitwise_role="prefill"), |
| 101 | + scheduler_config=SimpleNamespace(splitwise_role="prefill", max_num_batched_tokens=4), |
93 | 102 | eplb_config=SimpleNamespace(redundant_experts_num=0), |
94 | | - parallel_config=SimpleNamespace(ep_group=None, use_internode_ll_two_stage=False), |
| 103 | + parallel_config=SimpleNamespace(ep_group=None, use_internode_ll_two_stage=False, tensor_parallel_size=1), |
95 | 104 | load_config=SimpleNamespace(load_strategy="meta", load_choices="default_v1"), |
96 | 105 | ) |
97 | 106 | self.weight_key_map = { |
@@ -303,10 +312,10 @@ def test_apply_ep_prefill(monkeypatch): |
303 | 312 | H = layer.hidden_size |
304 | 313 |
|
305 | 314 | class _PrefillRunner: |
306 | | - def __init__(self, n): |
| 315 | + def __init__(self, n, num_worst_tokens=0): |
307 | 316 | self._n = n |
308 | 317 | self.ep_engine = SimpleNamespace(async_finish=True) |
309 | | - self.num_worst_tokens = 0 |
| 318 | + self.num_worst_tokens = num_worst_tokens |
310 | 319 |
|
311 | 320 | def moe_select(self, _layer, gate_out): |
312 | 321 | return paddle.zeros([gate_out.shape[0], 1], "int64"), paddle.ones([gate_out.shape[0], 1], "float32") |
@@ -374,6 +383,33 @@ def combine(self, out, _handle, _weights, event): |
374 | 383 | out_phi = m.apply_ep_prefill(layer, x, gate, topk_ids_hookfunc=lambda **_: None) |
375 | 384 | assert out_phi.shape[-1] == H |
376 | 385 |
|
| 386 | + # num_worst_tokens > 0 branch — covers L410-482 (masked gemm path) |
| 387 | + monkeypatch.setattr(dgb.fastdeploy.envs, "FD_USE_PHI_FP8_QUANT", False) |
| 388 | + monkeypatch.setattr( |
| 389 | + dgb, |
| 390 | + "call_prefill_permute_to_masked_gemm", |
| 391 | + lambda x, scale, topk_ids, num_local_experts, max_token_num: ( |
| 392 | + x, |
| 393 | + scale, |
| 394 | + paddle.zeros([num_local_experts, max_token_num, 1], "int32"), |
| 395 | + paddle.zeros([num_local_experts], "int32"), |
| 396 | + ), |
| 397 | + ) |
| 398 | + monkeypatch.setattr(dgb, "m_grouped_fp8_gemm_nt_masked", lambda *_a, **_kw: None) |
| 399 | + monkeypatch.setattr( |
| 400 | + _gpu, |
| 401 | + "fused_mask_swiglu_fp8_quant", |
| 402 | + lambda t, tn, bs, **kw: (paddle.zeros_like(t), paddle.zeros([1], "float32")), |
| 403 | + ) |
| 404 | + monkeypatch.setattr( |
| 405 | + dgb, |
| 406 | + "call_depermute_prefill_combine", |
| 407 | + lambda x, indice_map, topk_weights, num_worst_tokens: paddle.zeros([num_worst_tokens, x.shape[-1]], "float32"), |
| 408 | + ) |
| 409 | + m.ep_prefill_runner = _PrefillRunner(n=2, num_worst_tokens=2) |
| 410 | + out_worst = m.apply_ep_prefill(layer, x, gate, topk_ids_hookfunc=lambda **_: None) |
| 411 | + assert out_worst.shape[-1] == H |
| 412 | + |
377 | 413 |
|
378 | 414 | def test_apply_ep_decode(monkeypatch): |
379 | 415 | """apply_ep_decode.""" |
|
0 commit comments