Commit ce56cc3
[CI]【Hackathon 10th Spring No.44】fix: robust deep_gemm stub for CUDA CI
_GpuOpsStub.__getattr__ now resolves registered sub-modules from sys.modules before returning None. Explicit dummy attributes on _deep_gemm_stub prevent AttributeError in fp8_utils import chain.
1 parent b1abb42 commit ce56cc3
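
To make the fix concrete, here is a minimal, self-contained sketch of the same stub pattern. The class body closely mirrors the diff below; the `fake_ops.gpu` package name and the closing asserts are illustrative only and are not part of the commit:

import sys
import types


class _GpuOpsStub(types.ModuleType):
    """Catch-all module: resolve registered sub-modules, else return None."""

    __path__ = []  # marks the stub as a package so `import X.Y.Z` can traverse it

    def __getattr__(self, name):
        # `from X import Y` falls back to getattr(X, "Y"), so look up the
        # fully qualified sub-module name in sys.modules before giving up.
        sub = sys.modules.get(f"{self.__name__}.{name}")
        if sub is not None:
            return sub
        return None


# Hypothetical package name used only for this sketch.
gpu_pkg = _GpuOpsStub("fake_ops.gpu")
sys.modules["fake_ops.gpu"] = gpu_pkg

deep_gemm = types.ModuleType("fake_ops.gpu.deep_gemm")
deep_gemm.m_grouped_fp8_gemm_nt_contiguous = None  # explicit dummy attribute, as in the commit
sys.modules["fake_ops.gpu.deep_gemm"] = deep_gemm

assert gpu_pkg.deep_gemm is deep_gemm   # resolved via sys.modules
assert gpu_pkg.unknown_kernel is None   # unknown attributes still fall back to None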

File tree

2 files changed: +46 -8 lines

  .gitignore
  tests/layers/test_fused_moe_deepgemm_backend.py


.gitignore

Lines changed: 2 additions & 0 deletions
@@ -181,3 +181,5 @@ custom_ops/gpu_ops/w4afp8_gemm/w4afp8_gemm_template.h
 
 custom_ops/gpu_ops/wfp8afp8_sparse_gemm/wfp8Afp8_sparse_gemm_*.cu
 custom_ops/gpu_ops/wfp8afp8_sparse_gemm/wfp8Afp8_sparse_gemm_template.h
+python_coverage_all.xml
+diff_coverage.json

tests/layers/test_fused_moe_deepgemm_backend.py

Lines changed: 44 additions & 8 deletions
@@ -27,19 +27,28 @@
 
 
 class _GpuOpsStub(types.ModuleType):
-    """Catchall module: any attribute access returns None."""
+    """Catchall module: returns registered sub-modules or None for unknown attrs."""
 
     __path__ = []  # marks as package so `import X.Y.Z` can traverse
 
     def __getattr__(self, name):
+        # Return registered sub-modules from sys.modules so `from X import Y` works
+        fqn = f"{self.__name__}.{name}"
+        sub = sys.modules.get(fqn)
+        if sub is not None:
+            return sub
         return None
 
 
 sys.modules["fastdeploy.model_executor.ops.gpu"] = _GpuOpsStub("fastdeploy.model_executor.ops.gpu")
 # fp8_utils.py:52 uses `import ...ops.gpu.deep_gemm as deep_gemm`
-sys.modules["fastdeploy.model_executor.ops.gpu.deep_gemm"] = types.ModuleType(
-    "fastdeploy.model_executor.ops.gpu.deep_gemm"
-)
+_deep_gemm_stub = types.ModuleType("fastdeploy.model_executor.ops.gpu.deep_gemm")
+# Provide dummy callables so `deep_gemm.m_grouped_*` attribute access succeeds
+_deep_gemm_stub.m_grouped_fp8_gemm_nt_contiguous = None
+_deep_gemm_stub.m_grouped_fp8_gemm_nt_masked = None
+_deep_gemm_stub.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous = None
+_deep_gemm_stub.m_grouped_gemm_fp8_fp8_bf16_nt_masked = None
+sys.modules["fastdeploy.model_executor.ops.gpu.deep_gemm"] = _deep_gemm_stub
 _gpu = sys.modules["fastdeploy.model_executor.ops.gpu"]
 
 _ep_mod = types.ModuleType("fastdeploy.model_executor.layers.moe.ep")
@@ -89,9 +98,9 @@ def __init__(self, experts=1, hidden=4, inter=2):
                 model="test",
                 moe_phase=SimpleNamespace(phase="prefill"),
             ),
-            scheduler_config=SimpleNamespace(splitwise_role="prefill"),
+            scheduler_config=SimpleNamespace(splitwise_role="prefill", max_num_batched_tokens=4),
             eplb_config=SimpleNamespace(redundant_experts_num=0),
-            parallel_config=SimpleNamespace(ep_group=None, use_internode_ll_two_stage=False),
+            parallel_config=SimpleNamespace(ep_group=None, use_internode_ll_two_stage=False, tensor_parallel_size=1),
             load_config=SimpleNamespace(load_strategy="meta", load_choices="default_v1"),
         )
         self.weight_key_map = {
@@ -303,10 +312,10 @@ def test_apply_ep_prefill(monkeypatch):
     H = layer.hidden_size
 
     class _PrefillRunner:
-        def __init__(self, n):
+        def __init__(self, n, num_worst_tokens=0):
            self._n = n
            self.ep_engine = SimpleNamespace(async_finish=True)
-            self.num_worst_tokens = 0
+            self.num_worst_tokens = num_worst_tokens
 
         def moe_select(self, _layer, gate_out):
             return paddle.zeros([gate_out.shape[0], 1], "int64"), paddle.ones([gate_out.shape[0], 1], "float32")
@@ -374,6 +383,33 @@ def combine(self, out, _handle, _weights, event):
     out_phi = m.apply_ep_prefill(layer, x, gate, topk_ids_hookfunc=lambda **_: None)
     assert out_phi.shape[-1] == H
 
+    # num_worst_tokens > 0 branch — covers L410-482 (masked gemm path)
+    monkeypatch.setattr(dgb.fastdeploy.envs, "FD_USE_PHI_FP8_QUANT", False)
+    monkeypatch.setattr(
+        dgb,
+        "call_prefill_permute_to_masked_gemm",
+        lambda x, scale, topk_ids, num_local_experts, max_token_num: (
+            x,
+            scale,
+            paddle.zeros([num_local_experts, max_token_num, 1], "int32"),
+            paddle.zeros([num_local_experts], "int32"),
+        ),
+    )
+    monkeypatch.setattr(dgb, "m_grouped_fp8_gemm_nt_masked", lambda *_a, **_kw: None)
+    monkeypatch.setattr(
+        _gpu,
+        "fused_mask_swiglu_fp8_quant",
+        lambda t, tn, bs, **kw: (paddle.zeros_like(t), paddle.zeros([1], "float32")),
+    )
+    monkeypatch.setattr(
+        dgb,
+        "call_depermute_prefill_combine",
+        lambda x, indice_map, topk_weights, num_worst_tokens: paddle.zeros([num_worst_tokens, x.shape[-1]], "float32"),
+    )
+    m.ep_prefill_runner = _PrefillRunner(n=2, num_worst_tokens=2)
+    out_worst = m.apply_ep_prefill(layer, x, gate, topk_ids_hookfunc=lambda **_: None)
+    assert out_worst.shape[-1] == H
+
 
 def test_apply_ep_decode(monkeypatch):
     """apply_ep_decode."""
