```diff
@@ -133,8 +133,9 @@ def run_static_model(self, entry: ConcreteSizeEntry, **kwargs):
             self.cuda_graph_manager.state = jit_utils.CUDAGraphState.CAPTURE
             self.cuda_graph_manager.batch_size = entry.real_shape
             entry.captured = True
-            with self.cuda_graph_manager.run_impl_guard():
-                entry.runnable(**kwargs)
+            with capture_custom_allreduce():
```
**Collaborator:**

The static graph also uses custom all reduce, right?

**@DrRyanHuang (Contributor, Author), Oct 16, 2025:**

Yes, the static graph path currently supports both Custom AllReduce and Paddle AllReduce.

However, using Custom AllReduce currently requires passing `--max-num-batched-tokens 500` (any value below 500 works as well); the exact root cause is still being investigated.
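
For illustration, a launch command might look like the following; only the `--max-num-batched-tokens` flag comes from this thread, while the entry point, model path, and any other flags are assumptions:

```bash
# Hypothetical launch command: only --max-num-batched-tokens is taken from the
# thread above; the entry point and model path are illustrative assumptions.
python -m fastdeploy.entrypoints.openai.api_server \
    --model /path/to/model \
    --max-num-batched-tokens 500
```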

```diff
+                with self.cuda_graph_manager.run_impl_guard():
+                    entry.runnable(**kwargs)

             # Replay
             self.cuda_graph_manager.state = jit_utils.CUDAGraphState.REPLAY
```
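
For context, `capture_custom_allreduce()` is presumably a context manager that flags the custom all-reduce communicators as capturing, so the all-reduce calls recorded into the graph take the capture branch of `custom_all_reduce` (quoted in the thread below). A minimal sketch of that idea, assuming a hypothetical `_ACTIVE_GROUPS` registry rather than FastDeploy's actual bookkeeping:

```python
from contextlib import contextmanager

# Hypothetical registry of live custom all-reduce communicators; the real
# implementation keeps this state elsewhere.
_ACTIVE_GROUPS: list = []

@contextmanager
def capture_custom_allreduce():
    """Sketch: mark every communicator as capturing for the duration of the
    block, so custom_all_reduce takes its graph-capture branch, then restore."""
    for group in _ACTIVE_GROUPS:
        group.capturing = True
    try:
        yield
    finally:
        for group in _ACTIVE_GROUPS:
            group.capturing = False
```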
**fastdeploy/worker/gpu_model_runner.py** (1 addition, 0 deletions)

```diff
@@ -1575,6 +1575,7 @@ def _update_chunked_prefill(self, tasks):
                 self.proposer.update_task_chunk_prefill(task)
                 task.chunk_idx += 1

+    @sot_warmup_guard(True)
```
**Collaborator:**

What custom all reduce problem does deferring the SOT warm-up avoid?

**@DrRyanHuang (Contributor, Author):**

In `custom_all_reduce`, the first branch (`if self.capturing:`) is only taken during capture. Previously, the SOT warmup ran before `capture_model`, which meant the IR graph was finalized before CUDA Graph capture and took the `else` branch. As a result, replay also went through the `else` branch, which is incorrect.

```python
def custom_all_reduce(self, input: paddle.Tensor) -> Optional[paddle.Tensor]:
    """The main allreduce API that provides support for cuda graph."""
    if self.capturing:
        lib = cuda_wrapper.CudaRTLibrary()
        stream = paddle.device.current_stream()
        stream_capturing = lib.cudaStreamIsCapturing(stream)
        if stream_capturing.value == 1:
            # 1 is cudaStreamCaptureStatusActive: The stream is capturing.
            return self.all_reduce(input, input, registered=True)
        else:
            # If warm up, mimic the allocation pattern since custom
            # allreduce is out-of-place.
            return paddle.empty_like(input)
    else:
        return self.all_reduce(input, input, registered=False)
```
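
By the same logic, `@sot_warmup_guard(True)` on `capture_model` (see the diff above) presumably toggles the runner's warmup state around the call so that the SOT trace produced during capture is consistent with what replay will execute. A rough sketch of such a state-toggling decorator, with the flag name `in_sot_warmup` being an assumption:

```python
import functools

def sot_warmup_guard(enabled: bool):
    """Sketch: toggle a warmup flag on the runner around the decorated call,
    restoring the previous value afterwards. The flag name is an assumption."""
    def decorator(func):
        @functools.wraps(func)
        def wrapper(self, *args, **kwargs):
            if not enabled:
                return func(self, *args, **kwargs)
            previous = getattr(self, "in_sot_warmup", False)
            self.in_sot_warmup = True
            try:
                return func(self, *args, **kwargs)
            finally:
                self.in_sot_warmup = previous
        return wrapper
    return decorator
```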

```diff
     def capture_model(self) -> None:
         """
         Trigger CUDA Graph capture for all shapes in cuda graph capture list
```
**fastdeploy/worker/gpu_worker.py** (1 addition, 1 deletion)

```diff
@@ -207,7 +207,7 @@ def graph_optimize_and_warm_up_model(self) -> None:
         """
         Perform the warm-up and the graph optimization
         """
-        if self.fd_config.graph_opt_config.graph_opt_level >= 1:
+        if self.fd_config.graph_opt_config.graph_opt_level >= 1 and not self.model_runner.use_cudagraph:
             self.model_runner.sot_warmup()
         # Trigger cuda graph capture
         self.model_runner.capture_model()
```
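
Taken together with the other two hunks: when CUDA graphs are enabled, the standalone `sot_warmup()` call is now skipped here, and since `capture_model()` is decorated with `@sot_warmup_guard(True)`, the warmup presumably happens inside the capture path instead. That way the SOT-traced IR goes through the same `custom_all_reduce` branch during capture and replay, avoiding the wrong-branch issue described in the thread above.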