From 2f3134c29500ee4f783094c3256effabbea3e562 Mon Sep 17 00:00:00 2001 From: DrRyanHuang Date: Tue, 14 Oct 2025 10:47:31 +0800 Subject: [PATCH] adapt custom all reduce for sot --- .../graph_optimization/cudagraph_piecewise_backend.py | 5 +++-- fastdeploy/worker/gpu_model_runner.py | 1 + fastdeploy/worker/gpu_worker.py | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py b/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py index 6341d3d71d..863ab0a443 100644 --- a/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py +++ b/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py @@ -133,8 +133,9 @@ def run_static_model(self, entry: ConcreteSizeEntry, **kwargs): self.cuda_graph_manager.state = jit_utils.CUDAGraphState.CAPTURE self.cuda_graph_manager.batch_size = entry.real_shape entry.captured = True - with self.cuda_graph_manager.run_impl_guard(): - entry.runnable(**kwargs) + with capture_custom_allreduce(): + with self.cuda_graph_manager.run_impl_guard(): + entry.runnable(**kwargs) # Replay self.cuda_graph_manager.state = jit_utils.CUDAGraphState.REPLAY diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 8d1be796f4..c9bec78d9f 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -1555,6 +1555,7 @@ def _update_chunked_prefill(self, tasks): self.proposer.update_task_chunk_prefill(task) task.chunk_idx += 1 + @sot_warmup_guard(True) def capture_model(self) -> None: """ Trigger CUDA Graph capture for all shapes in cuda graph capture list diff --git a/fastdeploy/worker/gpu_worker.py b/fastdeploy/worker/gpu_worker.py index 601efd16b5..8c72cfc751 100644 --- a/fastdeploy/worker/gpu_worker.py +++ b/fastdeploy/worker/gpu_worker.py @@ -207,7 +207,7 @@ def graph_optimize_and_warm_up_model(self) -> None: """ Perform the warm-up and the graph optimization """ - if self.fd_config.graph_opt_config.graph_opt_level >= 1: + if self.fd_config.graph_opt_config.graph_opt_level >= 1 and not self.model_runner.use_cudagraph: self.model_runner.sot_warmup() # Trigger cuda graph capture self.model_runner.capture_model()