```diff
@@ -133,8 +133,9 @@ def run_static_model(self, entry: ConcreteSizeEntry, **kwargs):
             self.cuda_graph_manager.state = jit_utils.CUDAGraphState.CAPTURE
             self.cuda_graph_manager.batch_size = entry.real_shape
             entry.captured = True
-            with self.cuda_graph_manager.run_impl_guard():
-                entry.runnable(**kwargs)
+            with capture_custom_allreduce():
```
**Collaborator:**

The static graph also uses custom all reduce, right?

**@DrRyanHuang (Contributor, Author), Oct 16, 2025:**

Yes, the static graph path currently supports both Custom AllReduce and Paddle AllReduce.

However, using Custom AllReduce currently requires passing `--max-num-batched-tokens 500` (any value below 500 works as well); the exact root cause is still being investigated.
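
For illustration, a launch command might look like the following; only the `--max-num-batched-tokens` flag comes from this thread, while the entry point, model path, and any other flags are assumptions:

```bash
# Hypothetical launch command: only --max-num-batched-tokens is taken from the
# thread above; the entry point and model path are illustrative assumptions.
python -m fastdeploy.entrypoints.openai.api_server \
    --model /path/to/model \
    --max-num-batched-tokens 500
```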

```diff
+                with self.cuda_graph_manager.run_impl_guard():
+                    entry.runnable(**kwargs)

             # Replay
             self.cuda_graph_manager.state = jit_utils.CUDAGraphState.REPLAY
```
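
For context, `capture_custom_allreduce()` is presumably a context manager that flags the custom all-reduce communicators as capturing, so the all-reduce calls recorded into the graph take the capture branch of `custom_all_reduce` (quoted in the thread below). A minimal sketch of that idea, assuming a hypothetical `_ACTIVE_GROUPS` registry rather than FastDeploy's actual bookkeeping:

```python
from contextlib import contextmanager

# Hypothetical registry of live custom all-reduce communicators; the real
# implementation keeps this state elsewhere.
_ACTIVE_GROUPS: list = []

@contextmanager
def capture_custom_allreduce():
    """Sketch: mark every communicator as capturing for the duration of the
    block, so custom_all_reduce takes its graph-capture branch, then restore."""
    for group in _ACTIVE_GROUPS:
        group.capturing = True
    try:
        yield
    finally:
        for group in _ACTIVE_GROUPS:
            group.capturing = False
```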
**fastdeploy/worker/gpu_model_runner.py** (1 addition, 0 deletions)

```diff
@@ -1575,6 +1575,7 @@ def _update_chunked_prefill(self, tasks):
                 self.proposer.update_task_chunk_prefill(task)
                 task.chunk_idx += 1

+    @sot_warmup_guard(True)
```
**Collaborator:**

What custom all reduce problem does deferring the SOT warm-up avoid?

**@DrRyanHuang (Contributor, Author):**

In `custom_all_reduce`, the first branch (`if self.capturing:`) is only taken during capture. Previously, the SOT warmup ran before `capture_model`, which meant the IR graph was finalized before CUDA Graph capture and took the `else` branch. As a result, replay also went through the `else` branch, which is incorrect.

```python
def custom_all_reduce(self, input: paddle.Tensor) -> Optional[paddle.Tensor]:
    """The main allreduce API that provides support for cuda graph."""
    if self.capturing:
        lib = cuda_wrapper.CudaRTLibrary()
        stream = paddle.device.current_stream()
        stream_capturing = lib.cudaStreamIsCapturing(stream)
        if stream_capturing.value == 1:
            # 1 is cudaStreamCaptureStatusActive: The stream is capturing.
            return self.all_reduce(input, input, registered=True)
        else:
            # If warm up, mimic the allocation pattern since custom
            # allreduce is out-of-place.
            return paddle.empty_like(input)
    else:
        return self.all_reduce(input, input, registered=False)
```
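
By the same logic, `@sot_warmup_guard(True)` on `capture_model` (see the diff above) presumably toggles the runner's warmup state around the call so that the SOT trace produced during capture is consistent with what replay will execute. A rough sketch of such a state-toggling decorator, with the flag name `in_sot_warmup` being an assumption:

```python
import functools

def sot_warmup_guard(enabled: bool):
    """Sketch: toggle a warmup flag on the runner around the decorated call,
    restoring the previous value afterwards. The flag name is an assumption."""
    def decorator(func):
        @functools.wraps(func)
        def wrapper(self, *args, **kwargs):
            if not enabled:
                return func(self, *args, **kwargs)
            previous = getattr(self, "in_sot_warmup", False)
            self.in_sot_warmup = True
            try:
                return func(self, *args, **kwargs)
            finally:
                self.in_sot_warmup = previous
        return wrapper
    return decorator
```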

```diff
     def capture_model(self) -> None:
         """
         Trigger CUDA Graph capture for all shapes in cuda graph capture list
```
**fastdeploy/worker/gpu_worker.py** (1 addition, 1 deletion)

```diff
@@ -207,7 +207,7 @@ def graph_optimize_and_warm_up_model(self) -> None:
         """
         Perform the warm-up and the graph optimization
         """
-        if self.fd_config.graph_opt_config.graph_opt_level >= 1:
+        if self.fd_config.graph_opt_config.graph_opt_level >= 1 and not self.model_runner.use_cudagraph:
             self.model_runner.sot_warmup()
         # Trigger cuda graph capture
         self.model_runner.capture_model()
```
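
Taken together with the other two hunks: when CUDA graphs are enabled, the standalone `sot_warmup()` call is now skipped here, and since `capture_model()` is decorated with `@sot_warmup_guard(True)`, the warmup presumably happens inside the capture path instead. That way the SOT-traced IR goes through the same `custom_all_reduce` branch during capture and replay, avoiding the wrong-branch issue described in the thread above.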