From 2f3134c29500ee4f783094c3256effabbea3e562 Mon Sep 17 00:00:00 2001
From: DrRyanHuang <zihaohuang@aliyun.com>
Date: Tue, 14 Oct 2025 10:47:31 +0800
Subject: [PATCH] adapt custom all reduce for sot

---
 .../graph_optimization/cudagraph_piecewise_backend.py        | 5 +++--
 fastdeploy/worker/gpu_model_runner.py                        | 1 +
 fastdeploy/worker/gpu_worker.py                              | 2 +-
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py b/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py
index 6341d3d71d..863ab0a443 100644
--- a/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py
+++ b/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py
@@ -133,8 +133,9 @@ def run_static_model(self, entry: ConcreteSizeEntry, **kwargs):
             self.cuda_graph_manager.state = jit_utils.CUDAGraphState.CAPTURE
             self.cuda_graph_manager.batch_size = entry.real_shape
             entry.captured = True
-            with self.cuda_graph_manager.run_impl_guard():
-                entry.runnable(**kwargs)
+            with capture_custom_allreduce():
+                with self.cuda_graph_manager.run_impl_guard():
+                    entry.runnable(**kwargs)
 
         # Replay
         self.cuda_graph_manager.state = jit_utils.CUDAGraphState.REPLAY
diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py
index 8d1be796f4..c9bec78d9f 100644
--- a/fastdeploy/worker/gpu_model_runner.py
+++ b/fastdeploy/worker/gpu_model_runner.py
@@ -1555,6 +1555,7 @@ def _update_chunked_prefill(self, tasks):
                 self.proposer.update_task_chunk_prefill(task)
             task.chunk_idx += 1
 
+    @sot_warmup_guard(True)
     def capture_model(self) -> None:
         """
         Trigger CUDA Graph capture for all shapes in cuda graph capture list
diff --git a/fastdeploy/worker/gpu_worker.py b/fastdeploy/worker/gpu_worker.py
index 601efd16b5..8c72cfc751 100644
--- a/fastdeploy/worker/gpu_worker.py
+++ b/fastdeploy/worker/gpu_worker.py
@@ -207,7 +207,7 @@ def graph_optimize_and_warm_up_model(self) -> None:
         """
         Perform the warm-up and the graph optimization
         """
-        if self.fd_config.graph_opt_config.graph_opt_level >= 1:
+        if self.fd_config.graph_opt_config.graph_opt_level >= 1 and not self.model_runner.use_cudagraph:
             self.model_runner.sot_warmup()
         # Trigger cuda graph capture
         self.model_runner.capture_model()