InternLM · lvhan028 · Mar 26, 2025 · Mar 24, 2025 · Mar 25, 2025 · Mar 25, 2025
diff --git a/lmdeploy/messages.py b/lmdeploy/messages.py
@@ -301,6 +301,7 @@ class PytorchEngineConfig:
     tp: int = 1
     dp: int = 1
     dp_rank: int = 0
+    ep: int = 1
     session_len: int = None
     max_batch_size: int = None
     cache_max_entry_count: float = 0.8

diff --git a/lmdeploy/pytorch/backends/cuda/graph_runner.py b/lmdeploy/pytorch/backends/cuda/graph_runner.py
@@ -131,7 +131,11 @@ def get_graph_key(self, input_ids: torch.Tensor, position_ids: torch.Tensor, pas
         context = self.ctx_mgr.current_context()
         is_decoding = context.is_decoding
         num_tokens = input_ids.numel()
-        new_num_tokens = next_power_of_2(num_tokens)
+        meta = self.get_meta()
+        if meta.padding_batch_size is None:
+            new_num_tokens = next_power_of_2(num_tokens)
+        else:
+            new_num_tokens = next_power_of_2(meta.padding_batch_size)
         return (new_num_tokens, is_decoding)
 
     def __call__(self, **kwargs):

diff --git a/lmdeploy/pytorch/backends/graph_runner.py b/lmdeploy/pytorch/backends/graph_runner.py
@@ -1,4 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
+from dataclasses import dataclass
 from typing import List
 
 import torch
@@ -7,6 +8,11 @@
 from lmdeploy.pytorch.model_inputs import StepContext
 
 
+@dataclass
+class GraphRunnerMeta:  # mutable per-runner state; created in GraphRunner.__init__, returned by get_meta()
+    padding_batch_size: int = None  # None: graph key uses the real token count; else this value is used instead
+
+
 class GraphRunner:
     """graph runner."""
 
@@ -18,6 +24,7 @@ def __init__(self, model: torch.nn.Module, model_config: ModelConfig, cache_conf
         self.model_config = model_config
         self.cache_config = cache_config
         self.backend_config = backend_config
+        self._runner_meta = GraphRunnerMeta()
 
     def __call__(self, **kwargs):
         """call graph runner forward."""
@@ -72,3 +79,7 @@ def get_input_processor(self):
     def reset(self):
         """remove all graphs to prevent hanging on exit."""
         pass
+
+    def get_meta(self):
+        """get graphrunner meta."""
+        return self._runner_meta  # the same mutable object created in __init__, not a copy
diff --git a/lmdeploy/pytorch/engine/engine.py b/lmdeploy/pytorch/engine/engine.py
@@ -292,8 +292,8 @@ def __init__(self,
         tp = engine_config.tp
         dp = engine_config.dp
         dp_rank = engine_config.dp_rank
-        if dp > 1 and tp > 1 and not engine_config.eager_mode:
-            logger.warning('Enable eager mode on dp > 1.')
+        if engine_config.ep > 1 and not engine_config.eager_mode:
+            logger.warning('Enable eager mode on ep > 1.')
-            # TODO: support eager with dp
+            # TODO: support graph (non-eager) mode with ep
             engine_config.eager_mode = True
 

diff --git a/lmdeploy/pytorch/engine/model_agent.py b/lmdeploy/pytorch/engine/model_agent.py
@@ -357,6 +357,11 @@ def __update_inputs(next_token_ids):
             if sampling_inputs.random_offsets is not None:
                 sampling_inputs.random_offsets += 1
 
+        async def __await_distworker(worker, timeout: float = 0.001):
+            while not worker.is_completed():  # poll so the event loop keeps running meanwhile
+                await asyncio.sleep(timeout)  # `timeout` is the poll interval (seconds), not a deadline
+            worker.wait()  # work already completed; presumably finalizes the handle -- TODO confirm
+
         # dist tools
         dist_ctx = get_dist_manager().current_context()
         rank = dist_ctx.rank
@@ -368,11 +373,18 @@ def __update_inputs(next_token_ids):
                     f'num_tokens={inputs.input_ids.size(-1)}')
 
         is_decoding = inputs.is_decoding
-        if dp > 1 and not is_decoding:
-            all_sync_flags = torch.tensor([False] * dp, device='cuda')
-            lc_handle = dist.all_gather_into_tensor(all_sync_flags,
-                                                    torch.tensor(sync_long_context, device='cuda'),
-                                                    async_op=True)
+        if dp > 1:
+            if is_decoding:
+                batch_size = inputs.seq_length.numel()
+                all_batch_sizes = torch.tensor([0] * dp, device='cuda')
+                lc_handle = dist.all_gather_into_tensor(all_batch_sizes,
+                                                        all_batch_sizes.new_tensor(batch_size),
+                                                        async_op=True)
+            else:
+                all_sync_flags = torch.tensor([False] * dp, device='cuda')
+                lc_handle = dist.all_gather_into_tensor(all_sync_flags,
+                                                        torch.tensor(sync_long_context, device='cuda'),
+                                                        async_op=True)
 
         non_blocking = True
         inputs = _try_to_cuda(inputs, non_blocking=non_blocking)
@@ -385,10 +397,19 @@ def __update_inputs(next_token_ids):
         self.stream.synchronize()
 
         if dp > 1:
-            if not is_decoding:
-                lc_handle.wait()
+            if is_decoding:
+                await __await_distworker(lc_handle)
+                padding_batch_size = all_batch_sizes.cpu().max().item()
+                meta = self.patched_model.get_meta()
+                meta.padding_batch_size = padding_batch_size
+                logger.debug(f'padding_batch_size={padding_batch_size}')
+            else:
+                await __await_distworker(lc_handle)
                 sync_long_context = all_sync_flags.any()
+                logger.debug(f'sync_long_context={sync_long_context}')
             inputs.build_dp_meta()
+        else:
+            sync_long_context = False
 
         need_output = dp > 1 or rank % tp == 0