
Commit fc0fbce

fix: add a session status synchronization to help model_agent manage guided processors
1 parent 0b2c106 commit fc0fbce

4 files changed: +83 −30 lines changed

lmdeploy/pytorch/engine/engine.py

Lines changed: 15 additions & 1 deletion
@@ -382,6 +382,8 @@ def __init__(self,
                                        dtype=engine_config.dtype)
         self.executor.init()
 
+        self.session_to_cleanup = []
+
         # strategies
         self.strategy_factory = build_strategy_factory(self.model_config, self.executor.misc_config)
         self.sampling_strategy = self.strategy_factory.build_sampling_strategy()
@@ -551,7 +553,7 @@ def _on_end_session(self, reqs: List[Request], **kwargs):
             if len(msgs) > 0 and msgs[0].preserve_cache:
                 self.scheduler._set_message_status(msgs[0], MessageStatus.TO_BE_MIGRATED)
             else:
-                self.scheduler.end_session(session_id)
+                self.end_session(session_id)
             resp_type = ResponseType.SUCCESS
         if resp:
             self._response(req.resp, resp_type)
@@ -912,6 +914,15 @@ def __need_logits(seqs: SeqList):
         stopping_criteria = self.model_agent_strategy.make_stopping_criteria(running)
 
         sync_long_context = inputs.input_ids.numel() > self.cache_config.max_prefill_token_num
+
+        session_ctx = [{
+            'session_id': seq.session.session_id,
+            'seq_id': seq.seq_id,
+        } for seq in running]
+
+        session_to_cleanup = self.session_to_cleanup
+        self.session_to_cleanup = []
+
         return dict(
             running=running,
             inputs=inputs,
@@ -924,6 +935,8 @@ def __need_logits(seqs: SeqList):
             is_dummy=False,
             sync_long_context=sync_long_context,
             extra_inputs=extra_inputs,
+            session_ctx=session_ctx,
+            session_to_cleanup=session_to_cleanup,
         )
 
     async def _await_forward_event(self, forward_event: asyncio.Event):
@@ -1237,6 +1250,7 @@ def start_loop(self):
     def end_session(self, session_id: int):
         """End session."""
         if session_id in self.scheduler.sessions:
+            self.session_to_cleanup.append(session_id)
             self.scheduler.end_session(session_id)
             return True
         return False
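
The engine-side half of the fix is a deferred-cleanup handshake: end_session only records the session id, and the next batch of forward inputs carries both the per-sequence identities (session_ctx) and the drained cleanup list (session_to_cleanup). Below is a minimal, self-contained sketch of that handshake; the ToyEngine class and every name other than session_to_cleanup/session_ctx are illustrative stand-ins, not lmdeploy APIs.

# Deferred cleanup: ending a session only parks its id; the next forward batch
# drains the list so the model agent can drop per-session state on its side.
from dataclasses import dataclass, field
from typing import Dict, List


@dataclass
class ToyEngine:
    sessions: Dict[int, List[int]] = field(default_factory=dict)  # session_id -> seq_ids
    session_to_cleanup: List[int] = field(default_factory=list)

    def end_session(self, session_id: int) -> bool:
        if session_id in self.sessions:
            self.session_to_cleanup.append(session_id)  # remember for the next step
            del self.sessions[session_id]
            return True
        return False

    def make_forward_inputs(self) -> dict:
        # One (session_id, seq_id) entry per running sequence, same order as the batch.
        session_ctx = [{'session_id': sid, 'seq_id': seq}
                       for sid, seqs in self.sessions.items() for seq in seqs]
        # Drain the pending list exactly once; the receiver performs the cleanup.
        session_to_cleanup, self.session_to_cleanup = self.session_to_cleanup, []
        return dict(session_ctx=session_ctx, session_to_cleanup=session_to_cleanup)


engine = ToyEngine(sessions={1: [10], 2: [20]})
engine.end_session(1)
print(engine.make_forward_inputs())
# {'session_ctx': [{'session_id': 2, 'seq_id': 20}], 'session_to_cleanup': [1]}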

lmdeploy/pytorch/engine/guided_process.py

Lines changed: 26 additions & 14 deletions
@@ -2,14 +2,13 @@
 import copy
 import json
 import logging
-from functools import lru_cache
 from typing import Optional
 
 import torch
 import xgrammar as xgr
 from transformers import PreTrainedTokenizerBase
 
-logger = logging.getLogger('guided_process')
+logger = logging.getLogger('lmdeploy')
 
 
 class BaseLogitsProcessor:
@@ -70,18 +69,31 @@ def __init__(self, schema: str, tokenizer: PreTrainedTokenizerBase, vocab_size_p
         super().__init__(compiled, tokenizer_info)
 
 
-@lru_cache(maxsize=32)
-def _get_guided_logits_processor(guide: str,
+_guided_processors = {}
+
+
+def _get_guided_logits_processor(session_id: int,
+                                 seq_id: int,
+                                 guide: str,
                                  tokenizer: PreTrainedTokenizerBase,
                                  type: str,
                                  vocab_size_padded: Optional[int] = None):
-    try:
-        if type == 'json_schema':
-            return JSONLogitsProcessor(guide, tokenizer, vocab_size_padded)
-        elif type == 'regex_schema':
-            return RegexLogitsProcessor(guide, tokenizer, vocab_size_padded)
-        else:
-            return None
-    except Exception as e:
-        logger.error(e)
-        raise
+    if session_id in _guided_processors:
+        session_dict = _guided_processors[session_id]
+        if seq_id in session_dict:
+            processor = session_dict[seq_id]
+            return processor
+
+    if type == 'json_schema':
+        processor = JSONLogitsProcessor(guide, tokenizer, vocab_size_padded)
+    elif type == 'regex_schema':
+        processor = RegexLogitsProcessor(guide, tokenizer, vocab_size_padded)
+    else:
+        assert False, f'Do not support schema type {type}'
+
+    _guided_processors.setdefault(session_id, {})[seq_id] = processor
+    return processor
+
+
+def _remove_guided_logtis_processor(session_id: int):
+    del _guided_processors[session_id]
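
Functionally, this hunk swaps the old lru_cache(maxsize=32) for an explicit two-level dict keyed by session id and then sequence id, so the stateful xgrammar-backed processor built for a sequence is reused on every decode step and can be dropped explicitly when its session ends. A self-contained illustration of that cache behavior follows; StubProcessor and the helper names are placeholders, not the real lmdeploy classes.

# (session_id -> seq_id -> processor) cache, mirroring the structure added above.
_guided_processors = {}


class StubProcessor:
    """Stands in for the JSON/regex logits processors, which hold matcher state."""

    def __init__(self, guide: str):
        self.guide = guide


def get_processor(session_id: int, seq_id: int, guide: str) -> StubProcessor:
    cached = _guided_processors.get(session_id, {}).get(seq_id)
    if cached is not None:
        return cached  # reuse the same stateful processor across decode steps
    processor = StubProcessor(guide)
    _guided_processors.setdefault(session_id, {})[seq_id] = processor
    return processor


def remove_processors(session_id: int) -> None:
    _guided_processors.pop(session_id, None)  # drop every sequence of the session


p1 = get_processor(7, 1, '{"type": "object"}')
p2 = get_processor(7, 1, '{"type": "object"}')
assert p1 is p2              # same sequence -> same processor instance
remove_processors(7)
assert 7 not in _guided_processors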

lmdeploy/pytorch/engine/logits_process.py

Lines changed: 23 additions & 9 deletions
@@ -2,7 +2,7 @@
 import asyncio
 import json
 from dataclasses import dataclass, fields
-from typing import Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Tuple
 
 import torch
 
@@ -78,7 +78,8 @@ def _multinomial_sampling(scores: torch.Tensor,
     return multinomial_sampling(scores, seeds, offsets, indices)
 
 
-def _get_guided_processors(response_formats: Tuple[Dict], tokenizer: object, vocab_size_padded: int):
+def _get_guided_processors(response_formats: Tuple[Dict], tokenizer: object, vocab_size_padded: int,
+                           session_ctx: List[Dict[str, Any]]):
     processors = {}
     for i, _format in enumerate(response_formats):
         if isinstance(_format, Dict) and _format.get('type', 'text') != 'text':
@@ -98,8 +99,12 @@ def _get_guided_processors(response_formats: Tuple[Dict], tokenizer: object, voc
             else:
                 raise ValueError(f"unsupported format type: {_format['type']}")
 
+            session_id = session_ctx[i]['session_id']
+            seq_id = session_ctx[i]['seq_id']
+
             from .guided_process import _get_guided_logits_processor
-            processors[i] = _get_guided_logits_processor(schema, tokenizer, _format['type'], vocab_size_padded)
+            processors[i] = _get_guided_logits_processor(session_id, seq_id, schema, tokenizer, _format['type'],
+                                                         vocab_size_padded)
 
     return processors
 
@@ -154,17 +159,20 @@ def _apply_custom_logits_processors(batched_logits_processors, all_ids, logits):
 class FusedLogitsProcessor:
     """Custom logits processor."""
 
-    def __init__(self,
-                 sampling_inputs: SamplingInputs,
-                 tokenizer: Optional[Tokenizer] = None,
-                 sampling_vocab_size: Optional[int] = None,
-                 logprobs_mode: Optional[str] = None):
+    def __init__(
+        self,
+        sampling_inputs: SamplingInputs,
+        tokenizer: Optional[Tokenizer] = None,
+        sampling_vocab_size: Optional[int] = None,
+        logprobs_mode: Optional[str] = None,
+        session_ctx: Optional[List[Dict[str, Any]]] = None,
+    ):
         self.sampling_inputs: SamplingInputs = sampling_inputs
         self.tokenizer = tokenizer
         self.sampling_vocab_size = sampling_vocab_size
         self.logprobs_mode = logprobs_mode
         self.guided_processors = _get_guided_processors(sampling_inputs.response_formats, tokenizer,
-                                                         sampling_vocab_size)
+                                                         sampling_vocab_size, session_ctx)
 
     async def _wait_stream_once(self):
         """Wait stream once."""
@@ -299,3 +307,9 @@ def compute_logprobs(self, raw_logprobs: torch.Tensor, token_ids: torch.LongTens
         indices = torch.cat([indices, topk_indices], dim=-1)
 
         return logprobs, indices.to(torch.int32)
+
+    @staticmethod
+    def cleanup_sessions(session_ids: List[int]):
+        from .guided_process import _remove_guided_logtis_processor
+        for session_id in session_ids:
+            _remove_guided_logtis_processor(session_id)
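
The reason session_ctx can simply be indexed here is positional: it is built from the same running sequence list that produced the sampling inputs, so slot i in response_formats and entry i in session_ctx describe the same sequence. A small sketch of that alignment, with a stub standing in for _get_guided_logits_processor (all helper names here are illustrative):

from typing import Any, Dict, List, Tuple


def make_processor(session_id: int, seq_id: int, schema: str) -> str:
    # Stub: a real implementation would build or fetch a cached guided processor.
    return f'processor({session_id}, {seq_id}, {schema})'


def get_guided_processors(response_formats: Tuple[Dict, ...],
                          session_ctx: List[Dict[str, Any]]) -> Dict[int, str]:
    processors = {}
    for i, fmt in enumerate(response_formats):
        if not isinstance(fmt, dict) or fmt.get('type', 'text') == 'text':
            continue  # plain-text slots need no guided processor
        ctx = session_ctx[i]  # positional alignment: slot i belongs to sequence i
        processors[i] = make_processor(ctx['session_id'], ctx['seq_id'], fmt['schema'])
    return processors


formats = ({'type': 'text'}, {'type': 'json_schema', 'schema': '{}'})
ctx = [{'session_id': 3, 'seq_id': 30}, {'session_id': 4, 'seq_id': 40}]
print(get_guided_processors(formats, ctx))  # {1: 'processor(4, 40, {})'}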

lmdeploy/pytorch/engine/model_agent.py

Lines changed: 19 additions & 6 deletions
@@ -533,16 +533,20 @@ async def __long_context_single_forward(new_inputs, max_seqlen: int):
         ret['logits'] = logits
         return ret
 
-    async def async_sampling_logits(self, logits: torch.Tensor, sampling_inputs: SamplingInputs, inputs: ModelInputs):
+    async def async_sampling_logits(self, logits: torch.Tensor, sampling_inputs: SamplingInputs, inputs: ModelInputs,
+                                    session_ctx: List[Dict[str, Any]]):
         """Sampling logits."""
 
         # record function does not support async function
         # so we can not decorate it on async_sampling_logits
         with record_function('sampling_logits'):
-            logits_processor = FusedLogitsProcessor(sampling_inputs,
-                                                    self.tokenizer,
-                                                    sampling_vocab_size=self.sampling_vocab_size,
-                                                    logprobs_mode=self.misc_config.logprobs_mode)
+            logits_processor = FusedLogitsProcessor(
+                sampling_inputs,
+                self.tokenizer,
+                sampling_vocab_size=self.sampling_vocab_size,
+                logprobs_mode=self.misc_config.logprobs_mode,
+                session_ctx=session_ctx,
+            )
             origin_logits = logits
             logits, raw_logprobs = await logits_processor(origin_logits)
             next_token_ids = logits_processor.sampling(logits)
@@ -586,6 +590,8 @@ async def _async_step_background(
         is_dummy: bool = False,
         sync_long_context: bool = False,
         extra_inputs: ExtraInputs = None,
+        session_ctx: List[Dict[str, Any]] = None,
+        session_to_cleanup: List[int] = None,
     ):
         """Asyc forward task."""
         dist_ctx = get_dist_manager().current_context()
@@ -678,6 +684,9 @@ async def __prepare_dp():
 
         need_output = dp > 1 or rank % tp == 0
 
+        if session_to_cleanup:
+            self.cleanup_sessions(session_to_cleanup)
+
         # skip dummy forward.
         if is_all_dummy:
             logger.debug(f'<ForwardTask> rank[{rank}]: all inputs are dummy, skip forward.')
@@ -709,7 +718,8 @@ async def __prepare_dp():
         if need_output:
             logger.debug(f'<ForwardTask> rank[{rank}]: Sampling [{idx}].')
             # sampling
-            next_token_ids, logprobs = await self.async_sampling_logits(last_logits, sampling_inputs, inputs)
+            next_token_ids, logprobs = await self.async_sampling_logits(last_logits, sampling_inputs, inputs,
+                                                                        session_ctx)
 
             with self._broadcast_next_token(next_token_ids, dist_ctx, enable=need_broadcast_next):
                 logger.debug(f'<ForwardTask> rank[{rank}]: synchronize token ids [{idx}]')
@@ -1062,6 +1072,9 @@ def release(self):
         self.cache_engine = None
         torch.cuda.empty_cache()
 
+    def cleanup_sessions(self, session_ids: List[int]):
+        FusedLogitsProcessor.cleanup_sessions(session_ids)
+
 
 class DefaultForwardInputsMaker:
     """Default forward inputs maker."""
