
Commit d4b6ddb

fix: fix potential processor leakage, move session-related fields to SamplingInputs and refactor the code
1 parent 443db85 commit d4b6ddb

6 files changed: +138 −177 lines changed

lmdeploy/pytorch/engine/engine.py

Lines changed: 1 addition & 13 deletions
@@ -382,8 +382,6 @@ def __init__(self,
                               dtype=engine_config.dtype)
         self.executor.init()
 
-        self.session_to_cleanup = []
-
         # strategies
         self.strategy_factory = build_strategy_factory(self.model_config, self.executor.misc_config)
         self.sampling_strategy = self.strategy_factory.build_sampling_strategy()
@@ -915,14 +913,6 @@ def __need_logits(seqs: SeqList):
 
         sync_long_context = inputs.input_ids.numel() > self.cache_config.max_prefill_token_num
 
-        session_ctx = [{
-            'session_id': seq.session.session_id,
-            'seq_id': seq.seq_id,
-        } for seq in running]
-
-        session_to_cleanup = self.session_to_cleanup
-        self.session_to_cleanup = []
-
         return dict(
             running=running,
             inputs=inputs,
@@ -935,8 +925,6 @@ def __need_logits(seqs: SeqList):
             is_dummy=False,
             sync_long_context=sync_long_context,
             extra_inputs=extra_inputs,
-            session_ctx=session_ctx,
-            session_to_cleanup=session_to_cleanup,
         )
 
     async def _await_forward_event(self, forward_event: asyncio.Event):
@@ -1250,7 +1238,7 @@ def start_loop(self):
     def end_session(self, session_id: int):
         """End session."""
         if session_id in self.scheduler.sessions:
-            self.session_to_cleanup.append(session_id)
+            self.sampling_strategy.on_session_end(session_id)
             self.scheduler.end_session(session_id)
             return True
         return False
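
Before this change, ended sessions were queued on the engine itself and threaded through the forward inputs; now Engine.end_session notifies the sampling strategy directly. Below is a minimal sketch of how a strategy could service that hook. Only the on_session_end(session_id) call is confirmed by this diff; the SamplingStrategy body shown here is a hypothetical illustration (the real strategy comes from build_sampling_strategy() and is not part of this commit).

# Hypothetical sketch: a strategy that buffers ended sessions and drains
# them into SamplingInputs.session_to_cleanup when the next batch is built.
from typing import List


class SamplingStrategy:

    def __init__(self) -> None:
        self.session_to_cleanup: List[int] = []

    def on_session_end(self, session_id: int) -> None:
        # Invoked by Engine.end_session(); the actual removal happens later,
        # when FusedLogitsProcessor.cleanup_sessions() runs on these ids.
        self.session_to_cleanup.append(session_id)

    def drain_cleanup(self) -> List[int]:
        # Handed to SamplingInputs.session_to_cleanup, then reset.
        ids, self.session_to_cleanup = self.session_to_cleanup, []
        return ids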
lmdeploy/pytorch/engine/guided_process.py

Lines changed: 94 additions & 102 deletions
@@ -1,8 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-import copy
 import json
 import logging
-from typing import Optional
+from typing import Any, Dict, List, Optional, Tuple
 
 import torch
 import xgrammar as xgr
@@ -11,103 +10,96 @@
 logger = logging.getLogger('lmdeploy')
 
 
-class BaseLogitsProcessor:
-    """Base logits processor that uses xgrammar matcher for guided decoding."""
-
-    def __init__(self, compiled_grammar: xgr.CompiledGrammar, tokenizer_info: xgr.TokenizerInfo):
-        self.matcher = xgr.GrammarMatcher(compiled_grammar, terminate_without_stop_token=True)
-
-    def fill_bitmap(self, guided_bitmask: torch.Tensor, index: int) -> None:
-        """Fill the bitmask for the next token prediction at given index."""
-        self.matcher.fill_next_token_bitmask(guided_bitmask, index)
-
-    def accept(self, token_id: int) -> bool:
-        """Update matcher state after a token is generated."""
-        return self.matcher.accept_token(token_id)
-
-    def reset(self):
-        """Reset matcher state for next generation."""
-        self.matcher.reset()
-
-
-class RegexLogitsProcessor(BaseLogitsProcessor):
-    """Regex-guided logits processor using xgrammar."""
-
-    def __init__(self, regex_string: str, tokenizer: PreTrainedTokenizerBase, vocab_size_padded: Optional[int] = None):
-        tokenizer = copy.deepcopy(tokenizer)
-        if vocab_size_padded is None:
-            vocab_size_padded = tokenizer.vocab_size
-
-        tokenizer_info = xgr.TokenizerInfo.from_huggingface(tokenizer, vocab_size=vocab_size_padded)
-
-        compiler = xgr.GrammarCompiler(tokenizer_info)
-        compiled = compiler.compile_regex_grammar(regex_string)
-
-        super().__init__(compiled, tokenizer_info)
-
-
-class JSONLogitsProcessor(BaseLogitsProcessor):
-    """JSON-schema guided logits processor using xgrammar."""
-
-    def __init__(self, schema: str, tokenizer: PreTrainedTokenizerBase, vocab_size_padded: Optional[int] = None):
-        tokenizer = copy.deepcopy(tokenizer)
-        tokenizer_info = xgr.TokenizerInfo.from_huggingface(tokenizer, vocab_size=vocab_size_padded)
-        if vocab_size_padded is None:
-            vocab_size_padded = tokenizer.vocab_size
-
-        compiler = xgr.GrammarCompiler(tokenizer_info)
-        if isinstance(schema, str):
-            schema = json.loads(schema)
-
-        assert isinstance(schema, dict)
-        compiled = compiler.compile_json_schema(schema)
-
-        super().__init__(compiled, tokenizer_info)
-
-
-_guided_processors = {}
-
-
-def _get_guided_logits_processor(session_id: int,
-                                 seq_id: int,
-                                 guide: str,
-                                 tokenizer: PreTrainedTokenizerBase,
-                                 type: str,
-                                 vocab_size_padded: Optional[int] = None):
-    if session_id in _guided_processors:
-        session_dict = _guided_processors[session_id]
-        if seq_id in session_dict:
-            processor = session_dict[seq_id]
-            return processor
-
-    if type == 'json_schema':
-        processor = JSONLogitsProcessor(guide, tokenizer, vocab_size_padded)
-    elif type == 'regex_schema':
-        processor = RegexLogitsProcessor(guide, tokenizer, vocab_size_padded)
-    else:
-        assert False, f'Do not support schema type {type}'
-
-    _guided_processors.setdefault(session_id, {})[seq_id] = processor
-    return processor
-
-
-def _remove_guided_logtis_processor(session_id: int):
-    if session_id in _guided_processors:
-        del _guided_processors[session_id]
-
-
-def _allocate_batched_bitmap(batch_size: int, vocab_size: int):
-    return xgr.allocate_token_bitmask(batch_size, vocab_size)
-
-
-def _apply_batched_bitmap(logits: torch.Tensor, guided_bitmask: torch.Tensor) -> None:
-    device = logits.device
-    dtype = logits.dtype
-
-    if device.type in {'cpu', 'cuda'}:
-        xgr.apply_token_bitmask_inplace(logits, guided_bitmask.to(device))
-    else:
-        cpu_logits = logits.cpu().float()
-        cpu_mask = guided_bitmask.cpu()
-        xgr.apply_token_bitmask_inplace(cpu_logits, cpu_mask)
-        logits.copy_(cpu_logits.to(device, dtype))
+class GuidedDecodingMangager:
+    processors = {}
+
+    def __init__(self, tokenizer: PreTrainedTokenizerBase, vocab_size: Optional[int]):
+        if vocab_size is None:
+            vocab_size = tokenizer.vocab_size
+
+        tokenizer_info = xgr.TokenizerInfo.from_huggingface(tokenizer, vocab_size=vocab_size)
+        self.compiler = xgr.GrammarCompiler(tokenizer_info)
+        self.vocab_size = vocab_size
+
+    def get_processors(self, session_ctx: List[Dict[str, Any]],
+                       response_formats: Tuple[Dict]) -> Dict[int, xgr.GrammarMatcher]:
+        processors = {}
+        for i, _format in enumerate(response_formats):
+            if isinstance(_format, Dict) and _format.get('type', 'text') != 'text':
+                if _format['type'] == 'json_schema':
+                    schema = _format['json_schema']
+                    if isinstance(schema, Dict):
+                        for key in ['json_schema', 'schema']:
+                            if key in schema:
+                                schema = json.dumps(schema[key], ensure_ascii=False)
+
+                    if not isinstance(schema, str):
+                        raise ValueError(f'Cannot parse schema {schema}. The schema must be '
+                                         'either a dictionary or a string that contains the'
+                                         ' JSON Schema specification')
+                elif _format['type'] == 'regex_schema':
+                    schema = _format.get('regex_schema', '')
+                else:
+                    raise ValueError(f"unsupported format type: {_format['type']}")
+
+                session_id = session_ctx[i]['session_id']
+                seq_id = session_ctx[i]['seq_id']
+
+                processors[i] = self.get_processor(session_id, seq_id, schema, _format['type'])
+
+        return processors
+
+    def get_processor(self, session_id: int, seq_id: int, schema: str, type: str) -> xgr.GrammarMatcher:
+        if session_id in self.processors:
+            session_dict = self.processors[session_id]
+            if seq_id in session_dict:
+                processor = session_dict[seq_id]
+                return processor
+
+        if type == 'json_schema':
+            if isinstance(schema, str):
+                schema = json.loads(schema)
+
+            assert isinstance(schema, dict)
+            compiled = self.compiler.compile_json_schema(schema)
+        elif type == 'regex_schema':
+            compiled = self.compiler.compile_regex_grammar(schema)
+        else:
+            assert False, f'Do not support schema type {type}'
+
+        processor = xgr.GrammarMatcher(compiled, terminate_without_stop_token=True)
+        self.processors.setdefault(session_id, {})[seq_id] = processor
+        logger.info(f'create guided processor for session_id={session_id}, seq_id={seq_id}, and '
+                    f'total_processors={len(self.processors)}')
+        return processor
+
+    def remove_processor(self, session_id: int):
+        if session_id in self.processors:
+            del self.processors[session_id]
+            logger.info(
+                f'delete guided processor for session_id={session_id}, and total_processors={len(self.processors)}')
+
+    def allocate_batched_bitmap(self, batch_size: int) -> torch.Tensor:
+        return xgr.allocate_token_bitmask(batch_size, self.vocab_size)
+
+    def fill_bitmap(self, processor: xgr.GrammarMatcher, guided_bitmask: torch.Tensor, index: int) -> None:
+        processor.fill_next_token_bitmask(guided_bitmask, index)
+
+    def accept_token(self, processor: xgr.GrammarMatcher, token: int) -> None:
+        processor.accept_token(token)
+
+    def apply_batched_bitmap(self, logits: torch.Tensor, guided_bitmask: torch.Tensor) -> None:
+        device = logits.device
+        dtype = logits.dtype
+
+        if device.type in {'cpu', 'cuda'}:
+            xgr.apply_token_bitmask_inplace(logits, guided_bitmask.to(device))
+        else:
+            cpu_logits = logits.cpu().float()
+            cpu_mask = guided_bitmask.cpu()
+            xgr.apply_token_bitmask_inplace(cpu_logits, cpu_mask)
+            logits.copy_(cpu_logits.to(device, dtype))
+
+    def clear(self) -> None:
+        self.processors.clear()
+        logger.info(f'clear guided processors, total_processors={len(self.processors)}')
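
To make the new API concrete, here is a minimal usage sketch of one guided decode step with GuidedDecodingMangager. It is not part of the commit: the 'gpt2' tokenizer, session ids, and schema below are made-up example values, and the batch size is 1 for brevity.

import torch
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('gpt2')  # example tokenizer
manager = GuidedDecodingMangager(tokenizer, vocab_size=None)

session_ctx = [{'session_id': 0, 'seq_id': 0}]
response_formats = ({'type': 'json_schema', 'json_schema': {'schema': {'type': 'object'}}}, )

# One cached GrammarMatcher per guided sequence, keyed by (session_id, seq_id).
processors = manager.get_processors(session_ctx, response_formats)

logits = torch.randn(1, manager.vocab_size)
bitmask = manager.allocate_batched_bitmap(batch_size=1)
for i, processor in processors.items():
    manager.fill_bitmap(processor, bitmask, i)
manager.apply_batched_bitmap(logits, bitmask)  # disallowed tokens -> -inf

token = int(logits.argmax(dim=-1)[0])
for i, processor in processors.items():
    manager.accept_token(processor, token)  # advance the grammar state

manager.remove_processor(session_id=0)  # release the cache when the session ends

Note that processors is a class-level dict, so cached matchers survive until remove_processor or clear is called; that retention is exactly the leak surface the engine-side on_session_end hook now covers.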

lmdeploy/pytorch/engine/logits_process.py

Lines changed: 16 additions & 48 deletions
@@ -1,15 +1,14 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import asyncio
-import json
 from dataclasses import dataclass, fields
 from typing import Any, Dict, List, Optional, Tuple
 
 import torch
 
 from lmdeploy.messages import LogitsProcessor
-from lmdeploy.tokenizer import Tokenizer
 
 from ..messages import SchedulerSequence
+from .guided_process import GuidedDecodingMangager
 
 
 def _process_temperature_(scores: torch.Tensor, temperature: torch.Tensor):
@@ -78,37 +77,6 @@ def _multinomial_sampling(scores: torch.Tensor,
 
     return multinomial_sampling(scores, seeds, offsets, indices)
 
 
-def _get_guided_processors(response_formats: Tuple[Dict], tokenizer: object, vocab_size_padded: int,
-                           session_ctx: List[Dict[str, Any]]):
-    processors = {}
-    for i, _format in enumerate(response_formats):
-        if isinstance(_format, Dict) and _format.get('type', 'text') != 'text':
-            if _format['type'] == 'json_schema':
-                schema = _format['json_schema']
-                if isinstance(schema, Dict):
-                    for key in ['json_schema', 'schema']:
-                        if key in schema:
-                            schema = json.dumps(schema[key], ensure_ascii=False)
-
-                if not isinstance(schema, str):
-                    raise ValueError(f'Cannot parse schema {schema}. The schema must be '
-                                     'either a dictionary or a string that contains the'
-                                     ' JSON Schema specification')
-            elif _format['type'] == 'regex_schema':
-                schema = _format.get('regex_schema', '')
-            else:
-                raise ValueError(f"unsupported format type: {_format['type']}")
-
-            session_id = session_ctx[i]['session_id']
-            seq_id = session_ctx[i]['seq_id']
-
-            from .guided_process import _get_guided_logits_processor
-            processors[i] = _get_guided_logits_processor(session_id, seq_id, schema, tokenizer, _format['type'],
-                                                         vocab_size_padded)
-
-    return processors
-
-
 SeqList = List[SchedulerSequence]
 
 
@@ -133,6 +101,8 @@ class SamplingInputs:
     all_ids: Optional[torch.Tensor] = None
     num_ignore_eos: torch.Tensor = None
     batch_size: int = 0
+    session_ctx: Optional[List[Dict[str, Any]]] = None
+    session_to_cleanup: Optional[List[int]] = None
 
     def to_device(self, device: str, non_blocking: bool = False):
         """To device."""
@@ -162,17 +132,19 @@ class FusedLogitsProcessor:
     def __init__(
         self,
        sampling_inputs: SamplingInputs,
-        tokenizer: Optional[Tokenizer] = None,
        sampling_vocab_size: Optional[int] = None,
        logprobs_mode: Optional[str] = None,
-        session_ctx: Optional[List[Dict[str, Any]]] = None,
+        guided_decoding_manager: Optional[GuidedDecodingMangager] = None,
     ):
        self.sampling_inputs: SamplingInputs = sampling_inputs
-        self.tokenizer = tokenizer
        self.sampling_vocab_size = sampling_vocab_size
        self.logprobs_mode = logprobs_mode
-        self.guided_processors = _get_guided_processors(sampling_inputs.response_formats, tokenizer,
-                                                        sampling_vocab_size, session_ctx)
+        self.guided_decoding_manager = guided_decoding_manager
+        if sampling_inputs.session_to_cleanup:
+            self.cleanup_sessions(sampling_inputs.session_to_cleanup)
+
+        self.guided_processors = self.guided_decoding_manager.get_processors(sampling_inputs.session_ctx,
+                                                                             sampling_inputs.response_formats)
 
     async def _wait_stream_once(self):
         """Wait stream once."""
@@ -211,19 +183,17 @@ async def __call__(self, scores: torch.FloatTensor) -> torch.FloatTensor:
         all_ids = sampling_inputs.all_ids
         custom_logits_processors = self.sampling_inputs.logits_processors
         if self.guided_processors:
-            from .guided_process import _allocate_batched_bitmap, _apply_batched_bitmap
-
             if not hasattr(self, 'guided_bitmask'):
-                self.guided_bitmask = _allocate_batched_bitmap(len(scores), self.sampling_vocab_size)
+                self.guided_bitmask = self.guided_decoding_manager.allocate_batched_bitmap(len(scores))
 
             assert self.guided_bitmask is not None
             guided_bitmask = self.guided_bitmask
 
             await self._wait_stream_once()
             for i, processor in self.guided_processors.items():
-                processor.fill_bitmap(guided_bitmask, i)
+                self.guided_decoding_manager.fill_bitmap(processor, guided_bitmask, i)
 
-            _apply_batched_bitmap(scores, guided_bitmask)
+            self.guided_decoding_manager.apply_batched_bitmap(scores, guided_bitmask)
 
         if any(custom_logits_processors):
             await self._wait_stream_once()
@@ -298,7 +268,7 @@ def __random_sampling(scores: torch.Tensor, indices: torch.LongTensor):
 
         if self.guided_processors:
             for i, processor in self.guided_processors.items():
-                processor.accept(result[i])
+                self.guided_decoding_manager.accept_token(processor, result[i])
 
         return result
 
@@ -318,8 +288,6 @@ def compute_logprobs(self, raw_logprobs: torch.Tensor, token_ids: torch.LongTens
 
         return logprobs, indices.to(torch.int32)
 
-    @staticmethod
-    def cleanup_sessions(session_ids: List[int]):
-        from .guided_process import _remove_guided_logtis_processor
+    def cleanup_sessions(self, session_ids: List[int]):
         for session_id in session_ids:
-            _remove_guided_logtis_processor(session_id)
+            self.guided_decoding_manager.remove_processor(session_id)
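
Finally, a sketch of the refactored construction path: session bookkeeping now rides on SamplingInputs rather than extra constructor arguments, and the manager replaces the raw tokenizer. The values below are examples only, and it assumes the remaining SamplingInputs dataclass fields keep their defaults (the manager variable reuses the one from the earlier sketch).

# Illustrative wiring only; values are examples, not taken from the commit.
sampling_inputs = SamplingInputs(
    response_formats=({'type': 'regex_schema', 'regex_schema': r'\d+'}, ),
    batch_size=1,
    session_ctx=[{'session_id': 0, 'seq_id': 0}],  # moved off the engine
    session_to_cleanup=[],  # session ids to release before this step
)

logits_processor = FusedLogitsProcessor(
    sampling_inputs,
    sampling_vocab_size=manager.vocab_size,
    logprobs_mode=None,
    guided_decoding_manager=manager,  # replaces the old tokenizer/session_ctx args
)
# Any ids in session_to_cleanup are released in __init__ via cleanup_sessions(),
# which forwards each one to manager.remove_processor(session_id).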
