@@ -3,7 +3,7 @@
 import logging
 import time
 from collections import OrderedDict
-from typing import Any, Callable, List, Set
+from typing import Any, Callable, List, Set, Optional
 
 from ray.serve import metrics
 from ray.serve._private.common import ReplicaID, RequestRoutingInfo
@@ -15,6 +15,8 @@
 from ray.serve._private.metrics_utils import MetricsPusher
 from ray.serve._private.usage import ServeUsageTag
 from ray.serve.context import _get_global_client, _get_internal_replica_context
+from ray.serve.batching import _LazyBatchQueueWrapper, _SingleRequest
+from ray._common.signature import DUMMY_TYPE
 
 logger = logging.getLogger(SERVE_LOGGER_NAME)
 
@@ -39,16 +41,26 @@ class _ModelMultiplexWrapper:
     def __init__(
         self,
         model_load_func: Callable[[str], Any],
-        self_arg: Any,
-        max_num_models_per_replica: int,
+        self_arg: Any = None,
+        max_num_models_per_replica: int = 3,
+        enable_batching: bool = False,
+        max_batch_size: int = 10,
+        batch_wait_timeout_s: float = 0.01,
+        max_concurrent_batches: int = 1,
     ):
         """Initialize the model multiplexer.
         Args:
             model_load_func: the model load async function.
-            self_arg: self argument when model_load_func is class method.
+            self_arg: self argument when model_load_func is a class method.
+                Default is None for standalone functions.
             max_num_models_per_replica: the maximum number of models to be loaded on the
                 current replica. If it is -1, there is no limit for the number of models
-                per replica.
+                per replica. Default is 3.
+            enable_batching: whether to enable batching for model inference calls.
+                Default is False.
+            max_batch_size: maximum batch size for batched inference calls. Default is 10.
+            batch_wait_timeout_s: how long to wait to fill a batch. Default is 0.01s.
+            max_concurrent_batches: maximum number of concurrent batches. Default is 1.
         """
 
         ServeUsageTag.MULTIPLEXED_API_USED.record("1")
@@ -57,6 +69,15 @@ def __init__(
         self._func: Callable = model_load_func
         self.self_arg: Any = self_arg
         self.max_num_models_per_replica: int = max_num_models_per_replica
+
+        # Batching configuration
+        self.enable_batching = enable_batching
+        self.max_batch_size = max_batch_size
+        self.batch_wait_timeout_s = batch_wait_timeout_s
+        self.max_concurrent_batches = max_concurrent_batches
+
+        # Model-specific batch queues for inference batching
+        self._model_batch_queues: dict[str, _LazyBatchQueueWrapper] = {}
 
         # log MODEL_LOAD_LATENCY_BUCKET_MS
         logger.debug(f"MODEL_LOAD_LATENCY_BUCKET_MS: {MODEL_LOAD_LATENCY_BUCKETS_MS}")
@@ -123,6 +144,114 @@ def __init__(
         )
         self.metrics_pusher.start()
 
+    def _get_or_create_batch_queue(self, model_id: str) -> Optional[_LazyBatchQueueWrapper]:
+        """Get or create a batch queue for a specific model."""
+        if not self.enable_batching:
+            return None
+
+        if model_id not in self._model_batch_queues:
+            # Create a batch handler for this specific model
+            async def model_batch_handler(batch_requests: List[Any]) -> List[Any]:
+                """Handle batched inference for a specific model.
+
+                Args:
+                    batch_requests: List of input data items to process as a batch.
+
+                Returns:
+                    List of results corresponding to each input.
+                """
+                model = self.models.get(model_id)
+                if model is None:
+                    raise RuntimeError(f"Model {model_id} not loaded")
+
+                # Try to use the batch_predict method if available
+                if hasattr(model, "batch_predict"):
+                    results = await model.batch_predict(batch_requests)
+                else:
+                    # Fall back to individual prediction calls
+                    results = []
+                    for request_data in batch_requests:
+                        if hasattr(model, "predict"):
+                            result = await model.predict(request_data)
+                        elif callable(model):
+                            result = await model(request_data)
+                        else:
+                            raise RuntimeError(
+                                f"Model {model_id} is not callable and has no predict method"
+                            )
+                        results.append(result)
+
+                return results
+
+            self._model_batch_queues[model_id] = _LazyBatchQueueWrapper(
+                max_batch_size=self.max_batch_size,
+                batch_wait_timeout_s=self.batch_wait_timeout_s,
+                max_concurrent_batches=self.max_concurrent_batches,
+                handle_batch_func=model_batch_handler,
+            )
+
+        return self._model_batch_queues[model_id]
+
+    async def batched_inference(self, model_id: str, request: Any) -> Any:
+        """Perform batched inference on a specific model."""
+        if not self.enable_batching:
+            raise RuntimeError("Batching is not enabled for this multiplexer")
+
+        # Ensure the model is loaded first
+        await self.load_model(model_id)
+
+        # Get the batch queue for this model
+        batch_queue = self._get_or_create_batch_queue(model_id)
+        if batch_queue is None:
+            raise RuntimeError("Failed to create batch queue")
+
+        # Submit the request to the batch queue using the _SingleRequest format
+        import ray.serve.context as context
+        future = asyncio.get_running_loop().create_future()
+        request_context = context._get_serve_request_context()
+
+        # Create _SingleRequest with flattened args using DUMMY_TYPE for positional args
+        # Format: [DUMMY_TYPE, arg1, DUMMY_TYPE, arg2, ...] for positional args
+        single_request = _SingleRequest(
+            self_arg=None,
+            flattened_args=[DUMMY_TYPE, request],
+            future=future,
+            request_context=request_context,
+        )
+
+        batch_queue.queue.put(single_request)
+
+        return await future
+
+    async def predict(self, input_data: Any, model_id: str) -> Any:
+        """Convenience method for model prediction with optional batching.
+
+        Args:
+            input_data: The input data to predict on.
+            model_id: The model ID to use for prediction.
+
+        Returns:
+            The prediction result.
+        """
+        if self.enable_batching:
+            # Use batched inference
+            return await self.batched_inference(model_id, input_data)
+        else:
+            # Load the model and call it directly
+            model = await self.load_model(model_id)
+
+            # Try different prediction methods
+            if hasattr(model, "predict"):
+                result = await model.predict(input_data)
+            elif callable(model):
+                result = await model(input_data)
+            else:
+                raise RuntimeError(
+                    f"Model {model_id} is not callable and has no predict method"
+                )
+
+            return result
+
     def _get_loading_and_loaded_model_ids(self) -> List[str]:
         """Get the model IDs of the loaded models & loading models in the replica.
         This is to push the model id information early to the controller, so that
@@ -244,6 +373,10 @@ async def unload_model_lru(self) -> None:
         model_id, model = self.models.popitem(last=False)
         logger.info(f"Unloading model '{model_id}'.")
 
+        # Clean up the batch queue for this model if it exists
+        if model_id in self._model_batch_queues:
+            del self._model_batch_queues[model_id]
+
         # If the model has __del__ attribute, call it.
         # This is to clean up the model resources eagerly.
         if hasattr(model, "__del__"):
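
For context, a rough usage sketch of the batching path added above (illustrative only, not part of this diff). MyModel, BatchedMultiplexer, and batch_predict are hypothetical names; _ModelMultiplexWrapper is an internal class normally created by the @serve.multiplexed decorator rather than instantiated directly, and the exact construction details inside a replica may differ.

# Illustrative sketch only; names below are hypothetical and the internal
# wrapper is used directly only to show the new enable_batching parameters.
from ray import serve
from ray.serve._private.multiplex import _ModelMultiplexWrapper


class MyModel:
    def __init__(self, model_id: str):
        self.model_id = model_id

    async def batch_predict(self, batch):
        # Return one result per input in the batch.
        return [f"{self.model_id}:{item}" for item in batch]


@serve.deployment
class BatchedMultiplexer:
    def __init__(self):
        async def load_model(model_id: str) -> MyModel:
            return MyModel(model_id)

        # enable_batching=True routes predict() through the per-model batch queues.
        self._multiplexer = _ModelMultiplexWrapper(
            load_model,
            enable_batching=True,
            max_batch_size=8,
            batch_wait_timeout_s=0.01,
        )

    async def __call__(self, http_request) -> str:
        # The model id comes from the request's multiplexed-model-id metadata.
        model_id = serve.get_multiplexed_model_id()
        payload = await http_request.json()
        return await self._multiplexer.predict(payload, model_id)


app = BatchedMultiplexer.bind()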