Commit 5ea4fdc

handling comments
1 parent 02222b8 commit 5ea4fdc

4 files changed, +385 -15 lines changed

python/ray/serve/_private/request_router/request_router.py

Lines changed: 59 additions & 6 deletions
@@ -217,6 +217,11 @@ def __init__(self, *args, **kwargs):
 
         # Batching-aware routing: track pending requests by model ID for better batching
         self._pending_requests_by_model_id: DefaultDict[str, List] = defaultdict(list)
+        # Counters for efficient cleanup
+        self._pending_requests_added_since_cleanup = 0
+        self._last_cleanup_time = time.time()
+        self._cleanup_threshold = 50  # Cleanup after 50 new requests
+        self._cleanup_interval = 10.0  # Cleanup every 10 seconds
 
     def _get_pending_request_matching_multiplexed_model_id(
         self,
@@ -239,21 +244,47 @@ def _track_pending_request_by_model_id(self, pending_request: PendingRequest):
         if pending_request.metadata.multiplexed_model_id:
             model_id = pending_request.metadata.multiplexed_model_id
             self._pending_requests_by_model_id[model_id].append(pending_request)
+            self._pending_requests_added_since_cleanup += 1
 
     def _get_pending_requests_for_model(self, model_id: str) -> List[PendingRequest]:
         """Get all pending requests for a specific model ID."""
-        return [pr for pr in self._pending_requests_by_model_id[model_id]
-                if not pr.future.done()]
+        # Filter out completed requests on-the-fly for immediate use
+        active_requests = [pr for pr in self._pending_requests_by_model_id[model_id]
+                           if not pr.future.done()]
+        return active_requests
+
+    def _should_cleanup_pending_requests(self) -> bool:
+        """Determine if we should perform cleanup based on counters and time."""
+        return (self._pending_requests_added_since_cleanup >= self._cleanup_threshold or
+                (time.time() - self._last_cleanup_time) >= self._cleanup_interval)
 
     def _cleanup_completed_pending_requests(self):
-        """Clean up completed requests from model ID tracking."""
+        """Clean up completed requests from model ID tracking efficiently."""
+        # Only cleanup if we've accumulated enough requests or enough time has passed
+        if not self._should_cleanup_pending_requests():
+            return
+
+        cleanup_start = time.time()
+        total_requests_before = sum(len(requests) for requests in self._pending_requests_by_model_id.values())
+
         for model_id in list(self._pending_requests_by_model_id.keys()):
             self._pending_requests_by_model_id[model_id] = [
                 pr for pr in self._pending_requests_by_model_id[model_id]
                 if not pr.future.done()
             ]
             if not self._pending_requests_by_model_id[model_id]:
                 del self._pending_requests_by_model_id[model_id]
+
+        total_requests_after = sum(len(requests) for requests in self._pending_requests_by_model_id.values())
+        cleanup_time = time.time() - cleanup_start
+
+        # Reset counters
+        self._pending_requests_added_since_cleanup = 0
+        self._last_cleanup_time = time.time()
+
+        if total_requests_before != total_requests_after:
+            logger.debug(f"Cleaned up {total_requests_before - total_requests_after} completed requests "
+                         f"in {cleanup_time:.3f}s, {total_requests_after} active requests remaining")
 
     def _update_multiplexed_model_ids_with_replicas(
         self, replicas: List[RunningReplica]
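
The two hunks above split the bookkeeping into cheap reads and throttled writes: lookups filter completed futures on the fly, while a full sweep of the per-model index only runs once 50 new requests have been tracked or 10 seconds have passed. A minimal standalone sketch of that throttling pattern (ThrottledIndex, is_done, and the parameter names are illustrative only, not Ray Serve APIs):

import time
from collections import defaultdict


class ThrottledIndex:
    """Sketch: per-key lists of pending items whose full cleanup is throttled."""

    def __init__(self, cleanup_threshold=50, cleanup_interval=10.0):
        self._items = defaultdict(list)
        self._added_since_cleanup = 0
        self._last_cleanup_time = time.time()
        self._cleanup_threshold = cleanup_threshold
        self._cleanup_interval = cleanup_interval

    def add(self, key, item):
        self._items[key].append(item)
        self._added_since_cleanup += 1

    def active(self, key, is_done):
        # Reads filter on the fly, so callers never see completed items.
        return [it for it in self._items[key] if not is_done(it)]

    def maybe_cleanup(self, is_done):
        # Full sweeps only run after enough inserts or enough elapsed time.
        due = (self._added_since_cleanup >= self._cleanup_threshold
               or time.time() - self._last_cleanup_time >= self._cleanup_interval)
        if not due:
            return
        for key in list(self._items):
            self._items[key] = [it for it in self._items[key] if not is_done(it)]
            if not self._items[key]:
                del self._items[key]
        self._added_since_cleanup = 0
        self._last_cleanup_time = time.time()

The trade-off is bounded staleness: completed entries can linger for up to one threshold window, but no single request path ever pays for a full sweep on its own.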
@@ -349,9 +380,31 @@ def apply_multiplex_routing(
         if candidate_replica_ids and multiplexed_model_id:
             pending_for_model = self._get_pending_requests_for_model(multiplexed_model_id)
             if len(pending_for_model) > 1:  # Multiple requests for same model
-                # Prefer replicas that are likely processing this model
-                logger.debug(f"Found {len(pending_for_model)} pending requests for model {multiplexed_model_id}, "
-                             f"prioritizing batching-friendly routing")
+                # Find replicas that already have pending requests for this model
+                batching_friendly_replicas = set()
+
+                for pending_req in pending_for_model:
+                    # Check if this request has been assigned to a replica
+                    if (pending_req.future.done() and
+                            not pending_req.future.cancelled() and
+                            not pending_req.future.exception()):
+                        try:
+                            assigned_replica = pending_req.future.result()
+                            if (hasattr(assigned_replica, 'replica_id') and
+                                    assigned_replica.replica_id in candidate_replica_ids):
+                                batching_friendly_replicas.add(assigned_replica.replica_id)
+                        except Exception:
+                            # Future might not have replica result, skip
+                            pass
+
+                # If we found replicas with pending requests for this model, prioritize them
+                if batching_friendly_replicas:
+                    candidate_replica_ids = batching_friendly_replicas
+                    logger.debug(f"Found {len(pending_for_model)} pending requests for model {multiplexed_model_id}, "
+                                 f"prioritizing {len(batching_friendly_replicas)} batching-friendly replicas")
+                else:
+                    logger.debug(f"Found {len(pending_for_model)} pending requests for model {multiplexed_model_id}, "
+                                 f"but no batching-friendly replicas found in candidates")
 
         if (
             not candidate_replica_ids
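
Stripped of the future bookkeeping, the routing change above is a set intersection with a fallback: keep only the candidate replicas that already hold assigned requests for the same model, and if none do, leave the candidate set untouched. A hypothetical helper (not part of the router) makes the decision explicit:

def prefer_colocated_replicas(candidate_replica_ids: set, assigned_replica_ids: set) -> set:
    """Keep candidates that already have work for this model; otherwise keep them all."""
    colocated = candidate_replica_ids & assigned_replica_ids
    return colocated if colocated else candidate_replica_ids


# Example: two of three candidates already hold requests for this model ID.
print(prefer_colocated_replicas({"r1", "r2", "r3"}, {"r2", "r3", "r9"}))  # {'r2', 'r3'}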

python/ray/serve/multiplex.py

Lines changed: 132 additions & 2 deletions
@@ -160,9 +160,11 @@ async def model_batch_handler(batch_requests: List[Any]) -> List[Any]:
             Returns:
                 List of results corresponding to each input.
             """
+            # Re-check model availability at processing time to handle race conditions
             model = self.models.get(model_id)
             if model is None:
-                raise RuntimeError(f"Model {model_id} not loaded")
+                # Model was evicted, raise an exception that will cancel pending requests
+                raise RuntimeError(f"Model {model_id} was evicted during batch processing")
 
             # Try to use batch_predict method if available
             if hasattr(model, 'batch_predict'):
@@ -192,6 +194,124 @@ async def model_batch_handler(batch_requests: List[Any]) -> List[Any]:
 
         return self._model_batch_queues[model_id]
 
+    async def _shutdown_batch_queue(self, batch_queue_wrapper: _LazyBatchQueueWrapper, model_id: str):
+        """Gracefully shutdown a batch queue by canceling pending requests and background tasks."""
+        if batch_queue_wrapper._queue is None:
+            # Queue was never initialized, nothing to clean up
+            return
+
+        batch_queue = batch_queue_wrapper._queue
+
+        # Cancel the background processing task if it exists
+        if hasattr(batch_queue, '_handle_batch_task') and batch_queue._handle_batch_task:
+            batch_queue._handle_batch_task.cancel()
+            try:
+                await batch_queue._handle_batch_task
+            except asyncio.CancelledError:
+                pass  # Expected when cancelling
+
+        # Cancel all pending requests in the queue
+        pending_requests = []
+        try:
+            while True:
+                try:
+                    request = batch_queue.queue.get_nowait()
+                    pending_requests.append(request)
+                except asyncio.QueueEmpty:
+                    break
+        except Exception:
+            pass  # Queue might be closed or corrupted
+
+        # Handle pending requests gracefully - try to reassign rather than fail
+        reassigned_count = 0
+        failed_count = 0
+
+        for request in pending_requests:
+            if not request.future.done():
+                try:
+                    # Try to reassign the request back to the routing system
+                    if await self._try_reassign_request(request, model_id):
+                        reassigned_count += 1
+                    else:
+                        # If reassignment fails, set a descriptive error
+                        request.future.set_exception(
+                            RuntimeError(f"Model {model_id} was evicted and could not be reassigned")
+                        )
+                        failed_count += 1
+                except Exception:
+                    # Future might already be done or other error, count as failed
+                    failed_count += 1
+
+        logger.info(f"Shutdown batch queue for model {model_id}: reassigned {reassigned_count}, failed {failed_count} pending requests")
+
+    async def _try_reassign_request(self, request: _SingleRequest, model_id: str) -> bool:
+        """Try to reassign a pending request back to the routing system.
+
+        Args:
+            request: The pending request to reassign
+            model_id: The model ID that was evicted
+
+        Returns:
+            True if request was successfully reassigned, False otherwise
+        """
+        try:
+            # Extract the original input from the flattened args
+            if len(request.flattened_args) >= 2 and request.flattened_args[0] == DUMMY_TYPE:
+                original_input = request.flattened_args[1]
+            else:
+                # Fallback if format is unexpected
+                return False
+
+            # Check if we have retry attempts left (prevent infinite loops)
+            retry_count = getattr(request, '_retry_count', 0)
+            if retry_count >= 2:  # Max 2 retries
+                return False
+
+            # Create a new async task to retry the request with backoff
+            async def retry_request():
+                try:
+                    # Add retry count to track attempts
+                    setattr(request, '_retry_count', retry_count + 1)
+
+                    # Exponential backoff: wait longer for each retry
+                    backoff_time = 0.01 * (2 ** retry_count)
+                    await asyncio.sleep(backoff_time)
+
+                    # Try to process the request again - this will go through the full
+                    # model loading process, potentially reloading on this replica
+                    # Note: We call predict directly rather than batched_inference to avoid
+                    # potential batching complications during retry
+                    if self.enable_batching:
+                        # For batching case, try individual prediction as fallback
+                        model = await self.load_model(model_id)
+                        if hasattr(model, 'predict'):
+                            result = await model.predict(original_input)
+                        elif callable(model):
+                            result = await model(original_input)
+                        else:
+                            raise RuntimeError(f"Model {model_id} is not callable and has no predict method")
+                    else:
+                        result = await self.predict(original_input, model_id)
+
+                    # Set the result on the original future
+                    if not request.future.done():
+                        request.future.set_result(result)
+
+                except Exception as e:
+                    # If retry fails, set the exception on the original future
+                    if not request.future.done():
+                        request.future.set_exception(
+                            RuntimeError(f"Model {model_id} evicted, retry failed: {str(e)}")
+                        )
+
+            # Start the retry task in the background
+            asyncio.create_task(retry_request())
+            return True
+
+        except Exception as e:
+            logger.debug(f"Failed to reassign request for model {model_id}: {e}")
+            return False
+
     async def batched_inference(self, model_id: str, request: Any) -> Any:
         """Perform batched inference on a specific model."""
         if not self.enable_batching:
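
The new _shutdown_batch_queue and _try_reassign_request methods follow a drain-then-retry shape: empty the queue with get_nowait(), then either hand each stranded request to a background retry with exponential backoff or fail its future with a descriptive error. A self-contained sketch of that shape, assuming a queue of (payload, future) pairs and a work coroutine that stands in for re-running inference (drain_and_retry and work are illustrative names, not Ray Serve APIs):

import asyncio


async def drain_and_retry(queue: asyncio.Queue, work, max_attempts: int = 2) -> int:
    """Drain (payload, future) pairs without blocking and retry them in the background."""
    pending = []
    while True:
        try:
            pending.append(queue.get_nowait())  # Non-blocking; stops once the queue is empty.
        except asyncio.QueueEmpty:
            break

    async def retry(payload, future):
        last_error = None
        for attempt in range(max_attempts):
            await asyncio.sleep(0.01 * (2 ** attempt))  # 10 ms, then 20 ms, ...
            try:
                result = await work(payload)
            except Exception as e:
                last_error = e
                continue
            if not future.done():
                future.set_result(result)
            return
        if not future.done():
            future.set_exception(RuntimeError(f"Retries exhausted: {last_error}"))

    handed_off = 0
    for payload, future in pending:
        if not future.done():
            asyncio.create_task(retry(payload, future))  # Fire-and-forget, as in the diff above.
            handed_off += 1
    return handed_off

As in the diff, retries are fire-and-forget tasks, so eviction never blocks on a slow reload; the cost is that a caller only learns about a permanently failed retry when the background task gives up and sets the exception.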
@@ -292,6 +412,14 @@ async def shutdown(self):
                 logger.exception(
                     f"Failed to unload model. Error: {e}",
                 )
+
+        # Clean up any remaining batch queues
+        for model_id, batch_queue_wrapper in list(self._model_batch_queues.items()):
+            try:
+                await self._shutdown_batch_queue(batch_queue_wrapper, model_id)
+            except Exception as e:
+                logger.exception(f"Failed to shutdown batch queue for model {model_id}. Error: {e}")
+        self._model_batch_queues.clear()
 
     async def load_model(self, model_id: str) -> Any:
         """Load the model if it is not loaded yet, and return
@@ -373,8 +501,10 @@ async def unload_model_lru(self) -> None:
         model_id, model = self.models.popitem(last=False)
         logger.info(f"Unloading model '{model_id}'.")
 
-        # Clean up the batch queue for this model if it exists
+        # Gracefully shutdown the batch queue for this model if it exists
         if model_id in self._model_batch_queues:
+            batch_queue_wrapper = self._model_batch_queues[model_id]
+            await self._shutdown_batch_queue(batch_queue_wrapper, model_id)
             del self._model_batch_queues[model_id]
 
         # If the model has __del__ attribute, call it.
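
The unload_model_lru hunk leans on OrderedDict.popitem(last=False), which removes and returns the oldest entry, and now awaits a per-model shutdown hook before dropping the batch queue. A compact sketch of that eviction shape (LRUModelCache and on_evict are illustrative names, not the wrapper's interface):

from collections import OrderedDict


class LRUModelCache:
    """Sketch: an LRU cache that awaits an async cleanup hook on eviction."""

    def __init__(self, max_models: int, on_evict):
        self.models = OrderedDict()   # Oldest entry first; move_to_end() marks recent use.
        self.max_models = max_models
        self.on_evict = on_evict      # async callable(model_id, model)

    async def put(self, model_id, model):
        self.models[model_id] = model
        self.models.move_to_end(model_id)
        while len(self.models) > self.max_models:
            evicted_id, evicted = self.models.popitem(last=False)  # Least recently used.
            await self.on_evict(evicted_id, evicted)               # e.g. shut down its batch queue.

    def get(self, model_id):
        model = self.models[model_id]
        self.models.move_to_end(model_id)  # Mark as most recently used.
        return model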

python/ray/serve/tests/test_multiplex_batching_router.py

Lines changed: 8 additions & 6 deletions
@@ -200,22 +200,24 @@ async def load_model(model_id: str):
     # Load model first
     model = await wrapper_batched.load_model("batched_model")
 
-    # Send concurrent requests to same model using the model directly
+    # Send concurrent requests to the wrapper to test batching mechanism
     start_time = time.time()
     tasks = []
     for i in range(10):
-        task = model.batch_predict([f"data_{i}"])
+        # Use wrapper.predict() to test the actual batching mechanism
+        task = wrapper_batched.predict(f"data_{i}", "batched_model")
         tasks.append(task)
 
-    results_nested = await asyncio.gather(*tasks)
-    # Flatten results since batch_predict returns lists
-    results = [item for sublist in results_nested for item in sublist]
+    results = await asyncio.gather(*tasks)
     batched_time = time.time() - start_time
 
-    # Check the model's batch predict was called
+    # Check that batch predict was called (indicating batching worked)
     assert model.batch_predict_count > 0, "Batch predict should be called"
     assert len(results) == 10, "All requests should complete"
 
+    # Verify results are correct format - should be from batch_predict
+    assert all("batch_batched_model" in result for result in results), f"Expected batch results, got: {results[:3]}"
+
     # Test without batching for comparison
     TrackableModel.reset_tracking()
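
The reworked test drives requests through the wrapper's predict() path and then asserts both that every request completed and that the batch handler actually ran. The general shape of that assertion, shown against a hypothetical self-contained stub (BatchingStub is not the Ray Serve wrapper, just a stand-in for gathering concurrent calls and counting batch executions):

import asyncio


class BatchingStub:
    """Stand-in for a batching wrapper: buffers concurrent calls briefly, serves them in one batch."""

    def __init__(self, window: float = 0.01):
        self.window = window
        self.batch_calls = 0
        self._pending = []   # List of (item, future) pairs.
        self._flusher = None

    async def predict(self, item: str) -> str:
        fut = asyncio.get_running_loop().create_future()
        self._pending.append((item, fut))
        if self._flusher is None:
            self._flusher = asyncio.create_task(self._flush())
        return await fut

    async def _flush(self):
        await asyncio.sleep(self.window)          # Let concurrent callers pile up.
        batch, self._pending, self._flusher = self._pending, [], None
        self.batch_calls += 1
        for item, fut in batch:
            fut.set_result(f"batch_{item}")       # One batch result per input.


async def test_concurrent_calls_are_batched():
    stub = BatchingStub()
    results = await asyncio.gather(*(stub.predict(f"data_{i}") for i in range(10)))
    assert len(results) == 10, "All requests should complete"
    assert stub.batch_calls < 10, "Requests should be served in far fewer batch calls"


asyncio.run(test_concurrent_calls_are_batched())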
