add batch routing logic to service + test case

DNXie · DNXie · commit 1c7efacc7df8 · 2025-09-25T10:55:24.000-07:00
diff --git a/src/forge/controller/service/__init__.py b/src/forge/controller/service/__init__.py
@@ -7,7 +7,7 @@
 from .interface import ServiceInterface, Session, SessionContext
 from .metrics import ServiceMetrics
 from .replica import Replica, ReplicaMetrics, ReplicaState
-from .router import BatchRouter, LeastLoadedRouter, RoundRobinRouter, SessionRouter
+from .router import LeastLoadedRouter, RoundRobinRouter, SessionRouter
 from .service import Service, ServiceActor, ServiceConfig
 
 __all__ = [
@@ -24,5 +24,4 @@
     "LeastLoadedRouter",
     "RoundRobinRouter",
     "SessionRouter",
-    "BatchRouter",
 ]
diff --git a/src/forge/controller/service/router.py b/src/forge/controller/service/router.py
@@ -4,9 +4,9 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-import asyncio
+
 import logging
-from typing import Callable, Dict, List, Optional
+from typing import Dict, List
 
 from .interface import Router
 from .replica import Replica
@@ -89,134 +89,3 @@ def get_replica(
             replica.idx,
         )
         return replica
-
-
-class BatchRouter(Router):
-    """
-    Router wrapper that batches routing decisions.
-    Uses an inner router to pick the replica for each batch.
-
-    Args:
-        inner_router: The underlying Router instance used to make routing decisions
-        batch_max_size: Maximum number of requests to collect in a single batch (default: 8)
-        batch_max_wait_s: Maximum time to wait before processing a batch in seconds (default: 0.01)
-
-    Example:
-        rr_router = RoundRobinRouter()
-        batch_router = BatchRouter(rr_router, batch_max_size=16, batch_max_wait_s=0.02)
-
-        replica = await batch_router.get_replica(healthy_replicas, sess_id, session_map)
-    """
-
-    def __init__(
-        self,
-        inner_router: Router,
-        batch_max_size: int = 8,
-        batch_max_wait_s: float = 0.01,
-        get_healthy_replicas: Optional[Callable[[], List["Replica"]]] = None,
-        session_map: Optional[Dict[str, int]] = None,
-    ):
-
-        self.inner_router = inner_router
-        self.batch_max_size = batch_max_size
-        self.batch_max_wait_s = batch_max_wait_s
-        self.get_healthy_replicas = get_healthy_replicas
-        self.session_map = session_map
-
-        # Internal queue for batching routing requests
-        self._queue: asyncio.Queue = asyncio.Queue()
-        self._running = True  # flag to control loop
-        # Background task that processes batches continuously
-        self._batch_task: asyncio.Task = asyncio.create_task(self._batch_loop())
-
-    async def _batch_loop(self):
-        """Background task that continuously processes batches of routing requests.
-
-        This is the core batching logic that runs in a separate asyncio task.
-        It collects requests from the queue and processes them in batches based
-        on size and time constraints.
-
-        The loop follows these steps:
-        1. Wait for the first request to start a new batch
-        2. Collect additional requests until batch_max_size or batch_max_wait_s is reached
-        3. Make a single routing decision for the entire batch
-        4. Fulfill all futures with the selected replica
-
-        This process repeats indefinitely until the task is cancelled.
-        """
-        while self._running:
-            batch = []
-            futs = []
-            sess_ids = []
-
-            # Wait for first request
-            fut, healthy_replicas, sess_id, session_map = await self._queue.get()
-            batch.append((healthy_replicas, sess_id, session_map))
-            futs.append(fut)
-            sess_ids.append(sess_id)
-            start_time = time.monotonic()
-
-            while True:
-                try:
-                    timeout = max(
-                        0, self.batch_max_wait_s - (time.monotonic() - start_time)
-                    )
-                    (
-                        fut,
-                        healthy_replicas,
-                        sess_id,
-                        session_map,
-                    ) = await asyncio.wait_for(
-                        self._queue.get(), timeout
-                    )  # wait for timeout or until self._queue.get() finishes
-                    batch.append((healthy_replicas, sess_id, session_map))
-                    futs.append(fut)
-                    sess_ids.append(sess_id)
-
-                    if len(batch) >= self.batch_max_size:
-                        break
-                except asyncio.TimeoutError:
-                    break
-
-            if self.session_map is not None:
-                session_map = self.session_map
-            else:
-                session_map = batch[-1][2]  # use most recent session map
-            if self.get_healthy_replicas is not None:
-                healthy_replicas = self.get_healthy_replicas()
-            else:
-                healthy_replicas = batch[-1][0]  # use most recent replica state
-                # Check if any replicas have become unhealthy
-                healthy_replicas = [r for r in healthy_replicas if r.healthy]
-
-            # One routing decision for the whole batch
-            replica = await self.inner_router.get_replica(
-                healthy_replicas, None, session_map
-            )
-
-            # Fulfill all futures with the chosen replica
-            for fut in futs:
-                fut.set_result(replica)
-
-    async def get_replica(
-        self,
-        healthy_replicas: List[Replica],
-        sess_id: Optional[str] = None,
-        session_map: Optional[Dict[str, int]] = None,
-    ) -> Replica:
-        """Enqueue request and wait until batch assigns a replica."""
-        fut = asyncio.Future()
-        # Queue the request for batching - this is non-blocking
-        self._queue.put_nowait((fut, healthy_replicas, sess_id, session_map))
-
-        # Wait for the batch processor to resolve our future
-        return await fut
-
-    async def shutdown(self):
-        """Stop the batch loop gracefully."""
-        self._running = False
-        self._batch_task.cancel()
-        try:
-            await self._batch_task
-        except asyncio.CancelledError:
-            pass
diff --git a/src/forge/controller/service/service.py b/src/forge/controller/service/service.py
@@ -35,6 +35,7 @@
 import asyncio
 import logging
 import pprint
+import time
 import uuid
 from typing import Dict, List
 
@@ -110,6 +111,13 @@ async def __initialize__(self):
         self._default_router = RoundRobinRouter()
         self._session_router = SessionRouter(fallback_router=LeastLoadedRouter())
 
+        # Batching
+        self._max_batch_size = self._cfg.max_batch_size
+        self._batch_max_wait_s = self._cfg.batch_max_wait_s
+        self._batch_task: asyncio.Task | None = None
+        self._running_batch_loop = False
+        self._batch_queue: asyncio.Queue = asyncio.Queue()
+
         # Initialize all replicas
         replicas = []
         num_replicas = self._cfg.num_replicas
@@ -138,6 +146,60 @@ async def __initialize__(self):
             self._health_loop(poll_rate_s=self._cfg.health_poll_rate)
         )
 
+        # Start batch loop if batching enabled
+        if self._max_batch_size > 1:
+            self._running_batch_loop = True
+            self._batch_task = asyncio.create_task(self._batch_loop())
+
+    async def _batch_loop(self):
+        """Background task that continuously processes batches of routing requests.
+
+        This is the core batching logic that runs in a separate asyncio task.
+        It collects requests from the queue and processes them in batches based
+        on size and time constraints.
+
+        The loop follows these steps:
+        1. Wait for the first request to start a new batch
+        2. Collect additional requests until max_batch_size is reached or batch_max_wait_s elapses
+        3. Make a single routing decision for the entire batch
+        4. Fulfill all futures with the selected replica
+
+        This process repeats while _running_batch_loop is set, or until the task is cancelled.
+        """
+        while self._running_batch_loop:
+            batch_futs = []
+
+            # Wait for first request
+            fut = await self._batch_queue.get()
+            batch_futs.append(fut)
+            start_time = time.monotonic()
+
+            while True:
+                try:
+                    timeout = max(
+                        0, self._batch_max_wait_s - (time.monotonic() - start_time)
+                    )
+                    fut = await asyncio.wait_for(
+                        self._batch_queue.get(), timeout
+                    )  # wait for timeout or until self._batch_queue.get() finishes
+                    batch_futs.append(fut)
+
+                    if len(batch_futs) >= self._max_batch_size:
+                        break
+                except asyncio.TimeoutError:
+                    break
+
+            healthy_replicas = self._get_healthy_replicas()
+
+            # One routing decision for the whole batch
+            replica = self._default_router.get_replica(
+                healthy_replicas, None, self._session_replica_map
+            )
+
+            # Fulfill all futures with the chosen replica
+            for fut in batch_futs:
+                fut.set_result(replica)
+
     async def _call(self, sess_id: str | None, function: str, *args, **kwargs):
         """
         Routes a function call to the appropriate replica with load balancing and fault tolerance.
@@ -211,7 +273,7 @@ async def call_all(self, function: str, *args, **kwargs) -> List:
         Raises:
             RuntimeError: If no healthy replicas are available
         """
-        healthy_replicas = [r for r in self._replicas if r.healthy]
+        healthy_replicas = self._get_healthy_replicas()
 
         if not healthy_replicas:
             raise RuntimeError("No healthy replicas available for broadcast call")
@@ -280,9 +342,7 @@ async def _migrate_remaining_requests(self, failed_replica: Replica):
         )
 
         # Find healthy replicas
-        healthy_replicas = [
-            r for r in self._replicas if r.healthy and r != failed_replica
-        ]
+        healthy_replicas = self._get_healthy_replicas()
 
         if not healthy_replicas:
             # No healthy replicas, fail all requests
@@ -334,7 +394,7 @@ def _update_service_metrics(self):
         """Updates service-level metrics."""
         self._metrics.total_sessions = len(self._active_sessions)
         self._metrics.total_replicas = len(self._replicas)
-        self._metrics.healthy_replicas = sum(1 for r in self._replicas if r.healthy)
+        self._metrics.healthy_replicas = len(self._get_healthy_replicas())
         # Store direct references to replica metrics for aggregation
         self._metrics.replica_metrics = {}
         for replica in self._replicas:
@@ -446,6 +506,10 @@ async def terminate_session(self, sess_id: str):
         # Update metrics
         self._update_service_metrics()
 
+    def _get_healthy_replicas(self) -> list[Replica]:
+        """Returns a list of healthy replicas."""
+        return [r for r in self._replicas if r.healthy]
+
     async def _health_loop(self, poll_rate_s: float):
         """Runs the health loop to monitor and recover replicas.
 
@@ -476,14 +540,24 @@ async def _health_loop(self, poll_rate_s: float):
 
     async def _get_replica(self, sess_id: str | None) -> "Replica":
         """Get a replica for the given session ID."""
-        healthy_replicas = [r for r in self._replicas if r.healthy]
-        if sess_id is None:
-            # No session, use the default router
-            return self._default_router.get_replica(healthy_replicas)
 
-        return self._session_router.get_replica(
-            healthy_replicas, sess_id, self._session_replica_map
-        )
+        if sess_id:
+            # Stateful routing always uses session router
+            healthy_replicas = self._get_healthy_replicas()
+            return self._session_router.get_replica(
+                healthy_replicas, sess_id, self._session_replica_map
+            )
+
+        # Stateless: batching
+        if self._max_batch_size > 1:
+            fut = asyncio.Future()
+            healthy_replicas = self._get_healthy_replicas()
+            self._batch_queue.put_nowait(fut)
+            return await fut
+        else:
+            # No batching, pick immediately
+            healthy_replicas = self._get_healthy_replicas()
+            return self._default_router.get_replica(healthy_replicas)
 
     async def stop(self):
         logger.debug("Stopping service...")
@@ -582,7 +656,7 @@ async def _get_internal_state(self) -> dict:
             # Load balancing state
             # Service-level state
             "total_replicas": len(self._replicas),
-            "healthy_replica_count": sum(1 for r in self._replicas if r.healthy),
+            "healthy_replica_count": len(self._get_healthy_replicas()),
             "shutdown_requested": self._shutdown_requested,
             # Metrics summary
             "total_sessions": len(self._active_sessions),
diff --git a/src/forge/types.py b/src/forge/types.py
@@ -118,6 +118,8 @@ class ServiceConfig:
     health_poll_rate: float = 0.2
     replica_max_concurrent_requests: int = 10
     return_first_rank_result: bool = True
+    max_batch_size: int = 1
+    batch_max_wait_s: float = 0.01
 
     def to_process_config(self) -> ProcessConfig:
         """Extract ProcessConfig from this ServiceConfig.
diff --git a/tests/unit_tests/test_service.py b/tests/unit_tests/test_service.py