Skip to content

Commit 4215028

Browse files
guozhihao-224 authored and claude committed
refactor(experimental): reuse HTTP clients, add response models, and parallelize ops in inference service
Replace per-request httpx/requests client creation with shared long-lived
clients across the inference service stack (controller, gateway, router,
data proxy, InfBridge). This eliminates repeated TCP connection setup and
TLS handshake overhead on every API call.

Key changes:
- Controller: shared httpx.Client/AsyncClient, idempotent destroy()
- Gateway: shared AsyncClient via lifespan, _use_client() helper in streaming
- Router: shared AsyncClient, parallel health checks via asyncio.gather
- Data proxy: Pydantic response models, shared client, parallel callbacks
- InfBridge: shared AsyncClient with proper aclose() lifecycle
- Parallelize: proxy registration, set_version, pause/continue broadcasts
- Add Pydantic BaseModel response types across all services for type safety

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent ae8c792 commit 4215028

8 files changed

Lines changed: 686 additions & 293 deletions

File tree

areal/experimental/inference_service/controller/controller.py

Lines changed: 112 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from threading import Lock
2424
from typing import TYPE_CHECKING, Any, cast
2525

26+
import httpx
2627
from openai.types.chat import ChatCompletion, ChatCompletionChunk
2728

2829
if TYPE_CHECKING:
@@ -168,6 +169,11 @@ def __init__(
168169
# Each entry: (guard_addr, role, worker_index) for /kill_forked_worker.
169170
self._forked_services: list[tuple[str, str, int]] = []
170171

172+
# Shared HTTP clients
173+
self._sync_client = httpx.Client(timeout=30.0)
174+
self._async_client = httpx.AsyncClient(timeout=config.request_timeout)
175+
self._destroyed = False
176+
171177
# Proxy compatibility (no-ops — gateway IS the proxy)
172178
self._proxy_started = False
173179
self.proxy_workers: list = []
@@ -259,8 +265,6 @@ async def _async_initialize(
259265
"""
260266
from dataclasses import asdict
261267

262-
import requests
263-
264268
from areal.api.cli_args import SchedulingSpec, SchedulingStrategy
265269
from areal.api.scheduler_api import Job
266270

@@ -431,10 +435,9 @@ def _build_launch_cmd(
431435
# Allocate rendezvous port on head node for distributed init
432436
dist_init_addr = None
433437
if nnodes_per_instance > 1:
434-
resp = requests.post(
438+
resp = self._sync_client.post(
435439
f"{head_guard_addr}/alloc_ports",
436440
json={"count": 1},
437-
timeout=30,
438441
)
439442
resp.raise_for_status()
440443
rendezvous_data = resp.json()
@@ -449,10 +452,9 @@ def _build_launch_cmd(
449452
guard_addr = f"http://{format_hostport(worker.ip, int(worker.worker_ports[0]))}"
450453

451454
# Allocate port for inference server on this node
452-
resp = requests.post(
455+
resp = self._sync_client.post(
453456
f"{guard_addr}/alloc_ports",
454457
json={"count": 1},
455-
timeout=30,
456458
)
457459
resp.raise_for_status()
458460
port_data = resp.json()
@@ -494,10 +496,9 @@ def _build_launch_cmd(
494496
"VLLM_ALLOW_RUNTIME_LORA_UPDATING": "True",
495497
}
496498

497-
resp = requests.post(
499+
resp = self._sync_client.post(
498500
f"{guard_addr}/fork",
499501
json=fork_payload,
500-
timeout=30,
501502
)
502503
resp.raise_for_status()
503504
self._forked_services.append(
@@ -659,41 +660,54 @@ def _wait_for_service(
659660
self, url: str, name: str, timeout: float | None = None
660661
) -> None:
661662
"""Wait for a service to become healthy."""
662-
import requests
663-
664663
timeout = timeout or self.config.setup_timeout
665664
deadline = time.monotonic() + timeout
666665
while time.monotonic() < deadline:
667666
try:
668-
resp = requests.get(url, timeout=2)
667+
resp = self._sync_client.get(url, timeout=2)
669668
if resp.status_code == 200:
670669
logger.info("%s is ready at %s", name, url)
671670
return
672-
except requests.RequestException:
671+
except httpx.HTTPError:
673672
pass
674673
time.sleep(0.1)
675674
raise TimeoutError(f"{name} did not become healthy at {url} within {timeout}s")
676675

677676
def _register_data_proxies_in_router(self) -> None:
678677
"""Register all data proxy workers in the router and store their worker IDs."""
679-
import requests
678+
if not self._data_proxy_addrs:
679+
return
680680

681-
for data_proxy_addr in self._data_proxy_addrs:
682-
resp = requests.post(
683-
f"{self._router_addr}/register",
684-
json={"worker_addr": data_proxy_addr},
685-
headers={"Authorization": f"Bearer {self.config.admin_api_key}"},
686-
timeout=5,
687-
)
681+
from concurrent.futures import ThreadPoolExecutor
682+
683+
admin_key = self.config.admin_api_key
684+
router_addr = self._router_addr
685+
686+
def _register_one(data_proxy_addr: str) -> tuple[str, str | None]:
687+
# Each thread gets its own httpx.Client because httpx.Client
688+
# is not thread-safe and must not be shared across threads.
689+
with httpx.Client() as client:
690+
resp = client.post(
691+
f"{router_addr}/register",
692+
json={"worker_addr": data_proxy_addr},
693+
headers={"Authorization": f"Bearer {admin_key}"},
694+
timeout=5,
695+
)
688696
resp.raise_for_status()
689697
worker_id = resp.json().get("worker_id")
690-
if worker_id:
691-
self._worker_ids[data_proxy_addr] = worker_id
692698
logger.info(
693699
"Registered data proxy %s in router (worker_id=%s)",
694700
data_proxy_addr,
695701
worker_id,
696702
)
703+
return data_proxy_addr, worker_id
704+
705+
with ThreadPoolExecutor(max_workers=len(self._data_proxy_addrs)) as pool:
706+
results = list(pool.map(_register_one, self._data_proxy_addrs))
707+
708+
for data_proxy_addr, worker_id in results:
709+
if worker_id:
710+
self._worker_ids[data_proxy_addr] = worker_id
697711

698712
def register_model(
699713
self,
@@ -702,11 +716,9 @@ def register_model(
702716
api_key: str | None = None,
703717
data_proxy_addrs: list[str] | None = None,
704718
) -> None:
705-
import requests
706-
707719
if data_proxy_addrs is None:
708720
data_proxy_addrs = self._data_proxy_addrs
709-
resp = requests.post(
721+
resp = self._sync_client.post(
710722
f"{self._gateway_addr}/register_model",
711723
json={
712724
"model": model,
@@ -872,6 +884,9 @@ async def _handle_online_ready_callback(
872884

873885
def destroy(self) -> None:
874886
"""Tear down all services and release resources."""
887+
if self._destroyed:
888+
return
889+
self._destroyed = True
875890
self._stop_online_callback_server()
876891

877892
# Destroy workflow executor
@@ -893,6 +908,16 @@ def destroy(self) -> None:
893908
)
894909
self._forked_services.clear()
895910

911+
# Close shared HTTP clients after all kill requests have been sent
912+
self._sync_client.close()
913+
try:
914+
from areal.infra.utils.concurrent import run_async_task
915+
916+
run_async_task(self._async_client.aclose)
917+
except Exception:
918+
# Best-effort cleanup on the destroy path.
919+
pass
920+
896921
# RPCGuard's shutdown `finally` block automatically kills all
897922
# forked children, so explicit teardown above is best-effort.
898923
# Delete all RPCGuard workers via scheduler
@@ -939,8 +964,20 @@ def set_version(self, version: int) -> None:
939964

940965
async def _async_set_version(self, version: int) -> None:
941966
payload = {"version": version}
942-
for wid in self._worker_ids.values():
943-
await self._async_gateway_http_post(f"/set_version/{wid}", payload)
967+
results = await asyncio.gather(
968+
*[
969+
self._async_gateway_http_post(f"/set_version/{wid}", payload)
970+
for wid in self._worker_ids.values()
971+
],
972+
return_exceptions=True,
973+
)
974+
failed = [r for r in results if isinstance(r, Exception)]
975+
for r in failed:
976+
logger.error("Failed to set version on a worker: %s", r)
977+
if failed and len(failed) == len(results):
978+
raise RuntimeError(
979+
f"set_version({version}) failed on ALL {len(failed)} workers"
980+
)
944981

945982
def get_version(self) -> int:
946983
"""Return the local version (compatible with VersionProvider protocol)."""
@@ -1196,8 +1233,9 @@ async def chat_completion(
11961233
return self._stream_chat_completion(url, body, headers)
11971234

11981235
# Non-streaming path
1199-
timeout = aiohttp.ClientTimeout(total=self.config.request_timeout)
1200-
async with aiohttp.ClientSession(timeout=timeout) as session:
1236+
async with aiohttp.ClientSession(
1237+
timeout=aiohttp.ClientTimeout(total=self.config.request_timeout)
1238+
) as session:
12011239
async with session.post(url, json=body, headers=headers) as resp:
12021240
if resp.status != 200:
12031241
text = await resp.text()
@@ -1217,8 +1255,9 @@ async def _stream_chat_completion(
12171255
"""Parse SSE stream from the gateway into ChatCompletionChunk objects."""
12181256
import aiohttp
12191257

1220-
timeout = aiohttp.ClientTimeout(total=self.config.request_timeout)
1221-
session = aiohttp.ClientSession(timeout=timeout)
1258+
session = aiohttp.ClientSession(
1259+
timeout=aiohttp.ClientTimeout(total=self.config.request_timeout)
1260+
)
12221261
try:
12231262
resp = await session.post(url, json=body, headers=headers)
12241263
if resp.status != 200:
@@ -1270,8 +1309,20 @@ async def pause_generation(self, worker_id: str | None = None) -> None:
12701309
if worker_id is not None:
12711310
await self._async_gateway_http_post(f"/pause_generation/{worker_id}", {})
12721311
else:
1273-
for wid in self._worker_ids.values():
1274-
await self._async_gateway_http_post(f"/pause_generation/{wid}", {})
1312+
results = await asyncio.gather(
1313+
*[
1314+
self._async_gateway_http_post(f"/pause_generation/{wid}", {})
1315+
for wid in self._worker_ids.values()
1316+
],
1317+
return_exceptions=True,
1318+
)
1319+
failed = [r for r in results if isinstance(r, Exception)]
1320+
for r in failed:
1321+
logger.error("Failed to pause generation on a worker: %s", r)
1322+
if failed and len(failed) == len(results):
1323+
raise RuntimeError(
1324+
f"pause_generation failed on ALL {len(failed)} workers"
1325+
)
12751326

12761327
async def continue_generation(self, worker_id: str | None = None) -> None:
12771328
"""Continue generation on a specific worker, or all workers if worker_id is None."""
@@ -1280,8 +1331,20 @@ async def continue_generation(self, worker_id: str | None = None) -> None:
12801331
if worker_id is not None:
12811332
await self._async_gateway_http_post(f"/continue_generation/{worker_id}", {})
12821333
else:
1283-
for wid in self._worker_ids.values():
1284-
await self._async_gateway_http_post(f"/continue_generation/{wid}", {})
1334+
results = await asyncio.gather(
1335+
*[
1336+
self._async_gateway_http_post(f"/continue_generation/{wid}", {})
1337+
for wid in self._worker_ids.values()
1338+
],
1339+
return_exceptions=True,
1340+
)
1341+
failed = [r for r in results if isinstance(r, Exception)]
1342+
for r in failed:
1343+
logger.error("Failed to continue generation on a worker: %s", r)
1344+
if failed and len(failed) == len(results):
1345+
raise RuntimeError(
1346+
f"continue_generation failed on ALL {len(failed)} workers"
1347+
)
12851348

12861349
# -- Stats -------------------------------------------------------------
12871350

@@ -1505,12 +1568,9 @@ def _fork_on_guard(
15051568
Returns ``(host, port)`` of the forked service and records the entry
15061569
in ``_forked_services`` for cleanup.
15071570
"""
1508-
import requests
1509-
1510-
resp = requests.post(
1571+
resp = self._sync_client.post(
15111572
f"{guard_addr}/alloc_ports",
15121573
json={"count": 1},
1513-
timeout=30,
15141574
)
15151575
resp.raise_for_status()
15161576
port_data = resp.json()
@@ -1519,14 +1579,13 @@ def _fork_on_guard(
15191579

15201580
cmd = list(raw_cmd) + ["--host", host, "--port", str(port)]
15211581

1522-
resp = requests.post(
1582+
resp = self._sync_client.post(
15231583
f"{guard_addr}/fork",
15241584
json={
15251585
"role": role,
15261586
"worker_index": worker_index,
15271587
"raw_cmd": cmd,
15281588
},
1529-
timeout=30,
15301589
)
15311590
resp.raise_for_status()
15321591

@@ -1540,10 +1599,8 @@ def _fork_on_guard(
15401599
def _kill_forked_service(
15411600
self, guard_addr: str, role: str, worker_index: int
15421601
) -> None:
1543-
import requests
1544-
15451602
try:
1546-
resp = requests.post(
1603+
resp = self._sync_client.post(
15471604
f"{guard_addr}/kill_forked_worker",
15481605
json={"role": role, "worker_index": worker_index},
15491606
timeout=10,
@@ -1557,7 +1614,7 @@ def _kill_forked_service(
15571614
worker_index,
15581615
resp.text,
15591616
)
1560-
except requests.RequestException as exc:
1617+
except httpx.HTTPError as exc:
15611618
logger.error(
15621619
"Error killing forked service %s/%d: %s", role, worker_index, exc
15631620
)
@@ -1571,11 +1628,9 @@ def _gateway_http_post(self, endpoint: str, payload: dict[str, Any]) -> None:
15711628
Raises ``RuntimeError`` on HTTP errors or connection failures so that
15721629
callers (e.g. ``pause()`` / ``resume()``) can detect and handle them.
15731630
"""
1574-
import requests
1575-
15761631
url = f"{self._gateway_addr}{endpoint}"
15771632
try:
1578-
resp = requests.post(
1633+
resp = self._sync_client.post(
15791634
url,
15801635
json=payload,
15811636
headers={"Authorization": f"Bearer {self.config.admin_api_key}"},
@@ -1585,7 +1640,7 @@ def _gateway_http_post(self, endpoint: str, payload: dict[str, Any]) -> None:
15851640
raise RuntimeError(
15861641
f"Gateway {endpoint} returned {resp.status_code}: {resp.text}"
15871642
)
1588-
except requests.RequestException as exc:
1643+
except httpx.HTTPError as exc:
15891644
raise RuntimeError(f"Failed to POST {endpoint}: {exc}") from exc
15901645

15911646
async def _async_gateway_http_post(
@@ -1597,19 +1652,16 @@ async def _async_gateway_http_post(
15971652
callers (e.g. ``pause_generation()`` / ``continue_generation()``) can
15981653
detect and handle them.
15991654
"""
1600-
import httpx
1601-
16021655
url = f"{self._gateway_addr}{endpoint}"
16031656
try:
1604-
async with httpx.AsyncClient(timeout=self.config.request_timeout) as client:
1605-
resp = await client.post(
1606-
url,
1607-
json=payload,
1608-
headers={"Authorization": f"Bearer {self.config.admin_api_key}"},
1657+
resp = await self._async_client.post(
1658+
url,
1659+
json=payload,
1660+
headers={"Authorization": f"Bearer {self.config.admin_api_key}"},
1661+
)
1662+
if resp.status_code >= 400:
1663+
raise RuntimeError(
1664+
f"Gateway {endpoint} returned {resp.status_code}: {resp.text}"
16091665
)
1610-
if resp.status_code >= 400:
1611-
raise RuntimeError(
1612-
f"Gateway {endpoint} returned {resp.status_code}: {resp.text}"
1613-
)
16141666
except httpx.HTTPError as exc:
16151667
raise RuntimeError(f"Failed to POST {endpoint}: {exc}") from exc

0 commit comments

Comments (0)