Commit b5c35dd

nixl_ep: Migrate elastic.py to TCPStore
This commit migrates elastic.py to TCPStore-based metadata exchange instead of ETCD and replaces the custom TCP rank server with a Torch TCPStore group-based implementation, reusing the same store group for both metadata and rank management.

I considered defining an abstract base class to support both implementations (the custom TCP server and TCPStore), but since get_rank is off the data path and already very fast (see the performance table below), maintaining both options does not seem necessary. Reviewer feedback is welcome.

Performance comparison, 16 concurrent get_rank calls on 2 nodes x 8 GPUs:

| Scope  | TCPStore Avg (ms) | TCP Avg (ms) | TCPStore StdDev (ms) | TCP StdDev (ms) |
|--------|-------------------|--------------|----------------------|-----------------|
| Local  | 0.28              | 1.55         | 0.04                 | 0.20            |
| Remote | 2.95              | 1.48         | 2.05                 | 0.17            |

Signed-off-by: Itay Alroy <ialroy@nvidia.com>
1 parent c3139b4 commit b5c35dd
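The store_group module referenced throughout this commit is not shown in the diff below. As a rough sketch of the coordination setup it provides, assuming its helpers are thin wrappers around torch.distributed.TCPStore with signatures inferred from how elastic.py calls them:

```python
# Hypothetical sketch of the store_group helpers (not shown in this commit view);
# assumes they are thin wrappers around torch.distributed.TCPStore.
from datetime import timedelta

import torch.distributed as dist


def create_master_store(port: int, timeout_sec: int = 300) -> dist.TCPStore:
    # Hosts the store; no fixed world size, so an elastic number of clients
    # can attach at any time, and we do not block waiting for them.
    return dist.TCPStore(
        "0.0.0.0",
        port,
        is_master=True,
        timeout=timedelta(seconds=timeout_sec),
        wait_for_workers=False,
    )


def create_client_store(master_addr: str, port: int) -> dist.TCPStore:
    # Connects to an already-running master store on master_addr:port.
    return dist.TCPStore(master_addr, port, is_master=False)
```

Because the master store does not wait for a fixed set of workers, ranks added in later test phases can join the same group, which is what the elastic test relies on.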

File tree

6 files changed: +241 -189 lines changed

examples/device/ep/README.md

Lines changed: 5 additions & 1 deletion
@@ -15,9 +15,13 @@ NIXL EP provides a flexible buffer initialization pattern that supports dynamic
 
 ```python
 import nixl_ep
+import store_group
+
+# Create TCPStore for coordination
+tcp_store = store_group.create_client_store(master_addr, port)
 
 # Initialize buffer with dynamic rank support
-buffer = nixl_ep.Buffer(rank, explicitly_destroy=True)
+buffer = nixl_ep.Buffer(rank, explicitly_destroy=True, tcp_store_group=tcp_store)
 buffer.update_memory_buffers(num_ranks, num_experts_per_rank, rdma_bytes)
 buffer.connect_ranks(initial_ranks)
 
examples/device/ep/tests/elastic/README.md

Lines changed: 3 additions & 5 deletions
@@ -4,8 +4,7 @@
 ```bash
 python3 tests/elastic/elastic.py \
     --plan tests/elastic/single_expansion.json \
-    --num-processes 8 \
-    --etcd-server http://127.0.0.1:2379
+    --num-processes 8
 ```
 
 #### Multi-Node Setup:

@@ -14,16 +13,15 @@ python3 tests/elastic/elastic.py \
 ```bash
 python3 tests/elastic/elastic.py \
     --plan tests/elastic/single_expansion.json \
-    --num-processes 4 \
+    --num-processes 4
 ```
 
 **Node 2** (will join the second phase with additional 4 ranks):
 ```bash
 python3 tests/elastic/elastic.py \
     --plan tests/elastic/single_expansion.json \
     --num-processes 4 \
-    --rank-server $MASTER_IP \
-    --etcd-server http://$MASTER_IP:2379
+    --tcp-store $MASTER_IP
 ```
 
 ### Available Test Plans

examples/device/ep/tests/elastic/elastic.py

Lines changed: 29 additions & 25 deletions
@@ -29,7 +29,8 @@
 from typing import cast
 
 import nixl_ep
-import rank_server
+import rank_manager
+import store_group
 import torch
 from plan import Plan
 

@@ -50,7 +51,7 @@ def handle_sigterm(
     frame,
     buffer: nixl_ep.Buffer,
     plan: Plan,
-    rank_client: rank_server.RankClient,
+    rank_client: rank_manager.RankManager,
 ):
     print(
         f"SIGTERM ({signum}) received for process {os.getpid()}! releasing rank and exiting...",

@@ -438,14 +439,22 @@ def test_barrier():
 
 
 def worker(torch_rank: int, args: argparse.Namespace):
-    rank_client = rank_server.RankClient(
-        args.rank_server if args.rank_server else "127.0.0.1"
+    tcp_store = store_group.create_client_store(
+        master_addr=args.tcp_store if args.tcp_store else "127.0.0.1",
+        port=args.tcp_store_port,
     )
+
+    rank_client = rank_manager.RankManager(tcp_store)
     local_rank, global_rank, last_active_phase = rank_client.get_rank()
+    print(
+        f"Process {torch_rank} -> global_rank={global_rank}, local_rank={local_rank}",
+        flush=True,
+    )
+
     plan = Plan(
         args.plan,
         global_rank,
-        start_phase=last_active_phase if last_active_phase is not None else 0,
+        start_phase=int(last_active_phase) if last_active_phase is not None else 0,
     )
     if plan.current_phase == -1:
         print(

@@ -455,10 +464,6 @@ def worker(torch_rank: int, args: argparse.Namespace):
         return
 
     max_num_ranks = plan.get_max_rank() + 1
-    print(
-        f"Process {torch_rank} -> global_rank={global_rank}, local_rank={local_rank}",
-        flush=True,
-    )
 
     # Initialize torch
     os.environ["CUDA_VISIBLE_DEVICES"] = str(local_rank % 8)

@@ -480,9 +485,6 @@
     tcp_nics = ",ibp154s0,ibp192s0,ibp206s0,ibp220s0,ibp94s0"
     os.environ["UCX_NET_DEVICES"] = f"cuda0-{pxb_nics[local_rank]}:1" + tcp_nics
 
-    # Initialize NIXL
-    os.environ["NIXL_ETCD_ENDPOINTS"] = args.etcd_server
-
     # Initialize nixl_ep buffer
     num_rdma_bytes = nixl_ep.Buffer.get_rdma_size_hint(
         args.num_tokens,

@@ -498,6 +500,7 @@
         nvlink_backend=args.nvlink_backend,
         explicitly_destroy=True,
         enable_shrink=True,
+        tcp_store_group=tcp_store,
     )
     buffer.update_memory_buffers(
         num_ranks=max_num_ranks,

@@ -616,15 +619,15 @@
     parser.add_argument("--hidden-dim", type=int, default=7168, help="Hidden dimension")
     parser.add_argument("--num-topk", type=int, default=8, help="Number of topk")
     parser.add_argument(
-        "--etcd-server",
+        "--tcp-store",
         type=str,
-        default="http://127.0.0.1:2379",
-        help="ETCD server address for NIXL (default: http://127.0.0.1:2379)",
+        help="External TCPStore address. If not set, a local TCPStore master will be created.",
     )
     parser.add_argument(
-        "--rank-server",
-        type=str,
-        help="Rank server address. If not set, a rank server will be started locally and will be killed after all the workers launched in this run are finished.",
+        "--tcp-store-port",
+        type=int,
+        default=9999,
+        help="TCPStore port (default: 9999)",
     )
     parser.add_argument("--kineto", action="store_true", help="Enable kineto profiling")
     parser.add_argument(

@@ -636,14 +639,15 @@
 
     args = parser.parse_args()
 
-    rank_server_process = None
-    if not args.rank_server:
-        print("Starting rank server locally", flush=True)
-        rank_server_process = torch.multiprocessing.Process(
-            target=rank_server.start_server, daemon=True
+    # Create TCPStore master if no external TCPStore server is specified
+    master_store = None
+    if not args.tcp_store:
+        master_store = store_group.create_master_store(
+            port=args.tcp_store_port,
+            timeout_sec=365 * 24 * 3600,  # 1 year timeout
         )
-        rank_server_process.start()
-        time.sleep(0.5)
+        rank_manager.init_keys(master_store)
+
     if args.num_processes == 1:
         worker(0, args)
Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
import json
17+
import os
18+
import time
19+
from contextlib import contextmanager
20+
from typing import Iterator, Optional, Tuple
21+
22+
import torch.distributed as dist
23+
24+
_KEY_NEXT_GLOBAL_RANK = "rank_manager/next_global_rank"
25+
_KEY_RELEASED_RANKS = "rank_manager/released_ranks"
26+
_KEY_LOCK = "rank_manager/lock"
27+
28+
29+
def _host_local_ranks_key(hostname: str) -> str:
30+
return f"rank_manager/host/{hostname}/local_ranks"
31+
32+
33+
def _rank_context_key(global_rank: int) -> str:
34+
return f"rank_manager/rank/{global_rank}/context"
35+
36+
37+
def _rank_hostname_key(global_rank: int) -> str:
38+
return f"rank_manager/rank/{global_rank}/hostname"
39+
40+
41+
def _rank_local_key(global_rank: int) -> str:
42+
return f"rank_manager/rank/{global_rank}/local_rank"
43+
44+
45+
def init_keys(store: dist.TCPStore) -> None:
46+
store.add(_KEY_NEXT_GLOBAL_RANK, 0)
47+
store.set(_KEY_RELEASED_RANKS, "[]")
48+
store.set(_KEY_LOCK, "0")
49+
50+
51+
class RankManager:
52+
53+
def __init__(self, store: dist.TCPStore):
54+
self._global_rank: Optional[int] = None
55+
self._hostname = os.uname().nodename
56+
self._store = store
57+
58+
@contextmanager
59+
def _lock(self) -> Iterator[None]:
60+
my_id = f"{self._hostname}_{os.getpid()}"
61+
while True:
62+
result = self._store.compare_set(_KEY_LOCK, "0", my_id)
63+
if result.decode() == my_id:
64+
break
65+
if result == b"0":
66+
continue
67+
time.sleep(0.0001)
68+
try:
69+
yield
70+
finally:
71+
self._store.set(_KEY_LOCK, "0")
72+
73+
def _get_json_list(self, key: str) -> list:
74+
return (
75+
json.loads(self._store.get(key).decode())
76+
if self._store.check([key])
77+
else []
78+
)
79+
80+
def _set_json_list(self, key: str, value: list):
81+
self._store.set(key, json.dumps(value))
82+
83+
def get_rank(self) -> Tuple[int, int, Optional[str]]:
84+
"""Returns (local_rank, global_rank, user_context)."""
85+
if self._global_rank is not None:
86+
print(
87+
f"WARNING: rank already assigned - returning existing rank {self._global_rank}",
88+
flush=True,
89+
)
90+
return 0, self._global_rank, None
91+
92+
start = time.perf_counter()
93+
user_context: Optional[str] = None
94+
95+
with self._lock():
96+
released = self._get_json_list(_KEY_RELEASED_RANKS)
97+
98+
if released:
99+
global_rank = min(released)
100+
released.remove(global_rank)
101+
self._set_json_list(_KEY_RELEASED_RANKS, released)
102+
ctx_data = self._store.get(_rank_context_key(global_rank)).decode()
103+
if ctx_data and ctx_data != "None":
104+
user_context = ctx_data
105+
self._store.delete_key(_rank_context_key(global_rank))
106+
else:
107+
global_rank = self._store.add(_KEY_NEXT_GLOBAL_RANK, 1) - 1
108+
109+
local_ranks_key = _host_local_ranks_key(self._hostname)
110+
used_local_ranks = set(self._get_json_list(local_ranks_key))
111+
local_rank = 0
112+
while local_rank in used_local_ranks:
113+
local_rank += 1
114+
used_local_ranks.add(local_rank)
115+
self._store.multi_set(
116+
[
117+
local_ranks_key,
118+
_rank_hostname_key(global_rank),
119+
_rank_local_key(global_rank),
120+
],
121+
[json.dumps(list(used_local_ranks)), self._hostname, str(local_rank)],
122+
)
123+
124+
self._global_rank = global_rank
125+
elapsed_ms = (time.perf_counter() - start) * 1000
126+
print(f"[rank_manager] get_rank took {elapsed_ms:.2f} ms", flush=True)
127+
return local_rank, global_rank, user_context
128+
129+
def release_rank(self, user_context: Optional[str] = None) -> bool:
130+
if self._global_rank is None:
131+
return False
132+
133+
global_rank = self._global_rank
134+
135+
with self._lock():
136+
hostname_key = _rank_hostname_key(global_rank)
137+
local_key = _rank_local_key(global_rank)
138+
values = self._store.multi_get([hostname_key, local_key])
139+
hostname = values[0].decode()
140+
local_rank = int(values[1].decode())
141+
142+
local_ranks_key = _host_local_ranks_key(hostname)
143+
used_local_ranks = self._get_json_list(local_ranks_key)
144+
if local_rank in used_local_ranks:
145+
used_local_ranks.remove(local_rank)
146+
self._set_json_list(local_ranks_key, used_local_ranks)
147+
148+
if user_context is not None:
149+
self._store.set(_rank_context_key(global_rank), str(user_context))
150+
151+
self._store.delete_key(hostname_key)
152+
self._store.delete_key(local_key)
153+
154+
released = self._get_json_list(_KEY_RELEASED_RANKS)
155+
released.append(global_rank)
156+
self._set_json_list(_KEY_RELEASED_RANKS, released)
157+
158+
self._global_rank = None
159+
return True
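For reference, a rough end-to-end usage sketch of this module: the store_group helper signatures are inferred from how elastic.py calls them, and the plan/buffer setup is elided.

```python
import rank_manager
import store_group

# Launcher side: host the TCPStore master and seed the rank-manager keys,
# mirroring what main() in elastic.py does when --tcp-store is not given.
master_store = store_group.create_master_store(port=9999, timeout_sec=3600)
rank_manager.init_keys(master_store)

# Worker side (any node): connect as a client and claim the lowest free rank.
tcp_store = store_group.create_client_store(master_addr="127.0.0.1", port=9999)
manager = rank_manager.RankManager(tcp_store)
local_rank, global_rank, last_active_phase = manager.get_rank()

# ... build nixl_ep.Buffer(..., tcp_store_group=tcp_store) and run the plan ...

# On shutdown (e.g. from the SIGTERM handler), return the rank to the released
# pool; the user_context stored here is handed back to whichever worker later
# reuses this global rank.
manager.release_rank(user_context="2")  # "2" is an illustrative phase index
```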
