Skip to content

Commit 15b19e0

Browse files
Merge branch 'main' into rust_dl_apis_serialization
2 parents fc423b1 + 8879d65 commit 15b19e0

40 files changed

+5264
-9
lines changed

.github/workflows/clang-format.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ jobs:
1818
FILE_PATTERN='\.(cpp|h|cc|c|cxx|hpp|cu|cuh)$'
1919
2020
echo "### Modified C/C++ files:"
21-
FILES=$(git diff --name-only HEAD^1 HEAD | grep -E "$FILE_PATTERN") || true
21+
FILES=$(git diff --name-only HEAD^1 HEAD -- . ':(exclude)examples/device/ep' | grep -E "$FILE_PATTERN") || true
2222
[ -z "$FILES" ] && echo "(none)" || echo "$FILES"
2323
2424
echo "### clang format errors:"

.github/workflows/copyright-check.sh

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ for f in $(git ls-files); do
2121
*.png|*.jpg|*.jpeg|*.gif|*.ico|*.zip|*.rst|*.pyc|*.lock|*.md|*.svg|*.wrap|*.in|*.json|*.template|*.gitignore|*.python-version|*py.typed)
2222
continue
2323
;;
24-
CODEOWNERS|LICENSE|Doxyfile|.clang-format|.clang-tidy|.codespellrc)
24+
CODEOWNERS|*LICENSE*|Doxyfile|.clang-format|.clang-tidy|.codespellrc)
2525
continue
2626
;;
2727
esac
@@ -39,7 +39,7 @@ for f in $(git ls-files); do
3939

4040
# Extract copyright years (handles YYYY or YYYY-YYYY)
4141
copyright_years=$(echo "$header" | \
42-
grep -Eo 'Copyright \(c\) [0-9]{4}(-[0-9]{4})?' | \
42+
grep NVIDIA | grep -Eo 'Copyright \(c\) [0-9]{4}(-[0-9]{4})?' | \
4343
sed -E 's/.* ([0-9]{4})(-[0-9]{4})?/\1\2/' || true)
4444

4545
if [[ -z "$copyright_years" ]]; then
@@ -57,7 +57,7 @@ for f in $(git ls-files); do
5757
fi
5858

5959
# License line must exist
60-
if ! echo "$header" | grep -Eq '^[[:space:]]*(#|//|\*|/\*|<!--)[[:space:]]*SPDX-License-Identifier:[[:space:]]*Apache-2\.0'; then
60+
if ! echo "$header" | grep -Eq '^[[:space:]]*(#|//|\*|/\*|<!--)[[:space:]]*SPDX-License-Identifier:.*Apache-2\.0'; then
6161
failures+=("$f (missing license)")
6262
continue
6363
fi

.pre-commit-config.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ repos:
3232
hooks:
3333
- id: black
3434
types_or: [python, cython]
35+
exclude: ^examples/device/ep/
3536

3637
- repo: https://github.com/PyCQA/flake8
3738
rev: 7.1.2

CONTRIBUTING.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,10 @@ All code must adhere to the style guide in `docs/CodeStyle.md` and be formatted
167167

168168
## Contributing Process
169169

170+
Contributions to the code under `./examples/device/ep` (which is derived
171+
from DeepEP and licensed under the MIT License) must be licensed under Apache
172+
2.0.
173+
170174
Contributions that fix documentation errors or that make small changes
171175
to existing code can be contributed directly by following the rules
172176
below and submitting an appropriate PR.

LICENSE

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,12 @@
1+
NOTICE: The code under ./examples/device/ep is derived from DeepEP
2+
(originally developed by DeepSeek) and has been modified for use in this
3+
project. The original code was obtained from:
4+
https://github.com/deepseek-ai/DeepEP (commit f0d34aabcb7bdcb3a05d022e7d11b3bf4ccf8ee8)
5+
This code is licensed under the MIT License. The full text of the MIT
6+
License can be found in ./examples/device/ep/LICENSE-DeepEP.
7+
The rest of this codebase is licensed under the Apache License 2.0 as
8+
described below.
9+
110
Apache License
211
Version 2.0, January 2004
312
http://www.apache.org/licenses/

examples/device/ep/LICENSE-DeepEP

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
MIT License
2+
3+
Copyright (c) 2025 DeepSeek
4+
5+
Permission is hereby granted, free of charge, to any person obtaining a copy
6+
of this software and associated documentation files (the "Software"), to deal
7+
in the Software without restriction, including without limitation the rights
8+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
copies of the Software, and to permit persons to whom the Software is
10+
furnished to do so, subject to the following conditions:
11+
12+
The above copyright notice and this permission notice shall be included in all
13+
copies or substantial portions of the Software.
14+
15+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
SOFTWARE.

examples/device/ep/README.md

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
# NIXL EP: Expert-Parallel Communication Example
2+
3+
## Overview
4+
5+
NIXL EP is a complete example implementation of expert-parallel communication for Mixture of Experts (MoE) models built on top of [NIXL](https://github.com/ai-dynamo/nixl)'s device API. It provides elastic scaling capabilities, enabling dynamic addition and removal of processes (ranks) during runtime without disrupting existing connections, and leverages NIXL's RDMA and NVLink support for optimal performance.
6+
7+
## Features
8+
- **Dispatch and Combine support**: Supports dispatch and combine operations for MoE inference
9+
- **RDMA and NVLink support**: Utilizes NIXL's abstractions to support both RDMA and NVLink transports for optimal performance
10+
- **Elastic Scaling**: Dynamically add or remove ranks during runtime
11+
12+
## Buffer Initialization
13+
14+
NIXL EP provides a flexible buffer initialization pattern that supports dynamic rank management:
15+
16+
```python
17+
import nixl_ep
18+
19+
# Initialize buffer with dynamic rank support
20+
buffer = nixl_ep.Buffer(rank, explicitly_destroy=True)
21+
buffer.update_memory_buffers(num_ranks, num_experts_per_rank, rdma_bytes)
22+
buffer.connect_ranks(initial_ranks)
23+
24+
# Dispatch & Combine calls
25+
buffer.dispatch(...)
26+
buffer.combine(...)
27+
28+
# Later: Connect new ranks dynamically
29+
buffer.connect_ranks(ranks)
30+
31+
# Dispatch & Combine calls
32+
buffer.dispatch(...)
33+
buffer.combine(...)
34+
35+
# Disconnect ranks when scaling down
36+
buffer.disconnect_ranks(ranks)
37+
```
38+
39+
## Key APIs
40+
41+
- `Buffer(rank_id, nvlink_backend, explicitly_destroy)`: Initialize the NIXL communication buffer
42+
- `update_memory_buffers(num_ranks, num_experts_per_rank, num_rdma_bytes)`: Prepare buffers for up to `num_ranks` ranks and `num_experts_per_rank` experts
43+
- `connect_ranks(remote_ranks)`: Establish NIXL connections to new peers (can be called multiple times)
44+
- `disconnect_ranks(remote_ranks)`: Clean up connections to departing peers
45+
46+
## Testing
47+
48+
The elastic test suite in `tests/elastic/` validates dynamic scaling capabilities:
49+
- Plan files define scaling phases (representing an orchestrator)
50+
- Tests validate correctness and measure bandwidth between scaling phases
51+
52+
**Example Plan** (`expansion_contraction.json`):
53+
```json
54+
[
55+
[0, 1, 2, 3],
56+
[0, 1, 2, 3, 4, 5, 6, 7],
57+
[0, 1, 2, 3, 4, 5]
58+
]
59+
```
60+
This plan defines three phases:
61+
- **Phase 0**: Initial state with ranks 0-3
62+
- **Phase 1**: Ranks 4-7 are added dynamically (launched independently from initial ranks)
63+
- **Phase 2**: Ranks 6-7 are removed dynamically
64+
65+
## Getting Started
66+
67+
#### Build NIXL with NIXL EP:
68+
69+
First, configure the pkg-config paths (only needed when dependencies are installed to non-default paths)
70+
71+
```bash
72+
export PKG_CONFIG_PATH=<path to rdma-core install>/lib/pkgconfig:$PKG_CONFIG_PATH
73+
export PKG_CONFIG_PATH=<path to UCX install>/lib/pkgconfig:$PKG_CONFIG_PATH
74+
export PKG_CONFIG_PATH=<path to DOCA install>/lib/x86_64-linux-gnu/pkgconfig:$PKG_CONFIG_PATH
75+
```
76+
77+
Then, configure the NIXL plugin directory so NIXL can find the UCX plugin, and set `LD_LIBRARY_PATH` so UCX can find rdma-core:
78+
```bash
79+
export NIXL_PLUGIN_DIR=<path to NIXL install directory>/lib/x86_64-linux-gnu/plugins
80+
export LD_LIBRARY_PATH=<path to rdma-core install>/lib:$LD_LIBRARY_PATH
81+
```
82+
83+
Build and install:
84+
85+
```bash
86+
meson setup build \
87+
-Ducx_path=<path to UCX install> \
88+
-Dprefix=<path to NIXL install directory> \
89+
-Dbuildtype=release \
90+
-Dbuild_nixl_ep=true
91+
92+
cd build
93+
ninja install
94+
```
95+
96+
97+
Finally, configure PYTHONPATH to use NIXL EP:
98+
```bash
99+
export PYTHONPATH=<path to NIXL build directory>/examples/device/ep
100+
```
101+
102+
Refer to [tests/elastic/README.md](tests/elastic/README.md) for detailed instructions on how to run the elastic test suite.

examples/device/ep/csrc/config.hpp

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
/*
2+
* SPDX-FileCopyrightText: Copyright (c) 2025 DeepSeek
3+
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4+
*
5+
* This file incorporates material from the DeepSeek project, licensed under the MIT License.
6+
* The modifications made by NVIDIA are licensed under the Apache License, Version 2.0.
7+
*
8+
* SPDX-License-Identifier: MIT AND Apache-2.0
9+
*
10+
* Licensed under the Apache License, Version 2.0 (the "License");
11+
* you may not use this file except in compliance with the License.
12+
* You may obtain a copy of the License at
13+
*
14+
* http://www.apache.org/licenses/LICENSE-2.0
15+
*
16+
* Unless required by applicable law or agreed to in writing, software
17+
* distributed under the License is distributed on an "AS IS" BASIS,
18+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19+
* See the License for the specific language governing permissions and
20+
* limitations under the License.
21+
*/
22+
23+
#pragma once
24+
25+
#include "kernels/api.cuh"
26+
#include "kernels/exception.cuh"
27+
28+
namespace nixl_ep {
29+
30+
// Integer ceiling division: smallest q such that q * b >= a.
// Preconditions: a >= 0 and b > 0; `a + b - 1` must not overflow dtype_t.
// `constexpr` so layout sizes can be computed at compile time.
template <typename dtype_t>
constexpr dtype_t ceil_div(dtype_t a, dtype_t b) {
    return (a + b - 1) / b;
}
34+
35+
template <typename dtype_t>
36+
dtype_t align(dtype_t a, dtype_t b) {
37+
return ceil_div<dtype_t>(a, b) * b;
38+
}
39+
40+
// Pointers into one half of the double-buffered RDMA region used for a
// dispatch/combine round. Instances are filled by EPLayout via positional
// aggregate initialization, so the declaration order of the fields below
// is load-bearing — do not reorder.
struct EPBuffer {
    // Number of ints in the signaling region to zero between rounds.
    int num_clean_int = 0;

    void* dispatch_rdma_send_buffer = nullptr;
    void* dispatch_rdma_recv_data_buffer = nullptr;
    int* dispatch_rdma_recv_count_buffer = nullptr;

    void* combine_rdma_send_buffer = nullptr;
    void* combine_rdma_recv_data_buffer = nullptr;
    int* combine_rdma_recv_flag_buffer = nullptr;

    void* combine_rdma_send_buffer_data_start = nullptr;
    size_t num_bytes_per_combine_msg = 0;

    // Returns the (pointer, length-in-ints) pair describing the signaling
    // region to clean. Dispatch recv-counts and combine recv-flags are laid
    // out on the same storage by EPLayout; the assert checks that invariant.
    std::pair<int*, int> clean_meta() {
        EP_HOST_ASSERT(dispatch_rdma_recv_count_buffer == combine_rdma_recv_flag_buffer);
        return {dispatch_rdma_recv_count_buffer, num_clean_int};
    }
};
59+
60+
// Carves the flat `rdma_buffer` into the regions needed by dispatch and
// combine: two symmetric (odd/even phase) send buffers, two receive
// buffers, and two 128-byte-aligned signaling buffers. `total_bytes` is
// accumulated during construction and `buffers[i]` holds the pointers for
// phase i.
struct EPLayout {
    size_t total_bytes = 0;
    EPBuffer buffers[2];

    // Byte-offset pointer helper: returns `ptr` advanced by `count` bytes,
    // cast to `out_ptr_t`.
    template <typename out_ptr_t = void*, typename count_ptr_t = uint8_t*, typename in_ptr_t = void*>
    out_ptr_t advance(const in_ptr_t& ptr, size_t count) {
        return reinterpret_cast<out_ptr_t>(reinterpret_cast<count_ptr_t>(ptr) + count);
    }

    // Builds the layout on top of `rdma_buffer`. Passing nullptr is valid
    // for sizing-only use (get_rdma_size_hint only reads `total_bytes`).
    // NOTE(review): `num_ranks` is not used in the computation below —
    // presumably kept for interface symmetry; confirm before removing.
    EPLayout(void* rdma_buffer, int num_max_dispatch_tokens_per_rank, int hidden, int num_ranks, int num_experts) {
        // One scale slot per 128 hidden channels.
        const int num_scales = hidden / 128;

        // Dispatch and combine layout:
        // - 2 symmetric odd/even send buffer
        // - 2 symmetric odd/even receive buffers
        // - 2 symmetric odd/even signaling buffers

        // Message sizes
        // NOTES: you should add a control `int4` for combine messages if you want to do data transformation
        // NOTES: `num_scales * sizeof(nv_bfloat162)` means the per-128-channel min/max
        EP_HOST_ASSERT(num_scales * sizeof(float) <= hidden);
        size_t num_bytes_per_dispatch_msg = sizeof(int4) + std::max(hidden * sizeof(nv_bfloat16), hidden + num_scales * sizeof(float));
        size_t num_bytes_per_combine_msg = num_scales * sizeof(nv_bfloat162) + hidden * sizeof(nv_bfloat16);

        // Send buffer: sized for the larger of the dispatch/combine needs,
        // duplicated for the two phases.
        size_t dispatch_send_buffer_bytes = num_max_dispatch_tokens_per_rank * num_bytes_per_dispatch_msg;
        size_t combine_send_buffer_bytes = num_experts * num_max_dispatch_tokens_per_rank * num_bytes_per_combine_msg;
        size_t send_buffer_bytes = std::max(dispatch_send_buffer_bytes, combine_send_buffer_bytes);
        EP_HOST_ASSERT(send_buffer_bytes % sizeof(int4) == 0);
        total_bytes += send_buffer_bytes * 2;

        // Symmetric receive buffers
        // TODO: optimize memory usages
        size_t dispatch_recv_data_buffer_bytes = num_experts * num_max_dispatch_tokens_per_rank * num_bytes_per_dispatch_msg;
        size_t combine_recv_buffer_bytes = num_experts * num_max_dispatch_tokens_per_rank * num_bytes_per_combine_msg;
        size_t recv_buffer_bytes = std::max(dispatch_recv_data_buffer_bytes, combine_recv_buffer_bytes);
        EP_HOST_ASSERT(recv_buffer_bytes % sizeof(int4) == 0);
        total_bytes += recv_buffer_bytes * 2;

        // Symmetric signaling buffers, padded to 128-byte alignment.
        size_t dispatch_recv_count_buffer_bytes = num_experts * sizeof(int);
        size_t combine_recv_flag_buffer_bytes = dispatch_recv_count_buffer_bytes;
        size_t signaling_buffer_bytes = std::max(dispatch_recv_count_buffer_bytes, combine_recv_flag_buffer_bytes);
        size_t signaling_buffer_bytes_aligned = align<size_t>(signaling_buffer_bytes, 128);
        total_bytes += signaling_buffer_bytes_aligned * 2;

        // Assign pointers. Region order within `rdma_buffer`:
        //   [signaling x2][send x2][recv x2]
        // NOTES: we still leave some space for distinguishing dispatch/combine buffer,
        // so you may see some parameters are duplicated
        for (int i = 0; i < 2; ++ i) {
            // Positional aggregate init — must match EPBuffer field order.
            buffers[i] = {
                static_cast<int>(signaling_buffer_bytes / sizeof(int)),                                                   // num_clean_int
                advance(rdma_buffer, signaling_buffer_bytes_aligned * 2 + send_buffer_bytes * i),                         // dispatch_rdma_send_buffer
                advance(rdma_buffer, signaling_buffer_bytes_aligned * 2 + send_buffer_bytes * 2 + recv_buffer_bytes * i), // dispatch_rdma_recv_data_buffer
                advance<int*>(rdma_buffer, signaling_buffer_bytes_aligned * i),                                           // dispatch_rdma_recv_count_buffer
                advance(rdma_buffer, signaling_buffer_bytes_aligned * 2 + send_buffer_bytes * i),                         // combine_rdma_send_buffer (shared with dispatch)
                advance(rdma_buffer, signaling_buffer_bytes_aligned * 2 + send_buffer_bytes * 2 + recv_buffer_bytes * i), // combine_rdma_recv_data_buffer (shared with dispatch)
                advance<int*>(rdma_buffer, signaling_buffer_bytes_aligned * i),                                           // combine_rdma_recv_flag_buffer (same storage as counts)
                advance(rdma_buffer, signaling_buffer_bytes_aligned * 2 + send_buffer_bytes * i),                         // combine_rdma_send_buffer_data_start
                num_bytes_per_combine_msg
            };
        }
    }
};
124+
125+
// Returns the number of bytes the RDMA buffer must hold for the given
// configuration, rounded up to a multiple of NUM_BUFFER_ALIGNMENT_BYTES.
// NOTE(review): the rounding has no `- 1`, so a full extra alignment block
// is added even when `num_bytes` is already aligned — presumably deliberate
// headroom inherited from DeepEP; confirm before "fixing".
size_t get_rdma_size_hint(int num_max_dispatch_tokens_per_rank, int hidden, int num_ranks, int num_experts) {
    auto num_bytes = EPLayout(nullptr, num_max_dispatch_tokens_per_rank, hidden, num_ranks, num_experts).total_bytes;
    return ((num_bytes + NUM_BUFFER_ALIGNMENT_BYTES) / NUM_BUFFER_ALIGNMENT_BYTES) * NUM_BUFFER_ALIGNMENT_BYTES;
}
129+
130+
} // namespace nixl_ep

examples/device/ep/csrc/event.hpp

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
/*
2+
* SPDX-FileCopyrightText: Copyright (c) 2025 DeepSeek
3+
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4+
*
5+
* This file incorporates material from the DeepSeek project, licensed under the MIT License.
6+
* The modifications made by NVIDIA are licensed under the Apache License, Version 2.0.
7+
*
8+
* SPDX-License-Identifier: MIT AND Apache-2.0
9+
*
10+
* Licensed under the Apache License, Version 2.0 (the "License");
11+
* you may not use this file except in compliance with the License.
12+
* You may obtain a copy of the License at
13+
*
14+
* http://www.apache.org/licenses/LICENSE-2.0
15+
*
16+
* Unless required by applicable law or agreed to in writing, software
17+
* distributed under the License is distributed on an "AS IS" BASIS,
18+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19+
* See the License for the specific language governing permissions and
20+
* limitations under the License.
21+
*/
22+
23+
#include <ATen/cuda/CUDAContext.h>
24+
#include <memory>
25+
26+
#include "kernels/exception.cuh"
27+
28+
namespace nixl_ep {
29+
30+
struct EventHandle {
31+
std::shared_ptr<torch::Event> event;
32+
33+
EventHandle() {
34+
event = std::make_shared<torch::Event>(torch::kCUDA);
35+
event->record(at::cuda::getCurrentCUDAStream());
36+
}
37+
38+
explicit EventHandle(const at::cuda::CUDAStream& stream) {
39+
event = std::make_shared<torch::Event>(torch::kCUDA);
40+
event->record(stream);
41+
}
42+
43+
EventHandle(const EventHandle& other) = default;
44+
45+
void current_stream_wait() const {
46+
at::cuda::getCurrentCUDAStream().unwrap().wait(*event);
47+
}
48+
};
49+
50+
// Builds a CUDA event and records it on stream `s`.
torch::Event create_event(const at::cuda::CUDAStream &s) {
    torch::Event recorded(torch::kCUDA);
    recorded.record(s);
    return recorded;
}
55+
56+
// Makes `s_0` wait (device-side) for all work currently queued on `s_1`.
// The two streams must be distinct.
void stream_wait(const at::cuda::CUDAStream& s_0, const at::cuda::CUDAStream& s_1) {
    EP_HOST_ASSERT(s_0.id() != s_1.id());
    auto barrier = create_event(s_1);
    s_0.unwrap().wait(barrier);
}
60+
61+
// Makes stream `s` wait (device-side) on the event wrapped by `event`.
void stream_wait(const at::cuda::CUDAStream& s, const EventHandle& event) {
    auto& underlying = *event.event;
    s.unwrap().wait(underlying);
}
64+
65+
} // namespace nixl_ep

0 commit comments

Comments
 (0)