Skip to content

Commit bbe7851

Browse files
committed
Merge branch 'main' into epv1_fp8
2 parents 36c7fd3 + adf49e3 commit bbe7851

12 files changed

Lines changed: 776 additions & 121 deletions

File tree

.github/workflows/ci.yml

Lines changed: 100 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,100 @@
1+
name: mori
2+
on:
3+
push:
4+
workflow_dispatch:
5+
jobs:
6+
build:
7+
name: mori build
8+
runs-on: label-1
9+
steps:
10+
- name: clone source code
11+
run: |
12+
pwd
13+
hostname
14+
id
15+
#export PATH="/home/fizhang/.local/bin:$PATH"
16+
cmake --version
17+
echo "0. PATH: $PATH"
18+
cmake --version
19+
#rm -rf mori
20+
#git clone https://github.com/ROCm/mori.git
21+
cd mori
22+
ls
23+
rm -rf build
24+
#git config --global --add safe.directory /home/fizhang/actions-runner/_work/mori/mori/mori
25+
git checkout ci-new
26+
git pull
27+
#pip install --target /apps/mori-ci/packages -r requirements-build.txt
28+
pip install -r requirements-build.txt
29+
#git submodule update --init --recursive
30+
#pip3 install . --no-build-isolation
31+
USE_IONIC=ON pip3 install . --no-build-isolation
32+
# --target /apps/mori-ci/python-packages
33+
scp -P 2233 -r /root/actions-runner/_work/mori/mori/mori root@smci355-ccs-aus-n08-33:/root/actions-runner/_work/mori/mori/
34+
test:
35+
name: mori test
36+
needs: build
37+
runs-on: label-1
38+
steps:
39+
#- uses: actions/checkout@v4
40+
#- name: Set up Python
41+
#uses: actions/setup-python@v4
42+
#with:
43+
#python-version: '3.11'
44+
#- run: pip install -r requirements.txt
45+
- name: Run tests
46+
run: |
47+
hostname
48+
pwd
49+
ls
50+
cd mori
51+
export PYTHONPATH=/root/actions-runner/_work/mori/mori/mori:$PYTHONPATH
52+
#mpiexec --allow-run-as-root -x MORI_GLOBAL_LOG_LEVEL=TRACE -np 2 ls
53+
echo "1. mori-io test case: test_engine.py"
54+
pytest ./tests/python/io/test_engine.py
55+
echo "2. mori-io test case: test_engine_multi_session_batch.py"
56+
pytest ./tests/python/io/test_engine_multi_session_batch.py
57+
58+
echo "1. mori-ibgda test case: write_gpu"
59+
./build/examples/write_gpu
60+
61+
echo "2. mori-ibgda test case: write_inline_gpu"
62+
./build/examples/write_inline_gpu
63+
64+
echo "3. mori-ibgda test case: send_recv_gpu"
65+
./build/examples/send_recv_gpu
66+
67+
echo "4. mori-ibgda test case: dist_write"
68+
mpiexec --allow-run-as-root -x MORI_GLOBAL_LOG_LEVEL=TRACE -np 2 ./build/examples/dist_write -c 4 -t 256 -q 4
69+
70+
echo "1. mori-shmem test case: test_api.py"
71+
pytest ./tests/python/shmem/test_api.py
72+
73+
echo "2. mori-shmem test case: concurrent_put_thread"
74+
mpiexec --allow-run-as-root -np 2 ./build/examples/concurrent_put_thread
75+
76+
echo "3. mori-shmem test case: concurrent_put_imm_thread"
77+
mpiexec --allow-run-as-root -np 2 ./build/examples/concurrent_put_imm_thread
78+
79+
echo "4. mori-shmem test case: concurrent_put_signal_thread"
80+
mpiexec --allow-run-as-root -np 2 ./build/examples/concurrent_put_signal_thread
81+
82+
echo "1. mori-ep test case: test_dispatch_combine.py"
83+
pytest ./tests/python/ops/test_dispatch_combine.py
84+
85+
echo "2. mori-ep test case: bench_dispatch_combine.py"
86+
python3 ./tests/python/ops/bench_dispatch_combine.py
87+
88+
echo "3. mori-ep test case: bench"
89+
GLOO_SOCKET_IFNAME=enp81s0f1 torchrun --nnodes=1 --node_rank=0 --nproc_per_node=1 --master_addr="smci355-ccs-aus-n08-29" --master_port=2222 examples/ops/dispatch_combine/test_dispatch_combine_internode.py --cmd bench --kernel-type v1 --num-qp 2 --max-tokens 4096
90+
91+
echo "4. mori-ep test case: inter bench"
92+
#export MORI_RDMA_DEVICES=ionic_0,ionic_1,ionic_2,ionic_3,ionic_4,ionic_6,ionic_7
93+
#ssh -p 2233 root@smci355-ccs-aus-n08-33 "hostname"
94+
ssh -p 2233 root@smci355-ccs-aus-n08-33 "GLOO_SOCKET_IFNAME=enp81s0f1 /opt/venv/bin/torchrun --nnodes=2 --node_rank=0 --nproc_per_node=1 --master_addr='smci355-ccs-aus-n08-33' --master_port=1234 /root/actions-runner/_work/mori/mori/mori/examples/ops/dispatch_combine/test_dispatch_combine_internode.py --cmd bench --kernel-type v1 --num-qp 2 --max-tokens 4096 &" &
95+
GLOO_SOCKET_IFNAME=enp81s0f1 torchrun --nnodes=2 --node_rank=1 --nproc_per_node=1 --master_addr="smci355-ccs-aus-n08-33" --master_port=1234 examples/ops/dispatch_combine/test_dispatch_combine_internode.py --cmd bench --kernel-type v1 --num-qp 2 --max-tokens 4096
96+
sleep 1
97+
echo "5. mori-ep test case: inter stress"
98+
ssh -p 2233 root@smci355-ccs-aus-n08-33 "GLOO_SOCKET_IFNAME=enp81s0f1 /opt/venv/bin/torchrun --nnodes=2 --node_rank=0 --nproc_per_node=1 --master_addr='smci355-ccs-aus-n08-33' --master_port=1234 /root/actions-runner/_work/mori/mori/mori/examples/ops/dispatch_combine/test_dispatch_combine_internode.py --cmd stress --kernel-type v1 --num-qp 2 --max-tokens 128 &" &
99+
GLOO_SOCKET_IFNAME=enp81s0f1 torchrun --nnodes=2 --node_rank=1 --nproc_per_node=1 --master_addr="smci355-ccs-aus-n08-33" --master_port=1234 examples/ops/dispatch_combine/test_dispatch_combine_internode.py --cmd stress --kernel-type v1 --num-qp 2 --max-tokens 128
100+

examples/local_rdma_ops/atomic_gpu.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -136,7 +136,7 @@ void LocalRdmaOps() {
136136
// 2 Create an endpoint
137137
RdmaEndpointConfig config;
138138
config.portId = devicePort.second;
139-
config.gidIdx = 3;
139+
//config.gidIdx = 3;
140140
config.maxMsgsNum = 1024;
141141
config.maxCqeNum = 1024;
142142
config.alignment = 4096;

examples/local_rdma_ops/send_recv_gpu.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -174,7 +174,7 @@ void LocalRdmaOps() {
174174
// 2 Create an endpoint
175175
RdmaEndpointConfig config;
176176
config.portId = devicePort.second;
177-
config.gidIdx = 3;
177+
//config.gidIdx = 3;
178178
config.maxMsgsNum = 256;
179179
config.maxCqeNum = 256;
180180
config.alignment = 4096;

examples/local_rdma_ops/write_gpu.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -149,7 +149,7 @@ void LocalRdmaOps() {
149149
// 2 Create an endpoint
150150
RdmaEndpointConfig config;
151151
config.portId = devicePort.second;
152-
config.gidIdx = 3;
152+
//config.gidIdx = 3;
153153
config.maxMsgsNum = 64;
154154
config.maxCqeNum = 256;
155155
config.alignment = 4096;

examples/local_rdma_ops/write_inline_gpu.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -129,7 +129,7 @@ void LocalRdmaOps() {
129129
// 2 Create an endpoint
130130
RdmaEndpointConfig config;
131131
config.portId = devicePort.second;
132-
config.gidIdx = 3;
132+
//config.gidIdx = 3;
133133
config.maxMsgsNum = 1024;
134134
config.maxCqeNum = 1024;
135135
config.alignment = 4096;

0 commit comments

Comments (0)