@@ -92,7 +92,10 @@ mkdir -p $EC_SHARED_STORAGE_PATH
###############################################################################
# Encoder worker
###############################################################################
CUDA_VISIBLE_DEVICES="$GPU_E" vllm serve "$MODEL" \
CUDA_VISIBLE_DEVICES="$GPU_E" \
VLLM_DEBUG_DUMP_PATH=$LOG_PATH \
VLLM_NIXL_EC_SIDE_CHANNEL_PORT=5569 \
vllm serve "$MODEL" \
--gpu-memory-utilization 0.01 \
--port "$ENCODE_PORT" \
--enforce-eager \
@@ -102,11 +105,8 @@ CUDA_VISIBLE_DEVICES="$GPU_E" vllm serve "$MODEL" \
--max-num-seqs 128 \
--allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \
--ec-transfer-config '{
"ec_connector": "ECSharedStorageConnector",
"ec_role": "ec_producer",
"ec_connector_extra_config": {
"shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'"
}
"ec_connector": "NixlECConnector",
"ec_role": "ec_producer"
}' \
>"${ENC_LOG}" 2>&1 &

@@ -116,8 +116,10 @@ PIDS+=($!)
# Prefill worker
###############################################################################
CUDA_VISIBLE_DEVICES="$GPU_P" \
VLLM_DEBUG_DUMP_PATH=$LOG_PATH \
UCX_NET_DEVICES=all \
VLLM_NIXL_SIDE_CHANNEL_PORT=5559 \
VLLM_NIXL_EC_SIDE_CHANNEL_PORT=5579 \
vllm serve "$MODEL" \
--gpu-memory-utilization 0.7 \
--port "$PREFILL_PORT" \
@@ -126,11 +128,8 @@ vllm serve "$MODEL" \
--max-num-seqs 128 \
--allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \
--ec-transfer-config '{
"ec_connector": "ECSharedStorageConnector",
"ec_role": "ec_consumer",
"ec_connector_extra_config": {
"shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'"
}
"ec_connector": "NixlECConnector",
"ec_role": "ec_consumer"
}' \
--kv-transfer-config '{
"kv_connector": "NixlConnector",
@@ -144,6 +143,7 @@ PIDS+=($!)
# Decode worker
###############################################################################
CUDA_VISIBLE_DEVICES="$GPU_D" \
VLLM_DEBUG_DUMP_PATH=$LOG_PATH \
UCX_NET_DEVICES=all \
VLLM_NIXL_SIDE_CHANNEL_PORT=6000 \
vllm serve "$MODEL" \
104 changes: 54 additions & 50 deletions examples/online_serving/disaggregated_encoder/disagg_1e1pd_example.sh
@@ -14,7 +14,7 @@ ENCODE_PORT="${ENCODE_PORT:-19534}"
PREFILL_DECODE_PORT="${PREFILL_DECODE_PORT:-19535}"
PROXY_PORT="${PROXY_PORT:-10001}"

GPU_E="${GPU_E:-0}"
GPU_E="${GPU_E:-2}"
GPU_PD="${GPU_PD:-1}"

EC_SHARED_STORAGE_PATH="${EC_SHARED_STORAGE_PATH:-/tmp/ec_cache}"
@@ -86,7 +86,10 @@ mkdir -p $EC_SHARED_STORAGE_PATH
###############################################################################
# Encoder worker
###############################################################################
CUDA_VISIBLE_DEVICES="$GPU_E" vllm serve "$MODEL" \
CUDA_VISIBLE_DEVICES="$GPU_E" \
VLLM_DEBUG_DUMP_PATH=$LOG_PATH/dump \
VLLM_NIXL_EC_SIDE_CHANNEL_PORT=5569 \
vllm serve "$MODEL" \
--gpu-memory-utilization 0.01 \
--port "$ENCODE_PORT" \
--enforce-eager \
@@ -96,11 +99,8 @@ CUDA_VISIBLE_DEVICES="$GPU_E" vllm serve "$MODEL" \
--max-num-seqs 128 \
--allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \
--ec-transfer-config '{
"ec_connector": "ECSharedStorageConnector",
"ec_role": "ec_producer",
"ec_connector_extra_config": {
"shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'"
}
"ec_connector": "NixlECConnector",
"ec_role": "ec_producer"
}' \
>"${ENC_LOG}" 2>&1 &

@@ -109,19 +109,19 @@ PIDS+=($!)
###############################################################################
# Prefill+Decode worker
###############################################################################
CUDA_VISIBLE_DEVICES="$GPU_PD" vllm serve "$MODEL" \
VLLM_NIXL_EC_SIDE_CHANNEL_PORT=5579 \
VLLM_DEBUG_DUMP_PATH=$LOG_PATH/dump \
CUDA_VISIBLE_DEVICES="$GPU_PD" \
vllm serve "$MODEL" \
--gpu-memory-utilization 0.7 \
--port "$PREFILL_DECODE_PORT" \
--enforce-eager \
--enable-request-id-headers \
--max-num-seqs 128 \
--allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \
--ec-transfer-config '{
"ec_connector": "ECSharedStorageConnector",
"ec_role": "ec_consumer",
"ec_connector_extra_config": {
"shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'"
}
"ec_connector": "NixlECConnector",
"ec_role": "ec_consumer"
}' \
>"${PD_LOG}" 2>&1 &

@@ -147,40 +147,44 @@ PIDS+=($!)
wait_for_server $PROXY_PORT
echo "All services are up!"

###############################################################################
# Benchmark
###############################################################################
echo "Running benchmark (stream)..."
vllm bench serve \
--model $MODEL \
--backend openai-chat \
--endpoint /v1/chat/completions \
--dataset-name hf \
--dataset-path lmarena-ai/VisionArena-Chat \
--seed 0 \
--num-prompts $NUM_PROMPTS \
--port $PROXY_PORT

PIDS+=($!)

###############################################################################
# Single request with local image
###############################################################################
echo "Running single request with local image (non-stream)..."
curl http://127.0.0.1:${PROXY_PORT}/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "'${MODEL}'",
"messages": [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": [
{"type": "image_url", "image_url": {"url": "file://'"${GIT_ROOT}"'/tests/v1/ec_connector/integration/hato.jpg"}},
{"type": "text", "text": "What is in this image?"}
]}
]
}'


# cleanup
echo "cleanup..."
cleanup
# ###############################################################################
# # Benchmark
# ###############################################################################
# echo "Running benchmark (stream)..."
# vllm bench serve \
# --model $MODEL \
# --backend openai-chat \
# --endpoint /v1/chat/completions \
# --dataset-name hf \
# --dataset-path lmarena-ai/VisionArena-Chat \
# --seed 0 \
# --num-prompts $NUM_PROMPTS \
# --save-result \
# --save-detailed \
# --result-dir $LOG_PATH \
# --result-filename ePD_nixl_$(date +"%Y%m%d_%H%M%S").json \
# --port $PROXY_PORT

# PIDS+=($!)

# # ###############################################################################
# # # Single request with local image
# # ###############################################################################
# # echo "Running single request with local image (non-stream)..."
# # curl http://127.0.0.1:${PROXY_PORT}/v1/chat/completions \
# # -H "Content-Type: application/json" \
# # -d '{
# # "model": "'${MODEL}'",
# # "messages": [
# # {"role": "system", "content": "You are a helpful assistant."},
# # {"role": "user", "content": [
# # {"type": "image_url", "image_url": {"url": "file://'"${GIT_ROOT}"'/tests/v1/ec_connector/integration/hato.jpg"}},
# # {"type": "text", "text": "What is in this image?"}
# # ]}
# # ]
# # }'


# # cleanup
# echo "cleanup..."
# cleanup
@@ -0,0 +1,190 @@
#!/bin/bash
set -euo pipefail

declare -a PIDS=()

###############################################################################
# Configuration -- override via env before running
###############################################################################
MODEL="${MODEL:-Qwen/Qwen2.5-VL-3B-Instruct}"
LOG_PATH="${LOG_PATH:-./logs}"
mkdir -p $LOG_PATH

ENCODE_PORT="${ENCODE_PORT:-19534}"
PREFILL_DECODE_PORT="${PREFILL_DECODE_PORT:-19535}"
PROXY_PORT="${PROXY_PORT:-10001}"

GPU_E="${GPU_E:-2}"
GPU_PD="${GPU_PD:-1}"

EC_SHARED_STORAGE_PATH="${EC_SHARED_STORAGE_PATH:-/tmp/ec_cache}"
TIMEOUT_SECONDS="${TIMEOUT_SECONDS:-12000}" # wait_for_server timeout

NUM_PROMPTS="${NUM_PROMPTS:-100}" # number of prompts to send in benchmark
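
# Example invocation (illustrative only; model, GPU indices, and prompt count are assumptions, adjust to your setup):
#   GPU_E=0 GPU_PD=1 NUM_PROMPTS=50 MODEL=Qwen/Qwen2.5-VL-3B-Instruct bash <this script>.sh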

###############################################################################
# Helpers
###############################################################################
# Find the git repository root directory
GIT_ROOT=$(git rev-parse --show-toplevel)

START_TIME=$(date +"%Y%m%d_%H%M%S")
ENC_LOG=$LOG_PATH/encoder_${START_TIME}.log
PD_LOG=$LOG_PATH/pd_${START_TIME}.log
PROXY_LOG=$LOG_PATH/proxy_${START_TIME}.log

# Poll the OpenAI-compatible chat endpoint on the given port until it responds,
# or give up after TIMEOUT_SECONDS.
wait_for_server() {
    local port=$1
    timeout "$TIMEOUT_SECONDS" bash -c "
        until curl -s localhost:$port/v1/chat/completions > /dev/null; do
            sleep 1
        done" && return 0 || return 1
}

# Cleanup function
cleanup() {
    echo "Stopping everything…"
    trap - INT TERM USR1 # prevent re-entrancy

    # Kill all tracked PIDs
    for pid in "${PIDS[@]}"; do
        if kill -0 "$pid" 2>/dev/null; then
            echo "Killing process $pid"
            kill "$pid" 2>/dev/null
        fi
    done

    # Wait a moment for graceful shutdown
    sleep 2

    # Force kill any remaining processes
    for pid in "${PIDS[@]}"; do
        if kill -0 "$pid" 2>/dev/null; then
            echo "Force killing process $pid"
            kill -9 "$pid" 2>/dev/null
        fi
    done

    # Kill the entire process group as backup
    kill -- -$$ 2>/dev/null

    echo "All processes stopped."
    exit 0
}

# Tear everything down on Ctrl-C (INT), SIGUSR1, or SIGTERM.
trap cleanup INT
trap cleanup USR1
trap cleanup TERM

# clear previous cache
echo "remove previous ec cache folder"
rm -rf $EC_SHARED_STORAGE_PATH

echo "make ec cache folder"
mkdir -p $EC_SHARED_STORAGE_PATH
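# The encoder (ec_producer) writes encoder-cache entries under this path; the
# prefill+decode worker (ec_consumer) reads them back through ECSharedStorageConnector.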

###############################################################################
# Encoder worker
###############################################################################
CUDA_VISIBLE_DEVICES="$GPU_E" vllm serve "$MODEL" \
--gpu-memory-utilization 0.01 \
--port "$ENCODE_PORT" \
--enforce-eager \
--enable-request-id-headers \
--no-enable-prefix-caching \
--max-num-batched-tokens 114688 \
--max-num-seqs 128 \
--allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \
--ec-transfer-config '{
"ec_connector": "ECSharedStorageConnector",
"ec_role": "ec_producer",
"ec_connector_extra_config": {
"shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'"
}
}' \
>"${ENC_LOG}" 2>&1 &

PIDS+=($!)

###############################################################################
# Prefill+Decode worker
###############################################################################
CUDA_VISIBLE_DEVICES="$GPU_PD" vllm serve "$MODEL" \
--gpu-memory-utilization 0.7 \
--port "$PREFILL_DECODE_PORT" \
--enforce-eager \
--enable-request-id-headers \
--max-num-seqs 128 \
--allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \
--ec-transfer-config '{
"ec_connector": "ECSharedStorageConnector",
"ec_role": "ec_consumer",
"ec_connector_extra_config": {
"shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'"
}
}' \
>"${PD_LOG}" 2>&1 &

PIDS+=($!)

# Wait for workers
wait_for_server $ENCODE_PORT
wait_for_server $PREFILL_DECODE_PORT

###############################################################################
# Proxy
###############################################################################
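# disagg_epd_proxy.py routes encode requests to the standalone encoder worker;
# prefill routing is disabled ("disable") because the single worker on
# $PREFILL_DECODE_PORT serves both prefill and decode.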
python disagg_epd_proxy.py \
--host "0.0.0.0" \
--port "$PROXY_PORT" \
--encode-servers-urls "http://localhost:$ENCODE_PORT" \
--prefill-servers-urls "disable" \
--decode-servers-urls "http://localhost:$PREFILL_DECODE_PORT" \
>"${PROXY_LOG}" 2>&1 &

PIDS+=($!)

wait_for_server $PROXY_PORT
echo "All services are up!"

###############################################################################
# Benchmark
###############################################################################
echo "Running benchmark (stream)..."
vllm bench serve \
--model $MODEL \
--backend openai-chat \
--endpoint /v1/chat/completions \
--dataset-name hf \
--dataset-path lmarena-ai/VisionArena-Chat \
--seed 0 \
--num-prompts $NUM_PROMPTS \
--save-result \
--save-detailed \
--result-dir $LOG_PATH \
--result-filename ePD_nixl_shared_$(date +"%Y%m%d_%H%M%S").json \
--port $PROXY_PORT

PIDS+=($!)

###############################################################################
# Single request with local image
###############################################################################
echo "Running single request with local image (non-stream)..."
curl http://127.0.0.1:${PROXY_PORT}/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "'${MODEL}'",
"messages": [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": [
{"type": "image_url", "image_url": {"url": "file://'"${GIT_ROOT}"'/tests/v1/ec_connector/integration/hato.jpg"}},
{"type": "text", "text": "What is in this image?"}
]}
]
}'


# cleanup
echo "cleanup..."
cleanup