@@ -2,4 +2,6 @@ lm-eval==0.4.10
loguru
compressed-tensors==0.12.2
hf_transfer
transformers==4.57.3
# pip install git+https://github.com/yiliu30/long-bench-eval
long-bench-eval @ git+https://github.com/yiliu30/long-bench-eval
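
For reference, assuming this hunk edits a pip requirements file, the pinned fork is installed along with the other dependencies via:

    pip install -r requirements.txt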
@@ -78,9 +78,27 @@ fi
# Extract model name and set output directory
MODEL_NAME=$(basename ${MODEL_PATH})
OUTPUT_DIR="${MODEL_NAME}-tp${TP_SIZE}-eval"

# Create output directory
mkdir -p ${OUTPUT_DIR}


SERVER_PORT=8000
# Defaults for standard (short-context) tasks
max_length=8192
max_gen_toks=2048

# Bump the context budget for long-context (LongBench) tasks
if [[ "$TASK_NAME" == *"longbench"* ]]; then
    max_length=131072 # 128k
    max_gen_toks=2048
fi

max_ctx_length=$((max_length - max_gen_toks))
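# e.g., for LongBench: 131072 - 2048 = 129024 tokens of prompt budget;
# with the defaults: 8192 - 2048 = 6144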

echo "max_length: ${max_length}"
echo "max_gen_toks: ${max_gen_toks}"
echo "max_ctx_length: ${max_ctx_length}"


#FIXME: (yiliu30) remove these envs once we have fixed the pynccl issues
export NCCL_NVLS_ENABLE=0
# export VLLM_DISABLE_PYNCCL=1
@@ -141,19 +159,109 @@ echo "Tensor parallelism size: ${TP_SIZE}"
echo "Batch size: ${BATCH_SIZE}"
echo "Output directory: ${OUTPUT_DIR}"

# Export vLLM environment variables
export VLLM_WORKER_MULTIPROC_METHOD=spawn
export VLLM_ENABLE_AR_EXT=$VLLM_ENABLE_AR_EXT
export VLLM_AR_MXFP4_MODULAR_MOE=$VLLM_AR_MXFP4_MODULAR_MOE
export VLLM_MXFP4_PRE_UNPACK_TO_FP8=$VLLM_MXFP4_PRE_UNPACK_TO_FP8
export VLLM_MXFP4_PRE_UNPACK_WEIGHTS=$VLLM_MXFP4_PRE_UNPACK_WEIGHTS
export VLLM_ENABLE_STATIC_MOE=$VLLM_ENABLE_STATIC_MOE
export VLLM_USE_DEEP_GEMM=$VLLM_USE_DEEP_GEMM
export VLLM_ENABLE_V1_MULTIPROCESSING=0
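# Exporting these makes the settings visible both to lm_eval's in-process
# vLLM engine and to the standalone `vllm serve` process started below.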



# Function to run standard lm-eval tasks
run_standard_eval() {
    lm_eval --model vllm \
        --model_args "pretrained=${MODEL_PATH},tensor_parallel_size=${TP_SIZE},max_model_len=${max_length},max_num_batched_tokens=32768,max_num_seqs=128,add_bos_token=True,gpu_memory_utilization=0.8,dtype=bfloat16,max_gen_toks=${max_gen_toks},enable_prefix_caching=False,kv_cache_dtype=${KV_CACHE_DTYPE}" \
        --tasks $TASK_NAME \
        --batch_size $BATCH_SIZE \
        --log_samples \
        --seed 42 \
        --output_path ${OUTPUT_DIR} \
        --show_config 2>&1 | tee ${OUTPUT_DIR}/log.txt
}
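# run_standard_eval drives vLLM in-process through lm-eval's "vllm" backend;
# the LongBench path below instead serves the model over HTTP and evaluates
# against its OpenAI-compatible API.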

# Function to start vLLM server
start_vllm_server() {
    echo "Starting vLLM server on port ${SERVER_PORT}..."
    vllm serve ${MODEL_PATH} \
        --port ${SERVER_PORT} \
        --tensor-parallel-size ${TP_SIZE} \
        --max-model-len ${max_length} \
        --gpu-memory-utilization 0.8 \
        --dtype bfloat16 \
        --kv-cache-dtype ${KV_CACHE_DTYPE} \
        --disable-log-requests \
        > ${OUTPUT_DIR}/vllm_server.log 2>&1 &

    VLLM_PID=$!
    echo "vLLM server started with PID: ${VLLM_PID}"
}
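# While the model loads, progress can be followed with, e.g.:
#   tail -f ${OUTPUT_DIR}/vllm_server.log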

# Function to wait for vLLM server to be ready
wait_for_server() {
    local max_retries=300
    local retry_count=0

    echo "Waiting for vLLM server to be ready..."
    while [ $retry_count -lt $max_retries ]; do
        if curl -s http://localhost:${SERVER_PORT}/health > /dev/null 2>&1; then
            echo "vLLM server is ready!"
            return 0
        fi
        retry_count=$((retry_count + 1))
        echo "Waiting for server... (${retry_count}/${max_retries})"
        sleep 5
    done

    echo "Error: vLLM server failed to start within expected time"
    echo "Check ${OUTPUT_DIR}/vllm_server.log for details"
    return 1
}
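# vLLM's /health endpoint only starts responding once the engine has finished
# loading, so a successful probe means the /v1 API is ready to serve requests.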

# Function to cleanup vLLM server on exit
cleanup_server() {
    echo "Shutting down vLLM server..."
    kill $VLLM_PID 2>/dev/null || true
    wait $VLLM_PID 2>/dev/null || true
    echo "Server stopped"
}
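# The `|| true` guards keep cleanup from failing (or aborting the script, if
# it runs under `set -e`) when the server has already exited.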



# Function to run LongBench evaluation via the OpenAI-compatible API
run_longbench_eval() {
    start_vllm_server

    if ! wait_for_server; then
        kill $VLLM_PID 2>/dev/null || true
        exit 1
    fi

    # Setup cleanup trap
    trap cleanup_server EXIT INT TERM

    # Run LongBench evaluation
    echo "Running LongBench evaluation against vLLM server..."
    python -m long_bench_eval.cli \
        --api-key dummy \
        --base-url http://localhost:${SERVER_PORT}/v1 \
        --model ${MODEL_PATH} \
        --max-context-length ${max_ctx_length} \
        --num-threads 512 2>&1 | tee ${OUTPUT_DIR}/longbench_log.txt

    echo "Evaluation completed! Results saved to ${OUTPUT_DIR}"
}

# Main evaluation logic
if [[ "$TASK_NAME" == *"longbench"* ]]; then
    echo "Running LongBench v2 evaluation..."
    run_longbench_eval
else
    echo "Running standard lm-eval tasks..."
    run_standard_eval
fi