Skip to content
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,6 @@ lm-eval==0.4.10
loguru
compressed-tensors==0.12.2
hf_transfer
transformers==4.57.3
# pip install git+https://github.com/yiliu30/long-bench-eval
long-bench-eval @ git+https://github.com/yiliu30/long-bench-eval
Original file line number Diff line number Diff line change
Expand Up @@ -78,9 +78,27 @@ fi
# Extract model name and set output directory
# (quote expansions and use '--' so paths with spaces or a leading '-' work)
MODEL_NAME=$(basename -- "${MODEL_PATH}")
OUTPUT_DIR="${MODEL_NAME}-tp${TP_SIZE}-eval"

# Create output directory
mkdir -p -- "${OUTPUT_DIR}"


SERVER_PORT=8000
max_length=8192     # total model context window (tokens)
max_gen_toks=2048   # tokens reserved for generation

# update max_length based on the task: LongBench needs a 128k window
if [[ "$TASK_NAME" == *"longbench"* ]]; then
    max_length=131072 # 128k
    max_gen_toks=2048
fi

# prompt budget = context window minus generation budget
max_ctx_length=$((max_length - max_gen_toks))

echo "max_length: ${max_length}"
echo "max_gen_toks: ${max_gen_toks}"
echo "max_ctx_length: ${max_ctx_length}"

#FIXME: (yiliu30) remove these envs once we have fixed the pynccl issues
# NOTE(review): disabling NCCL_NVLS_ENABLE turns off NVLink SHARP collectives;
# presumably a workaround for the pynccl issues mentioned above — confirm.
export NCCL_NVLS_ENABLE=0
# export VLLM_DISABLE_PYNCCL=1
Expand Down Expand Up @@ -141,19 +159,111 @@ echo "Tensor parallelism size: ${TP_SIZE}"
echo "Batch size: ${BATCH_SIZE}"
echo "Output directory: ${OUTPUT_DIR}"

# NOTE(review): this inline lm_eval invocation appears to duplicate
# run_standard_eval defined below (and runs unconditionally, before the
# task dispatch at the bottom) — looks like stale diff residue; confirm
# whether it should be removed.
VLLM_WORKER_MULTIPROC_METHOD=spawn \
VLLM_ENABLE_AR_EXT=$VLLM_ENABLE_AR_EXT \
VLLM_AR_MXFP4_MODULAR_MOE=$VLLM_AR_MXFP4_MODULAR_MOE \
VLLM_MXFP4_PRE_UNPACK_TO_FP8=$VLLM_MXFP4_PRE_UNPACK_TO_FP8 \
VLLM_MXFP4_PRE_UNPACK_WEIGHTS=$VLLM_MXFP4_PRE_UNPACK_WEIGHTS \
VLLM_ENABLE_STATIC_MOE=$VLLM_ENABLE_STATIC_MOE \
VLLM_USE_DEEP_GEMM=$VLLM_USE_DEEP_GEMM \
VLLM_ENABLE_V1_MULTIPROCESSING=1 \
lm_eval --model vllm \
--model_args "pretrained=${MODEL_PATH},tensor_parallel_size=${TP_SIZE},max_model_len=8192,max_num_batched_tokens=32768,max_num_seqs=128,add_bos_token=True,gpu_memory_utilization=0.8,dtype=bfloat16,max_gen_toks=2048,enable_prefix_caching=False,kv_cache_dtype=${KV_CACHE_DTYPE}" \
--tasks $TASK_NAME \
--batch_size $BATCH_SIZE \
--log_samples \
--seed 42 \
--output_path ${OUTPUT_DIR} \
--show_config 2>&1 | tee ${OUTPUT_DIR}/log.txt


# Export vLLM environment variables
# NOTE(review): the VLLM_* toggles are presumably assigned by scheme-specific
# setup earlier in the script (not fully visible here) — verify they are set
# before this point.
export VLLM_WORKER_MULTIPROC_METHOD=spawn
export VLLM_ENABLE_AR_EXT=$VLLM_ENABLE_AR_EXT
export VLLM_AR_MXFP4_MODULAR_MOE=$VLLM_AR_MXFP4_MODULAR_MOE
export VLLM_MXFP4_PRE_UNPACK_TO_FP8=$VLLM_MXFP4_PRE_UNPACK_TO_FP8
export VLLM_MXFP4_PRE_UNPACK_WEIGHTS=$VLLM_MXFP4_PRE_UNPACK_WEIGHTS
export VLLM_ENABLE_STATIC_MOE=$VLLM_ENABLE_STATIC_MOE
export VLLM_USE_DEEP_GEMM=$VLLM_USE_DEEP_GEMM
# NOTE(review): V1 multiprocessing is disabled here (0) while the inline
# invocation above uses 1 — confirm which setting is intended.
export VLLM_ENABLE_V1_MULTIPROCESSING=0



# Run standard lm-eval tasks via the vLLM offline engine.
# Globals (read): MODEL_PATH, TP_SIZE, KV_CACHE_DTYPE, TASK_NAME, BATCH_SIZE, OUTPUT_DIR
# Outputs: results under ${OUTPUT_DIR}; full console log in ${OUTPUT_DIR}/log.txt
# Returns: lm_eval's exit status (not tee's)
run_standard_eval() {
    lm_eval --model vllm \
        --model_args "pretrained=${MODEL_PATH},tensor_parallel_size=${TP_SIZE},max_model_len=8192,max_num_batched_tokens=32768,max_num_seqs=128,add_bos_token=True,gpu_memory_utilization=0.8,dtype=bfloat16,max_gen_toks=2048,enable_prefix_caching=False,kv_cache_dtype=${KV_CACHE_DTYPE}" \
        --tasks "$TASK_NAME" \
        --batch_size "$BATCH_SIZE" \
        --log_samples \
        --seed 42 \
        --output_path "${OUTPUT_DIR}" \
        --show_config 2>&1 | tee "${OUTPUT_DIR}/log.txt"
    # Propagate lm_eval's status, not tee's, so a failed eval isn't masked by the pipe.
    return "${PIPESTATUS[0]}"
}

# Start a vLLM OpenAI-compatible server in the background.
# Globals (read): MODEL_PATH, SERVER_PORT, TP_SIZE, max_length, KV_CACHE_DTYPE, OUTPUT_DIR
# Side effects: sets VLLM_PID to the server's PID;
#               server output goes to ${OUTPUT_DIR}/vllm_server.log
start_vllm_server() {
    echo "Starting vLLM server on port ${SERVER_PORT}..."
    # Quote all expansions so model paths with spaces don't split the command.
    vllm serve "${MODEL_PATH}" \
        --port "${SERVER_PORT}" \
        --tensor-parallel-size "${TP_SIZE}" \
        --max-model-len "${max_length}" \
        --gpu-memory-utilization 0.8 \
        --dtype bfloat16 \
        --kv-cache-dtype "${KV_CACHE_DTYPE}" \
        --disable-log-requests \
        > "${OUTPUT_DIR}/vllm_server.log" 2>&1 &

    VLLM_PID=$!
    echo "vLLM server started with PID: ${VLLM_PID}"
}

# Poll the vLLM health endpoint until the server responds or retries run out.
# Returns 0 once healthy, 1 after max_retries failed probes (5s apart).
wait_for_server() {
    local -r max_retries=300
    local attempt

    echo "Waiting for vLLM server to be ready..."
    for (( attempt = 1; attempt <= max_retries; attempt++ )); do
        if curl -s "http://localhost:${SERVER_PORT}/health" > /dev/null 2>&1; then
            echo "vLLM server is ready!"
            return 0
        fi
        echo "Waiting for server... (${attempt}/${max_retries})"
        sleep 5
    done

    echo "Error: vLLM server failed to start within expected time"
    echo "Check ${OUTPUT_DIR}/vllm_server.log for details"
    return 1
}

# Stop the background vLLM server (best-effort) and reap it.
# Globals (read): VLLM_PID — guarded: if it is unset/empty, a bare `wait`
# would otherwise block on ALL background jobs and `kill` would error.
cleanup_server() {
    echo "Shutting down vLLM server..."
    if [[ -n "${VLLM_PID:-}" ]]; then
        kill "$VLLM_PID" 2>/dev/null || true
        wait "$VLLM_PID" 2>/dev/null || true
    fi
    echo "Server stopped"
}



# Run LongBench evaluation against a freshly started vLLM server.
# Globals (read): SERVER_PORT, MODEL_PATH, max_ctx_length, OUTPUT_DIR
# Side effects: starts the server and installs a trap that stops it on any
#               exit path; exits the script if the server never becomes healthy.
run_longbench_eval() {
    start_vllm_server

    # Install the cleanup trap immediately so the server is torn down on every
    # exit path — including Ctrl-C while we are still waiting for it to come up.
    trap cleanup_server EXIT INT TERM

    if ! wait_for_server; then
        exit 1   # EXIT trap performs the shutdown
    fi

    # Run LongBench evaluation; do not report success if the client fails.
    echo "Running LongBench evaluation against vLLM server..."
    if ! python -m long_bench_eval.cli \
        --api-key dummy \
        --base-url "http://localhost:${SERVER_PORT}/v1" \
        --model "${MODEL_PATH}" \
        --max-context-length "${max_ctx_length}" \
        --num-threads 1 \
        --deterministic \
        --categories "Long In-context Learning"; then
        echo "Error: LongBench evaluation failed" >&2
        return 1
    fi

    echo "Evaluation completed! Results saved to ${OUTPUT_DIR}"
}

# Main evaluation logic: dispatch on the task name.
case "$TASK_NAME" in
    *longbench*)
        echo "Running LongBench v2 evaluation..."
        run_longbench_eval
        ;;
    *)
        echo "Running standard lm-eval tasks..."
        run_standard_eval
        ;;
esac
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
lm-eval==0.4.10
loguru
hf_transfer
transformers==4.57.3
# pip install git+https://github.com/yiliu30/long-bench-eval
long-bench-eval @ git+https://github.com/yiliu30/long-bench-eval
Original file line number Diff line number Diff line change
Expand Up @@ -78,10 +78,27 @@ fi
# Extract model name and set output directory
# (quote expansions and use '--' so paths with spaces or a leading '-' work)
MODEL_NAME=$(basename -- "${MODEL_PATH}")
OUTPUT_DIR="${MODEL_NAME}-tp${TP_SIZE}-eval"

# Create output directory
mkdir -p -- "${OUTPUT_DIR}"


SERVER_PORT=8000
max_length=8192     # total model context window (tokens)
max_gen_toks=2048   # tokens reserved for generation

# update max_length based on the task: LongBench needs a 128k window
if [[ "$TASK_NAME" == *"longbench"* ]]; then
    max_length=131072
    max_gen_toks=2048
fi

# prompt budget = context window minus generation budget
max_ctx_length=$((max_length - max_gen_toks))

echo "max_length: ${max_length}"
echo "max_gen_toks: ${max_gen_toks}"
echo "max_ctx_length: ${max_ctx_length}"


# Set environment variables based on the quantization scheme
if [[ "$SCHEME" == "mxfp4" ]]; then
VLLM_AR_MXFP4_MODULAR_MOE=1
Expand Down Expand Up @@ -132,19 +149,122 @@ echo "Tensor parallelism size: ${TP_SIZE}"
echo "Batch size: ${BATCH_SIZE}"
echo "Output directory: ${OUTPUT_DIR}"

# NOTE(review): this inline lm_eval invocation duplicates run_standard_eval
# defined below (a commented-out copy also follows) and runs unconditionally
# before the task dispatch — looks like stale diff residue; confirm whether
# it should be removed.
VLLM_WORKER_MULTIPROC_METHOD=spawn \
VLLM_ENABLE_AR_EXT=$VLLM_ENABLE_AR_EXT \
VLLM_AR_MXFP4_MODULAR_MOE=$VLLM_AR_MXFP4_MODULAR_MOE \
VLLM_MXFP4_PRE_UNPACK_TO_FP8=$VLLM_MXFP4_PRE_UNPACK_TO_FP8 \
VLLM_MXFP4_PRE_UNPACK_WEIGHTS=$VLLM_MXFP4_PRE_UNPACK_WEIGHTS \
VLLM_ENABLE_STATIC_MOE=$VLLM_ENABLE_STATIC_MOE \
VLLM_USE_DEEP_GEMM=$VLLM_USE_DEEP_GEMM \
VLLM_ENABLE_V1_MULTIPROCESSING=1 \
lm_eval --model vllm \
--model_args "pretrained=${MODEL_PATH},tensor_parallel_size=${TP_SIZE},max_model_len=8192,max_num_batched_tokens=32768,max_num_seqs=128,add_bos_token=True,gpu_memory_utilization=0.8,dtype=bfloat16,max_gen_toks=2048,enable_prefix_caching=False,kv_cache_dtype=${KV_CACHE_DTYPE}" \
--tasks $TASK_NAME \
--batch_size $BATCH_SIZE \
--log_samples \
--seed 42 \
--output_path ${OUTPUT_DIR} \
--show_config 2>&1 | tee ${OUTPUT_DIR}/log.txt



# NOTE(review): commented-out duplicate of the invocation above.
# lm_eval --model vllm \
# --model_args "pretrained=${MODEL_PATH},tensor_parallel_size=${TP_SIZE},max_model_len=8192,max_num_batched_tokens=32768,max_num_seqs=128,add_bos_token=True,gpu_memory_utilization=0.8,dtype=bfloat16,max_gen_toks=2048,enable_prefix_caching=False,kv_cache_dtype=${KV_CACHE_DTYPE}" \
# --tasks $TASK_NAME \
# --batch_size $BATCH_SIZE \
# --log_samples \
# --seed 42 \
# --output_path ${OUTPUT_DIR} \
# --show_config 2>&1 | tee ${OUTPUT_DIR}/log.txt


# Export vLLM environment variables
# NOTE(review): the VLLM_* toggles are presumably assigned by the
# scheme-specific setup above (partially visible) — verify they are set
# before this point.
export VLLM_WORKER_MULTIPROC_METHOD=spawn
export VLLM_ENABLE_AR_EXT=$VLLM_ENABLE_AR_EXT
export VLLM_AR_MXFP4_MODULAR_MOE=$VLLM_AR_MXFP4_MODULAR_MOE
export VLLM_MXFP4_PRE_UNPACK_TO_FP8=$VLLM_MXFP4_PRE_UNPACK_TO_FP8
export VLLM_MXFP4_PRE_UNPACK_WEIGHTS=$VLLM_MXFP4_PRE_UNPACK_WEIGHTS
export VLLM_ENABLE_STATIC_MOE=$VLLM_ENABLE_STATIC_MOE
export VLLM_USE_DEEP_GEMM=$VLLM_USE_DEEP_GEMM
# NOTE(review): V1 multiprocessing is disabled here (0) while the inline
# invocation above uses 1 — confirm which setting is intended.
export VLLM_ENABLE_V1_MULTIPROCESSING=0



# Run standard lm-eval tasks via the vLLM offline engine.
# Globals (read): MODEL_PATH, TP_SIZE, KV_CACHE_DTYPE, TASK_NAME, BATCH_SIZE, OUTPUT_DIR
# Outputs: results under ${OUTPUT_DIR}; full console log in ${OUTPUT_DIR}/log.txt
# Returns: lm_eval's exit status (not tee's)
run_standard_eval() {
    lm_eval --model vllm \
        --model_args "pretrained=${MODEL_PATH},tensor_parallel_size=${TP_SIZE},max_model_len=8192,max_num_batched_tokens=32768,max_num_seqs=128,add_bos_token=True,gpu_memory_utilization=0.8,dtype=bfloat16,max_gen_toks=2048,enable_prefix_caching=False,kv_cache_dtype=${KV_CACHE_DTYPE}" \
        --tasks "$TASK_NAME" \
        --batch_size "$BATCH_SIZE" \
        --log_samples \
        --seed 42 \
        --output_path "${OUTPUT_DIR}" \
        --show_config 2>&1 | tee "${OUTPUT_DIR}/log.txt"
    # Propagate lm_eval's status, not tee's, so a failed eval isn't masked by the pipe.
    return "${PIPESTATUS[0]}"
}

# Start a vLLM OpenAI-compatible server in the background with YaRN RoPE
# scaling (4x over a 32768-token base) for long-context evaluation.
# Globals (read): MODEL_PATH, SERVER_PORT, TP_SIZE, max_length, KV_CACHE_DTYPE, OUTPUT_DIR
# Side effects: sets VLLM_PID to the server's PID;
#               server output goes to ${OUTPUT_DIR}/vllm_server.log
start_vllm_server() {
    echo "Starting vLLM server on port ${SERVER_PORT}..."
    # Quote all expansions so model paths with spaces don't split the command.
    vllm serve "${MODEL_PATH}" \
        --port "${SERVER_PORT}" \
        --tensor-parallel-size "${TP_SIZE}" \
        --max-model-len "${max_length}" \
        --gpu-memory-utilization 0.8 \
        --dtype bfloat16 \
        --kv-cache-dtype "${KV_CACHE_DTYPE}" \
        --rope-scaling '{"rope_type":"yarn","factor":4.0,"original_max_position_embeddings":32768}' \
        --disable-log-requests \
        > "${OUTPUT_DIR}/vllm_server.log" 2>&1 &

    VLLM_PID=$!
    echo "vLLM server started with PID: ${VLLM_PID}"
}

# Poll the vLLM health endpoint until the server responds or retries run out.
# Returns 0 once healthy, 1 after max_retries failed probes (5s apart).
wait_for_server() {
    local -r max_retries=300
    local attempt

    echo "Waiting for vLLM server to be ready..."
    for (( attempt = 1; attempt <= max_retries; attempt++ )); do
        if curl -s "http://localhost:${SERVER_PORT}/health" > /dev/null 2>&1; then
            echo "vLLM server is ready!"
            return 0
        fi
        echo "Waiting for server... (${attempt}/${max_retries})"
        sleep 5
    done

    echo "Error: vLLM server failed to start within expected time"
    echo "Check ${OUTPUT_DIR}/vllm_server.log for details"
    return 1
}

# Stop the background vLLM server (best-effort) and reap it.
# Globals (read): VLLM_PID — guarded: if it is unset/empty, a bare `wait`
# would otherwise block on ALL background jobs and `kill` would error.
cleanup_server() {
    echo "Shutting down vLLM server..."
    if [[ -n "${VLLM_PID:-}" ]]; then
        kill "$VLLM_PID" 2>/dev/null || true
        wait "$VLLM_PID" 2>/dev/null || true
    fi
    echo "Server stopped"
}


# Run LongBench evaluation against a freshly started vLLM server.
# Globals (read): SERVER_PORT, MODEL_PATH, max_ctx_length, OUTPUT_DIR
# Side effects: starts the server and installs a trap that stops it on any
#               exit path; exits the script if the server never becomes healthy.
run_longbench_eval() {
    start_vllm_server

    # Install the cleanup trap immediately so the server is torn down on every
    # exit path — including Ctrl-C while we are still waiting for it to come up.
    trap cleanup_server EXIT INT TERM

    if ! wait_for_server; then
        exit 1   # EXIT trap performs the shutdown
    fi

    # Run LongBench evaluation; do not report success if the client fails.
    echo "Running LongBench evaluation against vLLM server..."
    if ! python -m long_bench_eval.cli \
        --api-key dummy \
        --base-url "http://localhost:${SERVER_PORT}/v1" \
        --model "${MODEL_PATH}" \
        --max-context-length "${max_ctx_length}" \
        --num-threads 1 \
        --deterministic \
        --categories "Long In-context Learning"; then
        echo "Error: LongBench evaluation failed" >&2
        return 1
    fi

    echo "Evaluation completed! Results saved to ${OUTPUT_DIR}"
}

# Main evaluation logic: dispatch on the task name.
case "$TASK_NAME" in
    *longbench*)
        echo "Running LongBench v2 evaluation..."
        run_longbench_eval
        ;;
    *)
        echo "Running standard lm-eval tasks..."
        run_standard_eval
        ;;
esac