diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/requirements.txt b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/requirements.txt
index dc850c91a45..c072927a33e 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/requirements.txt
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/requirements.txt
@@ -2,4 +2,6 @@ lm-eval==0.4.10
 loguru
 compressed-tensors==0.12.2
 hf_transfer
-transformers==4.57.3
\ No newline at end of file
+transformers==4.57.3
+# pip install git+https://github.com/yiliu30/long-bench-eval
+long-bench-eval @ git+https://github.com/yiliu30/long-bench-eval
\ No newline at end of file
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh
index 109b921a82e..3a28037507e 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh
@@ -78,9 +78,27 @@ fi
 # Extract model name and set output directory
 MODEL_NAME=$(basename ${MODEL_PATH})
 OUTPUT_DIR="${MODEL_NAME}-tp${TP_SIZE}-eval"
-
 # Create output directory
 mkdir -p ${OUTPUT_DIR}
+
+
+SERVER_PORT=8000
+max_length=8192
+max_gen_toks=2048
+
+# Update max_length based on the task
+if [[ "$TASK_NAME" == *"longbench"* ]]; then
+    max_length=131072 # 128k
+    max_gen_toks=2048
+fi
+
+max_ctx_length=$((max_length - max_gen_toks))
+
+echo "max_length: ${max_length}"
+echo "max_gen_toks: ${max_gen_toks}"
+echo "max_ctx_length: ${max_ctx_length}"
+
+
 #FIXME: (yiliu30) remove these envs once we have fixed the pynccl issues
 export NCCL_NVLS_ENABLE=0
 # export VLLM_DISABLE_PYNCCL=1
@@ -141,19 +159,109 @@ echo "Tensor parallelism size: ${TP_SIZE}"
 echo "Batch size: ${BATCH_SIZE}"
 echo "Output directory: ${OUTPUT_DIR}"
 
-VLLM_WORKER_MULTIPROC_METHOD=spawn \
-VLLM_ENABLE_AR_EXT=$VLLM_ENABLE_AR_EXT \
-VLLM_AR_MXFP4_MODULAR_MOE=$VLLM_AR_MXFP4_MODULAR_MOE \
-VLLM_MXFP4_PRE_UNPACK_TO_FP8=$VLLM_MXFP4_PRE_UNPACK_TO_FP8 \
-VLLM_MXFP4_PRE_UNPACK_WEIGHTS=$VLLM_MXFP4_PRE_UNPACK_WEIGHTS \
-VLLM_ENABLE_STATIC_MOE=$VLLM_ENABLE_STATIC_MOE \
-VLLM_USE_DEEP_GEMM=$VLLM_USE_DEEP_GEMM \
-VLLM_ENABLE_V1_MULTIPROCESSING=1 \
-lm_eval --model vllm \
-    --model_args "pretrained=${MODEL_PATH},tensor_parallel_size=${TP_SIZE},max_model_len=8192,max_num_batched_tokens=32768,max_num_seqs=128,add_bos_token=True,gpu_memory_utilization=0.8,dtype=bfloat16,max_gen_toks=2048,enable_prefix_caching=False,kv_cache_dtype=${KV_CACHE_DTYPE}" \
-    --tasks $TASK_NAME \
-    --batch_size $BATCH_SIZE \
-    --log_samples \
-    --seed 42 \
-    --output_path ${OUTPUT_DIR} \
-    --show_config 2>&1 | tee ${OUTPUT_DIR}/log.txt
+
+
+# Export vLLM environment variables
+export VLLM_WORKER_MULTIPROC_METHOD=spawn
+export VLLM_ENABLE_AR_EXT=$VLLM_ENABLE_AR_EXT
+export VLLM_AR_MXFP4_MODULAR_MOE=$VLLM_AR_MXFP4_MODULAR_MOE
+export VLLM_MXFP4_PRE_UNPACK_TO_FP8=$VLLM_MXFP4_PRE_UNPACK_TO_FP8
+export VLLM_MXFP4_PRE_UNPACK_WEIGHTS=$VLLM_MXFP4_PRE_UNPACK_WEIGHTS
+export VLLM_ENABLE_STATIC_MOE=$VLLM_ENABLE_STATIC_MOE
+export VLLM_USE_DEEP_GEMM=$VLLM_USE_DEEP_GEMM
+export VLLM_ENABLE_V1_MULTIPROCESSING=0
+
+
+
+# Function to run standard lm-eval tasks
+run_standard_eval() {
+    lm_eval --model vllm \
+        --model_args "pretrained=${MODEL_PATH},tensor_parallel_size=${TP_SIZE},max_model_len=8192,max_num_batched_tokens=32768,max_num_seqs=128,add_bos_token=True,gpu_memory_utilization=0.8,dtype=bfloat16,max_gen_toks=2048,enable_prefix_caching=False,kv_cache_dtype=${KV_CACHE_DTYPE}" \
+        --tasks $TASK_NAME \
+        --batch_size $BATCH_SIZE \
+        --log_samples \
+        --seed 42 \
+        --output_path ${OUTPUT_DIR} \
+        --show_config 2>&1 | tee ${OUTPUT_DIR}/log.txt
+}
+
+# Function to start vLLM server
+start_vllm_server() {
+    echo "Starting vLLM server on port ${SERVER_PORT}..."
+    vllm serve ${MODEL_PATH} \
+        --port ${SERVER_PORT} \
+        --tensor-parallel-size ${TP_SIZE} \
+        --max-model-len ${max_length} \
+        --gpu-memory-utilization 0.8 \
+        --dtype bfloat16 \
+        --kv-cache-dtype ${KV_CACHE_DTYPE} \
+        --disable-log-requests \
+        > ${OUTPUT_DIR}/vllm_server.log 2>&1 &
+
+    VLLM_PID=$!
+    echo "vLLM server started with PID: ${VLLM_PID}"
+}
+
+# Function to wait for vLLM server to be ready
+wait_for_server() {
+    local max_retries=300
+    local retry_count=0
+
+    echo "Waiting for vLLM server to be ready..."
+    while [ $retry_count -lt $max_retries ]; do
+        if curl -s http://localhost:${SERVER_PORT}/health > /dev/null 2>&1; then
+            echo "vLLM server is ready!"
+            return 0
+        fi
+        retry_count=$((retry_count + 1))
+        echo "Waiting for server... (${retry_count}/${max_retries})"
+        sleep 5
+    done
+
+    echo "Error: vLLM server failed to start within expected time"
+    echo "Check ${OUTPUT_DIR}/vllm_server.log for details"
+    return 1
+}
+
+# Function to cleanup vLLM server on exit
+cleanup_server() {
+    echo "Shutting down vLLM server..."
+    kill $VLLM_PID 2>/dev/null || true
+    wait $VLLM_PID 2>/dev/null || true
+    echo "Server stopped"
+}
+
+
+
+# Function to run longbench evaluation via API
+run_longbench_eval() {
+    start_vllm_server
+
+    if ! wait_for_server; then
+        kill $VLLM_PID 2>/dev/null || true
+        exit 1
+    fi
+
+    # Setup cleanup trap
+    trap cleanup_server EXIT INT TERM
+
+    # Run LongBench evaluation
+    echo "Running LongBench evaluation against vLLM server..."
+    python -m long_bench_eval.cli \
+        --api-key dummy \
+        --base-url http://localhost:${SERVER_PORT}/v1 \
+        --model ${MODEL_PATH} \
+        --max-context-length ${max_ctx_length} \
+        --num-threads 512
+
+    echo "Evaluation completed! Results saved to ${OUTPUT_DIR}"
+}
+
+# Main evaluation logic
+if [[ "$TASK_NAME" == *"longbench"* ]]; then
+    echo "Running LongBench v2 evaluation..."
+    run_longbench_eval
+else
+    echo "Running standard lm-eval tasks..."
+    run_standard_eval
+fi
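For reference, the LongBench path added above can also be exercised by hand against an already-running OpenAI-compatible vLLM endpoint. The sketch below only reuses commands that appear in this patch (the `pip install` hint from requirements.txt and the `long_bench_eval.cli` invocation); `<model-path>` is a placeholder for a local checkpoint, the server is assumed to listen on port 8000, and 129024 matches the script's `131072 - 2048` context budget:

```bash
# Minimal manual smoke test of the LongBench evaluation flow.
# Assumes a vLLM server is already serving <model-path> on localhost:8000;
# run_evaluation.sh normally starts (and tears down) this server itself.
pip install git+https://github.com/yiliu30/long-bench-eval

python -m long_bench_eval.cli \
    --api-key dummy \
    --base-url http://localhost:8000/v1 \
    --model <model-path> \
    --max-context-length 129024 \
    --num-threads 512
```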