serve-72b.yaml
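# Serve Qwen2-72B-Instruct with vLLM via SkyServe: two replicas behind a
# single load-balanced, OpenAI-compatible endpoint.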
envs:
  MODEL_NAME: Qwen/Qwen2-72B-Instruct
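
# SkyServe settings: the controller POSTs `post_data` to `path` on each replica
# and treats the replica as ready once it responds successfully;
# `initial_delay_seconds` gives each replica time to provision and load the
# 72B weights before probe failures count against it.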
service:
  # Specifying the path to the endpoint to check the readiness of the replicas.
  readiness_probe:
    path: /v1/chat/completions
    post_data:
      model: $MODEL_NAME
      messages:
        - role: user
          content: Hello! What is your name?
      max_tokens: 1
    initial_delay_seconds: 1200
  # How many replicas to manage.
  replicas: 2
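
# Per-replica resources: any one of the accelerator options satisfies the
# request (8x A100, 4x A100-80GB, or 8x A100-80GB). disk_size and memory are
# in GB; port 8000 is opened for the vLLM server started in `run` below.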
resources:
  accelerators: {A100:8, A100-80GB:4, A100-80GB:8}
  disk_size: 1024
  disk_tier: best
  memory: 32+
  ports: 8000
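
# setup runs once when a replica is provisioned: create the conda env on first
# use (when activation fails) and install pinned vLLM and FlashAttention.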
setup: |
  conda activate qwen
  if [ $? -ne 0 ]; then
    conda create -n qwen python=3.10 -y
    conda activate qwen
  fi
  pip install vllm==0.4.2
  pip install flash-attn==2.5.9.post1
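
# run starts the OpenAI-compatible vLLM server, sharding the model across all
# GPUs on the node; $SKYPILOT_NUM_GPUS_PER_NODE is injected by SkyPilot at runtime.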
run: |
  conda activate qwen
  export PATH=$PATH:/sbin
  python -u -m vllm.entrypoints.openai.api_server \
    --host 0.0.0.0 \
    --model $MODEL_NAME \
    --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
    --max-num-seqs 16 | tee ~/openai_api_server.log
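
# A typical workflow (a sketch; the service name `qwen` is illustrative):
#   sky serve up serve-72b.yaml -n qwen    # deploy the service and its replicas
#   sky serve status qwen                  # wait for replicas to become READY
#   sky serve down qwen                    # tear down the service when finished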