# cog.yaml

# Configuration for Cog ⚙️
# Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md

# Image build settings: GPU/CUDA base, Python version, pip requirements file,
# and extra shell commands executed in order during the build.
build:
  gpu: true
  cuda: "12.1"

  python_version: "3.11.9"

  python_requirements: requirements.txt

  run:
    # Install the pinned vLLM release, ignoring any already-installed copies
    # (--ignore-installed). TORCH_CUDA_ARCH_LIST and CUDA_HOME are set only
    # for this pip invocation; pip's cache directory is a build cache mount.
    - --mount=type=cache,target=/root/.cache/pip TORCH_CUDA_ARCH_LIST="8.0;8.6;9.0" CUDA_HOME=/usr/local/cuda pip install --ignore-installed vllm==0.5.3.post1
    # Pin the cog Python package to a specific alpha pre-release.
    - --mount=type=cache,target=/root/.cache/pip pip install cog==0.10.0a18
    # Download the pget v0.8.2 binary and make it executable. -f makes curl
    # exit non-zero on an HTTP error, so a failed download stops the build
    # instead of saving the error page as /usr/local/bin/pget.
    - curl -fL -o /usr/local/bin/pget "https://github.com/replicate/pget/releases/download/v0.8.2/pget_linux_x86_64" && chmod +x /usr/local/bin/pget
    # Comment out the QuantParamSchema import in vLLM's weight_utils.py.
    # The path hard-codes the pyenv prefix for Python 3.11.9, so it must be
    # updated together with python_version above.
    # NOTE(review): the reason for this patch is not recorded here — confirm
    # it is still needed whenever the vllm pin changes.
    - sed -i "s/from vllm.model_executor.layers.quantization.schema import QuantParamSchema/# from vllm.model_executor.layers.quantization.schema import QuantParamSchema/" /root/.pyenv/versions/3.11.9/lib/python3.11/site-packages/vllm/model_executor/model_loader/weight_utils.py

# Entry points, written as "<file>:<object>": Predictor in predict.py for
# predictions, train in train.py for training.
predict: "predict.py:Predictor"
train: "train.py:train"

# Concurrency setting for the model. max is presumably the upper bound on
# predictions processed at the same time — confirm against the documentation
# for the cog release pinned in build.run.
concurrency:
  max: 32