#!/usr/bin/env bash
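#
# Builds the modelserving container images: the llamacpp-server base image
# (intermediate only), the llamacpp-gemma3-12b-it head-node image, and the
# rpc-server worker images (CPU and CUDA variants).
#
# Behavior is controlled by optional environment variables:
#   IMAGE_PREFIX   - prefix prepended to the tags of the published images (default: "")
#   TAG            - image tag (default: "latest")
#   ARCHITECTURES  - comma-separated list of targets to build (default: "cpu,cuda")
#   BUILDX_ARGS    - extra flags passed to "docker buildx build" for the published images (default: "--load")
#
# Example invocation (registry prefix, tag, and script path are illustrative):
#   IMAGE_PREFIX=registry.example.com/myorg/ TAG=dev ARCHITECTURES=cpu ./build-images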
set -o errexit
set -o nounset
set -o pipefail
REPO_ROOT="$(git rev-parse --show-toplevel)"
SRC_DIR=${REPO_ROOT}/modelserving
cd "${SRC_DIR}"
if [[ -z "${IMAGE_PREFIX:-}" ]]; then
IMAGE_PREFIX=""
fi
echo "Building images with prefix ${IMAGE_PREFIX}"
if [[ -z "${TAG:-}" ]]; then
TAG=latest
fi
if [[ -z "${ARCHITECTURES:-}" ]]; then
ARCHITECTURES=cpu,cuda
fi
echo "Building for architectures: ${ARCHITECTURES}"
LLAMACPP_TAG=b4957
echo "Building llama.cpp version ${LLAMACPP_TAG}"
function build_for_architecture() {
  a=",${ARCHITECTURES:-},"
  if [[ "${a}" =~ ",${1}," ]]; then
    return 0
  fi
  return 1
}
if [[ -z "${BUILDX_ARGS:-}" ]]; then
BUILDX_ARGS="--load"
fi
dev/tasks/download-model
# Note: we do not push or load the "base" llama-server images (we do not pass BUILDX_ARGS),
# because they are only intermediate images (e.g. used as the base for the gemma3-12b-it image).
if build_for_architecture cpu; then
  docker buildx build \
    -f images/llamacpp-server/Dockerfile \
    --target llamacpp-server \
    -t llamacpp-server-cpu:${TAG} \
    --build-arg BASE_IMAGE=debian:latest \
    --build-arg BUILDER_IMAGE=debian:latest \
    --build-arg "CMAKE_ARGS=-DGGML_RPC=ON" \
    --progress=plain .
fi

# We're running distributed now, so the "worker" nodes need CUDA (rpc-server image), the "head" nodes do not (llamacpp-server image).
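# (At run time, the head node's llama-server can point at the worker rpc-servers via
# llama.cpp's --rpc flag, e.g. "--rpc worker-0:50052,worker-1:50052"; hostnames and port are illustrative.)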
# # -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined allows us to build in a container without all the CUDA libraries present
# # These flags mirror the flags in the llama.cpp github-action: https://github.com/ggml-org/llama.cpp/blob/master/.github/workflows/build.yml
# docker buildx build \
# -f images/llamacpp-server/Dockerfile \
# --target llamacpp-server \
# -t llamacpp-server-cuda:${TAG} \
# --build-arg BASE_IMAGE=nvidia/cuda:12.6.2-runtime-ubuntu24.04 \
# --build-arg BUILDER_IMAGE=nvidia/cuda:12.6.2-devel-ubuntu24.04 \
# --build-arg "CMAKE_ARGS=-DGGML_RPC=ON -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=all -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined" \
# --progress=plain .
# fi
# Build a head node that embeds gemma3
if build_for_architecture cpu; then
  docker buildx build ${BUILDX_ARGS} \
    -f images/llamacpp-gemma3-12b-it/Dockerfile \
    -t ${IMAGE_PREFIX}llamacpp-gemma3-12b-it-cpu:${TAG} \
    --build-arg BASE_IMAGE=llamacpp-server-cpu:${TAG} \
    --progress=plain .
fi

# Build a worker node that runs rpc-server with CPU support
if build_for_architecture cpu; then
  docker buildx build ${BUILDX_ARGS} \
    -f images/llamacpp-server/Dockerfile \
    --target rpc-server \
    -t ${IMAGE_PREFIX}rpc-server-cpu:${TAG} \
    --build-arg BASE_IMAGE=debian:latest \
    --build-arg BUILDER_IMAGE=debian:latest \
    --build-arg "CMAKE_ARGS=-DGGML_RPC=ON" \
    --progress=plain .
fi

# Build a worker node that runs rpc-server with CUDA support
if build_for_architecture cuda; then
  # -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined allows us to build in a container without all the CUDA libraries present
  # These flags mirror the flags in the llama.cpp github-action: https://github.com/ggml-org/llama.cpp/blob/master/.github/workflows/build.yml
  docker buildx build ${BUILDX_ARGS} \
    -f images/llamacpp-server/Dockerfile \
    --target rpc-server \
    -t ${IMAGE_PREFIX}rpc-server-cuda:${TAG} \
    --build-arg BASE_IMAGE=nvidia/cuda:12.6.2-runtime-ubuntu24.04 \
    --build-arg BUILDER_IMAGE=nvidia/cuda:12.6.2-devel-ubuntu24.04 \
    --build-arg "CMAKE_ARGS=-DGGML_RPC=ON -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=all -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined" \
    --progress=plain .
fi