#!/usr/bin/env bash
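#
# Builds the modelserving container images: the llamacpp-server base image
# (intermediate only), the llamacpp-gemma3-12b-it head-node image, and the
# rpc-server worker images (CPU and CUDA variants).
#
# Behavior is controlled by optional environment variables:
#   IMAGE_PREFIX   - prefix prepended to the tags of the published images (default: "")
#   TAG            - image tag (default: "latest")
#   ARCHITECTURES  - comma-separated list of targets to build (default: "cpu,cuda")
#   BUILDX_ARGS    - extra flags passed to "docker buildx build" for the published images (default: "--load")
#
# Example invocation (registry prefix, tag, and script path are illustrative):
#   IMAGE_PREFIX=registry.example.com/myorg/ TAG=dev ARCHITECTURES=cpu ./build-images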
set -o errexit
set -o nounset
set -o pipefail
REPO_ROOT="$(git rev-parse --show-toplevel)"
SRC_DIR=${REPO_ROOT}/modelserving
cd "${SRC_DIR}"
if [[ -z "${IMAGE_PREFIX:-}" ]]; then
IMAGE_PREFIX=""
fi
echo "Building images with prefix ${IMAGE_PREFIX}"
if [[ -z "${TAG:-}" ]]; then
TAG=latest
fi
if [[ -z "${ARCHITECTURES:-}" ]]; then
ARCHITECTURES=cpu,cuda
fi
echo "Building for architectures: ${ARCHITECTURES}"
LLAMACPP_TAG=b4957
echo "Building llama.cpp version ${LLAMACPP_TAG}"
function build_for_architecture() {
  a=",${ARCHITECTURES:-},"
  if [[ "${a}" =~ ",${1}," ]]; then
    return 0
  fi
  return 1
}
if [[ -z "${BUILDX_ARGS:-}" ]]; then
BUILDX_ARGS="--load"
fi
dev/tasks/download-model
# Note: we do not push or load the "base" llama-server images (we do not pass BUILDX_ARGS),
# because they are only intermediate images (e.g. used as the base for the gemma3-12b-it image).
if build_for_architecture cpu; then
  docker buildx build \
    -f images/llamacpp-server/Dockerfile \
    --target llamacpp-server \
    -t llamacpp-server-cpu:${TAG} \
    --build-arg BASE_IMAGE=debian:latest \
    --build-arg BUILDER_IMAGE=debian:latest \
    --build-arg "CMAKE_ARGS=-DGGML_RPC=ON" \
    --progress=plain .
fi

# We're running distributed now, so the "worker" nodes need CUDA (rpc-server image), the "head" nodes do not (llamacpp-server image).
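# (At run time, the head node's llama-server can point at the worker rpc-servers via
# llama.cpp's --rpc flag, e.g. "--rpc worker-0:50052,worker-1:50052"; hostnames and port are illustrative.)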
# # -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined allows us to build in a container without all the CUDA libraries present
# # These flags mirror the flags in the llama.cpp github-action: https://github.com/ggml-org/llama.cpp/blob/master/.github/workflows/build.yml
# docker buildx build \
# -f images/llamacpp-server/Dockerfile \
# --target llamacpp-server \
# -t llamacpp-server-cuda:${TAG} \
# --build-arg BASE_IMAGE=nvidia/cuda:12.6.2-runtime-ubuntu24.04 \
# --build-arg BUILDER_IMAGE=nvidia/cuda:12.6.2-devel-ubuntu24.04 \
# --build-arg "CMAKE_ARGS=-DGGML_RPC=ON -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=all -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined" \
# --progress=plain .
# fi
# Build a head node that embeds gemma3
if build_for_architecture cpu; then
  docker buildx build ${BUILDX_ARGS} \
    -f images/llamacpp-gemma3-12b-it/Dockerfile \
    -t ${IMAGE_PREFIX}llamacpp-gemma3-12b-it-cpu:${TAG} \
    --build-arg BASE_IMAGE=llamacpp-server-cpu:${TAG} \
    --progress=plain .
fi

# Build a worker node that runs rpc-server with CPU support
if build_for_architecture cpu; then
  docker buildx build ${BUILDX_ARGS} \
    -f images/llamacpp-server/Dockerfile \
    --target rpc-server \
    -t ${IMAGE_PREFIX}rpc-server-cpu:${TAG} \
    --build-arg BASE_IMAGE=debian:latest \
    --build-arg BUILDER_IMAGE=debian:latest \
    --build-arg "CMAKE_ARGS=-DGGML_RPC=ON" \
    --progress=plain .
fi

# Build a worker node that runs rpc-server with CUDA support
if build_for_architecture cuda; then
  # -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined allows us to build in a container without all the CUDA libraries present
  # These flags mirror the flags in the llama.cpp github-action: https://github.com/ggml-org/llama.cpp/blob/master/.github/workflows/build.yml
  docker buildx build ${BUILDX_ARGS} \
    -f images/llamacpp-server/Dockerfile \
    --target rpc-server \
    -t ${IMAGE_PREFIX}rpc-server-cuda:${TAG} \
    --build-arg BASE_IMAGE=nvidia/cuda:12.6.2-runtime-ubuntu24.04 \
    --build-arg BUILDER_IMAGE=nvidia/cuda:12.6.2-devel-ubuntu24.04 \
    --build-arg "CMAKE_ARGS=-DGGML_RPC=ON -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=all -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined" \
    --progress=plain .
fi