Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 15 additions & 4 deletions .github/actions/aicr-build/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,15 +28,22 @@ runs:
- name: Build snapshot agent image and load into kind
shell: bash
run: |
# Build snapshot agent image with CUDA runtime (provides nvidia-smi for GPU detection).
# Build snapshot agent image with CUDA base (provides nvidia-smi for GPU detection).
# Uses cuda:base (~250MB) instead of cuda:runtime (~1.8GB) — only nvidia-smi is needed.
# GPU test workflows use --image=ko.local:smoke-test for aicr snapshot.
CGO_ENABLED=0 go build -trimpath -o dist/aicr ./cmd/aicr
docker build -t ko.local:smoke-test -f - . <<'DOCKERFILE'
FROM nvcr.io/nvidia/cuda:13.1.0-runtime-ubuntu24.04
FROM nvcr.io/nvidia/cuda:13.1.0-base-ubuntu24.04
COPY dist/aicr /usr/local/bin/aicr
ENTRYPOINT ["/usr/local/bin/aicr"]
DOCKERFILE
kind load docker-image ko.local:smoke-test --name "${KIND_CLUSTER_NAME}"

# Load onto all nodes (validator Jobs tolerate all taints and may schedule on
# control-plane). The cuda:base image is ~250MB so 3-node loads are fast.
timeout 300 kind load docker-image ko.local:smoke-test --name "${KIND_CLUSTER_NAME}" || {
echo "::warning::kind load attempt 1 failed for ko.local:smoke-test, retrying..."
timeout 300 kind load docker-image ko.local:smoke-test --name "${KIND_CLUSTER_NAME}"
}

- name: Build validator images and load into kind
shell: bash
Expand All @@ -48,6 +55,7 @@ runs:
CGO_ENABLED=0 go build -trimpath -o dist/validator/deployment ./validators/deployment
CGO_ENABLED=0 go build -trimpath -o dist/validator/performance ./validators/performance
CGO_ENABLED=0 go build -trimpath -o dist/validator/conformance ./validators/conformance

for phase in deployment performance conformance; do
mkdir -p "validators/${phase}/testdata"
docker build -t "ko.local/aicr-validators/${phase}:latest" -f - . <<DOCKERFILE
Expand All @@ -58,7 +66,10 @@ runs:
USER nonroot
ENTRYPOINT ["/${phase}"]
DOCKERFILE
kind load docker-image "ko.local/aicr-validators/${phase}:latest" --name "${KIND_CLUSTER_NAME}"
timeout 300 kind load docker-image "ko.local/aicr-validators/${phase}:latest" --name "${KIND_CLUSTER_NAME}" || {
echo "::warning::kind load attempt 1 failed for ko.local/aicr-validators/${phase}:latest, retrying..."
timeout 300 kind load docker-image "ko.local/aicr-validators/${phase}:latest" --name "${KIND_CLUSTER_NAME}"
}
done

- name: Build aicr binary
Expand Down
17 changes: 13 additions & 4 deletions pkg/validator/job/deployer.go
Original file line number Diff line number Diff line change
Expand Up @@ -252,11 +252,20 @@ func serializeTolerations(tols []corev1.Toleration) string {
return strings.Join(parts, ",")
}

// imagePullPolicy returns Always when the image uses :latest tag (dev builds),
// PullIfNotPresent otherwise. This ensures dev builds always pull fresh images
// and avoids exec format errors from stale cached images on cluster nodes.
// imagePullPolicy returns the appropriate pull policy based on the image reference.
// Side-loaded images (ko.local, kind.local) use Never since they are loaded
// via `kind load docker-image` and no registry exists to pull from.
// All other images (including localhost registry) follow the standard policy:
// a reference ending in :latest uses Always to ensure fresh images; any other reference uses IfNotPresent.
func (d *Deployer) imagePullPolicy() corev1.PullPolicy {
if strings.HasSuffix(d.entry.Image, ":latest") {
img := d.entry.Image
// Side-loaded images via kind load — no registry exists, never pull.
if strings.HasPrefix(img, "ko.local") ||
strings.HasPrefix(img, "kind.local") {

return corev1.PullNever
}
if strings.HasSuffix(img, ":latest") {
return corev1.PullAlways
}
return corev1.PullIfNotPresent
Expand Down
4 changes: 4 additions & 0 deletions pkg/validator/job/deployer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -590,6 +590,10 @@ func TestImagePullPolicy(t *testing.T) {
}{
{"latest tag uses Always", "ghcr.io/nvidia/aicr-validators/conformance:latest", corev1.PullAlways},
{"versioned tag uses IfNotPresent", "ghcr.io/nvidia/aicr-validators/conformance:v1.0.0", corev1.PullIfNotPresent},
{"ko.local uses Never", "ko.local/aicr-validators/conformance:latest", corev1.PullNever},
{"kind.local uses Never", "kind.local/aicr-validators/conformance:latest", corev1.PullNever},
{"localhost registry with latest uses Always", "localhost:5001/aicr-validators/conformance:latest", corev1.PullAlways},
{"localhost registry versioned uses IfNotPresent", "localhost:5001/aicr-validators/conformance:v1.0.0", corev1.PullIfNotPresent},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
Expand Down
Loading