Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 40 additions & 5 deletions .github/workflows/build-swebench-images.yml
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,33 @@ jobs:
run: |
make build

- name: "Preflight: prune cache and verify BuildKit disk"
run: |
set -euo pipefail
# Amount of BuildKit cache (GiB) to retain after pruning.
KEEP_GB=120
echo "Pruning BuildKit cache (keep ${KEEP_GB} GiB, filter unused-for=12h)..."
# '|| true' keeps the prune best-effort: a failed prune must not fail the job.
docker buildx prune --all --force --keep-storage ${KEEP_GB}g --filter unused-for=12h || true

# Only enforce the free-space check when /var/lib/buildkit exists on this runner;
# df -B1 reports byte-granularity figures for the filesystem backing that path.
if df -B1 /var/lib/buildkit > /tmp/buildkit_df 2>/dev/null; then
# Last line of df output holds the totals: total / used / available.
LINE=$(tail -n1 /tmp/buildkit_df)
TOTAL=$(echo "$LINE" | awk '{print $2}')
USED=$(echo "$LINE" | awk '{print $3}')
FREE=$(echo "$LINE" | awk '{print $4}')
if [ -n "$TOTAL" ] && [ -n "$FREE" ]; then
PCT=$(( 100 * USED / TOTAL ))
echo "BuildKit disk: used ${USED} / ${TOTAL} bytes (${PCT}%); free ${FREE} bytes"
# Require at least 75 GiB free before starting builds, else fail fast
# with a GitHub Actions error annotation instead of dying mid-build.
MIN=$((75 * 1024 * 1024 * 1024))
if [ "$FREE" -lt "$MIN" ]; then
echo "::error::Not enough free space on /var/lib/buildkit (${FREE} bytes free, need >= ${MIN})"
exit 1
fi
else
echo "Warning: unable to parse df output for /var/lib/buildkit"
fi
else
echo "Warning: /var/lib/buildkit not found; skipping disk check"
fi

- name: Build and push SWE-Bench images
run: |
set -euo pipefail
Expand Down Expand Up @@ -270,14 +297,22 @@ jobs:
if [ "$FAILURES" -gt 0 ]; then
echo "### Failed Builds" >> "$GITHUB_STEP_SUMMARY"
echo "" >> "$GITHUB_STEP_SUMMARY"
cat $MANIFEST_FILES | python -c "
import sys
cat $MANIFEST_FILES | python3 - <<'PY' >> "$GITHUB_STEP_SUMMARY"
import json
import sys

for line in sys.stdin:
data = json.loads(line.strip())
if data.get('error') is not None or len(data.get('tags', [])) == 0:
print(f\"- \\\`{data.get('base_image', 'unknown')}\\\`: {data.get('error', 'No tags generated')}\")
" >> "$GITHUB_STEP_SUMMARY"
if data.get("error") or not data.get("tags"):
base = data.get("base_image", "unknown")
err = data.get("error") or "No tags generated"
print(f"- `{base}`: {err}")
PY
fi

if [ "$FAILURES" -gt 0 ]; then
echo "::error::Detected $FAILURES failed or missing agent-server images out of $TOTAL"
exit 1
fi

- name: Comment on tracker issue
Expand Down
180 changes: 112 additions & 68 deletions benchmarks/utils/build_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import argparse
import contextlib
import io
import os
import subprocess
import sys
import time
Expand All @@ -20,9 +21,12 @@
from tqdm.auto import tqdm

from benchmarks.utils.args_parser import get_parser
from benchmarks.utils.buildx_utils import maybe_reset_buildkit
from benchmarks.utils.buildx_utils import (
buildkit_disk_usage,
maybe_prune_buildkit_cache,
maybe_reset_buildkit,
)
from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE
from benchmarks.utils.image_utils import image_exists
from openhands.agent_server.docker.build import BuildOptions, TargetType, build
from openhands.sdk import get_logger

Expand Down Expand Up @@ -293,11 +297,6 @@ def build_image(
git_sha=git_sha,
sdk_version=sdk_version,
)
for t in opts.all_tags:
# Check if image exists or not
if image_exists(t):
logger.info("Image %s already exists. Skipping build.", t)
return BuildOutput(base_image=base_image, tags=[t], error=None)
tags = build(opts)
return BuildOutput(base_image=base_image, tags=tags, error=None)

Expand Down Expand Up @@ -443,9 +442,24 @@ def build_all_images(

successes = 0
failures = 0
in_progress: set[str] = set()
mu = Lock()

# Batch/prune settings (tunable via env to control disk usage on sticky runners)
batch_size = int(os.getenv("BUILD_BATCH_SIZE", "25"))
prune_keep_storage_gb = int(os.getenv("BUILDKIT_PRUNE_KEEP_GB", "120"))
prune_threshold_pct = float(os.getenv("BUILDKIT_PRUNE_THRESHOLD_PCT", "70"))
prune_filters: list[str] | None = ["unused-for=12h"]

def _chunks(seq: list[str], size: int):
if size <= 0:
yield seq
return
for i in range(0, len(seq), size):
yield seq[i : i + size]

batches = list(_chunks(base_images, batch_size or len(base_images)))
total_batches = len(batches)

with (
manifest_file.open("w") as writer,
tqdm(
Expand All @@ -454,78 +468,108 @@ def build_all_images(
):
_update_pbar(pbar, successes, failures, 0, None, "Queueing")

# Single unified path: ProcessPoolExecutor( max_workers = args.max_workers ),
# even if it's 1. Using processes instead of threads ensures proper isolation
# of stdout/stderr and logging handlers, preventing output mixing between builds.
with ProcessPoolExecutor(max_workers=max_workers) as ex:
futures = {}
for base in base_images:
in_progress.add(base)
# Resolve custom tags before scheduling to avoid pickling issues with closures.
resolved_tag = (
base_image_to_custom_tag_fn(base)
if base_image_to_custom_tag_fn
else ""
)
fut = ex.submit(
_build_with_logging,
log_dir=build_log_dir,
base_image=base,
target_image=image,
custom_tag=resolved_tag,
target=target,
push=push,
max_retries=max_retries,
post_build_fn=post_build_fn,
)
futures[fut] = base

_update_pbar(
pbar,
successes,
failures,
len(in_progress),
next(iter(in_progress), None),
"Running",
)
for batch_idx, batch in enumerate(batches, start=1):
if not batch:
continue

for fut in as_completed(futures):
base = futures[fut]
try:
result: BuildOutput = fut.result()
writer.write(result.model_dump_json() + "\n")
writer.flush()
with mu:
successes += 1
_update_pbar(
pbar, successes, failures, len(in_progress), base, "✅ Done"
logger.info(
"Starting batch %d/%d (%d images)", batch_idx, total_batches, len(batch)
)
in_progress: set[str] = set()

with ProcessPoolExecutor(max_workers=max_workers) as ex:
futures = {}
for base in batch:
in_progress.add(base)
resolved_tag = (
base_image_to_custom_tag_fn(base)
if base_image_to_custom_tag_fn
else ""
)
except Exception as e:
logger.error("Build failed for %s: %r", base, e)
# Write a failure line to manifest; keep going.
writer.write(
BuildOutput(
base_image=base, tags=[], error=repr(e)
).model_dump_json()
+ "\n"
fut = ex.submit(
_build_with_logging,
log_dir=build_log_dir,
base_image=base,
target_image=image,
custom_tag=resolved_tag,
target=target,
push=push,
max_retries=max_retries,
post_build_fn=post_build_fn,
)
futures[fut] = base

_update_pbar(
pbar,
successes,
failures,
len(in_progress),
next(iter(in_progress), None),
f"Batch {batch_idx}/{total_batches} running",
)

for fut in as_completed(futures):
base = futures[fut]
status = None
try:
result: BuildOutput = fut.result()
except Exception as e:
logger.error("Build failed for %s: %r", base, e)
result = BuildOutput(base_image=base, tags=[], error=repr(e))

writer.write(result.model_dump_json() + "\n")
writer.flush()

with mu:
failures += 1
_update_pbar(
pbar, successes, failures, len(in_progress), base, "❌ Failed"
)
finally:
with mu:
in_progress.discard(base)
if result.error or not result.tags:
failures += 1
status = "❌ Failed"
else:
successes += 1
status = "✅ Done"

in_progress.discard(base)
pbar.update(1)
_update_pbar(
pbar,
successes,
failures,
len(in_progress),
next(iter(in_progress), None),
None,
status,
)

used, total = buildkit_disk_usage()
if total > 0:
logger.info(
"BuildKit usage after batch %d/%d: %.2f%% (%0.2f GiB / %0.2f GiB)",
batch_idx,
total_batches,
(used / total) * 100,
used / (1 << 30),
total / (1 << 30),
)

if prune_keep_storage_gb and prune_keep_storage_gb > 0:
pruned = maybe_prune_buildkit_cache(
keep_storage_gb=prune_keep_storage_gb,
threshold_pct=prune_threshold_pct,
filters=prune_filters,
)
if pruned:
logger.info(
"Pruned BuildKit cache after batch %d/%d (keep=%d GiB, threshold=%.1f%%)",
batch_idx,
total_batches,
prune_keep_storage_gb,
prune_threshold_pct,
)
else:
logger.info(
"No prune needed after batch %d/%d (threshold %.1f%%)",
batch_idx,
total_batches,
prune_threshold_pct,
)
logger.info(
"Done. Built=%d Failed=%d Manifest=%s",
Expand Down
79 changes: 79 additions & 0 deletions benchmarks/utils/buildx_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import json
import os
import re
import shutil
import subprocess
import time
from pathlib import Path
Expand Down Expand Up @@ -143,3 +144,81 @@ def maybe_reset_buildkit(
reset_buildkit("partial", base_image, target_image)
else:
reset_buildkit("full", base_image, target_image)


def buildkit_disk_usage(root: str | Path = "/var/lib/buildkit") -> tuple[int, int]:
"""
Return (used_bytes, total_bytes) for the BuildKit root. Missing path -> (0, 0).
"""
path = Path(root)
try:
usage = shutil.disk_usage(path)
return usage.used, usage.total
except FileNotFoundError:
logger.warning("BuildKit root %s not found when checking disk usage", path)
except Exception as e:
logger.warning("Unable to read disk usage for %s: %s", path, e)
return 0, 0


def prune_buildkit_cache(
    keep_storage_gb: int | None = None,
    filters: list[str] | None = None,
) -> None:
    """
    Invoke ``docker buildx prune`` to reclaim BuildKit cache space.

    Args:
        keep_storage_gb: Cache size (GiB) to retain. ``None`` or non-positive
            values omit ``--keep-storage`` and keep docker's default behavior.
        filters: Optional values forwarded as ``--filter`` arguments.

    Raises:
        RuntimeError: If the prune command exits non-zero; the message carries
            stderr (preferred), stdout, or the exit code as a last resort.
    """
    command = ["docker", "buildx", "prune", "--all", "--force"]
    if keep_storage_gb is not None and keep_storage_gb > 0:
        command.extend(["--keep-storage", f"{keep_storage_gb}g"])
    for flt in filters or []:
        command.extend(["--filter", flt])

    logger.info("Pruning BuildKit cache: %s", " ".join(command))
    proc = subprocess.run(command, text=True, capture_output=True)
    # Surface the tool's own output at appropriate levels before deciding
    # whether to raise.
    if proc.stdout:
        logger.info(proc.stdout.strip())
    if proc.stderr:
        logger.warning(proc.stderr.strip())
    if proc.returncode != 0:
        message = (
            proc.stderr.strip()
            or proc.stdout.strip()
            or f"docker buildx prune failed with exit code {proc.returncode}"
        )
        raise RuntimeError(message)


def maybe_prune_buildkit_cache(
    keep_storage_gb: int,
    threshold_pct: float,
    filters: list[str] | None = None,
    root: str | Path = "/var/lib/buildkit",
) -> bool:
    """
    Prune the BuildKit cache only when disk pressure warrants it.

    A prune is attempted when usage of the filesystem under ``root`` reaches
    ``threshold_pct`` (0-100). Prune failures are logged, never raised.

    Returns:
        True if a prune was attempted and succeeded, False otherwise
        (below threshold, usage unknown, or prune failed).
    """
    used, total = buildkit_disk_usage(root)
    # Unknown usage (missing root, stat failure) -> nothing sensible to do.
    if total <= 0:
        logger.warning("Skipping BuildKit prune; unable to determine disk usage.")
        return False

    usage_pct = (used / total) * 100
    logger.info(
        "BuildKit disk usage: %.2f%% (%0.2f GiB used / %0.2f GiB total)",
        usage_pct,
        used / (1 << 30),
        total / (1 << 30),
    )
    if usage_pct < threshold_pct:
        return False

    # Best-effort: a prune failure should never abort the surrounding build loop.
    try:
        prune_buildkit_cache(keep_storage_gb=keep_storage_gb, filters=filters)
    except Exception as e:
        logger.warning("Failed to prune BuildKit cache: %s", e)
        return False
    return True
Loading