Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 40 additions & 5 deletions .github/workflows/build-swebench-images.yml
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,33 @@ jobs:
run: |
make build

- name: "Preflight: prune cache and verify BuildKit disk"
run: |
set -euo pipefail
# Amount of BuildKit cache (GiB) to retain after pruning.
KEEP_GB=120
echo "Pruning BuildKit cache (keep ${KEEP_GB} GiB, filter unused-for=12h)..."
# '|| true' keeps the prune best-effort: a failed prune must not fail the job.
docker buildx prune --all --force --keep-storage ${KEEP_GB}g --filter unused-for=12h || true

# Only enforce the free-space check when /var/lib/buildkit exists on this runner;
# df -B1 reports byte-granularity figures for the filesystem backing that path.
if df -B1 /var/lib/buildkit > /tmp/buildkit_df 2>/dev/null; then
# Last line of df output holds the totals: total / used / available.
LINE=$(tail -n1 /tmp/buildkit_df)
TOTAL=$(echo "$LINE" | awk '{print $2}')
USED=$(echo "$LINE" | awk '{print $3}')
FREE=$(echo "$LINE" | awk '{print $4}')
if [ -n "$TOTAL" ] && [ -n "$FREE" ]; then
PCT=$(( 100 * USED / TOTAL ))
echo "BuildKit disk: used ${USED} / ${TOTAL} bytes (${PCT}%); free ${FREE} bytes"
# Require at least 75 GiB free before starting builds, else fail fast
# with a GitHub Actions error annotation instead of dying mid-build.
MIN=$((75 * 1024 * 1024 * 1024))
if [ "$FREE" -lt "$MIN" ]; then
echo "::error::Not enough free space on /var/lib/buildkit (${FREE} bytes free, need >= ${MIN})"
exit 1
fi
else
echo "Warning: unable to parse df output for /var/lib/buildkit"
fi
else
echo "Warning: /var/lib/buildkit not found; skipping disk check"
fi

- name: Build and push SWE-Bench images
run: |
set -euo pipefail
Expand Down Expand Up @@ -270,14 +297,22 @@ jobs:
if [ "$FAILURES" -gt 0 ]; then
echo "### Failed Builds" >> "$GITHUB_STEP_SUMMARY"
echo "" >> "$GITHUB_STEP_SUMMARY"
cat $MANIFEST_FILES | python -c "
import sys
cat $MANIFEST_FILES | python3 - <<'PY' >> "$GITHUB_STEP_SUMMARY"
import json
import sys

for line in sys.stdin:
data = json.loads(line.strip())
if data.get('error') is not None or len(data.get('tags', [])) == 0:
print(f\"- \\\`{data.get('base_image', 'unknown')}\\\`: {data.get('error', 'No tags generated')}\")
" >> "$GITHUB_STEP_SUMMARY"
if data.get("error") or not data.get("tags"):
base = data.get("base_image", "unknown")
err = data.get("error") or "No tags generated"
print(f"- `{base}`: {err}")
PY
fi

if [ "$FAILURES" -gt 0 ]; then
echo "::error::Detected $FAILURES failed or missing agent-server images out of $TOTAL"
exit 1
fi

- name: Comment on tracker issue
Expand Down
180 changes: 112 additions & 68 deletions benchmarks/utils/build_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import argparse
import contextlib
import io
import os
import subprocess
import sys
import time
Expand All @@ -20,9 +21,12 @@
from tqdm.auto import tqdm

from benchmarks.utils.args_parser import get_parser
from benchmarks.utils.buildx_utils import maybe_reset_buildkit
from benchmarks.utils.buildx_utils import (
buildkit_disk_usage,
maybe_prune_buildkit_cache,
maybe_reset_buildkit,
)
from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE
from benchmarks.utils.image_utils import image_exists
from openhands.agent_server.docker.build import BuildOptions, TargetType, build
from openhands.sdk import get_logger

Expand Down Expand Up @@ -293,11 +297,6 @@ def build_image(
git_sha=git_sha,
sdk_version=sdk_version,
)
for t in opts.all_tags:
# Check if image exists or not
if image_exists(t):
logger.info("Image %s already exists. Skipping build.", t)
return BuildOutput(base_image=base_image, tags=[t], error=None)
tags = build(opts)
return BuildOutput(base_image=base_image, tags=tags, error=None)

Expand Down Expand Up @@ -443,9 +442,24 @@ def build_all_images(

successes = 0
failures = 0
in_progress: set[str] = set()
mu = Lock()

# Batch/prune settings (tunable via env to control disk usage on sticky runners)
batch_size = int(os.getenv("BUILD_BATCH_SIZE", "25"))
prune_keep_storage_gb = int(os.getenv("BUILDKIT_PRUNE_KEEP_GB", "120"))
prune_threshold_pct = float(os.getenv("BUILDKIT_PRUNE_THRESHOLD_PCT", "70"))
prune_filters: list[str] | None = ["unused-for=12h"]

def _chunks(seq: list[str], size: int):
if size <= 0:
yield seq
return
for i in range(0, len(seq), size):
yield seq[i : i + size]

batches = list(_chunks(base_images, batch_size or len(base_images)))
total_batches = len(batches)

with (
manifest_file.open("w") as writer,
tqdm(
Expand All @@ -454,78 +468,108 @@ def build_all_images(
):
_update_pbar(pbar, successes, failures, 0, None, "Queueing")

# Single unified path: ProcessPoolExecutor( max_workers = args.max_workers ),
# even if it's 1. Using processes instead of threads ensures proper isolation
# of stdout/stderr and logging handlers, preventing output mixing between builds.
with ProcessPoolExecutor(max_workers=max_workers) as ex:
futures = {}
for base in base_images:
in_progress.add(base)
# Resolve custom tags before scheduling to avoid pickling issues with closures.
resolved_tag = (
base_image_to_custom_tag_fn(base)
if base_image_to_custom_tag_fn
else ""
)
fut = ex.submit(
_build_with_logging,
log_dir=build_log_dir,
base_image=base,
target_image=image,
custom_tag=resolved_tag,
target=target,
push=push,
max_retries=max_retries,
post_build_fn=post_build_fn,
)
futures[fut] = base

_update_pbar(
pbar,
successes,
failures,
len(in_progress),
next(iter(in_progress), None),
"Running",
)
for batch_idx, batch in enumerate(batches, start=1):
if not batch:
continue

for fut in as_completed(futures):
base = futures[fut]
try:
result: BuildOutput = fut.result()
writer.write(result.model_dump_json() + "\n")
writer.flush()
with mu:
successes += 1
_update_pbar(
pbar, successes, failures, len(in_progress), base, "✅ Done"
logger.info(
"Starting batch %d/%d (%d images)", batch_idx, total_batches, len(batch)
)
in_progress: set[str] = set()

with ProcessPoolExecutor(max_workers=max_workers) as ex:
futures = {}
for base in batch:
in_progress.add(base)
resolved_tag = (
base_image_to_custom_tag_fn(base)
if base_image_to_custom_tag_fn
else ""
)
except Exception as e:
logger.error("Build failed for %s: %r", base, e)
# Write a failure line to manifest; keep going.
writer.write(
BuildOutput(
base_image=base, tags=[], error=repr(e)
).model_dump_json()
+ "\n"
fut = ex.submit(
_build_with_logging,
log_dir=build_log_dir,
base_image=base,
target_image=image,
custom_tag=resolved_tag,
target=target,
push=push,
max_retries=max_retries,
post_build_fn=post_build_fn,
)
futures[fut] = base

_update_pbar(
pbar,
successes,
failures,
len(in_progress),
next(iter(in_progress), None),
f"Batch {batch_idx}/{total_batches} running",
)

for fut in as_completed(futures):
base = futures[fut]
status = None
try:
result: BuildOutput = fut.result()
except Exception as e:
logger.error("Build failed for %s: %r", base, e)
result = BuildOutput(base_image=base, tags=[], error=repr(e))

writer.write(result.model_dump_json() + "\n")
writer.flush()

with mu:
failures += 1
_update_pbar(
pbar, successes, failures, len(in_progress), base, "❌ Failed"
)
finally:
with mu:
in_progress.discard(base)
if result.error or not result.tags:
failures += 1
status = "❌ Failed"
else:
successes += 1
status = "✅ Done"

in_progress.discard(base)
pbar.update(1)
_update_pbar(
pbar,
successes,
failures,
len(in_progress),
next(iter(in_progress), None),
None,
status,
)

used, total = buildkit_disk_usage()
if total > 0:
logger.info(
"BuildKit usage after batch %d/%d: %.2f%% (%0.2f GiB / %0.2f GiB)",
batch_idx,
total_batches,
(used / total) * 100,
used / (1 << 30),
total / (1 << 30),
)

if prune_keep_storage_gb and prune_keep_storage_gb > 0:
pruned = maybe_prune_buildkit_cache(
keep_storage_gb=prune_keep_storage_gb,
threshold_pct=prune_threshold_pct,
filters=prune_filters,
)
if pruned:
logger.info(
"Pruned BuildKit cache after batch %d/%d (keep=%d GiB, threshold=%.1f%%)",
batch_idx,
total_batches,
prune_keep_storage_gb,
prune_threshold_pct,
)
else:
logger.info(
"No prune needed after batch %d/%d (threshold %.1f%%)",
batch_idx,
total_batches,
prune_threshold_pct,
)
logger.info(
"Done. Built=%d Failed=%d Manifest=%s",
Expand Down
79 changes: 79 additions & 0 deletions benchmarks/utils/buildx_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import json
import os
import re
import shutil
import subprocess
import time
from pathlib import Path
Expand Down Expand Up @@ -143,3 +144,81 @@ def maybe_reset_buildkit(
reset_buildkit("partial", base_image, target_image)
else:
reset_buildkit("full", base_image, target_image)


def buildkit_disk_usage(root: str | Path = "/var/lib/buildkit") -> tuple[int, int]:
"""
Return (used_bytes, total_bytes) for the BuildKit root. Missing path -> (0, 0).
"""
path = Path(root)
try:
usage = shutil.disk_usage(path)
return usage.used, usage.total
except FileNotFoundError:
logger.warning("BuildKit root %s not found when checking disk usage", path)
except Exception as e:
logger.warning("Unable to read disk usage for %s: %s", path, e)
return 0, 0


def prune_buildkit_cache(
    keep_storage_gb: int | None = None,
    filters: list[str] | None = None,
) -> None:
    """
    Invoke ``docker buildx prune`` to reclaim BuildKit cache space.

    Args:
        keep_storage_gb: Cache size (GiB) to retain. ``None`` or non-positive
            values omit ``--keep-storage`` and keep docker's default behavior.
        filters: Optional values forwarded as ``--filter`` arguments.

    Raises:
        RuntimeError: If the prune command exits non-zero; the message carries
            stderr (preferred), stdout, or the exit code as a last resort.
    """
    command = ["docker", "buildx", "prune", "--all", "--force"]
    if keep_storage_gb is not None and keep_storage_gb > 0:
        command.extend(["--keep-storage", f"{keep_storage_gb}g"])
    for flt in filters or []:
        command.extend(["--filter", flt])

    logger.info("Pruning BuildKit cache: %s", " ".join(command))
    proc = subprocess.run(command, text=True, capture_output=True)
    # Surface the tool's own output at appropriate levels before deciding
    # whether to raise.
    if proc.stdout:
        logger.info(proc.stdout.strip())
    if proc.stderr:
        logger.warning(proc.stderr.strip())
    if proc.returncode != 0:
        message = (
            proc.stderr.strip()
            or proc.stdout.strip()
            or f"docker buildx prune failed with exit code {proc.returncode}"
        )
        raise RuntimeError(message)


def maybe_prune_buildkit_cache(
    keep_storage_gb: int,
    threshold_pct: float,
    filters: list[str] | None = None,
    root: str | Path = "/var/lib/buildkit",
) -> bool:
    """
    Prune the BuildKit cache only when disk pressure warrants it.

    A prune is attempted when usage of the filesystem under ``root`` reaches
    ``threshold_pct`` (0-100). Prune failures are logged, never raised.

    Returns:
        True if a prune was attempted and succeeded, False otherwise
        (below threshold, usage unknown, or prune failed).
    """
    used, total = buildkit_disk_usage(root)
    # Unknown usage (missing root, stat failure) -> nothing sensible to do.
    if total <= 0:
        logger.warning("Skipping BuildKit prune; unable to determine disk usage.")
        return False

    usage_pct = (used / total) * 100
    logger.info(
        "BuildKit disk usage: %.2f%% (%0.2f GiB used / %0.2f GiB total)",
        usage_pct,
        used / (1 << 30),
        total / (1 << 30),
    )
    if usage_pct < threshold_pct:
        return False

    # Best-effort: a prune failure should never abort the surrounding build loop.
    try:
        prune_buildkit_cache(keep_storage_gb=keep_storage_gb, filters=filters)
    except Exception as e:
        logger.warning("Failed to prune BuildKit cache: %s", e)
        return False
    return True
Loading