From 2b03b6383609f2cf68f5bd2984b52802af6063a1 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Wed, 7 Jan 2026 10:28:16 +0100 Subject: [PATCH 1/4] Stabilize SWE-bench buildx with batched pruning and fail-fast --- .github/workflows/build-swebench-images.yml | 41 ++++- benchmarks/utils/build_utils.py | 179 ++++++++++++-------- benchmarks/utils/buildx_utils.py | 79 +++++++++ 3 files changed, 223 insertions(+), 76 deletions(-) diff --git a/.github/workflows/build-swebench-images.yml b/.github/workflows/build-swebench-images.yml index 78fc2a3b..7c491383 100644 --- a/.github/workflows/build-swebench-images.yml +++ b/.github/workflows/build-swebench-images.yml @@ -185,6 +185,33 @@ jobs: run: | make build + - name: "Preflight: prune cache and verify BuildKit disk" + run: | + set -euo pipefail + KEEP_GB=450 + echo "Pruning BuildKit cache (keep ${KEEP_GB} GiB, filter unused-for=24h)..." + docker buildx prune --all --force --keep-storage ${KEEP_GB}g --filter unused-for=24h || true + + if df -B1 /var/lib/buildkit > /tmp/buildkit_df 2>/dev/null; then + LINE=$(tail -n1 /tmp/buildkit_df) + TOTAL=$(echo "$LINE" | awk '{print $2}') + USED=$(echo "$LINE" | awk '{print $3}') + FREE=$(echo "$LINE" | awk '{print $4}') + if [ -n "$TOTAL" ] && [ -n "$FREE" ]; then + PCT=$(( 100 * USED / TOTAL )) + echo "BuildKit disk: used ${USED} / ${TOTAL} bytes (${PCT}%); free ${FREE} bytes" + MIN=$((75 * 1024 * 1024 * 1024)) + if [ "$FREE" -lt "$MIN" ]; then + echo "::error::Not enough free space on /var/lib/buildkit (${FREE} bytes free, need >= ${MIN})" + exit 1 + fi + else + echo "Warning: unable to parse df output for /var/lib/buildkit" + fi + else + echo "Warning: /var/lib/buildkit not found; skipping disk check" + fi + - name: Build and push SWE-Bench images run: | set -euo pipefail @@ -270,14 +297,12 @@ jobs: if [ "$FAILURES" -gt 0 ]; then echo "### Failed Builds" >> "$GITHUB_STEP_SUMMARY" echo "" >> "$GITHUB_STEP_SUMMARY" - cat $MANIFEST_FILES | python -c " - import sys - import json - for 
line in sys.stdin: - data = json.loads(line.strip()) - if data.get('error') is not None or len(data.get('tags', [])) == 0: - print(f\"- \\\`{data.get('base_image', 'unknown')}\\\`: {data.get('error', 'No tags generated')}\") - " >> "$GITHUB_STEP_SUMMARY" + cat $MANIFEST_FILES | python -c 'import sys,json; [print(f"- `{d.get(\"base_image\",\"unknown\")}`: {d.get(\"error\",\"No tags generated\")}") for d in map(json.loads, sys.stdin) if d.get("error") or not d.get("tags")]' >> "$GITHUB_STEP_SUMMARY" + fi + + if [ "$FAILURES" -gt 0 ]; then + echo "::error::Detected $FAILURES failed or missing agent-server images out of $TOTAL" + exit 1 fi - name: Comment on tracker issue diff --git a/benchmarks/utils/build_utils.py b/benchmarks/utils/build_utils.py index dbca8ced..520500fe 100644 --- a/benchmarks/utils/build_utils.py +++ b/benchmarks/utils/build_utils.py @@ -20,9 +20,12 @@ from tqdm.auto import tqdm from benchmarks.utils.args_parser import get_parser -from benchmarks.utils.buildx_utils import maybe_reset_buildkit +from benchmarks.utils.buildx_utils import ( + buildkit_disk_usage, + maybe_prune_buildkit_cache, + maybe_reset_buildkit, +) from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE -from benchmarks.utils.image_utils import image_exists from openhands.agent_server.docker.build import BuildOptions, TargetType, build from openhands.sdk import get_logger @@ -293,11 +296,6 @@ def build_image( git_sha=git_sha, sdk_version=sdk_version, ) - for t in opts.all_tags: - # Check if image exists or not - if image_exists(t): - logger.info("Image %s already exists. 
Skipping build.", t) - return BuildOutput(base_image=base_image, tags=[t], error=None) tags = build(opts) return BuildOutput(base_image=base_image, tags=tags, error=None) @@ -443,9 +441,24 @@ def build_all_images( successes = 0 failures = 0 - in_progress: set[str] = set() mu = Lock() + # Fixed batch/prune settings (hardcoded to avoid extra CLI surface) + batch_size = 50 + prune_keep_storage_gb = 450 + prune_threshold_pct = 85.0 + prune_filters: list[str] | None = ["unused-for=24h"] + + def _chunks(seq: list[str], size: int): + if size <= 0: + yield seq + return + for i in range(0, len(seq), size): + yield seq[i : i + size] + + batches = list(_chunks(base_images, batch_size or len(base_images))) + total_batches = len(batches) + with ( manifest_file.open("w") as writer, tqdm( @@ -454,70 +467,67 @@ def build_all_images( ): _update_pbar(pbar, successes, failures, 0, None, "Queueing") - # Single unified path: ProcessPoolExecutor( max_workers = args.max_workers ), - # even if it's 1. Using processes instead of threads ensures proper isolation - # of stdout/stderr and logging handlers, preventing output mixing between builds. - with ProcessPoolExecutor(max_workers=max_workers) as ex: - futures = {} - for base in base_images: - in_progress.add(base) - # Resolve custom tags before scheduling to avoid pickling issues with closures. 
- resolved_tag = ( - base_image_to_custom_tag_fn(base) - if base_image_to_custom_tag_fn - else "" - ) - fut = ex.submit( - _build_with_logging, - log_dir=build_log_dir, - base_image=base, - target_image=image, - custom_tag=resolved_tag, - target=target, - push=push, - max_retries=max_retries, - post_build_fn=post_build_fn, - ) - futures[fut] = base - - _update_pbar( - pbar, - successes, - failures, - len(in_progress), - next(iter(in_progress), None), - "Running", - ) + for batch_idx, batch in enumerate(batches, start=1): + if not batch: + continue - for fut in as_completed(futures): - base = futures[fut] - try: - result: BuildOutput = fut.result() - writer.write(result.model_dump_json() + "\n") - writer.flush() - with mu: - successes += 1 - _update_pbar( - pbar, successes, failures, len(in_progress), base, "✅ Done" + logger.info( + "Starting batch %d/%d (%d images)", batch_idx, total_batches, len(batch) + ) + in_progress: set[str] = set() + + with ProcessPoolExecutor(max_workers=max_workers) as ex: + futures = {} + for base in batch: + in_progress.add(base) + resolved_tag = ( + base_image_to_custom_tag_fn(base) + if base_image_to_custom_tag_fn + else "" ) - except Exception as e: - logger.error("Build failed for %s: %r", base, e) - # Write a failure line to manifest; keep going. 
- writer.write( - BuildOutput( - base_image=base, tags=[], error=repr(e) - ).model_dump_json() - + "\n" + fut = ex.submit( + _build_with_logging, + log_dir=build_log_dir, + base_image=base, + target_image=image, + custom_tag=resolved_tag, + target=target, + push=push, + max_retries=max_retries, + post_build_fn=post_build_fn, ) + futures[fut] = base + + _update_pbar( + pbar, + successes, + failures, + len(in_progress), + next(iter(in_progress), None), + f"Batch {batch_idx}/{total_batches} running", + ) + + for fut in as_completed(futures): + base = futures[fut] + status = None + try: + result: BuildOutput = fut.result() + except Exception as e: + logger.error("Build failed for %s: %r", base, e) + result = BuildOutput(base_image=base, tags=[], error=repr(e)) + + writer.write(result.model_dump_json() + "\n") writer.flush() + with mu: - failures += 1 - _update_pbar( - pbar, successes, failures, len(in_progress), base, "❌ Failed" - ) - finally: - with mu: - in_progress.discard(base) + if result.error or not result.tags: + failures += 1 + status = "❌ Failed" + else: + successes += 1 + status = "✅ Done" + + in_progress.discard(base) pbar.update(1) _update_pbar( pbar, @@ -525,7 +535,40 @@ def build_all_images( failures, len(in_progress), next(iter(in_progress), None), - None, + status, + ) + + used, total = buildkit_disk_usage() + if total > 0: + logger.info( + "BuildKit usage after batch %d/%d: %.2f%% (%0.2f GiB / %0.2f GiB)", + batch_idx, + total_batches, + (used / total) * 100, + used / (1 << 30), + total / (1 << 30), + ) + + if prune_keep_storage_gb and prune_keep_storage_gb > 0: + pruned = maybe_prune_buildkit_cache( + keep_storage_gb=prune_keep_storage_gb, + threshold_pct=prune_threshold_pct, + filters=prune_filters, + ) + if pruned: + logger.info( + "Pruned BuildKit cache after batch %d/%d (keep=%d GiB, threshold=%.1f%%)", + batch_idx, + total_batches, + prune_keep_storage_gb, + prune_threshold_pct, + ) + else: + logger.info( + "No prune needed after batch %d/%d 
(threshold %.1f%%)", + batch_idx, + total_batches, + prune_threshold_pct, ) logger.info( "Done. Built=%d Failed=%d Manifest=%s", diff --git a/benchmarks/utils/buildx_utils.py b/benchmarks/utils/buildx_utils.py index 14e8182c..c08bd46e 100644 --- a/benchmarks/utils/buildx_utils.py +++ b/benchmarks/utils/buildx_utils.py @@ -6,6 +6,7 @@ import json import os import re +import shutil import subprocess import time from pathlib import Path @@ -143,3 +144,81 @@ def maybe_reset_buildkit( reset_buildkit("partial", base_image, target_image) else: reset_buildkit("full", base_image, target_image) + + +def buildkit_disk_usage(root: str | Path = "/var/lib/buildkit") -> tuple[int, int]: + """ + Return (used_bytes, total_bytes) for the BuildKit root. Missing path -> (0, 0). + """ + path = Path(root) + try: + usage = shutil.disk_usage(path) + return usage.used, usage.total + except FileNotFoundError: + logger.warning("BuildKit root %s not found when checking disk usage", path) + except Exception as e: + logger.warning("Unable to read disk usage for %s: %s", path, e) + return 0, 0 + + +def prune_buildkit_cache( + keep_storage_gb: int | None = None, + filters: list[str] | None = None, +) -> None: + """ + Run docker buildx prune to free space on the BuildKit cache. + keep_storage_gb: amount of cache to keep (pass None to keep default behavior). + filters: optional list of buildx prune --filter values. 
+ """ + cmd = ["docker", "buildx", "prune", "--all", "--force"] + if keep_storage_gb is not None and keep_storage_gb > 0: + cmd += ["--keep-storage", f"{keep_storage_gb}g"] + if filters: + for f in filters: + cmd += ["--filter", f] + + logger.info("Pruning BuildKit cache: %s", " ".join(cmd)) + proc = subprocess.run(cmd, text=True, capture_output=True) + if proc.stdout: + logger.info(proc.stdout.strip()) + if proc.stderr: + logger.warning(proc.stderr.strip()) + if proc.returncode != 0: + raise RuntimeError( + proc.stderr.strip() + or proc.stdout.strip() + or f"docker buildx prune failed with exit code {proc.returncode}" + ) + + +def maybe_prune_buildkit_cache( + keep_storage_gb: int, + threshold_pct: float, + filters: list[str] | None = None, + root: str | Path = "/var/lib/buildkit", +) -> bool: + """ + Prune cache if disk usage exceeds threshold_pct (0-100). + Returns True if a prune was attempted. + """ + used, total = buildkit_disk_usage(root) + if total <= 0: + logger.warning("Skipping BuildKit prune; unable to determine disk usage.") + return False + + usage_pct = (used / total) * 100 + logger.info( + "BuildKit disk usage: %.2f%% (%0.2f GiB used / %0.2f GiB total)", + usage_pct, + used / (1 << 30), + total / (1 << 30), + ) + if usage_pct < threshold_pct: + return False + + try: + prune_buildkit_cache(keep_storage_gb=keep_storage_gb, filters=filters) + return True + except Exception as e: + logger.warning("Failed to prune BuildKit cache: %s", e) + return False From 47ee57418fef0d7b643e5b7e7b0204561caaf4f9 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Wed, 7 Jan 2026 10:16:26 +0100 Subject: [PATCH 2/4] Improve Modal eval stability and mamba handling --- benchmarks/utils/dataset.py | 43 +- benchmarks/utils/modal_patches.py | 784 ++++++++++++++++++++++++ benchmarks/utils/modal_sitecustomize.py | 30 + benchmarks/utils/sitecustomize.py | 167 +---- sitecustomize.py | 5 + 5 files changed, 860 insertions(+), 169 deletions(-) create mode 100644 
benchmarks/utils/modal_patches.py create mode 100644 benchmarks/utils/modal_sitecustomize.py diff --git a/benchmarks/utils/dataset.py b/benchmarks/utils/dataset.py index 42d609bf..a60356ca 100644 --- a/benchmarks/utils/dataset.py +++ b/benchmarks/utils/dataset.py @@ -1,5 +1,7 @@ from __future__ import annotations +import os +import time from typing import cast import pandas as pd @@ -24,8 +26,6 @@ def _load_selected_instances(select_file_path: str) -> set[str]: FileNotFoundError: If the select file doesn't exist ValueError: If the file is empty """ - import os - if not os.path.isfile(select_file_path): raise FileNotFoundError(f"Select file not found: {select_file_path}") @@ -72,6 +72,40 @@ def prepare_dataset( return dataset +def _load_hf_dataset_with_retry(dataset_name: str, split: str) -> Dataset: + """Load a Hugging Face dataset with retries and longer HTTP timeouts.""" + # Default HF timeout is ~10s; bump it to reduce transient ReadTimeouts. + os.environ.setdefault("HF_HUB_HTTP_TIMEOUT", "60") + os.environ.setdefault("HF_HUB_DOWNLOAD_TIMEOUT", os.environ["HF_HUB_HTTP_TIMEOUT"]) + + attempts = 5 + backoff = 5.0 + last_exc: Exception | None = None + + for attempt in range(1, attempts + 1): + try: + dataset = load_dataset(dataset_name, split=split) + assert isinstance(dataset, Dataset) + return dataset + except Exception as exc: + last_exc = exc + if attempt == attempts: + break + wait = min(backoff, 60.0) + logger.warning( + "load_dataset failed (attempt %s/%s): %s; retrying in %.1fs", + attempt, + attempts, + exc, + wait, + ) + time.sleep(wait) + backoff *= 2 + + assert last_exc is not None + raise last_exc + + def get_dataset( dataset_name: str, split: str, @@ -79,8 +113,6 @@ def get_dataset( selected_instances_file: str | None = None, ) -> pd.DataFrame: """Load and prepare dataset for evaluation.""" - import os - # Check if dataset_name is a local file path if os.path.isfile(dataset_name) and dataset_name.endswith(".jsonl"): # Load local JSONL file @@ -90,8 
+122,7 @@ def get_dataset( assert isinstance(df, pd.DataFrame) else: # Load dataset from HuggingFace Hub - dataset = load_dataset(dataset_name, split=split) - assert isinstance(dataset, Dataset) + dataset = _load_hf_dataset_with_retry(dataset_name, split) df = dataset.to_pandas() assert isinstance(df, pd.DataFrame) diff --git a/benchmarks/utils/modal_patches.py b/benchmarks/utils/modal_patches.py new file mode 100644 index 00000000..2a614388 --- /dev/null +++ b/benchmarks/utils/modal_patches.py @@ -0,0 +1,784 @@ +""" +Shared Modal patch helpers for host and in-image sitecustomize. +""" + +from __future__ import annotations + +import json +import os +import sys +import time +import traceback + + +_MODAL_SITECUSTOMIZE_INJECTED = False +DEFAULT_AGENT_IMAGE = "ghcr.io/openhands/eval-agent-server" +DEFAULT_BUILD_TARGET = "source-minimal" + + +def _log(message: str) -> None: + print(message, file=sys.stderr, flush=True) + + +def _make_emit(stderr: bool): + if stderr: + + def emit(message: str) -> None: + print(message, file=sys.stderr, flush=True) + + else: + + def emit(message: str) -> None: + print(message) + + return emit + + +def _get_sdk_short_sha() -> str: + """ + Resolve SDK short SHA from the benchmarks repo when available, otherwise + fall back to environment variables for the Modal function image. 
+ """ + try: + from benchmarks.utils.version import SDK_SHORT_SHA as version_sdk_short_sha + + return version_sdk_short_sha + except Exception: + return os.getenv("SDK_SHORT_SHA", "").strip() or "unknown" + + +def _get_agent_server_image_repo() -> str: + return ( + os.getenv("EVAL_AGENT_SERVER_IMAGE", DEFAULT_AGENT_IMAGE).strip() + or DEFAULT_AGENT_IMAGE + ) + + +def _get_build_target() -> str: + return ( + os.getenv("SWEBENCH_IMAGE_TARGET") + or os.getenv("SWEBENCH_BUILD_TARGET") + or DEFAULT_BUILD_TARGET + ) + + +def _get_custom_tag_from_instance_id(instance_id: str) -> str: + try: + repo, name = instance_id.split("__", 1) + except Exception as exc: + raise RuntimeError( + f"Unable to compute SWE-bench image tag; unexpected instance id: {instance_id}" + ) from exc + return f"sweb.eval.x86_64.{repo}_1776_{name}".lower() + + +def _build_prebuilt_image_tag(test_spec) -> str: + instance_id = getattr(test_spec, "instance_id", None) + if not instance_id: + raise RuntimeError("TestSpec missing instance_id; cannot select Modal image") + + sdk_short_sha = _get_sdk_short_sha() + if sdk_short_sha in ("", "unknown", None): + raise RuntimeError( + "SDK short SHA is unavailable. Set SDK_SHORT_SHA or ensure the " + "benchmarks repository has an initialized SDK submodule." + ) + + target = _get_build_target() + suffix = f"-{target}" if target and target != "binary" else "" + custom_tag = _get_custom_tag_from_instance_id(instance_id) + agent_repo = _get_agent_server_image_repo() + return f"{agent_repo}:{sdk_short_sha}-{custom_tag}{suffix}" + + +def _patch_modal_sklearn_install_flag() -> None: + """ + pip>=25 removed `--no-use-pep517`, but the scikit-learn specs still pass it. + When Modal builds the sandbox image, pip fails before tests ever run. Mutate + the specs in-place to drop that flag for all scikit-learn versions. 
+ """ + try: + # The constants module aliases SPECS_SKLEARN into MAP_REPO_VERSION_TO_SPECS, + # so mutating the dict is sufficient as long as imports share the object. + import swebench.harness.constants as consts + import swebench.harness.constants.python as py_consts + except Exception: + return + + for version, spec in py_consts.SPECS_SKLEARN.items(): + install_cmd = spec.get("install", "") + if "--no-use-pep517" not in install_cmd: + continue + + cleaned = " ".join(install_cmd.replace("--no-use-pep517", "").split()) + py_consts.SPECS_SKLEARN[version]["install"] = cleaned + + repo_specs = consts.MAP_REPO_VERSION_TO_SPECS.get("scikit-learn/scikit-learn") + if isinstance(repo_specs, dict): + repo_specs[version] = py_consts.SPECS_SKLEARN[version] + + # Best-effort patch; stay silent if nothing needed or imports fail. + return + + +def _patch_modal_sandbox_cgroup_retry() -> None: + """Retry cgroup writes to avoid transient Modal filesystem errors.""" + try: + from swebench.harness.modal_eval import run_evaluation_modal as mod + except Exception: + return + + runtime_cls = getattr(mod, "ModalSandboxRuntime", None) + if runtime_cls is None: + return + + original_write_file = runtime_cls.write_file + if getattr(original_write_file, "_benchmarks_retry_patch", False): + return + + try: + from modal.exception import FilesystemExecutionError + except Exception: + FilesystemExecutionError = Exception + + def write_file_with_retry(self, file_path: str, content: str): + target_path = "/sys/fs/cgroup/cpu/cpu.shares" + attempts = 5 + delay = 1.0 + path_str = str(file_path) + for attempt in range(1, attempts + 1): + try: + return original_write_file(self, file_path, content) + except Exception as exc: + if path_str != target_path or not isinstance( + exc, FilesystemExecutionError + ): + raise + if attempt == attempts: + raise + time.sleep(delay) + delay = min(delay * 2, 10.0) + + setattr(write_file_with_retry, "_benchmarks_retry_patch", True) + runtime_cls.write_file = 
write_file_with_retry + + +def _patch_modal_prebuilt_images( + log_errors: bool = False, stderr: bool = False +) -> None: + """Use prebuilt SWE-Bench images in Modal instead of rebuilding per instance.""" + try: + from swebench.harness.modal_eval import run_evaluation_modal as mod + except Exception as exc: + if log_errors: + _log( + f"[benchmarks] modal sitecustomize: failed to import modal_eval: {exc}" + ) + return + + runtime_cls = getattr(mod, "ModalSandboxRuntime", None) + if runtime_cls is None: + if log_errors: + _log("[benchmarks] modal sitecustomize: ModalSandboxRuntime missing") + return + + original_get_instance_image = getattr(runtime_cls, "get_instance_image", None) + if original_get_instance_image is None: + if log_errors: + _log("[benchmarks] modal sitecustomize: get_instance_image missing") + return + if getattr(original_get_instance_image, "_benchmarks_prebuilt_patch", False): + return + + emit = _make_emit(stderr) + + def get_instance_image_from_registry(test_spec): + import modal + + instance_id = getattr(test_spec, "instance_id", "unknown") + try: + image_tag = _build_prebuilt_image_tag(test_spec) + except Exception as exc: + emit( + "[benchmarks] Modal image spec failed to compute tag for " + f"{instance_id}: {exc}" + ) + raise + + emit( + "[benchmarks] Modal image spec using prebuilt image " + f"{image_tag} for {instance_id}" + ) + try: + image = modal.Image.from_registry(image_tag) + except Exception as exc: + emit( + "[benchmarks] Failed to load Modal image from registry " + f"{image_tag}: {exc}" + ) + raise + + # Upstream expects /testbed as the working directory when running evals. 
+ return image.workdir("/testbed/") + + setattr(get_instance_image_from_registry, "_benchmarks_prebuilt_patch", True) + runtime_cls.get_instance_image = staticmethod(get_instance_image_from_registry) + if log_errors: + _log("[benchmarks] modal sitecustomize: applied prebuilt image patch") + + +def _patch_modal_sandbox_timing(log_errors: bool = False, stderr: bool = False) -> None: + """Log sandbox creation timing to pinpoint Modal startup delays.""" + try: + from swebench.harness.modal_eval import run_evaluation_modal as mod + except Exception as exc: + if log_errors: + _log( + f"[benchmarks] modal sitecustomize: failed to import modal_eval: {exc}" + ) + return + + runtime_cls = getattr(mod, "ModalSandboxRuntime", None) + if runtime_cls is None: + if log_errors: + _log("[benchmarks] modal sitecustomize: ModalSandboxRuntime missing") + return + + original_get_sandbox = runtime_cls._get_sandbox + if getattr(original_get_sandbox, "_benchmarks_timing_patch", False): + return + + emit = _make_emit(stderr) + + def get_sandbox_with_timing(self, timeout: int | None = None): + instance_id = getattr( + getattr(self, "test_spec", None), "instance_id", "unknown" + ) + start = time.time() + emit( + f"[benchmarks] Modal sandbox create start for {instance_id} " + f"(timeout={timeout})" + ) + try: + return original_get_sandbox(self, timeout) + finally: + elapsed = time.time() - start + emit( + f"[benchmarks] Modal sandbox create end for {instance_id} " + f"(elapsed={elapsed:.2f}s)" + ) + + setattr(get_sandbox_with_timing, "_benchmarks_timing_patch", True) + runtime_cls._get_sandbox = get_sandbox_with_timing + if log_errors: + _log("[benchmarks] modal sitecustomize: applied sandbox timing patch") + + +def _patch_modal_runtime_debug(log_errors: bool = False, stderr: bool = False) -> None: + """Log Modal runtime init and critical exec timings for debugging.""" + try: + from swebench.harness.modal_eval import run_evaluation_modal as mod + except Exception as exc: + if log_errors: + 
_log( + f"[benchmarks] modal sitecustomize: failed to import modal_eval: {exc}" + ) + return + + runtime_cls = getattr(mod, "ModalSandboxRuntime", None) + if runtime_cls is None: + if log_errors: + _log("[benchmarks] modal sitecustomize: ModalSandboxRuntime missing") + return + + emit = _make_emit(stderr) + + original_init = runtime_cls.__init__ + if not getattr(original_init, "_benchmarks_runtime_init_patch", False): + + def init_with_logging( + self, test_spec, timeout: int | None = None, verbose=True + ): + instance_id = getattr(test_spec, "instance_id", "unknown") + emit( + f"[benchmarks] Modal runtime init start for {instance_id} " + f"(timeout={timeout})" + ) + start = time.time() + try: + return original_init(self, test_spec, timeout, verbose) + finally: + elapsed = time.time() - start + emit( + f"[benchmarks] Modal runtime init end for {instance_id} " + f"(elapsed={elapsed:.2f}s)" + ) + + setattr(init_with_logging, "_benchmarks_runtime_init_patch", True) + runtime_cls.__init__ = init_with_logging + + original_exec = runtime_cls.exec + if not getattr(original_exec, "_benchmarks_runtime_exec_patch", False): + + def exec_with_logging(self, command: str): + instance_id = getattr( + getattr(self, "test_spec", None), "instance_id", "unknown" + ) + label = None + if "/root/eval.sh" in command: + label = "eval" + elif "git apply" in command or "patch --batch" in command: + label = "apply_patch" + + if label: + emit(f"[benchmarks] Modal exec start for {instance_id} ({label})") + start = time.time() + output, returncode = original_exec(self, command) + elapsed = time.time() - start + emit( + f"[benchmarks] Modal exec end for {instance_id} ({label}) " + f"(elapsed={elapsed:.2f}s, returncode={returncode})" + ) + return output, returncode + + return original_exec(self, command) + + setattr(exec_with_logging, "_benchmarks_runtime_exec_patch", True) + runtime_cls.exec = exec_with_logging + + if log_errors: + _log("[benchmarks] modal sitecustomize: applied runtime debug 
patch") + + +def _patch_modal_function_timeout( + timeout_seconds: int = 4 * 60 * 60, log_errors: bool = False +) -> None: + """Raise Modal function timeout and emit per-instance logs in Modal.""" + try: + from swebench.harness.modal_eval import run_evaluation_modal as mod + except Exception as exc: + if log_errors: + _log( + f"[benchmarks] modal sitecustomize: failed to import modal_eval: {exc}" + ) + return + + run_fn = getattr(mod, "run_instance_modal", None) + if run_fn is None: + if log_errors: + _log("[benchmarks] modal sitecustomize: run_instance_modal missing") + return + if getattr(run_fn, "_benchmarks_timeout_patch", False): + return + + raw_f = getattr(getattr(run_fn, "info", None), "raw_f", None) + if raw_f is None: + if log_errors: + _log("[benchmarks] modal sitecustomize: run_instance_modal raw_f missing") + return + + image = getattr(getattr(run_fn, "spec", None), "image", None) + if image is None: + image = getattr(mod, "swebench_image", None) + + def run_instance_modal_with_logging(test_spec, pred, run_id, timeout=None): + instance_id = getattr(test_spec, "instance_id", None) or pred.get( + "instance_id", "unknown" + ) + effective_timeout = timeout + if timeout is None or timeout < timeout_seconds: + effective_timeout = timeout_seconds + print( + "[benchmarks] Modal function overriding timeout " + f"instance={instance_id} from {timeout} to {effective_timeout}", + file=sys.stderr, + flush=True, + ) + start = time.time() + print( + "[benchmarks] Modal function start " + f"instance={instance_id} run_id={run_id} timeout={effective_timeout}", + file=sys.stderr, + flush=True, + ) + try: + result = raw_f(test_spec, pred, run_id, effective_timeout) + except Exception as exc: + elapsed = time.time() - start + print( + "[benchmarks] Modal function error " + f"instance={instance_id} elapsed={elapsed:.2f}s error={exc}", + file=sys.stderr, + flush=True, + ) + raise + elapsed = time.time() - start + status = "errored" if getattr(result, "errored", False) else 
"ok" + print( + "[benchmarks] Modal function end " + f"instance={instance_id} elapsed={elapsed:.2f}s status={status}", + file=sys.stderr, + flush=True, + ) + return result + + try: + patched_fn = mod.app.function( + image=image, + timeout=timeout_seconds, + include_source=True, + serialized=True, + name="run_instance_modal", + )(run_instance_modal_with_logging) + except Exception as exc: + if log_errors: + _log(f"[benchmarks] modal sitecustomize: failed to patch timeout: {exc}") + return + + setattr(patched_fn, "_benchmarks_timeout_patch", True) + mod.run_instance_modal = patched_fn + if log_errors: + _log( + "[benchmarks] modal sitecustomize: patched function timeout " + f"to {timeout_seconds}s" + ) + + +def _inject_modal_sitecustomize() -> None: + """Inject modal_sitecustomize into the Modal function image.""" + global _MODAL_SITECUSTOMIZE_INJECTED + + if _MODAL_SITECUSTOMIZE_INJECTED: + return + + try: + from pathlib import Path + + from swebench.harness.modal_eval import run_evaluation_modal as mod + except Exception: + return + + patch_path = Path(__file__).with_name("modal_sitecustomize.py") + if not patch_path.exists(): + return + + patches_path = Path(__file__).with_name("modal_patches.py") + + run_fn = getattr(mod, "run_instance_modal", None) + if run_fn is None or not hasattr(run_fn, "spec"): + return + + image = run_fn.spec.image + + # Rebuild from the base swebench image so add_local_file mounts (from the + # original function definition) are converted to copies. Modal rejects + # adding build steps after mount layers. 
+ base_image = getattr(mod, "swebench_image", None) + entry_local = getattr(mod, "LOCAL_SANDBOX_ENTRYPOINT_PATH", None) + entry_remote = getattr(mod, "REMOTE_SANDBOX_ENTRYPOINT_PATH", None) + if base_image is not None and entry_local is not None and entry_remote is not None: + image = base_image.add_local_file( + Path(entry_local), + str(entry_remote), + copy=True, + ) + + patched_image = image.add_local_file( + patch_path, + "/root/sitecustomize.py", + copy=True, + ) + + if patches_path.exists(): + patched_image = patched_image.add_local_file( + patches_path, + "/root/modal_patches.py", + copy=True, + ) + + env_vars = {"PYTHONPATH": "/root"} + try: + from benchmarks.utils.version import SDK_SHA, SDK_SHORT_SHA + + env_vars["SDK_SHA"] = SDK_SHA + env_vars["SDK_SHORT_SHA"] = SDK_SHORT_SHA + except Exception: + sdk_sha_env = os.getenv("SDK_SHA") + if sdk_sha_env: + env_vars["SDK_SHA"] = sdk_sha_env + env_vars["SDK_SHORT_SHA"] = _get_sdk_short_sha() + + env_vars["EVAL_AGENT_SERVER_IMAGE"] = _get_agent_server_image_repo() + env_vars["SWEBENCH_IMAGE_TARGET"] = _get_build_target() + + patched_image = patched_image.env(env_vars) + + run_fn.spec.image = patched_image + mod.swebench_image = patched_image + _MODAL_SITECUSTOMIZE_INJECTED = True + _log("benchmarks injected modal sitecustomize into run_instance_modal image") + + +def _patch_run_instances_modal_logging() -> None: + """Persist logs/reports for Modal exceptions before TestOutput is returned.""" + try: + # Import inside the function so this file is harmless for non-SWE-Bench runs. + from swebench.harness.docker_build import setup_logger + from swebench.harness.modal_eval import run_evaluation_modal as mod + from swebench.harness.modal_eval.run_evaluation_modal import ( + TestOutput, + get_log_dir, + ) + from swebench.harness.reporting import make_run_report + from swebench.harness.test_spec.test_spec import make_test_spec + except Exception: + # If swebench isn't installed, bail out quietly. 
+        return
+
+    def run_instances_modal_with_logging(
+        predictions: dict,
+        instances: list,
+        full_dataset: list,
+        run_id: str,
+        timeout: int,
+    ):
+        """
+        Wrap the upstream `run_instances_modal` to persist logs for exceptions.
+
+        If Modal returns an exception (e.g., sandbox creation failure), we now
+        write run_instance.log + report.json so scoring can surface the error.
+
+        Results (or whole submissions) that fail with a Modal ``ClientClosed``
+        error are resubmitted up to ``max_attempts`` times with exponential
+        backoff. Once the attempts are exhausted a failure report is written
+        for each affected instance instead.
+
+        Args:
+            predictions: Mapping of instance id to prediction dict; the
+                ``model_patch`` entry is saved as ``patch.diff`` on failure.
+            instances: Dataset instances to evaluate; converted to test specs.
+            full_dataset: Passed through to ``make_run_report``.
+            run_id: Identifier used to derive the per-instance log directory.
+            timeout: Per-instance timeout forwarded to the Modal function.
+
+        Raises:
+            Exception: Any error raised while submitting or collecting results
+                that is not recognised as a ``ClientClosed`` error.
+        """
+        test_specs = list(map(make_test_spec, instances))
+        max_attempts = 3
+        attempt = 0
+        backoff = 5.0
+        try:
+            import modal as modal_pkg
+
+            # Resolve the exception class defensively: if the attribute is
+            # missing we fall back to string matching below.
+            client_closed_exc = getattr(
+                getattr(modal_pkg, "exception", None), "ClientClosed", None
+            )
+        except Exception:
+            client_closed_exc = None
+
+        def is_client_closed_error(error: Exception) -> bool:
+            """Return True if `error` looks like Modal's ClientClosed error."""
+            if client_closed_exc is not None and isinstance(error, client_closed_exc):
+                return True
+            return "ClientClosed" in str(error)
+
+        while True:
+            run_test_specs = []
+
+            # Skip any instances that already have logs.
+            for test_spec in test_specs:
+                log_dir = get_log_dir(
+                    predictions[test_spec.instance_id],
+                    run_id,
+                    test_spec.instance_id,
+                )
+                if log_dir.exists():
+                    continue
+                run_test_specs.append(test_spec)
+
+            if not run_test_specs:
+                break
+
+            attempt += 1
+            client_closed_specs = []
+            try:
+                with mod.modal.enable_output():
+                    with mod.app.run():
+                        emit = _make_emit(stderr=False)
+                        submit_ids = [spec.instance_id for spec in run_test_specs]
+                        emit(
+                            f"[benchmarks] Modal starmap submit {len(submit_ids)} "
+                            f"instances: {', '.join(submit_ids)}"
+                        )
+                        starmap_start = time.time()
+                        results = mod.run_instance_modal.starmap(
+                            [
+                                (
+                                    test_spec,
+                                    predictions[test_spec.instance_id],
+                                    run_id,
+                                    timeout,
+                                )
+                                for test_spec in run_test_specs
+                            ],
+                            return_exceptions=True,
+                        )
+                        starmap_elapsed = time.time() - starmap_start
+                        emit(
+                            f"[benchmarks] Modal starmap completed in "
+                            f"{starmap_elapsed:.2f}s"
+                        )
+
+                        for test_spec, result in zip(run_test_specs, results):
+                            pred = predictions[test_spec.instance_id]
+                            log_dir = get_log_dir(pred, run_id, test_spec.instance_id)
+
+                            if isinstance(result, TestOutput):
+                                # Normal path: write logs exactly as upstream does.
+                                log_dir.mkdir(parents=True, exist_ok=True)
+                                with open(log_dir / "run_instance.log", "w") as f:
+                                    f.write(result.run_instance_log)
+                                with open(log_dir / "test_output.txt", "w") as f:
+                                    f.write(result.test_output)
+                                with open(log_dir / "patch.diff", "w") as f:
+                                    f.write(result.patch_diff)
+                                if result.report_json_str:
+                                    try:
+                                        parsed = json.loads(result.report_json_str)
+                                        (log_dir / "report.json").write_text(
+                                            json.dumps(parsed, indent=4)
+                                        )
+                                    except Exception:
+                                        # Best-effort write if JSON is malformed.
+                                        (log_dir / "report.json").write_text(
+                                            result.report_json_str
+                                        )
+                            else:
+                                if is_client_closed_error(result):
+                                    # Do not create log_dir here: the scan at the
+                                    # top of the loop treats an existing directory
+                                    # as "already done", which would prevent the
+                                    # retry and the final failure report.
+                                    client_closed_specs.append((test_spec, result))
+                                    continue
+                                # Exception path: persist a minimal log + report so scoring sees it.
+                                log_dir.mkdir(parents=True, exist_ok=True)
+                                log_file = log_dir / "run_instance.log"
+                                logger = setup_logger(
+                                    test_spec.instance_id, log_file, add_stdout=False
+                                )
+                                logger.error(
+                                    "Modal run failed before producing TestOutput: %s",
+                                    result,
+                                )
+                                logger.error(
+                                    "Traceback:\n%s",
+                                    "".join(traceback.format_exception(result)),
+                                )
+
+                                # Save the attempted patch for debugging.
+                                (log_dir / "patch.diff").write_text(
+                                    pred.get("model_patch", "")
+                                )
+
+                                error_msg = f"Modal error: {result}"
+                                report = {
+                                    test_spec.instance_id: {
+                                        "resolved": False,
+                                        "error": error_msg,
+                                    }
+                                }
+                                (log_dir / "report.json").write_text(
+                                    json.dumps(report, indent=4)
+                                )
+                if client_closed_specs:
+                    if attempt < max_attempts:
+                        # Retry only the instances without a log dir, after an
+                        # exponentially growing (capped) pause.
+                        time.sleep(backoff)
+                        backoff = min(backoff * 2, 60.0)
+                        continue
+                    # Attempts exhausted: record a failure for each instance.
+                    for test_spec, result in client_closed_specs:
+                        pred = predictions[test_spec.instance_id]
+                        log_dir = get_log_dir(pred, run_id, test_spec.instance_id)
+                        if log_dir.exists():
+                            continue
+                        log_dir.mkdir(parents=True, exist_ok=True)
+                        log_file = log_dir / "run_instance.log"
+                        logger = setup_logger(
+                            test_spec.instance_id, log_file, add_stdout=False
+                        )
+                        logger.error(
+                            "Modal client closed during image build/sandbox create: %s",
+                            result,
+                        )
+                        (log_dir / "patch.diff").write_text(pred.get("model_patch", ""))
+                        report = {
+                            test_spec.instance_id: {
+                                "resolved": False,
+                                "error": (
+                                    "Modal client closed during image build/sandbox "
+                                    f"create: {result}"
+                                ),
+                            }
+                        }
+                        (log_dir / "report.json").write_text(
+                            json.dumps(report, indent=4)
+                        )
+                break
+            except Exception as exc:
+                is_client_closed = is_client_closed_error(exc)
+
+                if is_client_closed and attempt < max_attempts:
+                    time.sleep(backoff)
+                    backoff = min(backoff * 2, 60.0)
+                    continue
+
+                if is_client_closed:
+                    # The whole submission failed and retries are exhausted:
+                    # report every instance that has no log directory yet.
+                    for test_spec in run_test_specs:
+                        pred = predictions[test_spec.instance_id]
+                        log_dir = get_log_dir(pred, run_id, test_spec.instance_id)
+                        if log_dir.exists():
+                            continue
+                        log_dir.mkdir(parents=True, exist_ok=True)
+                        log_file = log_dir / "run_instance.log"
+                        logger = setup_logger(
+                            test_spec.instance_id, log_file, add_stdout=False
+                        )
+                        logger.error(
+                            "Modal client closed during image build/sandbox create: %s",
+                            exc,
+                        )
+                        (log_dir / "patch.diff").write_text(pred.get("model_patch", ""))
+                        report = {
+                            test_spec.instance_id: {
+                                "resolved": False,
+                                "error": f"Modal client closed: {exc}",
+                            }
+                        }
+                        (log_dir / "report.json").write_text(
+                            json.dumps(report, indent=4)
+                        )
+                    break
+
+                raise
+
+        # Always build the aggregate report (upstream behavior).
+        make_run_report(predictions, full_dataset, run_id)
+
+    # Apply the monkey patch once per interpreter.
+    mod.run_instances_modal = run_instances_modal_with_logging
+    try:
+        # run_evaluation imports run_instances_modal by value, so update it too.
+        import swebench.harness.run_evaluation as run_eval_mod
+
+        run_eval_mod.run_instances_modal = run_instances_modal_with_logging
+    except Exception:
+        # If run_evaluation isn't available yet, skip—sitecustomize will have
+        # already patched the modal module itself.
+        pass
+    try:
+        # modal_eval re-exports run_instances_modal; update the package export too.
+        import swebench.harness.modal_eval as modal_eval_pkg
+
+        modal_eval_pkg.run_instances_modal = run_instances_modal_with_logging
+    except Exception:
+        # Keep best-effort behavior if the package import fails.
+        pass
+
+
+def apply_host_patches() -> None:
+    _patch_modal_sklearn_install_flag()
+    _patch_modal_sandbox_cgroup_retry()
+    _patch_modal_prebuilt_images()
+    # Inject sitecustomize before re-registering the Modal function so the
+    # patched image (with env + sitecustomize) is baked into the function spec.
+ _inject_modal_sitecustomize() + _patch_modal_sandbox_timing(log_errors=True, stderr=True) + _patch_modal_runtime_debug(log_errors=True, stderr=True) + _patch_modal_function_timeout(log_errors=True) + _patch_run_instances_modal_logging() + + +def apply_image_patches() -> None: + _log("[benchmarks] modal sitecustomize imported") + _patch_modal_prebuilt_images(log_errors=True, stderr=True) + _patch_modal_sandbox_timing(log_errors=True, stderr=True) + _patch_modal_runtime_debug(log_errors=True, stderr=True) diff --git a/benchmarks/utils/modal_sitecustomize.py b/benchmarks/utils/modal_sitecustomize.py new file mode 100644 index 00000000..522ceb68 --- /dev/null +++ b/benchmarks/utils/modal_sitecustomize.py @@ -0,0 +1,30 @@ +""" +Sitecustomize injected into the Modal function image for SWE-bench runs. + +This file is copied into the Modal function container and imported automatically +by Python (via sitecustomize) to patch the modal_eval runtime with prebuilt image +selection plus extra timing/logging hooks. +""" + +from __future__ import annotations + +import sys + + +def _apply_modal_image_patch() -> None: + try: + from benchmarks.utils import modal_patches + except Exception: + try: + import modal_patches + except Exception as exc: + print( + f"[benchmarks] modal sitecustomize: failed to import modal_patches: {exc}", + file=sys.stderr, + flush=True, + ) + return + modal_patches.apply_image_patches() + + +_apply_modal_image_patch() diff --git a/benchmarks/utils/sitecustomize.py b/benchmarks/utils/sitecustomize.py index 71e7305b..de4dd483 100644 --- a/benchmarks/utils/sitecustomize.py +++ b/benchmarks/utils/sitecustomize.py @@ -4,8 +4,8 @@ When running SWE-Bench evaluation on Modal, we want to capture exceptions that happen before a `report.json` is written (e.g., sandbox creation failures). The upstream harness only prints these exceptions, so the scoring step sees missing -logs and marks the instance as a generic error. 
This module monkey-patches -`run_instances_modal` to persist a minimal log/report for any exception result. +logs and marks the instance as a generic error. This module installs patches to +persist a minimal log/report for any exception result. We also patch the scikit-learn install command used inside Modal sandboxes to drop the deprecated `--no-use-pep517` flag (removed in pip>=25). That flag @@ -18,170 +18,11 @@ from __future__ import annotations -import json -import traceback - - -def _patch_modal_sklearn_install_flag() -> None: - """ - pip>=25 removed `--no-use-pep517`, but the scikit-learn specs still pass it. - When Modal builds the sandbox image, pip fails before tests ever run. Mutate - the specs in-place to drop that flag for all scikit-learn versions. - """ - try: - # The constants module aliases SPECS_SKLEARN into MAP_REPO_VERSION_TO_SPECS, - # so mutating the dict is sufficient as long as imports share the object. - import swebench.harness.constants as consts - import swebench.harness.constants.python as py_consts - except Exception: - return - - for version, spec in py_consts.SPECS_SKLEARN.items(): - install_cmd = spec.get("install", "") - if "--no-use-pep517" not in install_cmd: - continue - - cleaned = " ".join(install_cmd.replace("--no-use-pep517", "").split()) - py_consts.SPECS_SKLEARN[version]["install"] = cleaned - - repo_specs = consts.MAP_REPO_VERSION_TO_SPECS.get("scikit-learn/scikit-learn") - if isinstance(repo_specs, dict): - repo_specs[version] = py_consts.SPECS_SKLEARN[version] - - # Best-effort patch; stay silent if nothing needed or imports fail. - return +from benchmarks.utils import modal_patches def _apply_modal_logging_patch() -> None: - _patch_modal_sklearn_install_flag() - - try: - # Import inside the function so this file is harmless for non-SWE-Bench runs. 
- from swebench.harness.docker_build import setup_logger - from swebench.harness.modal_eval import run_evaluation_modal as mod - from swebench.harness.modal_eval.run_evaluation_modal import ( - TestOutput, - get_log_dir, - ) - from swebench.harness.reporting import make_run_report - from swebench.harness.test_spec.test_spec import make_test_spec - except Exception: - # If swebench isn't installed, bail out quietly. - return - - def run_instances_modal_with_logging( - predictions: dict, - instances: list, - full_dataset: list, - run_id: str, - timeout: int, - ): - """ - Wrap the upstream `run_instances_modal` to persist logs for exceptions. - - If Modal returns an exception (e.g., sandbox creation failure), we now - write run_instance.log + report.json so scoring can surface the error. - """ - test_specs = list(map(make_test_spec, instances)) - - with mod.modal.enable_output(): - with mod.app.run(): - run_test_specs = [] - - # Skip any instances that already have logs. - for test_spec in test_specs: - log_dir = get_log_dir( - predictions[test_spec.instance_id], - run_id, - test_spec.instance_id, - ) - if log_dir.exists(): - continue - run_test_specs.append(test_spec) - - if run_test_specs: - results = mod.run_instance_modal.starmap( - [ - ( - test_spec, - predictions[test_spec.instance_id], - run_id, - timeout, - ) - for test_spec in run_test_specs - ], - return_exceptions=True, - ) - - for test_spec, result in zip(run_test_specs, results): - pred = predictions[test_spec.instance_id] - log_dir = get_log_dir(pred, run_id, test_spec.instance_id) - log_dir.mkdir(parents=True, exist_ok=True) - - if isinstance(result, TestOutput): - # Normal path: write logs exactly as upstream does. 
- with open(log_dir / "run_instance.log", "w") as f: - f.write(result.run_instance_log) - with open(log_dir / "test_output.txt", "w") as f: - f.write(result.test_output) - with open(log_dir / "patch.diff", "w") as f: - f.write(result.patch_diff) - if result.report_json_str: - try: - parsed = json.loads(result.report_json_str) - (log_dir / "report.json").write_text( - json.dumps(parsed, indent=4) - ) - except Exception: - # Best-effort write if JSON is malformed. - (log_dir / "report.json").write_text( - result.report_json_str - ) - else: - # Exception path: persist a minimal log + report so scoring sees it. - log_file = log_dir / "run_instance.log" - logger = setup_logger( - test_spec.instance_id, log_file, add_stdout=False - ) - logger.error( - "Modal run failed before producing TestOutput: %s", - result, - ) - logger.error( - "Traceback:\n%s", - "".join(traceback.format_exception(result)), - ) - - # Save the attempted patch for debugging. - (log_dir / "patch.diff").write_text( - pred.get("model_patch", "") - ) - - error_msg = f"Modal error: {result}" - report = { - test_spec.instance_id: { - "resolved": False, - "error": error_msg, - } - } - (log_dir / "report.json").write_text( - json.dumps(report, indent=4) - ) - - # Always build the aggregate report (upstream behavior). - make_run_report(predictions, full_dataset, run_id) - - # Apply the monkey patch once per interpreter. - mod.run_instances_modal = run_instances_modal_with_logging - try: - # run_evaluation imports run_instances_modal by value, so update it too. - import swebench.harness.run_evaluation as run_eval_mod - - run_eval_mod.run_instances_modal = run_instances_modal_with_logging - except Exception: - # If run_evaluation isn't available yet, skip—sitecustomize will have - # already patched the modal module itself. 
- pass + modal_patches.apply_host_patches() _apply_modal_logging_patch() diff --git a/sitecustomize.py b/sitecustomize.py index 987dbb91..50338d1a 100644 --- a/sitecustomize.py +++ b/sitecustomize.py @@ -6,6 +6,11 @@ this file at the repo root guarantees the patch runs before swebench is used. """ +import sys + + +print("benchmarks sitecustomize imported", file=sys.stderr, flush=True) + try: # Reuse the actual patch logic that lives alongside the benchmarks package. from benchmarks.utils.sitecustomize import _apply_modal_logging_patch From b1c500e2fb577cf0fffcabe83d0e5ac918a34597 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Wed, 7 Jan 2026 12:09:07 +0100 Subject: [PATCH 3/4] Improve swebench build robustness --- .github/workflows/build-swebench-images.yml | 18 ++++++++++++++---- benchmarks/utils/build_utils.py | 11 ++++++----- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/.github/workflows/build-swebench-images.yml b/.github/workflows/build-swebench-images.yml index 7c491383..da802683 100644 --- a/.github/workflows/build-swebench-images.yml +++ b/.github/workflows/build-swebench-images.yml @@ -188,9 +188,9 @@ jobs: - name: "Preflight: prune cache and verify BuildKit disk" run: | set -euo pipefail - KEEP_GB=450 - echo "Pruning BuildKit cache (keep ${KEEP_GB} GiB, filter unused-for=24h)..." - docker buildx prune --all --force --keep-storage ${KEEP_GB}g --filter unused-for=24h || true + KEEP_GB=120 + echo "Pruning BuildKit cache (keep ${KEEP_GB} GiB, filter unused-for=12h)..." 
+ docker buildx prune --all --force --keep-storage ${KEEP_GB}g --filter unused-for=12h || true if df -B1 /var/lib/buildkit > /tmp/buildkit_df 2>/dev/null; then LINE=$(tail -n1 /tmp/buildkit_df) @@ -297,7 +297,17 @@ jobs: if [ "$FAILURES" -gt 0 ]; then echo "### Failed Builds" >> "$GITHUB_STEP_SUMMARY" echo "" >> "$GITHUB_STEP_SUMMARY" - cat $MANIFEST_FILES | python -c 'import sys,json; [print(f"- `{d.get(\"base_image\",\"unknown\")}`: {d.get(\"error\",\"No tags generated\")}") for d in map(json.loads, sys.stdin) if d.get("error") or not d.get("tags")]' >> "$GITHUB_STEP_SUMMARY" + cat $MANIFEST_FILES | python3 - <<'PY' >> "$GITHUB_STEP_SUMMARY" +import json +import sys + +for line in sys.stdin: + data = json.loads(line.strip()) + if data.get("error") or not data.get("tags"): + base = data.get("base_image", "unknown") + err = data.get("error") or "No tags generated" + print(f"- `{base}`: {err}") +PY fi if [ "$FAILURES" -gt 0 ]; then diff --git a/benchmarks/utils/build_utils.py b/benchmarks/utils/build_utils.py index 520500fe..62cb7e79 100644 --- a/benchmarks/utils/build_utils.py +++ b/benchmarks/utils/build_utils.py @@ -6,6 +6,7 @@ import argparse import contextlib import io +import os import subprocess import sys import time @@ -443,11 +444,11 @@ def build_all_images( failures = 0 mu = Lock() - # Fixed batch/prune settings (hardcoded to avoid extra CLI surface) - batch_size = 50 - prune_keep_storage_gb = 450 - prune_threshold_pct = 85.0 - prune_filters: list[str] | None = ["unused-for=24h"] + # Batch/prune settings (tunable via env to control disk usage on sticky runners) + batch_size = int(os.getenv("BUILD_BATCH_SIZE", "25")) + prune_keep_storage_gb = int(os.getenv("BUILDKIT_PRUNE_KEEP_GB", "120")) + prune_threshold_pct = float(os.getenv("BUILDKIT_PRUNE_THRESHOLD_PCT", "70")) + prune_filters: list[str] | None = ["unused-for=12h"] def _chunks(seq: list[str], size: int): if size <= 0: From 1af454086fe271959469ebc9a2724a7ad2183f8d Mon Sep 17 00:00:00 2001 From: Simon 
Rosenberg Date: Wed, 7 Jan 2026 12:20:00 +0100 Subject: [PATCH 4/4] Fix swebench summary script indentation --- .github/workflows/build-swebench-images.yml | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/workflows/build-swebench-images.yml b/.github/workflows/build-swebench-images.yml index da802683..2816c902 100644 --- a/.github/workflows/build-swebench-images.yml +++ b/.github/workflows/build-swebench-images.yml @@ -298,16 +298,16 @@ jobs: echo "### Failed Builds" >> "$GITHUB_STEP_SUMMARY" echo "" >> "$GITHUB_STEP_SUMMARY" cat $MANIFEST_FILES | python3 - <<'PY' >> "$GITHUB_STEP_SUMMARY" -import json -import sys - -for line in sys.stdin: - data = json.loads(line.strip()) - if data.get("error") or not data.get("tags"): - base = data.get("base_image", "unknown") - err = data.get("error") or "No tags generated" - print(f"- `{base}`: {err}") -PY + import json + import sys + + for line in sys.stdin: + data = json.loads(line.strip()) + if data.get("error") or not data.get("tags"): + base = data.get("base_image", "unknown") + err = data.get("error") or "No tags generated" + print(f"- `{base}`: {err}") + PY fi if [ "$FAILURES" -gt 0 ]; then