From 2b03b6383609f2cf68f5bd2984b52802af6063a1 Mon Sep 17 00:00:00 2001
From: Simon Rosenberg <simonrosen10@gmail.com>
Date: Wed, 7 Jan 2026 10:28:16 +0100
Subject: [PATCH 1/4] Stabilize SWE-bench buildx with batched pruning and
 fail-fast

---
 .github/workflows/build-swebench-images.yml |  41 ++++-
 benchmarks/utils/build_utils.py             | 179 ++++++++++++--------
 benchmarks/utils/buildx_utils.py            |  79 +++++++++
 3 files changed, 223 insertions(+), 76 deletions(-)

diff --git a/.github/workflows/build-swebench-images.yml b/.github/workflows/build-swebench-images.yml
index 78fc2a3b..7c491383 100644
--- a/.github/workflows/build-swebench-images.yml
+++ b/.github/workflows/build-swebench-images.yml
@@ -185,6 +185,33 @@ jobs:
         run: |
           make build
 
+      - name: "Preflight: prune cache and verify BuildKit disk"
+        run: |
+          set -euo pipefail
+          KEEP_GB=450
+          echo "Pruning BuildKit cache (keep ${KEEP_GB} GiB, filter unused-for=24h)..."
+          docker buildx prune --all --force --keep-storage ${KEEP_GB}g --filter unused-for=24h || true
+
+          if df -B1 /var/lib/buildkit > /tmp/buildkit_df 2>/dev/null; then
+            LINE=$(tail -n1 /tmp/buildkit_df)
+            TOTAL=$(echo "$LINE" | awk '{print $2}')
+            USED=$(echo "$LINE" | awk '{print $3}')
+            FREE=$(echo "$LINE" | awk '{print $4}')
+            if [ -n "$TOTAL" ] && [ -n "$FREE" ]; then
+              PCT=$(( 100 * USED / TOTAL ))
+              echo "BuildKit disk: used ${USED} / ${TOTAL} bytes (${PCT}%); free ${FREE} bytes"
+              MIN=$((75 * 1024 * 1024 * 1024))
+              if [ "$FREE" -lt "$MIN" ]; then
+                echo "::error::Not enough free space on /var/lib/buildkit (${FREE} bytes free, need >= ${MIN})"
+                exit 1
+              fi
+            else
+              echo "Warning: unable to parse df output for /var/lib/buildkit"
+            fi
+          else
+            echo "Warning: /var/lib/buildkit not found; skipping disk check"
+          fi
+
       - name: Build and push SWE-Bench images
         run: |
           set -euo pipefail
@@ -270,14 +297,12 @@ jobs:
           if [ "$FAILURES" -gt 0 ]; then
             echo "### Failed Builds" >> "$GITHUB_STEP_SUMMARY"
             echo "" >> "$GITHUB_STEP_SUMMARY"
-            cat $MANIFEST_FILES | python -c "
-          import sys
-          import json
-          for line in sys.stdin:
-              data = json.loads(line.strip())
-              if data.get('error') is not None or len(data.get('tags', [])) == 0:
-                  print(f\"- \\\`{data.get('base_image', 'unknown')}\\\`: {data.get('error', 'No tags generated')}\")
-          " >> "$GITHUB_STEP_SUMMARY"
+            cat $MANIFEST_FILES | python -c 'import sys,json; [print("- `{}`: {}".format(d.get("base_image","unknown"), d.get("error") or "No tags generated")) for d in map(json.loads, filter(str.strip, sys.stdin)) if d.get("error") or not d.get("tags")]' >> "$GITHUB_STEP_SUMMARY"
+          fi
+
+          if [ "$FAILURES" -gt 0 ]; then
+            echo "::error::Detected $FAILURES failed or missing agent-server images out of $TOTAL"
+            exit 1
           fi
 
       - name: Comment on tracker issue
diff --git a/benchmarks/utils/build_utils.py b/benchmarks/utils/build_utils.py
index dbca8ced..520500fe 100644
--- a/benchmarks/utils/build_utils.py
+++ b/benchmarks/utils/build_utils.py
@@ -20,9 +20,12 @@
 from tqdm.auto import tqdm
 
 from benchmarks.utils.args_parser import get_parser
-from benchmarks.utils.buildx_utils import maybe_reset_buildkit
+from benchmarks.utils.buildx_utils import (
+    buildkit_disk_usage,
+    maybe_prune_buildkit_cache,
+    maybe_reset_buildkit,
+)
 from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE
-from benchmarks.utils.image_utils import image_exists
 from openhands.agent_server.docker.build import BuildOptions, TargetType, build
 from openhands.sdk import get_logger
 
@@ -293,11 +296,6 @@ def build_image(
         git_sha=git_sha,
         sdk_version=sdk_version,
     )
-    for t in opts.all_tags:
-        # Check if image exists or not
-        if image_exists(t):
-            logger.info("Image %s already exists. Skipping build.", t)
-            return BuildOutput(base_image=base_image, tags=[t], error=None)
     tags = build(opts)
     return BuildOutput(base_image=base_image, tags=tags, error=None)
 
@@ -443,9 +441,24 @@ def build_all_images(
 
     successes = 0
     failures = 0
-    in_progress: set[str] = set()
     mu = Lock()
 
+    # Fixed batch/prune settings (hardcoded to avoid extra CLI surface)
+    batch_size = 50
+    prune_keep_storage_gb = 450
+    prune_threshold_pct = 85.0
+    prune_filters: list[str] | None = ["unused-for=24h"]
+
+    def _chunks(seq: list[str], size: int):
+        if size <= 0:
+            yield seq
+            return
+        for i in range(0, len(seq), size):
+            yield seq[i : i + size]
+
+    batches = list(_chunks(base_images, batch_size or len(base_images)))
+    total_batches = len(batches)
+
     with (
         manifest_file.open("w") as writer,
         tqdm(
@@ -454,70 +467,67 @@ def build_all_images(
     ):
         _update_pbar(pbar, successes, failures, 0, None, "Queueing")
 
-        # Single unified path: ProcessPoolExecutor( max_workers = args.max_workers ),
-        # even if it's 1. Using processes instead of threads ensures proper isolation
-        # of stdout/stderr and logging handlers, preventing output mixing between builds.
-        with ProcessPoolExecutor(max_workers=max_workers) as ex:
-            futures = {}
-            for base in base_images:
-                in_progress.add(base)
-                # Resolve custom tags before scheduling to avoid pickling issues with closures.
-                resolved_tag = (
-                    base_image_to_custom_tag_fn(base)
-                    if base_image_to_custom_tag_fn
-                    else ""
-                )
-                fut = ex.submit(
-                    _build_with_logging,
-                    log_dir=build_log_dir,
-                    base_image=base,
-                    target_image=image,
-                    custom_tag=resolved_tag,
-                    target=target,
-                    push=push,
-                    max_retries=max_retries,
-                    post_build_fn=post_build_fn,
-                )
-                futures[fut] = base
-
-            _update_pbar(
-                pbar,
-                successes,
-                failures,
-                len(in_progress),
-                next(iter(in_progress), None),
-                "Running",
-            )
+        for batch_idx, batch in enumerate(batches, start=1):
+            if not batch:
+                continue
 
-            for fut in as_completed(futures):
-                base = futures[fut]
-                try:
-                    result: BuildOutput = fut.result()
-                    writer.write(result.model_dump_json() + "\n")
-                    writer.flush()
-                    with mu:
-                        successes += 1
-                    _update_pbar(
-                        pbar, successes, failures, len(in_progress), base, "✅ Done"
+            logger.info(
+                "Starting batch %d/%d (%d images)", batch_idx, total_batches, len(batch)
+            )
+            in_progress: set[str] = set()
+
+            with ProcessPoolExecutor(max_workers=max_workers) as ex:
+                futures = {}
+                for base in batch:
+                    in_progress.add(base)
+                    resolved_tag = (
+                        base_image_to_custom_tag_fn(base)
+                        if base_image_to_custom_tag_fn
+                        else ""
                     )
-                except Exception as e:
-                    logger.error("Build failed for %s: %r", base, e)
-                    # Write a failure line to manifest; keep going.
-                    writer.write(
-                        BuildOutput(
-                            base_image=base, tags=[], error=repr(e)
-                        ).model_dump_json()
-                        + "\n"
+                    fut = ex.submit(
+                        _build_with_logging,
+                        log_dir=build_log_dir,
+                        base_image=base,
+                        target_image=image,
+                        custom_tag=resolved_tag,
+                        target=target,
+                        push=push,
+                        max_retries=max_retries,
+                        post_build_fn=post_build_fn,
                     )
+                    futures[fut] = base
+
+                _update_pbar(
+                    pbar,
+                    successes,
+                    failures,
+                    len(in_progress),
+                    next(iter(in_progress), None),
+                    f"Batch {batch_idx}/{total_batches} running",
+                )
+
+                for fut in as_completed(futures):
+                    base = futures[fut]
+                    status = None
+                    try:
+                        result: BuildOutput = fut.result()
+                    except Exception as e:
+                        logger.error("Build failed for %s: %r", base, e)
+                        result = BuildOutput(base_image=base, tags=[], error=repr(e))
+
+                    writer.write(result.model_dump_json() + "\n")
                     writer.flush()
+
                     with mu:
-                        failures += 1
-                    _update_pbar(
-                        pbar, successes, failures, len(in_progress), base, "❌ Failed"
-                    )
-                finally:
-                    with mu:
-                        in_progress.discard(base)
+                        if result.error or not result.tags:
+                            failures += 1
+                            status = "❌ Failed"
+                        else:
+                            successes += 1
+                            status = "✅ Done"
+
+                    in_progress.discard(base)
                     pbar.update(1)
                     _update_pbar(
                         pbar,
@@ -525,7 +535,40 @@ def build_all_images(
                         failures,
                         len(in_progress),
                         next(iter(in_progress), None),
-                        None,
+                        status,
+                    )
+
+            used, total = buildkit_disk_usage()
+            if total > 0:
+                logger.info(
+                    "BuildKit usage after batch %d/%d: %.2f%% (%0.2f GiB / %0.2f GiB)",
+                    batch_idx,
+                    total_batches,
+                    (used / total) * 100,
+                    used / (1 << 30),
+                    total / (1 << 30),
+                )
+
+            if prune_keep_storage_gb and prune_keep_storage_gb > 0:
+                pruned = maybe_prune_buildkit_cache(
+                    keep_storage_gb=prune_keep_storage_gb,
+                    threshold_pct=prune_threshold_pct,
+                    filters=prune_filters,
+                )
+                if pruned:
+                    logger.info(
+                        "Pruned BuildKit cache after batch %d/%d (keep=%d GiB, threshold=%.1f%%)",
+                        batch_idx,
+                        total_batches,
+                        prune_keep_storage_gb,
+                        prune_threshold_pct,
+                    )
+                else:
+                    logger.info(
+                        "No prune needed after batch %d/%d (threshold %.1f%%)",
+                        batch_idx,
+                        total_batches,
+                        prune_threshold_pct,
                     )
     logger.info(
         "Done. Built=%d  Failed=%d  Manifest=%s",
diff --git a/benchmarks/utils/buildx_utils.py b/benchmarks/utils/buildx_utils.py
index 14e8182c..c08bd46e 100644
--- a/benchmarks/utils/buildx_utils.py
+++ b/benchmarks/utils/buildx_utils.py
@@ -6,6 +6,7 @@
 import json
 import os
 import re
+import shutil
 import subprocess
 import time
 from pathlib import Path
@@ -143,3 +144,81 @@ def maybe_reset_buildkit(
         reset_buildkit("partial", base_image, target_image)
     else:
         reset_buildkit("full", base_image, target_image)
+
+
+def buildkit_disk_usage(root: str | Path = "/var/lib/buildkit") -> tuple[int, int]:
+    """
+    Return (used_bytes, total_bytes) from shutil.disk_usage(root); (0, 0) on any error.
+    """
+    path = Path(root)
+    try:
+        usage = shutil.disk_usage(path)
+        return usage.used, usage.total
+    except FileNotFoundError:
+        logger.warning("BuildKit root %s not found when checking disk usage", path)
+    except Exception as e:
+        logger.warning("Unable to read disk usage for %s: %s", path, e)
+    return 0, 0
+
+
+def prune_buildkit_cache(
+    keep_storage_gb: int | None = None,
+    filters: list[str] | None = None,
+) -> None:
+    """
+    Run `docker buildx prune --all --force`; raise RuntimeError on a non-zero exit.
+    keep_storage_gb: passed as `--keep-storage <n>g`; None or <= 0 omits the flag.
+    filters: optional list of buildx prune --filter values.
+    """
+    cmd = ["docker", "buildx", "prune", "--all", "--force"]
+    if keep_storage_gb is not None and keep_storage_gb > 0:
+        cmd += ["--keep-storage", f"{keep_storage_gb}g"]
+    if filters:
+        for f in filters:
+            cmd += ["--filter", f]
+
+    logger.info("Pruning BuildKit cache: %s", " ".join(cmd))
+    proc = subprocess.run(cmd, text=True, capture_output=True)
+    if proc.stdout:
+        logger.info(proc.stdout.strip())
+    if proc.stderr:
+        logger.warning(proc.stderr.strip())
+    if proc.returncode != 0:
+        raise RuntimeError(
+            proc.stderr.strip()
+            or proc.stdout.strip()
+            or f"docker buildx prune failed with exit code {proc.returncode}"
+        )
+
+
+def maybe_prune_buildkit_cache(
+    keep_storage_gb: int,
+    threshold_pct: float,
+    filters: list[str] | None = None,
+    root: str | Path = "/var/lib/buildkit",
+) -> bool:
+    """
+    Prune cache when disk usage is at or above threshold_pct (0-100).
+    Returns True only if the prune ran without error; False if skipped or failed.
+    """
+    used, total = buildkit_disk_usage(root)
+    if total <= 0:
+        logger.warning("Skipping BuildKit prune; unable to determine disk usage.")
+        return False
+
+    usage_pct = (used / total) * 100
+    logger.info(
+        "BuildKit disk usage: %.2f%% (%0.2f GiB used / %0.2f GiB total)",
+        usage_pct,
+        used / (1 << 30),
+        total / (1 << 30),
+    )
+    if usage_pct < threshold_pct:
+        return False
+
+    try:
+        prune_buildkit_cache(keep_storage_gb=keep_storage_gb, filters=filters)
+        return True
+    except Exception as e:
+        logger.warning("Failed to prune BuildKit cache: %s", e)
+        return False

From 47ee57418fef0d7b643e5b7e7b0204561caaf4f9 Mon Sep 17 00:00:00 2001
From: Simon Rosenberg <simonrosen10@gmail.com>
Date: Wed, 7 Jan 2026 10:16:26 +0100
Subject: [PATCH 2/4] Improve Modal eval stability and mamba handling

---
 benchmarks/utils/dataset.py             |  43 +-
 benchmarks/utils/modal_patches.py       | 784 ++++++++++++++++++++++++
 benchmarks/utils/modal_sitecustomize.py |  30 +
 benchmarks/utils/sitecustomize.py       | 167 +----
 sitecustomize.py                        |   5 +
 5 files changed, 860 insertions(+), 169 deletions(-)
 create mode 100644 benchmarks/utils/modal_patches.py
 create mode 100644 benchmarks/utils/modal_sitecustomize.py

diff --git a/benchmarks/utils/dataset.py b/benchmarks/utils/dataset.py
index 42d609bf..a60356ca 100644
--- a/benchmarks/utils/dataset.py
+++ b/benchmarks/utils/dataset.py
@@ -1,5 +1,7 @@
 from __future__ import annotations
 
+import os
+import time
 from typing import cast
 
 import pandas as pd
@@ -24,8 +26,6 @@ def _load_selected_instances(select_file_path: str) -> set[str]:
         FileNotFoundError: If the select file doesn't exist
         ValueError: If the file is empty
     """
-    import os
-
     if not os.path.isfile(select_file_path):
         raise FileNotFoundError(f"Select file not found: {select_file_path}")
 
@@ -72,6 +72,40 @@ def prepare_dataset(
     return dataset
 
 
+def _load_hf_dataset_with_retry(dataset_name: str, split: str) -> Dataset:
+    """Load a Hugging Face dataset with retries and longer (default 60s) timeouts."""
+    # ETAG/DOWNLOAD are the hub timeout vars. NOTE(review): maybe read at import only.
+    for k in ("HF_HUB_HTTP_TIMEOUT", "HF_HUB_ETAG_TIMEOUT", "HF_HUB_DOWNLOAD_TIMEOUT"):
+        os.environ.setdefault(k, os.environ.get("HF_HUB_HTTP_TIMEOUT", "60"))
+
+    attempts = 5
+    backoff = 5.0
+    last_exc: Exception | None = None
+
+    for attempt in range(1, attempts + 1):
+        try:
+            dataset = load_dataset(dataset_name, split=split)
+            assert isinstance(dataset, Dataset)
+            return dataset
+        except Exception as exc:
+            last_exc = exc
+            if attempt == attempts:
+                break
+            wait = min(backoff, 60.0)
+            logger.warning(
+                "load_dataset failed (attempt %s/%s): %s; retrying in %.1fs",
+                attempt,
+                attempts,
+                exc,
+                wait,
+            )
+            time.sleep(wait)
+            backoff *= 2
+
+    assert last_exc is not None
+    raise last_exc
+
+
 def get_dataset(
     dataset_name: str,
     split: str,
@@ -79,8 +113,6 @@ def get_dataset(
     selected_instances_file: str | None = None,
 ) -> pd.DataFrame:
     """Load and prepare dataset for evaluation."""
-    import os
-
     # Check if dataset_name is a local file path
     if os.path.isfile(dataset_name) and dataset_name.endswith(".jsonl"):
         # Load local JSONL file
@@ -90,8 +122,7 @@ def get_dataset(
         assert isinstance(df, pd.DataFrame)
     else:
         # Load dataset from HuggingFace Hub
-        dataset = load_dataset(dataset_name, split=split)
-        assert isinstance(dataset, Dataset)
+        dataset = _load_hf_dataset_with_retry(dataset_name, split)
         df = dataset.to_pandas()
         assert isinstance(df, pd.DataFrame)
 
diff --git a/benchmarks/utils/modal_patches.py b/benchmarks/utils/modal_patches.py
new file mode 100644
index 00000000..2a614388
--- /dev/null
+++ b/benchmarks/utils/modal_patches.py
@@ -0,0 +1,784 @@
+"""
+Shared Modal patch helpers for host and in-image sitecustomize.
+"""
+
+from __future__ import annotations
+
+import json
+import os
+import sys
+import time
+import traceback
+
+
+_MODAL_SITECUSTOMIZE_INJECTED = False
+DEFAULT_AGENT_IMAGE = "ghcr.io/openhands/eval-agent-server"
+DEFAULT_BUILD_TARGET = "source-minimal"
+
+
+def _log(message: str) -> None:
+    print(message, file=sys.stderr, flush=True)
+
+
+def _make_emit(stderr: bool):
+    if stderr:
+
+        def emit(message: str) -> None:
+            print(message, file=sys.stderr, flush=True)
+
+    else:
+
+        def emit(message: str) -> None:
+            print(message)
+
+    return emit
+
+
+def _get_sdk_short_sha() -> str:
+    """
+    Resolve SDK short SHA from the benchmarks repo when available, otherwise
+    fall back to environment variables for the Modal function image.
+    """
+    try:
+        from benchmarks.utils.version import SDK_SHORT_SHA as version_sdk_short_sha
+
+        return version_sdk_short_sha
+    except Exception:
+        return os.getenv("SDK_SHORT_SHA", "").strip() or "unknown"
+
+
+def _get_agent_server_image_repo() -> str:
+    return (
+        os.getenv("EVAL_AGENT_SERVER_IMAGE", DEFAULT_AGENT_IMAGE).strip()
+        or DEFAULT_AGENT_IMAGE
+    )
+
+
+def _get_build_target() -> str:
+    return (
+        os.getenv("SWEBENCH_IMAGE_TARGET")
+        or os.getenv("SWEBENCH_BUILD_TARGET")
+        or DEFAULT_BUILD_TARGET
+    )
+
+
+def _get_custom_tag_from_instance_id(instance_id: str) -> str:
+    try:
+        repo, name = instance_id.split("__", 1)
+    except Exception as exc:
+        raise RuntimeError(
+            f"Unable to compute SWE-bench image tag; unexpected instance id: {instance_id}"
+        ) from exc
+    return f"sweb.eval.x86_64.{repo}_1776_{name}".lower()
+
+
+def _build_prebuilt_image_tag(test_spec) -> str:
+    instance_id = getattr(test_spec, "instance_id", None)
+    if not instance_id:
+        raise RuntimeError("TestSpec missing instance_id; cannot select Modal image")
+
+    sdk_short_sha = _get_sdk_short_sha()
+    if sdk_short_sha in ("", "unknown", None):
+        raise RuntimeError(
+            "SDK short SHA is unavailable. Set SDK_SHORT_SHA or ensure the "
+            "benchmarks repository has an initialized SDK submodule."
+        )
+
+    target = _get_build_target()
+    suffix = f"-{target}" if target and target != "binary" else ""
+    custom_tag = _get_custom_tag_from_instance_id(instance_id)
+    agent_repo = _get_agent_server_image_repo()
+    return f"{agent_repo}:{sdk_short_sha}-{custom_tag}{suffix}"
+
+
+def _patch_modal_sklearn_install_flag() -> None:
+    """
+    pip>=25 removed `--no-use-pep517`, but the scikit-learn specs still pass it.
+    When Modal builds the sandbox image, pip fails before tests ever run. Mutate
+    the specs in-place to drop that flag for all scikit-learn versions.
+    """
+    try:
+        # The constants module aliases SPECS_SKLEARN into MAP_REPO_VERSION_TO_SPECS,
+        # so mutating the dict is sufficient as long as imports share the object.
+        import swebench.harness.constants as consts
+        import swebench.harness.constants.python as py_consts
+    except Exception:
+        return
+
+    for version, spec in py_consts.SPECS_SKLEARN.items():
+        install_cmd = spec.get("install", "")
+        if "--no-use-pep517" not in install_cmd:
+            continue
+
+        cleaned = " ".join(install_cmd.replace("--no-use-pep517", "").split())
+        py_consts.SPECS_SKLEARN[version]["install"] = cleaned
+
+        repo_specs = consts.MAP_REPO_VERSION_TO_SPECS.get("scikit-learn/scikit-learn")
+        if isinstance(repo_specs, dict):
+            repo_specs[version] = py_consts.SPECS_SKLEARN[version]
+
+    # Best-effort patch; stay silent if nothing needed or imports fail.
+    return
+
+
+def _patch_modal_sandbox_cgroup_retry() -> None:
+    """Retry cgroup writes to avoid transient Modal filesystem errors."""
+    try:
+        from swebench.harness.modal_eval import run_evaluation_modal as mod
+    except Exception:
+        return
+
+    runtime_cls = getattr(mod, "ModalSandboxRuntime", None)
+    if runtime_cls is None:
+        return
+
+    original_write_file = runtime_cls.write_file
+    if getattr(original_write_file, "_benchmarks_retry_patch", False):
+        return
+
+    try:
+        from modal.exception import FilesystemExecutionError
+    except Exception:
+        FilesystemExecutionError = Exception
+
+    def write_file_with_retry(self, file_path: str, content: str):
+        target_path = "/sys/fs/cgroup/cpu/cpu.shares"
+        attempts = 5
+        delay = 1.0
+        path_str = str(file_path)
+        for attempt in range(1, attempts + 1):
+            try:
+                return original_write_file(self, file_path, content)
+            except Exception as exc:
+                if path_str != target_path or not isinstance(
+                    exc, FilesystemExecutionError
+                ):
+                    raise
+                if attempt == attempts:
+                    raise
+                time.sleep(delay)
+                delay = min(delay * 2, 10.0)
+
+    setattr(write_file_with_retry, "_benchmarks_retry_patch", True)
+    runtime_cls.write_file = write_file_with_retry
+
+
+def _patch_modal_prebuilt_images(
+    log_errors: bool = False, stderr: bool = False
+) -> None:
+    """Use prebuilt SWE-Bench images in Modal instead of rebuilding per instance."""
+    try:
+        from swebench.harness.modal_eval import run_evaluation_modal as mod
+    except Exception as exc:
+        if log_errors:
+            _log(
+                f"[benchmarks] modal sitecustomize: failed to import modal_eval: {exc}"
+            )
+        return
+
+    runtime_cls = getattr(mod, "ModalSandboxRuntime", None)
+    if runtime_cls is None:
+        if log_errors:
+            _log("[benchmarks] modal sitecustomize: ModalSandboxRuntime missing")
+        return
+
+    original_get_instance_image = getattr(runtime_cls, "get_instance_image", None)
+    if original_get_instance_image is None:
+        if log_errors:
+            _log("[benchmarks] modal sitecustomize: get_instance_image missing")
+        return
+    if getattr(original_get_instance_image, "_benchmarks_prebuilt_patch", False):
+        return
+
+    emit = _make_emit(stderr)
+
+    def get_instance_image_from_registry(test_spec):
+        import modal
+
+        instance_id = getattr(test_spec, "instance_id", "unknown")
+        try:
+            image_tag = _build_prebuilt_image_tag(test_spec)
+        except Exception as exc:
+            emit(
+                "[benchmarks] Modal image spec failed to compute tag for "
+                f"{instance_id}: {exc}"
+            )
+            raise
+
+        emit(
+            "[benchmarks] Modal image spec using prebuilt image "
+            f"{image_tag} for {instance_id}"
+        )
+        try:
+            image = modal.Image.from_registry(image_tag)
+        except Exception as exc:
+            emit(
+                "[benchmarks] Failed to load Modal image from registry "
+                f"{image_tag}: {exc}"
+            )
+            raise
+
+        # Upstream expects /testbed as the working directory when running evals.
+        return image.workdir("/testbed/")
+
+    setattr(get_instance_image_from_registry, "_benchmarks_prebuilt_patch", True)
+    runtime_cls.get_instance_image = staticmethod(get_instance_image_from_registry)
+    if log_errors:
+        _log("[benchmarks] modal sitecustomize: applied prebuilt image patch")
+
+
+def _patch_modal_sandbox_timing(log_errors: bool = False, stderr: bool = False) -> None:
+    """Log sandbox creation timing to pinpoint Modal startup delays."""
+    try:
+        from swebench.harness.modal_eval import run_evaluation_modal as mod
+    except Exception as exc:
+        if log_errors:
+            _log(
+                f"[benchmarks] modal sitecustomize: failed to import modal_eval: {exc}"
+            )
+        return
+
+    runtime_cls = getattr(mod, "ModalSandboxRuntime", None)
+    if runtime_cls is None:
+        if log_errors:
+            _log("[benchmarks] modal sitecustomize: ModalSandboxRuntime missing")
+        return
+
+    original_get_sandbox = runtime_cls._get_sandbox
+    if getattr(original_get_sandbox, "_benchmarks_timing_patch", False):
+        return
+
+    emit = _make_emit(stderr)
+
+    def get_sandbox_with_timing(self, timeout: int | None = None):
+        instance_id = getattr(
+            getattr(self, "test_spec", None), "instance_id", "unknown"
+        )
+        start = time.time()
+        emit(
+            f"[benchmarks] Modal sandbox create start for {instance_id} "
+            f"(timeout={timeout})"
+        )
+        try:
+            return original_get_sandbox(self, timeout)
+        finally:
+            elapsed = time.time() - start
+            emit(
+                f"[benchmarks] Modal sandbox create end for {instance_id} "
+                f"(elapsed={elapsed:.2f}s)"
+            )
+
+    setattr(get_sandbox_with_timing, "_benchmarks_timing_patch", True)
+    runtime_cls._get_sandbox = get_sandbox_with_timing
+    if log_errors:
+        _log("[benchmarks] modal sitecustomize: applied sandbox timing patch")
+
+
+def _patch_modal_runtime_debug(log_errors: bool = False, stderr: bool = False) -> None:
+    """Log Modal runtime init and critical exec timings for debugging."""
+    try:
+        from swebench.harness.modal_eval import run_evaluation_modal as mod
+    except Exception as exc:
+        if log_errors:
+            _log(
+                f"[benchmarks] modal sitecustomize: failed to import modal_eval: {exc}"
+            )
+        return
+
+    runtime_cls = getattr(mod, "ModalSandboxRuntime", None)
+    if runtime_cls is None:
+        if log_errors:
+            _log("[benchmarks] modal sitecustomize: ModalSandboxRuntime missing")
+        return
+
+    emit = _make_emit(stderr)
+
+    original_init = runtime_cls.__init__
+    if not getattr(original_init, "_benchmarks_runtime_init_patch", False):
+
+        def init_with_logging(
+            self, test_spec, timeout: int | None = None, verbose=True
+        ):
+            instance_id = getattr(test_spec, "instance_id", "unknown")
+            emit(
+                f"[benchmarks] Modal runtime init start for {instance_id} "
+                f"(timeout={timeout})"
+            )
+            start = time.time()
+            try:
+                return original_init(self, test_spec, timeout, verbose)
+            finally:
+                elapsed = time.time() - start
+                emit(
+                    f"[benchmarks] Modal runtime init end for {instance_id} "
+                    f"(elapsed={elapsed:.2f}s)"
+                )
+
+        setattr(init_with_logging, "_benchmarks_runtime_init_patch", True)
+        runtime_cls.__init__ = init_with_logging
+
+    original_exec = runtime_cls.exec
+    if not getattr(original_exec, "_benchmarks_runtime_exec_patch", False):
+
+        def exec_with_logging(self, command: str):
+            instance_id = getattr(
+                getattr(self, "test_spec", None), "instance_id", "unknown"
+            )
+            label = None
+            if "/root/eval.sh" in command:
+                label = "eval"
+            elif "git apply" in command or "patch --batch" in command:
+                label = "apply_patch"
+
+            if label:
+                emit(f"[benchmarks] Modal exec start for {instance_id} ({label})")
+                start = time.time()
+                output, returncode = original_exec(self, command)
+                elapsed = time.time() - start
+                emit(
+                    f"[benchmarks] Modal exec end for {instance_id} ({label}) "
+                    f"(elapsed={elapsed:.2f}s, returncode={returncode})"
+                )
+                return output, returncode
+
+            return original_exec(self, command)
+
+        setattr(exec_with_logging, "_benchmarks_runtime_exec_patch", True)
+        runtime_cls.exec = exec_with_logging
+
+    if log_errors:
+        _log("[benchmarks] modal sitecustomize: applied runtime debug patch")
+
+
+def _patch_modal_function_timeout(
+    timeout_seconds: int = 4 * 60 * 60, log_errors: bool = False
+) -> None:
+    """Re-register run_instance_modal with a timeout floor and stderr timing logs."""
+    try:
+        from swebench.harness.modal_eval import run_evaluation_modal as mod
+    except Exception as exc:
+        if log_errors:
+            _log(
+                f"[benchmarks] modal sitecustomize: failed to import modal_eval: {exc}"
+            )
+        return
+
+    run_fn = getattr(mod, "run_instance_modal", None)
+    if run_fn is None:
+        if log_errors:
+            _log("[benchmarks] modal sitecustomize: run_instance_modal missing")
+        return
+    if getattr(run_fn, "_benchmarks_timeout_patch", False):  # already patched
+        return
+
+    raw_f = getattr(getattr(run_fn, "info", None), "raw_f", None)  # called by wrapper
+    if raw_f is None:
+        if log_errors:
+            _log("[benchmarks] modal sitecustomize: run_instance_modal raw_f missing")
+        return
+
+    image = getattr(getattr(run_fn, "spec", None), "image", None)
+    if image is None:  # no spec image: use the module-level swebench_image
+        image = getattr(mod, "swebench_image", None)
+
+    def run_instance_modal_with_logging(test_spec, pred, run_id, timeout=None):
+        instance_id = getattr(test_spec, "instance_id", None) or pred.get(
+            "instance_id", "unknown"
+        )
+        effective_timeout = timeout
+        if timeout is None or timeout < timeout_seconds:  # enforce the floor
+            effective_timeout = timeout_seconds
+            print(
+                "[benchmarks] Modal function overriding timeout "
+                f"instance={instance_id} from {timeout} to {effective_timeout}",
+                file=sys.stderr,
+                flush=True,
+            )
+        start = time.time()
+        print(
+            "[benchmarks] Modal function start "
+            f"instance={instance_id} run_id={run_id} timeout={effective_timeout}",
+            file=sys.stderr,
+            flush=True,
+        )
+        try:
+            result = raw_f(test_spec, pred, run_id, effective_timeout)
+        except Exception as exc:
+            elapsed = time.time() - start
+            print(
+                "[benchmarks] Modal function error "
+                f"instance={instance_id} elapsed={elapsed:.2f}s error={exc}",
+                file=sys.stderr,
+                flush=True,
+            )
+            raise  # logged above; propagate unchanged
+        elapsed = time.time() - start
+        status = "errored" if getattr(result, "errored", False) else "ok"
+        print(
+            "[benchmarks] Modal function end "
+            f"instance={instance_id} elapsed={elapsed:.2f}s status={status}",
+            file=sys.stderr,
+            flush=True,
+        )
+        return result
+
+    try:  # re-register under the same name with the raised timeout
+        patched_fn = mod.app.function(
+            image=image,
+            timeout=timeout_seconds,
+            include_source=True,
+            serialized=True,
+            name="run_instance_modal",
+        )(run_instance_modal_with_logging)
+    except Exception as exc:
+        if log_errors:
+            _log(f"[benchmarks] modal sitecustomize: failed to patch timeout: {exc}")
+        return
+
+    setattr(patched_fn, "_benchmarks_timeout_patch", True)  # idempotence marker
+    mod.run_instance_modal = patched_fn
+    if log_errors:
+        _log(
+            "[benchmarks] modal sitecustomize: patched function timeout "
+            f"to {timeout_seconds}s"
+        )
+
+
+def _inject_modal_sitecustomize() -> None:
+    """Bake sitecustomize, modal_patches and env vars into the Modal function image."""
+    global _MODAL_SITECUSTOMIZE_INJECTED
+
+    if _MODAL_SITECUSTOMIZE_INJECTED:  # already injected in this process
+        return
+
+    try:
+        from pathlib import Path
+
+        from swebench.harness.modal_eval import run_evaluation_modal as mod
+    except Exception:  # imports unavailable: nothing to patch
+        return
+
+    patch_path = Path(__file__).with_name("modal_sitecustomize.py")
+    if not patch_path.exists():  # sitecustomize source missing next to this file
+        return
+
+    patches_path = Path(__file__).with_name("modal_patches.py")  # optional
+
+    run_fn = getattr(mod, "run_instance_modal", None)
+    if run_fn is None or not hasattr(run_fn, "spec"):
+        return
+
+    image = run_fn.spec.image
+
+    # Rebuild from the base swebench image so add_local_file mounts (from the
+    # original function definition) are converted to copies. Modal rejects
+    # adding build steps after mount layers.
+    base_image = getattr(mod, "swebench_image", None)
+    entry_local = getattr(mod, "LOCAL_SANDBOX_ENTRYPOINT_PATH", None)
+    entry_remote = getattr(mod, "REMOTE_SANDBOX_ENTRYPOINT_PATH", None)
+    if base_image is not None and entry_local is not None and entry_remote is not None:
+        image = base_image.add_local_file(
+            Path(entry_local),
+            str(entry_remote),
+            copy=True,
+        )
+
+    patched_image = image.add_local_file(
+        patch_path,
+        "/root/sitecustomize.py",
+        copy=True,
+    )
+
+    if patches_path.exists():
+        patched_image = patched_image.add_local_file(
+            patches_path,
+            "/root/modal_patches.py",
+            copy=True,
+        )
+
+    env_vars = {"PYTHONPATH": "/root"}  # files above are copied into /root
+    try:
+        from benchmarks.utils.version import SDK_SHA, SDK_SHORT_SHA
+
+        env_vars["SDK_SHA"] = SDK_SHA
+        env_vars["SDK_SHORT_SHA"] = SDK_SHORT_SHA
+    except Exception:  # fall back to env var / helper for the SHA values
+        sdk_sha_env = os.getenv("SDK_SHA")
+        if sdk_sha_env:
+            env_vars["SDK_SHA"] = sdk_sha_env
+        env_vars["SDK_SHORT_SHA"] = _get_sdk_short_sha()
+
+    env_vars["EVAL_AGENT_SERVER_IMAGE"] = _get_agent_server_image_repo()
+    env_vars["SWEBENCH_IMAGE_TARGET"] = _get_build_target()
+
+    patched_image = patched_image.env(env_vars)
+
+    run_fn.spec.image = patched_image
+    mod.swebench_image = patched_image
+    _MODAL_SITECUSTOMIZE_INJECTED = True  # set only after a successful injection
+    _log("benchmarks injected modal sitecustomize into run_instance_modal image")
+
+
+def _patch_run_instances_modal_logging() -> None:
+    """Install a run_instances_modal wrapper that retries and persists error logs."""
+    try:
+        # Import inside the function so this file is harmless for non-SWE-Bench runs.
+        from swebench.harness.docker_build import setup_logger
+        from swebench.harness.modal_eval import run_evaluation_modal as mod
+        from swebench.harness.modal_eval.run_evaluation_modal import (
+            TestOutput,
+            get_log_dir,
+        )
+        from swebench.harness.reporting import make_run_report
+        from swebench.harness.test_spec.test_spec import make_test_spec
+    except Exception:
+        # If swebench isn't installed, bail out quietly.
+        return
+
+    def run_instances_modal_with_logging(
+        predictions: dict,
+        instances: list,
+        full_dataset: list,
+        run_id: str,
+        timeout: int,
+    ):
+        """
+        Run pending instances on Modal, persisting logs even for exceptions.
+
+        Exception results get run_instance.log + report.json so scoring sees
+        them; ClientClosed errors are retried (up to 3 attempts) first.
+        """
+        test_specs = list(map(make_test_spec, instances))
+        max_attempts = 3
+        attempt = 0
+        backoff = 5.0
+        try:
+            import modal as modal_pkg
+
+            client_closed_exc = getattr(
+                getattr(modal_pkg, "exception", None), "ClientClosed", None
+            )
+        except Exception:
+            client_closed_exc = None
+
+        def is_client_closed_error(error: Exception) -> bool:
+            if client_closed_exc is not None and isinstance(error, client_closed_exc):
+                return True
+            return "ClientClosed" in str(error)
+
+        while True:
+            run_test_specs = []
+
+            # Skip any instances that already have logs (finished or reported).
+            for test_spec in test_specs:
+                log_dir = get_log_dir(
+                    predictions[test_spec.instance_id],
+                    run_id,
+                    test_spec.instance_id,
+                )
+                if log_dir.exists():
+                    continue
+                run_test_specs.append(test_spec)
+
+            if not run_test_specs:
+                break
+
+            attempt += 1
+            client_closed_specs = []
+            try:
+                with mod.modal.enable_output():
+                    with mod.app.run():
+                        emit = _make_emit(stderr=False)
+                        submit_ids = [spec.instance_id for spec in run_test_specs]
+                        emit(
+                            f"[benchmarks] Modal starmap submit {len(submit_ids)} "
+                            f"instances: {', '.join(submit_ids)}"
+                        )
+                        starmap_start = time.time()
+                        results = mod.run_instance_modal.starmap(
+                            [
+                                (
+                                    test_spec,
+                                    predictions[test_spec.instance_id],
+                                    run_id,
+                                    timeout,
+                                )
+                                for test_spec in run_test_specs
+                            ],
+                            return_exceptions=True,
+                        )
+                        starmap_elapsed = time.time() - starmap_start
+                        emit(
+                            f"[benchmarks] Modal starmap completed in "
+                            f"{starmap_elapsed:.2f}s"
+                        )
+
+                        for test_spec, result in zip(run_test_specs, results):
+                            pred = predictions[test_spec.instance_id]
+                            log_dir = get_log_dir(pred, run_id, test_spec.instance_id)
+                            failed = not isinstance(result, TestOutput)
+                            if failed and is_client_closed_error(result):
+                                # Keep log_dir absent so the retry re-submits it.
+                                client_closed_specs.append((test_spec, result))
+                                continue
+                            log_dir.mkdir(parents=True, exist_ok=True)
+                            if isinstance(result, TestOutput):
+                                # Normal path: write logs exactly as upstream does.
+                                with open(log_dir / "run_instance.log", "w") as f:
+                                    f.write(result.run_instance_log)
+                                with open(log_dir / "test_output.txt", "w") as f:
+                                    f.write(result.test_output)
+                                with open(log_dir / "patch.diff", "w") as f:
+                                    f.write(result.patch_diff)
+                                if result.report_json_str:
+                                    try:
+                                        parsed = json.loads(result.report_json_str)
+                                        (log_dir / "report.json").write_text(
+                                            json.dumps(parsed, indent=4)
+                                        )
+                                    except Exception:
+                                        # Best-effort write if JSON is malformed.
+                                        (log_dir / "report.json").write_text(
+                                            result.report_json_str
+                                        )
+                            else:
+                                # Exception path: persist a minimal log + report so scoring sees it.
+                                log_file = log_dir / "run_instance.log"
+                                logger = setup_logger(
+                                    test_spec.instance_id, log_file, add_stdout=False
+                                )
+                                logger.error(
+                                    "Modal run failed before producing TestOutput: %s",
+                                    result,
+                                )
+                                logger.error(
+                                    "Traceback:\n%s",
+                                    "".join(traceback.format_exception(result)),
+                                )
+
+                                # Save the attempted patch for debugging.
+                                (log_dir / "patch.diff").write_text(
+                                    pred.get("model_patch", "")
+                                )
+
+                                report = {
+                                    test_spec.instance_id: {
+                                        "resolved": False,
+                                        "error": f"Modal error: {result}",
+                                    }
+                                }
+                                (log_dir / "report.json").write_text(
+                                    json.dumps(report, indent=4)
+                                )
+                if client_closed_specs:
+                    if attempt < max_attempts:
+                        time.sleep(backoff)
+                        backoff = min(backoff * 2, 60.0)
+                        continue
+                    for test_spec, result in client_closed_specs:
+                        pred = predictions[test_spec.instance_id]
+                        log_dir = get_log_dir(pred, run_id, test_spec.instance_id)
+                        if log_dir.exists():
+                            continue
+                        log_dir.mkdir(parents=True, exist_ok=True)
+                        log_file = log_dir / "run_instance.log"
+                        logger = setup_logger(
+                            test_spec.instance_id, log_file, add_stdout=False
+                        )
+                        logger.error(
+                            "Modal client closed during image build/sandbox create: %s",
+                            result,
+                        )
+                        (log_dir / "patch.diff").write_text(pred.get("model_patch", ""))
+                        report = {
+                            test_spec.instance_id: {
+                                "resolved": False,
+                                "error": (
+                                    "Modal client closed during image build/sandbox "
+                                    f"create: {result}"
+                                ),
+                            }
+                        }
+                        (log_dir / "report.json").write_text(
+                            json.dumps(report, indent=4)
+                        )
+                    break
+            except Exception as exc:
+                is_client_closed = is_client_closed_error(exc)
+
+                if is_client_closed and attempt < max_attempts:
+                    time.sleep(backoff)
+                    backoff = min(backoff * 2, 60.0)
+                    continue
+
+                if is_client_closed:
+                    for test_spec in run_test_specs:
+                        pred = predictions[test_spec.instance_id]
+                        log_dir = get_log_dir(pred, run_id, test_spec.instance_id)
+                        if log_dir.exists():
+                            continue
+                        log_dir.mkdir(parents=True, exist_ok=True)
+                        log_file = log_dir / "run_instance.log"
+                        logger = setup_logger(
+                            test_spec.instance_id, log_file, add_stdout=False
+                        )
+                        logger.error(
+                            "Modal client closed during image build/sandbox create: %s",
+                            exc,
+                        )
+                        (log_dir / "patch.diff").write_text(pred.get("model_patch", ""))
+                        report = {
+                            test_spec.instance_id: {
+                                "resolved": False,
+                                "error": f"Modal client closed: {exc}",
+                            }
+                        }
+                        (log_dir / "report.json").write_text(
+                            json.dumps(report, indent=4)
+                        )
+                    break
+
+                raise
+
+        # Always build the aggregate report (upstream behavior).
+        make_run_report(predictions, full_dataset, run_id)
+
+    # Apply the monkey patch once per interpreter.
+    mod.run_instances_modal = run_instances_modal_with_logging
+    try:
+        # run_evaluation imports run_instances_modal by value, so update it too.
+        import swebench.harness.run_evaluation as run_eval_mod
+
+        run_eval_mod.run_instances_modal = run_instances_modal_with_logging
+    except Exception:
+        # If run_evaluation isn't available yet, skip—sitecustomize will have
+        # already patched the modal module itself.
+        pass
+    try:
+        # modal_eval re-exports run_instances_modal; update the package export too.
+        import swebench.harness.modal_eval as modal_eval_pkg
+
+        modal_eval_pkg.run_instances_modal = run_instances_modal_with_logging
+    except Exception:
+        # Keep best-effort behavior if the package import fails.
+        pass
+
+
+def apply_host_patches() -> None:  # called by benchmarks/utils/sitecustomize.py
+    _patch_modal_sklearn_install_flag()
+    _patch_modal_sandbox_cgroup_retry()
+    _patch_modal_prebuilt_images()
+    # Inject sitecustomize before re-registering the Modal function so the
+    # patched image (with env + sitecustomize) is baked into the function spec.
+    _inject_modal_sitecustomize()
+    _patch_modal_sandbox_timing(log_errors=True, stderr=True)
+    _patch_modal_runtime_debug(log_errors=True, stderr=True)
+    _patch_modal_function_timeout(log_errors=True)  # re-registers the function
+    _patch_run_instances_modal_logging()
+
+
+def apply_image_patches() -> None:  # called by modal_sitecustomize.py
+    _log("[benchmarks] modal sitecustomize imported")
+    _patch_modal_prebuilt_images(log_errors=True, stderr=True)
+    _patch_modal_sandbox_timing(log_errors=True, stderr=True)
+    _patch_modal_runtime_debug(log_errors=True, stderr=True)
diff --git a/benchmarks/utils/modal_sitecustomize.py b/benchmarks/utils/modal_sitecustomize.py
new file mode 100644
index 00000000..522ceb68
--- /dev/null
+++ b/benchmarks/utils/modal_sitecustomize.py
@@ -0,0 +1,30 @@
+"""
+Sitecustomize injected into the Modal function image for SWE-bench runs.
+
+This file is copied into the Modal function container and imported automatically
+by Python (via sitecustomize) to patch the modal_eval runtime with prebuilt image
+selection plus extra timing/logging hooks.
+"""
+
+from __future__ import annotations
+
+import sys
+
+
+def _apply_modal_image_patch() -> None:
+    """Import modal_patches (package path first, then top-level) and apply it."""
+    try:
+        from benchmarks.utils import modal_patches as patches
+    except Exception:
+        # Fall back to a top-level modal_patches module on sys.path.
+        try:
+            import modal_patches as patches
+        except Exception as exc:
+            msg = f"[benchmarks] modal sitecustomize: failed to import modal_patches: {exc}"
+            print(msg, file=sys.stderr, flush=True)
+            return
+    # Reached only when one of the two imports above succeeded.
+    patches.apply_image_patches()
+
+
+_apply_modal_image_patch()
diff --git a/benchmarks/utils/sitecustomize.py b/benchmarks/utils/sitecustomize.py
index 71e7305b..de4dd483 100644
--- a/benchmarks/utils/sitecustomize.py
+++ b/benchmarks/utils/sitecustomize.py
@@ -4,8 +4,8 @@
 When running SWE-Bench evaluation on Modal, we want to capture exceptions that
 happen before a `report.json` is written (e.g., sandbox creation failures). The
 upstream harness only prints these exceptions, so the scoring step sees missing
-logs and marks the instance as a generic error. This module monkey-patches
-`run_instances_modal` to persist a minimal log/report for any exception result.
+logs and marks the instance as a generic error. This module installs patches to
+persist a minimal log/report for any exception result.
 
 We also patch the scikit-learn install command used inside Modal sandboxes to
 drop the deprecated `--no-use-pep517` flag (removed in pip>=25). That flag
@@ -18,170 +18,11 @@
 
 from __future__ import annotations
 
-import json
-import traceback
-
-
-def _patch_modal_sklearn_install_flag() -> None:
-    """
-    pip>=25 removed `--no-use-pep517`, but the scikit-learn specs still pass it.
-    When Modal builds the sandbox image, pip fails before tests ever run. Mutate
-    the specs in-place to drop that flag for all scikit-learn versions.
-    """
-    try:
-        # The constants module aliases SPECS_SKLEARN into MAP_REPO_VERSION_TO_SPECS,
-        # so mutating the dict is sufficient as long as imports share the object.
-        import swebench.harness.constants as consts
-        import swebench.harness.constants.python as py_consts
-    except Exception:
-        return
-
-    for version, spec in py_consts.SPECS_SKLEARN.items():
-        install_cmd = spec.get("install", "")
-        if "--no-use-pep517" not in install_cmd:
-            continue
-
-        cleaned = " ".join(install_cmd.replace("--no-use-pep517", "").split())
-        py_consts.SPECS_SKLEARN[version]["install"] = cleaned
-
-        repo_specs = consts.MAP_REPO_VERSION_TO_SPECS.get("scikit-learn/scikit-learn")
-        if isinstance(repo_specs, dict):
-            repo_specs[version] = py_consts.SPECS_SKLEARN[version]
-
-    # Best-effort patch; stay silent if nothing needed or imports fail.
-    return
+from benchmarks.utils import modal_patches
 
 
 def _apply_modal_logging_patch() -> None:
-    _patch_modal_sklearn_install_flag()
-
-    try:
-        # Import inside the function so this file is harmless for non-SWE-Bench runs.
-        from swebench.harness.docker_build import setup_logger
-        from swebench.harness.modal_eval import run_evaluation_modal as mod
-        from swebench.harness.modal_eval.run_evaluation_modal import (
-            TestOutput,
-            get_log_dir,
-        )
-        from swebench.harness.reporting import make_run_report
-        from swebench.harness.test_spec.test_spec import make_test_spec
-    except Exception:
-        # If swebench isn't installed, bail out quietly.
-        return
-
-    def run_instances_modal_with_logging(
-        predictions: dict,
-        instances: list,
-        full_dataset: list,
-        run_id: str,
-        timeout: int,
-    ):
-        """
-        Wrap the upstream `run_instances_modal` to persist logs for exceptions.
-
-        If Modal returns an exception (e.g., sandbox creation failure), we now
-        write run_instance.log + report.json so scoring can surface the error.
-        """
-        test_specs = list(map(make_test_spec, instances))
-
-        with mod.modal.enable_output():
-            with mod.app.run():
-                run_test_specs = []
-
-                # Skip any instances that already have logs.
-                for test_spec in test_specs:
-                    log_dir = get_log_dir(
-                        predictions[test_spec.instance_id],
-                        run_id,
-                        test_spec.instance_id,
-                    )
-                    if log_dir.exists():
-                        continue
-                    run_test_specs.append(test_spec)
-
-                if run_test_specs:
-                    results = mod.run_instance_modal.starmap(
-                        [
-                            (
-                                test_spec,
-                                predictions[test_spec.instance_id],
-                                run_id,
-                                timeout,
-                            )
-                            for test_spec in run_test_specs
-                        ],
-                        return_exceptions=True,
-                    )
-
-                    for test_spec, result in zip(run_test_specs, results):
-                        pred = predictions[test_spec.instance_id]
-                        log_dir = get_log_dir(pred, run_id, test_spec.instance_id)
-                        log_dir.mkdir(parents=True, exist_ok=True)
-
-                        if isinstance(result, TestOutput):
-                            # Normal path: write logs exactly as upstream does.
-                            with open(log_dir / "run_instance.log", "w") as f:
-                                f.write(result.run_instance_log)
-                            with open(log_dir / "test_output.txt", "w") as f:
-                                f.write(result.test_output)
-                            with open(log_dir / "patch.diff", "w") as f:
-                                f.write(result.patch_diff)
-                            if result.report_json_str:
-                                try:
-                                    parsed = json.loads(result.report_json_str)
-                                    (log_dir / "report.json").write_text(
-                                        json.dumps(parsed, indent=4)
-                                    )
-                                except Exception:
-                                    # Best-effort write if JSON is malformed.
-                                    (log_dir / "report.json").write_text(
-                                        result.report_json_str
-                                    )
-                        else:
-                            # Exception path: persist a minimal log + report so scoring sees it.
-                            log_file = log_dir / "run_instance.log"
-                            logger = setup_logger(
-                                test_spec.instance_id, log_file, add_stdout=False
-                            )
-                            logger.error(
-                                "Modal run failed before producing TestOutput: %s",
-                                result,
-                            )
-                            logger.error(
-                                "Traceback:\n%s",
-                                "".join(traceback.format_exception(result)),
-                            )
-
-                            # Save the attempted patch for debugging.
-                            (log_dir / "patch.diff").write_text(
-                                pred.get("model_patch", "")
-                            )
-
-                            error_msg = f"Modal error: {result}"
-                            report = {
-                                test_spec.instance_id: {
-                                    "resolved": False,
-                                    "error": error_msg,
-                                }
-                            }
-                            (log_dir / "report.json").write_text(
-                                json.dumps(report, indent=4)
-                            )
-
-        # Always build the aggregate report (upstream behavior).
-        make_run_report(predictions, full_dataset, run_id)
-
-    # Apply the monkey patch once per interpreter.
-    mod.run_instances_modal = run_instances_modal_with_logging
-    try:
-        # run_evaluation imports run_instances_modal by value, so update it too.
-        import swebench.harness.run_evaluation as run_eval_mod
-
-        run_eval_mod.run_instances_modal = run_instances_modal_with_logging
-    except Exception:
-        # If run_evaluation isn't available yet, skip—sitecustomize will have
-        # already patched the modal module itself.
-        pass
+    modal_patches.apply_host_patches()
 
 
 _apply_modal_logging_patch()
diff --git a/sitecustomize.py b/sitecustomize.py
index 987dbb91..50338d1a 100644
--- a/sitecustomize.py
+++ b/sitecustomize.py
@@ -6,6 +6,11 @@
 this file at the repo root guarantees the patch runs before swebench is used.
 """
 
+import sys
+
+
+print("benchmarks sitecustomize imported", file=sys.stderr, flush=True)
+
 try:
     # Reuse the actual patch logic that lives alongside the benchmarks package.
     from benchmarks.utils.sitecustomize import _apply_modal_logging_patch

From b1c500e2fb577cf0fffcabe83d0e5ac918a34597 Mon Sep 17 00:00:00 2001
From: Simon Rosenberg <simonrosen10@gmail.com>
Date: Wed, 7 Jan 2026 12:09:07 +0100
Subject: [PATCH 3/4] Improve swebench build robustness

---
 .github/workflows/build-swebench-images.yml | 18 ++++++++++++++----
 benchmarks/utils/build_utils.py             | 11 ++++++-----
 2 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/build-swebench-images.yml b/.github/workflows/build-swebench-images.yml
index 7c491383..da802683 100644
--- a/.github/workflows/build-swebench-images.yml
+++ b/.github/workflows/build-swebench-images.yml
@@ -188,9 +188,9 @@ jobs:
       - name: "Preflight: prune cache and verify BuildKit disk"
         run: |
           set -euo pipefail
-          KEEP_GB=450
-          echo "Pruning BuildKit cache (keep ${KEEP_GB} GiB, filter unused-for=24h)..."
-          docker buildx prune --all --force --keep-storage ${KEEP_GB}g --filter unused-for=24h || true
+          KEEP_GB=120
+          echo "Pruning BuildKit cache (keep ${KEEP_GB} GiB, filter unused-for=12h)..."
+          docker buildx prune --all --force --keep-storage ${KEEP_GB}g --filter unused-for=12h || true
 
           if df -B1 /var/lib/buildkit > /tmp/buildkit_df 2>/dev/null; then
             LINE=$(tail -n1 /tmp/buildkit_df)
@@ -297,7 +297,17 @@ jobs:
           if [ "$FAILURES" -gt 0 ]; then
             echo "### Failed Builds" >> "$GITHUB_STEP_SUMMARY"
             echo "" >> "$GITHUB_STEP_SUMMARY"
-            cat $MANIFEST_FILES | python -c 'import sys,json; [print(f"- `{d.get(\"base_image\",\"unknown\")}`: {d.get(\"error\",\"No tags generated\")}") for d in map(json.loads, sys.stdin) if d.get("error") or not d.get("tags")]' >> "$GITHUB_STEP_SUMMARY"
+            cat $MANIFEST_FILES | python3 - <<'PY' >> "$GITHUB_STEP_SUMMARY"
+import json
+import sys
+
+for line in sys.stdin:
+    data = json.loads(line.strip())
+    if data.get("error") or not data.get("tags"):
+        base = data.get("base_image", "unknown")
+        err = data.get("error") or "No tags generated"
+        print(f"- `{base}`: {err}")
+PY
           fi
 
           if [ "$FAILURES" -gt 0 ]; then
diff --git a/benchmarks/utils/build_utils.py b/benchmarks/utils/build_utils.py
index 520500fe..62cb7e79 100644
--- a/benchmarks/utils/build_utils.py
+++ b/benchmarks/utils/build_utils.py
@@ -6,6 +6,7 @@
 import argparse
 import contextlib
 import io
+import os
 import subprocess
 import sys
 import time
@@ -443,11 +444,11 @@ def build_all_images(
     failures = 0
     mu = Lock()
 
-    # Fixed batch/prune settings (hardcoded to avoid extra CLI surface)
-    batch_size = 50
-    prune_keep_storage_gb = 450
-    prune_threshold_pct = 85.0
-    prune_filters: list[str] | None = ["unused-for=24h"]
+    # Batch/prune settings (tunable via env to control disk usage on sticky runners)
+    batch_size = int(os.getenv("BUILD_BATCH_SIZE", "25"))
+    prune_keep_storage_gb = int(os.getenv("BUILDKIT_PRUNE_KEEP_GB", "120"))
+    prune_threshold_pct = float(os.getenv("BUILDKIT_PRUNE_THRESHOLD_PCT", "70"))
+    prune_filters: list[str] | None = ["unused-for=12h"]
 
     def _chunks(seq: list[str], size: int):
         if size <= 0:

From 1af454086fe271959469ebc9a2724a7ad2183f8d Mon Sep 17 00:00:00 2001
From: Simon Rosenberg <simonrosen10@gmail.com>
Date: Wed, 7 Jan 2026 12:20:00 +0100
Subject: [PATCH 4/4] Fix swebench summary script indentation

---
 .github/workflows/build-swebench-images.yml | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/build-swebench-images.yml b/.github/workflows/build-swebench-images.yml
index da802683..2816c902 100644
--- a/.github/workflows/build-swebench-images.yml
+++ b/.github/workflows/build-swebench-images.yml
@@ -298,16 +298,16 @@ jobs:
             echo "### Failed Builds" >> "$GITHUB_STEP_SUMMARY"
             echo "" >> "$GITHUB_STEP_SUMMARY"
             cat $MANIFEST_FILES | python3 - <<'PY' >> "$GITHUB_STEP_SUMMARY"
-import json
-import sys
-
-for line in sys.stdin:
-    data = json.loads(line.strip())
-    if data.get("error") or not data.get("tags"):
-        base = data.get("base_image", "unknown")
-        err = data.get("error") or "No tags generated"
-        print(f"- `{base}`: {err}")
-PY
+          import json
+          import sys
+
+          for line in sys.stdin:
+              data = json.loads(line.strip())
+              if data.get("error") or not data.get("tags"):
+                  base = data.get("base_image", "unknown")
+                  err = data.get("error") or "No tags generated"
+                  print(f"- `{base}`: {err}")
+          PY
           fi
 
           if [ "$FAILURES" -gt 0 ]; then