Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 58 additions & 0 deletions benchmarks/utils/output_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
"""Utilities for processing and enriching output files."""

from __future__ import annotations

import json
from pathlib import Path


def add_resolve_rate_to_predictions(
    predictions_path: str | Path,
    report_path: str | Path,
) -> None:
    """
    Add resolution status from a report to each prediction in a JSONL file.

    For each prediction in the predictions file, if the instance_id is found in
    the report's resolved_ids or unresolved_ids, a "report" field is added with
    {"resolved": true/false}. Predictions not found in either list are left unchanged.

    The predictions file is rewritten in place; blank lines are dropped.

    Args:
        predictions_path: Path to the predictions JSONL file. Each line should be
            a JSON object with an "instance_id" field.
        report_path: Path to the report JSON file containing "resolved_ids" and
            "unresolved_ids" lists.
    """
    predictions_path = Path(predictions_path)
    report_path = Path(report_path)

    # Load the report; missing keys are treated as empty lists so a partial
    # report never raises. Sets give O(1) membership tests per prediction.
    report = json.loads(report_path.read_text(encoding="utf-8"))
    resolved_ids = set(report.get("resolved_ids", []))
    unresolved_ids = set(report.get("unresolved_ids", []))

    # Single pass: parse each non-blank line and annotate it immediately
    # (no need for a separate "updated" list — annotation is in place).
    predictions = []
    with open(predictions_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            prediction = json.loads(line)
            instance_id = prediction.get("instance_id")
            if instance_id in resolved_ids:
                prediction["report"] = {"resolved": True}
            elif instance_id in unresolved_ids:
                prediction["report"] = {"resolved": False}
            # If not in either list, leave unchanged (no report field added).
            predictions.append(prediction)

    # Write back to the same file, one JSON object per line.
    with open(predictions_path, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(p) + "\n" for p in predictions)
244 changes: 244 additions & 0 deletions tests/test_output_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,244 @@
"""Tests for output_utils module."""

import json
import tempfile
from pathlib import Path

import pytest

from benchmarks.utils.output_utils import add_resolve_rate_to_predictions


@pytest.fixture
def sample_predictions():
    """Sample predictions data."""
    rows = [
        ("fasterxml/jackson-databind:pr-4469", "patch1"),
        ("elastic/logstash:pr-15241", "patch2"),
        ("fasterxml/jackson-databind:pr-1234", "patch3"),
        ("fasterxml/jackson-databind:pr-2036", "patch4"),
    ]
    return [
        {"instance_id": instance_id, "model_patch": patch}
        for instance_id, patch in rows
    ]


@pytest.fixture
def sample_report():
    """Sample report data."""
    resolved = [
        "fasterxml/jackson-databind:pr-2036",
        "fasterxml/jackson-core:pr-1016",
        "fasterxml/jackson-databind:pr-4228",
    ]
    unresolved = [
        "fasterxml/jackson-core:pr-964",
        "fasterxml/jackson-databind:pr-4469",
        "elastic/logstash:pr-13997",
        "elastic/logstash:pr-16079",
        "elastic/logstash:pr-15241",
        "elastic/logstash:pr-14970",
    ]
    return {
        "total_instances": 9,
        "submitted_instances": 9,
        "completed_instances": 9,
        "resolved_instances": 3,
        "unresolved_instances": 6,
        "resolved_ids": resolved,
        "unresolved_ids": unresolved,
    }


def test_add_resolve_rate_to_predictions(sample_predictions, sample_report):
    """Test that resolution status is correctly added to predictions."""
    with tempfile.TemporaryDirectory() as tmpdir:
        predictions_path = Path(tmpdir) / "predictions.jsonl"
        report_path = Path(tmpdir) / "report.json"

        # Seed the input files.
        predictions_path.write_text(
            "".join(json.dumps(pred) + "\n" for pred in sample_predictions)
        )
        report_path.write_text(json.dumps(sample_report))

        # Run the function
        add_resolve_rate_to_predictions(predictions_path, report_path)

        # Index the rewritten predictions by instance_id for direct lookup.
        by_id = {}
        with open(predictions_path, "r") as f:
            for line in f:
                record = json.loads(line)
                by_id[record["instance_id"]] = record

        # Verify results
        assert len(by_id) == 4

        # pr-4469 should be unresolved
        assert by_id["fasterxml/jackson-databind:pr-4469"]["report"] == {
            "resolved": False
        }

        # pr-15241 should be unresolved
        assert by_id["elastic/logstash:pr-15241"]["report"] == {"resolved": False}

        # pr-1234 should have no report (not in either list)
        assert "report" not in by_id["fasterxml/jackson-databind:pr-1234"]

        # pr-2036 should be resolved
        assert by_id["fasterxml/jackson-databind:pr-2036"]["report"] == {
            "resolved": True
        }


def test_add_resolve_rate_preserves_other_fields(sample_report):
    """Test that other fields in predictions are preserved."""
    with tempfile.TemporaryDirectory() as tmpdir:
        predictions_path = Path(tmpdir) / "predictions.jsonl"
        report_path = Path(tmpdir) / "report.json"

        # A prediction carrying extra (and nested) payload beyond instance_id.
        original = {
            "instance_id": "fasterxml/jackson-databind:pr-2036",
            "model_patch": "some patch",
            "extra_field": "extra_value",
            "nested": {"key": "value"},
        }
        predictions_path.write_text(json.dumps(original) + "\n")
        report_path.write_text(json.dumps(sample_report))

        # Run the function
        add_resolve_rate_to_predictions(predictions_path, report_path)

        # Read the first (only) rewritten line back.
        with open(predictions_path, "r") as f:
            updated = json.loads(f.readline())

        # Every original field survives and the report field is attached.
        assert updated["instance_id"] == "fasterxml/jackson-databind:pr-2036"
        assert updated["model_patch"] == "some patch"
        assert updated["extra_field"] == "extra_value"
        assert updated["nested"] == {"key": "value"}
        assert updated["report"] == {"resolved": True}


def test_add_resolve_rate_empty_predictions(sample_report):
    """Test handling of empty predictions file."""
    with tempfile.TemporaryDirectory() as tmpdir:
        predictions_path = Path(tmpdir) / "predictions.jsonl"
        report_path = Path(tmpdir) / "report.json"

        # Empty predictions file alongside a normal report.
        predictions_path.write_text("")
        report_path.write_text(json.dumps(sample_report))

        # Run the function
        add_resolve_rate_to_predictions(predictions_path, report_path)

        # The file should remain empty after processing.
        assert predictions_path.read_text() == ""


def test_add_resolve_rate_empty_report_lists():
    """Test handling of report with empty resolved/unresolved lists."""
    with tempfile.TemporaryDirectory() as tmpdir:
        predictions_path = Path(tmpdir) / "predictions.jsonl"
        report_path = Path(tmpdir) / "report.json"

        # One prediction, and a report whose id lists are both empty.
        predictions_path.write_text(
            json.dumps({"instance_id": "test-instance", "data": "test"}) + "\n"
        )
        report_path.write_text(json.dumps({"resolved_ids": [], "unresolved_ids": []}))

        # Run the function
        add_resolve_rate_to_predictions(predictions_path, report_path)

        # Read the rewritten prediction back.
        with open(predictions_path, "r") as f:
            updated = json.loads(f.readline())

        # No report should be added since instance is not in either list
        assert "report" not in updated


def test_add_resolve_rate_with_string_paths(sample_predictions, sample_report):
    """Test that function works with string paths."""
    with tempfile.TemporaryDirectory() as tmpdir:
        # Deliberately pass plain strings rather than Path objects.
        predictions_path = str(Path(tmpdir) / "predictions.jsonl")
        report_path = str(Path(tmpdir) / "report.json")

        # Write predictions
        with open(predictions_path, "w") as f:
            f.writelines(json.dumps(pred) + "\n" for pred in sample_predictions)

        # Write report
        with open(report_path, "w") as f:
            f.write(json.dumps(sample_report))

        # Run the function with string paths
        add_resolve_rate_to_predictions(predictions_path, report_path)

        # Read the first rewritten prediction back.
        with open(predictions_path, "r") as f:
            first = json.loads(f.readline())

        # The first fixture entry is in unresolved_ids, so it gets a report.
        assert first["report"] == {"resolved": False}


def test_add_resolve_rate_missing_keys_in_report():
    """Test handling of report missing resolved_ids or unresolved_ids keys."""
    with tempfile.TemporaryDirectory() as tmpdir:
        predictions_path = Path(tmpdir) / "predictions.jsonl"
        report_path = Path(tmpdir) / "report.json"

        # One prediction, and a report with neither id list present.
        predictions_path.write_text(
            json.dumps({"instance_id": "test-instance", "data": "test"}) + "\n"
        )
        report_path.write_text(json.dumps({"total_instances": 1}))

        # Run the function - should not raise
        add_resolve_rate_to_predictions(predictions_path, report_path)

        # Read the rewritten prediction back.
        with open(predictions_path, "r") as f:
            updated = json.loads(f.readline())

        # No report should be added
        assert "report" not in updated