google
diff --git a/‎python/agents/swe-benchmark-agent/README.md‎
Lines changed: 145 additions & 0 deletions b/‎python/agents/swe-benchmark-agent/README.md‎
Lines changed: 145 additions & 0 deletions
diff --git a/‎python/agents/swe-benchmark-agent/pyproject.toml‎
Lines changed: 43 additions & 0 deletions b/‎python/agents/swe-benchmark-agent/pyproject.toml‎
Lines changed: 43 additions & 0 deletions
diff --git a/‎python/agents/swe-benchmark-agent/swe_benchmark_agent/__init__.py‎
Lines changed: 21 additions & 0 deletions b/‎python/agents/swe-benchmark-agent/swe_benchmark_agent/__init__.py‎
Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,145 @@
+# SWE Benchmark Agent
+
+## Overview
+
+This agent is designed to tackle software engineering problems from two prominent benchmarks: SWE-bench and TerminalBench.
+
+## Agent Details
+
+| Feature | Description |
+| --- | --- |
+| **Interaction Type** | Autonomous |
+| **Complexity**  | Advanced |
+| **Agent Type**  | Single Agent |
+| **Components**  | Tools: Shell |
+| **Vertical**  | Software Engineering |
+
+### Agent Architecture
+
+The SWE Benchmark Agent uses a sophisticated orchestrator pattern:
+- **Orchestrator**: Manages the agent lifecycle and coordinates tool execution
+- **Environment**: Docker-based isolated execution environment (SWEBenchEnvironment or TerminalBenchEnvironment)
+- **Tools**: File operations (read, edit, create), shell commands, and submission
+- **Agent**: LLM-powered agent (Gemini) with built-in planner and thinking capabilities
+
+The agent operates autonomously within the Docker environment, using shell commands and file operations to solve software engineering tasks.
+
+## Setup and Installation
+
+1.  **Prerequisites**
+
+    *   Python 3.10–3.13
+    *   uv
+        *   For dependency management and packaging. Please follow the
+            instructions on the official
+            [uv website](https://docs.astral.sh/uv/) for installation.
+
+        ```bash
+        curl -LsSf https://astral.sh/uv/install.sh | sh
+        ```
+
+    *   A project on Google Cloud Platform
+    *   Google Cloud CLI
+        *   For installation, please follow the instructions on the official
+            [Google Cloud website](https://cloud.google.com/sdk/docs/install).
+
+2.  **Installation**
+
+    ```bash
+    # Clone this repository.
+    git clone https://github.com/google/adk-samples.git
+    cd adk-samples/python/agents/swe-benchmark-agent
+    # Install the package and dependencies.
+    uv sync
+    ```
+
+3.  **Configuration**
+
+    *   Set up Google Cloud credentials.
+
+        *   You may set the following environment variables in your shell, or in
+            a `.env` file instead.
+
+        ```bash
+        export GOOGLE_GENAI_USE_VERTEXAI=true
+        export GOOGLE_CLOUD_PROJECT=<your-project-id>
+        export GOOGLE_CLOUD_LOCATION=<your-project-location>
+        ```
+
+
+## Running Tests
+
+For running tests and evaluation, install the extra dependencies:
+
+```bash
+uv sync --dev
+```
+
+Then the tests and evaluation can be run from the `swe-benchmark-agent` directory using
+the `pytest` module:
+
+```bash
+uv run pytest tests
+```
+
+## Running Evaluations
+
+The SWE Benchmark Agent can be evaluated on both SWE-bench and TerminalBench benchmarks to measure its performance on real-world software engineering tasks.
+
+### SWE-bench Evaluation
+
+To run evaluation on the full SWE-bench Verified dataset:
+
+```bash
+uv run python -m swe_benchmark_agent.main --full-dataset --evaluate --max-workers 4
+```
+
+To evaluate on a specific number of instances (e.g., the first 10):
+
+```bash
+uv run python -m swe_benchmark_agent.main --instance-id-or-count 10 --evaluate
+```
+
+To evaluate on a single instance:
+
+```bash
+uv run python -m swe_benchmark_agent.main --instance-id-or-count django__django-12345 --evaluate
+```
+
+### TerminalBench Evaluation
+
+To run evaluation on the full TerminalBench core dataset:
+
+```bash
+uv run python -m swe_benchmark_agent.main --dataset terminalbench --full-dataset --evaluate --max-workers 4
+```
+
+To evaluate on a specific number of tasks (e.g., the first 5):
+
+```bash
+uv run python -m swe_benchmark_agent.main --dataset terminalbench --instance-id-or-count 5 --evaluate
+```
+
+To evaluate on a single task:
+
+```bash
+uv run python -m swe_benchmark_agent.main --dataset terminalbench --instance-id-or-count blind-maze-explorer-5x5 --evaluate
+```
+
+### Evaluation Results
+
+The following table shows the performance of different Gemini models on SWE-bench and TerminalBench:
+
+| Model | SWE-bench Verified | TerminalBench |
+|-------|--------------------|---------------|
+| Gemini 2.5 Flash | 54% | 23.75% |
+| Gemini 2.5 Pro | 65.6% | 30% |
+| Gemini 2.5 Flash Preview (09/25) | 59% | 32.5% |
+
+## Customization
+
+The SWE Benchmark Agent can be customized to better suit your requirements. For example:
+
+ 1. **Use a different model:** You can change the model used by the agent by modifying the `main.py` file.
+ 2. **Add more tools:** You can add more tools to the agent to give it more capabilities.
+ 3. **Support more benchmarks:** You can add support for more benchmarks by creating a new environment and updating the `main.py` file.
@@ -0,0 +1,43 @@
+[project]
+name = "swe-benchmark-agent"
+version = "0.1.0"
+description = "A software engineering agent for SWE-bench and Terminal-Bench benchmarks using Google ADK"
+# NOTE(review): the email value below looks like obfuscation residue rather
+# than a real address; confirm the intended contact before publishing.
+authors = [{ name = "Utsav Garg", email = "[email protected]" }]
+license = "Apache-2.0"
+readme = "README.md"
+requires-python = ">=3.10,<3.14"
+classifiers = [
+  "Development Status :: 3 - Alpha",
+  "Intended Audience :: Developers",
+  # One classifier per minor version allowed by `requires-python`.
+  "Programming Language :: Python :: 3.10",
+  "Programming Language :: Python :: 3.11",
+  "Programming Language :: Python :: 3.12",
+  "Programming Language :: Python :: 3.13",
+  "Programming Language :: Python :: 3 :: Only",
+]
+
+dependencies = [
+    # NOTE(review): the repository/ref portion of this URL contains
+    # "[email protected]", which looks like e-mail obfuscation residue rather
+    # than a real git ref; confirm the intended `<repo>.git@<ref>` before use.
+    "swebench @ git+https://github.com/swe-bench/[email protected]#egg=swebench",
+    "typer>=0.19.2",
+    "datasets>=4.2.0",
+    "jinja2>=3.1.5",
+    "GitPython>=3.1.45",
+    "docker>=7.1.0",
+    # Compatible-release pin: accepts 1.10.x releases only.
+    "google-adk~=1.10.0",
+    "pyyaml>=6.0.2",
+    "python-dotenv>=1.0.1",
+]
+
+[dependency-groups]
+dev = [
+    "pytest>=8.4.2",
+    # 0.24.0 is the first release that recognizes the
+    # `asyncio_default_fixture_loop_scope` option set under
+    # [tool.pytest.ini_options]; older versions report it as unknown.
+    "pytest-asyncio>=0.24.0",
+]
+
+[build-system]
+# Build with the uv_build backend; the version range accepts 0.8.14 and later
+# 0.8.x releases and excludes 0.9.0 onward.
+requires = ["uv_build>=0.8.14,<0.9.0"]
+build-backend = "uv_build"
+
+[tool.uv.build-backend]
+# Empty module root: the package directory sits next to this file rather than
+# under the backend's default `src/` directory.
+module-root = ""
+
+[tool.pytest.ini_options]
+# Add the project root to sys.path for the test run.
+pythonpath = "."
+# Default event-loop scope used by pytest-asyncio for async fixtures.
+asyncio_default_fixture_loop_scope = "function"
@@ -0,0 +1,21 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""SWE Benchmark Agent - Software Engineering Agent for benchmark evaluation.
+
+This package provides a sophisticated agent for solving software engineering
+tasks from SWE-bench and Terminal-Bench benchmarks using Google ADK.
+"""
+
+# Package version; same value as `version` in pyproject.toml.
+__version__ = "0.1.0"