From 6754fbbfea66bcc4dd171bb719d7b939200925cd Mon Sep 17 00:00:00 2001 From: Calvin Smith Date: Mon, 5 Jan 2026 16:19:39 -0700 Subject: [PATCH 1/8] initial --- tests/integration/README.md | 60 ++++++++++++++++++++++++++++++++ tests/integration/base.py | 63 ++++++++++++++++++++++++++++++---- tests/integration/run_infer.py | 44 ++++++++++++++++++++++++ 3 files changed, 161 insertions(+), 6 deletions(-) diff --git a/tests/integration/README.md b/tests/integration/README.md index b46722aaa5..7f2015b535 100644 --- a/tests/integration/README.md +++ b/tests/integration/README.md @@ -50,8 +50,32 @@ uv run python tests/integration/run_infer.py --llm-config '{"model": "litellm_pr # Run a specific test uv run python tests/integration/run_infer.py --llm-config '{"model": "litellm_proxy/anthropic/claude-sonnet-4-5-20250929"}' --eval-ids t01_fix_simple_typo + +# Run with LLM message recording (for debugging) +LLM_MESSAGES_DIR=./llm_messages uv run python tests/integration/run_infer.py --llm-config '{"model": "litellm_proxy/anthropic/claude-sonnet-4-5-20250929"}' +``` + +### Recording LLM Completions + +For debugging and analysis, you can record all LLM request/response messages by setting the `LLM_MESSAGES_DIR` environment variable: + +```bash +# Record LLM messages to a specific directory +export LLM_MESSAGES_DIR=/path/to/store/messages +uv run python tests/integration/run_infer.py --llm-config '{"model": "..."}' ``` +When enabled, this feature: +- Saves all LLM messages from each test to JSON files (`{test_id}_llm_messages.json`) +- Stores files in the specified directory +- Copies them to the test output directory alongside logs +- Does not record anything if the environment variable is not set + +This is useful for: +- Debugging test failures by inspecting exact LLM inputs/outputs +- Analyzing agent behavior across multiple conversation turns +- Creating datasets for evaluation or training + ## Automated Testing with GitHub Actions The integration tests are automatically executed via GitHub Actions using the workflow defined in `.github/workflows/integration-runner.yml`. @@ -79,6 +103,7 @@ These tests must pass for releases and verify that the agent can successfully co - **t07_interactive_commands** - Tests interactive command handling - **t08_image_file_viewing** - Tests image file viewing capabilities - **t09_token_condenser** - Tests that token-based condensation works correctly by verifying `get_token_count()` triggers condensation when token limits are exceeded +- **t10_hard_context_reset** - Tests that the agent can continue working correctly after context condensation. Uses `run_conversation()` override for multi-phase testing ### Behavior Tests (`b*.py`) - **Optional** @@ -106,3 +131,38 @@ All integration tests inherit from `BaseIntegrationTest` in `base.py`. 
The base - **`max_iteration_per_run`** (property) - Maximum iterations per conversation (default: `100`) - Override to limit LLM calls for faster tests - Useful for tests that should complete quickly + +### Optional Methods + +- **`run_conversation()`** - Execute the conversation with the agent (new in v1.7.5) + - Override this method to customize conversation flow for multi-step tests + - Default implementation sends a single instruction and runs to completion + - Provides access to `self.conversation` for direct manipulation + - Example use cases: + - Send multiple messages in sequence + - Verify intermediate state between conversation phases + - Test complex multi-turn interactions + - Trigger condensation at specific points + +#### Example: Multi-Step Test + +```python +class MultiStepTest(BaseIntegrationTest): + def run_conversation(self) -> None: + # Step 1: Initial task + self.conversation.send_message( + Message(role="user", content=[TextContent(text="First, create a file")]) + ) + self.conversation.run() + + # Intermediate verification + assert os.path.exists(os.path.join(self.workspace, "file.txt")) + + # Step 2: Follow-up task + self.conversation.send_message( + Message(role="user", content=[TextContent(text="Now modify the file")]) + ) + self.conversation.run() +``` + +See `t10_hard_context_reset.py` for a real example of overriding `run_conversation()`. diff --git a/tests/integration/base.py b/tests/integration/base.py index c633d367af..7ce4ee5c3b 100644 --- a/tests/integration/base.py +++ b/tests/integration/base.py @@ -2,6 +2,7 @@ Base classes for agent-sdk integration tests. """ +import json import os import sys from abc import ABC, abstractmethod @@ -102,6 +103,11 @@ def __init__( self.workspace, f"{self.instance_id}_agent_logs.txt" ) + # Create LLM messages file path for this test instance + self.llm_messages_file_path: str = os.path.join( + self.workspace, f"{self.instance_id}_llm_messages.json" + ) + # Early stopping support - must be initialized BEFORE LocalConversation # since the callback may access these attributes self.early_stopper: EarlyStopperBase | None = None @@ -128,6 +134,40 @@ def conversation_callback(self, event: Event): self.early_stop_result = result self.conversation.pause() # Trigger graceful stop + def run_conversation(self) -> None: + """ + Execute the conversation with the agent. + + Override this method to customize the conversation flow (e.g., multiple steps, + intermediate verification, or custom message sequences). The default implementation + sends a single instruction and runs the conversation to completion. 
+ + You have access to: + - self.conversation: LocalConversation instance to send messages and control flow + - self.instruction: The instruction string for the test + - self.collected_events: Events collected so far (via callback) + - self.llm_messages: LLM messages collected so far (via callback) + + Example override for multi-step test: + def run_conversation(self) -> None: + # Step 1 + self.conversation.send_message(Message(role="user", content=[TextContent(text="First task")])) + self.conversation.run() + + # Intermediate verification + assert some_condition() + + # Step 2 + self.conversation.send_message(Message(role="user", content=[TextContent(text="Second task")])) + self.conversation.run() + """ + self.conversation.send_message( + message=Message( + role="user", content=[TextContent(text=self.instruction)] + ) + ) + self.conversation.run() + def run_instruction(self) -> TestResult: """ Run user instruction through the agent and verify results. @@ -149,12 +189,7 @@ def run_instruction(self) -> TestResult: stderr_buffer = StringIO() with redirect_stdout(stdout_buffer), redirect_stderr(stderr_buffer): - self.conversation.send_message( - message=Message( - role="user", content=[TextContent(text=self.instruction)] - ) - ) - self.conversation.run() + self.run_conversation() # Save captured output to log file captured_output = stdout_buffer.getvalue() @@ -176,6 +211,22 @@ def run_instruction(self) -> TestResult: if captured_errors: print(captured_errors, file=sys.stderr, end="") + # Save LLM messages if LLM_MESSAGES_DIR is set + llm_messages_dir = os.getenv("LLM_MESSAGES_DIR") + if llm_messages_dir: + # Create directory if it doesn't exist + os.makedirs(llm_messages_dir, exist_ok=True) + + # Save LLM messages to JSON file in the specified directory + llm_messages_path = os.path.join( + llm_messages_dir, f"{self.instance_id}_llm_messages.json" + ) + with open(llm_messages_path, "w") as f: + json.dump(self.llm_messages, f, indent=2) + + # Update the file path so it can be accessed by run_infer.py + self.llm_messages_file_path = llm_messages_path + # Check if early stopped - skip full verification if self.early_stop_result: return TestResult( diff --git a/tests/integration/run_infer.py b/tests/integration/run_infer.py index 8deb07568c..5587c3331c 100755 --- a/tests/integration/run_infer.py +++ b/tests/integration/run_infer.py @@ -52,6 +52,7 @@ class EvalOutput(BaseModel): token_usage: TokenUsageData | None = None error_message: str | None = None log_file_path: str | None = None + llm_messages_file_path: str | None = None @property def required(self) -> bool: @@ -216,6 +217,35 @@ def process_instance(instance: TestInstance, llm_config: dict[str, Any]) -> Eval permanent_log_path, ) + # Copy LLM messages file to a location that will be preserved (if it exists) + llm_messages_file_path = None + if hasattr(test_instance, "llm_messages_file_path") and os.path.exists( + test_instance.llm_messages_file_path + ): + # Create a permanent logs directory if not already created + permanent_logs_dir = os.path.join(os.getcwd(), "integration_test_logs") + os.makedirs(permanent_logs_dir, exist_ok=True) + + # Create a unique filename to avoid conflicts + permanent_llm_messages_filename = ( + f"{instance.instance_id}_llm_messages.json" + ) + permanent_llm_messages_path = os.path.join( + permanent_logs_dir, permanent_llm_messages_filename + ) + + # Copy the LLM messages file + shutil.copy2( + test_instance.llm_messages_file_path, permanent_llm_messages_path + ) + llm_messages_file_path = 
permanent_llm_messages_path + + logger.info( + "Preserved LLM messages file for %s at %s", + instance.instance_id, + permanent_llm_messages_path, + ) + return EvalOutput( instance_id=instance.instance_id, test_result=test_result, @@ -224,6 +254,7 @@ def process_instance(instance: TestInstance, llm_config: dict[str, Any]) -> Eval cost=llm_cost, token_usage=eval_token_usage, log_file_path=log_file_path, + llm_messages_file_path=llm_messages_file_path, ) except SkipTest as e: @@ -342,6 +373,19 @@ def generate_structured_results( eval_output.log_file_path, ) + # Also copy LLM messages file if it exists + if eval_output.llm_messages_file_path and os.path.exists( + eval_output.llm_messages_file_path + ): + llm_messages_filename = f"{eval_output.instance_id}_llm_messages.json" + dest_path = os.path.join(logs_dir, llm_messages_filename) + shutil.copy2(eval_output.llm_messages_file_path, dest_path) + logger.info( + "Copied LLM messages file for %s to %s", + eval_output.instance_id, + dest_path, + ) + # Print summary for console output success_rate = structured_results.success_rate successful = structured_results.successful_tests From 06563293982f2ec3b4e12db64ef4f05b57a71921 Mon Sep 17 00:00:00 2001 From: Calvin Smith Date: Mon, 5 Jan 2026 16:23:57 -0700 Subject: [PATCH 2/8] renaming local vars and removing ref to unused test --- tests/integration/README.md | 17 +++++++---------- tests/integration/base.py | 23 +++++++++++++---------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/tests/integration/README.md b/tests/integration/README.md index 7f2015b535..27ae0a2e80 100644 --- a/tests/integration/README.md +++ b/tests/integration/README.md @@ -103,7 +103,6 @@ These tests must pass for releases and verify that the agent can successfully co - **t07_interactive_commands** - Tests interactive command handling - **t08_image_file_viewing** - Tests image file viewing capabilities - **t09_token_condenser** - Tests that token-based condensation works correctly by verifying `get_token_count()` triggers condensation when token limits are exceeded -- **t10_hard_context_reset** - Tests that the agent can continue working correctly after context condensation. Uses `run_conversation()` override for multi-phase testing ### Behavior Tests (`b*.py`) - **Optional** @@ -134,10 +133,10 @@ All integration tests inherit from `BaseIntegrationTest` in `base.py`. The base ### Optional Methods -- **`run_conversation()`** - Execute the conversation with the agent (new in v1.7.5) +- **`run_conversation(conversation)`** - Execute the conversation with the agent (new in v1.7.5) - Override this method to customize conversation flow for multi-step tests - Default implementation sends a single instruction and runs to completion - - Provides access to `self.conversation` for direct manipulation + - Receives the conversation object as a parameter for cleaner API - Example use cases: - Send multiple messages in sequence - Verify intermediate state between conversation phases @@ -148,21 +147,19 @@ All integration tests inherit from `BaseIntegrationTest` in `base.py`. 
The base ```python class MultiStepTest(BaseIntegrationTest): - def run_conversation(self) -> None: + def run_conversation(self, conversation: LocalConversation) -> None: # Step 1: Initial task - self.conversation.send_message( + conversation.send_message( Message(role="user", content=[TextContent(text="First, create a file")]) ) - self.conversation.run() + conversation.run() # Intermediate verification assert os.path.exists(os.path.join(self.workspace, "file.txt")) # Step 2: Follow-up task - self.conversation.send_message( + conversation.send_message( Message(role="user", content=[TextContent(text="Now modify the file")]) ) - self.conversation.run() + conversation.run() ``` - -See `t10_hard_context_reset.py` for a real example of overriding `run_conversation()`. diff --git a/tests/integration/base.py b/tests/integration/base.py index 7ce4ee5c3b..13b19ca7ee 100644 --- a/tests/integration/base.py +++ b/tests/integration/base.py @@ -134,7 +134,7 @@ def conversation_callback(self, event: Event): self.early_stop_result = result self.conversation.pause() # Trigger graceful stop - def run_conversation(self) -> None: + def run_conversation(self, conversation: LocalConversation) -> None: """ Execute the conversation with the agent. @@ -142,31 +142,34 @@ def run_conversation(self) -> None: intermediate verification, or custom message sequences). The default implementation sends a single instruction and runs the conversation to completion. + Args: + conversation: The LocalConversation instance to send messages and control flow + You have access to: - - self.conversation: LocalConversation instance to send messages and control flow + - conversation: LocalConversation parameter to send messages and control flow - self.instruction: The instruction string for the test - self.collected_events: Events collected so far (via callback) - self.llm_messages: LLM messages collected so far (via callback) Example override for multi-step test: - def run_conversation(self) -> None: + def run_conversation(self, conversation: LocalConversation) -> None: # Step 1 - self.conversation.send_message(Message(role="user", content=[TextContent(text="First task")])) - self.conversation.run() + conversation.send_message(Message(role="user", content=[TextContent(text="First task")])) + conversation.run() # Intermediate verification assert some_condition() # Step 2 - self.conversation.send_message(Message(role="user", content=[TextContent(text="Second task")])) - self.conversation.run() + conversation.send_message(Message(role="user", content=[TextContent(text="Second task")])) + conversation.run() """ - self.conversation.send_message( + conversation.send_message( message=Message( role="user", content=[TextContent(text=self.instruction)] ) ) - self.conversation.run() + conversation.run() def run_instruction(self) -> TestResult: """ @@ -189,7 +192,7 @@ def run_instruction(self) -> TestResult: stderr_buffer = StringIO() with redirect_stdout(stdout_buffer), redirect_stderr(stderr_buffer): - self.run_conversation() + self.run_conversation(self.conversation) # Save captured output to log file captured_output = stdout_buffer.getvalue() From bbd17a1d0515abb4960ff02dea97e3c28dafdc65 Mon Sep 17 00:00:00 2001 From: Calvin Smith Date: Tue, 6 Jan 2026 08:02:06 -0700 Subject: [PATCH 3/8] Revert "renaming local vars and removing ref to unused test" This reverts commit 06563293982f2ec3b4e12db64ef4f05b57a71921. 
--- tests/integration/README.md | 17 ++++++++++------- tests/integration/base.py | 23 ++++++++++------------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/tests/integration/README.md b/tests/integration/README.md index 27ae0a2e80..7f2015b535 100644 --- a/tests/integration/README.md +++ b/tests/integration/README.md @@ -103,6 +103,7 @@ These tests must pass for releases and verify that the agent can successfully co - **t07_interactive_commands** - Tests interactive command handling - **t08_image_file_viewing** - Tests image file viewing capabilities - **t09_token_condenser** - Tests that token-based condensation works correctly by verifying `get_token_count()` triggers condensation when token limits are exceeded +- **t10_hard_context_reset** - Tests that the agent can continue working correctly after context condensation. Uses `run_conversation()` override for multi-phase testing ### Behavior Tests (`b*.py`) - **Optional** @@ -133,10 +134,10 @@ All integration tests inherit from `BaseIntegrationTest` in `base.py`. The base ### Optional Methods -- **`run_conversation(conversation)`** - Execute the conversation with the agent (new in v1.7.5) +- **`run_conversation()`** - Execute the conversation with the agent (new in v1.7.5) - Override this method to customize conversation flow for multi-step tests - Default implementation sends a single instruction and runs to completion - - Receives the conversation object as a parameter for cleaner API + - Provides access to `self.conversation` for direct manipulation - Example use cases: - Send multiple messages in sequence - Verify intermediate state between conversation phases @@ -147,19 +148,21 @@ All integration tests inherit from `BaseIntegrationTest` in `base.py`. The base ```python class MultiStepTest(BaseIntegrationTest): - def run_conversation(self, conversation: LocalConversation) -> None: + def run_conversation(self) -> None: # Step 1: Initial task - conversation.send_message( + self.conversation.send_message( Message(role="user", content=[TextContent(text="First, create a file")]) ) - conversation.run() + self.conversation.run() # Intermediate verification assert os.path.exists(os.path.join(self.workspace, "file.txt")) # Step 2: Follow-up task - conversation.send_message( + self.conversation.send_message( Message(role="user", content=[TextContent(text="Now modify the file")]) ) - conversation.run() + self.conversation.run() ``` + +See `t10_hard_context_reset.py` for a real example of overriding `run_conversation()`. diff --git a/tests/integration/base.py b/tests/integration/base.py index 13b19ca7ee..7ce4ee5c3b 100644 --- a/tests/integration/base.py +++ b/tests/integration/base.py @@ -134,7 +134,7 @@ def conversation_callback(self, event: Event): self.early_stop_result = result self.conversation.pause() # Trigger graceful stop - def run_conversation(self, conversation: LocalConversation) -> None: + def run_conversation(self) -> None: """ Execute the conversation with the agent. @@ -142,34 +142,31 @@ def run_conversation(self, conversation: LocalConversation) -> None: intermediate verification, or custom message sequences). The default implementation sends a single instruction and runs the conversation to completion. 
- Args: - conversation: The LocalConversation instance to send messages and control flow - You have access to: - - conversation: LocalConversation parameter to send messages and control flow + - self.conversation: LocalConversation instance to send messages and control flow - self.instruction: The instruction string for the test - self.collected_events: Events collected so far (via callback) - self.llm_messages: LLM messages collected so far (via callback) Example override for multi-step test: - def run_conversation(self, conversation: LocalConversation) -> None: + def run_conversation(self) -> None: # Step 1 - conversation.send_message(Message(role="user", content=[TextContent(text="First task")])) - conversation.run() + self.conversation.send_message(Message(role="user", content=[TextContent(text="First task")])) + self.conversation.run() # Intermediate verification assert some_condition() # Step 2 - conversation.send_message(Message(role="user", content=[TextContent(text="Second task")])) - conversation.run() + self.conversation.send_message(Message(role="user", content=[TextContent(text="Second task")])) + self.conversation.run() """ - conversation.send_message( + self.conversation.send_message( message=Message( role="user", content=[TextContent(text=self.instruction)] ) ) - conversation.run() + self.conversation.run() def run_instruction(self) -> TestResult: """ @@ -192,7 +189,7 @@ def run_instruction(self) -> TestResult: stderr_buffer = StringIO() with redirect_stdout(stdout_buffer), redirect_stderr(stderr_buffer): - self.run_conversation(self.conversation) + self.run_conversation() # Save captured output to log file captured_output = stdout_buffer.getvalue() From f79e6645e70f750c89c52421ed0da0a04854c0e0 Mon Sep 17 00:00:00 2001 From: Calvin Smith Date: Tue, 6 Jan 2026 08:02:08 -0700 Subject: [PATCH 4/8] Revert "initial" This reverts commit 6754fbbfea66bcc4dd171bb719d7b939200925cd. 
--- tests/integration/README.md | 60 -------------------------------- tests/integration/base.py | 63 ++++------------------------------ tests/integration/run_infer.py | 44 ------------------------ 3 files changed, 6 insertions(+), 161 deletions(-) diff --git a/tests/integration/README.md b/tests/integration/README.md index 7f2015b535..b46722aaa5 100644 --- a/tests/integration/README.md +++ b/tests/integration/README.md @@ -50,32 +50,8 @@ uv run python tests/integration/run_infer.py --llm-config '{"model": "litellm_pr # Run a specific test uv run python tests/integration/run_infer.py --llm-config '{"model": "litellm_proxy/anthropic/claude-sonnet-4-5-20250929"}' --eval-ids t01_fix_simple_typo - -# Run with LLM message recording (for debugging) -LLM_MESSAGES_DIR=./llm_messages uv run python tests/integration/run_infer.py --llm-config '{"model": "litellm_proxy/anthropic/claude-sonnet-4-5-20250929"}' -``` - -### Recording LLM Completions - -For debugging and analysis, you can record all LLM request/response messages by setting the `LLM_MESSAGES_DIR` environment variable: - -```bash -# Record LLM messages to a specific directory -export LLM_MESSAGES_DIR=/path/to/store/messages -uv run python tests/integration/run_infer.py --llm-config '{"model": "..."}' ``` -When enabled, this feature: -- Saves all LLM messages from each test to JSON files (`{test_id}_llm_messages.json`) -- Stores files in the specified directory -- Copies them to the test output directory alongside logs -- Does not record anything if the environment variable is not set - -This is useful for: -- Debugging test failures by inspecting exact LLM inputs/outputs -- Analyzing agent behavior across multiple conversation turns -- Creating datasets for evaluation or training - ## Automated Testing with GitHub Actions The integration tests are automatically executed via GitHub Actions using the workflow defined in `.github/workflows/integration-runner.yml`. @@ -103,7 +79,6 @@ These tests must pass for releases and verify that the agent can successfully co - **t07_interactive_commands** - Tests interactive command handling - **t08_image_file_viewing** - Tests image file viewing capabilities - **t09_token_condenser** - Tests that token-based condensation works correctly by verifying `get_token_count()` triggers condensation when token limits are exceeded -- **t10_hard_context_reset** - Tests that the agent can continue working correctly after context condensation. Uses `run_conversation()` override for multi-phase testing ### Behavior Tests (`b*.py`) - **Optional** @@ -131,38 +106,3 @@ All integration tests inherit from `BaseIntegrationTest` in `base.py`. 
The base - **`max_iteration_per_run`** (property) - Maximum iterations per conversation (default: `100`) - Override to limit LLM calls for faster tests - Useful for tests that should complete quickly - -### Optional Methods - -- **`run_conversation()`** - Execute the conversation with the agent (new in v1.7.5) - - Override this method to customize conversation flow for multi-step tests - - Default implementation sends a single instruction and runs to completion - - Provides access to `self.conversation` for direct manipulation - - Example use cases: - - Send multiple messages in sequence - - Verify intermediate state between conversation phases - - Test complex multi-turn interactions - - Trigger condensation at specific points - -#### Example: Multi-Step Test - -```python -class MultiStepTest(BaseIntegrationTest): - def run_conversation(self) -> None: - # Step 1: Initial task - self.conversation.send_message( - Message(role="user", content=[TextContent(text="First, create a file")]) - ) - self.conversation.run() - - # Intermediate verification - assert os.path.exists(os.path.join(self.workspace, "file.txt")) - - # Step 2: Follow-up task - self.conversation.send_message( - Message(role="user", content=[TextContent(text="Now modify the file")]) - ) - self.conversation.run() -``` - -See `t10_hard_context_reset.py` for a real example of overriding `run_conversation()`. diff --git a/tests/integration/base.py b/tests/integration/base.py index 7ce4ee5c3b..c633d367af 100644 --- a/tests/integration/base.py +++ b/tests/integration/base.py @@ -2,7 +2,6 @@ Base classes for agent-sdk integration tests. """ -import json import os import sys from abc import ABC, abstractmethod @@ -103,11 +102,6 @@ def __init__( self.workspace, f"{self.instance_id}_agent_logs.txt" ) - # Create LLM messages file path for this test instance - self.llm_messages_file_path: str = os.path.join( - self.workspace, f"{self.instance_id}_llm_messages.json" - ) - # Early stopping support - must be initialized BEFORE LocalConversation # since the callback may access these attributes self.early_stopper: EarlyStopperBase | None = None @@ -134,40 +128,6 @@ def conversation_callback(self, event: Event): self.early_stop_result = result self.conversation.pause() # Trigger graceful stop - def run_conversation(self) -> None: - """ - Execute the conversation with the agent. - - Override this method to customize the conversation flow (e.g., multiple steps, - intermediate verification, or custom message sequences). The default implementation - sends a single instruction and runs the conversation to completion. 
- - You have access to: - - self.conversation: LocalConversation instance to send messages and control flow - - self.instruction: The instruction string for the test - - self.collected_events: Events collected so far (via callback) - - self.llm_messages: LLM messages collected so far (via callback) - - Example override for multi-step test: - def run_conversation(self) -> None: - # Step 1 - self.conversation.send_message(Message(role="user", content=[TextContent(text="First task")])) - self.conversation.run() - - # Intermediate verification - assert some_condition() - - # Step 2 - self.conversation.send_message(Message(role="user", content=[TextContent(text="Second task")])) - self.conversation.run() - """ - self.conversation.send_message( - message=Message( - role="user", content=[TextContent(text=self.instruction)] - ) - ) - self.conversation.run() - def run_instruction(self) -> TestResult: """ Run user instruction through the agent and verify results. @@ -189,7 +149,12 @@ def run_instruction(self) -> TestResult: stderr_buffer = StringIO() with redirect_stdout(stdout_buffer), redirect_stderr(stderr_buffer): - self.run_conversation() + self.conversation.send_message( + message=Message( + role="user", content=[TextContent(text=self.instruction)] + ) + ) + self.conversation.run() # Save captured output to log file captured_output = stdout_buffer.getvalue() @@ -211,22 +176,6 @@ def run_instruction(self) -> TestResult: if captured_errors: print(captured_errors, file=sys.stderr, end="") - # Save LLM messages if LLM_MESSAGES_DIR is set - llm_messages_dir = os.getenv("LLM_MESSAGES_DIR") - if llm_messages_dir: - # Create directory if it doesn't exist - os.makedirs(llm_messages_dir, exist_ok=True) - - # Save LLM messages to JSON file in the specified directory - llm_messages_path = os.path.join( - llm_messages_dir, f"{self.instance_id}_llm_messages.json" - ) - with open(llm_messages_path, "w") as f: - json.dump(self.llm_messages, f, indent=2) - - # Update the file path so it can be accessed by run_infer.py - self.llm_messages_file_path = llm_messages_path - # Check if early stopped - skip full verification if self.early_stop_result: return TestResult( diff --git a/tests/integration/run_infer.py b/tests/integration/run_infer.py index 5587c3331c..8deb07568c 100755 --- a/tests/integration/run_infer.py +++ b/tests/integration/run_infer.py @@ -52,7 +52,6 @@ class EvalOutput(BaseModel): token_usage: TokenUsageData | None = None error_message: str | None = None log_file_path: str | None = None - llm_messages_file_path: str | None = None @property def required(self) -> bool: @@ -217,35 +216,6 @@ def process_instance(instance: TestInstance, llm_config: dict[str, Any]) -> Eval permanent_log_path, ) - # Copy LLM messages file to a location that will be preserved (if it exists) - llm_messages_file_path = None - if hasattr(test_instance, "llm_messages_file_path") and os.path.exists( - test_instance.llm_messages_file_path - ): - # Create a permanent logs directory if not already created - permanent_logs_dir = os.path.join(os.getcwd(), "integration_test_logs") - os.makedirs(permanent_logs_dir, exist_ok=True) - - # Create a unique filename to avoid conflicts - permanent_llm_messages_filename = ( - f"{instance.instance_id}_llm_messages.json" - ) - permanent_llm_messages_path = os.path.join( - permanent_logs_dir, permanent_llm_messages_filename - ) - - # Copy the LLM messages file - shutil.copy2( - test_instance.llm_messages_file_path, permanent_llm_messages_path - ) - llm_messages_file_path = 
permanent_llm_messages_path - - logger.info( - "Preserved LLM messages file for %s at %s", - instance.instance_id, - permanent_llm_messages_path, - ) - return EvalOutput( instance_id=instance.instance_id, test_result=test_result, @@ -254,7 +224,6 @@ def process_instance(instance: TestInstance, llm_config: dict[str, Any]) -> Eval cost=llm_cost, token_usage=eval_token_usage, log_file_path=log_file_path, - llm_messages_file_path=llm_messages_file_path, ) except SkipTest as e: @@ -373,19 +342,6 @@ def generate_structured_results( eval_output.log_file_path, ) - # Also copy LLM messages file if it exists - if eval_output.llm_messages_file_path and os.path.exists( - eval_output.llm_messages_file_path - ): - llm_messages_filename = f"{eval_output.instance_id}_llm_messages.json" - dest_path = os.path.join(logs_dir, llm_messages_filename) - shutil.copy2(eval_output.llm_messages_file_path, dest_path) - logger.info( - "Copied LLM messages file for %s to %s", - eval_output.instance_id, - dest_path, - ) - # Print summary for console output success_rate = structured_results.success_rate successful = structured_results.successful_tests From 6766c78c1a6a85b398afd0b7f300adc9572fbd0c Mon Sep 17 00:00:00 2001 From: Calvin Smith Date: Tue, 6 Jan 2026 08:12:08 -0700 Subject: [PATCH 5/8] run_instruction broken out to support conversation manipulation --- tests/integration/base.py | 18 +++++++++++------- tests/integration/run_infer.py | 2 +- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/tests/integration/base.py b/tests/integration/base.py index c633d367af..9ba21164f6 100644 --- a/tests/integration/base.py +++ b/tests/integration/base.py @@ -128,7 +128,7 @@ def conversation_callback(self, event: Event): self.early_stop_result = result self.conversation.pause() # Trigger graceful stop - def run_instruction(self) -> TestResult: + def run_integration_test(self) -> TestResult: """ Run user instruction through the agent and verify results. 
@@ -149,12 +149,7 @@ def run_instruction(self) -> TestResult: stderr_buffer = StringIO() with redirect_stdout(stdout_buffer), redirect_stderr(stderr_buffer): - self.conversation.send_message( - message=Message( - role="user", content=[TextContent(text=self.instruction)] - ) - ) - self.conversation.run() + self.run_instructions(self.conversation) # Save captured output to log file captured_output = stdout_buffer.getvalue() @@ -194,6 +189,15 @@ def run_instruction(self) -> TestResult: finally: self.teardown() + def run_instructions(self, conversation: LocalConversation) -> None: + """Feed user instructions to the agent and manage the conversation.""" + conversation.send_message( + message=Message( + role="user", content=[TextContent(text=self.instruction)] + ) + ) + conversation.run() + @property @abstractmethod def tools(self) -> list[Tool]: diff --git a/tests/integration/run_infer.py b/tests/integration/run_infer.py index 8deb07568c..3000e568ce 100755 --- a/tests/integration/run_infer.py +++ b/tests/integration/run_infer.py @@ -148,7 +148,7 @@ def process_instance(instance: TestInstance, llm_config: dict[str, Any]) -> Eval # Run the test start_time = time.time() - test_result = test_instance.run_instruction() + test_result = test_instance.run_integration_test() end_time = time.time() # Access accumulated_cost from the metrics object where it's properly validated From aadef4040ef272c5b245aec62c69f6fb36fb1479 Mon Sep 17 00:00:00 2001 From: Calvin Smith Date: Tue, 6 Jan 2026 08:19:45 -0700 Subject: [PATCH 6/8] minor readme --- tests/integration/README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/integration/README.md b/tests/integration/README.md index b46722aaa5..322c7475ec 100644 --- a/tests/integration/README.md +++ b/tests/integration/README.md @@ -106,3 +106,9 @@ All integration tests inherit from `BaseIntegrationTest` in `base.py`. The base - **`max_iteration_per_run`** (property) - Maximum iterations per conversation (default: `100`) - Override to limit LLM calls for faster tests - Useful for tests that should complete quickly + +### Conversation Control + +The standard way to define an integration test is to set the `INSTRUCTION` class variable. These instructions are sent to the agent as the first user message. + +However, if the functionality being tested requires multiple instructions or accessing the conversation object mid-test then the test can instead be defined by overriding the `run_instructions` method. This method provides a `LocalConversation` object that can be manipulated directly by sending messages, triggering condensations, and the like. 
\ No newline at end of file From e1a357e1e8cc466ff47167bee2ebe857f1e2f76c Mon Sep 17 00:00:00 2001 From: Calvin Smith Date: Tue, 6 Jan 2026 08:28:58 -0700 Subject: [PATCH 7/8] instruction message property --- tests/integration/base.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/tests/integration/base.py b/tests/integration/base.py index 9ba21164f6..67a22cf770 100644 --- a/tests/integration/base.py +++ b/tests/integration/base.py @@ -191,13 +191,16 @@ def run_integration_test(self) -> TestResult: def run_instructions(self, conversation: LocalConversation) -> None: """Feed user instructions to the agent and manage the conversation.""" - conversation.send_message( - message=Message( - role="user", content=[TextContent(text=self.instruction)] - ) - ) + conversation.send_message(message=self.instruction_message) conversation.run() + @property + def instruction_message(self) -> Message: + """The initial instruction message for the agent.""" + return Message( + role="user", content=[TextContent(text=self.instruction)] + ) + @property @abstractmethod def tools(self) -> list[Tool]: From d3aee66cf7ea8e893df4caa8691c3d90ffc52a39 Mon Sep 17 00:00:00 2001 From: Calvin Smith Date: Tue, 6 Jan 2026 08:42:02 -0700 Subject: [PATCH 8/8] linting --- tests/integration/base.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/integration/base.py b/tests/integration/base.py index 67a22cf770..6f7a759aaf 100644 --- a/tests/integration/base.py +++ b/tests/integration/base.py @@ -190,16 +190,14 @@ def run_integration_test(self) -> TestResult: self.teardown() def run_instructions(self, conversation: LocalConversation) -> None: - """Feed user instructions to the agent and manage the conversation.""" + """Feed user instructions to the agent and manage the conversation.""" conversation.send_message(message=self.instruction_message) conversation.run() @property def instruction_message(self) -> Message: """The initial instruction message for the agent.""" - return Message( - role="user", content=[TextContent(text=self.instruction)] - ) + return Message(role="user", content=[TextContent(text=self.instruction)]) @property @abstractmethod
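
For reference, the net effect of this series is the `run_instructions()` hook introduced in PATCH 5/8 and refined in PATCH 7/8 and 8/8: tests normally set the `INSTRUCTION` class variable, and multi-step tests override `run_instructions()` to drive the `LocalConversation` directly. Below is a minimal sketch of such an override, assuming the final state of `tests/integration/base.py` after this series. The class name `MultiStepTest`, the `file.txt` path, and the message text are illustrative assumptions; `Message`, `TextContent`, `LocalConversation`, and `BaseIntegrationTest` are assumed to be imported the same way `tests/integration/base.py` already imports them, and the wiring of `INSTRUCTION` into `self.instruction` is assumed to match the existing base-class behavior described in the README.

```python
# Illustrative sketch only; not part of the patches above.
# Assumes Message, TextContent, LocalConversation, and BaseIntegrationTest
# are imported exactly as in tests/integration/base.py.
import os


class MultiStepTest(BaseIntegrationTest):
    # Standard first instruction (see the README note added in PATCH 6/8).
    INSTRUCTION = "First, create file.txt in the workspace"

    def run_instructions(self, conversation: LocalConversation) -> None:
        # Phase 1: send the default instruction message built from the
        # test's instruction (instruction_message property, PATCH 7/8).
        conversation.send_message(message=self.instruction_message)
        conversation.run()

        # Intermediate verification between conversation phases.
        assert os.path.exists(os.path.join(self.workspace, "file.txt"))

        # Phase 2: a follow-up instruction in the same conversation.
        conversation.send_message(
            message=Message(
                role="user",
                content=[TextContent(text="Now append a second line to file.txt")],
            )
        )
        conversation.run()
```

Single-instruction tests need no override: the default `run_instructions()` is equivalent to sending `self.instruction_message` once and calling `conversation.run()`, as shown in the final base.py hunk above.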