From 6754fbbfea66bcc4dd171bb719d7b939200925cd Mon Sep 17 00:00:00 2001 From: Calvin Smith Date: Mon, 5 Jan 2026 16:19:39 -0700 Subject: [PATCH 1/8] initial --- tests/integration/README.md | 60 ++++++++++++++++++++++++++++++++ tests/integration/base.py | 63 ++++++++++++++++++++++++++++++---- tests/integration/run_infer.py | 44 ++++++++++++++++++++++++ 3 files changed, 161 insertions(+), 6 deletions(-) diff --git a/tests/integration/README.md b/tests/integration/README.md index b46722aaa5..7f2015b535 100644 --- a/tests/integration/README.md +++ b/tests/integration/README.md @@ -50,8 +50,32 @@ uv run python tests/integration/run_infer.py --llm-config '{"model": "litellm_pr # Run a specific test uv run python tests/integration/run_infer.py --llm-config '{"model": "litellm_proxy/anthropic/claude-sonnet-4-5-20250929"}' --eval-ids t01_fix_simple_typo + +# Run with LLM message recording (for debugging) +LLM_MESSAGES_DIR=./llm_messages uv run python tests/integration/run_infer.py --llm-config '{"model": "litellm_proxy/anthropic/claude-sonnet-4-5-20250929"}' +``` + +### Recording LLM Completions + +For debugging and analysis, you can record all LLM request/response messages by setting the `LLM_MESSAGES_DIR` environment variable: + +```bash +# Record LLM messages to a specific directory +export LLM_MESSAGES_DIR=/path/to/store/messages +uv run python tests/integration/run_infer.py --llm-config '{"model": "..."}' ``` +When enabled, this feature: +- Saves all LLM messages from each test to JSON files (`{test_id}_llm_messages.json`) +- Stores files in the specified directory +- Copies them to the test output directory alongside logs +- Does not record anything if the environment variable is not set + +This is useful for: +- Debugging test failures by inspecting exact LLM inputs/outputs +- Analyzing agent behavior across multiple conversation turns +- Creating datasets for evaluation or training + ## Automated Testing with GitHub Actions The integration tests are automatically executed via GitHub Actions using the workflow defined in `.github/workflows/integration-runner.yml`. @@ -79,6 +103,7 @@ These tests must pass for releases and verify that the agent can successfully co - **t07_interactive_commands** - Tests interactive command handling - **t08_image_file_viewing** - Tests image file viewing capabilities - **t09_token_condenser** - Tests that token-based condensation works correctly by verifying `get_token_count()` triggers condensation when token limits are exceeded +- **t10_hard_context_reset** - Tests that the agent can continue working correctly after context condensation. Uses `run_conversation()` override for multi-phase testing ### Behavior Tests (`b*.py`) - **Optional** @@ -106,3 +131,38 @@ All integration tests inherit from `BaseIntegrationTest` in `base.py`. 
The base - **`max_iteration_per_run`** (property) - Maximum iterations per conversation (default: `100`) - Override to limit LLM calls for faster tests - Useful for tests that should complete quickly + +### Optional Methods + +- **`run_conversation()`** - Execute the conversation with the agent (new in v1.7.5) + - Override this method to customize conversation flow for multi-step tests + - Default implementation sends a single instruction and runs to completion + - Provides access to `self.conversation` for direct manipulation + - Example use cases: + - Send multiple messages in sequence + - Verify intermediate state between conversation phases + - Test complex multi-turn interactions + - Trigger condensation at specific points + +#### Example: Multi-Step Test + +```python +class MultiStepTest(BaseIntegrationTest): + def run_conversation(self) -> None: + # Step 1: Initial task + self.conversation.send_message( + Message(role="user", content=[TextContent(text="First, create a file")]) + ) + self.conversation.run() + + # Intermediate verification + assert os.path.exists(os.path.join(self.workspace, "file.txt")) + + # Step 2: Follow-up task + self.conversation.send_message( + Message(role="user", content=[TextContent(text="Now modify the file")]) + ) + self.conversation.run() +``` + +See `t10_hard_context_reset.py` for a real example of overriding `run_conversation()`. diff --git a/tests/integration/base.py b/tests/integration/base.py index c633d367af..7ce4ee5c3b 100644 --- a/tests/integration/base.py +++ b/tests/integration/base.py @@ -2,6 +2,7 @@ Base classes for agent-sdk integration tests. """ +import json import os import sys from abc import ABC, abstractmethod @@ -102,6 +103,11 @@ def __init__( self.workspace, f"{self.instance_id}_agent_logs.txt" ) + # Create LLM messages file path for this test instance + self.llm_messages_file_path: str = os.path.join( + self.workspace, f"{self.instance_id}_llm_messages.json" + ) + # Early stopping support - must be initialized BEFORE LocalConversation # since the callback may access these attributes self.early_stopper: EarlyStopperBase | None = None @@ -128,6 +134,40 @@ def conversation_callback(self, event: Event): self.early_stop_result = result self.conversation.pause() # Trigger graceful stop + def run_conversation(self) -> None: + """ + Execute the conversation with the agent. + + Override this method to customize the conversation flow (e.g., multiple steps, + intermediate verification, or custom message sequences). The default implementation + sends a single instruction and runs the conversation to completion. 
+ + You have access to: + - self.conversation: LocalConversation instance to send messages and control flow + - self.instruction: The instruction string for the test + - self.collected_events: Events collected so far (via callback) + - self.llm_messages: LLM messages collected so far (via callback) + + Example override for multi-step test: + def run_conversation(self) -> None: + # Step 1 + self.conversation.send_message(Message(role="user", content=[TextContent(text="First task")])) + self.conversation.run() + + # Intermediate verification + assert some_condition() + + # Step 2 + self.conversation.send_message(Message(role="user", content=[TextContent(text="Second task")])) + self.conversation.run() + """ + self.conversation.send_message( + message=Message( + role="user", content=[TextContent(text=self.instruction)] + ) + ) + self.conversation.run() + def run_instruction(self) -> TestResult: """ Run user instruction through the agent and verify results. @@ -149,12 +189,7 @@ def run_instruction(self) -> TestResult: stderr_buffer = StringIO() with redirect_stdout(stdout_buffer), redirect_stderr(stderr_buffer): - self.conversation.send_message( - message=Message( - role="user", content=[TextContent(text=self.instruction)] - ) - ) - self.conversation.run() + self.run_conversation() # Save captured output to log file captured_output = stdout_buffer.getvalue() @@ -176,6 +211,22 @@ def run_instruction(self) -> TestResult: if captured_errors: print(captured_errors, file=sys.stderr, end="") + # Save LLM messages if LLM_MESSAGES_DIR is set + llm_messages_dir = os.getenv("LLM_MESSAGES_DIR") + if llm_messages_dir: + # Create directory if it doesn't exist + os.makedirs(llm_messages_dir, exist_ok=True) + + # Save LLM messages to JSON file in the specified directory + llm_messages_path = os.path.join( + llm_messages_dir, f"{self.instance_id}_llm_messages.json" + ) + with open(llm_messages_path, "w") as f: + json.dump(self.llm_messages, f, indent=2) + + # Update the file path so it can be accessed by run_infer.py + self.llm_messages_file_path = llm_messages_path + # Check if early stopped - skip full verification if self.early_stop_result: return TestResult( diff --git a/tests/integration/run_infer.py b/tests/integration/run_infer.py index 8deb07568c..5587c3331c 100755 --- a/tests/integration/run_infer.py +++ b/tests/integration/run_infer.py @@ -52,6 +52,7 @@ class EvalOutput(BaseModel): token_usage: TokenUsageData | None = None error_message: str | None = None log_file_path: str | None = None + llm_messages_file_path: str | None = None @property def required(self) -> bool: @@ -216,6 +217,35 @@ def process_instance(instance: TestInstance, llm_config: dict[str, Any]) -> Eval permanent_log_path, ) + # Copy LLM messages file to a location that will be preserved (if it exists) + llm_messages_file_path = None + if hasattr(test_instance, "llm_messages_file_path") and os.path.exists( + test_instance.llm_messages_file_path + ): + # Create a permanent logs directory if not already created + permanent_logs_dir = os.path.join(os.getcwd(), "integration_test_logs") + os.makedirs(permanent_logs_dir, exist_ok=True) + + # Create a unique filename to avoid conflicts + permanent_llm_messages_filename = ( + f"{instance.instance_id}_llm_messages.json" + ) + permanent_llm_messages_path = os.path.join( + permanent_logs_dir, permanent_llm_messages_filename + ) + + # Copy the LLM messages file + shutil.copy2( + test_instance.llm_messages_file_path, permanent_llm_messages_path + ) + llm_messages_file_path = 
permanent_llm_messages_path + + logger.info( + "Preserved LLM messages file for %s at %s", + instance.instance_id, + permanent_llm_messages_path, + ) + return EvalOutput( instance_id=instance.instance_id, test_result=test_result, @@ -224,6 +254,7 @@ def process_instance(instance: TestInstance, llm_config: dict[str, Any]) -> Eval cost=llm_cost, token_usage=eval_token_usage, log_file_path=log_file_path, + llm_messages_file_path=llm_messages_file_path, ) except SkipTest as e: @@ -342,6 +373,19 @@ def generate_structured_results( eval_output.log_file_path, ) + # Also copy LLM messages file if it exists + if eval_output.llm_messages_file_path and os.path.exists( + eval_output.llm_messages_file_path + ): + llm_messages_filename = f"{eval_output.instance_id}_llm_messages.json" + dest_path = os.path.join(logs_dir, llm_messages_filename) + shutil.copy2(eval_output.llm_messages_file_path, dest_path) + logger.info( + "Copied LLM messages file for %s to %s", + eval_output.instance_id, + dest_path, + ) + # Print summary for console output success_rate = structured_results.success_rate successful = structured_results.successful_tests From 06563293982f2ec3b4e12db64ef4f05b57a71921 Mon Sep 17 00:00:00 2001 From: Calvin Smith Date: Mon, 5 Jan 2026 16:23:57 -0700 Subject: [PATCH 2/8] renaming local vars and removing ref to unused test --- tests/integration/README.md | 17 +++++++---------- tests/integration/base.py | 23 +++++++++++++---------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/tests/integration/README.md b/tests/integration/README.md index 7f2015b535..27ae0a2e80 100644 --- a/tests/integration/README.md +++ b/tests/integration/README.md @@ -103,7 +103,6 @@ These tests must pass for releases and verify that the agent can successfully co - **t07_interactive_commands** - Tests interactive command handling - **t08_image_file_viewing** - Tests image file viewing capabilities - **t09_token_condenser** - Tests that token-based condensation works correctly by verifying `get_token_count()` triggers condensation when token limits are exceeded -- **t10_hard_context_reset** - Tests that the agent can continue working correctly after context condensation. Uses `run_conversation()` override for multi-phase testing ### Behavior Tests (`b*.py`) - **Optional** @@ -134,10 +133,10 @@ All integration tests inherit from `BaseIntegrationTest` in `base.py`. The base ### Optional Methods -- **`run_conversation()`** - Execute the conversation with the agent (new in v1.7.5) +- **`run_conversation(conversation)`** - Execute the conversation with the agent (new in v1.7.5) - Override this method to customize conversation flow for multi-step tests - Default implementation sends a single instruction and runs to completion - - Provides access to `self.conversation` for direct manipulation + - Receives the conversation object as a parameter for cleaner API - Example use cases: - Send multiple messages in sequence - Verify intermediate state between conversation phases @@ -148,21 +147,19 @@ All integration tests inherit from `BaseIntegrationTest` in `base.py`. 
The base ```python class MultiStepTest(BaseIntegrationTest): - def run_conversation(self) -> None: + def run_conversation(self, conversation: LocalConversation) -> None: # Step 1: Initial task - self.conversation.send_message( + conversation.send_message( Message(role="user", content=[TextContent(text="First, create a file")]) ) - self.conversation.run() + conversation.run() # Intermediate verification assert os.path.exists(os.path.join(self.workspace, "file.txt")) # Step 2: Follow-up task - self.conversation.send_message( + conversation.send_message( Message(role="user", content=[TextContent(text="Now modify the file")]) ) - self.conversation.run() + conversation.run() ``` - -See `t10_hard_context_reset.py` for a real example of overriding `run_conversation()`. diff --git a/tests/integration/base.py b/tests/integration/base.py index 7ce4ee5c3b..13b19ca7ee 100644 --- a/tests/integration/base.py +++ b/tests/integration/base.py @@ -134,7 +134,7 @@ def conversation_callback(self, event: Event): self.early_stop_result = result self.conversation.pause() # Trigger graceful stop - def run_conversation(self) -> None: + def run_conversation(self, conversation: LocalConversation) -> None: """ Execute the conversation with the agent. @@ -142,31 +142,34 @@ def run_conversation(self) -> None: intermediate verification, or custom message sequences). The default implementation sends a single instruction and runs the conversation to completion. + Args: + conversation: The LocalConversation instance to send messages and control flow + You have access to: - - self.conversation: LocalConversation instance to send messages and control flow + - conversation: LocalConversation parameter to send messages and control flow - self.instruction: The instruction string for the test - self.collected_events: Events collected so far (via callback) - self.llm_messages: LLM messages collected so far (via callback) Example override for multi-step test: - def run_conversation(self) -> None: + def run_conversation(self, conversation: LocalConversation) -> None: # Step 1 - self.conversation.send_message(Message(role="user", content=[TextContent(text="First task")])) - self.conversation.run() + conversation.send_message(Message(role="user", content=[TextContent(text="First task")])) + conversation.run() # Intermediate verification assert some_condition() # Step 2 - self.conversation.send_message(Message(role="user", content=[TextContent(text="Second task")])) - self.conversation.run() + conversation.send_message(Message(role="user", content=[TextContent(text="Second task")])) + conversation.run() """ - self.conversation.send_message( + conversation.send_message( message=Message( role="user", content=[TextContent(text=self.instruction)] ) ) - self.conversation.run() + conversation.run() def run_instruction(self) -> TestResult: """ @@ -189,7 +192,7 @@ def run_instruction(self) -> TestResult: stderr_buffer = StringIO() with redirect_stdout(stdout_buffer), redirect_stderr(stderr_buffer): - self.run_conversation() + self.run_conversation(self.conversation) # Save captured output to log file captured_output = stdout_buffer.getvalue() From bbd17a1d0515abb4960ff02dea97e3c28dafdc65 Mon Sep 17 00:00:00 2001 From: Calvin Smith Date: Tue, 6 Jan 2026 08:02:06 -0700 Subject: [PATCH 3/8] Revert "renaming local vars and removing ref to unused test" This reverts commit 06563293982f2ec3b4e12db64ef4f05b57a71921. 
--- tests/integration/README.md | 17 ++++++++++------- tests/integration/base.py | 23 ++++++++++------------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/tests/integration/README.md b/tests/integration/README.md index 27ae0a2e80..7f2015b535 100644 --- a/tests/integration/README.md +++ b/tests/integration/README.md @@ -103,6 +103,7 @@ These tests must pass for releases and verify that the agent can successfully co - **t07_interactive_commands** - Tests interactive command handling - **t08_image_file_viewing** - Tests image file viewing capabilities - **t09_token_condenser** - Tests that token-based condensation works correctly by verifying `get_token_count()` triggers condensation when token limits are exceeded +- **t10_hard_context_reset** - Tests that the agent can continue working correctly after context condensation. Uses `run_conversation()` override for multi-phase testing ### Behavior Tests (`b*.py`) - **Optional** @@ -133,10 +134,10 @@ All integration tests inherit from `BaseIntegrationTest` in `base.py`. The base ### Optional Methods -- **`run_conversation(conversation)`** - Execute the conversation with the agent (new in v1.7.5) +- **`run_conversation()`** - Execute the conversation with the agent (new in v1.7.5) - Override this method to customize conversation flow for multi-step tests - Default implementation sends a single instruction and runs to completion - - Receives the conversation object as a parameter for cleaner API + - Provides access to `self.conversation` for direct manipulation - Example use cases: - Send multiple messages in sequence - Verify intermediate state between conversation phases @@ -147,19 +148,21 @@ All integration tests inherit from `BaseIntegrationTest` in `base.py`. The base ```python class MultiStepTest(BaseIntegrationTest): - def run_conversation(self, conversation: LocalConversation) -> None: + def run_conversation(self) -> None: # Step 1: Initial task - conversation.send_message( + self.conversation.send_message( Message(role="user", content=[TextContent(text="First, create a file")]) ) - conversation.run() + self.conversation.run() # Intermediate verification assert os.path.exists(os.path.join(self.workspace, "file.txt")) # Step 2: Follow-up task - conversation.send_message( + self.conversation.send_message( Message(role="user", content=[TextContent(text="Now modify the file")]) ) - conversation.run() + self.conversation.run() ``` + +See `t10_hard_context_reset.py` for a real example of overriding `run_conversation()`. diff --git a/tests/integration/base.py b/tests/integration/base.py index 13b19ca7ee..7ce4ee5c3b 100644 --- a/tests/integration/base.py +++ b/tests/integration/base.py @@ -134,7 +134,7 @@ def conversation_callback(self, event: Event): self.early_stop_result = result self.conversation.pause() # Trigger graceful stop - def run_conversation(self, conversation: LocalConversation) -> None: + def run_conversation(self) -> None: """ Execute the conversation with the agent. @@ -142,34 +142,31 @@ def run_conversation(self, conversation: LocalConversation) -> None: intermediate verification, or custom message sequences). The default implementation sends a single instruction and runs the conversation to completion. 
- Args: - conversation: The LocalConversation instance to send messages and control flow - You have access to: - - conversation: LocalConversation parameter to send messages and control flow + - self.conversation: LocalConversation instance to send messages and control flow - self.instruction: The instruction string for the test - self.collected_events: Events collected so far (via callback) - self.llm_messages: LLM messages collected so far (via callback) Example override for multi-step test: - def run_conversation(self, conversation: LocalConversation) -> None: + def run_conversation(self) -> None: # Step 1 - conversation.send_message(Message(role="user", content=[TextContent(text="First task")])) - conversation.run() + self.conversation.send_message(Message(role="user", content=[TextContent(text="First task")])) + self.conversation.run() # Intermediate verification assert some_condition() # Step 2 - conversation.send_message(Message(role="user", content=[TextContent(text="Second task")])) - conversation.run() + self.conversation.send_message(Message(role="user", content=[TextContent(text="Second task")])) + self.conversation.run() """ - conversation.send_message( + self.conversation.send_message( message=Message( role="user", content=[TextContent(text=self.instruction)] ) ) - conversation.run() + self.conversation.run() def run_instruction(self) -> TestResult: """ @@ -192,7 +189,7 @@ def run_instruction(self) -> TestResult: stderr_buffer = StringIO() with redirect_stdout(stdout_buffer), redirect_stderr(stderr_buffer): - self.run_conversation(self.conversation) + self.run_conversation() # Save captured output to log file captured_output = stdout_buffer.getvalue() From f79e6645e70f750c89c52421ed0da0a04854c0e0 Mon Sep 17 00:00:00 2001 From: Calvin Smith Date: Tue, 6 Jan 2026 08:02:08 -0700 Subject: [PATCH 4/8] Revert "initial" This reverts commit 6754fbbfea66bcc4dd171bb719d7b939200925cd. 
--- tests/integration/README.md | 60 -------------------------------- tests/integration/base.py | 63 ++++------------------------------ tests/integration/run_infer.py | 44 ------------------------ 3 files changed, 6 insertions(+), 161 deletions(-) diff --git a/tests/integration/README.md b/tests/integration/README.md index 7f2015b535..b46722aaa5 100644 --- a/tests/integration/README.md +++ b/tests/integration/README.md @@ -50,32 +50,8 @@ uv run python tests/integration/run_infer.py --llm-config '{"model": "litellm_pr # Run a specific test uv run python tests/integration/run_infer.py --llm-config '{"model": "litellm_proxy/anthropic/claude-sonnet-4-5-20250929"}' --eval-ids t01_fix_simple_typo - -# Run with LLM message recording (for debugging) -LLM_MESSAGES_DIR=./llm_messages uv run python tests/integration/run_infer.py --llm-config '{"model": "litellm_proxy/anthropic/claude-sonnet-4-5-20250929"}' -``` - -### Recording LLM Completions - -For debugging and analysis, you can record all LLM request/response messages by setting the `LLM_MESSAGES_DIR` environment variable: - -```bash -# Record LLM messages to a specific directory -export LLM_MESSAGES_DIR=/path/to/store/messages -uv run python tests/integration/run_infer.py --llm-config '{"model": "..."}' ``` -When enabled, this feature: -- Saves all LLM messages from each test to JSON files (`{test_id}_llm_messages.json`) -- Stores files in the specified directory -- Copies them to the test output directory alongside logs -- Does not record anything if the environment variable is not set - -This is useful for: -- Debugging test failures by inspecting exact LLM inputs/outputs -- Analyzing agent behavior across multiple conversation turns -- Creating datasets for evaluation or training - ## Automated Testing with GitHub Actions The integration tests are automatically executed via GitHub Actions using the workflow defined in `.github/workflows/integration-runner.yml`. @@ -103,7 +79,6 @@ These tests must pass for releases and verify that the agent can successfully co - **t07_interactive_commands** - Tests interactive command handling - **t08_image_file_viewing** - Tests image file viewing capabilities - **t09_token_condenser** - Tests that token-based condensation works correctly by verifying `get_token_count()` triggers condensation when token limits are exceeded -- **t10_hard_context_reset** - Tests that the agent can continue working correctly after context condensation. Uses `run_conversation()` override for multi-phase testing ### Behavior Tests (`b*.py`) - **Optional** @@ -131,38 +106,3 @@ All integration tests inherit from `BaseIntegrationTest` in `base.py`. 
The base - **`max_iteration_per_run`** (property) - Maximum iterations per conversation (default: `100`) - Override to limit LLM calls for faster tests - Useful for tests that should complete quickly - -### Optional Methods - -- **`run_conversation()`** - Execute the conversation with the agent (new in v1.7.5) - - Override this method to customize conversation flow for multi-step tests - - Default implementation sends a single instruction and runs to completion - - Provides access to `self.conversation` for direct manipulation - - Example use cases: - - Send multiple messages in sequence - - Verify intermediate state between conversation phases - - Test complex multi-turn interactions - - Trigger condensation at specific points - -#### Example: Multi-Step Test - -```python -class MultiStepTest(BaseIntegrationTest): - def run_conversation(self) -> None: - # Step 1: Initial task - self.conversation.send_message( - Message(role="user", content=[TextContent(text="First, create a file")]) - ) - self.conversation.run() - - # Intermediate verification - assert os.path.exists(os.path.join(self.workspace, "file.txt")) - - # Step 2: Follow-up task - self.conversation.send_message( - Message(role="user", content=[TextContent(text="Now modify the file")]) - ) - self.conversation.run() -``` - -See `t10_hard_context_reset.py` for a real example of overriding `run_conversation()`. diff --git a/tests/integration/base.py b/tests/integration/base.py index 7ce4ee5c3b..c633d367af 100644 --- a/tests/integration/base.py +++ b/tests/integration/base.py @@ -2,7 +2,6 @@ Base classes for agent-sdk integration tests. """ -import json import os import sys from abc import ABC, abstractmethod @@ -103,11 +102,6 @@ def __init__( self.workspace, f"{self.instance_id}_agent_logs.txt" ) - # Create LLM messages file path for this test instance - self.llm_messages_file_path: str = os.path.join( - self.workspace, f"{self.instance_id}_llm_messages.json" - ) - # Early stopping support - must be initialized BEFORE LocalConversation # since the callback may access these attributes self.early_stopper: EarlyStopperBase | None = None @@ -134,40 +128,6 @@ def conversation_callback(self, event: Event): self.early_stop_result = result self.conversation.pause() # Trigger graceful stop - def run_conversation(self) -> None: - """ - Execute the conversation with the agent. - - Override this method to customize the conversation flow (e.g., multiple steps, - intermediate verification, or custom message sequences). The default implementation - sends a single instruction and runs the conversation to completion. 
- - You have access to: - - self.conversation: LocalConversation instance to send messages and control flow - - self.instruction: The instruction string for the test - - self.collected_events: Events collected so far (via callback) - - self.llm_messages: LLM messages collected so far (via callback) - - Example override for multi-step test: - def run_conversation(self) -> None: - # Step 1 - self.conversation.send_message(Message(role="user", content=[TextContent(text="First task")])) - self.conversation.run() - - # Intermediate verification - assert some_condition() - - # Step 2 - self.conversation.send_message(Message(role="user", content=[TextContent(text="Second task")])) - self.conversation.run() - """ - self.conversation.send_message( - message=Message( - role="user", content=[TextContent(text=self.instruction)] - ) - ) - self.conversation.run() - def run_instruction(self) -> TestResult: """ Run user instruction through the agent and verify results. @@ -189,7 +149,12 @@ def run_instruction(self) -> TestResult: stderr_buffer = StringIO() with redirect_stdout(stdout_buffer), redirect_stderr(stderr_buffer): - self.run_conversation() + self.conversation.send_message( + message=Message( + role="user", content=[TextContent(text=self.instruction)] + ) + ) + self.conversation.run() # Save captured output to log file captured_output = stdout_buffer.getvalue() @@ -211,22 +176,6 @@ def run_instruction(self) -> TestResult: if captured_errors: print(captured_errors, file=sys.stderr, end="") - # Save LLM messages if LLM_MESSAGES_DIR is set - llm_messages_dir = os.getenv("LLM_MESSAGES_DIR") - if llm_messages_dir: - # Create directory if it doesn't exist - os.makedirs(llm_messages_dir, exist_ok=True) - - # Save LLM messages to JSON file in the specified directory - llm_messages_path = os.path.join( - llm_messages_dir, f"{self.instance_id}_llm_messages.json" - ) - with open(llm_messages_path, "w") as f: - json.dump(self.llm_messages, f, indent=2) - - # Update the file path so it can be accessed by run_infer.py - self.llm_messages_file_path = llm_messages_path - # Check if early stopped - skip full verification if self.early_stop_result: return TestResult( diff --git a/tests/integration/run_infer.py b/tests/integration/run_infer.py index 5587c3331c..8deb07568c 100755 --- a/tests/integration/run_infer.py +++ b/tests/integration/run_infer.py @@ -52,7 +52,6 @@ class EvalOutput(BaseModel): token_usage: TokenUsageData | None = None error_message: str | None = None log_file_path: str | None = None - llm_messages_file_path: str | None = None @property def required(self) -> bool: @@ -217,35 +216,6 @@ def process_instance(instance: TestInstance, llm_config: dict[str, Any]) -> Eval permanent_log_path, ) - # Copy LLM messages file to a location that will be preserved (if it exists) - llm_messages_file_path = None - if hasattr(test_instance, "llm_messages_file_path") and os.path.exists( - test_instance.llm_messages_file_path - ): - # Create a permanent logs directory if not already created - permanent_logs_dir = os.path.join(os.getcwd(), "integration_test_logs") - os.makedirs(permanent_logs_dir, exist_ok=True) - - # Create a unique filename to avoid conflicts - permanent_llm_messages_filename = ( - f"{instance.instance_id}_llm_messages.json" - ) - permanent_llm_messages_path = os.path.join( - permanent_logs_dir, permanent_llm_messages_filename - ) - - # Copy the LLM messages file - shutil.copy2( - test_instance.llm_messages_file_path, permanent_llm_messages_path - ) - llm_messages_file_path = 
permanent_llm_messages_path - - logger.info( - "Preserved LLM messages file for %s at %s", - instance.instance_id, - permanent_llm_messages_path, - ) - return EvalOutput( instance_id=instance.instance_id, test_result=test_result, @@ -254,7 +224,6 @@ def process_instance(instance: TestInstance, llm_config: dict[str, Any]) -> Eval cost=llm_cost, token_usage=eval_token_usage, log_file_path=log_file_path, - llm_messages_file_path=llm_messages_file_path, ) except SkipTest as e: @@ -373,19 +342,6 @@ def generate_structured_results( eval_output.log_file_path, ) - # Also copy LLM messages file if it exists - if eval_output.llm_messages_file_path and os.path.exists( - eval_output.llm_messages_file_path - ): - llm_messages_filename = f"{eval_output.instance_id}_llm_messages.json" - dest_path = os.path.join(logs_dir, llm_messages_filename) - shutil.copy2(eval_output.llm_messages_file_path, dest_path) - logger.info( - "Copied LLM messages file for %s to %s", - eval_output.instance_id, - dest_path, - ) - # Print summary for console output success_rate = structured_results.success_rate successful = structured_results.successful_tests From 6766c78c1a6a85b398afd0b7f300adc9572fbd0c Mon Sep 17 00:00:00 2001 From: Calvin Smith Date: Tue, 6 Jan 2026 08:12:08 -0700 Subject: [PATCH 5/8] run_instruction broken out to support conversation manipulation --- tests/integration/base.py | 18 +++++++++++------- tests/integration/run_infer.py | 2 +- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/tests/integration/base.py b/tests/integration/base.py index c633d367af..9ba21164f6 100644 --- a/tests/integration/base.py +++ b/tests/integration/base.py @@ -128,7 +128,7 @@ def conversation_callback(self, event: Event): self.early_stop_result = result self.conversation.pause() # Trigger graceful stop - def run_instruction(self) -> TestResult: + def run_integration_test(self) -> TestResult: """ Run user instruction through the agent and verify results. 
@@ -149,12 +149,7 @@ def run_instruction(self) -> TestResult: stderr_buffer = StringIO() with redirect_stdout(stdout_buffer), redirect_stderr(stderr_buffer): - self.conversation.send_message( - message=Message( - role="user", content=[TextContent(text=self.instruction)] - ) - ) - self.conversation.run() + self.run_instructions(self.conversation) # Save captured output to log file captured_output = stdout_buffer.getvalue() @@ -194,6 +189,15 @@ def run_instruction(self) -> TestResult: finally: self.teardown() + def run_instructions(self, conversation: LocalConversation) -> None: + """Feed user instructions to the agent and manage the conversation.""" + conversation.send_message( + message=Message( + role="user", content=[TextContent(text=self.instruction)] + ) + ) + conversation.run() + @property @abstractmethod def tools(self) -> list[Tool]: diff --git a/tests/integration/run_infer.py b/tests/integration/run_infer.py index 8deb07568c..3000e568ce 100755 --- a/tests/integration/run_infer.py +++ b/tests/integration/run_infer.py @@ -148,7 +148,7 @@ def process_instance(instance: TestInstance, llm_config: dict[str, Any]) -> Eval # Run the test start_time = time.time() - test_result = test_instance.run_instruction() + test_result = test_instance.run_integration_test() end_time = time.time() # Access accumulated_cost from the metrics object where it's properly validated From aadef4040ef272c5b245aec62c69f6fb36fb1479 Mon Sep 17 00:00:00 2001 From: Calvin Smith Date: Tue, 6 Jan 2026 08:19:45 -0700 Subject: [PATCH 6/8] minor readme --- tests/integration/README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/integration/README.md b/tests/integration/README.md index b46722aaa5..322c7475ec 100644 --- a/tests/integration/README.md +++ b/tests/integration/README.md @@ -106,3 +106,9 @@ All integration tests inherit from `BaseIntegrationTest` in `base.py`. The base - **`max_iteration_per_run`** (property) - Maximum iterations per conversation (default: `100`) - Override to limit LLM calls for faster tests - Useful for tests that should complete quickly + +### Conversation Control + +The standard way to define an integration test is to set the `INSTRUCTION` class variable. These instructions are sent to the agent as the first user message. + +However, if the functionality being tested requires multiple instructions or accessing the conversation object mid-test then the test can instead be defined by overriding the `run_instructions` method. This method provides a `LocalConversation` object that can be manipulated directly by sending messages, triggering condensations, and the like. 
\ No newline at end of file From e1a357e1e8cc466ff47167bee2ebe857f1e2f76c Mon Sep 17 00:00:00 2001 From: Calvin Smith Date: Tue, 6 Jan 2026 08:28:58 -0700 Subject: [PATCH 7/8] instruction message property --- tests/integration/base.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/tests/integration/base.py b/tests/integration/base.py index 9ba21164f6..67a22cf770 100644 --- a/tests/integration/base.py +++ b/tests/integration/base.py @@ -191,13 +191,16 @@ def run_integration_test(self) -> TestResult: def run_instructions(self, conversation: LocalConversation) -> None: """Feed user instructions to the agent and manage the conversation.""" - conversation.send_message( - message=Message( - role="user", content=[TextContent(text=self.instruction)] - ) - ) + conversation.send_message(message=self.instruction_message) conversation.run() + @property + def instruction_message(self) -> Message: + """The initial instruction message for the agent.""" + return Message( + role="user", content=[TextContent(text=self.instruction)] + ) + @property @abstractmethod def tools(self) -> list[Tool]: From d3aee66cf7ea8e893df4caa8691c3d90ffc52a39 Mon Sep 17 00:00:00 2001 From: Calvin Smith Date: Tue, 6 Jan 2026 08:42:02 -0700 Subject: [PATCH 8/8] linting --- tests/integration/base.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/integration/base.py b/tests/integration/base.py index 67a22cf770..6f7a759aaf 100644 --- a/tests/integration/base.py +++ b/tests/integration/base.py @@ -190,16 +190,14 @@ def run_integration_test(self) -> TestResult: self.teardown() def run_instructions(self, conversation: LocalConversation) -> None: - """Feed user instructions to the agent and manage the conversation.""" + """Feed user instructions to the agent and manage the conversation.""" conversation.send_message(message=self.instruction_message) conversation.run() @property def instruction_message(self) -> Message: """The initial instruction message for the agent.""" - return Message( - role="user", content=[TextContent(text=self.instruction)] - ) + return Message(role="user", content=[TextContent(text=self.instruction)]) @property @abstractmethod
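
For reference, the net effect of this series is the `run_instructions()` hook introduced in PATCH 5/8 and refined in PATCH 7/8 and 8/8: tests normally set the `INSTRUCTION` class variable, and multi-step tests override `run_instructions()` to drive the `LocalConversation` directly. Below is a minimal sketch of such an override, assuming the final state of `tests/integration/base.py` after this series. The class name `MultiStepTest`, the `file.txt` path, and the message text are illustrative assumptions; `Message`, `TextContent`, `LocalConversation`, and `BaseIntegrationTest` are assumed to be imported the same way `tests/integration/base.py` already imports them, and the wiring of `INSTRUCTION` into `self.instruction` is assumed to match the existing base-class behavior described in the README.

```python
# Illustrative sketch only; not part of the patches above.
# Assumes Message, TextContent, LocalConversation, and BaseIntegrationTest
# are imported exactly as in tests/integration/base.py.
import os


class MultiStepTest(BaseIntegrationTest):
    # Standard first instruction (see the README note added in PATCH 6/8).
    INSTRUCTION = "First, create file.txt in the workspace"

    def run_instructions(self, conversation: LocalConversation) -> None:
        # Phase 1: send the default instruction message built from the
        # test's instruction (instruction_message property, PATCH 7/8).
        conversation.send_message(message=self.instruction_message)
        conversation.run()

        # Intermediate verification between conversation phases.
        assert os.path.exists(os.path.join(self.workspace, "file.txt"))

        # Phase 2: a follow-up instruction in the same conversation.
        conversation.send_message(
            message=Message(
                role="user",
                content=[TextContent(text="Now append a second line to file.txt")],
            )
        )
        conversation.run()
```

Single-instruction tests need no override: the default `run_instructions()` is equivalent to sending `self.instruction_message` once and calling `conversation.run()`, as shown in the final base.py hunk above.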