18 changes: 12 additions & 6 deletions tests/integration/base.py
@@ -128,6 +128,17 @@ def conversation_callback(self, event: Event):
self.early_stop_result = result
self.conversation.pause() # Trigger graceful stop

def execute_conversation(self) -> None:
"""
Execute the conversation flow. Override for custom multi-message flows.

Default implementation sends the instruction and runs the conversation once.
"""
self.conversation.send_message(
message=Message(role="user", content=[TextContent(text=self.instruction)])
)
self.conversation.run()

def run_instruction(self) -> TestResult:
"""
Run user instruction through the agent and verify results.
@@ -149,12 +160,7 @@ def run_instruction(self) -> TestResult:
stderr_buffer = StringIO()

with redirect_stdout(stdout_buffer), redirect_stderr(stderr_buffer):
self.conversation.send_message(
message=Message(
role="user", content=[TextContent(text=self.instruction)]
)
)
self.conversation.run()
self.execute_conversation()

# Save captured output to log file
captured_output = stdout_buffer.getvalue()
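The new hook keeps run_instruction's output capture intact while letting subclasses script richer flows. A minimal sketch of an override (hypothetical subclass; Message and TextContent as imported in base.py):

class MultiTurnTest(BaseIntegrationTest):
    def execute_conversation(self) -> None:
        # First turn: same as the default implementation.
        self.conversation.send_message(
            message=Message(role="user", content=[TextContent(text=self.instruction)])
        )
        self.conversation.run()
        # Follow-up turn; the new t10 test below uses this plain-string form.
        self.conversation.send_message("Now double-check your results.")
        self.conversation.run()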
296 changes: 296 additions & 0 deletions tests/integration/tests/t10_condensation_thinking.py
@@ -0,0 +1,296 @@
"""Test condensation with thinking blocks.

Verifies that the conversation continues successfully after condensation when
thinking blocks are forgotten. Tests that signature verification works correctly
when earlier thinking blocks are condensed away while later ones remain.
"""

from itertools import chain

from openhands.sdk import get_logger
from openhands.sdk.context.condenser import CondenserBase
from openhands.sdk.context.view import View
from openhands.sdk.event import ActionEvent, Event
from openhands.sdk.event.condenser import Condensation
from openhands.sdk.llm import LLM
from openhands.sdk.llm.utils.model_features import get_features
from openhands.sdk.tool import Tool, register_tool
from openhands.tools.terminal import TerminalTool
from tests.integration.base import BaseIntegrationTest, SkipTest, TestResult


# Multi-step task requiring multiple tool calls with reasoning
INSTRUCTION = """
Solve this multi-step problem using bash commands. Show your reasoning for each step:

1. Calculate compound interest for $10,000 at 5% annually, compounded quarterly for 3
years:
A = P(1 + r/n)^(nt) where P=10000, r=0.05, n=4, t=3

2. Calculate simple interest for the same amount:
A = P(1 + rt) where P=10000, r=0.05, t=3

3. Calculate the difference between compound and simple interest

4. Write each intermediate result to separate files:
- compound.txt: compound interest result
- simple.txt: simple interest result
- difference.txt: the difference

Use the 'bc' calculator command for each calculation. You can use echo and pipes to
calculate.

For example: echo "scale=2; 10000 * 1.05" | bc
"""

logger = get_logger(__name__)


class FirstToolLoopCondenser(CondenserBase):
"""Custom condenser that forgets the first tool loop on request.

This condenser handles CondensationRequest events and forgets the first
tool loop (identified via manipulation_indices), replacing it with a static summary.
"""

def handles_condensation_requests(self) -> bool:
"""This condenser handles explicit condensation requests."""
return True

def condense(self, view: View, agent_llm: LLM | None = None) -> View | Condensation:
"""Condense by forgetting the first tool loop with a static summary.

Args:
view: The current view of the conversation
agent_llm: The LLM instance (unused but required by interface)

Returns:
Condensation event that forgets the first tool loop
"""
# Only condense if there's an unhandled condensation request
if not view.unhandled_condensation_request:
return view

# Get manipulation indices to identify tool loop boundaries
indices = view.manipulation_indices
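# Each consecutive pair of indices brackets one atomic unit; i.e.
# view.events[indices[i]:indices[i + 1]] is one complete tool loop
# (assumed from the usage below).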

# We need at least 2 atomic units (i.e. at least 3 boundary indices) to condense
# the first one; whether they contain thinking blocks is checked below
if len(indices) < 3:
return view

# Find the first atomic unit that contains thinking blocks
first_thinking_block_unit_idx = None
for i in range(len(indices) - 1):
start_idx = indices[i]
end_idx = indices[i + 1]

# Check if this atomic unit has any events with thinking blocks
for event in view.events[start_idx:end_idx]:
if isinstance(event, ActionEvent) and event.thinking_blocks:
Collaborator:
Thank you! I appreciate this. I do wonder, though, whether it belongs as an integration test or maybe something else.

thinking_blocks are an Anthropic Claude parameter; Gemini 3 has them, but they go by different rules (and different from Gemini 2.5 too 😅 ...); Minimax I think has them too. And I can't think of another.

Since it feels very LLM specific, I wonder if maybe a script in scripts/ would work? Alternatively, we now have an examples/ directory named llm_specific/ ... Neither seems great, idk, WDYT?

Collaborator Author:

It probably is too LLM specific to be an integration test. The scaffold is just convenient for condenser tests -- repeatable, no mocked objects, arbitrary LLMs configured from outside the test, etc.

What if we made it like the behavior tests? Using the same infra as the integration tests, but triggered separately and non-blocking. This would basically become c01_test... instead.

It would be good to have a batch of tests that stress the condenser across multiple LLMs, something we can run on occasion to make sure behavior isn't drifting.

Collaborator:

I actually have a somewhat similar problem: we want an integration test for conversation restore, where the user does a few actions with an LLM, exits and changes to another LLM, then restores the conversation.

The structure of integration tests is almost okay, except that it's actually unnecessary to run all 6 LLMs or so; the test doesn't really depend on them. It just seems a bit of a waste to me: it matters that it "really" works, with real LLMs picking up and continuing the conversation, but it doesn't matter for the whole matrix. Maybe it matters for more than one pair (we might want a reasoning LLM paired with a non-reasoning one, for fun), but those are... different rules for choosing LLMs than the current matrix.

I'd call this one similar, in the sense that thinking_blocks are under test, so we know exactly which LLM(s) to run it for, and can skip the others? If that's possible within the integration-test framework... for C01, C02, ...
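Rough sketch of what I mean (names are made up, not an existing API): each test could declare what it needs, and the runner could filter the matrix:

class ConversationRestoreTest(BaseIntegrationTest):
    # Hypothetical knob: run this test only for models with these features,
    # instead of the whole matrix; the runner would raise SkipTest otherwise.
    REQUIRED_FEATURES = ["supports_extended_thinking"]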

first_thinking_block_unit_idx = i
break

if first_thinking_block_unit_idx is not None:
break

if first_thinking_block_unit_idx is None:
# No thinking blocks found, can't condense
return view

# Find the second atomic unit with thinking blocks
second_thinking_block_unit_idx = None
for i in range(first_thinking_block_unit_idx + 1, len(indices) - 1):
start_idx = indices[i]
end_idx = indices[i + 1]

# Check if this atomic unit has any events with thinking blocks
for event in view.events[start_idx:end_idx]:
if isinstance(event, ActionEvent) and event.thinking_blocks:
second_thinking_block_unit_idx = i
break

if second_thinking_block_unit_idx is not None:
break

if second_thinking_block_unit_idx is None:
# Only one tool loop with thinking blocks, can't condense safely
return view

# Forget everything from the start up to and including the first tool loop
# This is from indices[0] to indices[first_thinking_block_unit_idx + 1]
first_loop_end = indices[first_thinking_block_unit_idx + 1]

# Collect event IDs to forget (everything up to and including first tool loop)
forgotten_event_ids = [
event.id for event in view.events[:first_loop_end]
]

# Create a static summary
summary = (
"Previous calculations completed: compound interest formula applied "
"to calculate A = P(1 + r/n)^(nt), simple interest calculated, "
"and results written to files."
)

# Return condensation event
# Get the llm_response_id from the last event before condensation
# (we need this for the Condensation event)
last_event = view.events[-1]
if isinstance(last_event, ActionEvent):
llm_response_id = last_event.llm_response_id
else:
# Find the most recent ActionEvent
llm_response_id = None
for event in reversed(view.events):
if isinstance(event, ActionEvent):
llm_response_id = event.llm_response_id
break
if llm_response_id is None:
# Fallback: just return the view if we can't find an llm_response_id
return view

return Condensation(
forgotten_event_ids=forgotten_event_ids,
summary=summary,
summary_offset=0, # Insert summary at the beginning
llm_response_id=llm_response_id,
)


class CondensationThinkingTest(BaseIntegrationTest):
"""Test condensation with thinking blocks."""

INSTRUCTION: str = INSTRUCTION

def __init__(self, *args, **kwargs):
self.thinking_block_events = []
self.condensations = []

super().__init__(*args, **kwargs)

# Grab the model name so we can check features
model_name = self.llm.model
canonical = self.llm.model_info.get("model") if self.llm.model_info else None
name = canonical or model_name or "unknown"

features = get_features(name)

# This test only works for models that generate thinking blocks, so skip
# otherwise
supports_thinking = (
features.supports_extended_thinking or features.supports_reasoning_effort
)
if not supports_thinking:
raise SkipTest(
f"Model '{name}' does not support extended thinking or reasoning effort"
)

@property
def tools(self) -> list[Tool]:
register_tool("TerminalTool", TerminalTool)
return [Tool(name="TerminalTool")]

@property
def condenser(self) -> FirstToolLoopCondenser:
"""Configure custom condenser that forgets first tool loop on request.

This condenser handles CondensationRequest events and replaces the first
tool loop with a static summary, ensuring thinking blocks from that loop
are forgotten.
"""
return FirstToolLoopCondenser()

@property
def max_iteration_per_run(self) -> int:
return 30

def conversation_callback(self, event: Event):
super().conversation_callback(event)

if isinstance(event, ActionEvent) and event.thinking_blocks:
self.thinking_block_events.append(event)

if isinstance(event, Condensation):
self.condensations.append(event)

def setup(self) -> None:
pass

def verify_result(self) -> TestResult:
# Sanity checks to ensure the flow worked the way we expected: at least three
# thinking blocks and one condensation
if len(self.thinking_block_events) < 3:
return TestResult(
success=False,
reason=(
f"Expected at least 3 thinking blocks, got "
f"{len(self.thinking_block_events)}"
),
)

if len(self.condensations) != 1:
return TestResult(
success=False,
reason=(
f"Expected exactly 1 condensation event, got "
f"{len(self.condensations)}"
),
)

# Check the conditions of the actual test. The condensation should forget the
# first thinking block but keep the second.
forgotten_event_ids = list(chain.from_iterable(
c.forgotten_event_ids for c in self.condensations
))

if self.thinking_block_events[0].id not in forgotten_event_ids:
return TestResult(
success=False,
reason="First thinking block was not forgotten during condensation",
)

if self.thinking_block_events[1].id in forgotten_event_ids:
return TestResult(
success=False,
reason=(
"Second thinking block was incorrectly forgotten during "
"condensation"
),
)

# In all other cases, if the test was successfully run without errors then it's
# a pass
return TestResult(
success=True,
reason=(
f"Condensation successful with {len(self.thinking_block_events)} "
"thinking blocks"
),
)

def execute_conversation(self) -> None:
"""Custom conversation flow with follow-up messages and manual condensation."""
# Send the instruction and run until completion
self.conversation.send_message(self.INSTRUCTION)
self.conversation.run()

# The agent should resolve the task with a single tool loop and just one
# thinking block

self.conversation.send_message(
"Now verify your calculations are correct by re-running the commands "
"and comparing the results. Show your reasoning about whether the "
"calculations match."
)
self.conversation.run()

# Manually trigger condensation
self.conversation.condense()
self.conversation.run()

# Send one more message to see if conversation can continue
self.conversation.send_message(
"Now run the commands again with 10% interest rate compounded annually."
" Show your reasoning and explain which method will yield a larger return."
)
self.conversation.run()