31 changes: 31 additions & 0 deletions .github/workflows/integration-runner.yml
@@ -80,6 +80,9 @@ jobs:
            run-suffix: sonnet_run
            llm-config:
              model: litellm_proxy/claude-sonnet-4-5-20250929
            restore-llm-config-2:
              model: litellm_proxy/gpt-5.1-codex-max
              max_input_tokens: 100000
          - name: GPT-5.1 Codex Max
            run-suffix: gpt51_codex_run
            llm-config:
@@ -160,6 +163,34 @@ jobs:
          fi
          echo "TEST_TYPE_ARGS=$TEST_TYPE_ARGS" >> "$GITHUB_ENV"

      - name: Run restore conversation test once (two real LLMs)
        if: ${{ matrix.job-config.run-suffix == 'sonnet_run' && env.TEST_TYPE_ARGS == '--test-type integration' }}
        env:
          LLM_CONFIG: ${{ toJson(matrix.job-config.llm-config) }}
          RESTORE_LLM_CONFIG_2: ${{ toJson(matrix.job-config.restore-llm-config-2) }}
          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
          LLM_BASE_URL: https://llm-proxy.app.all-hands.dev
        run: |
          set -eo pipefail

          AGENT_SDK_VERSION=$(git rev-parse --short HEAD)
          EVAL_NOTE="${AGENT_SDK_VERSION}_t10_restore_once"

          uv run python tests/integration/run_infer.py \
            --llm-config "$LLM_CONFIG" \
            --num-workers 1 \
            --eval-note "$EVAL_NOTE" \
            --eval-ids t10_restore_conversation \
            --test-type integration

      - name: Skip t10 in the matrix runs (it runs once above)
        if: ${{ env.TEST_TYPE_ARGS == '--test-type integration' }}
        run: echo "SKIP_T10=1" >> "$GITHUB_ENV"

      - name: Run integration test evaluation for ${{ matrix.job-config.name }}
        env:
          LLM_CONFIG: ${{ toJson(matrix.job-config.llm-config) }}
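For context, the dedicated step above hands the test two JSON payloads via toJson: the matrix job's llm-config (exported as LLM_CONFIG) and the new restore-llm-config-2 entry (exported as RESTORE_LLM_CONFIG_2). A minimal sketch of the shape the test-side parsing has to accept; the literal values simply mirror the matrix entry above and are illustrative, not a contract.

import json

# Illustrative payloads mirroring the matrix entry above (values are examples only).
llm_config = json.loads('{"model": "litellm_proxy/claude-sonnet-4-5-20250929"}')
restore_llm_config_2 = json.loads(
    '{"model": "litellm_proxy/gpt-5.1-codex-max", "max_input_tokens": 100000}'
)

assert "model" in restore_llm_config_2  # "model" is required by the test
max_tokens = restore_llm_config_2.get("max_input_tokens")  # optional; may be absent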
10 changes: 10 additions & 0 deletions tests/integration/run_infer.py
@@ -441,10 +441,20 @@ def main():
    # Load all integration tests
    instances = load_integration_tests()

    # Special-case: some integration tests require dedicated CI setup. Example:
    # t10_restore_conversation needs two real LLM configs, so CI runs it once in a
    # dedicated step and sets SKIP_T10=1 for the matrix runs.
    #
    # Note: apply the test_type filter first; the SKIP_T10 exclusion below is scoped
    # to integration runs, so behavior-only runs are unaffected.
    if args.test_type != "all":
        instances = [inst for inst in instances if inst.test_type == args.test_type]
        logger.info("Filtered to %d %s tests", len(instances), args.test_type)

    if os.environ.get("SKIP_T10") == "1" and args.test_type == "integration":
        instances = [
            inst for inst in instances if inst.instance_id != "t10_restore_conversation"
        ]

    # Filter by specific test IDs if provided
    if args.eval_ids:
        eval_ids = [id.strip() for id in args.eval_ids.split(",")]
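To make the selection order explicit, here is a small, self-contained sketch of the intended behavior, assuming a stand-in Instance tuple rather than the project's real test-instance class.

import os
from typing import NamedTuple


class Instance(NamedTuple):
    # Stand-in for the real integration-test instance object.
    instance_id: str
    test_type: str


def select(instances: list[Instance], test_type: str) -> list[Instance]:
    # 1) Filter by test type first, mirroring run_infer.py above.
    if test_type != "all":
        instances = [i for i in instances if i.test_type == test_type]
    # 2) SKIP_T10 only removes t10 from integration runs; behavior runs are untouched.
    if os.environ.get("SKIP_T10") == "1" and test_type == "integration":
        instances = [
            i for i in instances if i.instance_id != "t10_restore_conversation"
        ]
    return instances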
237 changes: 237 additions & 0 deletions tests/integration/tests/t10_restore_conversation.py
@@ -0,0 +1,237 @@
"""Test conversation restore (resume) behavior.

This integration test exercises the key behavior of PR #1542:
- On resume, we use the runtime-provided Agent.
- Tool compatibility is verified (tools used in history must still exist).
- Conversation-state settings are restored from persistence (e.g.
confirmation_policy, execution_status).

Note: This test does not require the agent to take any actions; it verifies the
resume semantics directly.
"""

from __future__ import annotations

import json
import os

from openhands.sdk.agent import Agent
from openhands.sdk.conversation.impl.local_conversation import LocalConversation
from openhands.sdk.conversation.state import (
    ConversationExecutionStatus,
)
from openhands.sdk.llm import LLM
from openhands.sdk.security.confirmation_policy import AlwaysConfirm
from openhands.sdk.tool import Tool, register_tool
from openhands.tools.terminal import TerminalTool
from tests.integration.base import BaseIntegrationTest, TestResult


INSTRUCTION = "Create a new conversation."


class RestoreConversationTest(BaseIntegrationTest):
    """Ensure resume restores persisted state but uses runtime Agent configuration."""

    INSTRUCTION: str = INSTRUCTION

    @property
    def tools(self) -> list[Tool]:
        register_tool("TerminalTool", TerminalTool)
        return [Tool(name="TerminalTool")]

    def setup(self) -> None:
        # Persist conversations under tests/integration/outputs so they are easy to
        # inspect locally; this is intentionally outside the ephemeral runner
        # workspace.
        self.persistence_dir = os.path.join(
            os.getcwd(), "tests", "integration", "outputs", "local_persist_t10"
        )
        os.makedirs(self.persistence_dir, exist_ok=True)

    def verify_result(self) -> TestResult:
        # First run: create conversation with agent1.
        # Use the runner-provided LLM config for llm1.
        llm1 = LLM(
            model=self.llm.model,
            base_url=self.llm.base_url,
            api_key=self.llm.api_key,
            usage_id="restore-test-llm-1",
            max_input_tokens=self.llm.max_input_tokens,
        )
        agent1 = Agent(llm=llm1, tools=self.tools)

        conv1 = LocalConversation(
            agent=agent1,
            workspace=self.workspace,
            persistence_dir=self.persistence_dir,
            visualizer=None,
        )

        # Persisted state settings (should be restored from persistence on resume)
        conv1.state.confirmation_policy = AlwaysConfirm()
        conv1.state.execution_status = ConversationExecutionStatus.ERROR

        # Ensure there's at least one user + assistant message pair in history.
        # This exercises the full create -> persist -> resume path with events.
        conv1.send_message(INSTRUCTION)
        conv1.run()

        conversation_id = conv1.id
        conv1_event_count = len(conv1.state.events)
        print(f"[t10] conv1 persisted events: {conv1_event_count}")

        # Read persisted base_state.json and ensure it contains the original model.
        # LocalConversation persists to:
        # <persistence_dir>/<conversation_id.hex>/base_state.json
        base_state_path = os.path.join(
            self.persistence_dir, conversation_id.hex, "base_state.json"
        )
        if not os.path.exists(base_state_path):
            return TestResult(
                success=False,
                reason=(
                    f"Expected persisted base_state.json not found at {base_state_path}"
                ),
            )

        with open(base_state_path) as f:
            base_state = json.load(f)

        persisted_llm = base_state.get("agent", {}).get("llm", {})
        persisted_model = persisted_llm.get("model")
        persisted_max_input_tokens = persisted_llm.get("max_input_tokens")
        persisted_usage_id = persisted_llm.get("usage_id")

        if persisted_model != llm1.model:
            return TestResult(
                success=False,
                reason=(
                    "Expected persisted agent.llm.model to match runtime llm1.model, "
                    f"got {persisted_model!r} (expected {llm1.model!r})"
                ),
            )

        if persisted_max_input_tokens != llm1.max_input_tokens:
            return TestResult(
                success=False,
                reason=(
                    "Expected persisted agent.llm.max_input_tokens to match runtime "
                    f"llm1.max_input_tokens={llm1.max_input_tokens!r}, got "
                    f"{persisted_max_input_tokens!r}"
                ),
            )

        if persisted_usage_id != "restore-test-llm-1":
            return TestResult(
                success=False,
                reason=(
                    "Expected persisted agent.llm.usage_id to be 'restore-test-llm-1', "
                    f"got {persisted_usage_id!r}"
                ),
            )

        del conv1

        # Resume: provide a *different* runtime agent/LLM configuration.
        # We load the llm2 config from RESTORE_LLM_CONFIG_2 (a JSON string), but
        # always use the CI-provided base_url/api_key.
        llm2_config_raw = os.environ.get("RESTORE_LLM_CONFIG_2")
        if not llm2_config_raw:
            return TestResult(
                success=False,
                reason="RESTORE_LLM_CONFIG_2 is required for t10_restore_conversation",
            )

        try:
            llm2_config = json.loads(llm2_config_raw)
        except json.JSONDecodeError as e:
            return TestResult(
                success=False,
                reason=f"RESTORE_LLM_CONFIG_2 is not valid JSON: {e}",
            )

        llm2 = LLM(
            model=llm2_config["model"],
            base_url=self.llm.base_url,
            api_key=self.llm.api_key,
            usage_id="restore-test-llm-2",
            max_input_tokens=llm2_config.get("max_input_tokens"),
        )
        agent2 = Agent(llm=llm2, tools=self.tools)

        conv2 = LocalConversation(
            agent=agent2,
            workspace=self.workspace,
            persistence_dir=self.persistence_dir,
            conversation_id=conversation_id,
            visualizer=None,
        )

        conv2_event_count = len(conv2.state.events)
        print(f"[t10] conv2 loaded events: {conv2_event_count}")
        if conv2_event_count != conv1_event_count:
            return TestResult(
                success=False,
                reason=(
                    "Event count mismatch after restore: "
                    f"before={conv1_event_count} after={conv2_event_count}"
                ),
            )

        # 1) Persisted state settings should be restored on resume.
        if not conv2.state.confirmation_policy.should_confirm():
            return TestResult(
                success=False,
                reason="confirmation_policy was not restored from persistence",
            )

        # The restored conversation should be in a normal resumable state.
        # We expect it to have reached FINISHED after the initial run.
        if conv2.state.execution_status != ConversationExecutionStatus.FINISHED:
            return TestResult(
                success=False,
                reason=(
                    "Expected execution_status=FINISHED after restore, got "
                    f"{conv2.state.execution_status!r}"
                ),
            )

        # Prove the restored conversation can continue.
        conv2.state.execution_status = ConversationExecutionStatus.ERROR
        conv2.send_message("are you still there?")
        conv2.run()

        # After a successful run, we should not remain in an error state.
        if conv2.state.execution_status == ConversationExecutionStatus.ERROR:
            return TestResult(
                success=False,
                reason=(
                    "Expected restored conversation to make progress after a new "
                    "user message, but execution_status is still ERROR."
                ),
            )

        # 2) Runtime agent/LLM should be used.
        if conv2.agent.llm.model != llm2.model:
            return TestResult(
                success=False,
                reason=(
                    "Expected runtime agent llm.model to match llm2.model after "
                    f"resume, got {conv2.agent.llm.model!r} (expected {llm2.model!r})"
                ),
            )
        if (
            llm2.max_input_tokens is not None
            and conv2.agent.llm.max_input_tokens != llm2.max_input_tokens
        ):
            return TestResult(
                success=False,
                reason=(
                    "Expected runtime max_input_tokens to match llm2.max_input_tokens "
                    f"after resume, got {conv2.agent.llm.max_input_tokens!r} "
                    f"(expected {llm2.max_input_tokens!r})"
                ),
            )

        return TestResult(success=True, reason="Restore semantics verified")
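Outside CI, the same single t10 run can be reproduced by exporting both LLM configs before invoking run_infer.py. A rough local sketch follows; the API key is a placeholder and the model names simply follow the workflow matrix above.

import json
import os
import subprocess

# Rough local reproduction of the dedicated t10 workflow step above.
# LLM_API_KEY is a placeholder; model names mirror the workflow matrix.
env = dict(
    os.environ,
    RESTORE_LLM_CONFIG_2=json.dumps(
        {"model": "litellm_proxy/gpt-5.1-codex-max", "max_input_tokens": 100000}
    ),
    LLM_API_KEY="<your-litellm-proxy-key>",
    LLM_BASE_URL="https://llm-proxy.app.all-hands.dev",
)
subprocess.run(
    [
        "uv", "run", "python", "tests/integration/run_infer.py",
        "--llm-config", json.dumps({"model": "litellm_proxy/claude-sonnet-4-5-20250929"}),
        "--num-workers", "1",
        "--eval-ids", "t10_restore_conversation",
        "--test-type", "integration",
    ],
    env=env,
    check=True,
)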