31 changes: 31 additions & 0 deletions .github/workflows/integration-runner.yml
@@ -80,6 +80,9 @@ jobs:
            run-suffix: sonnet_run
            llm-config:
              model: litellm_proxy/claude-sonnet-4-5-20250929
            restore-llm-config-2:
              model: litellm_proxy/gpt-5.1-codex-max
              max_input_tokens: 100000
          - name: GPT-5.1 Codex Max
            run-suffix: gpt51_codex_run
            llm-config:
@@ -160,6 +163,34 @@ jobs:
          fi
          echo "TEST_TYPE_ARGS=$TEST_TYPE_ARGS" >> "$GITHUB_ENV"

      - name: Run restore conversation test once (two real LLMs)
        if: ${{ matrix.job-config.run-suffix == 'sonnet_run' && env.TEST_TYPE_ARGS == '--test-type integration' }}
        env:
          LLM_CONFIG: ${{ toJson(matrix.job-config.llm-config) }}
          RESTORE_LLM_CONFIG_2: ${{ toJson(matrix.job-config.restore-llm-config-2) }}
          LLM_API_KEY: ${{ secrets.LLM_API_KEY }}
          LLM_BASE_URL: https://llm-proxy.app.all-hands.dev
        run: |
          set -eo pipefail

          AGENT_SDK_VERSION=$(git rev-parse --short HEAD)
          EVAL_NOTE="${AGENT_SDK_VERSION}_t10_restore_once"

          uv run python tests/integration/run_infer.py \
            --llm-config "$LLM_CONFIG" \
            --num-workers 1 \
            --eval-note "$EVAL_NOTE" \
            --eval-ids t10_restore_conversation \
            --test-type integration

      - name: Skip t10 in the matrix runs (it runs once above)
        if: ${{ env.TEST_TYPE_ARGS == '--test-type integration' }}
        run: echo "SKIP_T10=1" >> "$GITHUB_ENV"

      - name: Run integration test evaluation for ${{ matrix.job-config.name }}
        env:
          LLM_CONFIG: ${{ toJson(matrix.job-config.llm-config) }}
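For context, the dedicated step above hands the test two JSON payloads via toJson: the matrix job's llm-config (exported as LLM_CONFIG) and the new restore-llm-config-2 entry (exported as RESTORE_LLM_CONFIG_2). A minimal sketch of the shape the test-side parsing has to accept; the literal values simply mirror the matrix entry above and are illustrative, not a contract.

import json

# Illustrative payloads mirroring the matrix entry above (values are examples only).
llm_config = json.loads('{"model": "litellm_proxy/claude-sonnet-4-5-20250929"}')
restore_llm_config_2 = json.loads(
    '{"model": "litellm_proxy/gpt-5.1-codex-max", "max_input_tokens": 100000}'
)

assert "model" in restore_llm_config_2  # "model" is required by the test
max_tokens = restore_llm_config_2.get("max_input_tokens")  # optional; may be absent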
10 changes: 10 additions & 0 deletions tests/integration/run_infer.py
@@ -441,10 +441,20 @@ def main():
    # Load all integration tests
    instances = load_integration_tests()

    # Special-case: some integration tests require dedicated CI setup. Example:
    # t10_restore_conversation needs two real LLM configs, so CI runs it once in a
    # dedicated step and sets SKIP_T10=1 for the matrix runs.
    #
    # Note: apply the test_type filter first; the SKIP_T10 exclusion below is scoped
    # to integration runs, so behavior-only runs are unaffected.
    if args.test_type != "all":
        instances = [inst for inst in instances if inst.test_type == args.test_type]
        logger.info("Filtered to %d %s tests", len(instances), args.test_type)

    if os.environ.get("SKIP_T10") == "1" and args.test_type == "integration":
        instances = [
            inst for inst in instances if inst.instance_id != "t10_restore_conversation"
        ]

    # Filter by specific test IDs if provided
    if args.eval_ids:
        eval_ids = [id.strip() for id in args.eval_ids.split(",")]
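To make the selection order explicit, here is a small, self-contained sketch of the intended behavior, assuming a stand-in Instance tuple rather than the project's real test-instance class.

import os
from typing import NamedTuple


class Instance(NamedTuple):
    # Stand-in for the real integration-test instance object.
    instance_id: str
    test_type: str


def select(instances: list[Instance], test_type: str) -> list[Instance]:
    # 1) Filter by test type first, mirroring run_infer.py above.
    if test_type != "all":
        instances = [i for i in instances if i.test_type == test_type]
    # 2) SKIP_T10 only removes t10 from integration runs; behavior runs are untouched.
    if os.environ.get("SKIP_T10") == "1" and test_type == "integration":
        instances = [
            i for i in instances if i.instance_id != "t10_restore_conversation"
        ]
    return instances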
237 changes: 237 additions & 0 deletions tests/integration/tests/t10_restore_conversation.py
@@ -0,0 +1,237 @@
"""Test conversation restore (resume) behavior.

This integration test exercises the key behavior of PR #1542:
- On resume, we use the runtime-provided Agent.
- Tool compatibility is verified (tools used in history must still exist).
- Conversation-state settings are restored from persistence (e.g.
confirmation_policy, execution_status).

Note: This test does not require the agent to take any actions; it verifies the
resume semantics directly.
"""

from __future__ import annotations

import json
import os

from openhands.sdk.agent import Agent
from openhands.sdk.conversation.impl.local_conversation import LocalConversation
from openhands.sdk.conversation.state import (
    ConversationExecutionStatus,
)
from openhands.sdk.llm import LLM
from openhands.sdk.security.confirmation_policy import AlwaysConfirm
from openhands.sdk.tool import Tool, register_tool
from openhands.tools.terminal import TerminalTool
from tests.integration.base import BaseIntegrationTest, TestResult


INSTRUCTION = "Create a new conversation."


class RestoreConversationTest(BaseIntegrationTest):
    """Ensure resume restores persisted state but uses runtime Agent configuration."""

    INSTRUCTION: str = INSTRUCTION

    @property
    def tools(self) -> list[Tool]:
        register_tool("TerminalTool", TerminalTool)
        return [Tool(name="TerminalTool")]

    def setup(self) -> None:
        # Persist conversations under tests/integration/outputs so they are easy to
        # inspect locally; this is intentionally outside the ephemeral runner
        # workspace.
        self.persistence_dir = os.path.join(
            os.getcwd(), "tests", "integration", "outputs", "local_persist_t10"
        )
        os.makedirs(self.persistence_dir, exist_ok=True)

    def verify_result(self) -> TestResult:
        # First run: create conversation with agent1.
        # Use the runner-provided LLM config for llm1.
        llm1 = LLM(
            model=self.llm.model,
            base_url=self.llm.base_url,
            api_key=self.llm.api_key,
            usage_id="restore-test-llm-1",
            max_input_tokens=self.llm.max_input_tokens,
        )
        agent1 = Agent(llm=llm1, tools=self.tools)

        conv1 = LocalConversation(
            agent=agent1,
            workspace=self.workspace,
            persistence_dir=self.persistence_dir,
            visualizer=None,
        )

        # Persisted state settings (should be restored from persistence on resume)
        conv1.state.confirmation_policy = AlwaysConfirm()
        conv1.state.execution_status = ConversationExecutionStatus.ERROR

        # Ensure there's at least one user + assistant message pair in history.
        # This exercises the full create -> persist -> resume path with events.
        conv1.send_message(INSTRUCTION)
        conv1.run()

        conversation_id = conv1.id
        conv1_event_count = len(conv1.state.events)
        print(f"[t10] conv1 persisted events: {conv1_event_count}")

        # Read persisted base_state.json and ensure it contains the original model.
        # LocalConversation persists to:
        # <persistence_dir>/<conversation_id.hex>/base_state.json
        base_state_path = os.path.join(
            self.persistence_dir, conversation_id.hex, "base_state.json"
        )
        if not os.path.exists(base_state_path):
            return TestResult(
                success=False,
                reason=(
                    f"Expected persisted base_state.json not found at {base_state_path}"
                ),
            )

        with open(base_state_path) as f:
            base_state = json.load(f)

        persisted_llm = base_state.get("agent", {}).get("llm", {})
        persisted_model = persisted_llm.get("model")
        persisted_max_input_tokens = persisted_llm.get("max_input_tokens")
        persisted_usage_id = persisted_llm.get("usage_id")

        if persisted_model != llm1.model:
            return TestResult(
                success=False,
                reason=(
                    "Expected persisted agent.llm.model to match runtime llm1.model, "
                    f"got {persisted_model!r} (expected {llm1.model!r})"
                ),
            )

        if persisted_max_input_tokens != llm1.max_input_tokens:
            return TestResult(
                success=False,
                reason=(
                    "Expected persisted agent.llm.max_input_tokens to match runtime "
                    f"llm1.max_input_tokens={llm1.max_input_tokens!r}, got "
                    f"{persisted_max_input_tokens!r}"
                ),
            )

        if persisted_usage_id != "restore-test-llm-1":
            return TestResult(
                success=False,
                reason=(
                    "Expected persisted agent.llm.usage_id to be 'restore-test-llm-1', "
                    f"got {persisted_usage_id!r}"
                ),
            )

        del conv1

        # Resume: provide a *different* runtime agent/LLM configuration.
        # We load the llm2 config from RESTORE_LLM_CONFIG_2 (a JSON string), but
        # always use the CI-provided base_url/api_key.
        llm2_config_raw = os.environ.get("RESTORE_LLM_CONFIG_2")
        if not llm2_config_raw:
            return TestResult(
                success=False,
                reason="RESTORE_LLM_CONFIG_2 is required for t10_restore_conversation",
            )

        try:
            llm2_config = json.loads(llm2_config_raw)
        except json.JSONDecodeError as e:
            return TestResult(
                success=False,
                reason=f"RESTORE_LLM_CONFIG_2 is not valid JSON: {e}",
            )

        llm2 = LLM(
            model=llm2_config["model"],
            base_url=self.llm.base_url,
            api_key=self.llm.api_key,
            usage_id="restore-test-llm-2",
            max_input_tokens=llm2_config.get("max_input_tokens"),
        )
        agent2 = Agent(llm=llm2, tools=self.tools)

        conv2 = LocalConversation(
            agent=agent2,
            workspace=self.workspace,
            persistence_dir=self.persistence_dir,
            conversation_id=conversation_id,
            visualizer=None,
        )

        conv2_event_count = len(conv2.state.events)
        print(f"[t10] conv2 loaded events: {conv2_event_count}")
        if conv2_event_count != conv1_event_count:
            return TestResult(
                success=False,
                reason=(
                    "Event count mismatch after restore: "
                    f"before={conv1_event_count} after={conv2_event_count}"
                ),
            )

        # 1) Persisted state settings should be restored on resume.
        if not conv2.state.confirmation_policy.should_confirm():
            return TestResult(
                success=False,
                reason="confirmation_policy was not restored from persistence",
            )

        # The restored conversation should be in a normal resumable state.
        # We expect it to have reached FINISHED after the initial run.
        if conv2.state.execution_status != ConversationExecutionStatus.FINISHED:
            return TestResult(
                success=False,
                reason=(
                    "Expected execution_status=FINISHED after restore, got "
                    f"{conv2.state.execution_status!r}"
                ),
            )

        # Prove the restored conversation can continue.
        conv2.state.execution_status = ConversationExecutionStatus.ERROR
        conv2.send_message("are you still there?")
        conv2.run()

        # After a successful run, we should not remain in an error state.
        if conv2.state.execution_status == ConversationExecutionStatus.ERROR:
            return TestResult(
                success=False,
                reason=(
                    "Expected restored conversation to make progress after a new "
                    "user message, but execution_status is still ERROR."
                ),
            )

        # 2) Runtime agent/LLM should be used.
        if conv2.agent.llm.model != llm2.model:
            return TestResult(
                success=False,
                reason=(
                    "Expected runtime agent llm.model to match llm2.model after "
                    f"resume, got {conv2.agent.llm.model!r} (expected {llm2.model!r})"
                ),
            )
        if (
            llm2.max_input_tokens is not None
            and conv2.agent.llm.max_input_tokens != llm2.max_input_tokens
        ):
            return TestResult(
                success=False,
                reason=(
                    "Expected runtime max_input_tokens to match llm2.max_input_tokens "
                    f"after resume, got {conv2.agent.llm.max_input_tokens!r} "
                    f"(expected {llm2.max_input_tokens!r})"
                ),
            )

        return TestResult(success=True, reason="Restore semantics verified")
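Outside CI, the same single t10 run can be reproduced by exporting both LLM configs before invoking run_infer.py. A rough local sketch follows; the API key is a placeholder and the model names simply follow the workflow matrix above.

import json
import os
import subprocess

# Rough local reproduction of the dedicated t10 workflow step above.
# LLM_API_KEY is a placeholder; model names mirror the workflow matrix.
env = dict(
    os.environ,
    RESTORE_LLM_CONFIG_2=json.dumps(
        {"model": "litellm_proxy/gpt-5.1-codex-max", "max_input_tokens": 100000}
    ),
    LLM_API_KEY="<your-litellm-proxy-key>",
    LLM_BASE_URL="https://llm-proxy.app.all-hands.dev",
)
subprocess.run(
    [
        "uv", "run", "python", "tests/integration/run_infer.py",
        "--llm-config", json.dumps({"model": "litellm_proxy/claude-sonnet-4-5-20250929"}),
        "--num-workers", "1",
        "--eval-ids", "t10_restore_conversation",
        "--test-type", "integration",
    ],
    env=env,
    check=True,
)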