diff --git a/openhands-sdk/openhands/sdk/agent/base.py b/openhands-sdk/openhands/sdk/agent/base.py index 1bf2bbb166..5d65cc090c 100644 --- a/openhands-sdk/openhands/sdk/agent/base.py +++ b/openhands-sdk/openhands/sdk/agent/base.py @@ -9,7 +9,7 @@ from pydantic import BaseModel, ConfigDict, Field, PrivateAttr from openhands.sdk.context.agent_context import AgentContext -from openhands.sdk.context.condenser import CondenserBase, LLMSummarizingCondenser +from openhands.sdk.context.condenser import CondenserBase from openhands.sdk.context.prompts.prompt import render_template from openhands.sdk.llm import LLM from openhands.sdk.llm.utils.model_prompt_spec import get_model_prompt_spec @@ -17,7 +17,6 @@ from openhands.sdk.mcp import create_mcp_tools from openhands.sdk.tool import BUILT_IN_TOOLS, Tool, ToolDefinition, resolve_tool from openhands.sdk.utils.models import DiscriminatedUnionMixin -from openhands.sdk.utils.pydantic_diff import pretty_pydantic_diff if TYPE_CHECKING: @@ -300,64 +299,52 @@ def step( NOTE: state will be mutated in-place. """ - def resolve_diff_from_deserialized( + def verify( self, persisted: "AgentBase", events: "Sequence[Any] | None" = None, ) -> "AgentBase": - """ - Return a new AgentBase instance equivalent to `persisted` but with - explicitly whitelisted fields (e.g. api_key) taken from `self`. + """Verify that we can resume this agent from persisted state. + + This PR's goal is to *not* reconcile configuration between persisted and + runtime Agent instances. Instead, we verify compatibility requirements + and then continue with the runtime-provided Agent. + + Compatibility requirements: + - Agent class/type must match. + - Tools: + - If events are provided, only tools that were actually used in history + must exist in runtime. + - If events are not provided, tool names must match exactly. + + All other configuration (LLM, agent_context, condenser, system prompts, + etc.) can be freely changed between sessions. Args: - persisted: The persisted agent from the conversation state. - events: Optional event sequence to scan for used tools if tool - names don't match. Only scanned when needed (O(n) fallback). + persisted: The agent loaded from persisted state. + events: Optional event sequence to scan for used tools if tool names + don't match. + + Returns: + This runtime agent (self) if verification passes. + + Raises: + ValueError: If agent class or tools don't match. """ if persisted.__class__ is not self.__class__: raise ValueError( - f"Cannot resolve from deserialized: persisted agent is of type " + "Cannot load from persisted: persisted agent is of type " f"{persisted.__class__.__name__}, but self is of type " f"{self.__class__.__name__}." ) - # Get all LLMs from both self and persisted to reconcile them - new_llm = self.llm.resolve_diff_from_deserialized(persisted.llm) - updates: dict[str, Any] = {"llm": new_llm} - - # Reconcile the condenser's LLM if it exists - if self.condenser is not None and persisted.condenser is not None: - # Check if both condensers are LLMSummarizingCondenser - # (which has an llm field) - - if isinstance(self.condenser, LLMSummarizingCondenser) and isinstance( - persisted.condenser, LLMSummarizingCondenser - ): - new_condenser_llm = self.condenser.llm.resolve_diff_from_deserialized( - persisted.condenser.llm - ) - new_condenser = persisted.condenser.model_copy( - update={"llm": new_condenser_llm} - ) - updates["condenser"] = new_condenser - - # Reconcile agent_context - always use the current environment's agent_context - # This allows resuming conversations from different directories and handles - # cases where skills, working directory, or other context has changed - if self.agent_context is not None: - updates["agent_context"] = self.agent_context - - # Get tool names for comparison runtime_names = {tool.name for tool in self.tools} persisted_names = {tool.name for tool in persisted.tools} - # If tool names match exactly, no need to check event history if runtime_names == persisted_names: - # Tools unchanged, proceed normally - pass - elif events is not None: - # Tool names differ - scan events to find which tools were actually used - # This is O(n) but only happens when tools change + return self + + if events is not None: from openhands.sdk.event import ActionEvent used_tools = { @@ -366,43 +353,31 @@ def resolve_diff_from_deserialized( if isinstance(event, ActionEvent) and event.tool_name } - # Only require tools that were actually used in history + # Only require tools that were actually used in history. missing_used_tools = used_tools - runtime_names if missing_used_tools: raise ValueError( - f"Cannot resume conversation: tools that were used in history " + "Cannot resume conversation: tools that were used in history " f"are missing from runtime: {sorted(missing_used_tools)}. " f"Available tools: {sorted(runtime_names)}" ) - # Update tools to match runtime (allows new tools to be added) - updates["tools"] = self.tools - else: - # No events provided - strict matching (legacy behavior) - missing_in_runtime = persisted_names - runtime_names - missing_in_persisted = runtime_names - persisted_names - error_msg = "Tools don't match between runtime and persisted agents." - if missing_in_runtime: - error_msg += f" Missing in runtime: {sorted(missing_in_runtime)}." - if missing_in_persisted: - error_msg += f" Missing in persisted: {sorted(missing_in_persisted)}." - raise ValueError(error_msg) - - reconciled = persisted.model_copy(update=updates) - - # Validate agent equality - exclude tools from comparison since we - # already validated tool requirements above - exclude_fields = {"tools"} if events is not None else set() - self_dump = self.model_dump(exclude_none=True, exclude=exclude_fields) - reconciled_dump = reconciled.model_dump( - exclude_none=True, exclude=exclude_fields - ) - if self_dump != reconciled_dump: - raise ValueError( - "The Agent provided is different from the one in persisted state.\n" - f"Diff: {pretty_pydantic_diff(self, reconciled)}" - ) - return reconciled + return self + + # No events provided: strict tool name matching. + missing_in_runtime = persisted_names - runtime_names + missing_in_persisted = runtime_names - persisted_names + + details: list[str] = [] + if missing_in_runtime: + details.append(f"Missing in runtime: {sorted(missing_in_runtime)}") + if missing_in_persisted: + details.append(f"Missing in persisted: {sorted(missing_in_persisted)}") + + suffix = f" ({'; '.join(details)})" if details else "" + raise ValueError( + "Tools don't match between runtime and persisted agents." + suffix + ) def model_dump_succint(self, **kwargs): """Like model_dump, but excludes None fields by default.""" diff --git a/openhands-sdk/openhands/sdk/conversation/state.py b/openhands-sdk/openhands/sdk/conversation/state.py index d4d179e15f..77dfca9e5f 100644 --- a/openhands-sdk/openhands/sdk/conversation/state.py +++ b/openhands-sdk/openhands/sdk/conversation/state.py @@ -60,7 +60,10 @@ class ConversationState(OpenHandsModel): ) workspace: BaseWorkspace = Field( ..., - description="Working directory for agent operations and tool execution", + description=( + "Workspace used by the agent to execute commands and read/write files. " + "Not the process working directory." + ), ) persistence_dir: str | None = Field( default="workspace/conversations", @@ -172,10 +175,35 @@ def create( max_iterations: int = 500, stuck_detection: bool = True, ) -> "ConversationState": - """ - If base_state.json exists: resume (attach EventLog, - reconcile agent, enforce id). - Else: create fresh (agent required), persist base, and return. + """Create a new conversation state or resume from persistence. + + This factory method handles both new conversation creation and resumption + from persisted state. + + **New conversation:** + The provided Agent is used directly. Pydantic validation happens via the + cls() constructor. + + **Restored conversation:** + The provided Agent is validated against the persisted agent using + agent.load(). Tools must match (they may have been used in conversation + history), but all other configuration can be freely changed: LLM, + agent_context, condenser, system prompts, etc. + + Args: + id: Unique conversation identifier + agent: The Agent to use (tools must match persisted on restore) + workspace: Working directory for agent operations + persistence_dir: Directory for persisting state and events + max_iterations: Maximum iterations per run + stuck_detection: Whether to enable stuck detection + + Returns: + ConversationState ready for use + + Raises: + ValueError: If conversation ID or tools mismatch on restore + ValidationError: If agent or other fields fail Pydantic validation """ file_store = ( LocalFileStore(persistence_dir, cache_limit_size=max_iterations) @@ -192,29 +220,28 @@ def create( if base_text: state = cls.model_validate(json.loads(base_text)) - # Enforce conversation id match + # Restore the conversation with the same id if state.id != id: raise ValueError( f"Conversation ID mismatch: provided {id}, " f"but persisted state has {state.id}" ) - # Attach event log early so we can read history + # Attach event log early so we can read history for tool verification state._fs = file_store state._events = EventLog(file_store, dir_path=EVENTS_DIR) - # Reconcile agent config with deserialized one - # Pass event log so tool usage can be checked on-the-fly if needed - resolved = agent.resolve_diff_from_deserialized( - state.agent, events=state._events - ) + # Verify compatibility (agent class + tools) + agent.verify(state.agent, events=state._events) - # Commit reconciled agent (may autosave) + # Commit runtime-provided values (may autosave) state._autosave_enabled = True - state.agent = resolved + state.agent = agent + state.workspace = workspace + state.max_iterations = max_iterations - # Note: stats are already deserialized from base_state.json above - # Do NOT reset stats here - this would lose accumulated metrics + # Note: stats are already deserialized from base_state.json above. + # Do NOT reset stats here - this would lose accumulated metrics. logger.info( f"Resumed conversation {state.id} from persistent storage.\n" @@ -237,8 +264,6 @@ def create( max_iterations=max_iterations, stuck_detection=stuck_detection, ) - # Record existing analyzer configuration in state - state.security_analyzer = state.security_analyzer state._fs = file_store state._events = EventLog(file_store, dir_path=EVENTS_DIR) state.stats = ConversationStats() diff --git a/openhands-sdk/openhands/sdk/llm/llm.py b/openhands-sdk/openhands/sdk/llm/llm.py index e20bfca759..a88be306dd 100644 --- a/openhands-sdk/openhands/sdk/llm/llm.py +++ b/openhands-sdk/openhands/sdk/llm/llm.py @@ -28,8 +28,6 @@ if TYPE_CHECKING: # type hints only, avoid runtime import cycle from openhands.sdk.tool.tool import ToolDefinition -from openhands.sdk.utils.pydantic_diff import pretty_pydantic_diff - with warnings.catch_warnings(): warnings.simplefilter("ignore") @@ -322,19 +320,6 @@ class LLM(BaseModel, RetryMixin, NonNativeToolCallingMixin): exclude=True, ) _metrics: Metrics | None = PrivateAttr(default=None) - # ===== Plain class vars (NOT Fields) ===== - # When serializing, these fields (SecretStr) will be dump to "****" - # When deserializing, these fields will be ignored and we will override - # them from the LLM instance provided at runtime. - OVERRIDE_ON_SERIALIZE: tuple[str, ...] = ( - "api_key", - "aws_access_key_id", - "aws_secret_access_key", - # Dynamic runtime metadata for telemetry/routing that can differ across sessions - # and should not cause resume-time diffs. Always prefer the runtime value. - "litellm_extra_body", - ) - # Runtime-only private attrs _model_info: Any = PrivateAttr(default=None) _tokenizer: Any = PrivateAttr(default=None) @@ -1101,39 +1086,3 @@ def _cast_value(raw: str, t: Any) -> Any: if v is not None: data[field_name] = v return cls(**data) - - def resolve_diff_from_deserialized(self, persisted: LLM) -> LLM: - """Resolve differences between a deserialized LLM and the current instance. - - This is due to fields like api_key being serialized to "****" in dumps, - and we want to ensure that when loading from a file, we still use the - runtime-provided api_key in the self instance. - - Return a new LLM instance equivalent to `persisted` but with - explicitly whitelisted fields (e.g. api_key) taken from `self`. - """ - if persisted.__class__ is not self.__class__: - raise ValueError( - f"Cannot resolve_diff_from_deserialized between {self.__class__} " - f"and {persisted.__class__}" - ) - - # Copy allowed fields from runtime llm into the persisted llm - llm_updates = {} - persisted_dump = persisted.model_dump(context={"expose_secrets": True}) - for field in self.OVERRIDE_ON_SERIALIZE: - if field in persisted_dump.keys(): - llm_updates[field] = getattr(self, field) - if llm_updates: - reconciled = persisted.model_copy(update=llm_updates) - else: - reconciled = persisted - - dump = self.model_dump(context={"expose_secrets": True}) - reconciled_dump = reconciled.model_dump(context={"expose_secrets": True}) - if dump != reconciled_dump: - raise ValueError( - "The LLM provided is different from the one in persisted state.\n" - f"Diff: {pretty_pydantic_diff(self, reconciled)}" - ) - return reconciled diff --git a/tests/cross/test_agent_reconciliation.py b/tests/cross/test_agent_loading.py similarity index 76% rename from tests/cross/test_agent_reconciliation.py rename to tests/cross/test_agent_loading.py index 1696fffdb6..5bce5ee614 100644 --- a/tests/cross/test_agent_reconciliation.py +++ b/tests/cross/test_agent_loading.py @@ -1,19 +1,20 @@ -"""Test agent reconciliation logic in agent deserialization and conversation restart.""" +"""Test agent loading (conversation restart) behavior.""" import tempfile import uuid from unittest.mock import patch +import pytest from pydantic import SecretStr from openhands.sdk import Agent -from openhands.sdk.agent import AgentBase from openhands.sdk.context import AgentContext, Skill from openhands.sdk.context.condenser.llm_summarizing_condenser import ( LLMSummarizingCondenser, ) from openhands.sdk.conversation import Conversation from openhands.sdk.conversation.impl.local_conversation import LocalConversation +from openhands.sdk.conversation.state import ConversationExecutionStatus from openhands.sdk.llm import LLM, Message, TextContent from openhands.sdk.tool import Tool, register_tool from openhands.tools.file_editor import FileEditorTool @@ -25,6 +26,10 @@ register_tool("FileEditorTool", FileEditorTool) +class ModuleScopeOtherAgent(Agent): + pass + + # Tests from test_llm_reconciliation.py def test_conversation_restart_with_nested_llms(tmp_path): """Test conversation restart with agent containing nested LLMs.""" @@ -106,7 +111,7 @@ def test_conversation_restarted_with_changed_working_directory(tmp_path_factory) ) -# Tests from test_local_conversation_tools_integration.py +# Tests for agent tools restriction and LLM flexibility def test_conversation_allows_removing_unused_tools(): """Test that removing tools that weren't used in history is allowed. @@ -263,8 +268,6 @@ def test_conversation_fails_when_used_tool_is_missing(): reduced_agent = Agent(llm=llm2, tools=reduced_tools) # This should raise - TerminalTool was used in history but is now missing - import pytest - with pytest.raises(ValueError, match="tools that were used in history"): LocalConversation( agent=reduced_agent, @@ -328,57 +331,86 @@ def test_conversation_with_same_agent_succeeds(): assert len(new_conversation.state.events) > 0 -def test_agent_resolve_diff_from_deserialized(): - """Test agent's resolve_diff_from_deserialized method. - - Includes tolerance for litellm_extra_body differences injected at CLI load time. - """ - with tempfile.TemporaryDirectory(): - # Create original agent +def test_conversation_with_different_llm_succeeds(): + """Test that using an agent with different LLM succeeds (LLM can change).""" + with tempfile.TemporaryDirectory() as temp_dir: + # Create and save conversation with original agent tools = [Tool(name="TerminalTool")] - llm = LLM( + llm1 = LLM( model="gpt-4o-mini", api_key=SecretStr("test-key"), usage_id="test-llm" ) - original_agent = Agent(llm=llm, tools=tools) + original_agent = Agent(llm=llm1, tools=tools) + conversation = LocalConversation( + agent=original_agent, + workspace=temp_dir, + persistence_dir=temp_dir, + visualizer=None, + ) - # Serialize and deserialize to simulate persistence - serialized = original_agent.model_dump_json() - deserialized_agent = AgentBase.model_validate_json(serialized) + # Send a message to create some state + conversation.send_message( + Message(role="user", content=[TextContent(text="test message")]) + ) - # Create runtime agent with same configuration + conversation_id = conversation.state.id + del conversation + + # Create new conversation with different LLM - this should succeed llm2 = LLM( - model="gpt-4o-mini", api_key=SecretStr("test-key"), usage_id="test-llm" + model="gpt-4o", # Different model + api_key=SecretStr("different-key"), # Different key + usage_id="different-llm", ) - runtime_agent = Agent(llm=llm2, tools=tools) - - # Should resolve successfully - resolved = runtime_agent.resolve_diff_from_deserialized(deserialized_agent) - # Test model_dump equality - assert resolved.model_dump(mode="json") == runtime_agent.model_dump(mode="json") - assert resolved.llm.model == runtime_agent.llm.model - assert resolved.__class__ == runtime_agent.__class__ - - # Now simulate CLI injecting dynamic litellm_extra_body metadata at load time - injected = deserialized_agent.model_copy( - update={ - "llm": deserialized_agent.llm.model_copy( - update={ - "litellm_extra_body": { - "metadata": { - "session_id": "sess-123", - "tags": ["app:openhands", "model:gpt-4o-mini"], - "trace_version": "1.2.3", - } - } - } - ) - } - ) - - # Reconcile again: differences in litellm_extra_body should be allowed and - # the runtime value should be preferred without raising an error. - resolved2 = runtime_agent.resolve_diff_from_deserialized(injected) - assert resolved2.llm.litellm_extra_body == runtime_agent.llm.litellm_extra_body + different_agent = Agent(llm=llm2, tools=tools) + + # This should succeed - LLM can be freely changed between sessions + new_conversation = LocalConversation( + agent=different_agent, + workspace=temp_dir, + persistence_dir=temp_dir, + conversation_id=conversation_id, + visualizer=None, + ) + + # Verify state was loaded and new agent with new LLM is used + assert len(new_conversation.state.events) > 0 + assert new_conversation.agent.llm.model == "gpt-4o" + assert new_conversation.agent.llm.usage_id == "different-llm" + + +def test_conversation_fails_when_agent_type_changes(): + """Test that resuming with a different Agent class fails. + + This is a hard compatibility requirement: we can only resume if the runtime + agent is the same class as the persisted agent. + + Note: we define the alternative Agent at module scope to ensure the persisted + snapshot can be deserialized; otherwise, Pydantic rejects local classes. + """ + + tools = [Tool(name="TerminalTool")] + + llm1 = LLM(model="gpt-4o-mini", api_key=SecretStr("test-key"), usage_id="llm") + llm2 = LLM(model="gpt-4o-mini", api_key=SecretStr("test-key"), usage_id="llm") + + with tempfile.TemporaryDirectory() as temp_dir: + conversation = LocalConversation( + agent=Agent(llm=llm1, tools=tools), + workspace=temp_dir, + persistence_dir=temp_dir, + visualizer=None, + ) + conversation_id = conversation.state.id + del conversation + + with pytest.raises(ValueError, match=r"persisted agent is of type"): + LocalConversation( + agent=ModuleScopeOtherAgent(llm=llm2, tools=tools), + workspace=temp_dir, + persistence_dir=temp_dir, + conversation_id=conversation_id, + visualizer=None, + ) @patch("openhands.sdk.llm.llm.litellm_completion") @@ -458,6 +490,74 @@ def test_conversation_persistence_lifecycle(mock_completion): assert len(new_conversation.state.events) >= original_event_count + 2 +def test_conversation_resume_overrides_agent_llm_but_preserves_state_settings(): + """Test resume behavior when changing runtime Agent/LLM settings. + + Expectations: + - Some conversation *state* settings are persisted and should not be overridden + on resume (e.g., confirmation_policy, execution_status). + - Agent/LLM settings should come from the runtime-provided Agent on resume + + This test covers the common workflow: start a persisted conversation, tweak a + couple of state settings, then resume with a different LLM configuration. + """ + + from openhands.sdk.security.confirmation_policy import AlwaysConfirm + + with tempfile.TemporaryDirectory() as temp_dir: + tools = [Tool(name="TerminalTool")] + + # Initial agent (persisted snapshot contains this agent config, but on resume + # we should use the runtime-provided agent). + llm1 = LLM( + model="gpt-5.1-codex-max", + api_key=SecretStr("test-key-1"), + usage_id="llm-1", + max_input_tokens=100_000, + ) + agent1 = Agent(llm=llm1, tools=tools) + + conversation = LocalConversation( + agent=agent1, + workspace=temp_dir, + persistence_dir=temp_dir, + visualizer=None, + ) + + # Persisted state settings (these should be restored from persistence). + conversation.state.confirmation_policy = AlwaysConfirm() + conversation.state.execution_status = ConversationExecutionStatus.STUCK + + conversation_id = conversation.state.id + del conversation + + # Resume with a different runtime Agent + LLM settings. + llm2 = LLM( + model="gpt-5.2", + api_key=SecretStr("test-key-2"), + usage_id="llm-2", + max_input_tokens=50_000, + ) + agent2 = Agent(llm=llm2, tools=tools) + + resumed = LocalConversation( + agent=agent2, + workspace=temp_dir, + persistence_dir=temp_dir, + conversation_id=conversation_id, + visualizer=None, + ) + + # Persisted settings should remain. + assert resumed.state.execution_status == ConversationExecutionStatus.STUCK + assert resumed.state.confirmation_policy.should_confirm() + + # Runtime agent/LLM settings should override persisted agent snapshot. + assert resumed.agent.llm.model == "gpt-5.2" + assert resumed.agent.llm.max_input_tokens == 50_000 + assert resumed.agent.llm.usage_id == "llm-2" + + def test_conversation_restart_with_different_agent_context(): """ Test conversation restart when agent_context differs. diff --git a/tests/sdk/conversation/local/test_state_serialization.py b/tests/sdk/conversation/local/test_state_serialization.py index 3fee0ba85e..eaee970b08 100644 --- a/tests/sdk/conversation/local/test_state_serialization.py +++ b/tests/sdk/conversation/local/test_state_serialization.py @@ -9,16 +9,11 @@ from pydantic import SecretStr, ValidationError from openhands.sdk import Agent, Conversation -from openhands.sdk.agent.base import AgentBase from openhands.sdk.conversation.impl.local_conversation import LocalConversation from openhands.sdk.conversation.state import ( ConversationExecutionStatus, ConversationState, ) -from openhands.sdk.conversation.types import ( - ConversationCallbackType, - ConversationTokenCallbackType, -) from openhands.sdk.event.llm_convertible import MessageEvent, SystemPromptEvent from openhands.sdk.llm import LLM, Message, TextContent from openhands.sdk.llm.llm_registry import RegistryEvent @@ -427,8 +422,79 @@ def test_conversation_state_thread_safety(): assert not state.owned() -def test_agent_resolve_diff_different_class_raises_error(): - """Test that resolve_diff_from_deserialized raises error for different agent classes.""" # noqa: E501 +def test_agent_pydantic_validation_on_creation(): + """Test that Pydantic validation happens when creating agents.""" + # Valid agent creation - Pydantic validates + llm = LLM(model="gpt-4o-mini", api_key=SecretStr("test-key"), usage_id="test-llm") + agent = Agent(llm=llm, tools=[]) + assert agent.llm.model == "gpt-4o-mini" + + # Invalid LLM creation should fail Pydantic validation + with pytest.raises(ValueError, match="model must be specified"): + LLM(model="", api_key=SecretStr("test-key"), usage_id="test-llm") + + +def test_agent_verify_validates_tools_match(): + """Test that agent.verify() validates tools match between runtime and persisted.""" + from openhands.sdk.agent import AgentBase + from openhands.sdk.tool import Tool + + llm = LLM(model="gpt-4o-mini", api_key=SecretStr("test-key"), usage_id="test-llm") + + # Create original agent with two tools + original_agent = Agent( + llm=llm, tools=[Tool(name="TerminalTool"), Tool(name="FileEditorTool")] + ) + + # Serialize and deserialize to simulate persistence + serialized = original_agent.model_dump_json() + persisted_agent = AgentBase.model_validate_json(serialized) + + # Runtime agent with same tools should succeed + same_tools_agent = Agent( + llm=llm, tools=[Tool(name="TerminalTool"), Tool(name="FileEditorTool")] + ) + result = same_tools_agent.verify(persisted_agent) + assert result is same_tools_agent + + # Runtime agent with different tools should fail + different_tools_agent = Agent(llm=llm, tools=[Tool(name="TerminalTool")]) + with pytest.raises( + ValueError, match="Tools don't match between runtime and persisted agents" + ): + different_tools_agent.verify(persisted_agent) + + +def test_agent_verify_allows_different_llm(): + """Test that agent.verify() allows different LLM configuration.""" + from openhands.sdk.agent import AgentBase + from openhands.sdk.tool import Tool + + tools = [Tool(name="TerminalTool")] + + # Create original agent + llm1 = LLM(model="gpt-4o-mini", api_key=SecretStr("key1"), usage_id="llm1") + original_agent = Agent(llm=llm1, tools=tools) + + # Serialize and deserialize + serialized = original_agent.model_dump_json() + persisted_agent = AgentBase.model_validate_json(serialized) + + # Runtime agent with different LLM should succeed (LLM can change freely) + llm2 = LLM(model="gpt-4o", api_key=SecretStr("key2"), usage_id="llm2") + different_llm_agent = Agent(llm=llm2, tools=tools) + result = different_llm_agent.verify(persisted_agent) + assert result is different_llm_agent + assert result.llm.model == "gpt-4o" + + +def test_agent_verify_different_class_raises_error(): + """Test that agent.verify() raises error for different agent classes.""" + from openhands.sdk.agent.base import AgentBase + from openhands.sdk.conversation.types import ( + ConversationCallbackType, + ConversationTokenCallbackType, + ) class DifferentAgent(AgentBase): def __init__(self): @@ -454,8 +520,8 @@ def step( original_agent = Agent(llm=llm, tools=[]) different_agent = DifferentAgent() - with pytest.raises(ValueError, match="Cannot resolve from deserialized"): - original_agent.resolve_diff_from_deserialized(different_agent) + with pytest.raises(ValueError, match="Cannot load from persisted"): + original_agent.verify(different_agent) def test_conversation_state_flags_persistence(): @@ -556,12 +622,102 @@ def test_conversation_with_agent_different_llm_config(): assert new_dump == original_state_dump -def test_conversation_state_stats_preserved_on_resume(): - """Test that conversation stats are preserved when resuming a conversation. +def test_resume_uses_runtime_workspace_and_max_iterations(): + """Test that resume uses runtime-provided workspace and max_iterations.""" + with tempfile.TemporaryDirectory() as temp_dir: + llm = LLM( + model="gpt-4o-mini", api_key=SecretStr("test-key"), usage_id="test-llm" + ) + agent = Agent(llm=llm, tools=[]) + conv_id = uuid.UUID("12345678-1234-5678-9abc-123456789007") + persist_path = LocalConversation.get_persistence_dir(temp_dir, conv_id) + + original_workspace = LocalWorkspace(working_dir="/original/path") + state = ConversationState.create( + workspace=original_workspace, + persistence_dir=persist_path, + agent=agent, + id=conv_id, + max_iterations=100, + ) + assert state.max_iterations == 100 + + new_workspace = LocalWorkspace(working_dir="/new/path") + resumed_state = ConversationState.create( + workspace=new_workspace, + persistence_dir=persist_path, + agent=agent, + id=conv_id, + max_iterations=200, + ) + + assert resumed_state.workspace.working_dir == "/new/path" + assert resumed_state.max_iterations == 200 + + +def test_resume_preserves_persisted_execution_status_and_stuck_detection(): + """Test that resume preserves execution_status and stuck_detection.""" + with tempfile.TemporaryDirectory() as temp_dir: + llm = LLM( + model="gpt-4o-mini", api_key=SecretStr("test-key"), usage_id="test-llm" + ) + agent = Agent(llm=llm, tools=[]) + conv_id = uuid.UUID("12345678-1234-5678-9abc-123456789008") + persist_path = LocalConversation.get_persistence_dir(temp_dir, conv_id) + + state = ConversationState.create( + workspace=LocalWorkspace(working_dir="/tmp"), + persistence_dir=persist_path, + agent=agent, + id=conv_id, + stuck_detection=False, + ) + state.execution_status = ConversationExecutionStatus.PAUSED - This is a regression test for the issue where stats (including context_window) - were being reset to zero when resuming a conversation. - """ + resumed_state = ConversationState.create( + workspace=LocalWorkspace(working_dir="/tmp"), + persistence_dir=persist_path, + agent=agent, + id=conv_id, + stuck_detection=True, + ) + + assert resumed_state.execution_status == ConversationExecutionStatus.PAUSED + assert resumed_state.stuck_detection is False + + +def test_resume_preserves_blocked_actions_and_messages(): + """Test that resume preserves blocked_actions and blocked_messages.""" + with tempfile.TemporaryDirectory() as temp_dir: + llm = LLM( + model="gpt-4o-mini", api_key=SecretStr("test-key"), usage_id="test-llm" + ) + agent = Agent(llm=llm, tools=[]) + conv_id = uuid.UUID("12345678-1234-5678-9abc-123456789009") + persist_path = LocalConversation.get_persistence_dir(temp_dir, conv_id) + + state = ConversationState.create( + workspace=LocalWorkspace(working_dir="/tmp"), + persistence_dir=persist_path, + agent=agent, + id=conv_id, + ) + state.block_action("action-1", "dangerous action") + state.block_message("msg-1", "inappropriate content") + + resumed_state = ConversationState.create( + workspace=LocalWorkspace(working_dir="/tmp"), + persistence_dir=persist_path, + agent=agent, + id=conv_id, + ) + + assert resumed_state.blocked_actions["action-1"] == "dangerous action" + assert resumed_state.blocked_messages["msg-1"] == "inappropriate content" + + +def test_conversation_state_stats_preserved_on_resume(): + """Regression: stats should not be reset when resuming a conversation.""" with tempfile.TemporaryDirectory() as temp_dir: llm = LLM( model="gpt-4o-mini", api_key=SecretStr("test-key"), usage_id="test-llm" @@ -579,7 +735,6 @@ def test_conversation_state_stats_preserved_on_resume(): id=conv_id, ) - # Register the LLM and add some metrics state.stats.register_llm(RegistryEvent(llm=llm)) # Add token usage with context_window @@ -617,7 +772,6 @@ def test_conversation_state_stats_preserved_on_resume(): ) new_agent = Agent(llm=new_llm, tools=[]) - # Resume the conversation resumed_state = ConversationState.create( workspace=LocalWorkspace(working_dir="/tmp"), persistence_dir=persist_path_for_state, @@ -637,3 +791,30 @@ def test_conversation_state_stats_preserved_on_resume(): assert ( resumed_combined_metrics.accumulated_token_usage.context_window == 128000 ), "Context window should be preserved after resume" + + +def test_resume_with_conversation_id_mismatch_raises_error(): + """Test that resuming with mismatched conversation ID raises error.""" + with tempfile.TemporaryDirectory() as temp_dir: + llm = LLM( + model="gpt-4o-mini", api_key=SecretStr("test-key"), usage_id="test-llm" + ) + agent = Agent(llm=llm, tools=[]) + original_id = uuid.UUID("12345678-1234-5678-9abc-12345678900b") + different_id = uuid.UUID("12345678-1234-5678-9abc-12345678900c") + persist_path = LocalConversation.get_persistence_dir(temp_dir, original_id) + + ConversationState.create( + workspace=LocalWorkspace(working_dir="/tmp"), + persistence_dir=persist_path, + agent=agent, + id=original_id, + ) + + with pytest.raises(ValueError, match="Conversation ID mismatch"): + ConversationState.create( + workspace=LocalWorkspace(working_dir="/tmp"), + persistence_dir=persist_path, + agent=agent, + id=different_id, + ) diff --git a/tests/sdk/llm/test_llm_reconciliation.py b/tests/sdk/llm/test_llm_reconciliation.py deleted file mode 100644 index 2cd86d75e2..0000000000 --- a/tests/sdk/llm/test_llm_reconciliation.py +++ /dev/null @@ -1,42 +0,0 @@ -from pydantic import SecretStr - -from openhands.sdk import Agent -from openhands.sdk.agent import AgentBase -from openhands.sdk.llm import LLM - - -def test_resolve_diff_ignores_litellm_extra_body_diffs(): - tools = [] - llm = LLM(model="gpt-4o-mini", api_key=SecretStr("test-key"), usage_id="test-llm") - original_agent = Agent(llm=llm, tools=tools) - - serialized = original_agent.model_dump_json() - deserialized_agent = AgentBase.model_validate_json(serialized) - - runtime_agent = Agent( - llm=LLM( - model="gpt-4o-mini", - api_key=SecretStr("test-key"), - usage_id="test-llm", - ), - tools=tools, - ) - - injected = deserialized_agent.model_copy( - update={ - "llm": deserialized_agent.llm.model_copy( - update={ - "litellm_extra_body": { - "metadata": { - "session_id": "sess-xyz", - "tags": ["app:openhands", "model:gpt-4o-mini"], - "trace_version": "9.9.9", - } - } - } - ) - } - ) - - resolved = runtime_agent.resolve_diff_from_deserialized(injected) - assert resolved.llm.litellm_extra_body == runtime_agent.llm.litellm_extra_body diff --git a/tests/sdk/llm/test_llm_serialization.py b/tests/sdk/llm/test_llm_serialization.py index b49771bf33..eeceef9723 100644 --- a/tests/sdk/llm/test_llm_serialization.py +++ b/tests/sdk/llm/test_llm_serialization.py @@ -62,6 +62,19 @@ def test_llm_secret_fields_serialization() -> None: assert deserialized_llm.aws_secret_access_key is None +def test_llm_model_dump_json_masks_secrets() -> None: + """Test that JSON serialization masks secrets by default.""" + llm = LLM( + usage_id="test-llm", + model="test-model", + api_key=SecretStr("secret-api-key"), + ) + + dumped = llm.model_dump_json() + assert "secret-api-key" not in dumped + assert "**********" in dumped + + def test_llm_excluded_fields_not_serialized() -> None: """Test that excluded fields are not included in serialization.""" # Create LLM with excluded fields