From 6819bffb817718902e663448065690f1c2222f4d Mon Sep 17 00:00:00 2001 From: openhands Date: Fri, 9 Jan 2026 00:25:42 +0000 Subject: [PATCH 1/8] fix(tests): Move local test classes to module level for pytest-xdist compatibility Move local class definitions from inside test functions to module level to prevent test pollution when running tests in parallel with pytest-xdist. When classes like DifferentAgent, TestAction, etc. are defined inside test functions, they get registered with Pydantic's discriminated union system but are not importable, causing failures in other tests running in the same worker. Files changed: - test_state_serialization.py: Move _DifferentAgentForVerifyTest to module level - test_reasoning_content.py: Move _TestActionForReasoningContent to module level - test_schema_immutability.py: Move _SchemaImmutabilityCustomAction and _SchemaImmutabilityCustomObservation to module level This fixes test pollution that was masked by --forked flag, enabling the switch to pytest-xdist for accurate coverage reporting. Fixes #1656 Co-authored-by: openhands --- .../local/test_state_serialization.py | 61 +++++++++++-------- tests/sdk/llm/test_reasoning_content.py | 20 ++++-- tests/sdk/tool/test_schema_immutability.py | 41 +++++++++---- 3 files changed, 76 insertions(+), 46 deletions(-) diff --git a/tests/sdk/conversation/local/test_state_serialization.py b/tests/sdk/conversation/local/test_state_serialization.py index eaee970b08..47a82fa9b9 100644 --- a/tests/sdk/conversation/local/test_state_serialization.py +++ b/tests/sdk/conversation/local/test_state_serialization.py @@ -9,11 +9,16 @@ from pydantic import SecretStr, ValidationError from openhands.sdk import Agent, Conversation +from openhands.sdk.agent.base import AgentBase from openhands.sdk.conversation.impl.local_conversation import LocalConversation from openhands.sdk.conversation.state import ( ConversationExecutionStatus, ConversationState, ) +from openhands.sdk.conversation.types import ( + ConversationCallbackType, + ConversationTokenCallbackType, +) from openhands.sdk.event.llm_convertible import MessageEvent, SystemPromptEvent from openhands.sdk.llm import LLM, Message, TextContent from openhands.sdk.llm.llm_registry import RegistryEvent @@ -21,6 +26,34 @@ from openhands.sdk.workspace import LocalWorkspace +class _DifferentAgentForVerifyTest(AgentBase): + """A different agent class used to test Agent.verify() rejects class mismatches. + + This class is defined at module level (rather than inside a test function) to ensure + it's importable by Pydantic during serialization/deserialization. Defining it inside + a test function causes test pollution when running tests in parallel with pytest-xdist. + """ + + def __init__(self): + llm = LLM( + model="gpt-4o-mini", + api_key=SecretStr("test-key"), + usage_id="test-llm", + ) + super().__init__(llm=llm, tools=[]) + + def init_state(self, state, on_event): + pass + + def step( + self, + conversation, + on_event: ConversationCallbackType, + on_token: ConversationTokenCallbackType | None = None, + ): + pass + + def test_conversation_state_basic_serialization(): """Test basic ConversationState serialization and deserialization.""" llm = LLM(model="gpt-4o-mini", api_key=SecretStr("test-key"), usage_id="test-llm") @@ -490,35 +523,9 @@ def test_agent_verify_allows_different_llm(): def test_agent_verify_different_class_raises_error(): """Test that agent.verify() raises error for different agent classes.""" - from openhands.sdk.agent.base import AgentBase - from openhands.sdk.conversation.types import ( - ConversationCallbackType, - ConversationTokenCallbackType, - ) - - class DifferentAgent(AgentBase): - def __init__(self): - llm = LLM( - model="gpt-4o-mini", - api_key=SecretStr("test-key"), - usage_id="test-llm", - ) - super().__init__(llm=llm, tools=[]) - - def init_state(self, state, on_event): - pass - - def step( - self, - conversation, - on_event: ConversationCallbackType, - on_token: ConversationTokenCallbackType | None = None, - ): - pass - llm = LLM(model="gpt-4o-mini", api_key=SecretStr("test-key"), usage_id="test-llm") original_agent = Agent(llm=llm, tools=[]) - different_agent = DifferentAgent() + different_agent = _DifferentAgentForVerifyTest() with pytest.raises(ValueError, match="Cannot load from persisted"): original_agent.verify(different_agent) diff --git a/tests/sdk/llm/test_reasoning_content.py b/tests/sdk/llm/test_reasoning_content.py index 41cedcbfb9..0fd5ddccbd 100644 --- a/tests/sdk/llm/test_reasoning_content.py +++ b/tests/sdk/llm/test_reasoning_content.py @@ -2,6 +2,19 @@ from litellm.types.utils import Choices, Message as LiteLLMMessage, ModelResponse, Usage +from openhands.sdk.tool import Action + + +class _TestActionForReasoningContent(Action): + """A test action used for testing reasoning content in ActionEvent. + + This class is defined at module level (rather than inside a test function) to ensure + it's importable by Pydantic during serialization/deserialization. Defining it inside + a test function causes test pollution when running tests in parallel with pytest-xdist. + """ + + action: str = "test" + def create_mock_response(content: str = "Test response", response_id: str = "test-id"): """Helper function to create properly structured mock responses.""" @@ -113,11 +126,6 @@ def test_action_event_with_reasoning_content(): MessageToolCall, TextContent, ) - from openhands.sdk.tool import Action - - # Create a simple action for testing - class TestAction(Action): - action: str = "test" # Create a tool call tool_call = MessageToolCall( @@ -129,7 +137,7 @@ class TestAction(Action): action_event = ActionEvent( thought=[TextContent(text="I need to test this")], - action=TestAction(), + action=_TestActionForReasoningContent(), tool_name="test_tool", tool_call_id="test-id", tool_call=tool_call, diff --git a/tests/sdk/tool/test_schema_immutability.py b/tests/sdk/tool/test_schema_immutability.py index d6df9d129a..8a9431a448 100644 --- a/tests/sdk/tool/test_schema_immutability.py +++ b/tests/sdk/tool/test_schema_immutability.py @@ -53,6 +53,32 @@ def to_llm_content(self) -> Sequence[TextContent | ImageContent]: return [TextContent(text=f"Result: {self.result}, Status: {self.status}")] +class _SchemaImmutabilityCustomAction(Action): + """Custom action for testing schema inheritance immutability. + + This class is defined at module level (rather than inside a test function) to ensure + it's importable by Pydantic during serialization/deserialization. Defining it inside + a test function causes test pollution when running tests in parallel with pytest-xdist. + """ + + custom_field: str = Field(description="Custom field") + + +class _SchemaImmutabilityCustomObservation(Observation): + """Custom observation for testing schema inheritance immutability. + + This class is defined at module level (rather than inside a test function) to ensure + it's importable by Pydantic during serialization/deserialization. Defining it inside + a test function causes test pollution when running tests in parallel with pytest-xdist. + """ + + custom_result: str = Field(description="Custom result") + + @property + def to_llm_content(self) -> Sequence[TextContent | ImageContent]: + return [TextContent(text=self.custom_result)] + + def test_schema_is_frozen(): """Test that Schema instances are frozen and cannot be modified.""" schema = MockSchema(name="test", value=42) @@ -273,22 +299,11 @@ def test_all_schema_classes_are_frozen(): def test_schema_inheritance_preserves_immutability(): """Test that classes inheriting from schema bases are also immutable.""" - - class SchemaImmutabilityCustomAction(Action): - custom_field: str = Field(description="Custom field") - - class SchemaImmutabilityCustomObservation(Observation): - custom_result: str = Field(description="Custom result") - - @property - def to_llm_content(self) -> Sequence[TextContent | ImageContent]: - return [TextContent(text=self.custom_result)] - # Test that custom classes are also frozen - custom_action = SchemaImmutabilityCustomAction(custom_field="test") + custom_action = _SchemaImmutabilityCustomAction(custom_field="test") with pytest.raises(ValidationError, match="Instance is frozen"): custom_action.custom_field = "changed" - custom_obs = SchemaImmutabilityCustomObservation(custom_result="test") + custom_obs = _SchemaImmutabilityCustomObservation(custom_result="test") with pytest.raises(ValidationError, match="Instance is frozen"): custom_obs.custom_result = "changed" From eaf9ddeab0d2c13302409c4247808ec6557ee23b Mon Sep 17 00:00:00 2001 From: openhands Date: Fri, 9 Jan 2026 00:33:01 +0000 Subject: [PATCH 2/8] fix: wrap long lines in docstrings to pass ruff lint Co-authored-by: openhands --- .../conversation/local/test_state_serialization.py | 7 ++++--- tests/sdk/llm/test_reasoning_content.py | 7 ++++--- tests/sdk/tool/test_schema_immutability.py | 14 ++++++++------ 3 files changed, 16 insertions(+), 12 deletions(-) diff --git a/tests/sdk/conversation/local/test_state_serialization.py b/tests/sdk/conversation/local/test_state_serialization.py index 47a82fa9b9..fd6fe2765d 100644 --- a/tests/sdk/conversation/local/test_state_serialization.py +++ b/tests/sdk/conversation/local/test_state_serialization.py @@ -29,9 +29,10 @@ class _DifferentAgentForVerifyTest(AgentBase): """A different agent class used to test Agent.verify() rejects class mismatches. - This class is defined at module level (rather than inside a test function) to ensure - it's importable by Pydantic during serialization/deserialization. Defining it inside - a test function causes test pollution when running tests in parallel with pytest-xdist. + This class is defined at module level (rather than inside a test function) to + ensure it's importable by Pydantic during serialization/deserialization. + Defining it inside a test function causes test pollution when running tests + in parallel with pytest-xdist. """ def __init__(self): diff --git a/tests/sdk/llm/test_reasoning_content.py b/tests/sdk/llm/test_reasoning_content.py index 0fd5ddccbd..39d6dddb7a 100644 --- a/tests/sdk/llm/test_reasoning_content.py +++ b/tests/sdk/llm/test_reasoning_content.py @@ -8,9 +8,10 @@ class _TestActionForReasoningContent(Action): """A test action used for testing reasoning content in ActionEvent. - This class is defined at module level (rather than inside a test function) to ensure - it's importable by Pydantic during serialization/deserialization. Defining it inside - a test function causes test pollution when running tests in parallel with pytest-xdist. + This class is defined at module level (rather than inside a test function) to + ensure it's importable by Pydantic during serialization/deserialization. + Defining it inside a test function causes test pollution when running tests + in parallel with pytest-xdist. """ action: str = "test" diff --git a/tests/sdk/tool/test_schema_immutability.py b/tests/sdk/tool/test_schema_immutability.py index 8a9431a448..eff2aca134 100644 --- a/tests/sdk/tool/test_schema_immutability.py +++ b/tests/sdk/tool/test_schema_immutability.py @@ -56,9 +56,10 @@ def to_llm_content(self) -> Sequence[TextContent | ImageContent]: class _SchemaImmutabilityCustomAction(Action): """Custom action for testing schema inheritance immutability. - This class is defined at module level (rather than inside a test function) to ensure - it's importable by Pydantic during serialization/deserialization. Defining it inside - a test function causes test pollution when running tests in parallel with pytest-xdist. + This class is defined at module level (rather than inside a test function) to + ensure it's importable by Pydantic during serialization/deserialization. + Defining it inside a test function causes test pollution when running tests + in parallel with pytest-xdist. """ custom_field: str = Field(description="Custom field") @@ -67,9 +68,10 @@ class _SchemaImmutabilityCustomAction(Action): class _SchemaImmutabilityCustomObservation(Observation): """Custom observation for testing schema inheritance immutability. - This class is defined at module level (rather than inside a test function) to ensure - it's importable by Pydantic during serialization/deserialization. Defining it inside - a test function causes test pollution when running tests in parallel with pytest-xdist. + This class is defined at module level (rather than inside a test function) to + ensure it's importable by Pydantic during serialization/deserialization. + Defining it inside a test function causes test pollution when running tests + in parallel with pytest-xdist. """ custom_result: str = Field(description="Custom result") From 6573d5d03662266cbbeed7975b522e1af46e4702 Mon Sep 17 00:00:00 2001 From: openhands Date: Fri, 9 Jan 2026 05:36:46 +0000 Subject: [PATCH 3/8] ci: Replace --forked with pytest-xdist for accurate coverage The --forked flag prevents coverage collection from child processes, resulting in artificially low coverage reports (near 0% in some cases). Switching to pytest-xdist (-n auto) provides: - Parallel test execution (same benefit as --forked) - Proper coverage collection from all worker processes - Better test isolation through process separation This change affects sdk-tests, tools-tests, and agent-server-tests. Closes #1656 Co-authored-by: openhands --- .github/workflows/tests.yml | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index a125d720f8..dbb70ab60e 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -57,8 +57,10 @@ jobs: run: | # Clean up any existing coverage file rm -f .coverage + # Use pytest-xdist (-n auto) for parallel execution with proper + # coverage collection. --forked prevents coverage from child processes. CI=true uv run python -m pytest -vvs \ - --forked \ + -n auto \ --cov=openhands-sdk \ --cov-report=term-missing \ --cov-fail-under=0 \ @@ -112,8 +114,10 @@ jobs: run: | # Clean up any existing coverage file rm -f .coverage + # Use pytest-xdist (-n auto) for parallel execution with proper + # coverage collection. --forked prevents coverage from child processes. CI=true uv run python -m pytest -vvs \ - --forked \ + -n auto \ --cov=openhands-tools \ --cov-report=term-missing \ --cov-fail-under=0 \ @@ -166,8 +170,10 @@ jobs: run: | # Clean up any existing coverage file rm -f .coverage + # Use pytest-xdist (-n auto) for parallel execution with proper + # coverage collection. --forked prevents coverage from child processes. CI=true uv run python -m pytest -vvs \ - --forked \ + -n auto \ --cov=openhands-agent-server \ --cov-report=term-missing \ --cov-fail-under=0 \ From af48df2582f5ad9f83612158c507d682c0f53272 Mon Sep 17 00:00:00 2001 From: openhands Date: Fri, 9 Jan 2026 05:42:43 +0000 Subject: [PATCH 4/8] fix(tests): Move TestEvent to module level for pytest-xdist compatibility Another local class in test_event_immutability.py was causing test pollution when running tests in parallel. Moved _TestEventForImmutability to module level. Co-authored-by: openhands --- tests/sdk/event/test_event_immutability.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/tests/sdk/event/test_event_immutability.py b/tests/sdk/event/test_event_immutability.py index 8a231fd92d..4a8ece8742 100644 --- a/tests/sdk/event/test_event_immutability.py +++ b/tests/sdk/event/test_event_immutability.py @@ -75,13 +75,21 @@ def create(cls, *args, **kwargs) -> Sequence[Self]: ] -def test_event_base_is_frozen(): - """Test that Event instances are frozen and cannot be modified.""" +class _TestEventForImmutability(Event): + """Test event class for immutability tests. + + This class is defined at module level (rather than inside a test function) to + ensure it's importable by Pydantic during serialization/deserialization. + Defining it inside a test function causes test pollution when running tests + in parallel with pytest-xdist. + """ - class TestEvent(Event): - test_field: str = "test_value" + test_field: str = "test_value" - event = TestEvent(source="agent", test_field="initial_value") + +def test_event_base_is_frozen(): + """Test that Event instances are frozen and cannot be modified.""" + event = _TestEventForImmutability(source="agent", test_field="initial_value") # Test that we cannot modify any field with pytest.raises(Exception): # Pydantic raises ValidationError for frozen models From 933b552194364938418a840d6c77653d14840d07 Mon Sep 17 00:00:00 2001 From: openhands Date: Fri, 9 Jan 2026 05:58:46 +0000 Subject: [PATCH 5/8] fix(tests): Move remaining local classes to module level for pytest-xdist Additional local classes causing test pollution when running in parallel: - _UnknownEventForVisualizerTest in test_visualizer.py - _NestedActionForMalformedArgs in test_fix_malformed_tool_arguments.py - _ChildMCPToolActionForSerialization in test_mcp_action_serialization.py Co-authored-by: openhands --- .../test_fix_malformed_tool_arguments.py | 22 +++++++++++------ tests/sdk/conversation/test_visualizer.py | 24 ++++++++++++------- .../sdk/mcp/test_mcp_action_serialization.py | 18 ++++++++++---- 3 files changed, 44 insertions(+), 20 deletions(-) diff --git a/tests/sdk/agent/test_fix_malformed_tool_arguments.py b/tests/sdk/agent/test_fix_malformed_tool_arguments.py index 6a2ef28e94..9d5dda5e66 100644 --- a/tests/sdk/agent/test_fix_malformed_tool_arguments.py +++ b/tests/sdk/agent/test_fix_malformed_tool_arguments.py @@ -44,6 +44,19 @@ class JsonDecodingOptionalAction(Action): config: dict[str, int] | None = Field(default=None, description="Optional dict") +class _NestedActionForMalformedArgs(Action): + """Action with nested structures for testing JSON decoding. + + This class is defined at module level (rather than inside a test function) to + ensure it's importable by Pydantic during serialization/deserialization. + Defining it inside a test function causes test pollution when running tests + in parallel with pytest-xdist. + """ + + nested_list: list[list[int]] = Field(description="Nested list") + nested_dict: dict[str, dict[str, str]] = Field(description="Nested dict") + + def test_decode_json_string_list(): """Test that JSON string lists are decoded to native lists.""" data = { @@ -201,17 +214,12 @@ def test_json_string_with_wrong_type_rejected(): def test_nested_structures(): """Test that nested lists and dicts in JSON strings work.""" - - class NestedAction(Action): - nested_list: list[list[int]] = Field(description="Nested list") - nested_dict: dict[str, dict[str, str]] = Field(description="Nested dict") - data = { "nested_list": "[[1, 2], [3, 4]]", "nested_dict": '{"outer": {"inner": "value"}}', } - fixed_data = fix_malformed_tool_arguments(data, NestedAction) - action = NestedAction.model_validate(fixed_data) + fixed_data = fix_malformed_tool_arguments(data, _NestedActionForMalformedArgs) + action = _NestedActionForMalformedArgs.model_validate(fixed_data) assert action.nested_list == [[1, 2], [3, 4]] assert action.nested_dict == {"outer": {"inner": "value"}} diff --git a/tests/sdk/conversation/test_visualizer.py b/tests/sdk/conversation/test_visualizer.py index c88544ac1f..373bb01eeb 100644 --- a/tests/sdk/conversation/test_visualizer.py +++ b/tests/sdk/conversation/test_visualizer.py @@ -21,6 +21,8 @@ SystemPromptEvent, UserRejectObservation, ) +from openhands.sdk.event.base import Event +from openhands.sdk.event.types import SourceType from openhands.sdk.llm import ( Message, MessageToolCall, @@ -33,6 +35,18 @@ from openhands.sdk.conversation.impl.local_conversation import LocalConversation +class _UnknownEventForVisualizerTest(Event): + """Unknown event type for testing fallback visualization. + + This class is defined at module level (rather than inside a test function) to + ensure it's importable by Pydantic during serialization/deserialization. + Defining it inside a test function causes test pollution when running tests + in parallel with pytest-xdist. + """ + + source: SourceType = "agent" + + class VisualizerMockAction(Action): """Mock action for testing.""" @@ -457,18 +471,12 @@ def test_metrics_abbreviation_formatting(): def test_event_base_fallback_visualize(): """Test that Event provides fallback visualization.""" - from openhands.sdk.event.base import Event - from openhands.sdk.event.types import SourceType - - class UnknownEvent(Event): - source: SourceType = "agent" - - event = UnknownEvent() + event = _UnknownEventForVisualizerTest() result = event.visualize assert isinstance(result, Text) text_content = result.plain - assert "Unknown event type: UnknownEvent" in text_content + assert "Unknown event type: _UnknownEventForVisualizerTest" in text_content def test_visualizer_conversation_state_update_event_skipped(): diff --git a/tests/sdk/mcp/test_mcp_action_serialization.py b/tests/sdk/mcp/test_mcp_action_serialization.py index 288953b50f..502686470c 100644 --- a/tests/sdk/mcp/test_mcp_action_serialization.py +++ b/tests/sdk/mcp/test_mcp_action_serialization.py @@ -4,6 +4,18 @@ from openhands.sdk.mcp import MCPToolAction +class _ChildMCPToolActionForSerialization(MCPToolAction): + """Child MCP action for testing declared fields with data. + + This class is defined at module level (rather than inside a test function) to + ensure it's importable by Pydantic during serialization/deserialization. + Defining it inside a test function causes test pollution when running tests + in parallel with pytest-xdist. + """ + + declared: int + + def test_data_field_emerges_from_to_mcp_arguments(): """Test that data field contents are returned by to_mcp_arguments.""" data = {"new_field": "value", "dynamic": 123} @@ -18,12 +30,8 @@ def test_data_field_emerges_from_to_mcp_arguments(): def test_declared_child_fields_with_data(): """Test that child classes work with the data field.""" - - class Child(MCPToolAction): - declared: int - data = {"tool_param": "value"} - a = Child(declared=7, data=data) + a = _ChildMCPToolActionForSerialization(declared=7, data=data) out = a.to_mcp_arguments() # Only data field contents should be in MCP arguments From f6af5c2a1980d10ebb19af921fc50b9dbce2cd4b Mon Sep 17 00:00:00 2001 From: openhands Date: Fri, 9 Jan 2026 06:51:37 +0000 Subject: [PATCH 6/8] ci: bump tools-tests timeout to 25 minutes Co-authored-by: openhands --- .github/workflows/tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index dbb70ab60e..37e6adc06a 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -82,7 +82,7 @@ jobs: tools-tests: runs-on: blacksmith-2vcpu-ubuntu-2404 - timeout-minutes: 15 + timeout-minutes: 25 steps: - name: Checkout uses: actions/checkout@v5 From 911246dcf328a21fd1ae7565aa6eb4c316b7a481 Mon Sep 17 00:00:00 2001 From: openhands Date: Fri, 9 Jan 2026 06:59:05 +0000 Subject: [PATCH 7/8] ci: revert tools-tests to --forked due to terminal test conflicts Terminal tests use shared paths (/tmp/test_dir) and subprocess management that conflicts when run in parallel with pytest-xdist. Co-authored-by: openhands --- .github/workflows/tests.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 37e6adc06a..4bb55c167f 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -114,10 +114,10 @@ jobs: run: | # Clean up any existing coverage file rm -f .coverage - # Use pytest-xdist (-n auto) for parallel execution with proper - # coverage collection. --forked prevents coverage from child processes. + # Use --forked for tools tests due to terminal test conflicts + # when running in parallel (shared /tmp paths, subprocess management) CI=true uv run python -m pytest -vvs \ - -n auto \ + --forked \ --cov=openhands-tools \ --cov-report=term-missing \ --cov-fail-under=0 \ From 00d062d1a2ad6d7c32f4c2b2b69bafb563841e52 Mon Sep 17 00:00:00 2001 From: openhands Date: Fri, 9 Jan 2026 07:10:37 +0000 Subject: [PATCH 8/8] ci: revert tools-tests timeout back to 15 minutes Co-authored-by: openhands --- .github/workflows/tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 4bb55c167f..5031d7106a 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -82,7 +82,7 @@ jobs: tools-tests: runs-on: blacksmith-2vcpu-ubuntu-2404 - timeout-minutes: 25 + timeout-minutes: 15 steps: - name: Checkout uses: actions/checkout@v5