OpenHands · xingyaoww · Jan 9, 2026 · Jan 9, 2026 · Jan 9, 2026 · Jan 9, 2026
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -57,8 +57,10 @@ jobs:
               run: |
                   # Clean up any existing coverage file
                   rm -f .coverage
+                  # Use pytest-xdist (-n auto) for parallel execution with proper
+                  # coverage collection. --forked prevents coverage from being collected in child processes.
                   CI=true uv run python -m pytest -vvs \
-                    --forked \
+                    -n auto \
                     --cov=openhands-sdk \
                     --cov-report=term-missing \
                     --cov-fail-under=0 \
@@ -112,6 +114,8 @@ jobs:
               run: |
                   # Clean up any existing coverage file
                   rm -f .coverage
+                  # Use --forked for tools tests due to terminal test conflicts
+                  # when running in parallel (shared /tmp paths, subprocess management)
                   CI=true uv run python -m pytest -vvs \
                     --forked \
                     --cov=openhands-tools \
@@ -166,8 +170,10 @@ jobs:
               run: |
                   # Clean up any existing coverage file
                   rm -f .coverage
+                  # Use pytest-xdist (-n auto) for parallel execution with proper
+                  # coverage collection. --forked prevents coverage from being collected in child processes.
                   CI=true uv run python -m pytest -vvs \
-                    --forked \
+                    -n auto \
                     --cov=openhands-agent-server \
                     --cov-report=term-missing \
                     --cov-fail-under=0 \

diff --git a/tests/sdk/agent/test_fix_malformed_tool_arguments.py b/tests/sdk/agent/test_fix_malformed_tool_arguments.py
@@ -44,6 +44,19 @@ class JsonDecodingOptionalAction(Action):
     config: dict[str, int] | None = Field(default=None, description="Optional dict")
 
 
+class _NestedActionForMalformedArgs(Action):
+    """Action with nested structures for testing JSON decoding.
+
+    This class is defined at module level (rather than inside a test function) to
+    ensure it's importable by Pydantic during serialization/deserialization.
+    Defining it inside a test function causes test pollution when running tests
+    in parallel with pytest-xdist.
+    """
+
+    nested_list: list[list[int]] = Field(description="Nested list")
+    nested_dict: dict[str, dict[str, str]] = Field(description="Nested dict")
+
+
 def test_decode_json_string_list():
     """Test that JSON string lists are decoded to native lists."""
     data = {
@@ -201,17 +214,12 @@ def test_json_string_with_wrong_type_rejected():
 
 def test_nested_structures():
     """Test that nested lists and dicts in JSON strings work."""
-
-    class NestedAction(Action):
-        nested_list: list[list[int]] = Field(description="Nested list")
-        nested_dict: dict[str, dict[str, str]] = Field(description="Nested dict")
-
     data = {
         "nested_list": "[[1, 2], [3, 4]]",
         "nested_dict": '{"outer": {"inner": "value"}}',
     }
-    fixed_data = fix_malformed_tool_arguments(data, NestedAction)
-    action = NestedAction.model_validate(fixed_data)
+    fixed_data = fix_malformed_tool_arguments(data, _NestedActionForMalformedArgs)
+    action = _NestedActionForMalformedArgs.model_validate(fixed_data)
 
     assert action.nested_list == [[1, 2], [3, 4]]
     assert action.nested_dict == {"outer": {"inner": "value"}}

diff --git a/tests/sdk/conversation/local/test_state_serialization.py b/tests/sdk/conversation/local/test_state_serialization.py
@@ -9,18 +9,52 @@
 from pydantic import SecretStr, ValidationError
 
 from openhands.sdk import Agent, Conversation
+from openhands.sdk.agent.base import AgentBase
 from openhands.sdk.conversation.impl.local_conversation import LocalConversation
 from openhands.sdk.conversation.state import (
     ConversationExecutionStatus,
     ConversationState,
 )
+from openhands.sdk.conversation.types import (
+    ConversationCallbackType,
+    ConversationTokenCallbackType,
+)
 from openhands.sdk.event.llm_convertible import MessageEvent, SystemPromptEvent
 from openhands.sdk.llm import LLM, Message, TextContent
 from openhands.sdk.llm.llm_registry import RegistryEvent
 from openhands.sdk.security.confirmation_policy import AlwaysConfirm
 from openhands.sdk.workspace import LocalWorkspace
 
 
+class _DifferentAgentForVerifyTest(AgentBase):
+    """Distinct agent class for testing that Agent.verify() rejects class mismatches.
+
+    This class is defined at module level (rather than inside a test function) to
+    ensure it's importable by Pydantic during serialization/deserialization.
+    Defining it inside a test function causes test pollution when running tests
+    in parallel with pytest-xdist.
+    """
+
+    def __init__(self):
+        llm = LLM(
+            model="gpt-4o-mini",
+            api_key=SecretStr("test-key"),
+            usage_id="test-llm",
+        )
+        super().__init__(llm=llm, tools=[])
+
+    def init_state(self, state, on_event):
+        pass
+
+    def step(
+        self,
+        conversation,
+        on_event: ConversationCallbackType,
+        on_token: ConversationTokenCallbackType | None = None,
+    ):
+        pass
+
+
 def test_conversation_state_basic_serialization():
     """Test basic ConversationState serialization and deserialization."""
     llm = LLM(model="gpt-4o-mini", api_key=SecretStr("test-key"), usage_id="test-llm")
@@ -490,35 +524,9 @@ def test_agent_verify_allows_different_llm():
 
 def test_agent_verify_different_class_raises_error():
     """Test that agent.verify() raises error for different agent classes."""
-    from openhands.sdk.agent.base import AgentBase
-    from openhands.sdk.conversation.types import (
-        ConversationCallbackType,
-        ConversationTokenCallbackType,
-    )
-
-    class DifferentAgent(AgentBase):
-        def __init__(self):
-            llm = LLM(
-                model="gpt-4o-mini",
-                api_key=SecretStr("test-key"),
-                usage_id="test-llm",
-            )
-            super().__init__(llm=llm, tools=[])
-
-        def init_state(self, state, on_event):
-            pass
-
-        def step(
-            self,
-            conversation,
-            on_event: ConversationCallbackType,
-            on_token: ConversationTokenCallbackType | None = None,
-        ):
-            pass
-
     llm = LLM(model="gpt-4o-mini", api_key=SecretStr("test-key"), usage_id="test-llm")
     original_agent = Agent(llm=llm, tools=[])
-    different_agent = DifferentAgent()
+    different_agent = _DifferentAgentForVerifyTest()
 
     with pytest.raises(ValueError, match="Cannot load from persisted"):
         original_agent.verify(different_agent)

diff --git a/tests/sdk/conversation/test_visualizer.py b/tests/sdk/conversation/test_visualizer.py
@@ -21,6 +21,8 @@
     SystemPromptEvent,
     UserRejectObservation,
 )
+from openhands.sdk.event.base import Event
+from openhands.sdk.event.types import SourceType
 from openhands.sdk.llm import (
     Message,
     MessageToolCall,
@@ -33,6 +35,18 @@
     from openhands.sdk.conversation.impl.local_conversation import LocalConversation
 
 
+class _UnknownEventForVisualizerTest(Event):
+    """Unknown event type for testing fallback visualization.
+
+    This class is defined at module level (rather than inside a test function) to
+    ensure it's importable by Pydantic during serialization/deserialization.
+    Defining it inside a test function causes test pollution when running tests
+    in parallel with pytest-xdist.
+    """
+
+    source: SourceType = "agent"
+
+
 class VisualizerMockAction(Action):
     """Mock action for testing."""
 
@@ -457,18 +471,12 @@ def test_metrics_abbreviation_formatting():
 
 def test_event_base_fallback_visualize():
     """Test that Event provides fallback visualization."""
-    from openhands.sdk.event.base import Event
-    from openhands.sdk.event.types import SourceType
-
-    class UnknownEvent(Event):
-        source: SourceType = "agent"
-
-    event = UnknownEvent()
+    event = _UnknownEventForVisualizerTest()
     result = event.visualize
     assert isinstance(result, Text)
 
     text_content = result.plain
-    assert "Unknown event type: UnknownEvent" in text_content
+    assert "Unknown event type: _UnknownEventForVisualizerTest" in text_content
 
 
 def test_visualizer_conversation_state_update_event_skipped():

diff --git a/tests/sdk/event/test_event_immutability.py b/tests/sdk/event/test_event_immutability.py
@@ -75,13 +75,21 @@ def create(cls, *args, **kwargs) -> Sequence[Self]:
         ]
 
 
-def test_event_base_is_frozen():
-    """Test that Event instances are frozen and cannot be modified."""
+class _TestEventForImmutability(Event):
+    """Test event class for immutability tests.
+
+    This class is defined at module level (rather than inside a test function) to
+    ensure it's importable by Pydantic during serialization/deserialization.
+    Defining it inside a test function causes test pollution when running tests
+    in parallel with pytest-xdist.
+    """
 
-    class TestEvent(Event):
-        test_field: str = "test_value"
+    test_field: str = "test_value"
 
-    event = TestEvent(source="agent", test_field="initial_value")
+
+def test_event_base_is_frozen():
+    """Test that Event instances are frozen and cannot be modified."""
+    event = _TestEventForImmutability(source="agent", test_field="initial_value")
 
     # Test that we cannot modify any field
     with pytest.raises(Exception):  # Pydantic raises ValidationError for frozen models

diff --git a/tests/sdk/llm/test_reasoning_content.py b/tests/sdk/llm/test_reasoning_content.py
@@ -2,6 +2,20 @@
 
 from litellm.types.utils import Choices, Message as LiteLLMMessage, ModelResponse, Usage
 
+from openhands.sdk.tool import Action
+
+
+class _TestActionForReasoningContent(Action):
+    """A test action used for testing reasoning content in ActionEvent.
+
+    This class is defined at module level (rather than inside a test function) to
+    ensure it's importable by Pydantic during serialization/deserialization.
+    Defining it inside a test function causes test pollution when running tests
+    in parallel with pytest-xdist.
+    """
+
+    action: str = "test"
+
 
 def create_mock_response(content: str = "Test response", response_id: str = "test-id"):
     """Helper function to create properly structured mock responses."""
@@ -113,11 +127,6 @@ def test_action_event_with_reasoning_content():
         MessageToolCall,
         TextContent,
     )
-    from openhands.sdk.tool import Action
-
-    # Create a simple action for testing
-    class TestAction(Action):
-        action: str = "test"
 
     # Create a tool call
     tool_call = MessageToolCall(
@@ -129,7 +138,7 @@ class TestAction(Action):
 
     action_event = ActionEvent(
         thought=[TextContent(text="I need to test this")],
-        action=TestAction(),
+        action=_TestActionForReasoningContent(),
         tool_name="test_tool",
         tool_call_id="test-id",
         tool_call=tool_call,

diff --git a/tests/sdk/mcp/test_mcp_action_serialization.py b/tests/sdk/mcp/test_mcp_action_serialization.py
@@ -4,6 +4,18 @@
 from openhands.sdk.mcp import MCPToolAction
 
 
+class _ChildMCPToolActionForSerialization(MCPToolAction):
+    """Child MCP action for testing declared fields with data.
+
+    This class is defined at module level (rather than inside a test function) to
+    ensure it's importable by Pydantic during serialization/deserialization.
+    Defining it inside a test function causes test pollution when running tests
+    in parallel with pytest-xdist.
+    """
+
+    declared: int
+
+
 def test_data_field_emerges_from_to_mcp_arguments():
     """Test that data field contents are returned by to_mcp_arguments."""
     data = {"new_field": "value", "dynamic": 123}
@@ -18,12 +30,8 @@ def test_data_field_emerges_from_to_mcp_arguments():
 
 def test_declared_child_fields_with_data():
     """Test that child classes work with the data field."""
-
-    class Child(MCPToolAction):
-        declared: int
-
     data = {"tool_param": "value"}
-    a = Child(declared=7, data=data)
+    a = _ChildMCPToolActionForSerialization(declared=7, data=data)
     out = a.to_mcp_arguments()
 
     # Only data field contents should be in MCP arguments

diff --git a/tests/sdk/tool/test_schema_immutability.py b/tests/sdk/tool/test_schema_immutability.py
@@ -53,6 +53,34 @@ def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
         return [TextContent(text=f"Result: {self.result}, Status: {self.status}")]
 
 
+class _SchemaImmutabilityCustomAction(Action):
+    """Custom action for testing schema inheritance immutability.
+
+    This class is defined at module level (rather than inside a test function) to
+    ensure it's importable by Pydantic during serialization/deserialization.
+    Defining it inside a test function causes test pollution when running tests
+    in parallel with pytest-xdist.
+    """
+
+    custom_field: str = Field(description="Custom field")
+
+
+class _SchemaImmutabilityCustomObservation(Observation):
+    """Custom observation for testing schema inheritance immutability.
+
+    This class is defined at module level (rather than inside a test function) to
+    ensure it's importable by Pydantic during serialization/deserialization.
+    Defining it inside a test function causes test pollution when running tests
+    in parallel with pytest-xdist.
+    """
+
+    custom_result: str = Field(description="Custom result")
+
+    @property
+    def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
+        return [TextContent(text=self.custom_result)]
+
+
 def test_schema_is_frozen():
     """Test that Schema instances are frozen and cannot be modified."""
     schema = MockSchema(name="test", value=42)
@@ -273,22 +301,11 @@ def test_all_schema_classes_are_frozen():
 
 def test_schema_inheritance_preserves_immutability():
     """Test that classes inheriting from schema bases are also immutable."""
-
-    class SchemaImmutabilityCustomAction(Action):
-        custom_field: str = Field(description="Custom field")
-
-    class SchemaImmutabilityCustomObservation(Observation):
-        custom_result: str = Field(description="Custom result")
-
-        @property
-        def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
-            return [TextContent(text=self.custom_result)]
-
     # Test that custom classes are also frozen
-    custom_action = SchemaImmutabilityCustomAction(custom_field="test")
+    custom_action = _SchemaImmutabilityCustomAction(custom_field="test")
     with pytest.raises(ValidationError, match="Instance is frozen"):
         custom_action.custom_field = "changed"
 
-    custom_obs = SchemaImmutabilityCustomObservation(custom_result="test")
+    custom_obs = _SchemaImmutabilityCustomObservation(custom_result="test")
     with pytest.raises(ValidationError, match="Instance is frozen"):
         custom_obs.custom_result = "changed"