From 6819bffb817718902e663448065690f1c2222f4d Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Fri, 9 Jan 2026 00:25:42 +0000
Subject: [PATCH 1/8] fix(tests): Move local test classes to module level for
 pytest-xdist compatibility

Move local class definitions from inside test functions to module level to
prevent test pollution when running tests in parallel with pytest-xdist.

When classes like DifferentAgent, TestAction, etc. are defined inside test
functions, they get registered with Pydantic's discriminated union system but
are not importable, causing failures in other tests running in the same worker.

Files changed:
- test_state_serialization.py: Move _DifferentAgentForVerifyTest to module level
- test_reasoning_content.py: Move _TestActionForReasoningContent to module level
- test_schema_immutability.py: Move _SchemaImmutabilityCustomAction and
  _SchemaImmutabilityCustomObservation to module level

This fixes test pollution that was masked by --forked flag, enabling the
switch to pytest-xdist for accurate coverage reporting.

Fixes #1656

Co-authored-by: openhands <openhands@all-hands.dev>
---
 .../local/test_state_serialization.py         | 61 +++++++++++--------
 tests/sdk/llm/test_reasoning_content.py       | 20 ++++--
 tests/sdk/tool/test_schema_immutability.py    | 41 +++++++++----
 3 files changed, 76 insertions(+), 46 deletions(-)

diff --git a/tests/sdk/conversation/local/test_state_serialization.py b/tests/sdk/conversation/local/test_state_serialization.py
index eaee970b08..47a82fa9b9 100644
--- a/tests/sdk/conversation/local/test_state_serialization.py
+++ b/tests/sdk/conversation/local/test_state_serialization.py
@@ -9,11 +9,16 @@
 from pydantic import SecretStr, ValidationError
 
 from openhands.sdk import Agent, Conversation
+from openhands.sdk.agent.base import AgentBase
 from openhands.sdk.conversation.impl.local_conversation import LocalConversation
 from openhands.sdk.conversation.state import (
     ConversationExecutionStatus,
     ConversationState,
 )
+from openhands.sdk.conversation.types import (
+    ConversationCallbackType,
+    ConversationTokenCallbackType,
+)
 from openhands.sdk.event.llm_convertible import MessageEvent, SystemPromptEvent
 from openhands.sdk.llm import LLM, Message, TextContent
 from openhands.sdk.llm.llm_registry import RegistryEvent
@@ -21,6 +26,34 @@
 from openhands.sdk.workspace import LocalWorkspace
 
 
+class _DifferentAgentForVerifyTest(AgentBase):
+    """A different agent class used to test Agent.verify() rejects class mismatches.
+
+    This class is defined at module level (rather than inside a test function) to ensure
+    it's importable by Pydantic during serialization/deserialization. Defining it inside
+    a test function causes test pollution when running tests in parallel with pytest-xdist.
+    """
+
+    def __init__(self):
+        llm = LLM(
+            model="gpt-4o-mini",
+            api_key=SecretStr("test-key"),
+            usage_id="test-llm",
+        )
+        super().__init__(llm=llm, tools=[])
+
+    def init_state(self, state, on_event):
+        pass
+
+    def step(
+        self,
+        conversation,
+        on_event: ConversationCallbackType,
+        on_token: ConversationTokenCallbackType | None = None,
+    ):
+        pass
+
+
 def test_conversation_state_basic_serialization():
     """Test basic ConversationState serialization and deserialization."""
     llm = LLM(model="gpt-4o-mini", api_key=SecretStr("test-key"), usage_id="test-llm")
@@ -490,35 +523,9 @@ def test_agent_verify_allows_different_llm():
 
 def test_agent_verify_different_class_raises_error():
     """Test that agent.verify() raises error for different agent classes."""
-    from openhands.sdk.agent.base import AgentBase
-    from openhands.sdk.conversation.types import (
-        ConversationCallbackType,
-        ConversationTokenCallbackType,
-    )
-
-    class DifferentAgent(AgentBase):
-        def __init__(self):
-            llm = LLM(
-                model="gpt-4o-mini",
-                api_key=SecretStr("test-key"),
-                usage_id="test-llm",
-            )
-            super().__init__(llm=llm, tools=[])
-
-        def init_state(self, state, on_event):
-            pass
-
-        def step(
-            self,
-            conversation,
-            on_event: ConversationCallbackType,
-            on_token: ConversationTokenCallbackType | None = None,
-        ):
-            pass
-
     llm = LLM(model="gpt-4o-mini", api_key=SecretStr("test-key"), usage_id="test-llm")
     original_agent = Agent(llm=llm, tools=[])
-    different_agent = DifferentAgent()
+    different_agent = _DifferentAgentForVerifyTest()
 
     with pytest.raises(ValueError, match="Cannot load from persisted"):
         original_agent.verify(different_agent)
diff --git a/tests/sdk/llm/test_reasoning_content.py b/tests/sdk/llm/test_reasoning_content.py
index 41cedcbfb9..0fd5ddccbd 100644
--- a/tests/sdk/llm/test_reasoning_content.py
+++ b/tests/sdk/llm/test_reasoning_content.py
@@ -2,6 +2,19 @@
 
 from litellm.types.utils import Choices, Message as LiteLLMMessage, ModelResponse, Usage
 
+from openhands.sdk.tool import Action
+
+
+class _TestActionForReasoningContent(Action):
+    """A test action used for testing reasoning content in ActionEvent.
+
+    This class is defined at module level (rather than inside a test function) to ensure
+    it's importable by Pydantic during serialization/deserialization. Defining it inside
+    a test function causes test pollution when running tests in parallel with pytest-xdist.
+    """
+
+    action: str = "test"
+
 
 def create_mock_response(content: str = "Test response", response_id: str = "test-id"):
     """Helper function to create properly structured mock responses."""
@@ -113,11 +126,6 @@ def test_action_event_with_reasoning_content():
         MessageToolCall,
         TextContent,
     )
-    from openhands.sdk.tool import Action
-
-    # Create a simple action for testing
-    class TestAction(Action):
-        action: str = "test"
 
     # Create a tool call
     tool_call = MessageToolCall(
@@ -129,7 +137,7 @@ class TestAction(Action):
 
     action_event = ActionEvent(
         thought=[TextContent(text="I need to test this")],
-        action=TestAction(),
+        action=_TestActionForReasoningContent(),
         tool_name="test_tool",
         tool_call_id="test-id",
         tool_call=tool_call,
diff --git a/tests/sdk/tool/test_schema_immutability.py b/tests/sdk/tool/test_schema_immutability.py
index d6df9d129a..8a9431a448 100644
--- a/tests/sdk/tool/test_schema_immutability.py
+++ b/tests/sdk/tool/test_schema_immutability.py
@@ -53,6 +53,32 @@ def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
         return [TextContent(text=f"Result: {self.result}, Status: {self.status}")]
 
 
+class _SchemaImmutabilityCustomAction(Action):
+    """Custom action for testing schema inheritance immutability.
+
+    This class is defined at module level (rather than inside a test function) to ensure
+    it's importable by Pydantic during serialization/deserialization. Defining it inside
+    a test function causes test pollution when running tests in parallel with pytest-xdist.
+    """
+
+    custom_field: str = Field(description="Custom field")
+
+
+class _SchemaImmutabilityCustomObservation(Observation):
+    """Custom observation for testing schema inheritance immutability.
+
+    This class is defined at module level (rather than inside a test function) to ensure
+    it's importable by Pydantic during serialization/deserialization. Defining it inside
+    a test function causes test pollution when running tests in parallel with pytest-xdist.
+    """
+
+    custom_result: str = Field(description="Custom result")
+
+    @property
+    def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
+        return [TextContent(text=self.custom_result)]
+
+
 def test_schema_is_frozen():
     """Test that Schema instances are frozen and cannot be modified."""
     schema = MockSchema(name="test", value=42)
@@ -273,22 +299,11 @@ def test_all_schema_classes_are_frozen():
 
 def test_schema_inheritance_preserves_immutability():
     """Test that classes inheriting from schema bases are also immutable."""
-
-    class SchemaImmutabilityCustomAction(Action):
-        custom_field: str = Field(description="Custom field")
-
-    class SchemaImmutabilityCustomObservation(Observation):
-        custom_result: str = Field(description="Custom result")
-
-        @property
-        def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
-            return [TextContent(text=self.custom_result)]
-
     # Test that custom classes are also frozen
-    custom_action = SchemaImmutabilityCustomAction(custom_field="test")
+    custom_action = _SchemaImmutabilityCustomAction(custom_field="test")
     with pytest.raises(ValidationError, match="Instance is frozen"):
         custom_action.custom_field = "changed"
 
-    custom_obs = SchemaImmutabilityCustomObservation(custom_result="test")
+    custom_obs = _SchemaImmutabilityCustomObservation(custom_result="test")
     with pytest.raises(ValidationError, match="Instance is frozen"):
         custom_obs.custom_result = "changed"

From eaf9ddeab0d2c13302409c4247808ec6557ee23b Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Fri, 9 Jan 2026 00:33:01 +0000
Subject: [PATCH 2/8] fix: wrap long lines in docstrings to pass ruff lint

Co-authored-by: openhands <openhands@all-hands.dev>
---
 .../conversation/local/test_state_serialization.py |  7 ++++---
 tests/sdk/llm/test_reasoning_content.py            |  7 ++++---
 tests/sdk/tool/test_schema_immutability.py         | 14 ++++++++------
 3 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/tests/sdk/conversation/local/test_state_serialization.py b/tests/sdk/conversation/local/test_state_serialization.py
index 47a82fa9b9..fd6fe2765d 100644
--- a/tests/sdk/conversation/local/test_state_serialization.py
+++ b/tests/sdk/conversation/local/test_state_serialization.py
@@ -29,9 +29,10 @@
 class _DifferentAgentForVerifyTest(AgentBase):
     """A different agent class used to test Agent.verify() rejects class mismatches.
 
-    This class is defined at module level (rather than inside a test function) to ensure
-    it's importable by Pydantic during serialization/deserialization. Defining it inside
-    a test function causes test pollution when running tests in parallel with pytest-xdist.
+    This class is defined at module level (rather than inside a test function) to
+    ensure it's importable by Pydantic during serialization/deserialization.
+    Defining it inside a test function causes test pollution when running tests
+    in parallel with pytest-xdist.
     """
 
     def __init__(self):
diff --git a/tests/sdk/llm/test_reasoning_content.py b/tests/sdk/llm/test_reasoning_content.py
index 0fd5ddccbd..39d6dddb7a 100644
--- a/tests/sdk/llm/test_reasoning_content.py
+++ b/tests/sdk/llm/test_reasoning_content.py
@@ -8,9 +8,10 @@
 class _TestActionForReasoningContent(Action):
     """A test action used for testing reasoning content in ActionEvent.
 
-    This class is defined at module level (rather than inside a test function) to ensure
-    it's importable by Pydantic during serialization/deserialization. Defining it inside
-    a test function causes test pollution when running tests in parallel with pytest-xdist.
+    This class is defined at module level (rather than inside a test function) to
+    ensure it's importable by Pydantic during serialization/deserialization.
+    Defining it inside a test function causes test pollution when running tests
+    in parallel with pytest-xdist.
     """
 
     action: str = "test"
diff --git a/tests/sdk/tool/test_schema_immutability.py b/tests/sdk/tool/test_schema_immutability.py
index 8a9431a448..eff2aca134 100644
--- a/tests/sdk/tool/test_schema_immutability.py
+++ b/tests/sdk/tool/test_schema_immutability.py
@@ -56,9 +56,10 @@ def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
 class _SchemaImmutabilityCustomAction(Action):
     """Custom action for testing schema inheritance immutability.
 
-    This class is defined at module level (rather than inside a test function) to ensure
-    it's importable by Pydantic during serialization/deserialization. Defining it inside
-    a test function causes test pollution when running tests in parallel with pytest-xdist.
+    This class is defined at module level (rather than inside a test function) to
+    ensure it's importable by Pydantic during serialization/deserialization.
+    Defining it inside a test function causes test pollution when running tests
+    in parallel with pytest-xdist.
     """
 
     custom_field: str = Field(description="Custom field")
@@ -67,9 +68,10 @@ class _SchemaImmutabilityCustomAction(Action):
 class _SchemaImmutabilityCustomObservation(Observation):
     """Custom observation for testing schema inheritance immutability.
 
-    This class is defined at module level (rather than inside a test function) to ensure
-    it's importable by Pydantic during serialization/deserialization. Defining it inside
-    a test function causes test pollution when running tests in parallel with pytest-xdist.
+    This class is defined at module level (rather than inside a test function) to
+    ensure it's importable by Pydantic during serialization/deserialization.
+    Defining it inside a test function causes test pollution when running tests
+    in parallel with pytest-xdist.
     """
 
     custom_result: str = Field(description="Custom result")

From 6573d5d03662266cbbeed7975b522e1af46e4702 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Fri, 9 Jan 2026 05:36:46 +0000
Subject: [PATCH 3/8] ci: Replace --forked with pytest-xdist for accurate
 coverage

The --forked flag prevents coverage collection from child processes,
resulting in artificially low coverage reports (near 0% in some cases).

Switching to pytest-xdist (-n auto) provides:
- Parallel test execution (same benefit as --forked)
- Proper coverage collection from all worker processes
- Better test isolation through process separation

This change affects sdk-tests, tools-tests, and agent-server-tests.

Closes #1656

Co-authored-by: openhands <openhands@all-hands.dev>
---
 .github/workflows/tests.yml | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index a125d720f8..dbb70ab60e 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -57,8 +57,10 @@ jobs:
               run: |
                   # Clean up any existing coverage file
                   rm -f .coverage
+                  # Use pytest-xdist (-n auto) for parallel execution with proper
+                  # coverage collection. --forked prevents coverage from child processes.
                   CI=true uv run python -m pytest -vvs \
-                    --forked \
+                    -n auto \
                     --cov=openhands-sdk \
                     --cov-report=term-missing \
                     --cov-fail-under=0 \
@@ -112,8 +114,10 @@ jobs:
               run: |
                   # Clean up any existing coverage file
                   rm -f .coverage
+                  # Use pytest-xdist (-n auto) for parallel execution with proper
+                  # coverage collection. --forked prevents coverage from child processes.
                   CI=true uv run python -m pytest -vvs \
-                    --forked \
+                    -n auto \
                     --cov=openhands-tools \
                     --cov-report=term-missing \
                     --cov-fail-under=0 \
@@ -166,8 +170,10 @@ jobs:
               run: |
                   # Clean up any existing coverage file
                   rm -f .coverage
+                  # Use pytest-xdist (-n auto) for parallel execution with proper
+                  # coverage collection. --forked prevents coverage from child processes.
                   CI=true uv run python -m pytest -vvs \
-                    --forked \
+                    -n auto \
                     --cov=openhands-agent-server \
                     --cov-report=term-missing \
                     --cov-fail-under=0 \

From af48df2582f5ad9f83612158c507d682c0f53272 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Fri, 9 Jan 2026 05:42:43 +0000
Subject: [PATCH 4/8] fix(tests): Move TestEvent to module level for
 pytest-xdist compatibility

Another local class in test_event_immutability.py was causing test pollution
when running tests in parallel. Moved _TestEventForImmutability to module level.

Co-authored-by: openhands <openhands@all-hands.dev>
---
 tests/sdk/event/test_event_immutability.py | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/tests/sdk/event/test_event_immutability.py b/tests/sdk/event/test_event_immutability.py
index 8a231fd92d..4a8ece8742 100644
--- a/tests/sdk/event/test_event_immutability.py
+++ b/tests/sdk/event/test_event_immutability.py
@@ -75,13 +75,21 @@ def create(cls, *args, **kwargs) -> Sequence[Self]:
         ]
 
 
-def test_event_base_is_frozen():
-    """Test that Event instances are frozen and cannot be modified."""
+class _TestEventForImmutability(Event):
+    """Test event class for immutability tests.
+
+    This class is defined at module level (rather than inside a test function) to
+    ensure it's importable by Pydantic during serialization/deserialization.
+    Defining it inside a test function causes test pollution when running tests
+    in parallel with pytest-xdist.
+    """
 
-    class TestEvent(Event):
-        test_field: str = "test_value"
+    test_field: str = "test_value"
 
-    event = TestEvent(source="agent", test_field="initial_value")
+
+def test_event_base_is_frozen():
+    """Test that Event instances are frozen and cannot be modified."""
+    event = _TestEventForImmutability(source="agent", test_field="initial_value")
 
     # Test that we cannot modify any field
     with pytest.raises(Exception):  # Pydantic raises ValidationError for frozen models

From 933b552194364938418a840d6c77653d14840d07 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Fri, 9 Jan 2026 05:58:46 +0000
Subject: [PATCH 5/8] fix(tests): Move remaining local classes to module level
 for pytest-xdist

Additional local classes causing test pollution when running in parallel:
- _UnknownEventForVisualizerTest in test_visualizer.py
- _NestedActionForMalformedArgs in test_fix_malformed_tool_arguments.py
- _ChildMCPToolActionForSerialization in test_mcp_action_serialization.py

Co-authored-by: openhands <openhands@all-hands.dev>
---
 .../test_fix_malformed_tool_arguments.py      | 22 +++++++++++------
 tests/sdk/conversation/test_visualizer.py     | 24 ++++++++++++-------
 .../sdk/mcp/test_mcp_action_serialization.py  | 18 ++++++++++----
 3 files changed, 44 insertions(+), 20 deletions(-)

diff --git a/tests/sdk/agent/test_fix_malformed_tool_arguments.py b/tests/sdk/agent/test_fix_malformed_tool_arguments.py
index 6a2ef28e94..9d5dda5e66 100644
--- a/tests/sdk/agent/test_fix_malformed_tool_arguments.py
+++ b/tests/sdk/agent/test_fix_malformed_tool_arguments.py
@@ -44,6 +44,19 @@ class JsonDecodingOptionalAction(Action):
     config: dict[str, int] | None = Field(default=None, description="Optional dict")
 
 
+class _NestedActionForMalformedArgs(Action):
+    """Action with nested structures for testing JSON decoding.
+
+    This class is defined at module level (rather than inside a test function) to
+    ensure it's importable by Pydantic during serialization/deserialization.
+    Defining it inside a test function causes test pollution when running tests
+    in parallel with pytest-xdist.
+    """
+
+    nested_list: list[list[int]] = Field(description="Nested list")
+    nested_dict: dict[str, dict[str, str]] = Field(description="Nested dict")
+
+
 def test_decode_json_string_list():
     """Test that JSON string lists are decoded to native lists."""
     data = {
@@ -201,17 +214,12 @@ def test_json_string_with_wrong_type_rejected():
 
 def test_nested_structures():
     """Test that nested lists and dicts in JSON strings work."""
-
-    class NestedAction(Action):
-        nested_list: list[list[int]] = Field(description="Nested list")
-        nested_dict: dict[str, dict[str, str]] = Field(description="Nested dict")
-
     data = {
         "nested_list": "[[1, 2], [3, 4]]",
         "nested_dict": '{"outer": {"inner": "value"}}',
     }
-    fixed_data = fix_malformed_tool_arguments(data, NestedAction)
-    action = NestedAction.model_validate(fixed_data)
+    fixed_data = fix_malformed_tool_arguments(data, _NestedActionForMalformedArgs)
+    action = _NestedActionForMalformedArgs.model_validate(fixed_data)
 
     assert action.nested_list == [[1, 2], [3, 4]]
     assert action.nested_dict == {"outer": {"inner": "value"}}
diff --git a/tests/sdk/conversation/test_visualizer.py b/tests/sdk/conversation/test_visualizer.py
index c88544ac1f..373bb01eeb 100644
--- a/tests/sdk/conversation/test_visualizer.py
+++ b/tests/sdk/conversation/test_visualizer.py
@@ -21,6 +21,8 @@
     SystemPromptEvent,
     UserRejectObservation,
 )
+from openhands.sdk.event.base import Event
+from openhands.sdk.event.types import SourceType
 from openhands.sdk.llm import (
     Message,
     MessageToolCall,
@@ -33,6 +35,18 @@
     from openhands.sdk.conversation.impl.local_conversation import LocalConversation
 
 
+class _UnknownEventForVisualizerTest(Event):
+    """Unknown event type for testing fallback visualization.
+
+    This class is defined at module level (rather than inside a test function) to
+    ensure it's importable by Pydantic during serialization/deserialization.
+    Defining it inside a test function causes test pollution when running tests
+    in parallel with pytest-xdist.
+    """
+
+    source: SourceType = "agent"
+
+
 class VisualizerMockAction(Action):
     """Mock action for testing."""
 
@@ -457,18 +471,12 @@ def test_metrics_abbreviation_formatting():
 
 def test_event_base_fallback_visualize():
     """Test that Event provides fallback visualization."""
-    from openhands.sdk.event.base import Event
-    from openhands.sdk.event.types import SourceType
-
-    class UnknownEvent(Event):
-        source: SourceType = "agent"
-
-    event = UnknownEvent()
+    event = _UnknownEventForVisualizerTest()
     result = event.visualize
     assert isinstance(result, Text)
 
     text_content = result.plain
-    assert "Unknown event type: UnknownEvent" in text_content
+    assert "Unknown event type: _UnknownEventForVisualizerTest" in text_content
 
 
 def test_visualizer_conversation_state_update_event_skipped():
diff --git a/tests/sdk/mcp/test_mcp_action_serialization.py b/tests/sdk/mcp/test_mcp_action_serialization.py
index 288953b50f..502686470c 100644
--- a/tests/sdk/mcp/test_mcp_action_serialization.py
+++ b/tests/sdk/mcp/test_mcp_action_serialization.py
@@ -4,6 +4,18 @@
 from openhands.sdk.mcp import MCPToolAction
 
 
+class _ChildMCPToolActionForSerialization(MCPToolAction):
+    """Child MCP action for testing declared fields with data.
+
+    This class is defined at module level (rather than inside a test function) to
+    ensure it's importable by Pydantic during serialization/deserialization.
+    Defining it inside a test function causes test pollution when running tests
+    in parallel with pytest-xdist.
+    """
+
+    declared: int
+
+
 def test_data_field_emerges_from_to_mcp_arguments():
     """Test that data field contents are returned by to_mcp_arguments."""
     data = {"new_field": "value", "dynamic": 123}
@@ -18,12 +30,8 @@ def test_data_field_emerges_from_to_mcp_arguments():
 
 def test_declared_child_fields_with_data():
     """Test that child classes work with the data field."""
-
-    class Child(MCPToolAction):
-        declared: int
-
     data = {"tool_param": "value"}
-    a = Child(declared=7, data=data)
+    a = _ChildMCPToolActionForSerialization(declared=7, data=data)
     out = a.to_mcp_arguments()
 
     # Only data field contents should be in MCP arguments

From f6af5c2a1980d10ebb19af921fc50b9dbce2cd4b Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Fri, 9 Jan 2026 06:51:37 +0000
Subject: [PATCH 6/8] ci: bump tools-tests timeout to 25 minutes

Co-authored-by: openhands <openhands@all-hands.dev>
---
 .github/workflows/tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index dbb70ab60e..37e6adc06a 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -82,7 +82,7 @@ jobs:
 
     tools-tests:
         runs-on: blacksmith-2vcpu-ubuntu-2404
-        timeout-minutes: 15
+        timeout-minutes: 25
         steps:
             - name: Checkout
               uses: actions/checkout@v5

From 911246dcf328a21fd1ae7565aa6eb4c316b7a481 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Fri, 9 Jan 2026 06:59:05 +0000
Subject: [PATCH 7/8] ci: revert tools-tests to --forked due to terminal test
 conflicts

Terminal tests use shared paths (/tmp/test_dir) and subprocess management
that conflicts when run in parallel with pytest-xdist.

Co-authored-by: openhands <openhands@all-hands.dev>
---
 .github/workflows/tests.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 37e6adc06a..4bb55c167f 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -114,10 +114,10 @@ jobs:
               run: |
                   # Clean up any existing coverage file
                   rm -f .coverage
-                  # Use pytest-xdist (-n auto) for parallel execution with proper
-                  # coverage collection. --forked prevents coverage from child processes.
+                  # Use --forked for tools tests due to terminal test conflicts
+                  # when running in parallel (shared /tmp paths, subprocess management)
                   CI=true uv run python -m pytest -vvs \
-                    -n auto \
+                    --forked \
                     --cov=openhands-tools \
                     --cov-report=term-missing \
                     --cov-fail-under=0 \

From 00d062d1a2ad6d7c32f4c2b2b69bafb563841e52 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Fri, 9 Jan 2026 07:10:37 +0000
Subject: [PATCH 8/8] ci: revert tools-tests timeout back to 15 minutes

Co-authored-by: openhands <openhands@all-hands.dev>
---
 .github/workflows/tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 4bb55c167f..5031d7106a 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -82,7 +82,7 @@ jobs:
 
     tools-tests:
         runs-on: blacksmith-2vcpu-ubuntu-2404
-        timeout-minutes: 25
+        timeout-minutes: 15
         steps:
             - name: Checkout
               uses: actions/checkout@v5