-
Notifications
You must be signed in to change notification settings - Fork 109
Fix verify method to include builtin tools in event check #1710
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -384,6 +384,13 @@ def verify( | |||||||||||||||||||||||||||||||
| if isinstance(event, ActionEvent) and event.tool_name | ||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||
| # Add builtin tool names from include_default_tools | ||||||||||||||||||||||||||||||||
| # These are runtime names like 'finish', 'think' | ||||||||||||||||||||||||||||||||
| for tool_class_name in self.include_default_tools: | ||||||||||||||||||||||||||||||||
| tool_class = BUILT_IN_TOOL_CLASSES.get(tool_class_name) | ||||||||||||||||||||||||||||||||
| if tool_class is not None: | ||||||||||||||||||||||||||||||||
| runtime_names.add(tool_class.name) | ||||||||||||||||||||||||||||||||
|
Comment on lines
+387
to
+392
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🟡 Suggestion: The fix is correct, but consider adding a comment about naming consistency. The
This works because events only contain runtime names, but it's conceptually inconsistent. A clarifying comment would help future maintainers:
Suggested change
|
||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||
| # Only require tools that were actually used in history. | ||||||||||||||||||||||||||||||||
| missing_used_tools = used_tools - runtime_names | ||||||||||||||||||||||||||||||||
| if missing_used_tools: | ||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1165,3 +1165,150 @@ def test_conversation_state_cipher_mismatch(): | |
| "API_KEY" | ||
| ].get_value() | ||
| assert api_key_value is None | ||
|
|
||
|
|
||
| def test_agent_verify_builtin_tools_included_in_check(): | ||
| """Test that verify() correctly includes builtin tools in its check. | ||
|
|
||
| Builtin tools configured via include_default_tools are now correctly | ||
| recognized by their runtime names (e.g., 'finish', 'think') when | ||
| checking against event history. | ||
| """ | ||
| from openhands.sdk.agent import AgentBase | ||
| from openhands.sdk.event import ActionEvent | ||
| from openhands.sdk.llm import MessageToolCall | ||
| from openhands.sdk.tool import Tool | ||
|
|
||
| llm = LLM(model="gpt-4o-mini", api_key=SecretStr("test-key"), usage_id="test-llm") | ||
|
|
||
| # Create persisted agent with TerminalTool | ||
| persisted_agent_obj = Agent( | ||
| llm=llm, | ||
| tools=[Tool(name="TerminalTool")], | ||
| include_default_tools=["FinishTool"], | ||
| ) | ||
|
|
||
| # Events from the session - only 'finish' was used (a builtin tool) | ||
| events_with_finish = [ | ||
| ActionEvent( | ||
| source="agent", | ||
| thought=[], | ||
| tool_name="finish", # Runtime name, NOT 'FinishTool' | ||
| tool_call_id="call_123", | ||
| tool_call=MessageToolCall( | ||
| id="call_123", | ||
| name="finish", | ||
| arguments='{"message": "Done!"}', | ||
| origin="completion", | ||
| ), | ||
| llm_response_id="resp_123", | ||
| ), | ||
| ] | ||
|
|
||
| # Serialize and deserialize to simulate loading from persistence | ||
| serialized = persisted_agent_obj.model_dump_json() | ||
| persisted_agent = AgentBase.model_validate_json(serialized) | ||
|
|
||
| # Create a runtime agent with DIFFERENT tools (FileEditorTool instead of | ||
| # TerminalTool) but still including FinishTool builtin | ||
| runtime_agent = Agent( | ||
| llm=llm, | ||
| tools=[Tool(name="FileEditorTool")], # Different from persisted! | ||
| include_default_tools=["FinishTool"], | ||
| ) | ||
|
|
||
| # This should PASS since 'finish' is a builtin tool available via | ||
| # include_default_tools. The verify method adds builtin tool runtime names | ||
| # to the check. | ||
| result = runtime_agent.verify(persisted_agent, events=events_with_finish) | ||
| assert result is runtime_agent | ||
|
|
||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🟠 Important: Missing test coverage for the early return bug. All three tests have different tool specs between persisted and runtime agents (TerminalTool vs FileEditorTool), so they never trigger the early return at line 375-376 of base.py. Missing scenario that exposes the bug:
This case currently has a bug where verification incorrectly passes (see review body). Suggested additional test: def test_agent_verify_same_tools_different_builtins_fails():
"""Test that verify fails when tools match but required builtin is missing."""
llm = LLM(model="gpt-4o-mini", api_key=SecretStr("test-key"), usage_id="test-llm")
# Persisted agent has FinishTool
persisted_agent_obj = Agent(
llm=llm,
tools=[Tool(name="TerminalTool")],
include_default_tools=["FinishTool"],
)
# Events show 'finish' was used
events_with_finish = [
ActionEvent(
source="agent",
thought=[],
tool_name="finish",
tool_call_id="call_123",
tool_call=MessageToolCall(
id="call_123",
name="finish",
arguments='{"message": "Done!"}',
origin="completion",
),
llm_response_id="resp_123",
),
]
serialized = persisted_agent_obj.model_dump_json()
persisted_agent = AgentBase.model_validate_json(serialized)
# Runtime agent MISSING FinishTool
runtime_agent = Agent(
llm=llm,
tools=[Tool(name="TerminalTool")], # Same tool!
include_default_tools=[], # Missing FinishTool!
)
# Should fail but currently passes due to early return bug
with pytest.raises(ValueError, match="missing from runtime.*finish"):
runtime_agent.verify(persisted_agent, events=events_with_finish)
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🟠 Important: Missing test coverage for the early return bug. All three tests have different tool specs between persisted and runtime agents (TerminalTool vs FileEditorTool), so they never trigger the early return at line 375-376 of base.py. Missing scenario that exposes the bug:
This case currently has a bug where verification incorrectly passes (see review body). Suggested additional test: def test_agent_verify_same_tools_different_builtins_fails():
"""Test that verify fails when tools match but required builtin is missing."""
llm = LLM(model="gpt-4o-mini", api_key=SecretStr("test-key"), usage_id="test-llm")
# Persisted agent has FinishTool
persisted_agent_obj = Agent(
llm=llm,
tools=[Tool(name="TerminalTool")],
include_default_tools=["FinishTool"],
)
# Events show 'finish' was used
events_with_finish = [
ActionEvent(
source="agent",
thought=[],
tool_name="finish",
tool_call_id="call_123",
tool_call=MessageToolCall(
id="call_123",
name="finish",
arguments='{"message": "Done!"}',
origin="completion",
),
llm_response_id="resp_123",
),
]
serialized = persisted_agent_obj.model_dump_json()
persisted_agent = AgentBase.model_validate_json(serialized)
# Runtime agent MISSING FinishTool
runtime_agent = Agent(
llm=llm,
tools=[Tool(name="TerminalTool")], # Same tool!
include_default_tools=[], # Missing FinishTool!
)
# Should fail but currently passes due to early return bug
with pytest.raises(ValueError, match="missing from runtime.*finish"):
runtime_agent.verify(persisted_agent, events=events_with_finish) |
||
|
|
||
| def test_agent_verify_think_builtin_tool_included(): | ||
| """Test that 'think' builtin tool is correctly included in event check.""" | ||
| from openhands.sdk.agent import AgentBase | ||
| from openhands.sdk.event import ActionEvent | ||
| from openhands.sdk.llm import MessageToolCall | ||
| from openhands.sdk.tool import Tool | ||
|
|
||
| llm = LLM(model="gpt-4o-mini", api_key=SecretStr("test-key"), usage_id="test-llm") | ||
|
|
||
| persisted_agent_obj = Agent( | ||
| llm=llm, | ||
| tools=[Tool(name="TerminalTool")], | ||
| include_default_tools=["ThinkTool"], | ||
| ) | ||
|
|
||
| events_with_think = [ | ||
| ActionEvent( | ||
| source="agent", | ||
| thought=[], | ||
| tool_name="think", | ||
| tool_call_id="call_123", | ||
| tool_call=MessageToolCall( | ||
| id="call_123", | ||
| name="think", | ||
| arguments='{"thought": "Let me think..."}', | ||
| origin="completion", | ||
| ), | ||
| llm_response_id="resp_123", | ||
| ), | ||
| ] | ||
|
|
||
| serialized = persisted_agent_obj.model_dump_json() | ||
| persisted_agent = AgentBase.model_validate_json(serialized) | ||
|
|
||
| runtime_agent = Agent( | ||
| llm=llm, | ||
| tools=[Tool(name="FileEditorTool")], | ||
| include_default_tools=["ThinkTool"], | ||
| ) | ||
|
|
||
| result = runtime_agent.verify(persisted_agent, events=events_with_think) | ||
| assert result is runtime_agent | ||
|
|
||
|
|
||
| def test_agent_verify_missing_builtin_tool_fails(): | ||
| """Test that verify fails when a used builtin tool is not configured.""" | ||
| from openhands.sdk.agent import AgentBase | ||
| from openhands.sdk.event import ActionEvent | ||
| from openhands.sdk.llm import MessageToolCall | ||
| from openhands.sdk.tool import Tool | ||
|
|
||
| llm = LLM(model="gpt-4o-mini", api_key=SecretStr("test-key"), usage_id="test-llm") | ||
|
|
||
| persisted_agent_obj = Agent( | ||
| llm=llm, | ||
| tools=[Tool(name="TerminalTool")], | ||
| include_default_tools=["FinishTool"], # Has FinishTool | ||
| ) | ||
|
|
||
| events_with_finish = [ | ||
| ActionEvent( | ||
| source="agent", | ||
| thought=[], | ||
| tool_name="finish", | ||
| tool_call_id="call_123", | ||
| tool_call=MessageToolCall( | ||
| id="call_123", | ||
| name="finish", | ||
| arguments='{"message": "Done!"}', | ||
| origin="completion", | ||
| ), | ||
| llm_response_id="resp_123", | ||
| ), | ||
| ] | ||
|
|
||
| serialized = persisted_agent_obj.model_dump_json() | ||
| persisted_agent = AgentBase.model_validate_json(serialized) | ||
|
|
||
| # Runtime agent does NOT have FinishTool in include_default_tools | ||
| runtime_agent = Agent( | ||
| llm=llm, | ||
| tools=[Tool(name="FileEditorTool")], | ||
| include_default_tools=[], # No FinishTool! | ||
| ) | ||
|
|
||
| # Should fail because 'finish' was used but FinishTool is not configured | ||
| with pytest.raises(ValueError, match="missing from runtime.*finish"): | ||
| runtime_agent.verify(persisted_agent, events=events_with_finish) | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🟡 Suggestion: The fix is correct, but consider adding a comment about naming consistency.
The
runtime_namesset now contains a mix of:"TerminalTool"(from line 372)"finish","think"(added here)This works because events only contain runtime names, but it's conceptually inconsistent. A clarifying comment would help future maintainers: