Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions environments/remote_browser/src/hud_controller/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
AnthropicComputerTool,
OpenAIComputerTool,
HudComputerTool,
GeminiComputerTool,
)

# Import setup and evaluate hubs
Expand Down Expand Up @@ -283,6 +284,7 @@ async def send_progress(progress: int, message: str):
mcp.add_tool(HudComputerTool(executor=browser_executor))
mcp.add_tool(AnthropicComputerTool(executor=browser_executor))
mcp.add_tool(OpenAIComputerTool(executor=browser_executor))
mcp.add_tool(GeminiComputerTool(executor=browser_executor))

await send_progress(80, "Registered hud computer tools")

Expand Down
26 changes: 26 additions & 0 deletions environments/remote_browser/src/hud_controller/tools/executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,14 @@ def __init__(self, playwright_tool, display_num: int | None = None):
self.playwright_tool = playwright_tool
logger.info("BrowserExecutor initialized with Playwright backend")

async def _current_url(self) -> str | None:
"""Return current page URL if available."""
try:
page = await self._ensure_page()
return page.url
except Exception:
return None

def _map_key(self, key: str) -> str:
"""Map a key name to Playwright format."""
key = key.strip()
Expand Down Expand Up @@ -172,6 +180,9 @@ async def click(
logger.debug(f"Clicked at ({x}, {y}) with button {button}")

result = ContentResult(output=f"Clicked at ({x}, {y})")
current = await self._current_url()
if current:
result.url = current
if take_screenshot:
result = result + ContentResult(base64_image=await self.screenshot())

Expand Down Expand Up @@ -213,6 +224,9 @@ async def write(
logger.debug(f"Typed text: {text[:50]}...")

result = ContentResult(output=f"Typed: {text}")
current = await self._current_url()
if current:
result.url = current
if take_screenshot:
result = result + ContentResult(base64_image=await self.screenshot())

Expand Down Expand Up @@ -253,6 +267,9 @@ async def press(
logger.debug(f"Pressed keys: {keys} (mapped to: {mapped_keys})")

result = ContentResult(output=f"Pressed: {key_combination}")
current = await self._current_url()
if current:
result.url = current
if take_screenshot:
result = result + ContentResult(base64_image=await self.screenshot())

Expand Down Expand Up @@ -292,6 +309,9 @@ async def scroll(
logger.debug(f"Scrolled at ({x}, {y}) by ({delta_x}, {delta_y})")

result = ContentResult(output=f"Scrolled by ({delta_x}, {delta_y})")
current = await self._current_url()
if current:
result.url = current
if take_screenshot:
result = result + ContentResult(base64_image=await self.screenshot())

Expand Down Expand Up @@ -319,6 +339,9 @@ async def move(
logger.debug(f"Moved mouse to ({x}, {y})")

result = ContentResult(output=f"Moved to ({x}, {y})")
current = await self._current_url()
if current:
result.url = current
if take_screenshot:
result = result + ContentResult(base64_image=await self.screenshot())

Expand Down Expand Up @@ -369,6 +392,9 @@ async def drag(
logger.debug(f"Dragged from {path[0]} through {len(path)} points")

result = ContentResult(output=f"Dragged through {len(path)} points")
current = await self._current_url()
if current:
result.url = current
if take_screenshot:
result = result + ContentResult(base64_image=await self.screenshot())

Expand Down
104 changes: 104 additions & 0 deletions examples/gemini_agent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
#!/usr/bin/env python3
"""
Gemini Agent Example (Remote Browser)

This example showcases Gemini-specific features against a remote browser environment:
- Computer use capabilities with normalized coordinates
- Browser automation
- Multi-step reasoning tasks

Gemini uses a normalized coordinate system (0-999) that is automatically
scaled to actual screen dimensions.
"""

import asyncio

import hud
import os
from hud.agents import GeminiAgent
from hud.clients import MCPClient
from hud.settings import settings


async def main():
    """Run the Gemini computer-use demo against a remote browser environment.

    Steps performed:
      1. Build the MCP request headers: HUD API key, remote-browser image,
         browser provider, and (when present in the environment) that
         provider's API key.
      2. Create an ``MCPClient`` for the HUD MCP endpoint and a
         ``GeminiAgent`` restricted to the ``gemini_computer`` tool.
      3. Navigate the browser to the demo form through the ``setup`` tool.
      4. Let the agent work through a multi-step form-filling prompt and
         print the result.

    Once the client has been initialized it is always shut down, even if
    the setup call or the agent run raises.
    """
    with hud.trace("Gemini Agent Demo"):
        # Remote HUD MCP server using your custom remote-browser image
        # (built via environments/remote_browser/Dockerfile).
        # Build headers with the environment required by the remote browser.
        provider = os.getenv("BROWSER_PROVIDER", "anchorbrowser")
        headers = {
            "Authorization": f"Bearer {settings.api_key}",
            "Mcp-Image": "alberthu233/hud-remote-browser:gemini-dev-2",
            "Env-Browser-Provider": provider,
        }

        # Optionally pass the provider-specific API key if it is available.
        provider_key_map = {
            "anchorbrowser": "ANCHOR_API_KEY",
            "steel": "STEEL_API_KEY",
            "browserbase": "BROWSERBASE_API_KEY",
            "hyperbrowser": "HYPERBROWSER_API_KEY",
            "kernel": "KERNEL_API_KEY",
        }
        key_var = provider_key_map.get(provider)
        key_val = os.getenv(key_var) if key_var else None
        if key_val:
            # ANCHOR_API_KEY -> Env-Anchor-Api-Key
            header_suffix = "-".join(part.capitalize() for part in key_var.split("_"))
            headers[f"Env-{header_suffix}"] = key_val

        mcp_config = {"hud": {"url": "https://mcp.hud.so/v3/mcp", "headers": headers}}

        # Create Gemini-specific agent
        client = MCPClient(mcp_config=mcp_config)
        agent = GeminiAgent(
            mcp_client=client,
            model="gemini-2.5-computer-use-preview-10-2025",
            allowed_tools=["gemini_computer"],
            initial_screenshot=True,
            temperature=1.0,
            max_output_tokens=8192,
        )

        await client.initialize()

        try:
            initial_url = "https://httpbin.org/forms/post"

            # The email uses example.com, a domain reserved for documentation,
            # so the demo never submits a real address.
            prompt = f"""
            Please help me fill out a web form one step at a time:
            1. Navigate to {initial_url}
            2. Fill in the customer name as "Gemini Test"
            3. Enter the telephone as "555-0456"
            4. Enter the email as "gemini@example.com"
            5. Type "Submission with Gemini" in the comments
            6. Select a medium pizza size
            7. Choose "mushroom" as a topping
            8. Set delivery time to "16:00"
            9. Submit the form
            10. Verify the submission was successful
            """

            print("📋 Task: Multi-step form interaction (Remote Browser)")
            print("🚀 Running Gemini agent...\n")

            # Setup: navigate to initial URL via setup tool
            await client.call_tool(
                name="setup",
                arguments={"name": "navigate_to_url", "arguments": {"url": initial_url}},
            )

            # Run the prompt
            result = await agent.run(prompt, max_steps=50)

            print(result)

        finally:
            await client.shutdown()

    print("\n✨ Gemini agent demo complete!")


# Script entry point: when executed directly (not imported), drive the async
# demo to completion with asyncio.run.
if __name__ == "__main__":
    asyncio.run(main())

2 changes: 2 additions & 0 deletions hud/agents/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,13 @@

from .base import MCPAgent
from .claude import ClaudeAgent
from .gemini import GeminiAgent
from .openai import OperatorAgent
from .openai_chat_generic import GenericOpenAIChatAgent

__all__ = [
"ClaudeAgent",
"GeminiAgent",
"GenericOpenAIChatAgent",
"MCPAgent",
"OperatorAgent",
Expand Down
Loading
Loading