hud-evals · alberthu233 · Oct 13, 2025 · Oct 13, 2025 · Oct 14, 2025 · Oct 15, 2025
diff --git a/environments/remote_browser/src/hud_controller/server.py b/environments/remote_browser/src/hud_controller/server.py
@@ -25,6 +25,7 @@
     AnthropicComputerTool,
     OpenAIComputerTool,
     HudComputerTool,
+    GeminiComputerTool,
 )
 
 # Import setup and evaluate hubs
@@ -283,6 +284,7 @@ async def send_progress(progress: int, message: str):
         mcp.add_tool(HudComputerTool(executor=browser_executor))
         mcp.add_tool(AnthropicComputerTool(executor=browser_executor))
         mcp.add_tool(OpenAIComputerTool(executor=browser_executor))
+        mcp.add_tool(GeminiComputerTool(executor=browser_executor))
 
         await send_progress(80, "Registered hud computer tools")
 

diff --git a/environments/remote_browser/src/hud_controller/tools/executor.py b/environments/remote_browser/src/hud_controller/tools/executor.py
@@ -94,6 +94,14 @@ def __init__(self, playwright_tool, display_num: int | None = None):
         self.playwright_tool = playwright_tool
         logger.info("BrowserExecutor initialized with Playwright backend")
 
+    async def _current_url(self) -> str | None:
+        """Return the active page's URL, or None (best effort) if it cannot be read."""
+        try:
+            return (await self._ensure_page()).url
+        except Exception as exc:  # best effort: a failed URL lookup must not fail the action
+            logger.debug(f"Could not read current page URL: {exc}")
+            return None
+
     def _map_key(self, key: str) -> str:
         """Map a key name to Playwright format."""
         key = key.strip()
@@ -172,6 +180,9 @@ async def click(
             logger.debug(f"Clicked at ({x}, {y}) with button {button}")
 
             result = ContentResult(output=f"Clicked at ({x}, {y})")
+            current = await self._current_url()
+            if current:
+                result.url = current
             if take_screenshot:
                 result = result + ContentResult(base64_image=await self.screenshot())
 
@@ -213,6 +224,9 @@ async def write(
             logger.debug(f"Typed text: {text[:50]}...")
 
             result = ContentResult(output=f"Typed: {text}")
+            current = await self._current_url()
+            if current:
+                result.url = current
             if take_screenshot:
                 result = result + ContentResult(base64_image=await self.screenshot())
 
@@ -253,6 +267,9 @@ async def press(
             logger.debug(f"Pressed keys: {keys} (mapped to: {mapped_keys})")
 
             result = ContentResult(output=f"Pressed: {key_combination}")
+            current = await self._current_url()
+            if current:
+                result.url = current
             if take_screenshot:
                 result = result + ContentResult(base64_image=await self.screenshot())
 
@@ -292,6 +309,9 @@ async def scroll(
             logger.debug(f"Scrolled at ({x}, {y}) by ({delta_x}, {delta_y})")
 
             result = ContentResult(output=f"Scrolled by ({delta_x}, {delta_y})")
+            current = await self._current_url()
+            if current:
+                result.url = current
             if take_screenshot:
                 result = result + ContentResult(base64_image=await self.screenshot())
 
@@ -319,6 +339,9 @@ async def move(
             logger.debug(f"Moved mouse to ({x}, {y})")
 
             result = ContentResult(output=f"Moved to ({x}, {y})")
+            current = await self._current_url()
+            if current:
+                result.url = current
             if take_screenshot:
                 result = result + ContentResult(base64_image=await self.screenshot())
 
@@ -369,6 +392,9 @@ async def drag(
             logger.debug(f"Dragged from {path[0]} through {len(path)} points")
 
             result = ContentResult(output=f"Dragged through {len(path)} points")
+            current = await self._current_url()
+            if current:
+                result.url = current
             if take_screenshot:
                 result = result + ContentResult(base64_image=await self.screenshot())
 

diff --git a/examples/gemini_agent.py b/examples/gemini_agent.py
@@ -0,0 +1,104 @@
+#!/usr/bin/env python3
+"""
+Gemini Agent Example (Remote Browser)
+
+This example showcases Gemini-specific features against a remote browser environment:
+- Computer use capabilities with normalized coordinates
+- Browser automation
+- Multi-step reasoning tasks
+
+Gemini uses a normalized coordinate system (0-999) that is automatically
+scaled to actual screen dimensions.
+"""
+
+import asyncio
+
+import hud
+import os
+from hud.agents import GeminiAgent
+from hud.clients import MCPClient
+from hud.settings import settings
+
+
+async def main():
+    """Run the Gemini computer-use demo: fill and submit a form in a remote browser."""
+    with hud.trace("Gemini Agent Demo"):
+        # Remote HUD MCP server running the custom remote-browser image (built via
+        # environments/remote_browser/Dockerfile); headers carry the environment config.
+        provider = os.getenv("BROWSER_PROVIDER", "anchorbrowser")
+        headers = {
+            "Authorization": f"Bearer {settings.api_key}",
+            "Mcp-Image": "alberthu233/hud-remote-browser:gemini-dev-2",
+            "Env-Browser-Provider": provider,
+        }
+
+        # Optionally pass provider-specific API key if available
+        provider_key_map = {
+            "anchorbrowser": "ANCHOR_API_KEY",
+            "steel": "STEEL_API_KEY",
+            "browserbase": "BROWSERBASE_API_KEY",
+            "hyperbrowser": "HYPERBROWSER_API_KEY",
+            "kernel": "KERNEL_API_KEY",
+        }
+        key_var = provider_key_map.get(provider)
+        key_val = os.getenv(key_var) if key_var else None
+        if key_val:
+            # Header name mirrors the env var, e.g. ANCHOR_API_KEY -> Env-Anchor-Api-Key
+            header_key = "Env-" + "-".join(part.capitalize() for part in key_var.split("_"))
+            headers[header_key] = key_val
+
+        mcp_config = {"hud": {"url": "https://mcp.hud.so/v3/mcp", "headers": headers}}
+
+        # Create Gemini-specific agent
+        client = MCPClient(mcp_config=mcp_config)
+        agent = GeminiAgent(
+            mcp_client=client,
+            model="gemini-2.5-computer-use-preview-10-2025",
+            allowed_tools=["gemini_computer"],
+            initial_screenshot=True,
+            temperature=1.0,
+            max_output_tokens=8192,
+        )
+
+        await client.initialize()
+
+        try:
+            initial_url = "https://httpbin.org/forms/post"
+
+            prompt = f"""
+            Please help me fill out a web form one step at a time:
+            1. Navigate to {initial_url}
+            2. Fill in the customer name as "Gemini Test"
+            3. Enter the telephone as "555-0456"
+            4. Enter the email as "gemini@example.com"
+            5. Type "Submission with Gemini" in the comments
+            6. Select a medium pizza size
+            7. Choose "mushroom" as a topping
+            8. Set delivery time to "16:00"
+            9. Submit the form
+            10. Verify the submission was successful
+            """
+
+            print("📋 Task: Multi-step form interaction (Remote Browser)")
+            print("🚀 Running Gemini agent...\n")
+
+            # Setup: navigate to initial URL via setup tool
+            await client.call_tool(
+                name="setup",
+                arguments={"name": "navigate_to_url", "arguments": {"url": initial_url}},
+            )
+
+            # Run the prompt
+            result = await agent.run(prompt, max_steps=50)
+
+            print(result)
+
+        finally:
+            await client.shutdown()
+
+    print("\n✨ Gemini agent demo complete!")
+
+
+if __name__ == "__main__":  # run the demo only when executed as a script, not on import
+    asyncio.run(main())
+
diff --git a/hud/agents/__init__.py b/hud/agents/__init__.py
@@ -2,11 +2,13 @@
 
 from .base import MCPAgent
 from .claude import ClaudeAgent
+from .gemini import GeminiAgent
 from .openai import OperatorAgent
 from .openai_chat_generic import GenericOpenAIChatAgent
 
 __all__ = [
     "ClaudeAgent",
+    "GeminiAgent",
     "GenericOpenAIChatAgent",
     "MCPAgent",
     "OperatorAgent",