hud/agents/base.py (2 changes: 1 addition & 1 deletion)
@@ -537,7 +537,7 @@ def find_reward(result: MCPToolResult) -> float:
         except json.JSONDecodeError:
             pass
 
-    logger.error("Couldn't parse reward from result: %s", result)
+    logger.error("Couldn't parse reward from result: %s", str(result.structuredContent))
     return 0.0
 
 

hud/cli/eval.py (50 changes: 31 additions & 19 deletions)
@@ -91,7 +91,7 @@ class AgentPreset:
 [eval]
 # source = "hud-evals/SheetBench-50"
 # agent = "claude"
-# full = false
+# all = false  # Run all problems instead of just 1
 # max_concurrent = 30
 # max_steps = 10
 # group_size = 1
@@ -152,7 +152,7 @@ class EvalConfig(BaseModel):
         "source",
         "agent_type",
         "task_ids",
-        "full",
+        "all",
         "max_concurrent",
         "max_steps",
         "verbose",
@@ -171,12 +171,12 @@ class EvalConfig(BaseModel):
     agent_type: AgentType | None = None
     model: str | None = None
     task_ids: list[str] | None = None
-    full: bool = False
+    all: bool = False  # Run all problems instead of just 1
     max_concurrent: int = 30
-    max_steps: int | None = None
+    max_steps: int = 10
     verbose: bool = False
     very_verbose: bool = False
-    auto_respond: bool | None = None  # Continue without prompting (default: True for --full)
+    auto_respond: bool | None = None  # Continue without prompting
     group_size: int = 1
     remote: bool = False
     quiet: bool = False  # Suppress opening browser for eval links
@@ -454,12 +454,20 @@ def merge_cli(
 
         overrides.update({k: v for k, v in cli_args.items() if v is not None and v is not False})
 
-        for k in ("full", "verbose", "very_verbose", "remote", "quiet", "gateway"):
+        for k in ("all", "verbose", "very_verbose", "remote", "quiet", "gateway"):
             if cli_args.get(k) is True:
                 overrides[k] = True
             elif k in overrides and cli_args.get(k) is False:
                 del overrides[k]
 
+        # --full is a shortcut for --all --auto-respond --max-steps 100
+        if overrides.get("full"):
+            overrides["all"] = True
+            if "auto_respond" not in overrides:
+                overrides["auto_respond"] = True
+            if "max_steps" not in overrides:
+                overrides["max_steps"] = 100
+
         if config:
             merged_agent_config = dict(self.agent_config)
             for item in config:
@@ -541,15 +549,13 @@ def display(self) -> None:
             table.add_row(
                 "task_ids", ", ".join(self.task_ids[:5]) + ("..." if len(self.task_ids) > 5 else "")
             )
-        table.add_row("full", str(self.full))
-        table.add_row("max_steps", str(self.max_steps or (100 if self.full else 10)))
+        table.add_row("all", str(self.all))
+        table.add_row("max_steps", str(self.max_steps))
         if not self.remote:
             table.add_row("max_concurrent", str(self.max_concurrent))
         if self.group_size > 1:
             table.add_row("group_size", str(self.group_size))
-        # Show auto_respond when it will be true (explicit or via --full)
-        effective_auto_respond = self.auto_respond if self.auto_respond is not None else self.full
-        if effective_auto_respond:
+        if self.auto_respond:
             table.add_row("auto_respond", "[bold green]True[/bold green]")
         if self.very_verbose:
             table.add_row("very_verbose", "[bold green]True[/bold green]")
@@ -642,20 +648,20 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]:
             raise typer.Exit(1)
         hud_console.info(f"Filtered to {len(filtered)} task(s) by ID")
         tasks = filtered
-    elif not cfg.full:
-        # Single task mode (no --full, no --task-ids)
+    elif not cfg.all:
+        # Single task mode (no --all, --full, or --task-ids)
         tasks = [tasks[0]]
         hud_console.info("Using first task (run with --full or --task-ids for more)…")
 
     hud_console.info(f"Loaded {len(tasks)} task(s)")
 
     # Prepare agent kwargs
     agent_kwargs = cfg.get_agent_kwargs()
-    auto_respond = cfg.auto_respond if cfg.auto_respond is not None else cfg.full
+    auto_respond = cfg.auto_respond
     if auto_respond:
         agent_kwargs = {**agent_kwargs, "auto_respond": True}
 
-    max_steps = cfg.max_steps or (100 if cfg.full else 10)
+    max_steps = cfg.max_steps
 
     # Remote execution - submit to HUD platform
     if cfg.remote:
@@ -724,7 +730,12 @@ def eval_command(
         None,
         help="Agent: claude, openai, operator, gemini, gemini_cua, openai_compatible, integration_test",  # noqa: E501
     ),
-    full: bool = typer.Option(False, "--full", help="Run entire dataset"),
+    all: bool = typer.Option(False, "--all", help="Run all problems instead of just 1"),
+    full: bool = typer.Option(
+        False,
+        "--full",
+        help="Run the entire dataset. Shortcut for --all --auto-respond --max-steps 100",
+    ),
     model: str | None = typer.Option(None, "--model", "-m", help="Model name"),
     config: list[str] | None = typer.Option(  # noqa: B008
         None, "--config", "-c", help="Agent config: key=value"
@@ -743,10 +754,10 @@
     max_steps: int | None = typer.Option(None, "--max-steps", help="Max steps per task"),
     verbose: bool = typer.Option(False, "--verbose", "-v", help="Verbose output"),
     very_verbose: bool = typer.Option(False, "--very-verbose", "-vv", help="Debug logs"),
-    auto_respond: bool | None = typer.Option(
-        None,
+    auto_respond: bool = typer.Option(
+        False,
         "--auto-respond",
-        help="Continue without prompting after tool calls (default: True for --full)",
+        help="Automatically prompt the agent to continue if it does not respond with a tool call",
     ),
     group_size: int | None = typer.Option(None, "--group-size", help="Runs per task"),
     task_ids: str | None = typer.Option(None, "--task-ids", help="Comma-separated task IDs to run"),
@@ -778,6 +789,7 @@ def eval_command(
         source=source,
         agent=agent,
         model=model,
+        all=all,
         full=full,
         max_concurrent=max_concurrent,
         max_steps=max_steps,
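
The behavioural change in merge_cli above can be summarised with a small standalone sketch. The helper name below is hypothetical and the dict stands in for the overrides mapping that merge_cli builds from CLI flags; only the expansion logic mirrors the diff.

# Hypothetical standalone sketch of the new --full semantics (not library code).
def expand_full_shortcut(overrides: dict) -> dict:
    # --full is a shortcut for --all --auto-respond --max-steps 100,
    # but explicitly passed flags win because of the "not in overrides" guards.
    if overrides.get("full"):
        overrides["all"] = True
        if "auto_respond" not in overrides:
            overrides["auto_respond"] = True
        if "max_steps" not in overrides:
            overrides["max_steps"] = 100
    return overrides


assert expand_full_shortcut({"full": True}) == {
    "full": True, "all": True, "auto_respond": True, "max_steps": 100,
}
# An explicit --max-steps value is preserved alongside --full:
assert expand_full_shortcut({"full": True, "max_steps": 25})["max_steps"] == 25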

hud/datasets/runner.py (7 changes: 4 additions & 3 deletions)
@@ -99,8 +99,8 @@ async def run_dataset(
     ) as ctx:
         # Create agent fresh for each context (ensures correct tool initialization)
         agent = agent_cls.create(**(agent_params or {}))
-        result = await agent.run(ctx, max_steps=max_steps)
-        ctx.reward = result.reward
+        await agent.run(ctx, max_steps=max_steps)
+        # Reward is computed by EvalContext.__aexit__ from evaluate tools
 
     # For parallel execution, results are collected via ctx.results
     if hasattr(ctx, "results") and ctx.results:
@@ -207,6 +207,7 @@ async def run_single_task(
             ctx.metadata.update(metadata)
 
         result = await agent.run(ctx, max_steps=max_steps)
-        ctx.reward = result.reward
+        # Reward is computed by EvalContext.__aexit__ from evaluate tools
 
+    # Return the Trace (ctx.reward is set by EvalContext.__aexit__)
     return result

hud/environment/environment.py (11 changes: 9 additions & 2 deletions)
@@ -400,13 +400,20 @@ async def _execute_tool(self, name: str, arguments: dict[str, Any]) -> MCPToolResult:
         if self._router.is_local(name):
             # Call tool manager directly to avoid FastMCP context requirement
             result = await self._tool_manager.call_tool(name, arguments)
-            return MCPToolResult(content=result.content, isError=False)
+            return MCPToolResult(
+                content=result.content,
+                structuredContent=result.structured_content,
+            )
 
         connection_name = self._router.get_connection(name)
         if connection_name:
             conn = self._connections[connection_name]
             result = await conn.call_tool(name, arguments)
-            return MCPToolResult(content=result.content, isError=result.isError)
+            return MCPToolResult(
+                content=result.content,
+                isError=result.isError,
+                structuredContent=result.structuredContent,
+            )
 
         raise ValueError(f"Tool not found: {name}")
 
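
The two wrapper changes above forward structuredContent from the inner tool result, which is what the base.py logging change at the top of this diff reads. Here is a minimal, self-contained sketch of why that matters; the class below is a hypothetical stand-in for MCPToolResult (its real definition is outside this diff), and the reward lookup only approximates find_reward in hud/agents/base.py.

from dataclasses import dataclass, field
from typing import Any


@dataclass
class FakeToolResult:
    # Hypothetical stand-in for MCPToolResult with the fields used in this diff.
    content: list[Any] = field(default_factory=list)
    isError: bool = False
    structuredContent: dict[str, Any] | None = None


def find_reward_sketch(result: FakeToolResult) -> float:
    # Rough analogue of find_reward: prefer the structured payload,
    # fall back to 0.0 when nothing usable is present.
    if result.structuredContent and "reward" in result.structuredContent:
        return float(result.structuredContent["reward"])
    return 0.0


# Before this change, _execute_tool rebuilt the result without structuredContent,
# so an evaluate tool returning {"reward": 0.8} as structured output looked empty
# to the caller. With the change, the field is forwarded:
assert find_reward_sketch(FakeToolResult(structuredContent={"reward": 0.8})) == 0.8
assert find_reward_sketch(FakeToolResult()) == 0.0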

hud/eval/context.py (13 changes: 6 additions & 7 deletions)
@@ -507,15 +507,10 @@ async def _eval_exit(self, error_message: str | None = None) -> None:
         if not settings.telemetry_enabled or not api_key:
             return
 
-        # Use evaluate tool reward if not manually set
-        reward = self.reward
-        if reward is None:
-            reward = getattr(self, "_evaluate_reward", None)
-
         try:
             payload = EvalExitPayload(
                 **self._build_base_payload().model_dump(),
-                reward=reward,
+                reward=self.reward,
                 success=self.success,
                 error_message=error_message,
             )
@@ -578,9 +573,13 @@ async def __aexit__(
         # Flush any pending telemetry spans for this trace
         flush(self.trace_id)
 
-        # Disconnect environment (parent class)
+        # Disconnect environment (parent class) - also runs evaluate tools
         await super().__aexit__(exc_type, exc_val, exc_tb)
 
+        # Set reward from evaluate tools if not already set
+        if self.reward is None and hasattr(self, "_evaluate_reward"):
+            self.reward = self._evaluate_reward
+
         # Reset context vars
         if self._token is not None:
             _current_trace_headers.reset(self._token)
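
Combined with the runner.py change, the intended reward flow is: callers no longer write ctx.reward after agent.run; the parent __aexit__ runs the evaluate tools, and the new backfill copies _evaluate_reward into ctx.reward when it was never set manually. A toy model of that flow follows; the class is illustrative only, since real EvalContext construction and evaluate-tool execution are outside this diff.

import asyncio


class FakeEvalContext:
    # Toy model of the backfill added to EvalContext.__aexit__; the parent
    # __aexit__ is assumed to run evaluate tools and store _evaluate_reward.
    def __init__(self) -> None:
        self.reward: float | None = None

    async def __aenter__(self) -> "FakeEvalContext":
        return self

    async def __aexit__(self, *exc) -> None:
        self._evaluate_reward = 0.7  # pretend the evaluate tools produced this
        if self.reward is None and hasattr(self, "_evaluate_reward"):
            self.reward = self._evaluate_reward


async def main() -> None:
    # Callers such as run_dataset no longer assign ctx.reward themselves;
    # they run the agent and read ctx.reward after the block exits.
    async with FakeEvalContext() as ctx:
        pass  # agent.run(ctx, ...) would go here
    assert ctx.reward == 0.7


asyncio.run(main())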