hud/agents/base.py (2 changes: 1 addition & 1 deletion)
@@ -537,7 +537,7 @@ def find_reward(result: MCPToolResult) -> float:
         except json.JSONDecodeError:
             pass
 
-    logger.error("Couldn't parse reward from result: %s", result)
+    logger.error("Couldn't parse reward from result: %s", str(result.structuredContent))
     return 0.0
 
 

hud/cli/eval.py (50 changes: 31 additions & 19 deletions)
@@ -91,7 +91,7 @@ class AgentPreset:
 [eval]
 # source = "hud-evals/SheetBench-50"
 # agent = "claude"
-# full = false
+# all = false  # Run all problems instead of just 1
 # max_concurrent = 30
 # max_steps = 10
 # group_size = 1
@@ -152,7 +152,7 @@ class EvalConfig(BaseModel):
         "source",
         "agent_type",
         "task_ids",
-        "full",
+        "all",
         "max_concurrent",
         "max_steps",
         "verbose",
@@ -171,12 +171,12 @@ class EvalConfig(BaseModel):
     agent_type: AgentType | None = None
     model: str | None = None
     task_ids: list[str] | None = None
-    full: bool = False
+    all: bool = False  # Run all problems instead of just 1
     max_concurrent: int = 30
-    max_steps: int | None = None
+    max_steps: int = 10
     verbose: bool = False
     very_verbose: bool = False
-    auto_respond: bool | None = None  # Continue without prompting (default: True for --full)
+    auto_respond: bool | None = None  # Continue without prompting
     group_size: int = 1
     remote: bool = False
     quiet: bool = False  # Suppress opening browser for eval links
@@ -454,12 +454,20 @@ def merge_cli(
 
         overrides.update({k: v for k, v in cli_args.items() if v is not None and v is not False})
 
-        for k in ("full", "verbose", "very_verbose", "remote", "quiet", "gateway"):
+        for k in ("all", "verbose", "very_verbose", "remote", "quiet", "gateway"):
             if cli_args.get(k) is True:
                 overrides[k] = True
             elif k in overrides and cli_args.get(k) is False:
                 del overrides[k]
 
+        # --full is a shortcut for --all --auto-respond --max-steps 100
+        if overrides.get("full"):
+            overrides["all"] = True
+            if "auto_respond" not in overrides:
+                overrides["auto_respond"] = True
+            if "max_steps" not in overrides:
+                overrides["max_steps"] = 100
+
         if config:
             merged_agent_config = dict(self.agent_config)
             for item in config:
@@ -541,15 +549,13 @@ def display(self) -> None:
             table.add_row(
                 "task_ids", ", ".join(self.task_ids[:5]) + ("..." if len(self.task_ids) > 5 else "")
             )
-        table.add_row("full", str(self.full))
-        table.add_row("max_steps", str(self.max_steps or (100 if self.full else 10)))
+        table.add_row("all", str(self.all))
+        table.add_row("max_steps", str(self.max_steps))
         if not self.remote:
             table.add_row("max_concurrent", str(self.max_concurrent))
         if self.group_size > 1:
             table.add_row("group_size", str(self.group_size))
-        # Show auto_respond when it will be true (explicit or via --full)
-        effective_auto_respond = self.auto_respond if self.auto_respond is not None else self.full
-        if effective_auto_respond:
+        if self.auto_respond:
             table.add_row("auto_respond", "[bold green]True[/bold green]")
         if self.very_verbose:
             table.add_row("very_verbose", "[bold green]True[/bold green]")
@@ -642,20 +648,20 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]:
             raise typer.Exit(1)
         hud_console.info(f"Filtered to {len(filtered)} task(s) by ID")
         tasks = filtered
-    elif not cfg.full:
-        # Single task mode (no --full, no --task-ids)
+    elif not cfg.all:
+        # Single task mode (no --all, --full, or --task-ids)
         tasks = [tasks[0]]
         hud_console.info("Using first task (run with --full or --task-ids for more)…")
 
     hud_console.info(f"Loaded {len(tasks)} task(s)")
 
     # Prepare agent kwargs
     agent_kwargs = cfg.get_agent_kwargs()
-    auto_respond = cfg.auto_respond if cfg.auto_respond is not None else cfg.full
+    auto_respond = cfg.auto_respond
     if auto_respond:
         agent_kwargs = {**agent_kwargs, "auto_respond": True}
 
-    max_steps = cfg.max_steps or (100 if cfg.full else 10)
+    max_steps = cfg.max_steps
 
     # Remote execution - submit to HUD platform
     if cfg.remote:
@@ -724,7 +730,12 @@ def eval_command(
         None,
         help="Agent: claude, openai, operator, gemini, gemini_cua, openai_compatible, integration_test",  # noqa: E501
     ),
-    full: bool = typer.Option(False, "--full", help="Run entire dataset"),
+    all: bool = typer.Option(False, "--all", help="Run all problems instead of just 1"),
+    full: bool = typer.Option(
+        False,
+        "--full",
+        help="Run the entire dataset. Shortcut for --all --auto-respond --max-steps 100",
+    ),
     model: str | None = typer.Option(None, "--model", "-m", help="Model name"),
     config: list[str] | None = typer.Option(  # noqa: B008
         None, "--config", "-c", help="Agent config: key=value"
@@ -743,10 +754,10 @@
     max_steps: int | None = typer.Option(None, "--max-steps", help="Max steps per task"),
     verbose: bool = typer.Option(False, "--verbose", "-v", help="Verbose output"),
     very_verbose: bool = typer.Option(False, "--very-verbose", "-vv", help="Debug logs"),
-    auto_respond: bool | None = typer.Option(
-        None,
+    auto_respond: bool = typer.Option(
+        False,
         "--auto-respond",
-        help="Continue without prompting after tool calls (default: True for --full)",
+        help="Automatically prompt the agent to continue if it does not respond with a tool call",
     ),
     group_size: int | None = typer.Option(None, "--group-size", help="Runs per task"),
     task_ids: str | None = typer.Option(None, "--task-ids", help="Comma-separated task IDs to run"),
@@ -778,6 +789,7 @@ def eval_command(
         source=source,
         agent=agent,
         model=model,
+        all=all,
         full=full,
         max_concurrent=max_concurrent,
         max_steps=max_steps,
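
The behavioural change in merge_cli above can be summarised with a small standalone sketch. The helper name below is hypothetical and the dict stands in for the overrides mapping that merge_cli builds from CLI flags; only the expansion logic mirrors the diff.

# Hypothetical standalone sketch of the new --full semantics (not library code).
def expand_full_shortcut(overrides: dict) -> dict:
    # --full is a shortcut for --all --auto-respond --max-steps 100,
    # but explicitly passed flags win because of the "not in overrides" guards.
    if overrides.get("full"):
        overrides["all"] = True
        if "auto_respond" not in overrides:
            overrides["auto_respond"] = True
        if "max_steps" not in overrides:
            overrides["max_steps"] = 100
    return overrides


assert expand_full_shortcut({"full": True}) == {
    "full": True, "all": True, "auto_respond": True, "max_steps": 100,
}
# An explicit --max-steps value is preserved alongside --full:
assert expand_full_shortcut({"full": True, "max_steps": 25})["max_steps"] == 25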

hud/datasets/runner.py (7 changes: 4 additions & 3 deletions)
@@ -99,8 +99,8 @@ async def run_dataset(
     ) as ctx:
         # Create agent fresh for each context (ensures correct tool initialization)
         agent = agent_cls.create(**(agent_params or {}))
-        result = await agent.run(ctx, max_steps=max_steps)
-        ctx.reward = result.reward
+        await agent.run(ctx, max_steps=max_steps)
+        # Reward is computed by EvalContext.__aexit__ from evaluate tools
 
     # For parallel execution, results are collected via ctx.results
     if hasattr(ctx, "results") and ctx.results:
@@ -207,6 +207,7 @@ async def run_single_task(
             ctx.metadata.update(metadata)
 
         result = await agent.run(ctx, max_steps=max_steps)
-        ctx.reward = result.reward
+        # Reward is computed by EvalContext.__aexit__ from evaluate tools
 
+    # Return the Trace (ctx.reward is set by EvalContext.__aexit__)
     return result

hud/environment/environment.py (11 changes: 9 additions & 2 deletions)
@@ -400,13 +400,20 @@ async def _execute_tool(self, name: str, arguments: dict[str, Any]) -> MCPToolResult:
         if self._router.is_local(name):
             # Call tool manager directly to avoid FastMCP context requirement
             result = await self._tool_manager.call_tool(name, arguments)
-            return MCPToolResult(content=result.content, isError=False)
+            return MCPToolResult(
+                content=result.content,
+                structuredContent=result.structured_content,
+            )
 
         connection_name = self._router.get_connection(name)
         if connection_name:
             conn = self._connections[connection_name]
             result = await conn.call_tool(name, arguments)
-            return MCPToolResult(content=result.content, isError=result.isError)
+            return MCPToolResult(
+                content=result.content,
+                isError=result.isError,
+                structuredContent=result.structuredContent,
+            )
 
         raise ValueError(f"Tool not found: {name}")
 
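
The two wrapper changes above forward structuredContent from the inner tool result, which is what the base.py logging change at the top of this diff reads. Here is a minimal, self-contained sketch of why that matters; the class below is a hypothetical stand-in for MCPToolResult (its real definition is outside this diff), and the reward lookup only approximates find_reward in hud/agents/base.py.

from dataclasses import dataclass, field
from typing import Any


@dataclass
class FakeToolResult:
    # Hypothetical stand-in for MCPToolResult with the fields used in this diff.
    content: list[Any] = field(default_factory=list)
    isError: bool = False
    structuredContent: dict[str, Any] | None = None


def find_reward_sketch(result: FakeToolResult) -> float:
    # Rough analogue of find_reward: prefer the structured payload,
    # fall back to 0.0 when nothing usable is present.
    if result.structuredContent and "reward" in result.structuredContent:
        return float(result.structuredContent["reward"])
    return 0.0


# Before this change, _execute_tool rebuilt the result without structuredContent,
# so an evaluate tool returning {"reward": 0.8} as structured output looked empty
# to the caller. With the change, the field is forwarded:
assert find_reward_sketch(FakeToolResult(structuredContent={"reward": 0.8})) == 0.8
assert find_reward_sketch(FakeToolResult()) == 0.0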

hud/eval/context.py (13 changes: 6 additions & 7 deletions)
@@ -507,15 +507,10 @@ async def _eval_exit(self, error_message: str | None = None) -> None:
         if not settings.telemetry_enabled or not api_key:
             return
 
-        # Use evaluate tool reward if not manually set
-        reward = self.reward
-        if reward is None:
-            reward = getattr(self, "_evaluate_reward", None)
-
         try:
             payload = EvalExitPayload(
                 **self._build_base_payload().model_dump(),
-                reward=reward,
+                reward=self.reward,
                 success=self.success,
                 error_message=error_message,
             )
@@ -578,9 +573,13 @@ async def __aexit__(
         # Flush any pending telemetry spans for this trace
         flush(self.trace_id)
 
-        # Disconnect environment (parent class)
+        # Disconnect environment (parent class) - also runs evaluate tools
         await super().__aexit__(exc_type, exc_val, exc_tb)
 
+        # Set reward from evaluate tools if not already set
+        if self.reward is None and hasattr(self, "_evaluate_reward"):
+            self.reward = self._evaluate_reward
+
         # Reset context vars
         if self._token is not None:
             _current_trace_headers.reset(self._token)
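
Combined with the runner.py change, the intended reward flow is: callers no longer write ctx.reward after agent.run; the parent __aexit__ runs the evaluate tools, and the new backfill copies _evaluate_reward into ctx.reward when it was never set manually. A toy model of that flow follows; the class is illustrative only, since real EvalContext construction and evaluate-tool execution are outside this diff.

import asyncio


class FakeEvalContext:
    # Toy model of the backfill added to EvalContext.__aexit__; the parent
    # __aexit__ is assumed to run evaluate tools and store _evaluate_reward.
    def __init__(self) -> None:
        self.reward: float | None = None

    async def __aenter__(self) -> "FakeEvalContext":
        return self

    async def __aexit__(self, *exc) -> None:
        self._evaluate_reward = 0.7  # pretend the evaluate tools produced this
        if self.reward is None and hasattr(self, "_evaluate_reward"):
            self.reward = self._evaluate_reward


async def main() -> None:
    # Callers such as run_dataset no longer assign ctx.reward themselves;
    # they run the agent and read ctx.reward after the block exits.
    async with FakeEvalContext() as ctx:
        pass  # agent.run(ctx, ...) would go here
    assert ctx.reward == 0.7


asyncio.run(main())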