26 changes: 25 additions & 1 deletion hud/agents/base.py
@@ -182,7 +182,23 @@ async def run(
raise TypeError(f"ctx must be EvalContext, got {type(ctx).__name__}")

if not ctx.prompt:
raise ValueError("ctx.prompt is not set - did the scenario setup run?")
if ctx.has_scenario:
# Scenario was specified but prompt is still empty
# (e.g., scenario returned empty string, or edge case not caught in scenarios.py)
scenario = ctx._task.scenario if ctx._task else "unknown"
raise ValueError(
f"ctx.prompt is not set.\n\n"
f"Scenario '{scenario}' was specified but returned an empty prompt.\n"
f"Check that the scenario's setup function returns a non-empty string."
)
else:
# No scenario specified at all
raise ValueError(
"ctx.prompt is not set.\n\n"
"No scenario was specified in your task file.\n"
"Either add a 'scenario' field to your task, or set ctx.prompt manually "
"before running the agent."
)

# Store context for tool calls
self.ctx = ctx
@@ -194,6 +210,11 @@ async def run(
try:
result = await self._run_context(text_to_blocks(ctx.prompt), max_steps=max_steps)

# Propagate error state to context for platform visibility
if result.isError and hasattr(ctx, "error"):
error_msg = result.info.get("error") if result.info else result.content
ctx.error = Exception(str(error_msg)) if error_msg else Exception("Agent error")

# Submit final answer to context (only if scenario is running)
if result.content and ctx.has_scenario:
await ctx.submit(result.content)
@@ -202,6 +223,9 @@ async def run(

except Exception as e:
logger.exception("Error while running agent:")
# Propagate error to context for platform visibility
if hasattr(ctx, "error"):
ctx.error = e
return Trace(
reward=0.0,
done=True,
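Note: with this change a failure no longer stays buried in the returned Trace; the same exception is mirrored onto the context. A minimal sketch of the caller-side effect, where `agent` and `ctx` stand for any MCPAgent / EvalContext pair (hypothetical names, not part of this PR):

```python
# Sketch only: `agent` and `ctx` are placeholders for any MCPAgent / EvalContext
# pair; the ctx.error behaviour is what the diff above adds.
async def run_and_report(agent, ctx) -> None:
    result = await agent.run(ctx)
    if result.isError:
        # run() still returns an error Trace, but the failure is now also
        # mirrored onto ctx.error so the platform can surface it.
        print(f"Agent failed: {ctx.error}")
```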
6 changes: 5 additions & 1 deletion hud/agents/misc/response_agent.py
@@ -1,11 +1,14 @@
from __future__ import annotations

import logging
from typing import Literal

from openai import AsyncOpenAI

from hud.settings import settings

logger = logging.getLogger(__name__)

ResponseType = Literal["STOP", "CONTINUE"]

DEFAULT_SYSTEM_PROMPT = """\
@@ -97,5 +100,6 @@ async def determine_response(self, agent_message: str) -> ResponseType:
else:
return "CONTINUE"

except Exception:
except Exception as e:
logger.warning("Auto-respond failed: %s", e)
return "CONTINUE" # Default to continue on error
12 changes: 12 additions & 0 deletions hud/agents/openai_chat.py
@@ -70,6 +70,18 @@ def __init__(self, params: OpenAIChatCreateParams | None = None, **kwargs: Any)
super().__init__(params, **kwargs)
self.config: OpenAIChatConfig

if (
self.config.api_key
and self.config.base_url
and settings.hud_gateway_url in self.config.base_url
and settings.api_key
and self.config.api_key != settings.api_key
):
raise ValueError(
"OpenAIChatAgent api_key is not allowed with HUD Gateway. "
"Use HUD_API_KEY for gateway auth and BYOK headers for provider keys."
)

if self.config.openai_client is not None:
self.oai = self.config.openai_client
elif self.config.api_key is not None or self.config.base_url is not None:
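A hedged sketch of what the new guard rejects, assuming `api_key` and `base_url` are accepted as constructor keyword arguments (as the CLI's `agent_kwargs` handling suggests); note the check only fires when `settings.api_key` (HUD_API_KEY) is configured and differs from the supplied key:

```python
# Sketch, not a definitive usage example: constructor kwargs flowing into
# OpenAIChatConfig is an assumption based on the CLI's agent_kwargs.
from hud.agents.openai_chat import OpenAIChatAgent
from hud.settings import settings

try:
    OpenAIChatAgent(
        base_url=settings.hud_gateway_url,  # routing through the HUD Gateway
        api_key="sk-provider-key",          # provider key != settings.api_key -> rejected
    )
except ValueError as exc:
    print(exc)  # "OpenAIChatAgent api_key is not allowed with HUD Gateway. ..."

# Intended setup: authenticate to the gateway with HUD_API_KEY and pass
# provider credentials via BYOK headers instead.
```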
64 changes: 64 additions & 0 deletions hud/agents/tests/test_base.py
@@ -350,3 +350,67 @@ async def test_get_tool_schemas(self) -> None:
assert len(schemas) == 1
assert schemas[0]["name"] == "my_tool"
assert schemas[0]["description"] == "My tool description"


class TestMCPAgentErrorPropagation:
"""Tests for error propagation to EvalContext."""

@pytest.mark.asyncio
async def test_exception_propagates_to_ctx_error(self) -> None:
"""Test that exceptions during run() set ctx.error for platform visibility."""

class FailingAgent(MockMCPAgent):
async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse:
raise RuntimeError("Agent crashed")

ctx = MockEvalContext(prompt="Do something")
agent = FailingAgent()

result = await agent.run(ctx)

# Should return error trace
assert result.isError is True
assert result.content is not None
assert "Agent crashed" in result.content

assert ctx.error is not None
assert isinstance(ctx.error, BaseException)
assert "Agent crashed" in str(ctx.error)

@pytest.mark.asyncio
async def test_step_error_propagates_to_ctx_error(self) -> None:
"""Test that step-level errors (caught internally) set ctx.error."""
step_count = [0]

class FailOnSecondStepAgent(MockMCPAgent):
async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse:
step_count[0] += 1
if step_count[0] == 1:
return AgentResponse(
content="",
tool_calls=[MCPToolCall(name="test_tool", arguments={})],
done=False,
)
else:
raise ValueError("Step 2 failed")

ctx = MockEvalContext(prompt="Do something")
agent = FailOnSecondStepAgent()

result = await agent.run(ctx)

# Should return error trace
assert result.isError is True
assert ctx.error is not None
assert "Step 2 failed" in str(ctx.error)

@pytest.mark.asyncio
async def test_no_error_when_successful(self) -> None:
"""Test that ctx.error remains None on successful run."""
ctx = MockEvalContext(prompt="Do something")
agent = MockMCPAgent()

result = await agent.run(ctx)

assert result.isError is False
assert ctx.error is None
31 changes: 24 additions & 7 deletions hud/cli/eval.py
@@ -95,6 +95,7 @@ class AgentPreset:
# max_concurrent = 30
# max_steps = 10
# group_size = 1
# byok = false # Remote only; use encrypted env vars on the platform.
# task_ids = ["task_1", "task_2"]
# verbose = true
# very_verbose = true
@@ -158,6 +159,7 @@ class EvalConfig(BaseModel):
"verbose",
"very_verbose",
"group_size",
"byok",
"remote",
"auto_respond",
"quiet",
@@ -178,6 +180,7 @@ class EvalConfig(BaseModel):
very_verbose: bool = False
auto_respond: bool | None = None # Continue without prompting
group_size: int = 1
byok: bool = False
remote: bool = False
quiet: bool = False # Suppress opening browser for eval links
gateway: bool = False # Use HUD Gateway for LLM API calls
@@ -208,6 +211,11 @@ def _parse_agent_type(cls, v: Any) -> AgentType | None:

def validate_api_keys(self) -> None:
"""Validate required API keys for the selected agent. Raises typer.Exit on failure."""
# BYOK requires remote execution (check before agent_type guard)
if self.byok and not self.remote:
hud_console.error("--byok requires --remote (BYOK only works with remote execution)")
raise typer.Exit(1)

if self.agent_type is None:
return

@@ -284,14 +292,11 @@ def get_agent_kwargs(self) -> dict[str, Any]:
if self.model:
kwargs["model"] = self.model

if self.agent_type == AgentType.OPENAI_COMPATIBLE:
# For gateway base_url, inject HUD API key if not already set
if self.agent_type == AgentType.OPENAI_COMPATIBLE and "api_key" not in kwargs:
base_url = kwargs.get("base_url", "")
if "api_key" not in kwargs:
# Use HUD API key for gateway, otherwise fall back to OpenAI API key
if settings.hud_gateway_url in base_url:
kwargs["api_key"] = settings.api_key
elif settings.openai_api_key:
kwargs["api_key"] = settings.openai_api_key
if settings.hud_gateway_url in base_url and settings.api_key:
kwargs["api_key"] = settings.api_key

# Auto-detect Bedrock when Claude is selected with a Bedrock ARN
# Check both model and checkpoint_name for ARN patterns
@@ -565,6 +570,8 @@ def display(self) -> None:
table.add_row("remote", "[bold green]True[/bold green] (submitting to platform)")
if self.gateway:
table.add_row("gateway", "[bold green]True[/bold green] (routing via HUD Gateway)")
if self.byok:
table.add_row("byok", "[bold green]True[/bold green] (remote only)")

# Tool filters (only if set)
if self.allowed_tools:
@@ -665,6 +672,9 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]:

# Remote execution - submit to HUD platform
if cfg.remote:
agent_kwargs = {
k: v for k, v in agent_kwargs.items() if k not in ("api_key", "model_client")
}
# Create a job ID for tracking
import uuid

@@ -682,6 +692,7 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]:
agent_params=agent_kwargs,
max_steps=max_steps,
group_size=cfg.group_size,
use_byok=cfg.byok,
)

hud_console.success(f"Tasks submitted. View at: https://hud.ai/jobs/{job_id}")
@@ -765,6 +776,11 @@ def eval_command(
remote: bool = typer.Option(
False, "--remote", help="Submit tasks to platform for remote execution"
),
byok: bool = typer.Option(
False,
"--byok",
help="Remote only: use BYOK keys from encrypted env vars for inference",
),
quiet: bool = typer.Option(
False, "--quiet", "-q", help="Suppress opening browser for eval links"
),
Expand Down Expand Up @@ -802,6 +818,7 @@ def eval_command(
group_size=group_size,
config=config,
remote=remote,
byok=byok,
quiet=quiet,
gateway=gateway,
)
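The new flag is validated before any agent checks, so `byok=True` without `remote=True` aborts early. A minimal sketch of that path, assuming the remaining `EvalConfig` fields all have defaults as the diff suggests:

```python
# Sketch of the validation added above; field names come from this diff,
# everything else about constructing EvalConfig is assumed.
import typer

from hud.cli.eval import EvalConfig

cfg = EvalConfig(byok=True, remote=False)
try:
    cfg.validate_api_keys()
except typer.Exit:
    pass  # console shows: --byok requires --remote (BYOK only works with remote execution)

# The accepted pairing: both flags set, forwarded as submit_rollouts(use_byok=True).
EvalConfig(byok=True, remote=True)
```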
7 changes: 7 additions & 0 deletions hud/datasets/utils.py
@@ -51,6 +51,10 @@ class SingleTaskRequest(BaseModel):
description="Additional metadata to inject into the trace context.",
)
trace_id: str | None = Field(default=None, description="Pre-assigned trace ID.")
use_byok: bool = Field(
default=False,
description="If True, use BYOK headers from encrypted env vars for inference.",
)

@model_validator(mode="after")
def _validate_task(self) -> SingleTaskRequest:
@@ -110,6 +114,7 @@ async def submit_rollouts(
group_size: int = 1,
batch_size: int = 50,
metadata: dict[str, Any] | None = None,
use_byok: bool = False,
) -> None:
"""Submit rollouts to the HUD platform API for remote execution (fire-and-forget).

@@ -122,6 +127,7 @@ async def submit_rollouts(
group_size: Number of rollouts per task (for variance estimation)
batch_size: Number of rollouts per API batch request
metadata: Additional metadata for each rollout
use_byok: If True, use BYOK keys from encrypted env vars (remote only)
"""
from hud.eval.utils import is_v4_format

@@ -168,6 +174,7 @@ async def submit_rollouts(
trace_name=trace_name,
group_id=base_task_id if group_size > 1 else None,
metadata=metadata or {},
use_byok=use_byok,
)
)

55 changes: 46 additions & 9 deletions hud/environment/scenarios.py
@@ -179,16 +179,53 @@ async def run_scenario_setup(self, scenario_name: str, args: dict[str, Any]) ->
logger.debug("Remote scenario (adding namespace): prompt_id=%s", prompt_id)
try:
result = await self.get_prompt(prompt_id, args) # type: ignore[attr-defined]
if result.messages:
first_msg = result.messages[0]
content = first_msg.content
if hasattr(content, "text") and isinstance(content.text, str): # type: ignore[union-attr]
return content.text # type: ignore[union-attr]
elif isinstance(content, str):
return content
except Exception as e:
logger.warning("Failed to get scenario prompt: %s", e)
return None
# Fetch available scenarios for error context
try:
prompts = await self.list_prompts() # type: ignore[attr-defined]
scenario_prompts = [p.name for p in prompts if ":" in p.name]
available = (
"\n ".join(scenario_prompts) if scenario_prompts else "(none found)"
)
except Exception:
available = "(could not fetch available scenarios)"

raise ValueError(
f"Scenario not found.\n\n"
f"Scenario IDs have the format 'environment_name:scenario_name'.\n"
f"If you only specify 'scenario_name', the SDK uses your task's env name "
f"as the prefix.\n"
f"This won't work if the HUD environment was declared with a different name."
f"\n\n"
f" You requested: {scenario_name}\n"
f" SDK looked for: {prompt_id}\n\n"
f"Available scenarios:\n {available}\n\n"
f"Fix: Use one of the scenario IDs above in your task JSON."
) from e

# Validate the response (outside try/except so errors aren't wrapped)
if result.messages:
first_msg = result.messages[0]
content = first_msg.content
if hasattr(content, "text") and isinstance(content.text, str): # type: ignore[union-attr]
return content.text # type: ignore[union-attr]
elif isinstance(content, str):
return content
else:
# Content exists but is neither text object nor string
raise ValueError(
f"Scenario '{scenario_name}' returned malformed content.\n\n"
f"Expected: content with .text attribute (str) or content as str\n"
f"Got: {type(content).__name__}\n\n"
f"Check that the scenario's setup function returns a valid prompt."
)
else:
# get_prompt succeeded but returned empty messages
raise ValueError(
f"Scenario '{scenario_name}' returned an empty response.\n\n"
f"The scenario's setup function was called but returned no messages.\n"
f"Check that the scenario returns a valid prompt string."
)

async def run_scenario_evaluate(self, scenario_name: str) -> float | None:
"""Run a scenario's evaluate phase and return the reward.
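For reference, the fully qualified form the new error message asks for, as a hedged sketch; the surrounding task schema is assumed, and only the `scenario` field and the `environment_name:scenario_name` format come from the error text above:

```python
# Hypothetical task definition illustrating the ID format described above.
task = {
    # The prefix must match the name the HUD environment was declared with,
    # which is not necessarily the task's own env name.
    "scenario": "browser_env:checkout_flow",  # "environment_name:scenario_name"
    # ... other task fields omitted ...
}
```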
24 changes: 18 additions & 6 deletions hud/eval/context.py
@@ -539,12 +539,24 @@ async def __aenter__(self) -> Self:
# Connect environment (MCP servers, tools)
await super().__aenter__()

# Run task scenario setup (if created from_task with scenario)
await self._run_task_scenario_setup()

# Notify backend and print link
await self._eval_enter()
self._print_eval_link()
try:
# Run task scenario setup (if created from_task with scenario)
await self._run_task_scenario_setup()

# Notify backend and print link
await self._eval_enter()
self._print_eval_link()
except BaseException:
# Cleanup if setup fails - __aexit__ won't be called automatically
await super().__aexit__(None, None, None)
# Reset context vars
if self._token is not None:
_current_trace_headers.reset(self._token)
self._token = None
if self._api_key_token is not None:
_current_api_key.reset(self._api_key_token)
self._api_key_token = None
raise

return self

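With the cleanup above, a scenario that fails during setup now surfaces as a plain exception instead of leaking MCP connections or context vars. A sketch of the caller-side effect, where `EvalContext.from_task` and `load_task` are assumptions about the surrounding API rather than confirmed by this diff:

```python
# Sketch only: from_task/load_task and the module path are assumed helpers.
from hud.eval.context import EvalContext


async def run_one(path: str) -> None:
    try:
        async with EvalContext.from_task(load_task(path)) as ctx:  # load_task is hypothetical
            ...  # agent.run(ctx), etc.
    except ValueError as exc:
        # e.g. the "Scenario not found." error raised in scenarios.py above;
        # __aenter__ has already torn down connections and reset context vars.
        print(exc)
```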