diff --git a/hud/agents/base.py b/hud/agents/base.py
index e9dd15d8..8c3c04d5 100644
--- a/hud/agents/base.py
+++ b/hud/agents/base.py
@@ -182,7 +182,23 @@ async def run(
             raise TypeError(f"ctx must be EvalContext, got {type(ctx).__name__}")
 
         if not ctx.prompt:
-            raise ValueError("ctx.prompt is not set - did the scenario setup run?")
+            if ctx.has_scenario:
+                # Scenario was specified but prompt is still empty
+                # (e.g., scenario returned empty string, or edge case not caught in scenarios.py)
+                scenario = ctx._task.scenario if ctx._task else "unknown"
+                raise ValueError(
+                    f"ctx.prompt is not set.\n\n"
+                    f"Scenario '{scenario}' was specified but returned an empty prompt.\n"
+                    f"Check that the scenario's setup function returns a non-empty string."
+                )
+            else:
+                # No scenario specified at all
+                raise ValueError(
+                    "ctx.prompt is not set.\n\n"
+                    "No scenario was specified in your task file.\n"
+                    "Either add a 'scenario' field to your task, or set ctx.prompt manually "
+                    "before running the agent."
+                )
 
         # Store context for tool calls
         self.ctx = ctx
@@ -194,6 +210,11 @@ async def run(
         try:
             result = await self._run_context(text_to_blocks(ctx.prompt), max_steps=max_steps)
 
+            # Propagate error state to context for platform visibility
+            if result.isError and hasattr(ctx, "error"):
+                error_msg = result.info.get("error") if result.info else result.content
+                ctx.error = Exception(str(error_msg)) if error_msg else Exception("Agent error")
+
             # Submit final answer to context (only if scenario is running)
             if result.content and ctx.has_scenario:
                 await ctx.submit(result.content)
@@ -202,6 +223,9 @@ async def run(
         except Exception as e:
             logger.exception("Error while running agent:")
+            # Propagate error to context for platform visibility
+            if hasattr(ctx, "error"):
+                ctx.error = e
             return Trace(
                 reward=0.0,
                 done=True,
diff --git a/hud/agents/misc/response_agent.py b/hud/agents/misc/response_agent.py
index 528f45ef..617db6a0 100644
--- a/hud/agents/misc/response_agent.py
+++ b/hud/agents/misc/response_agent.py
@@ -1,11 +1,14 @@
 from __future__ import annotations
 
+import logging
 from typing import Literal
 
 from openai import AsyncOpenAI
 
 from hud.settings import settings
 
+logger = logging.getLogger(__name__)
+
 ResponseType = Literal["STOP", "CONTINUE"]
 
 DEFAULT_SYSTEM_PROMPT = """\
@@ -97,5 +100,6 @@ async def determine_response(self, agent_message: str) -> ResponseType:
             else:
                 return "CONTINUE"
 
-        except Exception:
+        except Exception as e:
+            logger.warning("Auto-respond failed: %s", e)
             return "CONTINUE"  # Default to continue on error
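The base.py change above makes a failure visible in two places: the returned Trace (as before) and the new ctx.error attribute. A caller-side sketch of what that looks like — `report_failure` is illustrative, and the agent and context are assumed to come from the host application rather than being constructed here:

    from typing import Any

    async def report_failure(agent: Any, ctx: Any) -> None:
        # `agent` is any MCPAgent subclass, `ctx` an EvalContext.
        result = await agent.run(ctx)
        if result.isError:
            # The returned Trace still reports the failure, as before...
            print("trace error:", result.content)
            # ...and the exception is now also mirrored onto the context,
            # which is what the platform reads for run visibility.
            print("ctx error:", ctx.error)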
+            )
+
         if self.config.openai_client is not None:
             self.oai = self.config.openai_client
         elif self.config.api_key is not None or self.config.base_url is not None:
diff --git a/hud/agents/tests/test_base.py b/hud/agents/tests/test_base.py
index fc2d4706..56565927 100644
--- a/hud/agents/tests/test_base.py
+++ b/hud/agents/tests/test_base.py
@@ -350,3 +350,67 @@ async def test_get_tool_schemas(self) -> None:
         assert len(schemas) == 1
         assert schemas[0]["name"] == "my_tool"
         assert schemas[0]["description"] == "My tool description"
+
+
+class TestMCPAgentErrorPropagation:
+    """Tests for error propagation to EvalContext."""
+
+    @pytest.mark.asyncio
+    async def test_exception_propagates_to_ctx_error(self) -> None:
+        """Test that exceptions during run() set ctx.error for platform visibility."""
+
+        class FailingAgent(MockMCPAgent):
+            async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse:
+                raise RuntimeError("Agent crashed")
+
+        ctx = MockEvalContext(prompt="Do something")
+        agent = FailingAgent()
+
+        result = await agent.run(ctx)
+
+        # Should return error trace
+        assert result.isError is True
+        assert result.content is not None
+        assert "Agent crashed" in result.content
+
+        assert ctx.error is not None
+        assert isinstance(ctx.error, BaseException)
+        assert "Agent crashed" in str(ctx.error)
+
+    @pytest.mark.asyncio
+    async def test_step_error_propagates_to_ctx_error(self) -> None:
+        """Test that step-level errors (caught internally) set ctx.error."""
+        step_count = [0]
+
+        class FailOnSecondStepAgent(MockMCPAgent):
+            async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse:
+                step_count[0] += 1
+                if step_count[0] == 1:
+                    return AgentResponse(
+                        content="",
+                        tool_calls=[MCPToolCall(name="test_tool", arguments={})],
+                        done=False,
+                    )
+                else:
+                    raise ValueError("Step 2 failed")
+
+        ctx = MockEvalContext(prompt="Do something")
+        agent = FailOnSecondStepAgent()
+
+        result = await agent.run(ctx)
+
+        # Should return error trace
+        assert result.isError is True
+        assert ctx.error is not None
+        assert "Step 2 failed" in str(ctx.error)
+
+    @pytest.mark.asyncio
+    async def test_no_error_when_successful(self) -> None:
+        """Test that ctx.error remains None on successful run."""
+        ctx = MockEvalContext(prompt="Do something")
+        agent = MockMCPAgent()
+
+        result = await agent.run(ctx)
+
+        assert result.isError is False
+        assert ctx.error is None
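The new guard in openai_chat.py fails fast when a provider key is pointed at the HUD Gateway. A minimal sketch, assuming `base_url` and `api_key` are accepted as keyword params the way the config fields above suggest; "sk-provider-key" is a placeholder:

    from hud.agents.openai_chat import OpenAIChatAgent
    from hud.settings import settings

    # Fine: gateway base_url with the HUD API key (or no api_key at all).
    agent = OpenAIChatAgent(base_url=settings.hud_gateway_url, api_key=settings.api_key)

    # Rejected: a provider key must not be sent to the gateway.
    try:
        OpenAIChatAgent(base_url=settings.hud_gateway_url, api_key="sk-provider-key")
    except ValueError as e:
        print(e)  # "OpenAIChatAgent api_key is not allowed with HUD Gateway. ..."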
diff --git a/hud/cli/eval.py b/hud/cli/eval.py
index 25b1b593..eb13ce34 100644
--- a/hud/cli/eval.py
+++ b/hud/cli/eval.py
@@ -95,6 +95,7 @@ class AgentPreset:
 # max_concurrent = 30
 # max_steps = 10
 # group_size = 1
+# byok = false  # Remote only; use encrypted env vars on the platform.
 # task_ids = ["task_1", "task_2"]
 # verbose = true
 # very_verbose = true
@@ -158,6 +159,7 @@ class EvalConfig(BaseModel):
         "verbose",
         "very_verbose",
         "group_size",
+        "byok",
         "remote",
         "auto_respond",
         "quiet",
@@ -178,6 +180,7 @@ class EvalConfig(BaseModel):
     very_verbose: bool = False
     auto_respond: bool | None = None  # Continue without prompting
     group_size: int = 1
+    byok: bool = False
     remote: bool = False
     quiet: bool = False  # Suppress opening browser for eval links
     gateway: bool = False  # Use HUD Gateway for LLM API calls
@@ -208,6 +211,11 @@ def _parse_agent_type(cls, v: Any) -> AgentType | None:
     def validate_api_keys(self) -> None:
         """Validate required API keys for the selected agent.
 
         Raises typer.Exit on failure."""
+        # BYOK requires remote execution (check before agent_type guard)
+        if self.byok and not self.remote:
+            hud_console.error("--byok requires --remote (BYOK only works with remote execution)")
+            raise typer.Exit(1)
+
         if self.agent_type is None:
             return
@@ -284,14 +292,11 @@ def get_agent_kwargs(self) -> dict[str, Any]:
         if self.model:
             kwargs["model"] = self.model
 
-        if self.agent_type == AgentType.OPENAI_COMPATIBLE:
+        # For gateway base_url, inject HUD API key if not already set
+        if self.agent_type == AgentType.OPENAI_COMPATIBLE and "api_key" not in kwargs:
             base_url = kwargs.get("base_url", "")
-            if "api_key" not in kwargs:
-                # Use HUD API key for gateway, otherwise fall back to OpenAI API key
-                if settings.hud_gateway_url in base_url:
-                    kwargs["api_key"] = settings.api_key
-                elif settings.openai_api_key:
-                    kwargs["api_key"] = settings.openai_api_key
+            if settings.hud_gateway_url in base_url and settings.api_key:
+                kwargs["api_key"] = settings.api_key
 
         # Auto-detect Bedrock when Claude is selected with a Bedrock ARN
         # Check both model and checkpoint_name for ARN patterns
@@ -565,6 +570,8 @@ def display(self) -> None:
             table.add_row("remote", "[bold green]True[/bold green] (submitting to platform)")
         if self.gateway:
             table.add_row("gateway", "[bold green]True[/bold green] (routing via HUD Gateway)")
+        if self.byok:
+            table.add_row("byok", "[bold green]True[/bold green] (remote only)")
 
         # Tool filters (only if set)
         if self.allowed_tools:
@@ -665,6 +672,9 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]:
     # Remote execution - submit to HUD platform
     if cfg.remote:
+        agent_kwargs = {
+            k: v for k, v in agent_kwargs.items() if k not in ("api_key", "model_client")
+        }
         # Create a job ID for tracking
         import uuid
@@ -682,6 +692,7 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]:
             agent_params=agent_kwargs,
             max_steps=max_steps,
             group_size=cfg.group_size,
+            use_byok=cfg.byok,
         )
 
         hud_console.success(f"Tasks submitted. View at: https://hud.ai/jobs/{job_id}")
@@ -765,6 +776,11 @@ def eval_command(
     remote: bool = typer.Option(
         False, "--remote", help="Submit tasks to platform for remote execution"
     ),
+    byok: bool = typer.Option(
+        False,
+        "--byok",
+        help="Remote only: use BYOK keys from encrypted env vars for inference",
+    ),
     quiet: bool = typer.Option(
         False, "--quiet", "-q", help="Suppress opening browser for eval links"
     ),
@@ -802,6 +818,7 @@ def eval_command(
         group_size=group_size,
         config=config,
         remote=remote,
+        byok=byok,
         quiet=quiet,
         gateway=gateway,
     )
diff --git a/hud/datasets/utils.py b/hud/datasets/utils.py
index fabcbfa9..72f0ec0a 100644
--- a/hud/datasets/utils.py
+++ b/hud/datasets/utils.py
@@ -51,6 +51,10 @@ class SingleTaskRequest(BaseModel):
         description="Additional metadata to inject into the trace context.",
     )
     trace_id: str | None = Field(default=None, description="Pre-assigned trace ID.")
+    use_byok: bool = Field(
+        default=False,
+        description="If True, use BYOK headers from encrypted env vars for inference.",
+    )
 
     @model_validator(mode="after")
     def _validate_task(self) -> SingleTaskRequest:
@@ -110,6 +114,7 @@ async def submit_rollouts(
     group_size: int = 1,
     batch_size: int = 50,
     metadata: dict[str, Any] | None = None,
+    use_byok: bool = False,
 ) -> None:
     """Submit rollouts to the HUD platform API for remote execution (fire-and-forget).
 
@@ -122,6 +127,7 @@ async def submit_rollouts(
         group_size: Number of rollouts per task (for variance estimation)
         batch_size: Number of rollouts per API batch request
         metadata: Additional metadata for each rollout
+        use_byok: If True, use BYOK keys from encrypted env vars (remote only)
     """
     from hud.eval.utils import is_v4_format
@@ -168,6 +174,7 @@ async def submit_rollouts(
                 trace_name=trace_name,
                 group_id=base_task_id if group_size > 1 else None,
                 metadata=metadata or {},
+                use_byok=use_byok,
             )
         )
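BYOK only makes sense with remote execution, which validate_api_keys() now enforces before any agent checks run; on the CLI this corresponds to passing --remote together with --byok. A sketch of the validation rule, assuming EvalConfig's remaining fields can stay at their defaults (not verified against the full model definition):

    from hud.cli.eval import EvalConfig

    cfg = EvalConfig(byok=True, remote=False)
    cfg.validate_api_keys()  # prints the error and raises typer.Exit(1)

    cfg = EvalConfig(byok=True, remote=True)
    cfg.validate_api_keys()  # passes the BYOK check; provider keys are then
                             # resolved from encrypted env vars on the platform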
diff --git a/hud/environment/scenarios.py b/hud/environment/scenarios.py
index 1369ea37..dd56373e 100644
--- a/hud/environment/scenarios.py
+++ b/hud/environment/scenarios.py
@@ -179,16 +179,53 @@ async def run_scenario_setup(self, scenario_name: str, args: dict[str, Any]) ->
             logger.debug("Remote scenario (adding namespace): prompt_id=%s", prompt_id)
 
         try:
             result = await self.get_prompt(prompt_id, args)  # type: ignore[attr-defined]
-            if result.messages:
-                first_msg = result.messages[0]
-                content = first_msg.content
-                if hasattr(content, "text") and isinstance(content.text, str):  # type: ignore[union-attr]
-                    return content.text  # type: ignore[union-attr]
-                elif isinstance(content, str):
-                    return content
         except Exception as e:
-            logger.warning("Failed to get scenario prompt: %s", e)
-            return None
+            # Fetch available scenarios for error context
+            try:
+                prompts = await self.list_prompts()  # type: ignore[attr-defined]
+                scenario_prompts = [p.name for p in prompts if ":" in p.name]
+                available = (
+                    "\n ".join(scenario_prompts) if scenario_prompts else "(none found)"
+                )
+            except Exception:
+                available = "(could not fetch available scenarios)"
+
+            raise ValueError(
+                f"Scenario not found.\n\n"
+                f"Scenario IDs have the format 'environment_name:scenario_name'.\n"
+                f"If you only specify 'scenario_name', the SDK uses your task's env name "
+                f"as the prefix.\n"
+                f"This won't work if the HUD environment was declared with a different name."
+                f"\n\n"
+                f" You requested: {scenario_name}\n"
+                f" SDK looked for: {prompt_id}\n\n"
+                f"Available scenarios:\n {available}\n\n"
+                f"Fix: Use one of the scenario IDs above in your task JSON."
+            ) from e
+
+        # Validate the response (outside try/except so errors aren't wrapped)
+        if result.messages:
+            first_msg = result.messages[0]
+            content = first_msg.content
+            if hasattr(content, "text") and isinstance(content.text, str):  # type: ignore[union-attr]
+                return content.text  # type: ignore[union-attr]
+            elif isinstance(content, str):
+                return content
+            else:
+                # Content exists but is neither text object nor string
+                raise ValueError(
+                    f"Scenario '{scenario_name}' returned malformed content.\n\n"
+                    f"Expected: content with .text attribute (str) or content as str\n"
+                    f"Got: {type(content).__name__}\n\n"
+                    f"Check that the scenario's setup function returns a valid prompt."
+                )
+        else:
+            # get_prompt succeeded but returned empty messages
+            raise ValueError(
+                f"Scenario '{scenario_name}' returned an empty response.\n\n"
+                f"The scenario's setup function was called but returned no messages.\n"
+                f"Check that the scenario returns a valid prompt string."
+            )
 
     async def run_scenario_evaluate(self, scenario_name: str) -> float | None:
         """Run a scenario's evaluate phase and return the reward.
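The new scenarios.py error message documents how bare scenario names resolve: the SDK namespaces them with the task's environment name before lookup. A runnable sketch of that resolution rule with illustrative values ("checkout", "browser", and "web" are made up; the real prefixing happens above the hunk shown here):

    scenario = "checkout"   # bare name from the task JSON
    env_name = "browser"    # name the task gave the HUD environment
    prompt_id = scenario if ":" in scenario else f"{env_name}:{scenario}"
    assert prompt_id == "browser:checkout"

    # If the environment was actually declared as, say, "web", the lookup for
    # "browser:checkout" fails, and the new ValueError lists what does exist
    # (e.g. "web:checkout") so the task JSON can be corrected.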
diff --git a/hud/eval/context.py b/hud/eval/context.py
index 20b8b943..77bb752d 100644
--- a/hud/eval/context.py
+++ b/hud/eval/context.py
@@ -539,12 +539,24 @@ async def __aenter__(self) -> Self:
         # Connect environment (MCP servers, tools)
         await super().__aenter__()
 
-        # Run task scenario setup (if created from_task with scenario)
-        await self._run_task_scenario_setup()
-
-        # Notify backend and print link
-        await self._eval_enter()
-        self._print_eval_link()
+        try:
+            # Run task scenario setup (if created from_task with scenario)
+            await self._run_task_scenario_setup()
+
+            # Notify backend and print link
+            await self._eval_enter()
+            self._print_eval_link()
+        except BaseException:
+            # Cleanup if setup fails - __aexit__ won't be called automatically
+            await super().__aexit__(None, None, None)
+            # Reset context vars
+            if self._token is not None:
+                _current_trace_headers.reset(self._token)
+                self._token = None
+            if self._api_key_token is not None:
+                _current_api_key.reset(self._api_key_token)
+                self._api_key_token = None
+            raise
 
         return self
diff --git a/hud/eval/instrument.py b/hud/eval/instrument.py
index e950522c..94598f1f 100644
--- a/hud/eval/instrument.py
+++ b/hud/eval/instrument.py
@@ -1,6 +1,6 @@
-"""Auto-instrumentation for httpx to inject trace headers.
+"""Auto-instrumentation for httpx and aiohttp to inject trace headers.
 
-This module patches httpx clients to automatically add:
+This module patches HTTP clients to automatically add:
 - Trace-Id headers when inside an eval context
 - Authorization headers for HUD API calls
 """
@@ -8,9 +8,12 @@
 from __future__ import annotations
 
 import logging
-from typing import Any
+from typing import TYPE_CHECKING, Any
 from urllib.parse import urlparse
 
+if TYPE_CHECKING:
+    from types import SimpleNamespace
+
 from hud.settings import settings
 
 logger = logging.getLogger(__name__)
@@ -70,7 +73,7 @@ async def _async_httpx_request_hook(request: Any) -> None:
     _httpx_request_hook(request)
 
 
-def _instrument_client(client: Any) -> None:
+def _instrument_httpx_client(client: Any) -> None:
     """Add trace hook to an httpx client instance."""
     is_async = hasattr(client, "aclose")
     hook = _async_httpx_request_hook if is_async else _httpx_request_hook
@@ -93,7 +96,7 @@ def _patch_httpx() -> None:
 
     def _patched_async_init(self: Any, *args: Any, **kwargs: Any) -> None:
         _original_async_init(self, *args, **kwargs)
-        _instrument_client(self)
+        _instrument_httpx_client(self)
 
     httpx.AsyncClient.__init__ = _patched_async_init  # type: ignore[method-assign]
 
@@ -101,15 +104,65 @@ def _patched_async_init(self: Any, *args: Any, **kwargs: Any) -> None:
     def _patched_sync_init(self: Any, *args: Any, **kwargs: Any) -> None:
         _original_sync_init(self, *args, **kwargs)
-        _instrument_client(self)
+        _instrument_httpx_client(self)
 
     httpx.Client.__init__ = _patched_sync_init  # type: ignore[method-assign]
 
     logger.debug("httpx auto-instrumentation enabled")
 
 
-# Auto-patch httpx on module import
+def _patch_aiohttp() -> None:
+    """
+    Monkey-patch aiohttp to auto-instrument all ClientSession instances.
+    This is important for the Gemini client in particular, which uses aiohttp by default.
+    """
+    try:
+        import aiohttp
+    except ImportError:
+        logger.debug("aiohttp not installed, skipping auto-instrumentation")
+        return
+
+    async def on_request_start(
+        _session: aiohttp.ClientSession,
+        _trace_config_ctx: SimpleNamespace,
+        params: aiohttp.TraceRequestStartParams,
+    ) -> None:
+        """aiohttp trace hook that adds trace headers and auth to HUD requests."""
+        url_str = str(params.url)
+        if not _is_hud_url(url_str):
+            return
+
+        trace_headers = _get_trace_headers()
+        if trace_headers is not None:
+            for key, value in trace_headers.items():
+                params.headers[key] = value
+            logger.debug("Added trace headers to aiohttp request: %s", url_str)
+
+        has_auth = "authorization" in {k.lower() for k in params.headers}
+        if not has_auth and settings.api_key:
+            params.headers["Authorization"] = f"Bearer {settings.api_key}"
+            logger.debug("Added API key auth to aiohttp request: %s", url_str)
+
+    trace_config = aiohttp.TraceConfig()
+    trace_config.on_request_start.append(on_request_start)
+
+    _original_init = aiohttp.ClientSession.__init__
+
+    def _patched_init(self: aiohttp.ClientSession, *args: Any, **kwargs: Any) -> None:
+        existing_traces = kwargs.get("trace_configs") or []
+        if trace_config not in existing_traces:
+            existing_traces = [*list(existing_traces), trace_config]
+        kwargs["trace_configs"] = existing_traces
+        _original_init(self, *args, **kwargs)
+
+    aiohttp.ClientSession.__init__ = _patched_init  # type: ignore[method-assign]
+
+    logger.debug("aiohttp auto-instrumentation enabled")
+
+
+# Auto-patch on module import
 _patch_httpx()
+_patch_aiohttp()
 
-__all__ = ["_patch_httpx"]
+__all__ = ["_patch_aiohttp", "_patch_httpx"]
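With the instrument.py patch in place, any aiohttp.ClientSession created after the module is imported carries the TraceConfig, so HUD-bound requests pick up Trace-Id and Authorization headers with no per-call changes; other URLs are left untouched by the hook. A minimal sketch (placeholder URL; assumes the hud package and aiohttp are installed and HUD_API_KEY is configured):

    import asyncio

    import aiohttp

    import hud.eval.instrument  # noqa: F401  # patches ClientSession.__init__ on import


    async def main() -> None:
        async with aiohttp.ClientSession() as session:  # trace config injected here
            async with session.get("https://example.com/health") as resp:
                print(resp.status)  # headers are only added for HUD URLs


    asyncio.run(main())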
+ """ + try: + import aiohttp + except ImportError: + logger.debug("aiohttp not installed, skipping auto-instrumentation") + return + + async def on_request_start( + _session: aiohttp.ClientSession, + _trace_config_ctx: SimpleNamespace, + params: aiohttp.TraceRequestStartParams, + ) -> None: + """aiohttp trace hook that adds trace headers and auth to HUD requests.""" + url_str = str(params.url) + if not _is_hud_url(url_str): + return + + trace_headers = _get_trace_headers() + if trace_headers is not None: + for key, value in trace_headers.items(): + params.headers[key] = value + logger.debug("Added trace headers to aiohttp request: %s", url_str) + + has_auth = "authorization" in {k.lower() for k in params.headers} + if not has_auth and settings.api_key: + params.headers["Authorization"] = f"Bearer {settings.api_key}" + logger.debug("Added API key auth to aiohttp request: %s", url_str) + + trace_config = aiohttp.TraceConfig() + trace_config.on_request_start.append(on_request_start) + + _original_init = aiohttp.ClientSession.__init__ + + def _patched_init(self: aiohttp.ClientSession, *args: Any, **kwargs: Any) -> None: + existing_traces = kwargs.get("trace_configs") or [] + if trace_config not in existing_traces: + existing_traces = [*list(existing_traces), trace_config] + kwargs["trace_configs"] = existing_traces + _original_init(self, *args, **kwargs) + + aiohttp.ClientSession.__init__ = _patched_init # type: ignore[method-assign] + + logger.debug("aiohttp auto-instrumentation enabled") + + +# Auto-patch on module import _patch_httpx() +_patch_aiohttp() -__all__ = ["_patch_httpx"] +__all__ = ["_patch_aiohttp", "_patch_httpx"] diff --git a/hud/utils/hud_console.py b/hud/utils/hud_console.py index d28e87ce..e7fa1f14 100644 --- a/hud/utils/hud_console.py +++ b/hud/utils/hud_console.py @@ -20,9 +20,6 @@ import traceback from typing import TYPE_CHECKING, Any, Literal, Self -import questionary -import typer -from questionary import Style from rich.console import Console from rich.panel import Panel from rich.table import Table @@ -488,6 +485,9 @@ def select( Returns: The selected choice value """ + import questionary + from questionary import Style + # Convert choices to questionary format q_choices = [] @@ -518,6 +518,8 @@ def select( # If no selection made (Ctrl+C or ESC), exit if result is None: + import typer + raise typer.Exit(1) return result @@ -573,6 +575,8 @@ def confirm(self, message: str, default: bool = True) -> bool: message: The confirmation message default: If True, the default choice is True """ + import questionary + return questionary.confirm(message, default=default).ask() # Symbol-based output methods