26 changes: 25 additions & 1 deletion hud/agents/base.py
@@ -182,7 +182,23 @@ async def run(
raise TypeError(f"ctx must be EvalContext, got {type(ctx).__name__}")

if not ctx.prompt:
raise ValueError("ctx.prompt is not set - did the scenario setup run?")
if ctx.has_scenario:
# Scenario was specified but prompt is still empty
# (e.g., scenario returned empty string, or edge case not caught in scenarios.py)
scenario = ctx._task.scenario if ctx._task else "unknown"
raise ValueError(
f"ctx.prompt is not set.\n\n"
f"Scenario '{scenario}' was specified but returned an empty prompt.\n"
f"Check that the scenario's setup function returns a non-empty string."
)
else:
# No scenario specified at all
raise ValueError(
"ctx.prompt is not set.\n\n"
"No scenario was specified in your task file.\n"
"Either add a 'scenario' field to your task, or set ctx.prompt manually "
"before running the agent."
)

# Store context for tool calls
self.ctx = ctx
@@ -194,6 +210,11 @@ async def run(
try:
result = await self._run_context(text_to_blocks(ctx.prompt), max_steps=max_steps)

# Propagate error state to context for platform visibility
if result.isError and hasattr(ctx, "error"):
error_msg = result.info.get("error") if result.info else result.content
ctx.error = Exception(str(error_msg)) if error_msg else Exception("Agent error")

# Submit final answer to context (only if scenario is running)
if result.content and ctx.has_scenario:
await ctx.submit(result.content)
@@ -202,6 +223,9 @@ async def run(

except Exception as e:
logger.exception("Error while running agent:")
# Propagate error to context for platform visibility
if hasattr(ctx, "error"):
ctx.error = e
return Trace(
reward=0.0,
done=True,
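Note: with this change a failure no longer stays buried in the returned Trace; the same exception is mirrored onto the context. A minimal sketch of the caller-side effect, where `agent` and `ctx` stand for any MCPAgent / EvalContext pair (hypothetical names, not part of this PR):

```python
# Sketch only: `agent` and `ctx` are placeholders for any MCPAgent / EvalContext
# pair; the ctx.error behaviour is what the diff above adds.
async def run_and_report(agent, ctx) -> None:
    result = await agent.run(ctx)
    if result.isError:
        # run() still returns an error Trace, but the failure is now also
        # mirrored onto ctx.error so the platform can surface it.
        print(f"Agent failed: {ctx.error}")
```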
6 changes: 5 additions & 1 deletion hud/agents/misc/response_agent.py
@@ -1,11 +1,14 @@
from __future__ import annotations

import logging
from typing import Literal

from openai import AsyncOpenAI

from hud.settings import settings

logger = logging.getLogger(__name__)

ResponseType = Literal["STOP", "CONTINUE"]

DEFAULT_SYSTEM_PROMPT = """\
@@ -97,5 +100,6 @@ async def determine_response(self, agent_message: str) -> ResponseType:
else:
return "CONTINUE"

except Exception:
except Exception as e:
logger.warning("Auto-respond failed: %s", e)
return "CONTINUE" # Default to continue on error
12 changes: 12 additions & 0 deletions hud/agents/openai_chat.py
@@ -70,6 +70,18 @@ def __init__(self, params: OpenAIChatCreateParams | None = None, **kwargs: Any)
super().__init__(params, **kwargs)
self.config: OpenAIChatConfig

if (
self.config.api_key
and self.config.base_url
and settings.hud_gateway_url in self.config.base_url
and settings.api_key
and self.config.api_key != settings.api_key
):
raise ValueError(
"OpenAIChatAgent api_key is not allowed with HUD Gateway. "
"Use HUD_API_KEY for gateway auth and BYOK headers for provider keys."
)

if self.config.openai_client is not None:
self.oai = self.config.openai_client
elif self.config.api_key is not None or self.config.base_url is not None:
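A hedged sketch of what the new guard rejects, assuming `api_key` and `base_url` are accepted as constructor keyword arguments (as the CLI's `agent_kwargs` handling suggests); note the check only fires when `settings.api_key` (HUD_API_KEY) is configured and differs from the supplied key:

```python
# Sketch, not a definitive usage example: constructor kwargs flowing into
# OpenAIChatConfig is an assumption based on the CLI's agent_kwargs.
from hud.agents.openai_chat import OpenAIChatAgent
from hud.settings import settings

try:
    OpenAIChatAgent(
        base_url=settings.hud_gateway_url,  # routing through the HUD Gateway
        api_key="sk-provider-key",          # provider key != settings.api_key -> rejected
    )
except ValueError as exc:
    print(exc)  # "OpenAIChatAgent api_key is not allowed with HUD Gateway. ..."

# Intended setup: authenticate to the gateway with HUD_API_KEY and pass
# provider credentials via BYOK headers instead.
```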
64 changes: 64 additions & 0 deletions hud/agents/tests/test_base.py
@@ -350,3 +350,67 @@ async def test_get_tool_schemas(self) -> None:
assert len(schemas) == 1
assert schemas[0]["name"] == "my_tool"
assert schemas[0]["description"] == "My tool description"


class TestMCPAgentErrorPropagation:
"""Tests for error propagation to EvalContext."""

@pytest.mark.asyncio
async def test_exception_propagates_to_ctx_error(self) -> None:
"""Test that exceptions during run() set ctx.error for platform visibility."""

class FailingAgent(MockMCPAgent):
async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse:
raise RuntimeError("Agent crashed")

ctx = MockEvalContext(prompt="Do something")
agent = FailingAgent()

result = await agent.run(ctx)

# Should return error trace
assert result.isError is True
assert result.content is not None
assert "Agent crashed" in result.content

assert ctx.error is not None
assert isinstance(ctx.error, BaseException)
assert "Agent crashed" in str(ctx.error)

@pytest.mark.asyncio
async def test_step_error_propagates_to_ctx_error(self) -> None:
"""Test that step-level errors (caught internally) set ctx.error."""
step_count = [0]

class FailOnSecondStepAgent(MockMCPAgent):
async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse:
step_count[0] += 1
if step_count[0] == 1:
return AgentResponse(
content="",
tool_calls=[MCPToolCall(name="test_tool", arguments={})],
done=False,
)
else:
raise ValueError("Step 2 failed")

ctx = MockEvalContext(prompt="Do something")
agent = FailOnSecondStepAgent()

result = await agent.run(ctx)

# Should return error trace
assert result.isError is True
assert ctx.error is not None
assert "Step 2 failed" in str(ctx.error)

@pytest.mark.asyncio
async def test_no_error_when_successful(self) -> None:
"""Test that ctx.error remains None on successful run."""
ctx = MockEvalContext(prompt="Do something")
agent = MockMCPAgent()

result = await agent.run(ctx)

assert result.isError is False
assert ctx.error is None
31 changes: 24 additions & 7 deletions hud/cli/eval.py
@@ -95,6 +95,7 @@ class AgentPreset:
# max_concurrent = 30
# max_steps = 10
# group_size = 1
# byok = false # Remote only; use encrypted env vars on the platform.
# task_ids = ["task_1", "task_2"]
# verbose = true
# very_verbose = true
@@ -158,6 +159,7 @@ class EvalConfig(BaseModel):
"verbose",
"very_verbose",
"group_size",
"byok",
"remote",
"auto_respond",
"quiet",
@@ -178,6 +180,7 @@ class EvalConfig(BaseModel):
very_verbose: bool = False
auto_respond: bool | None = None # Continue without prompting
group_size: int = 1
byok: bool = False
remote: bool = False
quiet: bool = False # Suppress opening browser for eval links
gateway: bool = False # Use HUD Gateway for LLM API calls
@@ -208,6 +211,11 @@ def _parse_agent_type(cls, v: Any) -> AgentType | None:

def validate_api_keys(self) -> None:
"""Validate required API keys for the selected agent. Raises typer.Exit on failure."""
# BYOK requires remote execution (check before agent_type guard)
if self.byok and not self.remote:
hud_console.error("--byok requires --remote (BYOK only works with remote execution)")
raise typer.Exit(1)

if self.agent_type is None:
return

@@ -284,14 +292,11 @@ def get_agent_kwargs(self) -> dict[str, Any]:
if self.model:
kwargs["model"] = self.model

if self.agent_type == AgentType.OPENAI_COMPATIBLE:
# For gateway base_url, inject HUD API key if not already set
if self.agent_type == AgentType.OPENAI_COMPATIBLE and "api_key" not in kwargs:
base_url = kwargs.get("base_url", "")
if "api_key" not in kwargs:
# Use HUD API key for gateway, otherwise fall back to OpenAI API key
if settings.hud_gateway_url in base_url:
kwargs["api_key"] = settings.api_key
elif settings.openai_api_key:
kwargs["api_key"] = settings.openai_api_key
if settings.hud_gateway_url in base_url and settings.api_key:
kwargs["api_key"] = settings.api_key

# Auto-detect Bedrock when Claude is selected with a Bedrock ARN
# Check both model and checkpoint_name for ARN patterns
@@ -565,6 +570,8 @@ def display(self) -> None:
table.add_row("remote", "[bold green]True[/bold green] (submitting to platform)")
if self.gateway:
table.add_row("gateway", "[bold green]True[/bold green] (routing via HUD Gateway)")
if self.byok:
table.add_row("byok", "[bold green]True[/bold green] (remote only)")

# Tool filters (only if set)
if self.allowed_tools:
@@ -665,6 +672,9 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]:

# Remote execution - submit to HUD platform
if cfg.remote:
agent_kwargs = {
k: v for k, v in agent_kwargs.items() if k not in ("api_key", "model_client")
}
# Create a job ID for tracking
import uuid

@@ -682,6 +692,7 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]:
agent_params=agent_kwargs,
max_steps=max_steps,
group_size=cfg.group_size,
use_byok=cfg.byok,
)

hud_console.success(f"Tasks submitted. View at: https://hud.ai/jobs/{job_id}")
@@ -765,6 +776,11 @@ def eval_command(
remote: bool = typer.Option(
False, "--remote", help="Submit tasks to platform for remote execution"
),
byok: bool = typer.Option(
False,
"--byok",
help="Remote only: use BYOK keys from encrypted env vars for inference",
),
quiet: bool = typer.Option(
False, "--quiet", "-q", help="Suppress opening browser for eval links"
),
Expand Down Expand Up @@ -802,6 +818,7 @@ def eval_command(
group_size=group_size,
config=config,
remote=remote,
byok=byok,
quiet=quiet,
gateway=gateway,
)
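The new flag is validated before any agent checks, so `byok=True` without `remote=True` aborts early. A minimal sketch of that path, assuming the remaining `EvalConfig` fields all have defaults as the diff suggests:

```python
# Sketch of the validation added above; field names come from this diff,
# everything else about constructing EvalConfig is assumed.
import typer

from hud.cli.eval import EvalConfig

cfg = EvalConfig(byok=True, remote=False)
try:
    cfg.validate_api_keys()
except typer.Exit:
    pass  # console shows: --byok requires --remote (BYOK only works with remote execution)

# The accepted pairing: both flags set, forwarded as submit_rollouts(use_byok=True).
EvalConfig(byok=True, remote=True)
```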
7 changes: 7 additions & 0 deletions hud/datasets/utils.py
@@ -51,6 +51,10 @@ class SingleTaskRequest(BaseModel):
description="Additional metadata to inject into the trace context.",
)
trace_id: str | None = Field(default=None, description="Pre-assigned trace ID.")
use_byok: bool = Field(
default=False,
description="If True, use BYOK headers from encrypted env vars for inference.",
)

@model_validator(mode="after")
def _validate_task(self) -> SingleTaskRequest:
@@ -110,6 +114,7 @@ async def submit_rollouts(
group_size: int = 1,
batch_size: int = 50,
metadata: dict[str, Any] | None = None,
use_byok: bool = False,
) -> None:
"""Submit rollouts to the HUD platform API for remote execution (fire-and-forget).

@@ -122,6 +127,7 @@ async def submit_rollouts(
group_size: Number of rollouts per task (for variance estimation)
batch_size: Number of rollouts per API batch request
metadata: Additional metadata for each rollout
use_byok: If True, use BYOK keys from encrypted env vars (remote only)
"""
from hud.eval.utils import is_v4_format

@@ -168,6 +174,7 @@ async def submit_rollouts(
trace_name=trace_name,
group_id=base_task_id if group_size > 1 else None,
metadata=metadata or {},
use_byok=use_byok,
)
)

55 changes: 46 additions & 9 deletions hud/environment/scenarios.py
@@ -179,16 +179,53 @@ async def run_scenario_setup(self, scenario_name: str, args: dict[str, Any]) ->
logger.debug("Remote scenario (adding namespace): prompt_id=%s", prompt_id)
try:
result = await self.get_prompt(prompt_id, args) # type: ignore[attr-defined]
if result.messages:
first_msg = result.messages[0]
content = first_msg.content
if hasattr(content, "text") and isinstance(content.text, str): # type: ignore[union-attr]
return content.text # type: ignore[union-attr]
elif isinstance(content, str):
return content
except Exception as e:
logger.warning("Failed to get scenario prompt: %s", e)
return None
# Fetch available scenarios for error context
try:
prompts = await self.list_prompts() # type: ignore[attr-defined]
scenario_prompts = [p.name for p in prompts if ":" in p.name]
available = (
"\n ".join(scenario_prompts) if scenario_prompts else "(none found)"
)
except Exception:
available = "(could not fetch available scenarios)"

raise ValueError(
f"Scenario not found.\n\n"
f"Scenario IDs have the format 'environment_name:scenario_name'.\n"
f"If you only specify 'scenario_name', the SDK uses your task's env name "
f"as the prefix.\n"
f"This won't work if the HUD environment was declared with a different name."
f"\n\n"
f" You requested: {scenario_name}\n"
f" SDK looked for: {prompt_id}\n\n"
f"Available scenarios:\n {available}\n\n"
f"Fix: Use one of the scenario IDs above in your task JSON."
) from e

# Validate the response (outside try/except so errors aren't wrapped)
if result.messages:
first_msg = result.messages[0]
content = first_msg.content
if hasattr(content, "text") and isinstance(content.text, str): # type: ignore[union-attr]
return content.text # type: ignore[union-attr]
elif isinstance(content, str):
return content
else:
# Content exists but is neither text object nor string
raise ValueError(
f"Scenario '{scenario_name}' returned malformed content.\n\n"
f"Expected: content with .text attribute (str) or content as str\n"
f"Got: {type(content).__name__}\n\n"
f"Check that the scenario's setup function returns a valid prompt."
)
else:
# get_prompt succeeded but returned empty messages
raise ValueError(
f"Scenario '{scenario_name}' returned an empty response.\n\n"
f"The scenario's setup function was called but returned no messages.\n"
f"Check that the scenario returns a valid prompt string."
)

async def run_scenario_evaluate(self, scenario_name: str) -> float | None:
"""Run a scenario's evaluate phase and return the reward.
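For reference, the fully qualified form the new error message asks for, as a hedged sketch; the surrounding task schema is assumed, and only the `scenario` field and the `environment_name:scenario_name` format come from the error text above:

```python
# Hypothetical task definition illustrating the ID format described above.
task = {
    # The prefix must match the name the HUD environment was declared with,
    # which is not necessarily the task's own env name.
    "scenario": "browser_env:checkout_flow",  # "environment_name:scenario_name"
    # ... other task fields omitted ...
}
```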
24 changes: 18 additions & 6 deletions hud/eval/context.py
@@ -539,12 +539,24 @@ async def __aenter__(self) -> Self:
# Connect environment (MCP servers, tools)
await super().__aenter__()

# Run task scenario setup (if created from_task with scenario)
await self._run_task_scenario_setup()

# Notify backend and print link
await self._eval_enter()
self._print_eval_link()
try:
# Run task scenario setup (if created from_task with scenario)
await self._run_task_scenario_setup()

# Notify backend and print link
await self._eval_enter()
self._print_eval_link()
except BaseException:
# Cleanup if setup fails - __aexit__ won't be called automatically
await super().__aexit__(None, None, None)
# Reset context vars
if self._token is not None:
_current_trace_headers.reset(self._token)
self._token = None
if self._api_key_token is not None:
_current_api_key.reset(self._api_key_token)
self._api_key_token = None
raise

return self

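With the cleanup above, a scenario that fails during setup now surfaces as a plain exception instead of leaking MCP connections or context vars. A sketch of the caller-side effect, where `EvalContext.from_task` and `load_task` are assumptions about the surrounding API rather than confirmed by this diff:

```python
# Sketch only: from_task/load_task and the module path are assumed helpers.
from hud.eval.context import EvalContext


async def run_one(path: str) -> None:
    try:
        async with EvalContext.from_task(load_task(path)) as ctx:  # load_task is hypothetical
            ...  # agent.run(ctx), etc.
    except ValueError as exc:
        # e.g. the "Scenario not found." error raised in scenarios.py above;
        # __aenter__ has already torn down connections and reset context vars.
        print(exc)
```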