diff --git a/hud/agents/base.py b/hud/agents/base.py
index e9dd15d8..8c3c04d5 100644
--- a/hud/agents/base.py
+++ b/hud/agents/base.py
@@ -182,7 +182,23 @@ async def run(
             raise TypeError(f"ctx must be EvalContext, got {type(ctx).__name__}")
 
         if not ctx.prompt:
-            raise ValueError("ctx.prompt is not set - did the scenario setup run?")
+            if ctx.has_scenario:
+                # Scenario was specified but prompt is still empty
+                # (e.g., scenario returned empty string, or edge case not caught in scenarios.py)
+                scenario = ctx._task.scenario if ctx._task else "unknown"
+                raise ValueError(
+                    f"ctx.prompt is not set.\n\n"
+                    f"Scenario '{scenario}' was specified but returned an empty prompt.\n"
+                    f"Check that the scenario's setup function returns a non-empty string."
+                )
+            else:
+                # No scenario specified at all
+                raise ValueError(
+                    "ctx.prompt is not set.\n\n"
+                    "No scenario was specified in your task file.\n"
+                    "Either add a 'scenario' field to your task, or set ctx.prompt manually "
+                    "before running the agent."
+                )
 
         # Store context for tool calls
         self.ctx = ctx
@@ -194,6 +210,11 @@ async def run(
         try:
             result = await self._run_context(text_to_blocks(ctx.prompt), max_steps=max_steps)
 
+            # Propagate error state to context for platform visibility
+            if result.isError and hasattr(ctx, "error"):
+                error_msg = result.info.get("error") if result.info else result.content
+                ctx.error = Exception(str(error_msg)) if error_msg else Exception("Agent error")
+
             # Submit final answer to context (only if scenario is running)
             if result.content and ctx.has_scenario:
                 await ctx.submit(result.content)
@@ -202,6 +223,9 @@ async def run(
         except Exception as e:
             logger.exception("Error while running agent:")
+            # Propagate error to context for platform visibility
+            if hasattr(ctx, "error"):
+                ctx.error = e
             return Trace(
                 reward=0.0,
                 done=True,
diff --git a/hud/agents/misc/response_agent.py b/hud/agents/misc/response_agent.py
index 528f45ef..617db6a0 100644
--- a/hud/agents/misc/response_agent.py
+++ b/hud/agents/misc/response_agent.py
@@ -1,11 +1,14 @@
 from __future__ import annotations
 
+import logging
 from typing import Literal
 
 from openai import AsyncOpenAI
 
 from hud.settings import settings
 
+logger = logging.getLogger(__name__)
+
 ResponseType = Literal["STOP", "CONTINUE"]
 
 DEFAULT_SYSTEM_PROMPT = """\
@@ -97,5 +100,6 @@ async def determine_response(self, agent_message: str) -> ResponseType:
             else:
                 return "CONTINUE"
 
-        except Exception:
+        except Exception as e:
+            logger.warning("Auto-respond failed: %s", e)
             return "CONTINUE"  # Default to continue on error
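The base.py change above makes a failure visible in two places: the returned Trace (as before) and the new ctx.error attribute. A caller-side sketch of what that looks like — `report_failure` is illustrative, and the agent and context are assumed to come from the host application rather than being constructed here:

    from typing import Any

    async def report_failure(agent: Any, ctx: Any) -> None:
        # `agent` is any MCPAgent subclass, `ctx` an EvalContext.
        result = await agent.run(ctx)
        if result.isError:
            # The returned Trace still reports the failure, as before...
            print("trace error:", result.content)
            # ...and the exception is now also mirrored onto the context,
            # which is what the platform reads for run visibility.
            print("ctx error:", ctx.error)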
+            )
+
         if self.config.openai_client is not None:
             self.oai = self.config.openai_client
         elif self.config.api_key is not None or self.config.base_url is not None:
diff --git a/hud/agents/tests/test_base.py b/hud/agents/tests/test_base.py
index fc2d4706..56565927 100644
--- a/hud/agents/tests/test_base.py
+++ b/hud/agents/tests/test_base.py
@@ -350,3 +350,67 @@ async def test_get_tool_schemas(self) -> None:
         assert len(schemas) == 1
         assert schemas[0]["name"] == "my_tool"
         assert schemas[0]["description"] == "My tool description"
+
+
+class TestMCPAgentErrorPropagation:
+    """Tests for error propagation to EvalContext."""
+
+    @pytest.mark.asyncio
+    async def test_exception_propagates_to_ctx_error(self) -> None:
+        """Test that exceptions during run() set ctx.error for platform visibility."""
+
+        class FailingAgent(MockMCPAgent):
+            async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse:
+                raise RuntimeError("Agent crashed")
+
+        ctx = MockEvalContext(prompt="Do something")
+        agent = FailingAgent()
+
+        result = await agent.run(ctx)
+
+        # Should return error trace
+        assert result.isError is True
+        assert result.content is not None
+        assert "Agent crashed" in result.content
+
+        assert ctx.error is not None
+        assert isinstance(ctx.error, BaseException)
+        assert "Agent crashed" in str(ctx.error)
+
+    @pytest.mark.asyncio
+    async def test_step_error_propagates_to_ctx_error(self) -> None:
+        """Test that step-level errors (caught internally) set ctx.error."""
+        step_count = [0]
+
+        class FailOnSecondStepAgent(MockMCPAgent):
+            async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse:
+                step_count[0] += 1
+                if step_count[0] == 1:
+                    return AgentResponse(
+                        content="",
+                        tool_calls=[MCPToolCall(name="test_tool", arguments={})],
+                        done=False,
+                    )
+                else:
+                    raise ValueError("Step 2 failed")
+
+        ctx = MockEvalContext(prompt="Do something")
+        agent = FailOnSecondStepAgent()
+
+        result = await agent.run(ctx)
+
+        # Should return error trace
+        assert result.isError is True
+        assert ctx.error is not None
+        assert "Step 2 failed" in str(ctx.error)
+
+    @pytest.mark.asyncio
+    async def test_no_error_when_successful(self) -> None:
+        """Test that ctx.error remains None on successful run."""
+        ctx = MockEvalContext(prompt="Do something")
+        agent = MockMCPAgent()
+
+        result = await agent.run(ctx)
+
+        assert result.isError is False
+        assert ctx.error is None
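The new guard in openai_chat.py fails fast when a provider key is pointed at the HUD Gateway. A minimal sketch, assuming `base_url` and `api_key` are accepted as keyword params the way the config fields above suggest; "sk-provider-key" is a placeholder:

    from hud.agents.openai_chat import OpenAIChatAgent
    from hud.settings import settings

    # Fine: gateway base_url with the HUD API key (or no api_key at all).
    agent = OpenAIChatAgent(base_url=settings.hud_gateway_url, api_key=settings.api_key)

    # Rejected: a provider key must not be sent to the gateway.
    try:
        OpenAIChatAgent(base_url=settings.hud_gateway_url, api_key="sk-provider-key")
    except ValueError as e:
        print(e)  # "OpenAIChatAgent api_key is not allowed with HUD Gateway. ..."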
diff --git a/hud/cli/eval.py b/hud/cli/eval.py
index 25b1b593..eb13ce34 100644
--- a/hud/cli/eval.py
+++ b/hud/cli/eval.py
@@ -95,6 +95,7 @@ class AgentPreset:
 # max_concurrent = 30
 # max_steps = 10
 # group_size = 1
+# byok = false  # Remote only; use encrypted env vars on the platform.
 # task_ids = ["task_1", "task_2"]
 # verbose = true
 # very_verbose = true
@@ -158,6 +159,7 @@ class EvalConfig(BaseModel):
         "verbose",
         "very_verbose",
         "group_size",
+        "byok",
         "remote",
         "auto_respond",
         "quiet",
@@ -178,6 +180,7 @@ class EvalConfig(BaseModel):
     very_verbose: bool = False
     auto_respond: bool | None = None  # Continue without prompting
     group_size: int = 1
+    byok: bool = False
     remote: bool = False
     quiet: bool = False  # Suppress opening browser for eval links
     gateway: bool = False  # Use HUD Gateway for LLM API calls
@@ -208,6 +211,11 @@ def _parse_agent_type(cls, v: Any) -> AgentType | None:
     def validate_api_keys(self) -> None:
         """Validate required API keys for the selected agent.
 
         Raises typer.Exit on failure."""
+        # BYOK requires remote execution (check before agent_type guard)
+        if self.byok and not self.remote:
+            hud_console.error("--byok requires --remote (BYOK only works with remote execution)")
+            raise typer.Exit(1)
+
         if self.agent_type is None:
             return
@@ -284,14 +292,11 @@ def get_agent_kwargs(self) -> dict[str, Any]:
         if self.model:
             kwargs["model"] = self.model
 
-        if self.agent_type == AgentType.OPENAI_COMPATIBLE:
+        # For gateway base_url, inject HUD API key if not already set
+        if self.agent_type == AgentType.OPENAI_COMPATIBLE and "api_key" not in kwargs:
             base_url = kwargs.get("base_url", "")
-            if "api_key" not in kwargs:
-                # Use HUD API key for gateway, otherwise fall back to OpenAI API key
-                if settings.hud_gateway_url in base_url:
-                    kwargs["api_key"] = settings.api_key
-                elif settings.openai_api_key:
-                    kwargs["api_key"] = settings.openai_api_key
+            if settings.hud_gateway_url in base_url and settings.api_key:
+                kwargs["api_key"] = settings.api_key
 
         # Auto-detect Bedrock when Claude is selected with a Bedrock ARN
         # Check both model and checkpoint_name for ARN patterns
@@ -565,6 +570,8 @@ def display(self) -> None:
             table.add_row("remote", "[bold green]True[/bold green] (submitting to platform)")
         if self.gateway:
             table.add_row("gateway", "[bold green]True[/bold green] (routing via HUD Gateway)")
+        if self.byok:
+            table.add_row("byok", "[bold green]True[/bold green] (remote only)")
 
         # Tool filters (only if set)
         if self.allowed_tools:
@@ -665,6 +672,9 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]:
     # Remote execution - submit to HUD platform
     if cfg.remote:
+        agent_kwargs = {
+            k: v for k, v in agent_kwargs.items() if k not in ("api_key", "model_client")
+        }
         # Create a job ID for tracking
         import uuid
@@ -682,6 +692,7 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]:
             agent_params=agent_kwargs,
             max_steps=max_steps,
             group_size=cfg.group_size,
+            use_byok=cfg.byok,
         )
 
         hud_console.success(f"Tasks submitted. View at: https://hud.ai/jobs/{job_id}")
@@ -765,6 +776,11 @@ def eval_command(
     remote: bool = typer.Option(
         False, "--remote", help="Submit tasks to platform for remote execution"
     ),
+    byok: bool = typer.Option(
+        False,
+        "--byok",
+        help="Remote only: use BYOK keys from encrypted env vars for inference",
+    ),
     quiet: bool = typer.Option(
         False, "--quiet", "-q", help="Suppress opening browser for eval links"
     ),
@@ -802,6 +818,7 @@ def eval_command(
         group_size=group_size,
         config=config,
         remote=remote,
+        byok=byok,
         quiet=quiet,
         gateway=gateway,
     )
diff --git a/hud/datasets/utils.py b/hud/datasets/utils.py
index fabcbfa9..72f0ec0a 100644
--- a/hud/datasets/utils.py
+++ b/hud/datasets/utils.py
@@ -51,6 +51,10 @@ class SingleTaskRequest(BaseModel):
         description="Additional metadata to inject into the trace context.",
     )
     trace_id: str | None = Field(default=None, description="Pre-assigned trace ID.")
+    use_byok: bool = Field(
+        default=False,
+        description="If True, use BYOK headers from encrypted env vars for inference.",
+    )
 
     @model_validator(mode="after")
     def _validate_task(self) -> SingleTaskRequest:
@@ -110,6 +114,7 @@ async def submit_rollouts(
     group_size: int = 1,
     batch_size: int = 50,
     metadata: dict[str, Any] | None = None,
+    use_byok: bool = False,
 ) -> None:
     """Submit rollouts to the HUD platform API for remote execution (fire-and-forget).
 
@@ -122,6 +127,7 @@ async def submit_rollouts(
         group_size: Number of rollouts per task (for variance estimation)
         batch_size: Number of rollouts per API batch request
         metadata: Additional metadata for each rollout
+        use_byok: If True, use BYOK keys from encrypted env vars (remote only)
     """
     from hud.eval.utils import is_v4_format
@@ -168,6 +174,7 @@ async def submit_rollouts(
                 trace_name=trace_name,
                 group_id=base_task_id if group_size > 1 else None,
                 metadata=metadata or {},
+                use_byok=use_byok,
             )
         )
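BYOK only makes sense with remote execution, which validate_api_keys() now enforces before any agent checks run; on the CLI this corresponds to passing --remote together with --byok. A sketch of the validation rule, assuming EvalConfig's remaining fields can stay at their defaults (not verified against the full model definition):

    from hud.cli.eval import EvalConfig

    cfg = EvalConfig(byok=True, remote=False)
    cfg.validate_api_keys()  # prints the error and raises typer.Exit(1)

    cfg = EvalConfig(byok=True, remote=True)
    cfg.validate_api_keys()  # passes the BYOK check; provider keys are then
                             # resolved from encrypted env vars on the platform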
diff --git a/hud/environment/scenarios.py b/hud/environment/scenarios.py
index 1369ea37..dd56373e 100644
--- a/hud/environment/scenarios.py
+++ b/hud/environment/scenarios.py
@@ -179,16 +179,53 @@ async def run_scenario_setup(self, scenario_name: str, args: dict[str, Any]) ->
             logger.debug("Remote scenario (adding namespace): prompt_id=%s", prompt_id)
 
         try:
             result = await self.get_prompt(prompt_id, args)  # type: ignore[attr-defined]
-            if result.messages:
-                first_msg = result.messages[0]
-                content = first_msg.content
-                if hasattr(content, "text") and isinstance(content.text, str):  # type: ignore[union-attr]
-                    return content.text  # type: ignore[union-attr]
-                elif isinstance(content, str):
-                    return content
         except Exception as e:
-            logger.warning("Failed to get scenario prompt: %s", e)
-            return None
+            # Fetch available scenarios for error context
+            try:
+                prompts = await self.list_prompts()  # type: ignore[attr-defined]
+                scenario_prompts = [p.name for p in prompts if ":" in p.name]
+                available = (
+                    "\n ".join(scenario_prompts) if scenario_prompts else "(none found)"
+                )
+            except Exception:
+                available = "(could not fetch available scenarios)"
+
+            raise ValueError(
+                f"Scenario not found.\n\n"
+                f"Scenario IDs have the format 'environment_name:scenario_name'.\n"
+                f"If you only specify 'scenario_name', the SDK uses your task's env name "
+                f"as the prefix.\n"
+                f"This won't work if the HUD environment was declared with a different name."
+                f"\n\n"
+                f" You requested: {scenario_name}\n"
+                f" SDK looked for: {prompt_id}\n\n"
+                f"Available scenarios:\n {available}\n\n"
+                f"Fix: Use one of the scenario IDs above in your task JSON."
+            ) from e
+
+        # Validate the response (outside try/except so errors aren't wrapped)
+        if result.messages:
+            first_msg = result.messages[0]
+            content = first_msg.content
+            if hasattr(content, "text") and isinstance(content.text, str):  # type: ignore[union-attr]
+                return content.text  # type: ignore[union-attr]
+            elif isinstance(content, str):
+                return content
+            else:
+                # Content exists but is neither text object nor string
+                raise ValueError(
+                    f"Scenario '{scenario_name}' returned malformed content.\n\n"
+                    f"Expected: content with .text attribute (str) or content as str\n"
+                    f"Got: {type(content).__name__}\n\n"
+                    f"Check that the scenario's setup function returns a valid prompt."
+                )
+        else:
+            # get_prompt succeeded but returned empty messages
+            raise ValueError(
+                f"Scenario '{scenario_name}' returned an empty response.\n\n"
+                f"The scenario's setup function was called but returned no messages.\n"
+                f"Check that the scenario returns a valid prompt string."
+            )
 
     async def run_scenario_evaluate(self, scenario_name: str) -> float | None:
         """Run a scenario's evaluate phase and return the reward.
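The new scenarios.py error message documents how bare scenario names resolve: the SDK namespaces them with the task's environment name before lookup. A runnable sketch of that resolution rule with illustrative values ("checkout", "browser", and "web" are made up; the real prefixing happens above the hunk shown here):

    scenario = "checkout"   # bare name from the task JSON
    env_name = "browser"    # name the task gave the HUD environment
    prompt_id = scenario if ":" in scenario else f"{env_name}:{scenario}"
    assert prompt_id == "browser:checkout"

    # If the environment was actually declared as, say, "web", the lookup for
    # "browser:checkout" fails, and the new ValueError lists what does exist
    # (e.g. "web:checkout") so the task JSON can be corrected.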
diff --git a/hud/eval/context.py b/hud/eval/context.py
index 20b8b943..77bb752d 100644
--- a/hud/eval/context.py
+++ b/hud/eval/context.py
@@ -539,12 +539,24 @@ async def __aenter__(self) -> Self:
         # Connect environment (MCP servers, tools)
         await super().__aenter__()
 
-        # Run task scenario setup (if created from_task with scenario)
-        await self._run_task_scenario_setup()
-
-        # Notify backend and print link
-        await self._eval_enter()
-        self._print_eval_link()
+        try:
+            # Run task scenario setup (if created from_task with scenario)
+            await self._run_task_scenario_setup()
+
+            # Notify backend and print link
+            await self._eval_enter()
+            self._print_eval_link()
+        except BaseException:
+            # Cleanup if setup fails - __aexit__ won't be called automatically
+            await super().__aexit__(None, None, None)
+            # Reset context vars
+            if self._token is not None:
+                _current_trace_headers.reset(self._token)
+                self._token = None
+            if self._api_key_token is not None:
+                _current_api_key.reset(self._api_key_token)
+                self._api_key_token = None
+            raise
 
         return self
diff --git a/hud/eval/instrument.py b/hud/eval/instrument.py
index e950522c..94598f1f 100644
--- a/hud/eval/instrument.py
+++ b/hud/eval/instrument.py
@@ -1,6 +1,6 @@
-"""Auto-instrumentation for httpx to inject trace headers.
+"""Auto-instrumentation for httpx and aiohttp to inject trace headers.
 
-This module patches httpx clients to automatically add:
+This module patches HTTP clients to automatically add:
 - Trace-Id headers when inside an eval context
 - Authorization headers for HUD API calls
 """
@@ -8,9 +8,12 @@
 from __future__ import annotations
 
 import logging
-from typing import Any
+from typing import TYPE_CHECKING, Any
 from urllib.parse import urlparse
 
+if TYPE_CHECKING:
+    from types import SimpleNamespace
+
 from hud.settings import settings
 
 logger = logging.getLogger(__name__)
@@ -70,7 +73,7 @@ async def _async_httpx_request_hook(request: Any) -> None:
     _httpx_request_hook(request)
 
 
-def _instrument_client(client: Any) -> None:
+def _instrument_httpx_client(client: Any) -> None:
     """Add trace hook to an httpx client instance."""
     is_async = hasattr(client, "aclose")
     hook = _async_httpx_request_hook if is_async else _httpx_request_hook
@@ -93,7 +96,7 @@ def _patch_httpx() -> None:
 
     def _patched_async_init(self: Any, *args: Any, **kwargs: Any) -> None:
         _original_async_init(self, *args, **kwargs)
-        _instrument_client(self)
+        _instrument_httpx_client(self)
 
     httpx.AsyncClient.__init__ = _patched_async_init  # type: ignore[method-assign]
 
@@ -101,15 +104,65 @@ def _patched_async_init(self: Any, *args: Any, **kwargs: Any) -> None:
     def _patched_sync_init(self: Any, *args: Any, **kwargs: Any) -> None:
         _original_sync_init(self, *args, **kwargs)
-        _instrument_client(self)
+        _instrument_httpx_client(self)
 
     httpx.Client.__init__ = _patched_sync_init  # type: ignore[method-assign]
 
     logger.debug("httpx auto-instrumentation enabled")
 
 
-# Auto-patch httpx on module import
+def _patch_aiohttp() -> None:
+    """
+    Monkey-patch aiohttp to auto-instrument all ClientSession instances.
+    This is important for the Gemini client in particular, which uses aiohttp by default.
+    """
+    try:
+        import aiohttp
+    except ImportError:
+        logger.debug("aiohttp not installed, skipping auto-instrumentation")
+        return
+
+    async def on_request_start(
+        _session: aiohttp.ClientSession,
+        _trace_config_ctx: SimpleNamespace,
+        params: aiohttp.TraceRequestStartParams,
+    ) -> None:
+        """aiohttp trace hook that adds trace headers and auth to HUD requests."""
+        url_str = str(params.url)
+        if not _is_hud_url(url_str):
+            return
+
+        trace_headers = _get_trace_headers()
+        if trace_headers is not None:
+            for key, value in trace_headers.items():
+                params.headers[key] = value
+            logger.debug("Added trace headers to aiohttp request: %s", url_str)
+
+        has_auth = "authorization" in {k.lower() for k in params.headers}
+        if not has_auth and settings.api_key:
+            params.headers["Authorization"] = f"Bearer {settings.api_key}"
+            logger.debug("Added API key auth to aiohttp request: %s", url_str)
+
+    trace_config = aiohttp.TraceConfig()
+    trace_config.on_request_start.append(on_request_start)
+
+    _original_init = aiohttp.ClientSession.__init__
+
+    def _patched_init(self: aiohttp.ClientSession, *args: Any, **kwargs: Any) -> None:
+        existing_traces = kwargs.get("trace_configs") or []
+        if trace_config not in existing_traces:
+            existing_traces = [*list(existing_traces), trace_config]
+        kwargs["trace_configs"] = existing_traces
+        _original_init(self, *args, **kwargs)
+
+    aiohttp.ClientSession.__init__ = _patched_init  # type: ignore[method-assign]
+
+    logger.debug("aiohttp auto-instrumentation enabled")
+
+
+# Auto-patch on module import
 _patch_httpx()
+_patch_aiohttp()
 
-__all__ = ["_patch_httpx"]
+__all__ = ["_patch_aiohttp", "_patch_httpx"]
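With the instrument.py patch in place, any aiohttp.ClientSession created after the module is imported carries the TraceConfig, so HUD-bound requests pick up Trace-Id and Authorization headers with no per-call changes; other URLs are left untouched by the hook. A minimal sketch (placeholder URL; assumes the hud package and aiohttp are installed and HUD_API_KEY is configured):

    import asyncio

    import aiohttp

    import hud.eval.instrument  # noqa: F401  # patches ClientSession.__init__ on import


    async def main() -> None:
        async with aiohttp.ClientSession() as session:  # trace config injected here
            async with session.get("https://example.com/health") as resp:
                print(resp.status)  # headers are only added for HUD URLs


    asyncio.run(main())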
+ """ + try: + import aiohttp + except ImportError: + logger.debug("aiohttp not installed, skipping auto-instrumentation") + return + + async def on_request_start( + _session: aiohttp.ClientSession, + _trace_config_ctx: SimpleNamespace, + params: aiohttp.TraceRequestStartParams, + ) -> None: + """aiohttp trace hook that adds trace headers and auth to HUD requests.""" + url_str = str(params.url) + if not _is_hud_url(url_str): + return + + trace_headers = _get_trace_headers() + if trace_headers is not None: + for key, value in trace_headers.items(): + params.headers[key] = value + logger.debug("Added trace headers to aiohttp request: %s", url_str) + + has_auth = "authorization" in {k.lower() for k in params.headers} + if not has_auth and settings.api_key: + params.headers["Authorization"] = f"Bearer {settings.api_key}" + logger.debug("Added API key auth to aiohttp request: %s", url_str) + + trace_config = aiohttp.TraceConfig() + trace_config.on_request_start.append(on_request_start) + + _original_init = aiohttp.ClientSession.__init__ + + def _patched_init(self: aiohttp.ClientSession, *args: Any, **kwargs: Any) -> None: + existing_traces = kwargs.get("trace_configs") or [] + if trace_config not in existing_traces: + existing_traces = [*list(existing_traces), trace_config] + kwargs["trace_configs"] = existing_traces + _original_init(self, *args, **kwargs) + + aiohttp.ClientSession.__init__ = _patched_init # type: ignore[method-assign] + + logger.debug("aiohttp auto-instrumentation enabled") + + +# Auto-patch on module import _patch_httpx() +_patch_aiohttp() -__all__ = ["_patch_httpx"] +__all__ = ["_patch_aiohttp", "_patch_httpx"] diff --git a/hud/utils/hud_console.py b/hud/utils/hud_console.py index d28e87ce..e7fa1f14 100644 --- a/hud/utils/hud_console.py +++ b/hud/utils/hud_console.py @@ -20,9 +20,6 @@ import traceback from typing import TYPE_CHECKING, Any, Literal, Self -import questionary -import typer -from questionary import Style from rich.console import Console from rich.panel import Panel from rich.table import Table @@ -488,6 +485,9 @@ def select( Returns: The selected choice value """ + import questionary + from questionary import Style + # Convert choices to questionary format q_choices = [] @@ -518,6 +518,8 @@ def select( # If no selection made (Ctrl+C or ESC), exit if result is None: + import typer + raise typer.Exit(1) return result @@ -573,6 +575,8 @@ def confirm(self, message: str, default: bool = True) -> bool: message: The confirmation message default: If True, the default choice is True """ + import questionary + return questionary.confirm(message, default=default).ask() # Symbol-based output methods