hud-evals · ryantzr1 · Dec 21, 2025
diff --git a/hud/agents/base.py b/hud/agents/base.py
@@ -182,7 +182,23 @@ async def run(
             raise TypeError(f"ctx must be EvalContext, got {type(ctx).__name__}")
 
         if not ctx.prompt:
-            raise ValueError("ctx.prompt is not set - did the scenario setup run?")
+            if ctx.has_scenario:
+                # Scenario was specified but prompt is still empty
+                # (e.g., scenario returned empty string, or edge case not caught in scenarios.py)
+                scenario = ctx._task.scenario if ctx._task else "unknown"
+                raise ValueError(
+                    f"ctx.prompt is not set.\n\n"
+                    f"Scenario '{scenario}' was specified but returned an empty prompt.\n"
+                    f"Check that the scenario's setup function returns a non-empty string."
+                )
+            else:
+                # No scenario specified at all
+                raise ValueError(
+                    "ctx.prompt is not set.\n\n"
+                    "No scenario was specified in your task file.\n"
+                    "Either add a 'scenario' field to your task, or set ctx.prompt manually "
+                    "before running the agent."
+                )
 
         # Store context for tool calls
         self.ctx = ctx

diff --git a/hud/environment/scenarios.py b/hud/environment/scenarios.py
@@ -179,16 +179,53 @@ async def run_scenario_setup(self, scenario_name: str, args: dict[str, Any]) ->
                 logger.debug("Remote scenario (adding namespace): prompt_id=%s", prompt_id)
             try:
                 result = await self.get_prompt(prompt_id, args)  # type: ignore[attr-defined]
-                if result.messages:
-                    first_msg = result.messages[0]
-                    content = first_msg.content
-                    if hasattr(content, "text") and isinstance(content.text, str):  # type: ignore[union-attr]
-                        return content.text  # type: ignore[union-attr]
-                    elif isinstance(content, str):
-                        return content
             except Exception as e:
-                logger.warning("Failed to get scenario prompt: %s", e)
-            return None
+                # Fetch available scenarios for error context
+                try:
+                    prompts = await self.list_prompts()  # type: ignore[attr-defined]
+                    scenario_prompts = [p.name for p in prompts if ":" in p.name]
+                    available = (
+                        "\n    ".join(scenario_prompts) if scenario_prompts else "(none found)"
+                    )
+                except Exception:
+                    available = "(could not fetch available scenarios)"
+
+                raise ValueError(
+                    f"Scenario not found.\n\n"
+                    f"Scenario IDs have the format 'environment_name:scenario_name'.\n"
+                    f"If you only specify 'scenario_name', the SDK uses your task's env name "
+                    f"as the prefix.\n"
+                    f"This won't work if the HUD environment was declared with a different name."
+                    f"\n\n"
+                    f"  You requested: {scenario_name}\n"
+                    f"  SDK looked for: {prompt_id}\n\n"
+                    f"Available scenarios:\n    {available}\n\n"
+                    f"Fix: Use one of the scenario IDs above in your task JSON."
+                ) from e
+
+            # Validate the response (outside try/except so errors aren't wrapped)
+            if result.messages:
+                first_msg = result.messages[0]
+                content = first_msg.content
+                if hasattr(content, "text") and isinstance(content.text, str):  # type: ignore[union-attr]
+                    return content.text  # type: ignore[union-attr]
+                elif isinstance(content, str):
+                    return content
+                else:
+                    # Content exists but is neither text object nor string
+                    raise ValueError(
+                        f"Scenario '{scenario_name}' returned malformed content.\n\n"
+                        f"Expected: content with .text attribute (str) or content as str\n"
+                        f"Got: {type(content).__name__}\n\n"
+                        f"Check that the scenario's setup function returns a valid prompt."
+                    )
+            else:
+                # get_prompt succeeded but returned empty messages
+                raise ValueError(
+                    f"Scenario '{scenario_name}' returned an empty response.\n\n"
+                    f"The scenario's setup function was called but returned no messages.\n"
+                    f"Check that the scenario returns a valid prompt string."
+                )
 
     async def run_scenario_evaluate(self, scenario_name: str) -> float | None:
         """Run a scenario's evaluate phase and return the reward.

diff --git a/hud/eval/context.py b/hud/eval/context.py
@@ -544,12 +544,24 @@ async def __aenter__(self) -> Self:
         # Connect environment (MCP servers, tools)
         await super().__aenter__()
 
-        # Run task scenario setup (if created from_task with scenario)
-        await self._run_task_scenario_setup()
-
-        # Notify backend and print link
-        await self._eval_enter()
-        self._print_eval_link()
+        try:
+            # Run task scenario setup (if created from_task with scenario)
+            await self._run_task_scenario_setup()
+
+            # Notify backend and print link
+            await self._eval_enter()
+            self._print_eval_link()
+        except BaseException:
+            # Clean up if setup fails - __aexit__ won't be called automatically
+            await super().__aexit__(None, None, None)
+            # Reset context vars
+            if self._token is not None:
+                _current_trace_headers.reset(self._token)
+                self._token = None
+            if self._api_key_token is not None:
+                _current_api_key.reset(self._api_key_token)
+                self._api_key_token = None
+            raise
 
         return self