diff --git a/hud/agents/base.py b/hud/agents/base.py
index cfe9cf2d..52f71012 100644
--- a/hud/agents/base.py
+++ b/hud/agents/base.py
@@ -182,7 +182,23 @@ async def run(
             raise TypeError(f"ctx must be EvalContext, got {type(ctx).__name__}")
 
         if not ctx.prompt:
-            raise ValueError("ctx.prompt is not set - did the scenario setup run?")
+            if ctx.has_scenario:
+                # Scenario was specified but prompt is still empty
+                # (e.g., scenario returned empty string, or edge case not caught in scenarios.py)
+                scenario = ctx._task.scenario if ctx._task else "unknown"
+                raise ValueError(
+                    f"ctx.prompt is not set.\n\n"
+                    f"Scenario '{scenario}' was specified but returned an empty prompt.\n"
+                    f"Check that the scenario's setup function returns a non-empty string."
+                )
+            else:
+                # No scenario specified at all
+                raise ValueError(
+                    "ctx.prompt is not set.\n\n"
+                    "No scenario was specified in your task file.\n"
+                    "Either add a 'scenario' field to your task, or set ctx.prompt manually "
+                    "before running the agent."
+                )
 
         # Store context for tool calls
         self.ctx = ctx
diff --git a/hud/environment/scenarios.py b/hud/environment/scenarios.py
index 1369ea37..dd56373e 100644
--- a/hud/environment/scenarios.py
+++ b/hud/environment/scenarios.py
@@ -179,16 +179,53 @@ async def run_scenario_setup(self, scenario_name: str, args: dict[str, Any]) ->
             logger.debug("Remote scenario (adding namespace): prompt_id=%s", prompt_id)
         try:
             result = await self.get_prompt(prompt_id, args)  # type: ignore[attr-defined]
-            if result.messages:
-                first_msg = result.messages[0]
-                content = first_msg.content
-                if hasattr(content, "text") and isinstance(content.text, str):  # type: ignore[union-attr]
-                    return content.text  # type: ignore[union-attr]
-                elif isinstance(content, str):
-                    return content
         except Exception as e:
-            logger.warning("Failed to get scenario prompt: %s", e)
-            return None
+            # Fetch available scenarios for error context
+            try:
+                prompts = await self.list_prompts()  # type: ignore[attr-defined]
+                scenario_prompts = [p.name for p in prompts if ":" in p.name]
+                available = (
+                    "\n ".join(scenario_prompts) if scenario_prompts else "(none found)"
+                )
+            except Exception:
+                available = "(could not fetch available scenarios)"
+
+            raise ValueError(
+                f"Scenario not found.\n\n"
+                f"Scenario IDs have the format 'environment_name:scenario_name'.\n"
+                f"If you only specify 'scenario_name', the SDK uses your task's env name "
+                f"as the prefix.\n"
+                f"This won't work if the HUD environment was declared with a different name."
+                f"\n\n"
+                f" You requested: {scenario_name}\n"
+                f" SDK looked for: {prompt_id}\n\n"
+                f"Available scenarios:\n {available}\n\n"
+                f"Fix: Use one of the scenario IDs above in your task JSON."
+            ) from e
+
+        # Validate the response (outside try/except so errors aren't wrapped)
+        if result.messages:
+            first_msg = result.messages[0]
+            content = first_msg.content
+            if hasattr(content, "text") and isinstance(content.text, str):  # type: ignore[union-attr]
+                return content.text  # type: ignore[union-attr]
+            elif isinstance(content, str):
+                return content
+            else:
+                # Content exists but is neither text object nor string
+                raise ValueError(
+                    f"Scenario '{scenario_name}' returned malformed content.\n\n"
+                    f"Expected: content with .text attribute (str) or content as str\n"
+                    f"Got: {type(content).__name__}\n\n"
+                    f"Check that the scenario's setup function returns a valid prompt."
+                )
+        else:
+            # get_prompt succeeded but returned empty messages
+            raise ValueError(
+                f"Scenario '{scenario_name}' returned an empty response.\n\n"
+                f"The scenario's setup function was called but returned no messages.\n"
+                f"Check that the scenario returns a valid prompt string."
+            )
 
     async def run_scenario_evaluate(self, scenario_name: str) -> float | None:
         """Run a scenario's evaluate phase and return the reward.
diff --git a/hud/eval/context.py b/hud/eval/context.py
index 3e6392e9..5eb6b6f1 100644
--- a/hud/eval/context.py
+++ b/hud/eval/context.py
@@ -544,12 +544,24 @@ async def __aenter__(self) -> Self:
         # Connect environment (MCP servers, tools)
         await super().__aenter__()
 
-        # Run task scenario setup (if created from_task with scenario)
-        await self._run_task_scenario_setup()
-
-        # Notify backend and print link
-        await self._eval_enter()
-        self._print_eval_link()
+        try:
+            # Run task scenario setup (if created from_task with scenario)
+            await self._run_task_scenario_setup()
+
+            # Notify backend and print link
+            await self._eval_enter()
+            self._print_eval_link()
+        except BaseException:
+            # Cleanup if setup fails - __aexit__ won't be called automatically
+            await super().__aexit__(None, None, None)
+            # Reset context vars
+            if self._token is not None:
+                _current_trace_headers.reset(self._token)
+                self._token = None
+            if self._api_key_token is not None:
+                _current_api_key.reset(self._api_key_token)
+                self._api_key_token = None
+            raise
 
         return self
