Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 17 additions & 1 deletion hud/agents/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,23 @@ async def run(
raise TypeError(f"ctx must be EvalContext, got {type(ctx).__name__}")

if not ctx.prompt:
raise ValueError("ctx.prompt is not set - did the scenario setup run?")
if ctx.has_scenario:
# Scenario was specified but prompt is still empty
# (e.g., scenario returned empty string, or edge case not caught in scenarios.py)
scenario = ctx._task.scenario if ctx._task else "unknown"
raise ValueError(
f"ctx.prompt is not set.\n\n"
f"Scenario '{scenario}' was specified but returned an empty prompt.\n"
f"Check that the scenario's setup function returns a non-empty string."
)
else:
# No scenario specified at all
raise ValueError(
"ctx.prompt is not set.\n\n"
"No scenario was specified in your task file.\n"
"Either add a 'scenario' field to your task, or set ctx.prompt manually "
"before running the agent."
)

# Store context for tool calls
self.ctx = ctx
Expand Down
55 changes: 46 additions & 9 deletions hud/environment/scenarios.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,16 +179,53 @@ async def run_scenario_setup(self, scenario_name: str, args: dict[str, Any]) ->
logger.debug("Remote scenario (adding namespace): prompt_id=%s", prompt_id)
try:
result = await self.get_prompt(prompt_id, args) # type: ignore[attr-defined]
if result.messages:
first_msg = result.messages[0]
content = first_msg.content
if hasattr(content, "text") and isinstance(content.text, str): # type: ignore[union-attr]
return content.text # type: ignore[union-attr]
elif isinstance(content, str):
return content
except Exception as e:
logger.warning("Failed to get scenario prompt: %s", e)
return None
# Fetch available scenarios for error context
try:
prompts = await self.list_prompts() # type: ignore[attr-defined]
scenario_prompts = [p.name for p in prompts if ":" in p.name]
available = (
"\n ".join(scenario_prompts) if scenario_prompts else "(none found)"
)
except Exception:
available = "(could not fetch available scenarios)"

raise ValueError(
f"Scenario not found.\n\n"
f"Scenario IDs have the format 'environment_name:scenario_name'.\n"
f"If you only specify 'scenario_name', the SDK uses your task's env name "
f"as the prefix.\n"
f"This won't work if the HUD environment was declared with a different name."
f"\n\n"
f" You requested: {scenario_name}\n"
f" SDK looked for: {prompt_id}\n\n"
f"Available scenarios:\n {available}\n\n"
f"Fix: Use one of the scenario IDs above in your task JSON."
) from e

# Validate the response (outside try/except so errors aren't wrapped)
if result.messages:
first_msg = result.messages[0]
content = first_msg.content
if hasattr(content, "text") and isinstance(content.text, str): # type: ignore[union-attr]
return content.text # type: ignore[union-attr]
elif isinstance(content, str):
return content
else:
# Content exists but is neither text object nor string
raise ValueError(
f"Scenario '{scenario_name}' returned malformed content.\n\n"
f"Expected: content with .text attribute (str) or content as str\n"
f"Got: {type(content).__name__}\n\n"
f"Check that the scenario's setup function returns a valid prompt."
)
else:
# get_prompt succeeded but returned empty messages
raise ValueError(
f"Scenario '{scenario_name}' returned an empty response.\n\n"
f"The scenario's setup function was called but returned no messages.\n"
f"Check that the scenario returns a valid prompt string."
)

async def run_scenario_evaluate(self, scenario_name: str) -> float | None:
"""Run a scenario's evaluate phase and return the reward.
Expand Down
24 changes: 18 additions & 6 deletions hud/eval/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -544,12 +544,24 @@ async def __aenter__(self) -> Self:
# Connect environment (MCP servers, tools)
await super().__aenter__()

# Run task scenario setup (if created from_task with scenario)
await self._run_task_scenario_setup()

# Notify backend and print link
await self._eval_enter()
self._print_eval_link()
try:
# Run task scenario setup (if created from_task with scenario)
await self._run_task_scenario_setup()

# Notify backend and print link
await self._eval_enter()
self._print_eval_link()
except BaseException:
# Cleanup if setup fails - __aexit__ won't be called automatically
await super().__aexit__(None, None, None)
# Reset context vars
if self._token is not None:
_current_trace_headers.reset(self._token)
self._token = None
if self._api_key_token is not None:
_current_api_key.reset(self._api_key_token)
self._api_key_token = None
raise

return self

Expand Down
Loading