diff --git a/hud/agents/openai_chat.py b/hud/agents/openai_chat.py index e4e61b05..0c44d80e 100644 --- a/hud/agents/openai_chat.py +++ b/hud/agents/openai_chat.py @@ -70,6 +70,18 @@ def __init__(self, params: OpenAIChatCreateParams | None = None, **kwargs: Any) super().__init__(params, **kwargs) self.config: OpenAIChatConfig + if ( + self.config.api_key + and self.config.base_url + and settings.hud_gateway_url in self.config.base_url + and settings.api_key + and self.config.api_key != settings.api_key + ): + raise ValueError( + "OpenAIChatAgent api_key is not allowed with HUD Gateway. " + "Use HUD_API_KEY for gateway auth and BYOK headers for provider keys." + ) + if self.config.openai_client is not None: self.oai = self.config.openai_client elif self.config.api_key is not None or self.config.base_url is not None: diff --git a/hud/cli/eval.py b/hud/cli/eval.py index 6ebf0f59..ae0dc125 100644 --- a/hud/cli/eval.py +++ b/hud/cli/eval.py @@ -95,6 +95,7 @@ class AgentPreset: # max_concurrent = 30 # max_steps = 10 # group_size = 1 +# byok = false # Remote only; use encrypted env vars on the platform. # task_ids = ["task_1", "task_2"] # verbose = true # very_verbose = true @@ -158,6 +159,7 @@ class EvalConfig(BaseModel): "verbose", "very_verbose", "group_size", + "byok", "remote", "auto_respond", "quiet", @@ -178,6 +180,7 @@ class EvalConfig(BaseModel): very_verbose: bool = False auto_respond: bool | None = None # Continue without prompting (default: True for --full) group_size: int = 1 + byok: bool = False remote: bool = False quiet: bool = False # Suppress opening browser for eval links gateway: bool = False # Use HUD Gateway for LLM API calls @@ -208,6 +211,11 @@ def _parse_agent_type(cls, v: Any) -> AgentType | None: def validate_api_keys(self) -> None: """Validate required API keys for the selected agent. Raises typer.Exit on failure.""" + # BYOK requires remote execution (check before agent_type guard) + if self.byok and not self.remote: + hud_console.error("--byok requires --remote (BYOK only works with remote execution)") + raise typer.Exit(1) + if self.agent_type is None: return @@ -284,14 +292,11 @@ def get_agent_kwargs(self) -> dict[str, Any]: if self.model: kwargs["model"] = self.model - if self.agent_type == AgentType.OPENAI_COMPATIBLE: + # For gateway base_url, inject HUD API key if not already set + if self.agent_type == AgentType.OPENAI_COMPATIBLE and "api_key" not in kwargs: base_url = kwargs.get("base_url", "") - if "api_key" not in kwargs: - # Use HUD API key for gateway, otherwise fall back to OpenAI API key - if settings.hud_gateway_url in base_url: - kwargs["api_key"] = settings.api_key - elif settings.openai_api_key: - kwargs["api_key"] = settings.openai_api_key + if settings.hud_gateway_url in base_url and settings.api_key: + kwargs["api_key"] = settings.api_key # Auto-detect Bedrock when Claude is selected with a Bedrock ARN # Check both model and checkpoint_name for ARN patterns @@ -559,6 +564,8 @@ def display(self) -> None: table.add_row("remote", "[bold green]True[/bold green] (submitting to platform)") if self.gateway: table.add_row("gateway", "[bold green]True[/bold green] (routing via HUD Gateway)") + if self.byok: + table.add_row("byok", "[bold green]True[/bold green] (remote only)") # Tool filters (only if set) if self.allowed_tools: @@ -659,6 +666,9 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]: # Remote execution - submit to HUD platform if cfg.remote: + agent_kwargs = { + k: v for k, v in agent_kwargs.items() if k not in ("api_key", "model_client") + } # Create a job ID for tracking import uuid @@ -676,6 +686,7 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]: agent_params=agent_kwargs, max_steps=max_steps, group_size=cfg.group_size, + use_byok=cfg.byok, ) hud_console.success(f"Tasks submitted. View at: https://hud.ai/jobs/{job_id}") @@ -754,6 +765,11 @@ def eval_command( remote: bool = typer.Option( False, "--remote", help="Submit tasks to platform for remote execution" ), + byok: bool = typer.Option( + False, + "--byok", + help="Remote only: use BYOK keys from encrypted env vars for inference", + ), quiet: bool = typer.Option( False, "--quiet", "-q", help="Suppress opening browser for eval links" ), @@ -790,6 +806,7 @@ def eval_command( group_size=group_size, config=config, remote=remote, + byok=byok, quiet=quiet, gateway=gateway, ) diff --git a/hud/datasets/utils.py b/hud/datasets/utils.py index fabcbfa9..72f0ec0a 100644 --- a/hud/datasets/utils.py +++ b/hud/datasets/utils.py @@ -51,6 +51,10 @@ class SingleTaskRequest(BaseModel): description="Additional metadata to inject into the trace context.", ) trace_id: str | None = Field(default=None, description="Pre-assigned trace ID.") + use_byok: bool = Field( + default=False, + description="If True, use BYOK headers from encrypted env vars for inference.", + ) @model_validator(mode="after") def _validate_task(self) -> SingleTaskRequest: @@ -110,6 +114,7 @@ async def submit_rollouts( group_size: int = 1, batch_size: int = 50, metadata: dict[str, Any] | None = None, + use_byok: bool = False, ) -> None: """Submit rollouts to the HUD platform API for remote execution (fire-and-forget). @@ -122,6 +127,7 @@ async def submit_rollouts( group_size: Number of rollouts per task (for variance estimation) batch_size: Number of rollouts per API batch request metadata: Additional metadata for each rollout + use_byok: If True, use BYOK keys from encrypted env vars (remote only) """ from hud.eval.utils import is_v4_format @@ -168,6 +174,7 @@ async def submit_rollouts( trace_name=trace_name, group_id=base_task_id if group_size > 1 else None, metadata=metadata or {}, + use_byok=use_byok, ) )