Commit b46f50d

Merge: resolve test file conflicts
2 parents: 3986ad3 + 92246d9

18 files changed (+609 -71 lines)

docs/ref/checks/custom_prompt_check.md

Lines changed: 6 additions & 0 deletions
@@ -22,6 +22,11 @@ Implements custom content checks using configurable LLM prompts. Uses your custo
 - **`confidence_threshold`** (required): Minimum confidence score to trigger tripwire (0.0 to 1.0)
 - **`system_prompt_details`** (required): Custom instructions defining the content detection criteria
 - **`max_turns`** (optional): Maximum number of conversation turns to include for multi-turn analysis. Default: 10. Set to 1 for single-turn mode.
+- **`include_reasoning`** (optional): Whether to include reasoning/explanation fields in the guardrail output (default: `false`)
+  - When `false`: The LLM only generates the essential fields (`flagged` and `confidence`), reducing token generation costs
+  - When `true`: Additionally, returns detailed reasoning for its decisions
+  - **Performance**: In our evaluations, disabling reasoning reduces median latency by 40% on average (ranging from 18% to 67% depending on model) while maintaining detection performance
+  - **Use Case**: Keep disabled for production to minimize costs and latency; enable for development and debugging
 
 ## Implementation Notes
 
@@ -50,3 +55,4 @@ Returns a `GuardrailResult` with the following `info` dictionary:
 - **`confidence`**: Confidence score (0.0 to 1.0) for the validation
 - **`threshold`**: The confidence threshold that was configured
 - **`token_usage`**: Token usage statistics from the LLM call
+- **`reason`**: Explanation of why the input was flagged (or not flagged) - *only included when `include_reasoning=true`*
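For anyone wiring the new option into a pipeline, a minimal config sketch follows; the guardrail `name` and the enclosing dict layout are assumptions rather than something this diff defines, and the keys mirror the options documented above.

```python
# Illustrative only - not taken from this commit. The "name" value and the
# surrounding structure are assumptions; the config keys mirror the docs above.
custom_prompt_check_config = {
    "name": "Custom Prompt Check",  # assumed guardrail name
    "config": {
        "model": "gpt-4.1-mini",
        "confidence_threshold": 0.7,
        "system_prompt_details": "Flag requests for personalized medical advice.",
        "max_turns": 10,
        "include_reasoning": False,  # default; enable only while debugging
    },
}
```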

docs/ref/checks/hallucination_detection.md

Lines changed: 16 additions & 8 deletions
@@ -14,7 +14,8 @@ Flags model text containing factual claims that are clearly contradicted or not
   "config": {
     "model": "gpt-4.1-mini",
     "confidence_threshold": 0.7,
-    "knowledge_source": "vs_abc123"
+    "knowledge_source": "vs_abc123",
+    "include_reasoning": false
   }
 }
 ```
@@ -24,6 +25,11 @@ Flags model text containing factual claims that are clearly contradicted or not
 - **`model`** (required): OpenAI model (required) to use for validation (e.g., "gpt-4.1-mini")
 - **`confidence_threshold`** (required): Minimum confidence score to trigger tripwire (0.0 to 1.0)
 - **`knowledge_source`** (required): OpenAI vector store ID starting with "vs_" containing reference documents
+- **`include_reasoning`** (optional): Whether to include detailed reasoning fields in the output (default: `false`)
+  - When `false`: Returns only `flagged` and `confidence` to save tokens
+  - When `true`: Additionally, returns `reasoning`, `hallucination_type`, `hallucinated_statements`, and `verified_statements`
+  - **Performance**: In our evaluations, disabling reasoning reduces median latency by 40% on average (ranging from 18% to 67% depending on model) while maintaining detection performance
+  - **Use Case**: Keep disabled for production to minimize costs and latency; enable for development and debugging
 
 ### Tuning guidance
 
@@ -102,7 +108,9 @@ See [`examples/hallucination_detection/`](https://github.com/openai/openai-guard
 
 ## What It Returns
 
-Returns a `GuardrailResult` with the following `info` dictionary:
+Returns a `GuardrailResult` with the following `info` dictionary.
+
+**With `include_reasoning=true`:**
 
 ```json
 {
@@ -117,15 +125,15 @@ Returns a `GuardrailResult` with the following `info` dictionary:
 }
 ```
 
+### Fields
+
 - **`flagged`**: Whether the content was flagged as potentially hallucinated
 - **`confidence`**: Confidence score (0.0 to 1.0) for the detection
-- **`reasoning`**: Explanation of why the content was flagged
-- **`hallucination_type`**: Type of issue detected (e.g., "factual_error", "unsupported_claim")
-- **`hallucinated_statements`**: Specific statements that are contradicted or unsupported
-- **`verified_statements`**: Statements that are supported by your documents
 - **`threshold`**: The confidence threshold that was configured
-
-Tip: `hallucination_type` is typically one of `factual_error`, `unsupported_claim`, or `none`.
+- **`reasoning`**: Explanation of why the content was flagged - *only included when `include_reasoning=true`*
+- **`hallucination_type`**: Type of issue detected (e.g., "factual_error", "unsupported_claim", "none") - *only included when `include_reasoning=true`*
+- **`hallucinated_statements`**: Specific statements that are contradicted or unsupported - *only included when `include_reasoning=true`*
+- **`verified_statements`**: Statements that are supported by your documents - *only included when `include_reasoning=true`*
 
 ## Benchmark Results
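For reference, with `include_reasoning` left at its default of `false`, the `info` dictionary documented above reduces to the essential keys. A purely illustrative sketch (values made up, not taken from the diff):

```python
# Illustrative only: the reasoning-related keys (reasoning, hallucination_type,
# hallucinated_statements, verified_statements) are omitted entirely.
minimal_info = {
    "flagged": True,     # example value
    "confidence": 0.85,  # example value
    "threshold": 0.7,
}
```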

docs/ref/checks/jailbreak.md

Lines changed: 8 additions & 2 deletions
@@ -26,7 +26,8 @@ Jailbreak detection focuses on **deception and manipulation tactics** designed t
   "config": {
     "model": "gpt-4.1-mini",
     "confidence_threshold": 0.7,
-    "max_turns": 10
+    "max_turns": 10,
+    "include_reasoning": false
   }
 }
 ```
@@ -35,6 +36,11 @@ Jailbreak detection focuses on **deception and manipulation tactics** designed t
 
 - **`model`** (required): Model to use for detection (e.g., "gpt-4.1-mini")
 - **`confidence_threshold`** (required): Minimum confidence score to trigger tripwire (0.0 to 1.0)
+- **`include_reasoning`** (optional): Whether to include reasoning/explanation fields in the guardrail output (default: `false`)
+  - When `false`: The LLM only generates the essential fields (`flagged` and `confidence`), reducing token generation costs
+  - When `true`: Additionally, returns detailed reasoning for its decisions
+  - **Performance**: In our evaluations, disabling reasoning reduces median latency by 40% on average (ranging from 18% to 67% depending on model) while maintaining detection performance
+  - **Use Case**: Keep disabled for production to minimize costs and latency; enable for development and debugging
 - **`max_turns`** (optional): Maximum number of conversation turns to include for multi-turn analysis. Default: 10. Set to 1 for single-turn mode.
 
 ## What It Returns
@@ -61,7 +67,7 @@ Returns a `GuardrailResult` with the following `info` dictionary:
 - **`flagged`**: Whether a jailbreak attempt was detected
 - **`confidence`**: Confidence score (0.0 to 1.0) for the detection
 - **`threshold`**: The confidence threshold that was configured
-- **`reason`**: Explanation of why the input was flagged (or not flagged)
+- **`reason`**: Explanation of why the input was flagged (or not flagged) - *only included when `include_reasoning=true`*
 - **`token_usage`**: Token usage statistics from the LLM call
6773

docs/ref/checks/llm_base.md

Lines changed: 7 additions & 1 deletion
@@ -10,7 +10,8 @@ Base configuration for LLM-based guardrails. Provides common configuration optio
   "config": {
     "model": "gpt-5",
     "confidence_threshold": 0.7,
-    "max_turns": 10
+    "max_turns": 10,
+    "include_reasoning": false
   }
 }
 ```
@@ -20,6 +21,11 @@ Base configuration for LLM-based guardrails. Provides common configuration optio
 - **`model`** (required): OpenAI model to use for the check (e.g., "gpt-5")
 - **`confidence_threshold`** (required): Minimum confidence score to trigger tripwire (0.0 to 1.0)
 - **`max_turns`** (optional): Maximum number of conversation turns to include for multi-turn analysis. Default: 10. Set to 1 for single-turn mode.
+- **`include_reasoning`** (optional): Whether to include reasoning/explanation fields in the guardrail output (default: `false`)
+  - When `true`: The LLM generates and returns detailed reasoning for its decisions (e.g., `reason`, `reasoning`, `observation`, `evidence` fields)
+  - When `false`: The LLM only returns the essential fields (`flagged` and `confidence`), reducing token generation costs
+  - **Performance**: In our evaluations, disabling reasoning reduces median latency by 40% on average (ranging from 18% to 67% depending on model) while maintaining detection performance
+  - **Use Case**: Keep disabled for production to minimize costs and latency; enable for development and debugging
 
 ## What It Does

docs/ref/checks/nsfw.md

Lines changed: 6 additions & 0 deletions
@@ -31,6 +31,11 @@ Flags workplace‑inappropriate model outputs: explicit sexual content, profanit
 - **`model`** (required): Model to use for detection (e.g., "gpt-4.1-mini")
 - **`confidence_threshold`** (required): Minimum confidence score to trigger tripwire (0.0 to 1.0)
 - **`max_turns`** (optional): Maximum number of conversation turns to include for multi-turn analysis. Default: 10. Set to 1 for single-turn mode.
+- **`include_reasoning`** (optional): Whether to include reasoning/explanation fields in the guardrail output (default: `false`)
+  - When `false`: The LLM only generates the essential fields (`flagged` and `confidence`), reducing token generation costs
+  - When `true`: Additionally, returns detailed reasoning for its decisions
+  - **Performance**: In our evaluations, disabling reasoning reduces median latency by 40% on average (ranging from 18% to 67% depending on model) while maintaining detection performance
+  - **Use Case**: Keep disabled for production to minimize costs and latency; enable for development and debugging
 
 ### Tuning guidance
 
@@ -59,6 +64,7 @@ Returns a `GuardrailResult` with the following `info` dictionary:
 - **`confidence`**: Confidence score (0.0 to 1.0) for the detection
 - **`threshold`**: The confidence threshold that was configured
 - **`token_usage`**: Token usage statistics from the LLM call
+- **`reason`**: Explanation of why the input was flagged (or not flagged) - *only included when `include_reasoning=true`*
 
 ### Examples

docs/ref/checks/off_topic_prompts.md

Lines changed: 6 additions & 0 deletions
@@ -22,6 +22,11 @@ Ensures content stays within defined business scope using LLM analysis. Flags co
 - **`confidence_threshold`** (required): Minimum confidence score to trigger tripwire (0.0 to 1.0)
 - **`system_prompt_details`** (required): Description of your business scope and acceptable topics
 - **`max_turns`** (optional): Maximum number of conversation turns to include for multi-turn analysis. Default: 10. Set to 1 for single-turn mode.
+- **`include_reasoning`** (optional): Whether to include reasoning/explanation fields in the guardrail output (default: `false`)
+  - When `false`: The LLM only generates the essential fields (`flagged` and `confidence`), reducing token generation costs
+  - When `true`: Additionally, returns detailed reasoning for its decisions
+  - **Performance**: In our evaluations, disabling reasoning reduces median latency by 40% on average (ranging from 18% to 67% depending on model) while maintaining detection performance
+  - **Use Case**: Keep disabled for production to minimize costs and latency; enable for development and debugging
 
 ## Implementation Notes
 
@@ -50,3 +55,4 @@ Returns a `GuardrailResult` with the following `info` dictionary:
 - **`confidence`**: Confidence score (0.0 to 1.0) for the assessment
 - **`threshold`**: The confidence threshold that was configured
 - **`token_usage`**: Token usage statistics from the LLM call
+- **`reason`**: Explanation of why the input was flagged (or not flagged) - *only included when `include_reasoning=true`*

docs/ref/checks/prompt_injection_detection.md

Lines changed: 11 additions & 2 deletions
@@ -32,7 +32,8 @@ After tool execution, the prompt injection detection check validates that the re
   "config": {
     "model": "gpt-4.1-mini",
     "confidence_threshold": 0.7,
-    "max_turns": 10
+    "max_turns": 10,
+    "include_reasoning": false
   }
 }
 ```
@@ -42,6 +43,11 @@ After tool execution, the prompt injection detection check validates that the re
 - **`model`** (required): Model to use for prompt injection detection analysis (e.g., "gpt-4.1-mini")
 - **`confidence_threshold`** (required): Minimum confidence score to trigger tripwire (0.0 to 1.0)
 - **`max_turns`** (optional): Maximum number of user messages to include for determining user intent. Default: 10. Set to 1 to only use the most recent user message.
+- **`include_reasoning`** (optional): Whether to include the `observation` and `evidence` fields in the output (default: `false`)
+  - When `true`: Returns detailed `observation` explaining what the action is doing and `evidence` with specific quotes/details
+  - When `false`: Omits reasoning fields to save tokens (typically 100-300 tokens per check)
+  - **Performance**: In our evaluations, disabling reasoning reduces median latency by 40% on average (ranging from 18% to 67% depending on model) while maintaining detection performance
+  - **Use Case**: Keep disabled for production to minimize costs and latency; enable for development and debugging
 
 **Flags as MISALIGNED:**
 
@@ -79,13 +85,16 @@ Returns a `GuardrailResult` with the following `info` dictionary:
 }
 ```
 
-- **`observation`**: What the AI action is doing
+- **`observation`**: What the AI action is doing - *only included when `include_reasoning=true`*
 - **`flagged`**: Whether the action is misaligned (boolean)
 - **`confidence`**: Confidence score (0.0 to 1.0) that the action is misaligned
+- **`evidence`**: Specific evidence from conversation supporting the decision - *only included when `include_reasoning=true`*
 - **`threshold`**: The confidence threshold that was configured
 - **`user_goal`**: The tracked user intent from conversation
 - **`action`**: The list of function calls or tool outputs analyzed for alignment
 
+**Note**: When `include_reasoning=false` (the default), the `observation` and `evidence` fields are omitted to reduce token generation costs.
+
 ## Benchmark Results
 
 ### Dataset Description
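To make the note above concrete, an illustrative `info` payload with reasoning disabled follows; every value is invented, and the shape of the `action` entries is an assumption (the docs only say it is the list of analyzed function calls or tool outputs):

```python
# Illustrative only: `observation` and `evidence` are absent when
# include_reasoning=false; the remaining keys follow the field list above.
info_without_reasoning = {
    "flagged": False,
    "confidence": 0.12,
    "threshold": 0.7,
    "user_goal": "Book a one-way flight from SFO to JFK",             # made-up example
    "action": [{"type": "function_call", "name": "search_flights"}],  # assumed shape
}
```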

src/guardrails/checks/text/hallucination_detection.py

Lines changed: 37 additions & 23 deletions
@@ -94,8 +94,8 @@ class HallucinationDetectionOutput(LLMOutput):
     Extends the base LLM output with hallucination-specific details.
 
     Attributes:
-        flagged (bool): Whether the content was flagged as potentially hallucinated.
-        confidence (float): Confidence score (0.0 to 1.0) that the input is hallucinated.
+        flagged (bool): Whether the content was flagged as potentially hallucinated (inherited).
+        confidence (float): Confidence score (0.0 to 1.0) that the input is hallucinated (inherited).
         reasoning (str): Detailed explanation of the analysis.
         hallucination_type (str | None): Type of hallucination detected.
         hallucinated_statements (list[str] | None): Specific statements flagged as
@@ -104,16 +104,6 @@ class HallucinationDetectionOutput(LLMOutput):
             by the documents.
     """
 
-    flagged: bool = Field(
-        ...,
-        description="Indicates whether the content was flagged as potentially hallucinated.",
-    )
-    confidence: float = Field(
-        ...,
-        description="Confidence score (0.0 to 1.0) that the input is hallucinated.",
-        ge=0.0,
-        le=1.0,
-    )
     reasoning: str = Field(
         ...,
         description="Detailed explanation of the hallucination analysis.",
@@ -184,14 +174,6 @@ class HallucinationDetectionOutput(LLMOutput):
     3. **Clearly contradicted by the documents** - Claims that directly contradict the documents → FLAG
     4. **Completely unsupported by the documents** - Claims that cannot be verified from the documents → FLAG
 
-    Respond with a JSON object containing:
-    - "flagged": boolean (true if ANY factual claims are clearly contradicted or completely unsupported)
-    - "confidence": float (0.0 to 1.0, your confidence that the input is hallucinated)
-    - "reasoning": string (detailed explanation of your analysis)
-    - "hallucination_type": string (type of issue, if detected: "factual_error", "unsupported_claim", or "none" if supported)
-    - "hallucinated_statements": array of strings (specific factual statements that may be hallucinated)
-    - "verified_statements": array of strings (specific factual statements that are supported by the documents)
-
     **CRITICAL GUIDELINES**:
     - Flag content if ANY factual claims are unsupported or contradicted (even if some claims are supported)
     - Allow conversational, opinion-based, or general content to pass through
@@ -206,6 +188,30 @@ class HallucinationDetectionOutput(LLMOutput):
 ).strip()
 
 
+# Instruction for output format when reasoning is enabled
+REASONING_OUTPUT_INSTRUCTION = textwrap.dedent(
+    """
+    Respond with a JSON object containing:
+    - "flagged": boolean (true if ANY factual claims are clearly contradicted or completely unsupported)
+    - "confidence": float (0.0 to 1.0, your confidence that the input is hallucinated)
+    - "reasoning": string (detailed explanation of your analysis)
+    - "hallucination_type": string (type of issue, if detected: "factual_error", "unsupported_claim", or "none" if supported)
+    - "hallucinated_statements": array of strings (specific factual statements that may be hallucinated)
+    - "verified_statements": array of strings (specific factual statements that are supported by the documents)
+    """
+).strip()
+
+
+# Instruction for output format when reasoning is disabled
+BASE_OUTPUT_INSTRUCTION = textwrap.dedent(
+    """
+    Respond with a JSON object containing:
+    - "flagged": boolean (true if ANY factual claims are clearly contradicted or completely unsupported)
+    - "confidence": float (0.0 to 1.0, your confidence that the input is hallucinated)
+    """
+).strip()
+
+
 async def hallucination_detection(
     ctx: GuardrailLLMContextProto,
     candidate: str,
@@ -242,15 +248,23 @@ async def hallucination_detection(
     )
 
     try:
-        # Create the validation query
-        validation_query = f"{VALIDATION_PROMPT}\n\nText to validate:\n{candidate}"
+        # Build the prompt based on whether reasoning is requested
+        if config.include_reasoning:
+            output_instruction = REASONING_OUTPUT_INSTRUCTION
+            output_format = HallucinationDetectionOutput
+        else:
+            output_instruction = BASE_OUTPUT_INSTRUCTION
+            output_format = LLMOutput
+
+        # Create the validation query with appropriate output instructions
+        validation_query = f"{VALIDATION_PROMPT}\n\n{output_instruction}\n\nText to validate:\n{candidate}"
 
         # Use the Responses API with file search and structured output
        response = await _invoke_openai_callable(
             ctx.guardrail_llm.responses.parse,
             input=validation_query,
             model=config.model,
-            text_format=HallucinationDetectionOutput,
+            text_format=output_format,
             tools=[{"type": "file_search", "vector_store_ids": [config.knowledge_source]}],
         )
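A small test-style sketch (not part of this commit) of what the new constants promise: the base instruction requests only the two essential fields, while the reasoning instruction and the richer output model cover the detail fields. The import path assumes the `src/` layout shown above, and `model_fields` assumes Pydantic v2.

```python
# Illustrative pytest-style checks against the constants added in this commit.
from guardrails.checks.text.hallucination_detection import (
    BASE_OUTPUT_INSTRUCTION,
    REASONING_OUTPUT_INSTRUCTION,
    HallucinationDetectionOutput,
)


def test_base_instruction_requests_only_essential_fields() -> None:
    assert '"flagged"' in BASE_OUTPUT_INSTRUCTION
    assert '"confidence"' in BASE_OUTPUT_INSTRUCTION
    assert "reasoning" not in BASE_OUTPUT_INSTRUCTION


def test_reasoning_instruction_and_model_cover_detail_fields() -> None:
    for field in ("reasoning", "hallucination_type", "hallucinated_statements", "verified_statements"):
        assert field in REASONING_OUTPUT_INSTRUCTION
        assert field in HallucinationDetectionOutput.model_fields  # Pydantic v2 API
```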
