From ffd72f56130fe1021f5b58f37550dcb8d9b22f7e Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Wed, 31 Dec 2025 11:38:30 -0600 Subject: [PATCH 01/12] Move mock LLMs into top-level benchmark dir, local content_safety under examples, and benchmark-specific tests --- {nemoguardrails/benchmark => benchmark}/Procfile | 0 {nemoguardrails/benchmark => benchmark}/README.md | 0 .../benchmark => benchmark}/mock_llm_server/__init__.py | 0 {nemoguardrails/benchmark => benchmark}/mock_llm_server/api.py | 1 - .../benchmark => benchmark}/mock_llm_server/config.py | 0 .../mock_llm_server/configs}/meta-llama-3.3-70b-instruct.env | 0 .../configs}/nvidia-llama-3.1-nemoguard-8b-content-safety.env | 0 .../benchmark => benchmark}/mock_llm_server/models.py | 0 .../benchmark => benchmark}/mock_llm_server/response_data.py | 1 - .../benchmark => benchmark}/mock_llm_server/run_server.py | 1 - {tests/benchmark => benchmark/tests}/test_mock_api.py | 1 - {tests/benchmark => benchmark/tests}/test_mock_config.py | 3 +-- {tests/benchmark => benchmark/tests}/test_mock_models.py | 3 +-- .../benchmark => benchmark/tests}/test_mock_response_data.py | 1 - {tests/benchmark => benchmark/tests}/test_run_server.py | 1 - .../configs/content_safety_local}/config.yml | 0 .../configs/content_safety_local}/prompts.yml | 0 17 files changed, 2 insertions(+), 10 deletions(-) rename {nemoguardrails/benchmark => benchmark}/Procfile (100%) rename {nemoguardrails/benchmark => benchmark}/README.md (100%) rename {nemoguardrails/benchmark => benchmark}/mock_llm_server/__init__.py (100%) rename {nemoguardrails/benchmark => benchmark}/mock_llm_server/api.py (99%) rename {nemoguardrails/benchmark => benchmark}/mock_llm_server/config.py (100%) rename {nemoguardrails/benchmark/configs/mock_configs => benchmark/mock_llm_server/configs}/meta-llama-3.3-70b-instruct.env (100%) rename {nemoguardrails/benchmark/configs/mock_configs => benchmark/mock_llm_server/configs}/nvidia-llama-3.1-nemoguard-8b-content-safety.env (100%) rename {nemoguardrails/benchmark => benchmark}/mock_llm_server/models.py (100%) rename {nemoguardrails/benchmark => benchmark}/mock_llm_server/response_data.py (99%) rename {nemoguardrails/benchmark => benchmark}/mock_llm_server/run_server.py (99%) rename {tests/benchmark => benchmark/tests}/test_mock_api.py (99%) rename {tests/benchmark => benchmark/tests}/test_mock_config.py (99%) rename {tests/benchmark => benchmark/tests}/test_mock_models.py (99%) rename {tests/benchmark => benchmark/tests}/test_mock_response_data.py (99%) rename {tests/benchmark => benchmark/tests}/test_run_server.py (99%) rename {nemoguardrails/benchmark/configs/guardrail_configs/content_safety_colang1 => examples/configs/content_safety_local}/config.yml (100%) rename {nemoguardrails/benchmark/configs/guardrail_configs/content_safety_colang1 => examples/configs/content_safety_local}/prompts.yml (100%) diff --git a/nemoguardrails/benchmark/Procfile b/benchmark/Procfile similarity index 100% rename from nemoguardrails/benchmark/Procfile rename to benchmark/Procfile diff --git a/nemoguardrails/benchmark/README.md b/benchmark/README.md similarity index 100% rename from nemoguardrails/benchmark/README.md rename to benchmark/README.md diff --git a/nemoguardrails/benchmark/mock_llm_server/__init__.py b/benchmark/mock_llm_server/__init__.py similarity index 100% rename from nemoguardrails/benchmark/mock_llm_server/__init__.py rename to benchmark/mock_llm_server/__init__.py diff --git 
a/nemoguardrails/benchmark/mock_llm_server/api.py b/benchmark/mock_llm_server/api.py similarity index 99% rename from nemoguardrails/benchmark/mock_llm_server/api.py rename to benchmark/mock_llm_server/api.py index 653634160..be53ce953 100644 --- a/nemoguardrails/benchmark/mock_llm_server/api.py +++ b/benchmark/mock_llm_server/api.py @@ -20,7 +20,6 @@ from typing import Annotated, Union from fastapi import Depends, FastAPI, HTTPException, Request - from nemoguardrails.benchmark.mock_llm_server.config import ModelSettings, get_settings from nemoguardrails.benchmark.mock_llm_server.models import ( ChatCompletionChoice, diff --git a/nemoguardrails/benchmark/mock_llm_server/config.py b/benchmark/mock_llm_server/config.py similarity index 100% rename from nemoguardrails/benchmark/mock_llm_server/config.py rename to benchmark/mock_llm_server/config.py diff --git a/nemoguardrails/benchmark/configs/mock_configs/meta-llama-3.3-70b-instruct.env b/benchmark/mock_llm_server/configs/meta-llama-3.3-70b-instruct.env similarity index 100% rename from nemoguardrails/benchmark/configs/mock_configs/meta-llama-3.3-70b-instruct.env rename to benchmark/mock_llm_server/configs/meta-llama-3.3-70b-instruct.env diff --git a/nemoguardrails/benchmark/configs/mock_configs/nvidia-llama-3.1-nemoguard-8b-content-safety.env b/benchmark/mock_llm_server/configs/nvidia-llama-3.1-nemoguard-8b-content-safety.env similarity index 100% rename from nemoguardrails/benchmark/configs/mock_configs/nvidia-llama-3.1-nemoguard-8b-content-safety.env rename to benchmark/mock_llm_server/configs/nvidia-llama-3.1-nemoguard-8b-content-safety.env diff --git a/nemoguardrails/benchmark/mock_llm_server/models.py b/benchmark/mock_llm_server/models.py similarity index 100% rename from nemoguardrails/benchmark/mock_llm_server/models.py rename to benchmark/mock_llm_server/models.py diff --git a/nemoguardrails/benchmark/mock_llm_server/response_data.py b/benchmark/mock_llm_server/response_data.py similarity index 99% rename from nemoguardrails/benchmark/mock_llm_server/response_data.py rename to benchmark/mock_llm_server/response_data.py index 29b7f5593..75456c212 100644 --- a/nemoguardrails/benchmark/mock_llm_server/response_data.py +++ b/benchmark/mock_llm_server/response_data.py @@ -18,7 +18,6 @@ from typing import Optional import numpy as np - from nemoguardrails.benchmark.mock_llm_server.config import ModelSettings diff --git a/nemoguardrails/benchmark/mock_llm_server/run_server.py b/benchmark/mock_llm_server/run_server.py similarity index 99% rename from nemoguardrails/benchmark/mock_llm_server/run_server.py rename to benchmark/mock_llm_server/run_server.py index 1829bec53..a52574db2 100644 --- a/nemoguardrails/benchmark/mock_llm_server/run_server.py +++ b/benchmark/mock_llm_server/run_server.py @@ -26,7 +26,6 @@ import sys import uvicorn - from nemoguardrails.benchmark.mock_llm_server.config import CONFIG_FILE_ENV_VAR # 1. 
Get a logger instance diff --git a/tests/benchmark/test_mock_api.py b/benchmark/tests/test_mock_api.py similarity index 99% rename from tests/benchmark/test_mock_api.py rename to benchmark/tests/test_mock_api.py index ec00a1056..5b1d5b92a 100644 --- a/tests/benchmark/test_mock_api.py +++ b/benchmark/tests/test_mock_api.py @@ -17,7 +17,6 @@ import pytest from fastapi.testclient import TestClient - from nemoguardrails.benchmark.mock_llm_server.api import app from nemoguardrails.benchmark.mock_llm_server.config import ModelSettings, get_settings diff --git a/tests/benchmark/test_mock_config.py b/benchmark/tests/test_mock_config.py similarity index 99% rename from tests/benchmark/test_mock_config.py rename to benchmark/tests/test_mock_config.py index 1b5950961..6607a6afa 100644 --- a/tests/benchmark/test_mock_config.py +++ b/benchmark/tests/test_mock_config.py @@ -15,9 +15,8 @@ import pytest -from pydantic import ValidationError - from nemoguardrails.benchmark.mock_llm_server.config import ModelSettings +from pydantic import ValidationError class TestAppModelConfig: diff --git a/tests/benchmark/test_mock_models.py b/benchmark/tests/test_mock_models.py similarity index 99% rename from tests/benchmark/test_mock_models.py rename to benchmark/tests/test_mock_models.py index 9ce686815..6de638f27 100644 --- a/tests/benchmark/test_mock_models.py +++ b/benchmark/tests/test_mock_models.py @@ -14,8 +14,6 @@ # limitations under the License. import pytest -from pydantic import ValidationError - from nemoguardrails.benchmark.mock_llm_server.models import ( ChatCompletionChoice, ChatCompletionRequest, @@ -28,6 +26,7 @@ ModelsResponse, Usage, ) +from pydantic import ValidationError class TestMessage: diff --git a/tests/benchmark/test_mock_response_data.py b/benchmark/tests/test_mock_response_data.py similarity index 99% rename from tests/benchmark/test_mock_response_data.py rename to benchmark/tests/test_mock_response_data.py index 44a441656..23e99474a 100644 --- a/tests/benchmark/test_mock_response_data.py +++ b/benchmark/tests/test_mock_response_data.py @@ -17,7 +17,6 @@ from unittest.mock import MagicMock, patch import pytest - from nemoguardrails.benchmark.mock_llm_server.config import ModelSettings from nemoguardrails.benchmark.mock_llm_server.response_data import ( calculate_tokens, diff --git a/tests/benchmark/test_run_server.py b/benchmark/tests/test_run_server.py similarity index 99% rename from tests/benchmark/test_run_server.py rename to benchmark/tests/test_run_server.py index 2d8b7a043..95d8d624a 100644 --- a/tests/benchmark/test_run_server.py +++ b/benchmark/tests/test_run_server.py @@ -17,7 +17,6 @@ from unittest.mock import patch import pytest - from nemoguardrails.benchmark.mock_llm_server.run_server import ( parse_arguments, validate_config_file, diff --git a/nemoguardrails/benchmark/configs/guardrail_configs/content_safety_colang1/config.yml b/examples/configs/content_safety_local/config.yml similarity index 100% rename from nemoguardrails/benchmark/configs/guardrail_configs/content_safety_colang1/config.yml rename to examples/configs/content_safety_local/config.yml diff --git a/nemoguardrails/benchmark/configs/guardrail_configs/content_safety_colang1/prompts.yml b/examples/configs/content_safety_local/prompts.yml similarity index 100% rename from nemoguardrails/benchmark/configs/guardrail_configs/content_safety_colang1/prompts.yml rename to examples/configs/content_safety_local/prompts.yml From d5502acc6d46c54d5ed926091b985049c2313dc2 Mon Sep 17 00:00:00 2001 From: tgasser-nv 
<200644301+tgasser-nv@users.noreply.github.com> Date: Wed, 31 Dec 2025 11:46:34 -0600 Subject: [PATCH 02/12] Initial checkin of validation script --- benchmark/scripts/validate_mocks.sh | 274 ++++++++++++++++++++++++++++ 1 file changed, 274 insertions(+) create mode 100755 benchmark/scripts/validate_mocks.sh diff --git a/benchmark/scripts/validate_mocks.sh b/benchmark/scripts/validate_mocks.sh new file mode 100755 index 000000000..78f05f40e --- /dev/null +++ b/benchmark/scripts/validate_mocks.sh @@ -0,0 +1,274 @@ +#!/usr/bin/env bash + +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# +# A script to check the health and model IDs of local OpenAI-compatible endpoints. +# Requires: curl, jq +# + +set -euo pipefail + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[0;33m' +NC='\033[0m' # No Color + +TIMEOUT=3 +SUMMARIES=() +ALL_PASSED=true + +log_info() { + echo -e "$1" +} + +log_error() { + echo -e "${RED}$1${NC}" >&2 +} + +log_warning() { + echo -e "${YELLOW}$1${NC}" +} + +log_success() { + echo -e "${GREEN}$1${NC}" +} + +# Check if required commands are available +check_dependencies() { + for cmd in curl jq; do + if ! command -v "$cmd" &> /dev/null; then + log_error "Error: '$cmd' is required but not installed." + exit 1 + fi + done +} + +# Check an OpenAI-compatible endpoint for health and model availability +# Arguments: port, expected_model +check_endpoint() { + local port=$1 + local expected_model=$2 + local base_url="http://localhost:${port}" + local all_ok=true + + log_info "\n--- Checking Port: ${port} ---" + + # --- 1. Health Check --- + local health_url="${base_url}/health" + log_info "Checking ${health_url} ..." + + local response + local http_code + + # Capture curl exit code to distinguish between connection error and timeout + local curl_exit_code + response=$(curl -s -w "\n%{http_code}" --max-time "$TIMEOUT" "$health_url" 2>/dev/null) || curl_exit_code=$? + + if [[ -n "${curl_exit_code:-}" ]]; then + if [[ "$curl_exit_code" -eq 28 ]]; then + log_error "Health Check FAILED: Connection timed out for port ${port}." + log_error "--- Port ${port}: CHECKS FAILED ---" + SUMMARIES+=("Port ${port} (${expected_model}): FAILED (Connection Timeout)") + else + log_error "Health Check FAILED: No response from server on port ${port}." + log_error "--- Port ${port}: CHECKS FAILED ---" + SUMMARIES+=("Port ${port} (${expected_model}): FAILED (Connection Error)") + fi + ALL_PASSED=false + return 1 + fi + + http_code=$(echo "$response" | tail -n1) + local body + body=$(echo "$response" | sed '$d') + + if [[ "$http_code" != "200" ]]; then + log_error "Health Check FAILED: Status code ${http_code}" + all_ok=false + else + local status + if status=$(echo "$body" | jq -r '.status' 2>/dev/null); then + if [[ "$status" == "healthy" ]]; then + log_success "Health Check PASSED: Status is 'healthy'." 
+ else + log_warning "Health Check FAILED: Expected 'healthy', got '${status}'." + all_ok=false + fi + else + log_error "Health Check FAILED: Could not decode JSON response." + all_ok=false + fi + fi + + # --- 2. Model Check --- + local models_url="${base_url}/v1/models" + log_info "Checking ${models_url} for '${expected_model}'..." + + # Capture curl exit code to distinguish between connection error and timeout + curl_exit_code="" + response=$(curl -s -w "\n%{http_code}" --max-time "$TIMEOUT" "$models_url" 2>/dev/null) || curl_exit_code=$? + + if [[ -n "${curl_exit_code:-}" ]]; then + if [[ "$curl_exit_code" -eq 28 ]]; then + log_error "Model Check FAILED: Connection timed out for port ${port}." + else + log_error "Model Check FAILED: No response from server on port ${port}." + fi + all_ok=false + else + http_code=$(echo "$response" | tail -n1) + body=$(echo "$response" | sed '$d') + + if [[ "$http_code" != "200" ]]; then + log_error "Model Check FAILED: Status code ${http_code}" + all_ok=false + else + local model_ids + if model_ids=$(echo "$body" | jq -r '.data[].id' 2>/dev/null); then + if echo "$model_ids" | grep -qx "$expected_model"; then + log_success "Model Check PASSED: Found '${expected_model}' in model list." + else + log_warning "Model Check FAILED: Expected '${expected_model}', but it was NOT found." + log_warning "Available models:" + echo "$model_ids" | while read -r model_id; do + log_warning " - ${model_id}" + done + all_ok=false + fi + else + log_error "Model Check FAILED: Could not decode JSON response." + all_ok=false + fi + fi + fi + + # --- Final Status --- + if [[ "$all_ok" == true ]]; then + log_success "--- Port ${port}: ALL CHECKS PASSED ---" + SUMMARIES+=("Port ${port} (${expected_model}): PASSED") + return 0 + else + log_error "--- Port ${port}: CHECKS FAILED ---" + SUMMARIES+=("Port ${port} (${expected_model}): FAILED") + ALL_PASSED=false + return 1 + fi +} + +# Check the Rails configuration endpoint +# Arguments: port +check_rails_endpoint() { + local port=$1 + local base_url="http://localhost:${port}" + local endpoint="${base_url}/v1/rails/configs" + local all_ok=true + + log_info "\n--- Checking Port: ${port} (Rails Config) ---" + log_info "Checking ${endpoint} ..." + + local response + local http_code + local curl_exit_code="" + + # Capture curl exit code to distinguish between connection error and timeout + response=$(curl -s -w "\n%{http_code}" --max-time "$TIMEOUT" "$endpoint" 2>/dev/null) || curl_exit_code=$? + + if [[ -n "${curl_exit_code:-}" ]]; then + if [[ "$curl_exit_code" -eq 28 ]]; then + log_error "Rails Check FAILED: Connection timed out for port ${port}." + else + log_error "Rails Check FAILED: No response from server on port ${port}." + fi + all_ok=false + else + http_code=$(echo "$response" | tail -n1) + local body + body=$(echo "$response" | sed '$d') + + # --- 1. HTTP Status Check --- + if [[ "$http_code" == "200" ]]; then + log_success "HTTP Status PASSED: Got ${http_code}." + else + log_warning "HTTP Status FAILED: Expected 200, got '${http_code}'." + all_ok=false + fi + + # --- 2. Body Content Check --- + local is_array + local array_len + + if is_array=$(echo "$body" | jq 'if type == "array" then true else false end' 2>/dev/null); then + if [[ "$is_array" == "true" ]]; then + array_len=$(echo "$body" | jq 'length' 2>/dev/null) + if [[ "$array_len" -gt 0 ]]; then + log_success "Body Check PASSED: Response is an array with at least one entry." + else + log_warning "Body Check FAILED: Response is not an array or is empty." 
+ all_ok=false + fi + else + log_warning "Body Check FAILED: Response is not an array or is empty." + all_ok=false + fi + else + log_error "Body Check FAILED: Could not decode JSON response." + all_ok=false + fi + fi + + # --- Final Status --- + if [[ "$all_ok" == true ]]; then + log_success "--- Port ${port}: ALL CHECKS PASSED ---" + SUMMARIES+=("Port ${port} (Rails Config): PASSED") + return 0 + else + log_error "--- Port ${port}: CHECKS FAILED ---" + SUMMARIES+=("Port ${port} (Rails Config): FAILED") + ALL_PASSED=false + return 1 + fi +} + +main() { + log_info "Starting LLM endpoint health check..." + + check_dependencies + + # Run all checks (allow individual failures without exiting) + check_endpoint 8000 "meta/llama-3.3-70b-instruct" || true + check_endpoint 8001 "nvidia/llama-3.1-nemoguard-8b-content-safety" || true + check_rails_endpoint 9000 || true + + log_info "\n--- Final Summary ---" + + for summary in "${SUMMARIES[@]}"; do + log_info "$summary" + done + + log_info "---------------------" + + if [[ "$ALL_PASSED" == true ]]; then + log_success "Overall Status: All endpoints are healthy!" + exit 0 + else + log_error "Overall Status: One or more checks FAILED." + exit 1 + fi +} + +main "$@" From 9abe39a4bd8133af07a410a59377bfbb20342c8f Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Wed, 31 Dec 2025 11:48:21 -0600 Subject: [PATCH 03/12] Remove un-needed files under nemoguardrails/benchmark --- nemoguardrails/benchmark/__init__.py | 14 -- nemoguardrails/benchmark/validate_mocks.py | 213 --------------------- 2 files changed, 227 deletions(-) delete mode 100644 nemoguardrails/benchmark/__init__.py delete mode 100644 nemoguardrails/benchmark/validate_mocks.py diff --git a/nemoguardrails/benchmark/__init__.py b/nemoguardrails/benchmark/__init__.py deleted file mode 100644 index 6c7f64065..000000000 --- a/nemoguardrails/benchmark/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/nemoguardrails/benchmark/validate_mocks.py b/nemoguardrails/benchmark/validate_mocks.py deleted file mode 100644 index f3b61dc1b..000000000 --- a/nemoguardrails/benchmark/validate_mocks.py +++ /dev/null @@ -1,213 +0,0 @@ -#!/usr/bin/env python3 - -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -""" -A script to check the health and model IDs of local OpenAI-compatible endpoints. -Requires the 'httpx' library: pip install httpx -""" - -import json -import logging -import sys - -import httpx - -# --- Logging Setup --- -# Configure basic logging to print info-level messages -logging.basicConfig(level=logging.INFO, format="%(message)s") - - -def check_endpoint(port: int, expected_model: str): - """ - Checks the /health and /v1/models endpoints for a standard - OpenAI-compatible server. - Returns a tuple: (bool success, str summary) - """ - base_url = f"http://localhost:{port}" - all_ok = True - - logging.info("\n--- Checking Port: %s ---", port) - - # --- 1. Health Check --- - health_url = f"{base_url}/health" - logging.info("Checking %s ...", health_url) - try: - response = httpx.get(health_url, timeout=3) - - if response.status_code != 200: - logging.error("Health Check FAILED: Status code %s", response.status_code) - all_ok = False - else: - try: - data = response.json() - status = data.get("status") - if status == "healthy": - logging.info("Health Check PASSED: Status is 'healthy'.") - else: - logging.warning("Health Check FAILED: Expected 'healthy', got '%s'.", status) - all_ok = False - except json.JSONDecodeError: - logging.error("Health Check FAILED: Could not decode JSON response.") - all_ok = False - - except httpx.ConnectError: - logging.error("Health Check FAILED: No response from server on port %s.", port) - logging.error("--- Port %s: CHECKS FAILED ---", port) - return False, "Port %s (%s): FAILED (Connection Error)" % (port, expected_model) - except httpx.TimeoutException: - logging.error("Health Check FAILED: Connection timed out for port %s.", port) - logging.error("--- Port %s: CHECKS FAILED ---", port) - return False, "Port %s (%s): FAILED (Connection Timeout)" % ( - port, - expected_model, - ) - - # --- 2. 
Model Check --- - models_url = f"{base_url}/v1/models" - logging.info("Checking %s for '%s'...", models_url, expected_model) - try: - response = httpx.get(models_url, timeout=3) - - if response.status_code != 200: - logging.error("Model Check FAILED: Status code %s", response.status_code) - all_ok = False - else: - try: - data = response.json() - models = data.get("data", []) - model_ids = [model.get("id") for model in models] - - if expected_model in model_ids: - logging.info("Model Check PASSED: Found '%s' in model list.", expected_model) - else: - logging.warning( - "Model Check FAILED: Expected '%s', but it was NOT found.", - expected_model, - ) - logging.warning("Available models:") - for model_id in model_ids: - logging.warning(" - %s", model_id) - all_ok = False - except json.JSONDecodeError: - logging.error("Model Check FAILED: Could not decode JSON response.") - all_ok = False - except AttributeError: - logging.error( - "Model Check FAILED: Unexpected JSON structure in response from %s.", - models_url, - ) - all_ok = False - - except httpx.ConnectError: - logging.error("Model Check FAILED: No response from server on port %s.", port) - all_ok = False - except httpx.TimeoutException: - logging.error("Model Check FAILED: Connection timed out for port %s.", port) - all_ok = False - - # --- Final Status --- - if all_ok: - logging.info("--- Port %s: ALL CHECKS PASSED ---", port) - return True, "Port %s (%s): PASSED" % (port, expected_model) - else: - logging.error("--- Port %s: CHECKS FAILED ---", port) - return False, "Port %s (%s): FAILED" % (port, expected_model) - - -def check_rails_endpoint(port: int): - """ - Checks the /v1/rails/configs endpoint for a specific 200 status - and a non-empty list response. - Returns a tuple: (bool success, str summary) - """ - base_url = f"http://localhost:{port}" - endpoint = f"{base_url}/v1/rails/configs" - all_ok = True - - logging.info("\n--- Checking Port: %s (Rails Config) ---", port) - logging.info("Checking %s ...", endpoint) - - try: - response = httpx.get(endpoint, timeout=3) - - # --- 1. HTTP Status Check --- - if response.status_code == 200: - logging.info("HTTP Status PASSED: Got %s.", response.status_code) - else: - logging.warning("HTTP Status FAILED: Expected 200, got '%s'.", response.status_code) - all_ok = False - - # --- 2. 
Body Content Check --- - try: - data = response.json() - if isinstance(data, list) and len(data) > 0: - logging.info("Body Check PASSED: Response is an array with at least one entry.") - else: - logging.warning("Body Check FAILED: Response is not an array or is empty.") - logging.debug("Response body (first 200 chars): %s", str(response.text)[:200]) - all_ok = False - except json.JSONDecodeError: - logging.error("Body Check FAILED: Could not decode JSON response.") - logging.debug("Response body (first 200 chars): %s", str(response.text)[:200]) - all_ok = False - - except httpx.ConnectError: - logging.error("Rails Check FAILED: No response from server on port %s.", port) - all_ok = False - except httpx.TimeoutException: - logging.error("Rails Check FAILED: Connection timed out for port %s.", port) - all_ok = False - - # --- Final Status --- - if all_ok: - logging.info("--- Port %s: ALL CHECKS PASSED ---", port) - return True, "Port %s (Rails Config): PASSED" % port - else: - logging.error("--- Port %s: CHECKS FAILED ---", port) - return False, "Port %s (Rails Config): FAILED" % port - - -def main(): - """Run all health checks.""" - logging.info("Starting LLM endpoint health check...") - - check_results = [ - check_endpoint(8000, "meta/llama-3.3-70b-instruct"), - check_endpoint(8001, "nvidia/llama-3.1-nemoguard-8b-content-safety"), - check_rails_endpoint(9000), - ] - - logging.info("\n--- Final Summary ---") - - all_passed = True - for success, summary in check_results: - logging.info(summary) - if not success: - all_passed = False - - logging.info("---------------------") - - if all_passed: - logging.info("Overall Status: All endpoints are healthy!") - sys.exit(0) - else: - logging.error("Overall Status: One or more checks FAILED.") - sys.exit(1) - - -if __name__ == "__main__": - main() # pragma: no cover From 811ac9fbce8ecb9782ec7b05f044adcc0ce54db4 Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Wed, 31 Dec 2025 11:54:14 -0600 Subject: [PATCH 04/12] Move unit-tests under benchmark top-level dir --- {tests/benchmark => benchmark/tests}/mock_model_config.yaml | 0 {tests/benchmark => benchmark/tests}/test_aiperf_models.py | 0 {tests/benchmark => benchmark/tests}/test_run_aiperf.py | 0 {tests/benchmark => benchmark/tests}/test_validate_mocks.py | 1 - 4 files changed, 1 deletion(-) rename {tests/benchmark => benchmark/tests}/mock_model_config.yaml (100%) rename {tests/benchmark => benchmark/tests}/test_aiperf_models.py (100%) rename {tests/benchmark => benchmark/tests}/test_run_aiperf.py (100%) rename {tests/benchmark => benchmark/tests}/test_validate_mocks.py (99%) diff --git a/tests/benchmark/test_validate_mocks.py b/benchmark/tests/test_validate_mocks.py similarity index 99% rename from tests/benchmark/test_validate_mocks.py rename to benchmark/tests/test_validate_mocks.py index fb5e61d07..dfa227b9d 100644 ---
a/tests/benchmark/test_validate_mocks.py +++ b/benchmark/tests/test_validate_mocks.py @@ -24,7 +24,6 @@ import httpx import pytest - from nemoguardrails.benchmark.validate_mocks import ( check_endpoint, check_rails_endpoint, From 88c04d9b2fd05a9a8dbbf94760dfc7ae936aa7ba Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Wed, 31 Dec 2025 12:35:44 -0600 Subject: [PATCH 05/12] Update unit-tests with new code location --- benchmark/mock_llm_server/api.py | 7 +- benchmark/mock_llm_server/response_data.py | 3 +- benchmark/mock_llm_server/run_server.py | 3 +- benchmark/tests/test_mock_api.py | 5 +- benchmark/tests/test_mock_config.py | 3 +- benchmark/tests/test_mock_models.py | 5 +- benchmark/tests/test_mock_response_data.py | 29 +- benchmark/tests/test_run_server.py | 6 +- benchmark/tests/test_validate_mocks.py | 429 --------------------- pytest.ini | 1 + 10 files changed, 34 insertions(+), 457 deletions(-) delete mode 100644 benchmark/tests/test_validate_mocks.py diff --git a/benchmark/mock_llm_server/api.py b/benchmark/mock_llm_server/api.py index be53ce953..1dac1c2ee 100644 --- a/benchmark/mock_llm_server/api.py +++ b/benchmark/mock_llm_server/api.py @@ -20,8 +20,9 @@ from typing import Annotated, Union from fastapi import Depends, FastAPI, HTTPException, Request -from nemoguardrails.benchmark.mock_llm_server.config import ModelSettings, get_settings -from nemoguardrails.benchmark.mock_llm_server.models import ( + +from benchmark.mock_llm_server.config import ModelSettings, get_settings +from benchmark.mock_llm_server.models import ( ChatCompletionChoice, ChatCompletionRequest, ChatCompletionResponse, @@ -33,7 +34,7 @@ ModelsResponse, Usage, ) -from nemoguardrails.benchmark.mock_llm_server.response_data import ( +from benchmark.mock_llm_server.response_data import ( calculate_tokens, generate_id, get_latency_seconds, diff --git a/benchmark/mock_llm_server/response_data.py b/benchmark/mock_llm_server/response_data.py index 75456c212..3a9afe64f 100644 --- a/benchmark/mock_llm_server/response_data.py +++ b/benchmark/mock_llm_server/response_data.py @@ -18,7 +18,8 @@ from typing import Optional import numpy as np -from nemoguardrails.benchmark.mock_llm_server.config import ModelSettings + +from benchmark.mock_llm_server.config import ModelSettings def generate_id(prefix: str = "chatcmpl") -> str: diff --git a/benchmark/mock_llm_server/run_server.py b/benchmark/mock_llm_server/run_server.py index a52574db2..c8b69ba9d 100644 --- a/benchmark/mock_llm_server/run_server.py +++ b/benchmark/mock_llm_server/run_server.py @@ -26,7 +26,8 @@ import sys import uvicorn -from nemoguardrails.benchmark.mock_llm_server.config import CONFIG_FILE_ENV_VAR + +from benchmark.mock_llm_server.config import CONFIG_FILE_ENV_VAR # 1. 
Get a logger instance log = logging.getLogger(__name__) diff --git a/benchmark/tests/test_mock_api.py b/benchmark/tests/test_mock_api.py index 5b1d5b92a..96d665dca 100644 --- a/benchmark/tests/test_mock_api.py +++ b/benchmark/tests/test_mock_api.py @@ -17,8 +17,9 @@ import pytest from fastapi.testclient import TestClient -from nemoguardrails.benchmark.mock_llm_server.api import app -from nemoguardrails.benchmark.mock_llm_server.config import ModelSettings, get_settings + +from benchmark.mock_llm_server.api import app +from benchmark.mock_llm_server.config import ModelSettings, get_settings def get_test_settings(): diff --git a/benchmark/tests/test_mock_config.py b/benchmark/tests/test_mock_config.py index 6607a6afa..4b9ac6231 100644 --- a/benchmark/tests/test_mock_config.py +++ b/benchmark/tests/test_mock_config.py @@ -15,9 +15,10 @@ import pytest -from nemoguardrails.benchmark.mock_llm_server.config import ModelSettings from pydantic import ValidationError +from benchmark.mock_llm_server.config import ModelSettings + class TestAppModelConfig: """Test the AppModelConfig Pydantic model.""" diff --git a/benchmark/tests/test_mock_models.py b/benchmark/tests/test_mock_models.py index 6de638f27..31f390249 100644 --- a/benchmark/tests/test_mock_models.py +++ b/benchmark/tests/test_mock_models.py @@ -14,7 +14,9 @@ # limitations under the License. import pytest -from nemoguardrails.benchmark.mock_llm_server.models import ( +from pydantic import ValidationError + +from benchmark.mock_llm_server.models import ( ChatCompletionChoice, ChatCompletionRequest, ChatCompletionResponse, @@ -26,7 +28,6 @@ ModelsResponse, Usage, ) -from pydantic import ValidationError class TestMessage: diff --git a/benchmark/tests/test_mock_response_data.py b/benchmark/tests/test_mock_response_data.py index 23e99474a..7b1716fcf 100644 --- a/benchmark/tests/test_mock_response_data.py +++ b/benchmark/tests/test_mock_response_data.py @@ -17,8 +17,9 @@ from unittest.mock import MagicMock, patch import pytest -from nemoguardrails.benchmark.mock_llm_server.config import ModelSettings -from nemoguardrails.benchmark.mock_llm_server.response_data import ( + +from benchmark.mock_llm_server.config import ModelSettings +from benchmark.mock_llm_server.response_data import ( calculate_tokens, generate_id, get_latency_seconds, @@ -114,8 +115,8 @@ def random_seed() -> int: return 12345 -@patch("nemoguardrails.benchmark.mock_llm_server.response_data.np.random.seed") -@patch("nemoguardrails.benchmark.mock_llm_server.response_data.np.random.binomial") +@patch("benchmark.mock_llm_server.response_data.np.random.seed") +@patch("benchmark.mock_llm_server.response_data.np.random.binomial") def test_is_unsafe_mocks_no_seed(mock_binomial: MagicMock, mock_seed: MagicMock, model_settings: ModelSettings): """Check `is_unsafe()` calls the correct numpy functions""" mock_binomial.return_value = [True] @@ -128,8 +129,8 @@ def test_is_unsafe_mocks_no_seed(mock_binomial: MagicMock, mock_seed: MagicMock, mock_binomial.assert_called_once_with(n=1, p=model_settings.unsafe_probability, size=1) -@patch("nemoguardrails.benchmark.mock_llm_server.response_data.np.random.seed") -@patch("nemoguardrails.benchmark.mock_llm_server.response_data.np.random.binomial") +@patch("benchmark.mock_llm_server.response_data.np.random.seed") +@patch("benchmark.mock_llm_server.response_data.np.random.binomial") def test_is_unsafe_mocks_with_seed(mock_binomial, mock_seed, model_settings: ModelSettings, random_seed: int): """Check `is_unsafe()` calls the correct numpy functions""" 
mock_binomial.return_value = [False] @@ -160,7 +161,7 @@ def test_is_unsafe_prob_zero(model_settings: ModelSettings): def test_get_response_safe(model_settings: ModelSettings): """Check we get the safe response with is_unsafe returns False""" - with patch("nemoguardrails.benchmark.mock_llm_server.response_data.is_unsafe") as mock_is_unsafe: + with patch("benchmark.mock_llm_server.response_data.is_unsafe") as mock_is_unsafe: mock_is_unsafe.return_value = False response = get_response(model_settings) assert response == model_settings.safe_text @@ -168,15 +169,15 @@ def test_get_response_safe(model_settings: ModelSettings): def test_get_response_unsafe(model_settings: ModelSettings): """Check we get the safe response with is_unsafe returns False""" - with patch("nemoguardrails.benchmark.mock_llm_server.response_data.is_unsafe") as mock_is_unsafe: + with patch("benchmark.mock_llm_server.response_data.is_unsafe") as mock_is_unsafe: mock_is_unsafe.return_value = True response = get_response(model_settings) assert response == model_settings.unsafe_text -@patch("nemoguardrails.benchmark.mock_llm_server.response_data.np.random.seed") -@patch("nemoguardrails.benchmark.mock_llm_server.response_data.np.random.normal") -@patch("nemoguardrails.benchmark.mock_llm_server.response_data.np.clip") +@patch("benchmark.mock_llm_server.response_data.np.random.seed") +@patch("benchmark.mock_llm_server.response_data.np.random.normal") +@patch("benchmark.mock_llm_server.response_data.np.clip") def test_get_latency_seconds_mocks_no_seed(mock_clip, mock_normal, mock_seed, model_settings: ModelSettings): """Check we call the correct numpy functions (not including seed)""" @@ -199,9 +200,9 @@ def test_get_latency_seconds_mocks_no_seed(mock_clip, mock_normal, mock_seed, mo ) -@patch("nemoguardrails.benchmark.mock_llm_server.response_data.np.random.seed") -@patch("nemoguardrails.benchmark.mock_llm_server.response_data.np.random.normal") -@patch("nemoguardrails.benchmark.mock_llm_server.response_data.np.clip") +@patch("benchmark.mock_llm_server.response_data.np.random.seed") +@patch("benchmark.mock_llm_server.response_data.np.random.normal") +@patch("benchmark.mock_llm_server.response_data.np.clip") def test_get_latency_seconds_mocks_with_seed( mock_clip, mock_normal, mock_seed, model_settings: ModelSettings, random_seed: int ): diff --git a/benchmark/tests/test_run_server.py b/benchmark/tests/test_run_server.py index 95d8d624a..06a360d7f 100644 --- a/benchmark/tests/test_run_server.py +++ b/benchmark/tests/test_run_server.py @@ -17,10 +17,8 @@ from unittest.mock import patch import pytest -from nemoguardrails.benchmark.mock_llm_server.run_server import ( - parse_arguments, - validate_config_file, -) + +from benchmark.mock_llm_server.run_server import parse_arguments, validate_config_file class TestParseArguments: diff --git a/benchmark/tests/test_validate_mocks.py b/benchmark/tests/test_validate_mocks.py deleted file mode 100644 index dfa227b9d..000000000 --- a/benchmark/tests/test_validate_mocks.py +++ /dev/null @@ -1,429 +0,0 @@ -#!/usr/bin/env python3 - -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Tests for validate_mocks.py script. -""" - -import json -from unittest.mock import MagicMock, patch - -import httpx -import pytest -from nemoguardrails.benchmark.validate_mocks import ( - check_endpoint, - check_rails_endpoint, - main, -) - - -class TestCheckEndpoint: - """Tests for check_endpoint function.""" - - @patch("nemoguardrails.benchmark.validate_mocks.httpx.get") - def test_check_endpoint_success(self, mock_get): - """Test successful health and model checks.""" - # Mock health check response - health_response = MagicMock() - health_response.status_code = 200 - health_response.json.return_value = {"status": "healthy"} - - # Mock models check response - models_response = MagicMock() - models_response.status_code = 200 - models_response.json.return_value = { - "data": [ - {"id": "meta/llama-3.3-70b-instruct"}, - {"id": "other-model"}, - ] - } - - mock_get.side_effect = [health_response, models_response] - - success, summary = check_endpoint(8000, "meta/llama-3.3-70b-instruct") - - assert success - assert "PASSED" in summary - assert "8000" in summary - assert mock_get.call_count == 2 - - @patch("nemoguardrails.benchmark.validate_mocks.httpx.get") - def test_check_endpoint_health_check_failed_status(self, mock_get): - """Test health check with non-200 status code.""" - health_response = MagicMock() - health_response.status_code = 404 - - mock_get.return_value = health_response - - success, summary = check_endpoint(8000, "test-model") - - assert not success - assert "FAILED" in summary - - @patch("nemoguardrails.benchmark.validate_mocks.httpx.get") - def test_check_endpoint_health_check_unhealthy_status(self, mock_get): - """Test health check with unhealthy status.""" - health_response = MagicMock() - health_response.status_code = 200 - health_response.json.return_value = {"status": "unhealthy"} - - models_response = MagicMock() - models_response.status_code = 200 - models_response.json.return_value = {"data": [{"id": "test-model"}]} - - mock_get.side_effect = [health_response, models_response] - - success, summary = check_endpoint(8000, "test-model") - - assert not success - assert "FAILED" in summary - - @patch("nemoguardrails.benchmark.validate_mocks.httpx.get") - def test_check_endpoint_health_check_json_decode_error(self, mock_get): - """Test health check with invalid JSON.""" - health_response = MagicMock() - health_response.status_code = 200 - health_response.json.side_effect = json.JSONDecodeError("Expecting value", "", 0) - - mock_get.return_value = health_response - - success, summary = check_endpoint(8000, "test-model") - - assert not success - assert "FAILED" in summary - - @patch("nemoguardrails.benchmark.validate_mocks.httpx.get") - def test_check_endpoint_health_connection_error(self, mock_get): - """Test health check with connection error.""" - mock_get.side_effect = httpx.ConnectError("Connection failed") - - success, summary = check_endpoint(8000, "test-model") - - assert not success - assert "FAILED" in summary - assert "Connection Error" in summary - - @patch("nemoguardrails.benchmark.validate_mocks.httpx.get") - def 
test_check_endpoint_health_timeout(self, mock_get): - """Test health check with timeout.""" - mock_get.side_effect = httpx.TimeoutException("Request timed out") - - success, summary = check_endpoint(8000, "test-model") - - assert not success - assert "FAILED" in summary - assert "Connection Timeout" in summary - - @patch("nemoguardrails.benchmark.validate_mocks.httpx.get") - def test_check_endpoint_model_check_failed_status(self, mock_get): - """Test model check with non-200 status code.""" - health_response = MagicMock() - health_response.status_code = 200 - health_response.json.return_value = {"status": "healthy"} - - models_response = MagicMock() - models_response.status_code = 404 - - mock_get.side_effect = [health_response, models_response] - - success, summary = check_endpoint(8000, "test-model") - - assert not success - assert "FAILED" in summary - - @patch("nemoguardrails.benchmark.validate_mocks.httpx.get") - def test_check_endpoint_model_not_found(self, mock_get): - """Test model check when expected model is not in the list.""" - health_response = MagicMock() - health_response.status_code = 200 - health_response.json.return_value = {"status": "healthy"} - - models_response = MagicMock() - models_response.status_code = 200 - models_response.json.return_value = { - "data": [ - {"id": "other-model-1"}, - {"id": "other-model-2"}, - ] - } - - mock_get.side_effect = [health_response, models_response] - - success, summary = check_endpoint(8000, "test-model") - - assert not success - assert "FAILED" in summary - - @patch("nemoguardrails.benchmark.validate_mocks.httpx.get") - def test_check_endpoint_model_check_json_decode_error(self, mock_get): - """Test model check with invalid JSON.""" - health_response = MagicMock() - health_response.status_code = 200 - health_response.json.return_value = {"status": "healthy"} - - models_response = MagicMock() - models_response.status_code = 200 - models_response.json.side_effect = json.JSONDecodeError("Expecting value", "", 0) - - mock_get.side_effect = [health_response, models_response] - - success, summary = check_endpoint(8000, "test-model") - - assert not success - assert "FAILED" in summary - - @patch("nemoguardrails.benchmark.validate_mocks.httpx.get") - def test_check_endpoint_model_check_unexpected_json_structure(self, mock_get): - """Test model check with unexpected JSON structure.""" - health_response = MagicMock() - health_response.status_code = 200 - health_response.json.return_value = {"status": "healthy"} - - models_response = MagicMock() - models_response.status_code = 200 - # Return invalid structure that will cause AttributeError - models_response.json.return_value = "invalid" - - mock_get.side_effect = [health_response, models_response] - - success, summary = check_endpoint(8000, "test-model") - - assert not success - assert "FAILED" in summary - - @patch("nemoguardrails.benchmark.validate_mocks.httpx.get") - def test_check_endpoint_model_check_connection_error(self, mock_get): - """Test model check with connection error.""" - health_response = MagicMock() - health_response.status_code = 200 - health_response.json.return_value = {"status": "healthy"} - - mock_get.side_effect = [ - health_response, - httpx.ConnectError("Connection failed"), - ] - - success, summary = check_endpoint(8000, "test-model") - - assert not success - assert "FAILED" in summary - - @patch("nemoguardrails.benchmark.validate_mocks.httpx.get") - def test_check_endpoint_model_check_timeout(self, mock_get): - """Test model check with timeout.""" - health_response 
= MagicMock() - health_response.status_code = 200 - health_response.json.return_value = {"status": "healthy"} - - mock_get.side_effect = [ - health_response, - httpx.TimeoutException("Request timed out"), - ] - - success, summary = check_endpoint(8000, "test-model") - - assert not success - assert "FAILED" in summary - - -class TestCheckRailsEndpoint: - """Tests for check_rails_endpoint function.""" - - @patch("nemoguardrails.benchmark.validate_mocks.httpx.get") - def test_check_rails_endpoint_success(self, mock_get): - """Test successful rails config check.""" - response = MagicMock() - response.status_code = 200 - response.json.return_value = [ - {"id": "config1", "name": "Config 1"}, - {"id": "config2", "name": "Config 2"}, - ] - - mock_get.return_value = response - - success, summary = check_rails_endpoint(9000) - - assert success - assert "PASSED" in summary - assert "9000" in summary - - @patch("nemoguardrails.benchmark.validate_mocks.httpx.get") - def test_check_rails_endpoint_non_200_status(self, mock_get): - """Test rails config check with non-200 status.""" - response = MagicMock() - response.status_code = 404 - response.json.return_value = [] - - mock_get.return_value = response - - success, summary = check_rails_endpoint(9000) - - assert not success - assert "FAILED" in summary - - @patch("nemoguardrails.benchmark.validate_mocks.httpx.get") - def test_check_rails_endpoint_empty_list(self, mock_get): - """Test rails config check with empty list response.""" - response = MagicMock() - response.status_code = 200 - response.json.return_value = [] - - mock_get.return_value = response - - success, summary = check_rails_endpoint(9000) - - assert not success - assert "FAILED" in summary - - @patch("nemoguardrails.benchmark.validate_mocks.httpx.get") - def test_check_rails_endpoint_not_a_list(self, mock_get): - """Test rails config check with non-list response.""" - response = MagicMock() - response.status_code = 200 - response.json.return_value = {"error": "invalid"} - - mock_get.return_value = response - - success, summary = check_rails_endpoint(9000) - - assert not success - assert "FAILED" in summary - - @patch("nemoguardrails.benchmark.validate_mocks.httpx.get") - def test_check_rails_endpoint_json_decode_error(self, mock_get): - """Test rails config check with invalid JSON.""" - response = MagicMock() - response.status_code = 200 - response.text = "invalid json" - response.json.side_effect = json.JSONDecodeError("Expecting value", "", 0) - - mock_get.return_value = response - - success, summary = check_rails_endpoint(9000) - - assert not success - assert "FAILED" in summary - - @patch("nemoguardrails.benchmark.validate_mocks.httpx.get") - def test_check_rails_endpoint_connection_error(self, mock_get): - """Test rails config check with connection error.""" - mock_get.side_effect = httpx.ConnectError("Connection failed") - - success, summary = check_rails_endpoint(9000) - - assert not success - assert "FAILED" in summary - - @patch("nemoguardrails.benchmark.validate_mocks.httpx.get") - def test_check_rails_endpoint_timeout(self, mock_get): - """Test rails config check with timeout.""" - mock_get.side_effect = httpx.TimeoutException("Request timed out") - - success, summary = check_rails_endpoint(9000) - - assert not success - assert "FAILED" in summary - - -class TestMain: - """Tests for main function.""" - - @patch("nemoguardrails.benchmark.validate_mocks.check_rails_endpoint") - @patch("nemoguardrails.benchmark.validate_mocks.check_endpoint") - def test_main_all_passed(self, 
mock_check_endpoint, mock_check_rails_endpoint): - """Test main function when all checks pass.""" - mock_check_endpoint.side_effect = [ - (True, "Port 8000 (meta/llama-3.3-70b-instruct): PASSED"), - ( - True, - "Port 8001 (nvidia/llama-3.1-nemoguard-8b-content-safety): PASSED", - ), - ] - mock_check_rails_endpoint.return_value = ( - True, - "Port 9000 (Rails Config): PASSED", - ) - - with pytest.raises(SystemExit) as exc_info: - main() - - assert exc_info.value.code == 0 - assert mock_check_endpoint.call_count == 2 - assert mock_check_rails_endpoint.call_count == 1 - - @patch("nemoguardrails.benchmark.validate_mocks.check_rails_endpoint") - @patch("nemoguardrails.benchmark.validate_mocks.check_endpoint") - def test_main_one_failed(self, mock_check_endpoint, mock_check_rails_endpoint): - """Test main function when one check fails.""" - mock_check_endpoint.side_effect = [ - (False, "Port 8000 (meta/llama-3.3-70b-instruct): FAILED"), - ( - True, - "Port 8001 (nvidia/llama-3.1-nemoguard-8b-content-safety): PASSED", - ), - ] - mock_check_rails_endpoint.return_value = ( - True, - "Port 9000 (Rails Config): PASSED", - ) - - with pytest.raises(SystemExit) as exc_info: - main() - - assert exc_info.value.code == 1 - - @patch("nemoguardrails.benchmark.validate_mocks.check_rails_endpoint") - @patch("nemoguardrails.benchmark.validate_mocks.check_endpoint") - def test_main_all_failed(self, mock_check_endpoint, mock_check_rails_endpoint): - """Test main function when all checks fail.""" - mock_check_endpoint.side_effect = [ - (False, "Port 8000 (meta/llama-3.3-70b-instruct): FAILED"), - ( - False, - "Port 8001 (nvidia/llama-3.1-nemoguard-8b-content-safety): FAILED", - ), - ] - mock_check_rails_endpoint.return_value = ( - False, - "Port 9000 (Rails Config): FAILED", - ) - - with pytest.raises(SystemExit) as exc_info: - main() - - assert exc_info.value.code == 1 - - @patch("nemoguardrails.benchmark.validate_mocks.check_rails_endpoint") - @patch("nemoguardrails.benchmark.validate_mocks.check_endpoint") - def test_main_rails_failed(self, mock_check_endpoint, mock_check_rails_endpoint): - """Test main function when only rails check fails.""" - mock_check_endpoint.side_effect = [ - (True, "Port 8000 (meta/llama-3.3-70b-instruct): PASSED"), - ( - True, - "Port 8001 (nvidia/llama-3.1-nemoguard-8b-content-safety): PASSED", - ), - ] - mock_check_rails_endpoint.return_value = ( - False, - "Port 9000 (Rails Config): FAILED", - ) - - with pytest.raises(SystemExit) as exc_info: - main() - - assert exc_info.value.code == 1 diff --git a/pytest.ini b/pytest.ini index 6e29720ec..8816e5b62 100644 --- a/pytest.ini +++ b/pytest.ini @@ -13,3 +13,4 @@ asyncio_default_fixture_loop_scope = function testpaths = tests docs/colang-2/examples + benchmark/tests From 0171c253591488501ac2556fee80953b9738a37e Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Wed, 31 Dec 2025 12:53:51 -0600 Subject: [PATCH 06/12] Add requirements to keep benchmark dependencies separate from Guardrails itself --- benchmark/requirements.txt | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 benchmark/requirements.txt diff --git a/benchmark/requirements.txt b/benchmark/requirements.txt new file mode 100644 index 000000000..25afe3861 --- /dev/null +++ b/benchmark/requirements.txt @@ -0,0 +1,22 @@ +# Runtime dependencies for benchmark tools +# +# Install with: pip install -r requirements.txt +# +# Note: Version constraints are aligned with the main nemoguardrails package +# where 
applicable to ensure compatibility. + +# --- general dependencies --- +honcho>=2.0.0 +langchain-nvidia-ai-endpoints>=0.3.19 + +# --- mock_llm_server dependencies --- +fastapi>=0.103.0 +uvicorn>=0.23 +pydantic>=2.0 +pydantic-settings>=2.0 +numpy>=1.21 + +# --- aiperf dependencies --- +httpx>=0.24.1 +typer>=0.8 +pyyaml>=6.0 From 593e9e45a3d652ec8c9030296df7a1ab6ec1b39f Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Wed, 31 Dec 2025 12:54:29 -0600 Subject: [PATCH 07/12] Update server run script and Procfile with new file locations --- benchmark/Procfile | 8 ++++---- benchmark/mock_llm_server/run_server.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/benchmark/Procfile b/benchmark/Procfile index f177f52be..a5eb8fa37 100644 --- a/benchmark/Procfile +++ b/benchmark/Procfile @@ -1,8 +1,8 @@ # Procfile # NeMo Guardrails server -gr: poetry run nemoguardrails server --config configs/guardrail_configs --default-config-id content_safety_colang1 --port 9000 +gr: poetry run nemoguardrails server --config ../examples/configs/content_safety_local --default-config-id content_safety_local --port 9000 -# Guardrails NIMs for inference -app_llm: poetry run python mock_llm_server/run_server.py --workers 4 --port 8000 --config-file configs/mock_configs/meta-llama-3.3-70b-instruct.env -cs_llm: poetry run python mock_llm_server/run_server.py --workers 4 --port 8001 --config-file configs/mock_configs/nvidia-llama-3.1-nemoguard-8b-content-safety.env +# Guardrails NIMs for inference. PYTHONPATH is set to the project root so absolute imports work +app_llm: PYTHONPATH=.. python mock_llm_server/run_server.py --workers 4 --port 8000 --config-file mock_llm_server/configs/meta-llama-3.3-70b-instruct.env +cs_llm: PYTHONPATH=.. 
python mock_llm_server/run_server.py --workers 4 --port 8001 --config-file mock_llm_server/configs/nvidia-llama-3.1-nemoguard-8b-content-safety.env diff --git a/benchmark/mock_llm_server/run_server.py b/benchmark/mock_llm_server/run_server.py index c8b69ba9d..1ab3c3afe 100644 --- a/benchmark/mock_llm_server/run_server.py +++ b/benchmark/mock_llm_server/run_server.py @@ -101,7 +101,7 @@ def main(): # pragma: no cover try: uvicorn.run( - "nemoguardrails.benchmark.mock_llm_server.api:app", + "benchmark.mock_llm_server.api:app", host=args.host, port=args.port, reload=args.reload, From a34f0367e8ee25933e510a4008a0209b7edbb2e2 Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Wed, 31 Dec 2025 13:41:24 -0600 Subject: [PATCH 08/12] Return np.array with size (1,) from mock function calls in tests --- benchmark/mock_llm_server/response_data.py | 2 +- benchmark/tests/test_mock_response_data.py | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/benchmark/mock_llm_server/response_data.py b/benchmark/mock_llm_server/response_data.py index 3a9afe64f..27f035266 100644 --- a/benchmark/mock_llm_server/response_data.py +++ b/benchmark/mock_llm_server/response_data.py @@ -56,7 +56,7 @@ def get_latency_seconds(config: ModelSettings, seed: Optional[int] = None) -> fl a_min=config.latency_min_seconds, a_max=config.latency_max_seconds, ) - return float(latency_seconds) + return float(latency_seconds[0]) def is_unsafe(config: ModelSettings, seed: Optional[int] = None) -> bool: diff --git a/benchmark/tests/test_mock_response_data.py b/benchmark/tests/test_mock_response_data.py index 7b1716fcf..cd6704193 100644 --- a/benchmark/tests/test_mock_response_data.py +++ b/benchmark/tests/test_mock_response_data.py @@ -16,6 +16,7 @@ import re from unittest.mock import MagicMock, patch +import numpy as np import pytest from benchmark.mock_llm_server.config import ModelSettings @@ -181,8 +182,8 @@ def test_get_response_unsafe(model_settings: ModelSettings): def test_get_latency_seconds_mocks_no_seed(mock_clip, mock_normal, mock_seed, model_settings: ModelSettings): """Check we call the correct numpy functions (not including seed)""" - mock_normal.return_value = model_settings.latency_mean_seconds - mock_clip.return_value = model_settings.latency_max_seconds + mock_normal.return_value = np.array([model_settings.latency_mean_seconds]) + mock_clip.return_value = np.array([model_settings.latency_max_seconds]) result = get_latency_seconds(model_settings) @@ -208,8 +209,8 @@ def test_get_latency_seconds_mocks_with_seed( ): """Check we call the correct numpy functions (not including seed)""" - mock_normal.return_value = model_settings.latency_mean_seconds - mock_clip.return_value = model_settings.latency_max_seconds + mock_normal.return_value = np.array([model_settings.latency_mean_seconds]) + mock_clip.return_value = np.array([model_settings.latency_max_seconds]) result = get_latency_seconds(model_settings, seed=random_seed) From 0142faf8cdccdd0410b6515b680420df9e7cc959 Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Wed, 31 Dec 2025 13:42:13 -0600 Subject: [PATCH 09/12] Remove langchain_nvidia_ai_endpoints from requirements, Guardrails already has this in the poetry env. 
It's not used in the mocks --- benchmark/requirements.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/benchmark/requirements.txt b/benchmark/requirements.txt index 25afe3861..a86e8ec3c 100644 --- a/benchmark/requirements.txt +++ b/benchmark/requirements.txt @@ -7,14 +7,13 @@ # --- general dependencies --- honcho>=2.0.0 -langchain-nvidia-ai-endpoints>=0.3.19 # --- mock_llm_server dependencies --- fastapi>=0.103.0 uvicorn>=0.23 pydantic>=2.0 pydantic-settings>=2.0 -numpy>=1.21 +numpy>=2.3.2 # --- aiperf dependencies --- httpx>=0.24.1 From 205db903ba9a020939875342f7f1cbdf57679313 Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Wed, 31 Dec 2025 13:42:34 -0600 Subject: [PATCH 10/12] Update README to match new file locations and include venv instructions --- benchmark/README.md | 65 ++++++++++++++++++++++++++++----------------- 1 file changed, 40 insertions(+), 25 deletions(-) diff --git a/benchmark/README.md b/benchmark/README.md index 914d1b47c..1e7ebccf7 100644 --- a/benchmark/README.md +++ b/benchmark/README.md @@ -14,29 +14,40 @@ All models use the [Mock LLM Server](mock_llm_server), which is a simplified mod The aim of this benchmark is to detect performance-regressions as quickly as running unit-tests. ## Quickstart: Running Guardrails with Mock LLMs + To run Guardrails with mocks for both the content-safety and main LLM, follow the steps below. -All commands must be run in the `nemoguardrails/benchmark` directory. -These assume you already have a working environment after following the steps in [CONTRIBUTING.md](../../CONTRIBUTING.md). +All commands must be run in the `benchmark` directory. + +### 1. Set up benchmarking virtual environment -First, we need to install the honcho and langchain-nvidia-ai-endpoints packages. -The `honcho` package is used to run Procfile-based applications, and is a Python port of [Foreman](https://github.com/ddollar/foreman). -The `langchain-nvidia-ai-endpoints` package is used to communicate with Mock LLMs via Langchain. +The benchmarking tools have their own dependencies, which are managed using a virtual environment, pip, and the [requirements.txt](requirements.txt) file. +In this section, you'll create a new virtual environment, activate it, and install all the dependencies needed to benchmark Guardrails. + +First you'll create the virtual environment and install dependencies. ```shell -# Install dependencies -$ poetry run pip install honcho langchain-nvidia-ai-endpoints +# These commands must be run in the benchmark directory + +$ cd benchmark +$ mkdir ~/env +$ python -m venv ~/env/benchmark +$ pip install -r requirements.txt ... -Successfully installed filetype-1.2.0 honcho-2.0.0 langchain-nvidia-ai-endpoints-0.3.19 +Successfully installed fastapi-0.128.0 honcho-2.0.0 httpx-0.28.1 langchain-core-1.2.5 langchain-nvidia-ai-endpoints-1.0.0 numpy-2.4.0 pydantic-2.12.5 pydantic-core-2.41.5 pydantic-settings-2.12.0 pyyaml-6.0.3 typer-0.21.0 typing-inspection-0.4.2 uuid-utils-0.12.0 uvicorn-0.40.0 +$ source ~/env/benchmark/bin/activate ``` +### 2. Run Guardrails with Mock LLMs for Content-Safety and Application LLM + Now we can start up the processes that are part of the [Procfile](Procfile). As the Procfile processes spin up, they log to the console with a prefix. The `system` prefix is used by Honcho, `app_llm` is the Application or Main LLM mock, `cs_llm` is the content-safety mock, and `gr` is the Guardrails service. We'll explore the Procfile in more detail below. 
Once the three 'Uvicorn running on ...' messages are printed, you can move to the next step. Note these messages are likely not on consecutive lines. -``` -# All commands must be run in the nemoguardrails/benchmark directory -$ cd nemoguardrails/benchmark -$ poetry run honcho start +```shell +# These commands must be run in the benchmark directory + +(benchmark) $ cd benchmark +(benchmark) $ poetry run honcho start 13:40:33 system | gr.1 started (pid=93634) 13:40:33 system | app_llm.1 started (pid=93635) 13:40:33 system | cs_llm.1 started (pid=93636) @@ -48,34 +59,33 @@ $ poetry run honcho start 13:40:45 gr.1 | INFO: Uvicorn running on http://0.0.0.0:9000 (Press CTRL+C to quit) ``` -Once Guardrails and the mock servers are up, we can use the `validate_mocks.py` script to check they're healthy and serving the correct models. +### 3. Validate services are running correctly + +Once Guardrails and the mock servers are up, we can use the [validate_mocks.sh](scripts/validate_mocks.sh) script to check all services are healthy and serving the expected model names. ```shell -$ cd nemoguardrails/benchmark -$ poetry run python validate_mocks.py +# These commands must be run in the benchmark directory + +(benchmark) $ cd nemoguardrails/benchmark +(benchmark) $ scripts/validate_mocks.sh Starting LLM endpoint health check... --- Checking Port: 8000 --- Checking http://localhost:8000/health ... -HTTP Request: GET http://localhost:8000/health "HTTP/1.1 200 OK" Health Check PASSED: Status is 'healthy'. Checking http://localhost:8000/v1/models for 'meta/llama-3.3-70b-instruct'... -HTTP Request: GET http://localhost:8000/v1/models "HTTP/1.1 200 OK" Model Check PASSED: Found 'meta/llama-3.3-70b-instruct' in model list. --- Port 8000: ALL CHECKS PASSED --- --- Checking Port: 8001 --- Checking http://localhost:8001/health ... -HTTP Request: GET http://localhost:8001/health "HTTP/1.1 200 OK" Health Check PASSED: Status is 'healthy'. Checking http://localhost:8001/v1/models for 'nvidia/llama-3.1-nemoguard-8b-content-safety'... -HTTP Request: GET http://localhost:8001/v1/models "HTTP/1.1 200 OK" Model Check PASSED: Found 'nvidia/llama-3.1-nemoguard-8b-content-safety' in model list. --- Port 8001: ALL CHECKS PASSED --- --- Checking Port: 9000 (Rails Config) --- Checking http://localhost:9000/v1/rails/configs ... -HTTP Request: GET http://localhost:9000/v1/rails/configs "HTTP/1.1 200 OK" HTTP Status PASSED: Got 200. Body Check PASSED: Response is an array with at least one entry. --- Port 9000: ALL CHECKS PASSED --- @@ -88,10 +98,12 @@ Port 9000 (Rails Config): PASSED Overall Status: All endpoints are healthy! ``` +### 4. Make Guardrails requests + Once the mocks and Guardrails are running and the script passes, we can issue curl requests against the Guardrails `/chat/completions` endpoint to generate a response and test the system end-to-end. ```shell -curl -s -X POST http://0.0.0.0:9000/v1/chat/completions \ + $ curl -s -X POST http://0.0.0.0:9000/v1/chat/completions \ -H 'Accept: application/json' \ -H 'Content-Type: application/json' \ -d '{ @@ -112,7 +124,6 @@ curl -s -X POST http://0.0.0.0:9000/v1/chat/completions \ } ] } - ``` ------ @@ -125,18 +136,20 @@ In this section, we'll examine the configuration files used in the quickstart ab The [Procfile](Procfile?raw=true) contains all the processes that make up the application. 
The Honcho package reads in this file, starts all the processes, and combines their logs to the console -The `gr` line runs the Guardrails server on port 9000 and sets the default Guardrails configuration as [content_safety_colang1](configs/guardrail_configs/content_safety_colang1?raw=true). +The `gr` line runs the Guardrails server on port 9000 and sets the default Guardrails configuration as [content_safety_colang1](../examples/configs/content_safety_local?raw=true). The `app_llm` line runs the Application or Main Mock LLM. Guardrails calls this LLM to generate a response to the user's query. This server uses 4 uvicorn workers and runs on port 8000. The configuration file here is a Mock LLM configuration, not a Guardrails configuration. The `cs_llm` line runs the Content-Safety Mock LLM. This uses 4 uvicorn workers and runs on port 8001. ### Guardrails Configuration -The [Guardrails Configuration](configs/guardrail_configs/content_safety_colang1/config.yml) is used by the Guardrails server. + +The [Guardrails Configuration](../examples/configs/content_safety_local/config.yml?raw=true) is used by the Guardrails server. Under the `models` section, the `main` model is used to generate responses to the user queries. The base URL for this model is the `app_llm` Mock LLM from the Procfile, running on port 8000. The `model` field has to match the Mock LLM model name. The `content_safety` model is configured for use in an input and output rail. The `type` field matches the `$model` used in the input and output flows. ### Mock LLM Endpoints + The Mock LLM implements a subset of the OpenAI LLM API. -There are two Mock LLM configurations, one for the Mock [main model](configs/mock_configs/meta-llama-3.3-70b-instruct.env), and another for the Mock [content-safety](configs/mock_configs/nvidia-llama-3.1-nemoguard-8b-content-safety.env) model. +There are two Mock LLM configurations, one for the Mock [main model](mock_llm_server/configs/meta-llama-3.3-70b-instruct.env?raw=true), and another for the Mock [content-safety](mock_llm_server/configs/nvidia-llama-3.1-nemoguard-8b-content-safety.env?raw=true) model. The Mock LLM has the following OpenAI-compatible endpoints: * `/health`: Returns a JSON object with status set to healthy and timestamp in seconds-since-epoch. For example `{"status":"healthy","timestamp":1762781239}` @@ -145,6 +158,7 @@ The Mock LLM has the following OpenAI-compatible endpoints: * `/v1/chat/completions`: Returns an [OpenAI chat completion object](https://platform.openai.com/docs/api-reference/chat/object) using the Mock configuration (see below). ### Mock LLM Configuration + Mock LLMs are configured using the `.env` file format. These files are passed to the Mock LLM using the `--config-file` argument. The Mock LLMs return either a `SAFE_TEXT` or `UNSAFE_TEXT` response to `/v1/completions` or `/v1/chat/completions` inference requests. The probability of the `UNSAFE_TEXT` being returned is given by `UNSAFE_PROBABILITY`. @@ -155,6 +169,7 @@ The latency of each response is also controllable, and works as follows: * If the sampled value is greater than `LATENCY_MAX_SECONDS`, it is set to `LATENCY_MAX_SECONDS`. The full list of configuration fields is shown below: + * `MODEL`: The Model name served by the Mock LLM. This will be returned on the `/v1/models` endpoint. * `UNSAFE_PROBABILITY`: Probability of an unsafe response. This is a probability, and must be in the range [0, 1]. * `UNSAFE_TEXT`: String returned as an unsafe response.
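The values in these `.env` files are read into the `ModelSettings` class from `mock_llm_server/config.py` (the benchmark tests import it as `benchmark.mock_llm_server.config.ModelSettings`). That class is not shown in this patch series, so the snippet below is only a minimal pydantic-settings sketch of how such a file can be mapped onto typed fields; the field names are inferred from the documented `.env` keys and from the `latency_*` attributes used in `response_data.py`, and may not match the real implementation exactly.

```python
# Illustrative only: a minimal pydantic-settings sketch of loading a mock-LLM
# .env file. The real class is ModelSettings in mock_llm_server/config.py; the
# field names here are inferred from the documented .env keys and from the
# latency_* attributes referenced in response_data.py.
from pydantic_settings import BaseSettings, SettingsConfigDict


class MockModelSettings(BaseSettings):
    """Hypothetical stand-in for the mock server's ModelSettings."""

    model: str
    unsafe_probability: float
    unsafe_text: str
    safe_text: str
    latency_min_seconds: float
    latency_mean_seconds: float
    latency_max_seconds: float

    # Upper-case keys such as MODEL or UNSAFE_PROBABILITY in the .env file map
    # onto these lower-case fields (matching is case-insensitive by default).
    model_config = SettingsConfigDict(case_sensitive=False)


if __name__ == "__main__":
    # Point at one of the files passed via --config-file in the Procfile.
    settings = MockModelSettings(
        _env_file="mock_llm_server/configs/meta-llama-3.3-70b-instruct.env"
    )
    print(settings.model, settings.unsafe_probability)
```

Running the sketch from the `benchmark` directory would print the model name and unsafe probability for the main-LLM mock; the real server receives the same information through the `--config-file` argument shown in the Procfile.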
From 53810b0b80a05c4c2bc5d82d3d71a9538b027f0f Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Wed, 31 Dec 2025 13:56:01 -0600 Subject: [PATCH 11/12] Cleanups to the README --- benchmark/README.md | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/benchmark/README.md b/benchmark/README.md index 1e7ebccf7..7e58a17c0 100644 --- a/benchmark/README.md +++ b/benchmark/README.md @@ -33,8 +33,9 @@ $ mkdir ~/env $ python -m venv ~/env/benchmark $ pip install -r requirements.txt ... -Successfully installed fastapi-0.128.0 honcho-2.0.0 httpx-0.28.1 langchain-core-1.2.5 langchain-nvidia-ai-endpoints-1.0.0 numpy-2.4.0 pydantic-2.12.5 pydantic-core-2.41.5 pydantic-settings-2.12.0 pyyaml-6.0.3 typer-0.21.0 typing-inspection-0.4.2 uuid-utils-0.12.0 uvicorn-0.40.0 +Successfully installed fastapi-0.128.0 honcho-2.0.0 httpx-0.28.1 langchain-core-1.2.5 numpy-2.4.0 pydantic-2.12.5 pydantic-core-2.41.5 pydantic-settings-2.12.0 pyyaml-6.0.3 typer-0.21.0 typing-inspection-0.4.2 uuid-utils-0.12.0 uvicorn-0.40.0 $ source ~/env/benchmark/bin/activate +(benchmark) $ ``` ### 2. Run Guardrails with Mock LLMs for Content-Safety and Application LLM @@ -44,7 +45,7 @@ As the Procfile processes spin up, they log to the console with a prefix. The `s Once the three 'Uvicorn running on ...' messages are printed, you can move to the next step. Note these messages are likely not on consecutive lines. ```shell -# These commands must be run in the benchmark directory +# These commands must be run in the benchmark directory after activating the virtual environment (benchmark) $ cd benchmark (benchmark) $ poetry run honcho start @@ -64,9 +65,9 @@ Once the three 'Uvicorn running on ...' messages are printed, you can move to th Once Guardrails and the mock servers are up, we can use the [validate_mocks.sh](scripts/validate_mocks.sh) script to check all services are healthy and serving the expected model names. ```shell -# These commands must be run in the benchmark directory +# These commands must be run in the benchmark directory after activating the virtual environment -(benchmark) $ cd nemoguardrails/benchmark +(benchmark) $ cd benchmark (benchmark) $ scripts/validate_mocks.sh Starting LLM endpoint health check... @@ -116,6 +117,7 @@ Once the mocks and Guardrails are running and the script passes, we can issue cu ], "stream": false }' | jq + { "messages": [ { @@ -134,22 +136,22 @@ In this section, we'll examine the configuration files used in the quickstart ab ### Procfile -The [Procfile](Procfile?raw=true) contains all the processes that make up the application. +The [Procfile](Procfile) contains all the processes that make up the application. The Honcho package reads in this file, starts all the processes, and combines their logs to the console -The `gr` line runs the Guardrails server on port 9000 and sets the default Guardrails configuration as [content_safety_colang1](../examples/configs/content_safety_local?raw=true). +The `gr` line runs the Guardrails server on port 9000 and sets the default Guardrails configuration as [content_safety_local](../examples/configs/content_safety_local). The `app_llm` line runs the Application or Main Mock LLM. Guardrails calls this LLM to generate a response to the user's query. This server uses 4 uvicorn workers and runs on port 8000. The configuration file here is a Mock LLM configuration, not a Guardrails configuration. The `cs_llm` line runs the Content-Safety Mock LLM. 
This uses 4 uvicorn workers and runs on port 8001. ### Guardrails Configuration -The [Guardrails Configuration](../examples/configs/content_safety_local/config.yml?raw=true) is used by the Guardrails server. +The [Guardrails Configuration](../examples/configs/content_safety_local/config.yml) is used by the Guardrails server. Under the `models` section, the `main` model is used to generate responses to the user queries. The base URL for this model is the `app_llm` Mock LLM from the Procfile, running on port 8000. The `model` field has to match the Mock LLM model name. The `content_safety` model is configured for use in an input and output rail. The `type` field matches the `$model` used in the input and output flows. ### Mock LLM Endpoints The Mock LLM implements a subset of the OpenAI LLM API. -There are two Mock LLM configurations, one for the Mock [main model](mock_llm_server/configs/meta-llama-3.3-70b-instruct.env?raw=true), and another for the Mock [content-safety](mock_llm_server/configs/nvidia-llama-3.1-nemoguard-8b-content-safety.env?raw=true) model. +There are two Mock LLM configurations, one for the Mock [main model](mock_llm_server/configs/meta-llama-3.3-70b-instruct.env), and another for the Mock [content-safety](mock_llm_server/configs/nvidia-llama-3.1-nemoguard-8b-content-safety.env) model. The Mock LLM has the following OpenAI-compatible endpoints: * `/health`: Returns a JSON object with status set to healthy and timestamp in seconds-since-epoch. For example `{"status":"healthy","timestamp":1762781239}` From f74addbf053b6d142fc369ed4f645c14078ef096 Mon Sep 17 00:00:00 2001 From: tgasser-nv <200644301+tgasser-nv@users.noreply.github.com> Date: Mon, 5 Jan 2026 10:36:07 -0600 Subject: [PATCH 12/12] README.md cleanup --- benchmark/README.md | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/benchmark/README.md b/benchmark/README.md index 7e58a17c0..37f21fef5 100644 --- a/benchmark/README.md +++ b/benchmark/README.md @@ -26,16 +26,16 @@ In this section, you'll create a new virtual environment, activate it, and insta First you'll create the virtual environment and install dependencies. ```shell -# These commands must be run in the benchmark directory +# Create a virtual environment under ~/env/benchmark_env and activate it $ cd benchmark $ mkdir ~/env -$ python -m venv ~/env/benchmark +$ python -m venv ~/env/benchmark_env $ pip install -r requirements.txt ... Successfully installed fastapi-0.128.0 honcho-2.0.0 httpx-0.28.1 langchain-core-1.2.5 numpy-2.4.0 pydantic-2.12.5 pydantic-core-2.41.5 pydantic-settings-2.12.0 pyyaml-6.0.3 typer-0.21.0 typing-inspection-0.4.2 uuid-utils-0.12.0 uvicorn-0.40.0 -$ source ~/env/benchmark/bin/activate -(benchmark) $ +$ source ~/env/benchmark_env/bin/activate +(benchmark_env) $ ``` ### 2. Run Guardrails with Mock LLMs for Content-Safety and Application LLM @@ -45,10 +45,9 @@ As the Procfile processes spin up, they log to the console with a prefix. The `s Once the three 'Uvicorn running on ...' messages are printed, you can move to the next step. Note these messages are likely not on consecutive lines. 
```shell -# These commands must be run in the benchmark directory after activating the virtual environment +# These commands must be run in the benchmark directory after activating the benchmark_env virtual environment -(benchmark) $ cd benchmark -(benchmark) $ poetry run honcho start +(benchmark_env) $ honcho start 13:40:33 system | gr.1 started (pid=93634) 13:40:33 system | app_llm.1 started (pid=93635) 13:40:33 system | cs_llm.1 started (pid=93636) @@ -62,13 +61,14 @@ Once the three 'Uvicorn running on ...' messages are printed, you can move to th ### 3. Validate services are running correctly -Once Guardrails and the mock servers are up, we can use the [validate_mocks.sh](scripts/validate_mocks.sh) script to check all services are healthy and serving the expected model names. +Once Guardrails and the mock servers are up, we'll use the [validate_mocks.sh](scripts/validate_mocks.sh) script to validate everything is working. +This doesn't require the `benchmark_env` virtual environment since we're running curl commands in the script. ```shell -# These commands must be run in the benchmark directory after activating the virtual environment +# In a new shell, change into the benchmark directory and run these commands. -(benchmark) $ cd benchmark -(benchmark) $ scripts/validate_mocks.sh +$ cd benchmark +$ scripts/validate_mocks.sh Starting LLM endpoint health check... --- Checking Port: 8000 --- @@ -173,7 +173,7 @@ The latency of each response is also controllable, and works as follows: The full list of configuration fields is shown below: * `MODEL`: The Model name served by the Mock LLM. This will be returned on the `/v1/models` endpoint. -* `UNSAFE_PROBABILITY`: Probability of an unsafe response. This is a probability, and must be in the range [0, 1]. +* `UNSAFE_PROBABILITY`: Probability of an unsafe response. This must be in the range [0, 1]. * `UNSAFE_TEXT`: String returned as an unsafe response. * `SAFE_TEXT`: String returned as a safe response. * `LATENCY_MIN_SECONDS`: Minimum latency in seconds.
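Taken together, `UNSAFE_PROBABILITY` selects between `SAFE_TEXT` and `UNSAFE_TEXT`, while the `LATENCY_*` fields bound a latency value sampled from a normal distribution and clipped into the configured range. The sketch below illustrates that behaviour; it mirrors the `np.clip(...)` call and the `latency_seconds[0]` indexing from the `response_data.py` patch, but the standard deviation, the use of numpy's `Generator` API, and the argument names are assumptions rather than the actual implementation.

```python
# Rough sketch of the mock server's latency and safety draws, not the real code.
# The clip bounds and the [0] indexing mirror the response_data.py patch; the
# 0.1 s standard deviation and the Generator-based API are assumed here.
from typing import Optional

import numpy as np


def sample_latency_seconds(
    mean: float, min_s: float, max_s: float, seed: Optional[int] = None
) -> float:
    rng = np.random.default_rng(seed)
    # Draw a single sample from a normal distribution, then clip it into
    # [min_s, max_s], matching LATENCY_MIN_SECONDS / LATENCY_MAX_SECONDS.
    latency = np.clip(
        rng.normal(loc=mean, scale=0.1, size=1), a_min=min_s, a_max=max_s
    )
    # np.clip returns an array of shape (1,) here, hence the [0] before float().
    return float(latency[0])


def is_unsafe(unsafe_probability: float, seed: Optional[int] = None) -> bool:
    # Bernoulli draw: True means the mock should return UNSAFE_TEXT.
    rng = np.random.default_rng(seed)
    return bool(rng.random() < unsafe_probability)


if __name__ == "__main__":
    # Values in the spirit of the LATENCY_* and UNSAFE_PROBABILITY .env fields.
    print(sample_latency_seconds(mean=0.5, min_s=0.1, max_s=2.0, seed=0))
    print(is_unsafe(unsafe_probability=0.1, seed=0))
```

The explicit `[0]` matches the earlier patch in this series, which indexes the clipped size-one array before converting it to `float`.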