From 369397b8445197604dbc2b7ed9cdae20f2879d8b Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Thu, 2 Apr 2026 16:09:06 +0100 Subject: [PATCH 1/5] fix: address CodeRabbit AI review comments Critical and major fixes: - SSH tunnel cleanup: use pgrep instead of lsof for safer process targeting - Results path: search for test-metadata.json to extract core_config_name - Prometheus metrics: apply default filter before multiplication - Metrics collector: use dynamic timeout based on test duration - Setup script: add error handling for cd command - ITL metrics: fix metric name mismatch in Server_Metrics.py dashboard - Grafana dashboard: replace hardcoded datasource UID with template variable Co-Authored-By: Claude Sonnet 4.5 Signed-off-by: Maryam Tahhan --- .../ansible/llm-benchmark-auto.yml | 4 +++- .../ansible/publish-existing-results.yml | 18 +++++++++++++++++- .../templates/prometheus_metrics.j2 | 8 ++++---- .../vllm_metrics_collector/tasks/stop.yml | 9 +++++++++ .../test-execution/dashboard-examples/setup.sh | 2 +- ...\237\226\245\357\270\217_Server_Metrics.py" | 4 ++-- .../grafana/dashboards/vllm_comprehensive.json | 6 +++--- 7 files changed, 39 insertions(+), 12 deletions(-) diff --git a/automation/test-execution/ansible/llm-benchmark-auto.yml b/automation/test-execution/ansible/llm-benchmark-auto.yml index af8fb5ff..a75be5e5 100644 --- a/automation/test-execution/ansible/llm-benchmark-auto.yml +++ b/automation/test-execution/ansible/llm-benchmark-auto.yml @@ -486,6 +486,7 @@ tasks_from: stop vars: results_path: "{{ playbook_dir }}/../../../results/llm/{{ test_model | replace('/', '__') }}/{{ workload_type }}-{{ hostvars['localhost']['test_run_id'] }}/{{ core_configuration.name }}" + metrics_collection_duration: "{{ hostvars['localhost']['estimated_test_duration'] | default(60) }}" enable_vllm_metrics_collection: true delegate_to: localhost become: false @@ -723,7 +724,8 @@ - block: - name: Get tunnel PID ansible.builtin.shell: | - lsof -ti :8000 -sTCP:LISTEN | head -1 + # Only kill SSH tunnel processes, not vLLM server or other services + pgrep -f "ssh.*-L.*8000:localhost:8000" | head -1 register: tunnel_pid_check failed_when: false changed_when: false diff --git a/automation/test-execution/ansible/publish-existing-results.yml b/automation/test-execution/ansible/publish-existing-results.yml index e02177ef..48515684 100644 --- a/automation/test-execution/ansible/publish-existing-results.yml +++ b/automation/test-execution/ansible/publish-existing-results.yml @@ -16,17 +16,33 @@ - requested_cores is defined fail_msg: "Missing required variables. Provide: test_model, workload_type, test_run_id, requested_cores" + - name: Search for test-metadata.json to find core_config_name + ansible.builtin.find: + paths: "{{ playbook_dir }}/../../../results/llm/{{ test_model | replace('/', '__') }}/{{ workload_type }}-{{ test_run_id }}" + patterns: "test-metadata.json" + recurse: true + register: metadata_search + when: core_config_name is not defined + + - name: Load core_config_name from metadata if found + ansible.builtin.set_fact: + core_config_name: "{{ (lookup('file', metadata_search.files[0].path) | from_json).core_config_name }}" + when: + - core_config_name is not defined + - metadata_search.matched > 0 + - name: Set core configuration if not provided ansible.builtin.set_fact: core_configuration: cores: "{{ requested_cores }}" tensor_parallel: 1 numa_node: 0 + name: "{{ core_config_name | default(requested_cores | string + 'cores-numa0-tp1') }}" when: core_configuration is not defined - name: Build results path ansible.builtin.set_fact: - results_path: "{{ playbook_dir }}/../../../results/llm/{{ test_model | replace('/', '__') }}/{{ workload_type }}-{{ test_run_id }}/{{ requested_cores }}cores-numa{{ core_configuration.numa_node | default(0) }}-tp{{ core_configuration.tensor_parallel | default(1) }}" + results_path: "{{ playbook_dir }}/../../../results/llm/{{ test_model | replace('/', '__') }}/{{ workload_type }}-{{ test_run_id }}/{{ core_configuration.name | default(core_config_name) | default(requested_cores | string + 'cores-numa' + (core_configuration.numa_node | default(0) | string) + '-tp' + (core_configuration.tensor_parallel | default(1) | string)) }}" - name: Display publishing configuration ansible.builtin.debug: diff --git a/automation/test-execution/ansible/roles/metrics_publisher/templates/prometheus_metrics.j2 b/automation/test-execution/ansible/roles/metrics_publisher/templates/prometheus_metrics.j2 index 11187631..727143f3 100644 --- a/automation/test-execution/ansible/roles/metrics_publisher/templates/prometheus_metrics.j2 +++ b/automation/test-execution/ansible/roles/metrics_publisher/templates/prometheus_metrics.j2 @@ -31,10 +31,10 @@ guidellm_itl_ms_p95{ {{ labels }} } {{ benchmark.metrics.inter_token_latency_ms. guidellm_itl_ms_p99{ {{ labels }} } {{ benchmark.metrics.inter_token_latency_ms.successful.percentiles.p99 | default(0) }} # End-to-End Request Latency in milliseconds (convert from seconds) -guidellm_e2e_latency_ms_mean{ {{ labels }} } {{ (benchmark.metrics.request_latency.successful.mean * 1000) | default(0) }} -guidellm_e2e_latency_ms_p50{ {{ labels }} } {{ (benchmark.metrics.request_latency.successful.percentiles.p50 * 1000) | default(0) }} -guidellm_e2e_latency_ms_p95{ {{ labels }} } {{ (benchmark.metrics.request_latency.successful.percentiles.p95 * 1000) | default(0) }} -guidellm_e2e_latency_ms_p99{ {{ labels }} } {{ (benchmark.metrics.request_latency.successful.percentiles.p99 * 1000) | default(0) }} +guidellm_e2e_latency_ms_mean{ {{ labels }} } {{ (benchmark.metrics.request_latency.successful.mean | default(0)) * 1000 }} +guidellm_e2e_latency_ms_p50{ {{ labels }} } {{ (benchmark.metrics.request_latency.successful.percentiles.p50 | default(0)) * 1000 }} +guidellm_e2e_latency_ms_p95{ {{ labels }} } {{ (benchmark.metrics.request_latency.successful.percentiles.p95 | default(0)) * 1000 }} +guidellm_e2e_latency_ms_p99{ {{ labels }} } {{ (benchmark.metrics.request_latency.successful.percentiles.p99 | default(0)) * 1000 }} # Request Statistics guidellm_total_requests{ {{ labels }} } {{ benchmark.metrics.request_totals.total | default(0) }} diff --git a/automation/test-execution/ansible/roles/vllm_metrics_collector/tasks/stop.yml b/automation/test-execution/ansible/roles/vllm_metrics_collector/tasks/stop.yml index 63a25a51..d8326a7a 100644 --- a/automation/test-execution/ansible/roles/vllm_metrics_collector/tasks/stop.yml +++ b/automation/test-execution/ansible/roles/vllm_metrics_collector/tasks/stop.yml @@ -1,6 +1,15 @@ --- # Stop vLLM Metrics Collector +- name: Wait for metrics collection to complete + ansible.builtin.wait_for: + path: "{{ results_path }}/vllm-metrics.json" + timeout: "{{ (metrics_collection_duration | default(60) | int) + 30 }}" + delegate_to: localhost + become: false + ignore_errors: true + when: enable_vllm_metrics_collection | default(true) + - name: Send SIGTERM to metrics collector for graceful shutdown ansible.builtin.shell: | if ps -p {{ vllm_metrics_collector_pid }} > /dev/null 2>&1; then diff --git a/automation/test-execution/dashboard-examples/setup.sh b/automation/test-execution/dashboard-examples/setup.sh index cff972db..c63eeb9e 100755 --- a/automation/test-execution/dashboard-examples/setup.sh +++ b/automation/test-execution/dashboard-examples/setup.sh @@ -3,7 +3,7 @@ # Safe to run multiple times SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -cd "$SCRIPT_DIR" +cd "$SCRIPT_DIR" || { echo "Error: Cannot change to script directory"; exit 1; } echo "Setting up Python virtual environment for dashboards..." echo "" diff --git "a/automation/test-execution/dashboard-examples/vllm_dashboard/pages/2_\360\237\226\245\357\270\217_Server_Metrics.py" "b/automation/test-execution/dashboard-examples/vllm_dashboard/pages/2_\360\237\226\245\357\270\217_Server_Metrics.py" index 49f7ed33..9b872658 100644 --- "a/automation/test-execution/dashboard-examples/vllm_dashboard/pages/2_\360\237\226\245\357\270\217_Server_Metrics.py" +++ "b/automation/test-execution/dashboard-examples/vllm_dashboard/pages/2_\360\237\226\245\357\270\217_Server_Metrics.py" @@ -384,8 +384,8 @@ def mean_metric(sample: Dict, metric_name: str) -> float: itl_sum = sum_metric(s, 'vllm:request_time_per_output_token_seconds_sum') itl_count = sum_metric(s, 'vllm:request_time_per_output_token_seconds_count') if i > 0 and itl_count > 0: - prev_sum = sum_metric(samples[i-1], 'vllm:time_per_output_token_seconds_sum') - prev_count = sum_metric(samples[i-1], 'vllm:time_per_output_token_seconds_count') + prev_sum = sum_metric(samples[i-1], 'vllm:request_time_per_output_token_seconds_sum') + prev_count = sum_metric(samples[i-1], 'vllm:request_time_per_output_token_seconds_count') delta_sum = itl_sum - prev_sum delta_count = itl_count - prev_count itl_latency.append((delta_sum / delta_count * 1000) if delta_count > 0 else 0) # Convert to ms diff --git a/automation/test-execution/grafana/dashboards/vllm_comprehensive.json b/automation/test-execution/grafana/dashboards/vllm_comprehensive.json index 3deeb0bc..b58882c9 100644 --- a/automation/test-execution/grafana/dashboards/vllm_comprehensive.json +++ b/automation/test-execution/grafana/dashboards/vllm_comprehensive.json @@ -1247,7 +1247,7 @@ { "datasource": { "type": "prometheus", - "uid": "edx8memhpd9tsa" + "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "code", @@ -1347,7 +1347,7 @@ { "datasource": { "type": "prometheus", - "uid": "edx8memhpd9tsa" + "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "code", @@ -1460,7 +1460,7 @@ { "datasource": { "type": "prometheus", - "uid": "edx8memhpd9tsa" + "uid": "${DS_PROMETHEUS}" }, "disableTextWrap": false, "editorMode": "code", From f368a9d5a6242be6f3f56bae0a2f8590cb729235 Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Thu, 2 Apr 2026 16:15:33 +0100 Subject: [PATCH 2/5] fix: remove hardcoded values and fix cleanup script paths - Remove hardcoded EC2 hostnames and SSH key path from setup-tunnels.sh - Fix cleanup-collector-scripts.sh to search /tmp instead of results/llm - Update pattern from collect_vllm_metrics.py to collect_vllm_metrics_*.py These were security concerns flagged by CodeRabbit AI review. Co-Authored-By: Claude Sonnet 4.5 Signed-off-by: Maryam Tahhan --- .../grafana/scripts/setup-tunnels.sh | 4 ++-- .../utilities/cleanup-collector-scripts.sh | 24 +++++++++---------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/automation/test-execution/grafana/scripts/setup-tunnels.sh b/automation/test-execution/grafana/scripts/setup-tunnels.sh index d34eff0e..0eab73e7 100755 --- a/automation/test-execution/grafana/scripts/setup-tunnels.sh +++ b/automation/test-execution/grafana/scripts/setup-tunnels.sh @@ -65,7 +65,7 @@ setup_pushgateway_tunnel() { echo " LOADGEN localhost:${PUSHGATEWAY_PORT} → Your machine localhost:${PUSHGATEWAY_PORT}" # Kill existing tunnel if any - pkill -f "ssh.*${LOADGEN_HOSTNAME}.*${PUSHGATEWAY_PORT}:localhost:${PUSHGATEWAY_PORT}" 2>/dev/null || true + pkill -f "ssh.*${PUSHGATEWAY_PORT}:localhost:${PUSHGATEWAY_PORT}.*${LOADGEN_HOSTNAME}" 2>/dev/null || true # Create reverse tunnel in background ssh -i "${ANSIBLE_SSH_KEY}" \ @@ -79,7 +79,7 @@ setup_pushgateway_tunnel() { sleep 2 # Verify tunnel - if pgrep -f "ssh.*${LOADGEN_HOSTNAME}.*${PUSHGATEWAY_PORT}:localhost:${PUSHGATEWAY_PORT}" > /dev/null; then + if pgrep -f "ssh.*${PUSHGATEWAY_PORT}:localhost:${PUSHGATEWAY_PORT}.*${LOADGEN_HOSTNAME}" > /dev/null; then echo -e "${GREEN}✓ Pushgateway reverse tunnel established${NC}" echo " Ansible on LOADGEN can now push to: http://localhost:${PUSHGATEWAY_PORT}" else diff --git a/automation/utilities/cleanup-collector-scripts.sh b/automation/utilities/cleanup-collector-scripts.sh index 3409aeb9..5c4474dd 100755 --- a/automation/utilities/cleanup-collector-scripts.sh +++ b/automation/utilities/cleanup-collector-scripts.sh @@ -1,36 +1,36 @@ #!/bin/bash -# Cleanup script to remove orphaned collect_vllm_metrics.py files from results directories -# These files were created by an older version of the vllm_metrics_collector role +# Cleanup script to remove orphaned collect_vllm_metrics_*.py files from /tmp +# These files are created by the vllm_metrics_collector role and may persist if tests are interrupted -RESULTS_DIR="${1:-results/llm}" +TEMP_DIR="${1:-/tmp}" -if [ ! -d "$RESULTS_DIR" ]; then - echo "Error: Results directory not found: $RESULTS_DIR" - echo "Usage: $0 [results_directory]" +if [ ! -d "$TEMP_DIR" ]; then + echo "Error: Directory not found: $TEMP_DIR" + echo "Usage: $0 [temp_directory]" exit 1 fi -echo "Cleaning up collect_vllm_metrics.py files from results directories..." -echo "Searching in: $RESULTS_DIR" +echo "Cleaning up collect_vllm_metrics_*.py files from temporary directory..." +echo "Searching in: $TEMP_DIR" echo # Find and count scripts -SCRIPT_COUNT=$(find "$RESULTS_DIR" -name "collect_vllm_metrics.py" -type f | wc -l) +SCRIPT_COUNT=$(find "$TEMP_DIR" -maxdepth 1 -name "collect_vllm_metrics_*.py" -type f | wc -l) if [ "$SCRIPT_COUNT" -eq 0 ]; then - echo "✓ No orphaned scripts found. Results directories are clean!" + echo "✓ No orphaned scripts found. Temporary directory is clean!" exit 0 fi echo "Found $SCRIPT_COUNT orphaned script(s):" -find "$RESULTS_DIR" -name "collect_vllm_metrics.py" -type f +find "$TEMP_DIR" -maxdepth 1 -name "collect_vllm_metrics_*.py" -type f echo read -p "Delete these files? (y/N): " -n 1 -r echo if [[ $REPLY =~ ^[Yy]$ ]]; then - find "$RESULTS_DIR" -name "collect_vllm_metrics.py" -type f -delete + find "$TEMP_DIR" -maxdepth 1 -name "collect_vllm_metrics_*.py" -type f -delete echo "✓ Deleted $SCRIPT_COUNT file(s)" # Calculate space saved (rough estimate: ~5KB per script) From 492bda906169f2f5af07436d63a04c681221fe7c Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Thu, 2 Apr 2026 16:33:26 +0100 Subject: [PATCH 3/5] fix: improve Ansible playbook portability and documentation accuracy - Add Docker Compose CLI v2 (docker compose) detection to start/stop-grafana playbooks - Replace Linux-only port probing (ss/netstat) with portable wait_for in start-grafana - Fix stop-grafana to use OS-specific compose files matching start-grafana behavior - Make datasource version bump deterministic instead of random - Enhance SSH tunnel detection to include hostname for precise matching - Improve container log collection to detect more failure scenarios - Fix dashboard-examples README contradictions about vllm-metrics.json source - Update default results path from ../../../../../results/llm to ../../../../results/llm - Make grafana README Prometheus config OS-agnostic with platform-specific notes Co-Authored-By: Claude Sonnet 4.5 Signed-off-by: Maryam Tahhan --- .../test-execution/ansible/start-grafana.yml | 67 ++++++++++++------- .../test-execution/ansible/stop-grafana.yml | 25 +++++-- .../dashboard-examples/README.md | 8 +-- automation/test-execution/grafana/README.md | 7 +- 4 files changed, 74 insertions(+), 33 deletions(-) diff --git a/automation/test-execution/ansible/start-grafana.yml b/automation/test-execution/ansible/start-grafana.yml index c2758fd9..ad191be2 100644 --- a/automation/test-execution/ansible/start-grafana.yml +++ b/automation/test-execution/ansible/start-grafana.yml @@ -66,6 +66,12 @@ failed_when: false changed_when: false + - name: Check for docker compose (CLI v2) + ansible.builtin.command: docker compose version + register: docker_cli_compose_check + failed_when: false + changed_when: false + - name: Check for podman-compose ansible.builtin.command: podman-compose --version register: podman_compose_check @@ -78,24 +84,34 @@ runtime_name: Docker when: docker_compose_check.rc == 0 + - name: Set compose command (docker compose CLI v2) + ansible.builtin.set_fact: + compose_cmd: docker compose + runtime_name: Docker + when: + - docker_compose_check.rc != 0 + - docker_cli_compose_check.rc == 0 + - name: Set compose command (podman-compose) ansible.builtin.set_fact: compose_cmd: podman-compose runtime_name: Podman when: - docker_compose_check.rc != 0 + - docker_cli_compose_check.rc != 0 - podman_compose_check.rc == 0 - name: Fail if no compose tool found ansible.builtin.fail: msg: | - Neither docker-compose nor podman-compose found! + Neither docker-compose, docker compose, nor podman-compose found! Install one of the following: - - Docker Desktop (includes docker-compose) + - Docker Desktop (includes docker-compose or docker compose) - podman-compose: pip3 install podman-compose when: - docker_compose_check.rc != 0 + - docker_cli_compose_check.rc != 0 - podman_compose_check.rc != 0 when: compose_cmd_override == "" @@ -144,52 +160,48 @@ when: prometheus_port_env != "" - name: Check if preferred Prometheus port is in use - ansible.builtin.shell: | - ss -tuln | grep -E ':{{ prometheus_port_preferred }}\s' || netstat -tuln 2>/dev/null | grep -E ':{{ prometheus_port_preferred }}\s' || echo "available" + ansible.builtin.wait_for: + port: "{{ prometheus_port_preferred }}" + state: stopped + timeout: 1 register: preferred_port_check - changed_when: false failed_when: false when: prometheus_port_env == "" - name: Check if fallback Prometheus port is in use - ansible.builtin.shell: | - ss -tuln | grep -E ':{{ prometheus_port_fallback }}\s' || netstat -tuln 2>/dev/null | grep -E ':{{ prometheus_port_fallback }}\s' || echo "available" + ansible.builtin.wait_for: + port: "{{ prometheus_port_fallback }}" + state: stopped + timeout: 1 register: fallback_port_check - changed_when: false failed_when: false - when: prometheus_port_env == "" and preferred_port_check.stdout is defined and "available" not in preferred_port_check.stdout + when: prometheus_port_env == "" and preferred_port_check.failed - name: Set Prometheus port to preferred ansible.builtin.set_fact: prometheus_port: "{{ prometheus_port_preferred }}" - when: prometheus_port_env == "" and preferred_port_check.stdout is defined and "available" in preferred_port_check.stdout + when: prometheus_port_env == "" and not preferred_port_check.failed - name: Set Prometheus port to fallback ansible.builtin.set_fact: prometheus_port: "{{ prometheus_port_fallback }}" - when: prometheus_port_env == "" and preferred_port_check.stdout is defined and "available" not in preferred_port_check.stdout and fallback_port_check.stdout is defined and "available" in fallback_port_check.stdout + when: prometheus_port_env == "" and preferred_port_check.failed and not fallback_port_check.failed - name: Fail if no Prometheus port available ansible.builtin.fail: msg: | Neither port {{ prometheus_port_preferred }} nor {{ prometheus_port_fallback }} are available! - Current port usage: - - Port {{ prometheus_port_preferred }}: IN USE - {{ preferred_port_check.stdout }} - - Port {{ prometheus_port_fallback }}: IN USE - {{ fallback_port_check.stdout }} - Please free one of these ports or stop existing Prometheus services: sudo lsof -i :{{ prometheus_port_preferred }} sudo lsof -i :{{ prometheus_port_fallback }} - when: prometheus_port_env == "" and preferred_port_check.stdout is defined and "available" not in preferred_port_check.stdout and fallback_port_check.stdout is defined and "available" not in fallback_port_check.stdout + when: prometheus_port_env == "" and preferred_port_check.failed and fallback_port_check.failed - name: Display selected Prometheus port ansible.builtin.debug: msg: - - "{{ '✓ Using preferred Prometheus port: ' + prometheus_port_preferred|string if 'available' in preferred_port_check.stdout else '⚠ Port ' + prometheus_port_preferred|string + ' in use, using fallback port: ' + prometheus_port_fallback|string }}" - when: prometheus_port_env == "" and preferred_port_check.stdout is defined + - "{{ '✓ Using preferred Prometheus port: ' + prometheus_port_preferred|string if not preferred_port_check.failed else '⚠ Port ' + prometheus_port_preferred|string + ' in use, using fallback port: ' + prometheus_port_fallback|string }}" + when: prometheus_port_env == "" and preferred_port_check is defined - name: Display detected configuration ansible.builtin.debug: @@ -240,11 +252,18 @@ when: is_macos register: datasource_update_macos + - name: Read current datasource version + ansible.builtin.shell: | + grep -oP 'version: \K\d+' "{{ grafana_dir }}/provisioning/datasources/prometheus.yaml" || echo "1" + register: current_version + changed_when: false + when: datasource_update_linux.changed or datasource_update_macos.changed + - name: Bump datasource version to force reload ansible.builtin.replace: path: "{{ grafana_dir }}/provisioning/datasources/prometheus.yaml" regexp: "version: \\d+" - replace: "version: {{ 99 | random(start=10) }}" + replace: "version: {{ current_version.stdout | int + 1 }}" when: datasource_update_linux.changed or datasource_update_macos.changed - name: Display datasource configuration @@ -287,7 +306,7 @@ register: container_logs changed_when: false failed_when: false - when: "'Up' not in container_status.stdout and 'running' not in container_status.stdout" + when: compose_up.rc != 0 or "Exited" in container_status.stdout or ("Up" not in container_status.stdout and "running" not in container_status.stdout) - name: Display compose output ansible.builtin.debug: @@ -404,7 +423,7 @@ block: - name: Check if tunnel already exists ansible.builtin.shell: | - ps aux | grep "ssh.*8000:localhost:8000" | grep -v grep || true + ps aux | grep "ssh.*8000:localhost:8000.*{{ dut_hostname }}" | grep -v grep || true register: existing_tunnel changed_when: false @@ -425,7 +444,7 @@ - name: Verify tunnel is active ansible.builtin.shell: | - ps aux | grep "ssh.*8000:localhost:8000" | grep -v grep + ps aux | grep "ssh.*8000:localhost:8000.*{{ dut_hostname }}" | grep -v grep register: tunnel_status changed_when: false failed_when: false diff --git a/automation/test-execution/ansible/stop-grafana.yml b/automation/test-execution/ansible/stop-grafana.yml index f451b866..36a45a3a 100644 --- a/automation/test-execution/ansible/stop-grafana.yml +++ b/automation/test-execution/ansible/stop-grafana.yml @@ -16,6 +16,7 @@ vars: grafana_dir: "{{ playbook_dir }}/../grafana" compose_project_name: vllm-monitoring + compose_file: "{{ 'docker-compose.macos.yml' if ansible_os_family == 'Darwin' else 'docker-compose.yml' }}" # remove_volumes can be passed via -e "remove_volumes=true" remove_volumes_flag: "{{ remove_volumes | default(false) | bool }}" @@ -32,6 +33,12 @@ failed_when: false changed_when: false + - name: Check for docker compose (CLI v2) + ansible.builtin.command: docker compose version + register: docker_cli_compose_check + failed_when: false + changed_when: false + - name: Check for podman-compose ansible.builtin.command: podman-compose --version register: podman_compose_check @@ -44,19 +51,29 @@ runtime_name: Docker when: docker_compose_check.rc == 0 + - name: Set compose command (docker compose CLI v2) + ansible.builtin.set_fact: + compose_cmd: docker compose + runtime_name: Docker + when: + - docker_compose_check.rc != 0 + - docker_cli_compose_check.rc == 0 + - name: Set compose command (podman-compose) ansible.builtin.set_fact: compose_cmd: podman-compose runtime_name: Podman when: - docker_compose_check.rc != 0 + - docker_cli_compose_check.rc != 0 - podman_compose_check.rc == 0 - name: Fail if no compose tool found ansible.builtin.fail: - msg: "Neither docker-compose nor podman-compose found!" + msg: "Neither docker-compose, docker compose, nor podman-compose found!" when: - docker_compose_check.rc != 0 + - docker_cli_compose_check.rc != 0 - podman_compose_check.rc != 0 - name: Display detected configuration @@ -119,14 +136,14 @@ block: - name: Stop Grafana stack (preserving volumes) ansible.builtin.command: - cmd: "{{ compose_cmd }} down" + cmd: "{{ compose_cmd }} -f {{ compose_file }} down" chdir: "{{ grafana_dir }}" register: compose_down when: not remove_volumes_flag - name: Stop Grafana stack (removing volumes) ansible.builtin.command: - cmd: "{{ compose_cmd }} down -v" + cmd: "{{ compose_cmd }} -f {{ compose_file }} down -v" chdir: "{{ grafana_dir }}" register: compose_down_volumes when: remove_volumes_flag @@ -139,7 +156,7 @@ - name: Verify containers stopped ansible.builtin.command: - cmd: "{{ compose_cmd }} ps" + cmd: "{{ compose_cmd }} -f {{ compose_file }} ps" chdir: "{{ grafana_dir }}" register: compose_ps_final changed_when: false diff --git a/automation/test-execution/dashboard-examples/README.md b/automation/test-execution/dashboard-examples/README.md index 26547f2a..aed12337 100644 --- a/automation/test-execution/dashboard-examples/README.md +++ b/automation/test-execution/dashboard-examples/README.md @@ -30,7 +30,7 @@ This Streamlit dashboard provides comprehensive analysis of vLLM benchmark resul **Server-Side Metrics (vLLM)** - Internal server state - Queue depth, cache usage, token generation rates -- Source: `vllm-metrics.json` (collected via Prometheus) +- Source: `vllm-metrics.json` (scraped directly from `/metrics` endpoint) **Architecture:** ``` @@ -38,7 +38,7 @@ Benchmark Run ↓ GuideLLM → benchmarks.json (client metrics) ↓ -vLLM Server → Prometheus → vllm-metrics.json (server metrics) +vLLM Server /metrics → vllm-metrics.json (server metrics) ↓ Streamlit Dashboard → Analysis + Visualization ``` @@ -218,7 +218,7 @@ ansible-playbook start-grafana.yml results/llm/model-name/test-date/config/ ├── benchmarks.json # GuideLLM results (client-side) ├── test-metadata.json # Test configuration -├── vllm-metrics.json # vLLM server metrics (from Prometheus) +├── vllm-metrics.json # vLLM server metrics (scraped from /metrics) └── vllm-server.log # Server logs ``` @@ -267,7 +267,7 @@ results/llm/ └── vllm-server.log ← Server logs ``` -**Default path:** `../../../../../results/llm` (relative to dashboard pages) +**Default path:** `../../../../results/llm` (relative to dashboard pages) ## Configuration diff --git a/automation/test-execution/grafana/README.md b/automation/test-execution/grafana/README.md index 264be142..ed0378b3 100644 --- a/automation/test-execution/grafana/README.md +++ b/automation/test-execution/grafana/README.md @@ -324,7 +324,9 @@ scrape_configs: - job_name: 'vllm-live' scrape_interval: 10s static_configs: - - targets: ['host.containers.internal:8000'] + # macOS: use host.containers.internal:8000 + # Linux: use localhost:8000 + - targets: [':8000'] metric_relabel_configs: - source_labels: [__name__] regex: 'vllm_.*' @@ -336,6 +338,9 @@ scrape_configs: - **Scrape interval:** 10s for real-time monitoring - **Retention:** 30 days by default - **Storage:** Local volume (persists between restarts) +- **Target host:** Platform-specific + - macOS: Use `host.containers.internal:8000` (bridge networking) + - Linux: Use `localhost:8000` (host networking) ### Customization From e18f60ac7fe70bca48784c10f83a6663016354e1 Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Thu, 2 Apr 2026 16:48:39 +0100 Subject: [PATCH 4/5] fix: align compose tool priority and improve Grafana container management - Check podman-compose FIRST in both start/stop playbooks - Ensures containers started with one tool are stopped with the same tool - Fix stop-grafana to detect actual container runtime managing containers - Priority order: podman-compose > docker-compose > docker compose CLI v2 Co-Authored-By: Claude Sonnet 4.5 Signed-off-by: Maryam Tahhan --- .../test-execution/ansible/start-grafana.yml | 66 ++++++++++--------- .../test-execution/ansible/stop-grafana.yml | 34 +++++----- 2 files changed, 52 insertions(+), 48 deletions(-) diff --git a/automation/test-execution/ansible/start-grafana.yml b/automation/test-execution/ansible/start-grafana.yml index ad191be2..3ef159ef 100644 --- a/automation/test-execution/ansible/start-grafana.yml +++ b/automation/test-execution/ansible/start-grafana.yml @@ -60,6 +60,12 @@ - name: Auto-detect compose tool block: + - name: Check for podman-compose + ansible.builtin.command: podman-compose --version + register: podman_compose_check + failed_when: false + changed_when: false + - name: Check for docker-compose ansible.builtin.command: docker-compose --version register: docker_compose_check @@ -72,47 +78,41 @@ failed_when: false changed_when: false - - name: Check for podman-compose - ansible.builtin.command: podman-compose --version - register: podman_compose_check - failed_when: false - changed_when: false + - name: Set compose command (podman-compose) + ansible.builtin.set_fact: + compose_cmd: podman-compose + runtime_name: Podman + when: podman_compose_check.rc == 0 - name: Set compose command (docker-compose) ansible.builtin.set_fact: compose_cmd: docker-compose runtime_name: Docker - when: docker_compose_check.rc == 0 + when: + - podman_compose_check.rc != 0 + - docker_compose_check.rc == 0 - name: Set compose command (docker compose CLI v2) ansible.builtin.set_fact: compose_cmd: docker compose runtime_name: Docker when: + - podman_compose_check.rc != 0 - docker_compose_check.rc != 0 - docker_cli_compose_check.rc == 0 - - name: Set compose command (podman-compose) - ansible.builtin.set_fact: - compose_cmd: podman-compose - runtime_name: Podman - when: - - docker_compose_check.rc != 0 - - docker_cli_compose_check.rc != 0 - - podman_compose_check.rc == 0 - - name: Fail if no compose tool found ansible.builtin.fail: msg: | - Neither docker-compose, docker compose, nor podman-compose found! + Neither podman-compose, docker-compose, nor docker compose found! Install one of the following: - Docker Desktop (includes docker-compose or docker compose) - podman-compose: pip3 install podman-compose when: + - podman_compose_check.rc != 0 - docker_compose_check.rc != 0 - docker_cli_compose_check.rc != 0 - - podman_compose_check.rc != 0 when: compose_cmd_override == "" # ======================================================================== @@ -160,48 +160,52 @@ when: prometheus_port_env != "" - name: Check if preferred Prometheus port is in use - ansible.builtin.wait_for: - port: "{{ prometheus_port_preferred }}" - state: stopped - timeout: 1 + ansible.builtin.shell: | + ss -tuln | grep -E ':{{ prometheus_port_preferred }}\s' || netstat -tuln 2>/dev/null | grep -E ':{{ prometheus_port_preferred }}\s' || echo "available" register: preferred_port_check + changed_when: false failed_when: false when: prometheus_port_env == "" - name: Check if fallback Prometheus port is in use - ansible.builtin.wait_for: - port: "{{ prometheus_port_fallback }}" - state: stopped - timeout: 1 + ansible.builtin.shell: | + ss -tuln | grep -E ':{{ prometheus_port_fallback }}\s' || netstat -tuln 2>/dev/null | grep -E ':{{ prometheus_port_fallback }}\s' || echo "available" register: fallback_port_check + changed_when: false failed_when: false - when: prometheus_port_env == "" and preferred_port_check.failed + when: prometheus_port_env == "" and preferred_port_check.stdout is defined and "available" not in preferred_port_check.stdout - name: Set Prometheus port to preferred ansible.builtin.set_fact: prometheus_port: "{{ prometheus_port_preferred }}" - when: prometheus_port_env == "" and not preferred_port_check.failed + when: prometheus_port_env == "" and preferred_port_check.stdout is defined and "available" in preferred_port_check.stdout - name: Set Prometheus port to fallback ansible.builtin.set_fact: prometheus_port: "{{ prometheus_port_fallback }}" - when: prometheus_port_env == "" and preferred_port_check.failed and not fallback_port_check.failed + when: prometheus_port_env == "" and preferred_port_check.stdout is defined and "available" not in preferred_port_check.stdout and fallback_port_check.stdout is defined and "available" in fallback_port_check.stdout - name: Fail if no Prometheus port available ansible.builtin.fail: msg: | Neither port {{ prometheus_port_preferred }} nor {{ prometheus_port_fallback }} are available! + Current port usage: + - Port {{ prometheus_port_preferred }}: IN USE + {{ preferred_port_check.stdout }} + - Port {{ prometheus_port_fallback }}: IN USE + {{ fallback_port_check.stdout }} + Please free one of these ports or stop existing Prometheus services: sudo lsof -i :{{ prometheus_port_preferred }} sudo lsof -i :{{ prometheus_port_fallback }} - when: prometheus_port_env == "" and preferred_port_check.failed and fallback_port_check.failed + when: prometheus_port_env == "" and preferred_port_check.stdout is defined and "available" not in preferred_port_check.stdout and fallback_port_check.stdout is defined and "available" not in fallback_port_check.stdout - name: Display selected Prometheus port ansible.builtin.debug: msg: - - "{{ '✓ Using preferred Prometheus port: ' + prometheus_port_preferred|string if not preferred_port_check.failed else '⚠ Port ' + prometheus_port_preferred|string + ' in use, using fallback port: ' + prometheus_port_fallback|string }}" - when: prometheus_port_env == "" and preferred_port_check is defined + - "{{ '✓ Using preferred Prometheus port: ' + prometheus_port_preferred|string if 'available' in preferred_port_check.stdout else '⚠ Port ' + prometheus_port_preferred|string + ' in use, using fallback port: ' + prometheus_port_fallback|string }}" + when: prometheus_port_env == "" and preferred_port_check.stdout is defined - name: Display detected configuration ansible.builtin.debug: diff --git a/automation/test-execution/ansible/stop-grafana.yml b/automation/test-execution/ansible/stop-grafana.yml index 36a45a3a..4bd534e7 100644 --- a/automation/test-execution/ansible/stop-grafana.yml +++ b/automation/test-execution/ansible/stop-grafana.yml @@ -27,6 +27,12 @@ - name: Detect available compose tool block: + - name: Check for podman-compose + ansible.builtin.command: podman-compose --version + register: podman_compose_check + failed_when: false + changed_when: false + - name: Check for docker-compose ansible.builtin.command: docker-compose --version register: docker_compose_check @@ -39,42 +45,36 @@ failed_when: false changed_when: false - - name: Check for podman-compose - ansible.builtin.command: podman-compose --version - register: podman_compose_check - failed_when: false - changed_when: false + - name: Set compose command (podman-compose) + ansible.builtin.set_fact: + compose_cmd: podman-compose + runtime_name: Podman + when: podman_compose_check.rc == 0 - name: Set compose command (docker-compose) ansible.builtin.set_fact: compose_cmd: docker-compose runtime_name: Docker - when: docker_compose_check.rc == 0 + when: + - podman_compose_check.rc != 0 + - docker_compose_check.rc == 0 - name: Set compose command (docker compose CLI v2) ansible.builtin.set_fact: compose_cmd: docker compose runtime_name: Docker when: + - podman_compose_check.rc != 0 - docker_compose_check.rc != 0 - docker_cli_compose_check.rc == 0 - - name: Set compose command (podman-compose) - ansible.builtin.set_fact: - compose_cmd: podman-compose - runtime_name: Podman - when: - - docker_compose_check.rc != 0 - - docker_cli_compose_check.rc != 0 - - podman_compose_check.rc == 0 - - name: Fail if no compose tool found ansible.builtin.fail: - msg: "Neither docker-compose, docker compose, nor podman-compose found!" + msg: "Neither podman-compose, docker-compose, nor docker compose found!" when: + - podman_compose_check.rc != 0 - docker_compose_check.rc != 0 - docker_cli_compose_check.rc != 0 - - podman_compose_check.rc != 0 - name: Display detected configuration ansible.builtin.debug: From bb64be7ad58ea5fa223c53527dfd061bc7160b71 Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Thu, 2 Apr 2026 17:30:15 +0100 Subject: [PATCH 5/5] fix: update vLLM dashboard to use correct datasource UID Changed datasource UID from "edx8memhpd9tsa" to "prometheus-vllm" to match the actual provisioned datasource defined in prometheus.yaml. This fixes Grafana not displaying metrics despite Prometheus successfully scraping vLLM. Co-Authored-By: Claude Sonnet 4.5 Signed-off-by: Maryam Tahhan --- .../grafana/dashboards/vllm_comprehensive.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/automation/test-execution/grafana/dashboards/vllm_comprehensive.json b/automation/test-execution/grafana/dashboards/vllm_comprehensive.json index b58882c9..bcefa551 100644 --- a/automation/test-execution/grafana/dashboards/vllm_comprehensive.json +++ b/automation/test-execution/grafana/dashboards/vllm_comprehensive.json @@ -1247,7 +1247,7 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus-vllm" }, "disableTextWrap": false, "editorMode": "code", @@ -1347,7 +1347,7 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus-vllm" }, "disableTextWrap": false, "editorMode": "code", @@ -1460,7 +1460,7 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus-vllm" }, "disableTextWrap": false, "editorMode": "code",