From 7c06cbc739c05f272a3987a866ecd6c7d04ffb92 Mon Sep 17 00:00:00 2001 From: Kaiyi Date: Mon, 2 Dec 2024 19:41:56 -0500 Subject: [PATCH 1/5] feat(baremetal_validator): Add Validator for Process, Container Baremetal Metrics This commit includes validation setups for Process and Container RAPL related power metrics (node, process, container). Signed-off-by: Kaiyi --- e2e/tools/validator/pyproject.toml | 1 + .../validator/scripts/targeted_stresser.sh | 85 +++++++ .../src/validator/stresser/__init__.py | 209 +++++++++++++++++- 3 files changed, 294 insertions(+), 1 deletion(-) create mode 100644 e2e/tools/validator/scripts/targeted_stresser.sh diff --git a/e2e/tools/validator/pyproject.toml b/e2e/tools/validator/pyproject.toml index ac6e673d9a..eac8d556f7 100644 --- a/e2e/tools/validator/pyproject.toml +++ b/e2e/tools/validator/pyproject.toml @@ -31,6 +31,7 @@ dependencies = [ "pandas", "matplotlib", "scikit-learn", + "docker", ] [project.scripts] diff --git a/e2e/tools/validator/scripts/targeted_stresser.sh b/e2e/tools/validator/scripts/targeted_stresser.sh new file mode 100644 index 0000000000..3ba379f2c8 --- /dev/null +++ b/e2e/tools/validator/scripts/targeted_stresser.sh @@ -0,0 +1,85 @@ +#!/usr/bin/env bash + +set -eu -o pipefail + +trap exit_all INT +exit_all() { + pkill -P $$ +} + +run() { + echo "❯ $*" + "$@" + echo " ‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾" +} + +usage() { + echo "Usage: $0 -g -r -c -d -t -l -n " + echo " -g If set, and are ignored." 
+ echo " -r CPU range for stress-ng taskset (Default: '15')" + echo " -c Number of CPUs to use for stress-ng (Default: '1')" + echo " -d Directory to mount for logging (Default: '/tmp')" + echo " -t Filename for start and end time file log (Default: 'time_interval.log')" + echo " -l Load curve as a comma-separated list (Default: '0:5,50:20,75:20,100:20,75:20,50:20')" + echo " -n Number of times to iterate the Load curve (Default: '1')" + exit 1 +} + +main() { + + set_general_mode=false + DEFAULT_CPU_RANGE="15" + DEFAULT_CPUS="1" + DEFAULT_MOUNT_DIR="/tmp" + DEFAULT_LOAD_CURVE_STR="0:5,50:20,75:20,100:20,75:20,50:20" + DEFAULT_TIME_INTERVAL_LOG_NAME="time_interval.log" + DEFAULT_ITERATIONS="1" + + # Parse command-line options + while getopts "g:r:c:d:t:l:n:" opt; do + case "$opt" in + g) set_general_mode=true ;; + r) cpu_range="$OPTARG" ;; + c) cpus="$OPTARG" ;; + d) mount_dir="$OPTARG" ;; + t) time_interval_log_name="$OPTARG" ;; + l) load_curve_str="$OPTARG" ;; + n) iterations="$OPTARG" ;; + *) usage ;; + esac + done + + cpu_range="${cpu_range:-$DEFAULT_CPU_RANGE}" + cpus="${cpus:-$DEFAULT_CPUS}" + mount_dir="${mount_dir:-$DEFAULT_MOUNT_DIR}" + time_interval_log_name="${time_interval_log_name:-$DEFAULT_TIME_INTERVAL_LOG_NAME}" + load_curve_str="${load_curve_str:-$DEFAULT_LOAD_CURVE_STR}" + iterations="${iterations:-$DEFAULT_ITERATIONS}" + + IFS=',' read -r -a load_curve <<< "$load_curve_str" + + TIME_INTERVAL_LOG="${mount_dir}/${time_interval_log_name}" + + > "$TIME_INTERVAL_LOG" + + start_time=$(date +%s) + echo "Stress Start Time: $start_time" >> "$TIME_INTERVAL_LOG" + + for i in $(seq 1 "$iterations"); do + echo "Running $i/$iterations" + for x in "${load_curve[@]}"; do + local load="${x%%:*}" + local time="${x##*:}s" + if $set_general_mode; then + run stress-ng --cpu "$cpus" --cpu-method ackermann --cpu-load "$load" --timeout "$time" + else + run taskset -c "$cpu_range" stress-ng --cpu "$cpus" --cpu-method ackermann --cpu-load "$load" --timeout "$time" + fi + 
done + done + + end_time=$(date +%s) + echo "Stress End Time: $end_time" >> "$TIME_INTERVAL_LOG" +} + +main "$@" \ No newline at end of file diff --git a/e2e/tools/validator/src/validator/stresser/__init__.py b/e2e/tools/validator/src/validator/stresser/__init__.py index 01afec1181..425c9f3809 100644 --- a/e2e/tools/validator/src/validator/stresser/__init__.py +++ b/e2e/tools/validator/src/validator/stresser/__init__.py @@ -1,8 +1,14 @@ import logging from datetime import datetime -from typing import NamedTuple +from typing import NamedTuple, List, Iterable import paramiko +import os +import subprocess +import time +import docker +import psutil + from validator import config @@ -113,3 +119,204 @@ def run(self, cmd: str, *args) -> RunResult: stderr=stderr.read().decode("ascii").strip("\n"), exit_code=exit_status, ) + + +class ProcessOutput(NamedTuple): + script_result: ScriptResult + relevant_pids: Iterable[str] + + +class ContainerOutput(NamedTuple): + ScriptResult: ScriptResult + container_id: str + + +def return_child_pids(parent_pid: int) -> List[int]: + try: + parent_process = psutil.Process(parent_pid) + children_processes = parent_process.children(recursive=True) + return [child_process.pid for child_process in children_processes] + except psutil.NoSuchProcess: + return [] + +def retrieve_time_interval_from_log(time_interval_filepath): + start_time = None + end_time = None + with open(file=time_interval_filepath, mode="r") as f: + for line in f.readlines(): + if line.startswith("Stress Start Time:"): + start_timestamp = (line.split(":")[-1]).strip() + start_time = datetime.fromtimestamp(float(start_timestamp)) + if line.startswith("Stress End Time:"): + end_timestamp = (line.split(":")[-1]).strip() + end_time = datetime.fromtimestamp(float(end_timestamp)) + return start_time, end_time + + +class Local: + def __init__(self, config: config.Local): + self.load_curve = config.load_curve + self.iterations = config.iterations + self.mount_dir = config.mount_dir + 
self.time_range_log = os.path.join(self.mount_dir, "time_interval.log") + stresser_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../")) + self.stresser_script =os.path.join(stresser_dir, "scripts", "targeted_stresser.sh") + + def __repr__(self): + return f" Local Stresser\nLoad Curve: {self.load_curve}" + + def stress(self): + logger.info("stressing in local mode") + command = f"{self.stresser_script} -g -d {self.mount_dir} -l {self.load_curve} -n {self.iterations}" + logger.info("running stress command -> %s", command) + print(command) + result = subprocess.run(command, shell=True, check=True, text=True, capture_output=True) + print(f"Output:\n{result.stdout}") + status_code = result.returncode + if status_code != 0: + logger.error("stresser command failed -> %s", command) + raise StresserError( + script_exit_code=status_code, + message="status code is non zero" + ) + start_time, end_time = retrieve_time_interval_from_log(self.time_range_log) + if start_time is None or end_time is None: + logger.error("start time or end time is empty") + raise StresserError( + start_time=start_time, + end_time=end_time, + message="start time or end time is empty" + ) + return ScriptResult( + start_time=start_time, + end_time=end_time + ) + + +class Process(Local): + def __init__(self, config: config.Local): + super().__init__(config) + self.isolated_cpu = config.isolated_cpu + + def __repr__(self): + return f" Process Stresser\nLoad Curve: {self.load_curve}" + + def stress(self): + logger.info("stressing in process mode -> %s", self.isolated_cpu) + command = f"{self.stresser_script} -r '{self.isolated_cpu}' -d {self.mount_dir} -l {self.load_curve} -n {self.iterations}" + logger.info("running stress command -> %s", command) + target_popen = subprocess.Popen(command, shell=True) + time.sleep(1) + target_process_pid = target_popen.pid + all_child_pids = set([target_process_pid]) + while target_popen.poll() is None: + child_pids = 
return_child_pids(target_process_pid) + all_child_pids = all_child_pids.union(child_pids) + time.sleep(1) + logger.info("captured pids -> %s", all_child_pids) + print(f"captured pids: {all_child_pids}") + status_code = target_popen.returncode + if status_code != 0: + logger.error("stresser command failed -> %s", command) + raise StresserError( + script_exit_code=status_code, + message="status code is non zero" + ) + + start_time, end_time = retrieve_time_interval_from_log(self.time_range_log) + + if start_time is None or end_time is None: + logger.error("start time or end time is empty") + raise StresserError( + start_time=start_time, + end_time=end_time, + message="start time or end time is empty" + ) + + return ProcessOutput( + script_result=ScriptResult( + start_time=start_time, + end_time=end_time + ), + relevant_pids=all_child_pids + ) + + +class Container(Local): + def __init__(self, config: config.Local): + super().__init__(config) + self.isolated_cpu = config.isolated_cpu + self.container_name = config.container_name + self.client = docker.from_env() + + def __repr__(self): + return f" Container Stresser\nLoad Curve: {self.load_curve}" + + def stress(self) -> ContainerOutput: + logger.info("stressing in container mode -> %s", self.isolated_cpu) + image = "fedora:latest" + command = f"bash -c 'dnf update -y && dnf install -y stress-ng && bash /app/stresser_script.sh \ + -r \"{self.isolated_cpu}\" -d \"{self.mount_dir}\" \ + -l \"{self.load_curve}\" -n \"{self.iterations}\"'" + self.client.images.pull(image) + stress_container = self.client.containers.run( + image=image, + name=self.container_name, + command=command, + volumes={ + self.stresser_script: {'bind': '/app/stresser_script.sh', 'mode': 'ro'}, + self.mount_dir :{'bind': self.mount_dir, 'mode': 'rw'} + }, + remove=False, + detach=True + + ) + id = stress_container.id + logger.info("captured container id -> %s", id) + print(f"captured container id: {id}") + status_map = stress_container.wait() + 
container_logs = stress_container.logs().decode("utf-8") + logger.info("container logs ->\n%s", container_logs) + print(f"container logs:\n{container_logs}") + stress_container.remove() + print(status_map) + + status_code = status_map["StatusCode"] + if status_map["StatusCode"] != 0: + logger.error("stresser command failed -> %s", command) + raise StresserError( + script_exit_code=status_code, + message="status code is non zero" + ) + + start_time, end_time = retrieve_time_interval_from_log(self.time_range_log) + + if not start_time or not end_time: + logger.error("start time or end time is empty") + raise StresserError( + start_time=start_time, + end_time=end_time, + message="start time or end time is empty" + ) + + logger.info("Stress Start Time: %s\nStress End Time: %s", start_time, end_time) + print(start_time, end_time) + return ContainerOutput( + script_result=ScriptResult( + start_time=start_time, + end_time=end_time + ), + container_id=id, + ) + + +class StresserError(Exception): + def __init__(self, start_time=None, end_time=None, script_exit_code=0, message=""): + super().__init__(message) + self.start_time = start_time + self.end_time = end_time + self.script_exit_code = script_exit_code + + def __str__(self) -> str: + base_message = super().__str__() + return f"Start Time: {self.start_time}\nEnd Time: {self.end_time}\nScript Code: {self.script_exit_code}\nMessage: {base_message}" \ No newline at end of file From ad0e7f88287f4ed334085cb93830656ebff159e5 Mon Sep 17 00:00:00 2001 From: Kaiyi Date: Tue, 3 Dec 2024 16:57:14 -0500 Subject: [PATCH 2/5] feat(bm-validation): Add config setup for Baremetal Validator Added bload to load configuration classes for process, container, and node level baremetal validation. 
Signed-off-by: Kaiyi --- .../src/validator/config/__init__.py | 107 ++++++++++++++++++ .../src/validator/stresser/__init__.py | 24 ++-- 2 files changed, 119 insertions(+), 12 deletions(-) diff --git a/e2e/tools/validator/src/validator/config/__init__.py b/e2e/tools/validator/src/validator/config/__init__.py index 31fed7b14b..d1e1802de8 100644 --- a/e2e/tools/validator/src/validator/config/__init__.py +++ b/e2e/tools/validator/src/validator/config/__init__.py @@ -57,6 +57,113 @@ def __repr__(self): return f"" +# consider switching to dataclass to avoid repeated fields +class Local(NamedTuple): + load_curve: str + iterations: str + mount_dir: str + + +class LocalProcess(NamedTuple): + isolated_cpu: str + load_curve: str + iterations: str + mount_dir: str + + +class LocalContainer(NamedTuple): + isolated_cpu: str + container_name: str + load_curve: str + iterations: str + mount_dir: str + + +class LocalPrometheus(NamedTuple): + url: str + rate_interval: str + step: str + job: str + + +class BMValidator(NamedTuple): + log_level: str + prom: LocalPrometheus + node: Local + process: LocalProcess + container: LocalContainer + validations_file: str + + +def bload(config_file: str) -> BMValidator: + """ + Reads Baremetal YAML configuration file and returns a Config object. + + Args: + config_file (str): Path to Baremetal YAML configuration file. + + Returns: + BMValidator: A named tuple containing configuration values for Baremetal Validation. 
+ """ + with open(config_file) as file: + config = yaml.safe_load(file) + + log_level = config.get("log_level", "warn") + prom_config = config["prometheus"] + if not prom_config: + prom_config = {} + prom = LocalPrometheus( + url=prom_config.get("url", "http://localhost:9090"), + rate_interval=prom_config.get("rate_interval", "20s"), + step=prom_config.get("step", "3s"), + job=prom_config.get("job", "metal") + ) + print(prom) + + default_config = config["config"] + node_config = config["node"] + process_config = config["process"] + container_config = config["container"] + # node config + if not node_config: + node_config = {} + node = Local( + load_curve=node_config.get("load_curve", default_config["load_curve"]), + iterations=node_config.get("iterations", default_config["iterations"]), + mount_dir=os.path.expanduser(node_config.get("mount_dir", default_config["mount_dir"])) + ) + print(node) + if not process_config: + process_config = {} + process = LocalProcess( + isolated_cpu=process_config.get("isolated_cpu", default_config["isolated_cpu"]), + load_curve=process_config.get("load_curve", default_config["load_curve"]), + iterations=process_config.get("iterations", default_config["iterations"]), + mount_dir=os.path.expanduser(process_config.get("mount_dir", default_config["mount_dir"])) + ) + print(process) + if not container_config: + container_config = {} + container = LocalContainer( + isolated_cpu=container_config.get("isolated_cpu", default_config["isolated_cpu"]), + container_name=container_config.get("container_name", default_config["container_name"]), + load_curve=container_config.get("load_curve", default_config["load_curve"]), + iterations=container_config.get("iterations", default_config["iterations"]), + mount_dir=os.path.expanduser(container_config.get("mount_dir", default_config["mount_dir"])) + ) + print(container) + + validations_file = config.get("validations_file", "bm_validations.yaml") + + BMValidator( + log_level=log_level, + prom=prom, + 
node=node, + process=process, + container=container, + validations_file=validations_file + ) + def load(config_file: str) -> Validator: """ Reads the YAML configuration file and returns a Config object. diff --git a/e2e/tools/validator/src/validator/stresser/__init__.py b/e2e/tools/validator/src/validator/stresser/__init__.py index 425c9f3809..d84a8f0886 100644 --- a/e2e/tools/validator/src/validator/stresser/__init__.py +++ b/e2e/tools/validator/src/validator/stresser/__init__.py @@ -154,10 +154,10 @@ def retrieve_time_interval_from_log(time_interval_filepath): class Local: - def __init__(self, config: config.Local): - self.load_curve = config.load_curve - self.iterations = config.iterations - self.mount_dir = config.mount_dir + def __init__(self, lc: config.Local): + self.load_curve = lc.load_curve + self.iterations = lc.iterations + self.mount_dir = lc.mount_dir self.time_range_log = os.path.join(self.mount_dir, "time_interval.log") stresser_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../")) self.stresser_script =os.path.join(stresser_dir, "scripts", "targeted_stresser.sh") @@ -194,9 +194,9 @@ def stress(self): class Process(Local): - def __init__(self, config: config.Local): - super().__init__(config) - self.isolated_cpu = config.isolated_cpu + def __init__(self, pc: config.LocalProcess): + super().__init__(pc) + self.isolated_cpu = pc.isolated_cpu def __repr__(self): return f" Process Stresser\nLoad Curve: {self.load_curve}" @@ -243,10 +243,10 @@ def stress(self): class Container(Local): - def __init__(self, config: config.Local): - super().__init__(config) - self.isolated_cpu = config.isolated_cpu - self.container_name = config.container_name + def __init__(self, cc: config.LocalContainer): + super().__init__(cc) + self.isolated_cpu = cc.isolated_cpu + self.container_name = cc.container_name self.client = docker.from_env() def __repr__(self): @@ -280,7 +280,7 @@ def stress(self) -> ContainerOutput: print(f"container 
logs:\n{container_logs}") stress_container.remove() print(status_map) - + status_code = status_map["StatusCode"] if status_map["StatusCode"] != 0: logger.error("stresser command failed -> %s", command) From 15106d7728fc16e75c60119a28b76508a1a1b57a Mon Sep 17 00:00:00 2001 From: Kaiyi Date: Mon, 9 Dec 2024 22:11:28 -0500 Subject: [PATCH 3/5] feat(baremetal-validator): Add CLI for running baremetal validations Added process, container, node/local validation to the cli. Signed-off-by: Kaiyi --- .../validator/src/validator/cli/__init__.py | 159 ++++++++++++++++-- .../src/validator/config/__init__.py | 95 ++++++----- .../src/validator/stresser/__init__.py | 2 +- .../src/validator/validations/__init__.py | 48 +++++- e2e/tools/validator/validator.bm.yaml | 25 +++ 5 files changed, 271 insertions(+), 58 deletions(-) create mode 100644 e2e/tools/validator/validator.bm.yaml diff --git a/e2e/tools/validator/src/validator/cli/__init__.py b/e2e/tools/validator/src/validator/cli/__init__.py index 53a6a98ce2..a1f608ef34 100644 --- a/e2e/tools/validator/src/validator/cli/__init__.py +++ b/e2e/tools/validator/src/validator/cli/__init__.py @@ -25,12 +25,12 @@ from validator.prometheus import Comparator, PrometheusClient, Series, ValueOrError from validator.report import CustomEncoder, JsonTemplate from validator.specs import MachineSpec, get_host_spec, get_vm_spec -from validator.stresser import Remote, ScriptResult -from validator.validations import Loader, QueryTemplate, Validation +from validator.stresser import Remote, ScriptResult, Local, Process, Container +from validator.validations import Loader, BLoader, QueryTemplate, Validation logger = logging.getLogger(__name__) pass_config = click.make_pass_decorator(config.Validator) - +pass_bm_config = click.make_pass_decorator(config.BMValidator) @dataclass class ValidationResult: @@ -364,6 +364,20 @@ def dump_query_result(raw_results_dir: str, prefix: str, query: QueryTemplate, s return out_file +def setup_validator(ctx: 
click.Context, config_file: str, log_level: str, loader): + cfg = loader(config_file) + log_level = cfg.log_level if log_level == "config" else log_level + try: + level = getattr(logging, log_level.upper()) + except AttributeError: + # ruff: noqa: T201 + print(f"Invalid log level: {cfg.log_level}; setting to debug") + level = logging.DEBUG + + logging.basicConfig(level=level) + ctx.obj = cfg + + @click.group( context_settings={"help_option_names": ["-h", "--help"]}, invoke_without_command=False, @@ -385,17 +399,42 @@ def dump_query_result(raw_results_dir: str, prefix: str, query: QueryTemplate, s ) @click.pass_context def validator(ctx: click.Context, config_file: str, log_level: str): - cfg = config.load(config_file) - log_level = cfg.log_level if log_level == "config" else log_level - try: - level = getattr(logging, log_level.upper()) - except AttributeError: - # ruff: noqa: T201 (Suppressed as an early print statement before logging level is set) - print(f"Invalid log level: {cfg.log_level}; setting to debug") - level = logging.DEBUG + setup_validator(ctx, config_file, log_level, config.load) + # cfg = config.load(config_file) + # log_level = cfg.log_level if log_level == "config" else log_level + # try: + # level = getattr(logging, log_level.upper()) + # except AttributeError: + # # ruff: noqa: T201 (Suppressed as an early print statement before logging level is set) + # print(f"Invalid log level: {cfg.log_level}; setting to debug") + # level = logging.DEBUG - logging.basicConfig(level=level) - ctx.obj = cfg + # logging.basicConfig(level=level) + # ctx.obj = cfg + + +@click.group( + context_settings={"help_option_names": ["-h", "--help"]}, + invoke_without_command=False, +) +@click.version_option(version=__version__, prog_name="bm_validator") +@click.option( + "--log-level", + "-l", + type=click.Choice(["debug", "info", "warn", "error", "config"]), + default="config", + required=False, +) +@click.option( + "--config-file", + "-f", + default="validator.bm.yaml", 
+ type=click.Path(exists=True), + show_default=True, +) +@click.pass_context +def bm_validator(ctx: click.Context, config_file: str, log_level: str): + setup_validator(ctx, config_file, log_level, config.bload) @validator.command() @@ -620,6 +659,100 @@ def validate_acpi(cfg: config.Validator, duration: datetime.timedelta, report_di return int(res.validations.passed) +@bm_validator.command() +# ruff: noqa: S108 (Suppressed as we are intentionally using `/tmp` as reporting directory) +@click.option( + "--report-dir", + "-o", + default="/tmp", + type=click.Path(exists=True, dir_okay=True, writable=True), + show_default=True, +) +@pass_config +def regression( + cfg: config.BMValidator, + report_dir: str, +): + """ + Run Kepler Baremetal Validation Tests + """ + click.secho(" * Generating report dir and tag", fg="green") + results_dir, tag = create_report_dir(report_dir) + click.secho(f"\tresults dir: {results_dir}, tag: {tag}", fg="bright_green") + res = TestResult(tag) + res.build_info, res.node_info = get_build_and_node_info(cfg.prometheus) + res.start_time = datetime.datetime.now() + click.secho(" * Generating spec report ...", fg="green") + res.host_spec = get_host_spec() + validation_results = [] + click.secho(" * Running stress test ...", fg="green") + if cfg.node: + click.secho(" * Running node stress test ...", fg="blue") + local_stress = Local( + lc=cfg.node + ) + local_stress_test = local_stress.stress() + start_time = local_stress_test.start_time + end_time = local_stress_test.end_time + + # sleep a bit for prometheus to finish scrapping + click.secho(" * Sleeping for 10 seconds ...", fg="green") + time.sleep(10) + click.secho(" * Acquiring local stress validations ...", fg="green") + prom = PrometheusClient(cfg.prometheus) + comparator = Comparator(prom) + validations = BLoader(cfg).load_node_validations() + + validation_results.extend([run_validation(v, comparator, start_time, end_time, results_dir) for v in validations]) + + if cfg.process: + 
click.secho(" * Running process stress test ...", fg="blue") + stress_process = Process( + pc=cfg.process + ) + process_stress_test = stress_process.stress() + start_time = process_stress_test.script_result.start_time + end_time = process_stress_test.script_result.end_time + relevant_pids = process_stress_test.relevant_pids + + # sleep a bit for prometheus to finish scrapping + click.secho(" * Sleeping for 10 seconds ...", fg="green") + time.sleep(10) + click.secho(" * Acquiring process stress validations ...", fg="green") + prom = PrometheusClient(cfg.prometheus) + comparator = Comparator(prom) + validations = BLoader(cfg).load_process_validations(relevant_pids) + + validation_results.extend([run_validation(v, comparator, start_time, end_time, results_dir) for v in validations]) + + if cfg.container: + click.secho(" * Running container stress test ...", fg="blue") + container_process = Container( + cc=cfg.container + ) + container_stress_test = container_process.stress() + start_time = container_stress_test.script_result.start_time + end_time = container_stress_test.script_result.end_time + container_id = container_stress_test.container_id + + # sleep a bit for prometheus to finish scrapping + click.secho(" * Sleeping for 10 seconds ...", fg="green") + time.sleep(10) + click.secho(" * Acquiring container stress validations ...", fg="green") + prom = PrometheusClient(cfg.prometheus) + comparator = Comparator(prom) + validations = BLoader(cfg).load_container_validations(container_id) + + validation_results.extend([run_validation(v, comparator, start_time, end_time, results_dir) for v in validations]) + + res.end_time = datetime.datetime.now() + + res.validations = validation_results + write_json_report(results_dir, res) + write_md_report(results_dir, res) + + return int(res.validations.passed) + def write_json_report(results_dir: str, res: TestResult): pattern = re.compile(r'[{]?(\w+)=("[^"]*"|[^,]+)[},]?') diff --git 
a/e2e/tools/validator/src/validator/config/__init__.py b/e2e/tools/validator/src/validator/config/__init__.py index d1e1802de8..411506460b 100644 --- a/e2e/tools/validator/src/validator/config/__init__.py +++ b/e2e/tools/validator/src/validator/config/__init__.py @@ -3,7 +3,7 @@ # SPDX-License-Identifier: APACHE-2.0 import os -from typing import NamedTuple +from typing import NamedTuple, Optional import yaml @@ -79,19 +79,19 @@ class LocalContainer(NamedTuple): mount_dir: str -class LocalPrometheus(NamedTuple): - url: str - rate_interval: str - step: str - job: str +# class LocalPrometheus(NamedTuple): +# url: str +# rate_interval: str +# step: str +# job_name: str class BMValidator(NamedTuple): log_level: str - prom: LocalPrometheus - node: Local - process: LocalProcess - container: LocalContainer + prometheus: Prometheus + node: Optional[Local] + process: Optional[LocalProcess] + container: Optional[LocalContainer] validations_file: str @@ -112,52 +112,63 @@ def bload(config_file: str) -> BMValidator: prom_config = config["prometheus"] if not prom_config: prom_config = {} - prom = LocalPrometheus( + prom = Prometheus( url=prom_config.get("url", "http://localhost:9090"), rate_interval=prom_config.get("rate_interval", "20s"), step=prom_config.get("step", "3s"), - job=prom_config.get("job", "metal") + job=PrometheusJob( + metal=prom_config.get("job", "metal"), + vm="", + ) ) print(prom) default_config = config["config"] - node_config = config["node"] - process_config = config["process"] - container_config = config["container"] - # node config - if not node_config: - node_config = {} - node = Local( - load_curve=node_config.get("load_curve", default_config["load_curve"]), - iterations=node_config.get("iterations", default_config["iterations"]), - mount_dir=os.path.expanduser(node_config.get("mount_dir", default_config["mount_dir"])) - ) + + node = None + if "node" in config: + node_config = config["node"] + if not node_config: + node_config = {} + node = Local( + 
load_curve=node_config.get("load_curve", default_config["load_curve"]), + iterations=node_config.get("iterations", default_config["iterations"]), + mount_dir=os.path.expanduser(node_config.get("mount_dir", default_config["mount_dir"])) + ) print(node) - if not process_config: - process_config = {} - process = LocalProcess( - isolated_cpu=process_config.get("isolated_cpu", default_config["isolated_cpu"]), - load_curve=process_config.get("load_curve", default_config["load_curve"]), - iterations=process_config.get("iterations", default_config["iterations"]), - mount_dir=os.path.expanduser(process_config.get("mount_dir", default_config["mount_dir"])) - ) + + process = None + if "process" in config: + process_config = config["process"] + if not process_config: + process_config = {} + process = LocalProcess( + isolated_cpu=process_config.get("isolated_cpu", default_config["isolated_cpu"]), + load_curve=process_config.get("load_curve", default_config["load_curve"]), + iterations=process_config.get("iterations", default_config["iterations"]), + mount_dir=os.path.expanduser(process_config.get("mount_dir", default_config["mount_dir"])) + ) print(process) - if not container_config: - container_config = {} - container = LocalContainer( - isolated_cpu=container_config.get("isolated_cpu", default_config["isolated_cpu"]), - container_name=container_config.get("container_name", default_config["container_name"]), - load_curve=container_config.get("load_curve", default_config["load_curve"]), - iterations=container_config.get("iterations", default_config["iterations"]), - mount_dir=os.path.expanduser(container_config.get("mount_dir", default_config["mount_dir"])) - ) + + container = None + if "container" in config: + container_config = config["container"] + if not container_config: + container_config = {} + container = LocalContainer( + isolated_cpu=container_config.get("isolated_cpu", default_config["isolated_cpu"]), + container_name=container_config.get("container_name", 
default_config["container_name"]), + load_curve=container_config.get("load_curve", default_config["load_curve"]), + iterations=container_config.get("iterations", default_config["iterations"]), + mount_dir=os.path.expanduser(container_config.get("mount_dir", default_config["mount_dir"])) + ) print(container) validations_file = config.get("validations_file", "bm_validations.yaml") - BMValidator( + return BMValidator( log_level=log_level, - prom=prom, + prometheus=prom, node=node, process=process, container=container, diff --git a/e2e/tools/validator/src/validator/stresser/__init__.py b/e2e/tools/validator/src/validator/stresser/__init__.py index d84a8f0886..c4a9104f5a 100644 --- a/e2e/tools/validator/src/validator/stresser/__init__.py +++ b/e2e/tools/validator/src/validator/stresser/__init__.py @@ -127,7 +127,7 @@ class ProcessOutput(NamedTuple): class ContainerOutput(NamedTuple): - ScriptResult: ScriptResult + script_result: ScriptResult container_id: str diff --git a/e2e/tools/validator/src/validator/validations/__init__.py b/e2e/tools/validator/src/validator/validations/__init__.py index 21ffe8de01..060068d109 100644 --- a/e2e/tools/validator/src/validator/validations/__init__.py +++ b/e2e/tools/validator/src/validator/validations/__init__.py @@ -1,6 +1,7 @@ import logging import re from typing import Any, NamedTuple +from abc import ABC, abstractmethod import yaml @@ -68,7 +69,7 @@ def yaml_node(yml: dict[str, Any], key_path: list[str], default: Any) -> Any: return node -def read_validations(file_path: str, promql_vars: dict[str, str]) -> list[Validation]: +def read_validations(file_path: str, promql_vars: dict[str, str], level: str = "") -> list[Validation]: with open(file_path) as file: yml = yaml.safe_load(file) global_mapping = yaml_node(yml, ["config", "mapping"], {}) @@ -90,7 +91,50 @@ def validation_from_yaml(v: dict[str, Any]) -> Validation: max_mape=v.get("max_mape"), ) - return [validation_from_yaml(v) for v in yml["validations"]] + return 
[validation_from_yaml(v) for v in (yml["validations"][level] if level else yml["validations"])] + + +class BLoader: + def __init__(self, cfg: config.BMValidator): + self.cfg = cfg + + def _load_base_promql_vars(self) -> dict[str, str]: + promql_vars = {} + prom = self.cfg.prometheus + promql_vars["rate_interval"] = prom.rate_interval + promql_vars["job"] = prom.job.metal + return promql_vars + + def load_node_validations(self) -> list[Validation]: + promql_vars = self._load_base_promql_vars() + + return read_validations( + self.cfg.validations_file, + promql_vars, + "node" + ) + + def load_process_validations(self, process_pids: list[str]) -> list[Validation]: + promql_vars = self._load_base_promql_vars() + pids = "|".join(map(str, process_pids)) + pid_label = f'pid=~"{pids}"' + promql_vars["target_pids"] = pid_label + + return read_validations( + self.cfg.validations_file, + promql_vars, + "process" + ) + + def load_container_validations(self, container_id: str) -> list[Validation]: + promql_vars = self._load_base_promql_vars() + promql_vars["target_container_id"] = container_id + + return read_validations( + self.cfg.validations_file, + promql_vars, + "container" + ) class Loader: diff --git a/e2e/tools/validator/validator.bm.yaml b/e2e/tools/validator/validator.bm.yaml new file mode 100644 index 0000000000..6d7b071921 --- /dev/null +++ b/e2e/tools/validator/validator.bm.yaml @@ -0,0 +1,25 @@ +log_level: warn # Logging level, defaults is warn + +prometheus: + job: metal # Job name for baremetal metrics + url: http://localhost:9090 # Prometheus server URL + rate_interval: 20s # Rate interval for Promql, default is 20s, typically 4 x $scrape_interval + step: 3s # Step duration for Prometheus range queries + +config: # default settings + isolated_cpu: "15" # Logical processor that is fully isolated from scheduler (ex. 
isolcpus) + load_curve: "0:15,10:20,25:20,50:20,75:20,100:30,75:20,50:20,25:20,10:20,0:15" + iterations: "1" + mount_dir: "/tmp" + container_name: "baremetal-stresser" + +#node: {} or node: or node: ~ +node: + +process: + iterations: "2" + +container: + iterations: "2" + +validations_file: ./bm_validations.yaml # Path to the validations file, default is ./bm_validations.yaml \ No newline at end of file From 0e2bf49030e7e44b18368860fdff350236664b13 Mon Sep 17 00:00:00 2001 From: Kaiyi Date: Tue, 10 Dec 2024 21:01:39 -0500 Subject: [PATCH 4/5] feat(baremetal-validation): Add relevant config files Added relevant config files for baremetal validation including formatted prom validation metrics and baremetal configuration. Signed-off-by: Kaiyi --- e2e/tools/validator/bm_validations.yaml | 231 ++++++++++++++++++ e2e/tools/validator/pyproject.toml | 2 + .../validator/scripts/targeted_stresser.sh | 6 +- .../validator/src/validator/cli/__init__.py | 23 +- .../src/validator/prometheus/__init__.py | 31 ++- .../src/validator/validations/__init__.py | 10 +- e2e/tools/validator/validator.bm.yaml | 14 +- 7 files changed, 291 insertions(+), 26 deletions(-) create mode 100644 e2e/tools/validator/bm_validations.yaml mode change 100644 => 100755 e2e/tools/validator/scripts/targeted_stresser.sh diff --git a/e2e/tools/validator/bm_validations.yaml b/e2e/tools/validator/bm_validations.yaml new file mode 100644 index 0000000000..d35816f825 --- /dev/null +++ b/e2e/tools/validator/bm_validations.yaml @@ -0,0 +1,231 @@ +# metal_job_name: metal +# scaphandre_job_name: scaphandre +# node_exporter_job_name: node_exporter +# remove path if possible + +validations: + process: + # validate process bpf cpu time with node exporter + - name: node-cpu-time - kepler-process-bpf-cpu-time + mapping: + actual: node-cpu-time + predicted: kepler-process-bpf-cpu-time + units: Milliseconds + node-cpu-time: | + sum( + rate( + node_cpu_seconds_total{{ + cpu="{isolated_cpu}", + mode!="idle" + 
}}[{rate_interval}] + ) + ) * 1000 + kepler-process-bpf-cpu-time: | + sum( + rate( + kepler_process_bpf_cpu_time_ms_total{{ + job="{metal_job_name}", + pid=~"{pids}" + }}[{rate_interval}] + ) + ) + + # validate kepler bpf cpu time with process exporter (namedgroup must be pid) + # include system and user + - name: scaph-process-cpu-time - kepler-process-bpf-cpu-time + mapping: + actual: scaph-process-cpu-time + predicted: kepler-process-bpf-cpu-time + units: Milliseconds + scaph-process-cpu-time: | + sum( + rate( + namedprocess_namegroup_cpu_seconds_total{{ + groupname=~"{pids}" + }}[{rate_interval}] + ) + ) * 1000 + kepler-process-bpf-cpu-time: | + sum( + rate( + kepler_process_bpf_cpu_time_ms_total{{ + job="{metal_job_name}", + pid=~"{pids}" + }}[{rate_interval}] + ) + ) + + - name: kepler-process-bpf-cpu-time usage * node-package-power - kepler-process-package-power + mapping: + actual: kepler-process-cpu-ratio-node-package-power + predicted: kepler-process-package-power + units: Watts + kepler-process-cpu-ratio-node-package-power: | + ( + sum( + rate( + kepler_process_bpf_cpu_time_ms_total{{ + job="{metal_job_name}", + pid=~"{pids}" + }}[{rate_interval}] + ) + ) / + sum( + rate( + kepler_process_bpf_cpu_time_ms_total{{ + job="{metal_job_name}", + }}[{rate_interval}] + ) + ) + ) * + sum( + rate( + node_rapl_package_joules_total{{ + path="/host/sys/class/powercap/intel-rapl:0" + }}[{rate_interval}] + ) + ) + kepler-process-package-power: | + sum( + rate( + kepler_process_package_joules_total{{ + job="{metal_job_name}", + pid=~"{pids}" + }}[{rate_interval}] + ) + ) + + - name: node-exporter-cpu usage * node-package-power - kepler-process-package-power + mapping: + actual: kepler-process-cpu-ratio-node-package-power + predicted: kepler-process-package-power + units: Watts + kepler-process-cpu-ratio-node-package-power: | + ( + ( + sum( + rate( + node_cpu_seconds_total{{ + cpu=~"{isolated_cpu}", + mode!="idle" + }}[{rate_interval}] + ) + ) * 1000 + ) / + ( + sum( + 
rate( + node_cpu_seconds_total{{ + mode!="idle", + }}[{rate_interval}] + ) + ) * 1000 + ) + ) * + sum( + rate( + node_rapl_package_joules_total{{ + path="/host/sys/class/powercap/intel-rapl:0" + }}[{rate_interval}] + ) + ) + kepler-process-package-power: | + sum( + rate( + kepler_process_package_joules_total{{ + job="{metal_job_name}", + pid=~"{pids}" + }}[{rate_interval}] + ) + ) + + + container: + # validate container bpf cpu time with node exporter + - name: node-cpu-time - kepler-container-bpf-cpu-time + mapping: + actual: node-cpu-time + predicted: kepler-container-bpf-cpu-time + units: Milliseconds + node-cpu-time: | + sum( + rate( + node_cpu_seconds_total{{ + cpu=~"{isolated_cpu}", + mode!="idle" + }}[{rate_interval}] + ) + ) * 1000 + kepler-container-bpf-cpu-time: | + sum( + rate( + kepler_container_bpf_cpu_time_ms_total{{ + job="{metal_job_name}", + container_id="{container_id}" + }}[{rate_interval}] + ) + ) + + - name: kepler-container-bpf-cpu-time usage * node-package-power - kepler-container-package-power + mapping: + actual: kepler-container-cpu-ratio-node-package-power + predicted: kepler-container-package-power + units: Watts + kepler-container-cpu-ratio-node-package-power: | + ( + sum( + rate( + kepler_container_bpf_cpu_time_ms_total{{ + job="{metal_job_name}", + container_id="{container_id}" + }}[{rate_interval}] + ) + ) / + sum( + rate( + kepler_container_bpf_cpu_time_ms_total{{ + job="{metal_job_name}", + }}[{rate_interval}] + ) + ) + ) * + sum( + rate( + node_rapl_package_joules_total{{ + path="/host/sys/class/powercap/intel-rapl:0" + }}[{rate_interval}] + ) + ) + kepler-container-package-power: | + sum( + rate( + kepler_container_package_joules_total{{ + job="{metal_job_name}", + container_id="{container_id}" + }}[{rate_interval}] + ) + ) + + node: + # node level package power comparison + - name: node-rapl - kepler-node-package + mapping: + actual: node-rapl + predicted: kepler-node-package + units: Watts + node-rapl: | + sum( + rate( + 
node_rapl_package_joules_total{{ + path="/host/sys/class/powercap/intel-rapl:0" + }}[{rate_interval}] + ) + ) + kepler-node-package: | + sum( + rate( + kepler_node_package_joules_total{{ + job="{metal_job_name}", + }}[{rate_interval}] + ) + ) \ No newline at end of file diff --git a/e2e/tools/validator/pyproject.toml b/e2e/tools/validator/pyproject.toml index eac8d556f7..b7958f8424 100644 --- a/e2e/tools/validator/pyproject.toml +++ b/e2e/tools/validator/pyproject.toml @@ -32,10 +32,12 @@ dependencies = [ "matplotlib", "scikit-learn", "docker", + "psutil", ] [project.scripts] validator = "validator.cli:validator" +bm_validator = "validator.cli:bm_validator" [tool.hatch.version] path = "src/validator/__about__.py" diff --git a/e2e/tools/validator/scripts/targeted_stresser.sh b/e2e/tools/validator/scripts/targeted_stresser.sh old mode 100644 new mode 100755 index 3ba379f2c8..9093516d3f --- a/e2e/tools/validator/scripts/targeted_stresser.sh +++ b/e2e/tools/validator/scripts/targeted_stresser.sh @@ -64,14 +64,16 @@ main() { start_time=$(date +%s) echo "Stress Start Time: $start_time" >> "$TIME_INTERVAL_LOG" - + local all_cpus + all_cpus=$(nproc) for i in $(seq 1 "$iterations"); do echo "Running $i/$iterations" for x in "${load_curve[@]}"; do local load="${x%%:*}" local time="${x##*:}s" if $set_general_mode; then - run stress-ng --cpu "$cpus" --cpu-method ackermann --cpu-load "$load" --timeout "$time" + # replace cpus with all available cpus with nproc + run stress-ng --cpu "$all_cpus" --cpu-method ackermann --cpu-load "$load" --timeout "$time" else run taskset -c "$cpu_range" stress-ng --cpu "$cpus" --cpu-method ackermann --cpu-load "$load" --timeout "$time" fi diff --git a/e2e/tools/validator/src/validator/cli/__init__.py b/e2e/tools/validator/src/validator/cli/__init__.py index a1f608ef34..e2fc0a109b 100644 --- a/e2e/tools/validator/src/validator/cli/__init__.py +++ b/e2e/tools/validator/src/validator/cli/__init__.py @@ -668,8 +668,8 @@ def validate_acpi(cfg: 
config.Validator, duration: datetime.timedelta, report_di type=click.Path(exists=True, dir_okay=True, writable=True), show_default=True, ) -@pass_config -def regression( +@pass_bm_config +def stress( cfg: config.BMValidator, report_dir: str, ): @@ -681,7 +681,8 @@ def regression( click.secho(f"\tresults dir: {results_dir}, tag: {tag}", fg="bright_green") res = TestResult(tag) res.build_info, res.node_info = get_build_and_node_info(cfg.prometheus) - res.start_time = datetime.datetime.now() + test_start_time = datetime.datetime.now() + res.start_time = test_start_time click.secho(" * Generating spec report ...", fg="green") res.host_spec = get_host_spec() validation_results = [] @@ -694,7 +695,7 @@ def regression( local_stress_test = local_stress.stress() start_time = local_stress_test.start_time end_time = local_stress_test.end_time - + print(f"node: start time: {start_time}, end time: {end_time}") # sleep a bit for prometheus to finish scrapping click.secho(" * Sleeping for 10 seconds ...", fg="green") time.sleep(10) @@ -714,7 +715,7 @@ def regression( start_time = process_stress_test.script_result.start_time end_time = process_stress_test.script_result.end_time relevant_pids = process_stress_test.relevant_pids - + print(f"process: start time: {start_time}, end time: {end_time}") # sleep a bit for prometheus to finish scrapping click.secho(" * Sleeping for 10 seconds ...", fg="green") time.sleep(10) @@ -722,7 +723,6 @@ def regression( prom = PrometheusClient(cfg.prometheus) comparator = Comparator(prom) validations = BLoader(cfg).load_process_validations(relevant_pids) - validation_results.extend([run_validation(v, comparator, start_time, end_time, results_dir) for v in validations]) if cfg.container: @@ -734,7 +734,7 @@ def regression( start_time = container_stress_test.script_result.start_time end_time = container_stress_test.script_result.end_time container_id = container_stress_test.container_id - + print(f"container: start time: {start_time}, end time: 
{end_time}") # sleep a bit for prometheus to finish scrapping click.secho(" * Sleeping for 10 seconds ...", fg="green") time.sleep(10) @@ -745,9 +745,14 @@ def regression( validation_results.extend([run_validation(v, comparator, start_time, end_time, results_dir) for v in validations]) - res.end_time = datetime.datetime.now() + test_end_time = datetime.datetime.now() + res.end_time = test_end_time - res.validations = validation_results + res.validations = ValidationResults( + started_at=test_start_time, + ended_at=test_end_time, + results=validation_results + ) write_json_report(results_dir, res) write_md_report(results_dir, res) diff --git a/e2e/tools/validator/src/validator/prometheus/__init__.py b/e2e/tools/validator/src/validator/prometheus/__init__.py index df6c8b5f49..6cd87d94f7 100644 --- a/e2e/tools/validator/src/validator/prometheus/__init__.py +++ b/e2e/tools/validator/src/validator/prometheus/__init__.py @@ -133,7 +133,7 @@ def filter_by_equal_timestamps(a: Series, b: Series) -> tuple[Series, Series]: """ filtered_a = [] - filterd_b = [] + filtered_b = [] idx_a, idx_b = 0, 0 @@ -144,7 +144,7 @@ def filter_by_equal_timestamps(a: Series, b: Series) -> tuple[Series, Series]: while idx_a < len(a.samples) and idx_b < len(b.samples): if abs(b.samples[idx_b].timestamp - a.samples[idx_a].timestamp) < scrape_interval: filtered_a.append(a.samples[idx_a]) - filterd_b.append(b.samples[idx_b]) + filtered_b.append(b.samples[idx_b]) idx_a += 1 idx_b += 1 elif a.samples[idx_a].timestamp < b.samples[idx_b].timestamp: @@ -154,7 +154,7 @@ def filter_by_equal_timestamps(a: Series, b: Series) -> tuple[Series, Series]: return ( Series.from_samples(a.query, filtered_a, a.labels), - Series.from_samples(b.query, filterd_b, b.labels), + Series.from_samples(b.query, filtered_b, b.labels), ) @@ -208,11 +208,31 @@ def kepler_node_info(self) -> list[str]: labels = [r["metric"] for r in resp] return [to_metric(b) for b in labels] +# Add Interface for Comparator +# class 
Comparator(ABC): +# def single_series(self, query: str, start: datetime, end: datetime) -> Series: +# series = self.prom_client.range_query(query, start, end) + +# if len(series) != 1: +# raise SeriesError(query, 1, len(series)) + +# return series[0] + +# @abstractmethod +# def compare( +# self, +# start: datetime, +# end: datetime, +# actual_query: str, +# predicted_query: str, +# ) -> Result: +# raise NotImplementedError + class Comparator: def __init__(self, client: Queryable): self.prom_client = client - + def single_series(self, query: str, start: datetime, end: datetime) -> Series: series = self.prom_client.range_query(query, start, end) @@ -234,7 +254,6 @@ def compare( actual, predicted = filter_by_equal_timestamps(actual_series, predicted_series) actual_dropped = len(actual_series.samples) - len(actual.samples) predicted_dropped = len(predicted_series.samples) - len(predicted.samples) - return Result( mse=mse(actual.values, predicted.values), mape=mape(actual.values, predicted.values), @@ -243,4 +262,4 @@ def compare( predicted_series=predicted_series, actual_dropped=actual_dropped, predicted_dropped=predicted_dropped, - ) + ) \ No newline at end of file diff --git a/e2e/tools/validator/src/validator/validations/__init__.py b/e2e/tools/validator/src/validator/validations/__init__.py index 060068d109..bc04683af7 100644 --- a/e2e/tools/validator/src/validator/validations/__init__.py +++ b/e2e/tools/validator/src/validator/validations/__init__.py @@ -102,7 +102,7 @@ def _load_base_promql_vars(self) -> dict[str, str]: promql_vars = {} prom = self.cfg.prometheus promql_vars["rate_interval"] = prom.rate_interval - promql_vars["job"] = prom.job.metal + promql_vars["metal_job_name"] = prom.job.metal return promql_vars def load_node_validations(self) -> list[Validation]: @@ -117,8 +117,9 @@ def load_node_validations(self) -> list[Validation]: def load_process_validations(self, process_pids: list[str]) -> list[Validation]: promql_vars = self._load_base_promql_vars() 
pids = "|".join(map(str, process_pids)) - pid_label = f'pid=~"{pids}"' - promql_vars["target_pids"] = pid_label + #pid_label = f'pid=~"{pids}"' + promql_vars["pids"] = pids + promql_vars["isolated_cpu"] = self.cfg.process.isolated_cpu return read_validations( self.cfg.validations_file, @@ -128,7 +129,8 @@ def load_process_validations(self, process_pids: list[str]) -> list[Validation]: def load_container_validations(self, container_id: str) -> list[Validation]: promql_vars = self._load_base_promql_vars() - promql_vars["target_container_id"] = container_id + promql_vars["container_id"] = container_id + promql_vars["isolated_cpu"] = self.cfg.container.isolated_cpu return read_validations( self.cfg.validations_file, diff --git a/e2e/tools/validator/validator.bm.yaml b/e2e/tools/validator/validator.bm.yaml index 6d7b071921..41d61f4337 100644 --- a/e2e/tools/validator/validator.bm.yaml +++ b/e2e/tools/validator/validator.bm.yaml @@ -2,8 +2,8 @@ log_level: warn # Logging level, defaults is warn prometheus: job: metal # Job name for baremetal metrics - url: http://localhost:9090 # Prometheus server URL - rate_interval: 20s # Rate interval for Promql, default is 20s, typically 4 x $scrape_interval + url: http://localhost:9091 # Prometheus server URL + rate_interval: 21s # Rate interval for Promql, default is 20s, typically 4 x $scrape_interval step: 3s # Step duration for Prometheus range queries config: # default settings @@ -15,11 +15,15 @@ config: # default settings #node: {} or node: or node: ~ node: - + load_curve: "0:20,50:20,100:20,50:20,0:20" process: - iterations: "2" + #iterations: "2" + isolated_cpu: "5" + load_curve: "0:20,50:20,100:20,50:20,0:20" container: - iterations: "2" + #iterations: "2" + isolated_cpu: "5" + load_curve: "0:20,50:20,100:20,50:20,0:20" validations_file: ./bm_validations.yaml # Path to the validations file, default is ./bm_validations.yaml \ No newline at end of file From 3ecbcf5394a9dd31853f85936f56475e3ab0b837 Mon Sep 17 00:00:00 2001 
From: Kaiyi Liu Date: Thu, 12 Dec 2024 13:46:49 -0500 Subject: [PATCH 5/5] feat(bm-validation): Add bm_validations.yaml configuration Added most important component rapl metrics to validate on bm Signed-off-by: Kaiyi Liu --- e2e/tools/validator/bm_validations.yaml | 174 +++++++++++++++++++----- 1 file changed, 138 insertions(+), 36 deletions(-) diff --git a/e2e/tools/validator/bm_validations.yaml b/e2e/tools/validator/bm_validations.yaml index d35816f825..1f702a4a4a 100644 --- a/e2e/tools/validator/bm_validations.yaml +++ b/e2e/tools/validator/bm_validations.yaml @@ -4,6 +4,135 @@ # remove path if possible validations: + node: + - name: node-rapl-package - kepler-node-package + mapping: + actual: node-rapl-package + predicted: kepler-node-package + units: Watts + node-rapl-package: | + sum( + rate( + node_rapl_package_joules_total{{ + path="/host/sys/class/powercap/intel-rapl:0" + }}[{rate_interval}] + ) + ) + kepler-node-package: | + sum( + rate( + kepler_node_package_joules_total{{ + job="{metal_job_name}", + }}[{rate_interval}] + ) + ) + + - name: node-rapl-package - sum-kepler-process-package + mapping: + actual: node-rapl-package + predicted: sum-kepler-process-package + units: Watts + node-rapl-package: | + sum( + rate( + node_rapl_package_joules_total[{rate_interval}] + ) + ) + sum-kepler-process-package: | + sum( + rate( + kepler_process_package_joules_total{{ + job="{metal_job_name}", + }}[{rate_interval}] + ) + ) + + - name: sum-kepler-process-package - kepler-node-package + mapping: + actual: sum-kepler-process-package + predicted: kepler-node-package + units: Watts + sum-kepler-process-package: | + sum( + rate( + kepler_process_package_joules_total{{ + job="{metal_job_name}", + }}[{rate_interval}] + ) + ) + kepler-node-package: | + sum( + rate( + kepler_node_package_joules_total{{ + job="{metal_job_name}", + }}[{rate_interval}] + ) + ) + + - name: node-rapl-core - kepler-node-core + mapping: + actual: node-rapl-core + predicted: kepler-node-core + 
units: Watts + node-rapl-core: | + sum( + rate( + node_rapl_core_joules_total{{ + path="/host/sys/class/powercap/intel-rapl:0" + }}[{rate_interval}] + ) + ) + kepler-node-core: | + sum( + rate( + kepler_node_core_joules_total{{ + job="{metal_job_name}", + }}[{rate_interval}] + ) + ) + + - name: node-rapl-core - sum-kepler-process-core + mapping: + actual: node-rapl-core + predicted: sum-kepler-process-core + units: Watts + node-rapl-core: | + sum( + rate( + node_rapl_core_joules_total[{rate_interval}] + ) + ) + sum-kepler-process-core: | + sum( + rate( + kepler_process_core_joules_total{{ + job="{metal_job_name}", + }}[{rate_interval}] + ) + ) + + - name: sum-kepler-process-core - kepler-node-core + mapping: + actual: sum-kepler-process-core + predicted: kepler-node-core + units: Watts + sum-kepler-process-core: | + sum( + rate( + kepler_process_core_joules_total{{ + job="{metal_job_name}", + }}[{rate_interval}] + ) + ) + kepler-node-core: | + sum( + rate( + kepler_node_core_joules_total{{ + job="{metal_job_name}", + }}[{rate_interval}] + ) + ) + process: # validate process bpf cpu time with node exporter - name: node-cpu-time - kepler-process-bpf-cpu-time @@ -32,12 +161,12 @@ validations: # validate kepler bpf cpu time with process exporter (namedgroup must be pid) # include system and user - - name: scaph-process-cpu-time - kepler-process-bpf-cpu-time + - name: process-exporter-process-cpu-time - kepler-process-bpf-cpu-time mapping: - actual: scaph-process-cpu-time + actual: process-exporter-process-cpu-time predicted: kepler-process-bpf-cpu-time units: Milliseconds - scaph-process-cpu-time: | + process-exporter-process-cpu-time: | sum( rate( namedprocess_namegroup_cpu_seconds_total{{ @@ -80,9 +209,7 @@ validations: ) * sum( rate( - node_rapl_package_joules_total{{ - path="/host/sys/class/powercap/intel-rapl:0" - }}[{rate_interval}] + node_rapl_package_joules_total[{rate_interval}] ) ) kepler-process-package-power: | @@ -124,9 +251,7 @@ validations: ) * sum( 
rate( - node_rapl_package_joules_total{{ - path="/host/sys/class/powercap/intel-rapl:0" - }}[{rate_interval}] + node_rapl_package_joules_total[{rate_interval}] ) ) kepler-process-package-power: | @@ -183,7 +308,7 @@ validations: ) / sum( rate( - kepler_container_bpf_cpu_time_ms_total{{ + kepler_process_bpf_cpu_time_ms_total{{ job="{metal_job_name}", }}[{rate_interval}] ) @@ -191,9 +316,7 @@ validations: ) * sum( rate( - node_rapl_package_joules_total{{ - path="/host/sys/class/powercap/intel-rapl:0" - }}[{rate_interval}] + node_rapl_package_joules_total[{rate_interval}] ) ) kepler-container-package-power: | @@ -206,26 +329,5 @@ validations: ) ) - node: - # node level package power comparison - - name: node-rapl - kepler-node-package - mapping: - actual: node-rapl - predicted: kepler-node-package - units: Watts - node-rapl: | - sum( - rate( - node_rapl_package_joules_total{{ - path="/host/sys/class/powercap/intel-rapl:0" - }}[{rate_interval}] - ) - ) - kepler-node-package: | - sum( - rate( - kepler_node_package_joules_total{{ - job="{metal_job_name}", - }}[{rate_interval}] - ) - ) \ No newline at end of file +# node rapl {package/core} = kepler node {package/core} +# node \ No newline at end of file