From 575b484ba0e807a400e1b3d6329074196ebaf44b Mon Sep 17 00:00:00 2001 From: Sebastian Gaiser Date: Wed, 29 Jan 2025 12:58:35 +0000 Subject: [PATCH] fix(alerts): set severity of 'etcdMembersDown' from 'critical' to 'warning' Downgraded severity of 'etcdMembersDown' from 'critical' to 'warning' as a single etcd member being unavailable should not be a problem for etcd's quorum. If the quorum is no longer fulfilled, 'etcdInsufficientMembers' should fire. In addition, the 'for' interval was extended from '10m' to '20m' because, e.g., rebooting a big physical node usually takes longer than 10 minutes. Signed-off-by: Sebastian Gaiser --- contrib/mixin/alerts/alerts.libsonnet | 4 ++-- contrib/mixin/test.yaml | 26 +++++++++++++------------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/contrib/mixin/alerts/alerts.libsonnet b/contrib/mixin/alerts/alerts.libsonnet index 856fe4c1e38..0a41632e677 100644 --- a/contrib/mixin/alerts/alerts.libsonnet +++ b/contrib/mixin/alerts/alerts.libsonnet @@ -16,9 +16,9 @@ ) > 0 ||| % { etcd_instance_labels: $._config.etcd_instance_labels, etcd_selector: $._config.etcd_selector, network_failure_range: $._config.scrape_interval_seconds * 4 }, - 'for': '10m', + 'for': '20m', labels: { - severity: 'critical', + severity: 'warning', }, annotations: { description: 'etcd cluster "{{ $labels.%s }}": members are down ({{ $value }}).' 
% $._config.clusterLabel, diff --git a/contrib/mixin/test.yaml b/contrib/mixin/test.yaml index bfb50d8af3c..718eeba7a70 100644 --- a/contrib/mixin/test.yaml +++ b/contrib/mixin/test.yaml @@ -5,24 +5,24 @@ tests: - interval: 1m input_series: - series: up{job="etcd",instance="10.10.10.0"} - values: 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 + values: 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - series: up{job="etcd",instance="10.10.10.1"} - values: 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 + values: 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - series: up{job="etcd",instance="10.10.10.2"} - values: 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 + values: 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 alert_rule_test: - eval_time: 3m alertname: etcdInsufficientMembers - eval_time: 5m alertname: etcdInsufficientMembers - - eval_time: 12m + - eval_time: 22m alertname: etcdMembersDown - - eval_time: 14m + - eval_time: 24m alertname: etcdMembersDown exp_alerts: - exp_labels: job: etcd - severity: critical + severity: warning exp_annotations: description: 'etcd cluster "etcd": members are down (3).' summary: etcd cluster members are down. @@ -55,30 +55,30 @@ tests: - series: up{job="etcd",instance="10.10.10.2"} values: 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 alert_rule_test: - - eval_time: 14m + - eval_time: 24m alertname: etcdMembersDown exp_alerts: - exp_labels: job: etcd - severity: critical + severity: warning exp_annotations: description: 'etcd cluster "etcd": members are down (3).' summary: etcd cluster members are down. 
- interval: 1m input_series: - series: up{job="etcd",instance="10.10.10.0"} - values: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 + values: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 - series: up{job="etcd",instance="10.10.10.1"} - values: 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 + values: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 - series: etcd_network_peer_sent_failures_total{To="member-1",job="etcd",endpoint="test"} - values: 0 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 + values: 0 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 alert_rule_test: - - eval_time: 13m + - eval_time: 23m alertname: etcdMembersDown exp_alerts: - exp_labels: job: etcd - severity: critical + severity: warning exp_annotations: description: 'etcd cluster "etcd": members are down (1).' summary: etcd cluster members are down.