Commit 10766e0

Merge pull request #1348 from pyrra-dev/absent-duration
Improve Absent alert duration
2 parents: d268acb + 69adb19

4 files changed: +57 -52 lines

kubernetes/controllers/servicelevelobjective_test.go (+2 -2)

@@ -99,7 +99,7 @@ func Test_makePrometheusRule(t *testing.T) {
     {
         Alert: "SLOMetricAbsent",
         Expr: intstr.FromString(`absent(http_requests_total{job="app"}) == 1`),
-        For: monitoringDuration("2m"),
+        For: monitoringDuration("6m"),
         Annotations: map[string]string{
             "description": "foo",
         },
@@ -210,7 +210,7 @@ func Test_makeConfigMap(t *testing.T) {
   annotations:
     description: foo
   expr: absent(http_requests_total{job="app"}) == 1
-  for: 2m
+  for: 6m
   labels:
     job: app
     severity: critical

slo/rules.go (+15 -25)

@@ -713,11 +713,9 @@ func (o Objective) IncreaseRules() (monitoringv1.RuleGroup, error) {
     }.replace(expr)

     rules = append(rules, monitoringv1.Rule{
-        Alert: o.AlertNameAbsent(),
-        Expr: intstr.FromString(expr.String()),
-        For: monitoringDuration(model.Duration(
-            (time.Duration(o.Window) / (28 * 24 * (60 / 2))).Round(time.Minute),
-        ).String()),
+        Alert: o.AlertNameAbsent(),
+        Expr: intstr.FromString(expr.String()),
+        For: monitoringDuration(o.AbsentDuration().String()),
         Labels: alertLabels,
         Annotations: o.commonRuleAnnotations(),
     })
@@ -755,11 +753,9 @@ func (o Objective) IncreaseRules() (monitoringv1.RuleGroup, error) {
     }.replace(expr)

     rules = append(rules, monitoringv1.Rule{
-        Alert: o.AlertNameAbsent(),
-        Expr: intstr.FromString(expr.String()),
-        For: monitoringDuration(model.Duration(
-            (time.Duration(o.Window) / (28 * 24 * (60 / 2))).Round(time.Minute),
-        ).String()),
+        Alert: o.AlertNameAbsent(),
+        Expr: intstr.FromString(expr.String()),
+        For: monitoringDuration(o.AbsentDuration().String()),
         Labels: alertLabels,
         Annotations: o.commonRuleAnnotations(),
     })
@@ -867,11 +863,9 @@ func (o Objective) IncreaseRules() (monitoringv1.RuleGroup, error) {
     alertLabels["severity"] = string(critical)

     rules = append(rules, monitoringv1.Rule{
-        Alert: o.AlertNameAbsent(),
-        Expr: intstr.FromString(expr.String()),
-        For: monitoringDuration(model.Duration(
-            (time.Duration(o.Window) / (28 * 24 * (60 / 2))).Round(time.Minute),
-        ).String()),
+        Alert: o.AlertNameAbsent(),
+        Expr: intstr.FromString(expr.String()),
+        For: monitoringDuration(o.AbsentDuration().String()),
         Labels: alertLabels,
         Annotations: o.commonRuleAnnotations(),
     })
@@ -894,11 +888,9 @@ func (o Objective) IncreaseRules() (monitoringv1.RuleGroup, error) {
     alertLabelsLe["severity"] = string(critical)

     rules = append(rules, monitoringv1.Rule{
-        Alert: o.AlertNameAbsent(),
-        Expr: intstr.FromString(expr.String()),
-        For: monitoringDuration(model.Duration(
-            (time.Duration(o.Window) / (28 * 24 * (60 / 2))).Round(time.Minute),
-        ).String()),
+        Alert: o.AlertNameAbsent(),
+        Expr: intstr.FromString(expr.String()),
+        For: monitoringDuration(o.AbsentDuration().String()),
         Labels: alertLabelsLe,
         Annotations: o.commonRuleAnnotations(),
     })
@@ -1037,11 +1029,9 @@ func (o Objective) IncreaseRules() (monitoringv1.RuleGroup, error) {
     alertLabels["severity"] = string(critical)

     rules = append(rules, monitoringv1.Rule{
-        Alert: o.AlertNameAbsent(),
-        Expr: intstr.FromString(expr.String()),
-        For: monitoringDuration(model.Duration(
-            (time.Duration(o.Window) / (28 * 24 * (60 / 2))).Round(time.Minute),
-        ).String()),
+        Alert: o.AlertNameAbsent(),
+        Expr: intstr.FromString(expr.String()),
+        For: monitoringDuration(o.AbsentDuration().String()),
         Labels: alertLabels,
         Annotations: o.commonRuleAnnotations(),
     })
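
What changed here: the removed expression derived the absent alert's "for" duration from the SLO window alone (window divided by 28 * 24 * 30 = 20160, rounded to the minute), so a 4-week SLO always got 2m and a 2-week SLO always got 1m, no matter how strict the objective was. The replacement delegates to Objective.AbsentDuration, added in slo/slo.go below, which scales the duration with the error budget of the most critical burn-rate window. A minimal standalone sketch of the old arithmetic (plain Go, not Pyrra's types):

package main

import (
	"fmt"
	"time"
)

// oldAbsentFor reproduces the arithmetic of the removed lines: the "for"
// duration was a fixed fraction of the SLO window (window / 20160),
// independent of the objective's target.
func oldAbsentFor(window time.Duration) time.Duration {
	return (window / (28 * 24 * (60 / 2))).Round(time.Minute)
}

func main() {
	fmt.Println(oldAbsentFor(28 * 24 * time.Hour)) // 4-week window -> 2m0s (the old test expectation)
	fmt.Println(oldAbsentFor(14 * 24 * time.Hour)) // 2-week window -> 1m0s
}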

slo/rules_test.go (+25 -25)

@@ -1263,7 +1263,7 @@ func TestObjective_IncreaseRules(t *testing.T) {
     }, {
         Alert: "SLOMetricAbsent",
         Expr: intstr.FromString(`absent(http_requests_total{job="thanos-receive-default"}) == 1`),
-        For: monitoringDuration("2m"),
+        For: monitoringDuration("10m"),
         Labels: map[string]string{"job": "thanos-receive-default", "slo": "monitoring-http-errors", "severity": "critical"},
     }},
 },
@@ -1280,7 +1280,7 @@ func TestObjective_IncreaseRules(t *testing.T) {
     }, {
         Alert: "SLOMetricAbsent",
         Expr: intstr.FromString(`absent(http_requests_total{job="thanos-receive-default"}) == 1`),
-        For: monitoringDuration("2m"),
+        For: monitoringDuration("10m"),
         Labels: map[string]string{"slo": "monitoring-http-errors", "severity": "critical"},
     }},
 },
@@ -1297,7 +1297,7 @@ func TestObjective_IncreaseRules(t *testing.T) {
     }, {
         Alert: "SLOMetricAbsent",
         Expr: intstr.FromString(`absent(http_requests_total{handler=~"/api.*",job="thanos-receive-default"}) == 1`),
-        For: monitoringDuration("2m"),
+        For: monitoringDuration("10m"),
         Labels: map[string]string{"slo": "monitoring-http-errors", "severity": "critical"},
     }},
 },
@@ -1314,7 +1314,7 @@ func TestObjective_IncreaseRules(t *testing.T) {
     }, {
         Alert: "SLOMetricAbsent",
         Expr: intstr.FromString(`absent(grpc_server_handled_total{grpc_method="Write",grpc_service="conprof.WritableProfileStore",job="api"}) == 1`),
-        For: monitoringDuration("2m"),
+        For: monitoringDuration("3m"),
         Labels: map[string]string{"grpc_method": "Write", "grpc_service": "conprof.WritableProfileStore", "job": "api", "slo": "monitoring-grpc-errors", "severity": "critical"},
     }},
 },
@@ -1331,7 +1331,7 @@ func TestObjective_IncreaseRules(t *testing.T) {
     }, {
         Alert: "SLOMetricAbsent",
         Expr: intstr.FromString(`absent(grpc_server_handled_total{grpc_method="Write",grpc_service="conprof.WritableProfileStore",job="api"}) == 1`),
-        For: monitoringDuration("2m"),
+        For: monitoringDuration("3m"),
         Labels: map[string]string{"grpc_method": "Write", "grpc_service": "conprof.WritableProfileStore", "slo": "monitoring-grpc-errors", "severity": "critical"},
     }},
 },
@@ -1352,12 +1352,12 @@ func TestObjective_IncreaseRules(t *testing.T) {
     }, {
         Alert: "SLOMetricAbsent",
         Expr: intstr.FromString(`absent(http_request_duration_seconds_count{code=~"2..",job="metrics-service-thanos-receive-default"}) == 1`),
-        For: monitoringDuration("2m"),
+        For: monitoringDuration("6m"),
         Labels: map[string]string{"job": "metrics-service-thanos-receive-default", "slo": "monitoring-http-latency", "severity": "critical"},
     }, {
         Alert: "SLOMetricAbsent",
         Expr: intstr.FromString(`absent(http_request_duration_seconds_bucket{code=~"2..",job="metrics-service-thanos-receive-default",le="1"}) == 1`),
-        For: monitoringDuration("2m"),
+        For: monitoringDuration("6m"),
         Labels: map[string]string{"job": "metrics-service-thanos-receive-default", "slo": "monitoring-http-latency", "le": "1", "severity": "critical"},
     }},
 },
@@ -1375,7 +1375,7 @@ func TestObjective_IncreaseRules(t *testing.T) {
         Record: "http_request_duration_seconds:increase4w",
         Expr: intstr.FromString(`histogram_fraction(0, 1, increase(http_request_duration_seconds{code=~"2..",job="metrics-service-thanos-receive-default"}[4w])) * histogram_count(increase(http_request_duration_seconds{code=~"2..",job="metrics-service-thanos-receive-default"}[4w]))`),
         Labels: map[string]string{"job": "metrics-service-thanos-receive-default", "slo": "monitoring-http-latency", "le": "1"},
-        //}, {
+        // }, {
         // Alert: "SLOMetricAbsent",
         // Expr: intstr.FromString(`absent(http_request_duration_seconds{code=~"2..",job="metrics-service-thanos-receive-default"}) == 1`),
         // For: monitoringDuration("2m"),
@@ -1399,12 +1399,12 @@ func TestObjective_IncreaseRules(t *testing.T) {
     }, {
         Alert: "SLOMetricAbsent",
         Expr: intstr.FromString(`absent(http_request_duration_seconds_count{code=~"2..",job="metrics-service-thanos-receive-default"}) == 1`),
-        For: monitoringDuration("2m"),
+        For: monitoringDuration("6m"),
         Labels: map[string]string{"slo": "monitoring-http-latency", "severity": "critical"},
     }, {
         Alert: "SLOMetricAbsent",
         Expr: intstr.FromString(`absent(http_request_duration_seconds_bucket{code=~"2..",job="metrics-service-thanos-receive-default",le="1"}) == 1`),
-        For: monitoringDuration("2m"),
+        For: monitoringDuration("6m"),
         Labels: map[string]string{"slo": "monitoring-http-latency", "le": "1", "severity": "critical"},
     }},
 },
@@ -1425,12 +1425,12 @@ func TestObjective_IncreaseRules(t *testing.T) {
     }, {
         Alert: "SLOMetricAbsent",
         Expr: intstr.FromString(`absent(http_request_duration_seconds_count{code=~"2..",handler=~"/api.*",job="metrics-service-thanos-receive-default"}) == 1`),
-        For: monitoringDuration("2m"),
+        For: monitoringDuration("6m"),
         Labels: map[string]string{"slo": "monitoring-http-latency", "severity": "critical"},
     }, {
         Alert: "SLOMetricAbsent",
         Expr: intstr.FromString(`absent(http_request_duration_seconds_bucket{code=~"2..",handler=~"/api.*",job="metrics-service-thanos-receive-default",le="1"}) == 1`),
-        For: monitoringDuration("2m"),
+        For: monitoringDuration("6m"),
         Labels: map[string]string{"slo": "monitoring-http-latency", "le": "1", "severity": "critical"},
     }},
 },
@@ -1451,12 +1451,12 @@ func TestObjective_IncreaseRules(t *testing.T) {
     }, {
         Alert: "SLOMetricAbsent",
         Expr: intstr.FromString(`absent(grpc_server_handling_seconds_count{grpc_method="Write",grpc_service="conprof.WritableProfileStore",job="api"}) == 1`),
-        For: monitoringDuration("1m"),
+        For: monitoringDuration("2m"),
         Labels: map[string]string{"slo": "monitoring-grpc-latency", "job": "api", "grpc_method": "Write", "grpc_service": "conprof.WritableProfileStore", "severity": "critical"},
     }, {
         Alert: "SLOMetricAbsent",
         Expr: intstr.FromString(`absent(grpc_server_handling_seconds_bucket{grpc_method="Write",grpc_service="conprof.WritableProfileStore",job="api",le="0.6"}) == 1`),
-        For: monitoringDuration("1m"),
+        For: monitoringDuration("2m"),
         Labels: map[string]string{"slo": "monitoring-grpc-latency", "job": "api", "grpc_method": "Write", "grpc_service": "conprof.WritableProfileStore", "le": "0.6", "severity": "critical"},
     }},
 },
@@ -1477,12 +1477,12 @@ func TestObjective_IncreaseRules(t *testing.T) {
     }, {
         Alert: "SLOMetricAbsent",
         Expr: intstr.FromString(`absent(grpc_server_handling_seconds_count{grpc_method="Write",grpc_service="conprof.WritableProfileStore",job="api"}) == 1`),
-        For: monitoringDuration("1m"),
+        For: monitoringDuration("2m"),
         Labels: map[string]string{"slo": "monitoring-grpc-latency", "grpc_method": "Write", "grpc_service": "conprof.WritableProfileStore", "severity": "critical"},
     }, {
         Alert: "SLOMetricAbsent",
         Expr: intstr.FromString(`absent(grpc_server_handling_seconds_bucket{grpc_method="Write",grpc_service="conprof.WritableProfileStore",job="api",le="0.6"}) == 1`),
-        For: monitoringDuration("1m"),
+        For: monitoringDuration("2m"),
         Labels: map[string]string{"slo": "monitoring-grpc-latency", "grpc_method": "Write", "grpc_service": "conprof.WritableProfileStore", "le": "0.6", "severity": "critical"},
     }},
 },
@@ -1499,7 +1499,7 @@ func TestObjective_IncreaseRules(t *testing.T) {
     }, {
         Alert: "SLOMetricAbsent",
         Expr: intstr.FromString(`absent(prometheus_operator_reconcile_operations_total) == 1`),
-        For: monitoringDuration("1m"),
+        For: monitoringDuration("5m"),
         Labels: map[string]string{"slo": "monitoring-prometheus-operator-errors", "severity": "critical"},
     }, {
         Record: "prometheus_operator_reconcile_errors:increase2w",
@@ -1508,7 +1508,7 @@ func TestObjective_IncreaseRules(t *testing.T) {
     }, {
         Alert: "SLOMetricAbsent",
         Expr: intstr.FromString(`absent(prometheus_operator_reconcile_errors_total) == 1`),
-        For: monitoringDuration("1m"),
+        For: monitoringDuration("5m"),
         Labels: map[string]string{"slo": "monitoring-prometheus-operator-errors", "severity": "critical"},
     }},
 },
@@ -1525,7 +1525,7 @@ func TestObjective_IncreaseRules(t *testing.T) {
     }, {
         Alert: "SLOMetricAbsent",
         Expr: intstr.FromString(`absent(prometheus_operator_reconcile_operations_total) == 1`),
-        For: monitoringDuration("1m"),
+        For: monitoringDuration("5m"),
         Labels: map[string]string{"slo": "monitoring-prometheus-operator-errors", "severity": "critical"},
     }, {
         Record: "prometheus_operator_reconcile_errors:increase2w",
@@ -1534,7 +1534,7 @@ func TestObjective_IncreaseRules(t *testing.T) {
     }, {
         Alert: "SLOMetricAbsent",
         Expr: intstr.FromString(`absent(prometheus_operator_reconcile_errors_total) == 1`),
-        For: monitoringDuration("1m"),
+        For: monitoringDuration("5m"),
         Labels: map[string]string{"slo": "monitoring-prometheus-operator-errors", "severity": "critical"},
     }},
 },
@@ -1551,7 +1551,7 @@ func TestObjective_IncreaseRules(t *testing.T) {
     }, {
         Alert: "APIServerMetricAbsent",
         Expr: intstr.FromString(`absent(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}) == 1`),
-        For: monitoringDuration("1m"),
+        For: monitoringDuration("5m"),
         Labels: map[string]string{"job": "apiserver", "slo": "apiserver-write-response-errors", "severity": "critical"},
     }},
 },
@@ -1572,12 +1572,12 @@ func TestObjective_IncreaseRules(t *testing.T) {
     }, {
         Alert: "SLOMetricAbsent",
         Expr: intstr.FromString(`absent(apiserver_request_duration_seconds_count{job="apiserver",resource=~"resource|",verb=~"LIST|GET"}) == 1`),
-        For: monitoringDuration("1m"),
+        For: monitoringDuration("5m"),
         Labels: map[string]string{"job": "apiserver", "slo": "apiserver-read-resource-latency", "severity": "critical"},
     }, {
         Alert: "SLOMetricAbsent",
         Expr: intstr.FromString(`absent(apiserver_request_duration_seconds_bucket{job="apiserver",le="0.1",resource=~"resource|",verb=~"LIST|GET"}) == 1`),
-        For: monitoringDuration("1m"),
+        For: monitoringDuration("5m"),
         Labels: map[string]string{"job": "apiserver", "slo": "apiserver-read-resource-latency", "le": "0.1", "severity": "critical"},
     }},
 },
@@ -1598,7 +1598,7 @@ func TestObjective_IncreaseRules(t *testing.T) {
     }, {
         Alert: "SLOMetricAbsent",
         Expr: intstr.FromString(`absent(up) == 1`),
-        For: monitoringDuration("2m"),
+        For: monitoringDuration("10m"),
         Labels: map[string]string{"severity": "critical", "slo": "up-targets"},
     }},
 },
@@ -1619,7 +1619,7 @@ func TestObjective_IncreaseRules(t *testing.T) {
     }, {
         Alert: "SLOMetricAbsent",
         Expr: intstr.FromString(`absent(up{instance!~"(127.0.0.1|localhost).*"}) == 1`),
-        For: monitoringDuration("2m"),
+        For: monitoringDuration("10m"),
         Labels: map[string]string{"severity": "critical", "slo": "up-targets"},
     }},
 },

slo/slo.go (+15)

@@ -55,6 +55,21 @@ func (o Objective) Exhausts(factor float64) model.Duration {
     return model.Duration(time.Second * time.Duration(time.Duration(o.Window).Seconds()/factor))
 }

+// AbsentDuration calculates the duration when absent alerts should fire.
+// The idea is as follows: Use the most critical of the multi burn rate alerts.
+// For that alert to fire, both the short AND long windows have to be above the threshold.
+// The long window takes the - longest - to fire.
+// Assuming absence of the metric means 100% error rate,
+// the time it takes to fire is the duration for the long window to go above the threshold (factor * objective).
+// Finally, we add the "for" duration we add to the multi burn rate alerts.
+func (o Objective) AbsentDuration() model.Duration {
+    mostCritical := o.Windows()[0]
+    mostCriticalThreshold := mostCritical.Factor * (1 - o.Target)
+    mostCriticalDuration := time.Duration(mostCriticalThreshold*mostCritical.Long.Seconds()) * time.Second
+    mostCriticalDuration += mostCritical.For
+    return model.Duration(mostCriticalDuration.Round(time.Minute))
+}
+
 type IndicatorType int

 const (
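
To make the new calculation concrete, here is a standalone sketch of the same arithmetic outside Pyrra's types. The window parameters are assumptions for illustration: a most-critical multi-burn-rate window with factor 14, a 1h long window, and a 2m "for" duration. With targets of 99%, 99.5%, and 99.9% (also assumed), these choices reproduce the 10m, 6m, and 3m values that now appear for the 4-week SLOs in slo/rules_test.go; Pyrra's real values come from o.Windows()[0].

package main

import (
	"fmt"
	"time"
)

// absentDuration mirrors AbsentDuration above: factor, long, and forDur stand
// in for mostCritical.Factor, mostCritical.Long, and mostCritical.For.
func absentDuration(target, factor float64, long, forDur time.Duration) time.Duration {
	threshold := factor * (1 - target)                          // burn-rate threshold of the long window
	d := time.Duration(threshold*long.Seconds()) * time.Second  // time for a 100% error rate to breach it
	d += forDur                                                 // plus the burn-rate alert's own "for"
	return d.Round(time.Minute)
}

func main() {
	// Assumed most-critical window for a 4-week SLO: factor 14, long 1h, for 2m.
	fmt.Println(absentDuration(0.99, 14, time.Hour, 2*time.Minute))  // 8.4m + 2m -> 10m0s
	fmt.Println(absentDuration(0.995, 14, time.Hour, 2*time.Minute)) // 4.2m + 2m -> 6m0s
	fmt.Println(absentDuration(0.999, 14, time.Hour, 2*time.Minute)) // ~50s + 2m -> 3m0s
}

The net effect: instead of waiting a fixed fraction of the SLO window, the absent alert now waits roughly as long as the most critical burn-rate alert would need to fire if the metric were completely absent (a 100% error rate), plus that alert's "for" duration.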
