-
Notifications
You must be signed in to change notification settings - Fork 3
/
riemann.config
105 lines (87 loc) · 3.46 KB
/
riemann.config
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
; -*- mode: clojure; -*-
; vim: filetype=clojure
; Send Riemann's own log output to a file.
(logging/init {:file "/var/log/riemann/riemann.log"})
; Load the Alerta integration. The rest of this config reaches it through
; fully-qualified names (alerta/alert, alerta/heartbeat).
(include "alerta.clj")
; NOTE: a `(:require '[alerta :refer [alert heartbeat]])` form used to follow
; the include. Outside an `ns` form that is just a keyword lookup on a vector:
; it evaluates to nil and loads or refers nothing, so it has been dropped.
; Do not replace it with a real `(require ... :refer [alert heartbeat])`:
; referring those names would clash with the `alert` and `heartbeat` vars
; this config defines itself.
; Listen on the local interface over TCP (5555), UDP (5555), and websockets
; (5556)
; All three listeners share one options map so they always bind to the same
; address.
(let [listen-opts {:host "127.0.0.1"}]
  (tcp-server listen-opts)
  (udp-server listen-opts)
  (ws-server listen-opts))
; Instrumentation is switched off.
(instrumentation {:enabled? false})
; Reap expired events every 10 seconds. The keys listed under :keep-keys are
; handed to the reaper; the expired-heartbeat stream further down reads
; :metric and :ttl from expired events, so both must stay in this list.
(let [kept-keys [:host :service :tags :metric :ttl :index-time]]
  (periodically-expire 10 {:keep-keys kept-keys}))
; some helpful functions
(defn now
  "Returns the value of (unix-time) rounded down with Math/floor."
  []
  (let [t (unix-time)]
    (Math/floor t)))
; Alerta configuration
; `alert` and `heartbeat` are built by the Alerta integration from the same
; endpoint and API key; only `alert` additionally enables :debug.
(def alert
  (alerta/alert {:api-key  "demo-key"
                 :debug    true
                 :endpoint "https://alerta-api.herokuapp.com"}))
(def heartbeat
  (alerta/heartbeat {:api-key  "demo-key"
                     :endpoint "https://alerta-api.herokuapp.com"}))
; set of severity functions
(defn severity
  "Returns a stream that stamps each event with :state `severity` and
  :description `message`, then passes it on to `children`.

  severity - string stored under :state (e.g. \"major\")
  message  - string stored under :description
  children - zero or more child streams that receive the stamped event

  The `with` stream is built once, when this function is called, instead of
  being rebuilt for every incoming event; `with` already returns a function
  of one event, so no extra wrapper is needed."
  [severity message & children]
  (apply with {:state severity :description message} children))
; One shortcut per severity level: each forwards its arguments (a message
; followed by child streams) to `severity` with the level name filled in.
(defn informational [& args] (apply severity "informational" args))
(defn normal        [& args] (apply severity "normal" args))
(defn warning       [& args] (apply severity "warning" args))
(defn minor         [& args] (apply severity "minor" args))
(defn major         [& args] (apply severity "major" args))
(defn critical      [& args] (apply severity "critical" args))
; thresholding
; Stream wiring.
;   index       - the index, fed through `default` so events get :ttl 900
;   dedup-alert - `alert` wrapped in `changed-state`, shared by every rule below
(let [index (default :ttl 900 (index))
      dedup-alert (changed-state alert)]

  ; Index every event.
  (streams
    index)

  ; Forward events to the Alerta heartbeat stream, throttled to 1 per 30.
  (streams
    (throttle 1 30
      heartbeat))

  ; Expired events: print each one, and raise a critical alert when a
  ; "heartbeat" service expires.
  (streams
    (expired
      prn
      (match :service "heartbeat"
        (fn [event]
          ; elapsed = current time minus the heartbeat's :metric.
          (let [elapsed (- (now) (:metric event))
                ttl (:ttl event)]
            ; The description depends on this event's ttl, so the stream is
            ; built per event and then applied to it immediately.
            ((with {:event "Heartbeat" :group "Riemann" :metric elapsed}
               (critical (str "No heartbeat from host in last " ttl " seconds") dedup-alert)) event))))))

  ; Non-expired events: print each one, then apply per-service thresholds.
  ; `(splitp < x t1 s1 t2 s2 ... default)` tries clauses top to bottom and
  ; picks the first whose threshold is below x, else the default stream.
  (streams
    (where (not (state "expired"))
      prn
      (match :service "heartbeat"
        (fn [event]
          (let [elapsed (- (now) (:metric event))
                ttl (:ttl event)]
            ((with {:event "Heartbeat" :group "Riemann" :metric elapsed}
               (splitp < elapsed
                 90 (major "Heartbeat stale by more than 90 seconds" dedup-alert)
                 60 (minor "Heartbeat stale by more than 60 seconds" dedup-alert)
                 30 (warning "Heartbeat stale by more than 30 seconds" dedup-alert)
                 ; The message must be a single string: without `str`, ttl and
                 ; the trailing text would be treated as child streams.
                 (normal (str "Heartbeat received within " ttl " seconds and not stale") dedup-alert))) event))))
      (match :service "load"
        (with {:event "SystemLoad" :group "OS"}
          (splitp < metric
            0.7 (major "15-min load average is very high" dedup-alert)
            0.4 (warning "15-min load average is high" dedup-alert)
            (normal "15-min load average is OK" dedup-alert))))
      ; cpu, memory and disk metrics are multiplied by 100 before comparison,
      ; so the thresholds below read as percentages.
      (match :service "cpu"
        (with {:event "CpuUtil" :group "OS"}
          (splitp < (* metric 100)
            99 (critical "CPU utilisation >99%" dedup-alert)
            95 (major "CPU utilisation >95%" dedup-alert)
            90 (minor "CPU utilisation >90%" dedup-alert)
            80 (warning "CPU utilisation >80%" dedup-alert)
            (normal "CPU utilisation is OK" dedup-alert))))
      (match :service "memory"
        (with {:event "MemUtil" :group "OS"}
          (splitp < (* metric 100)
            90 (major "Memory utilisation >90%" dedup-alert)
            75 (warning "Memory utilisation >75%" dedup-alert)
            (normal "Memory utilisation is OK" dedup-alert))))
      ; Regex match: any service whose name contains "disk".
      (match :service #"disk"
        (with {:event "FsUtil" :group "OS"}
          (splitp < (* metric 100)
            95 (critical "Filesystem utilisation >95% " dedup-alert)
            90 (major "Filesystem utilisation >90%" dedup-alert)
            80 (warning "Filesystem utilisation >80%" dedup-alert)
            (normal "Filesystem utilisation is OK" dedup-alert)))))))