Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 18 additions & 10 deletions monitoring/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,18 @@ services:

# Runs on your node(s) and forwards node(host) metrics to Prometheus.
master-nodeexporter:
image: prom/node-exporter:v0.14.0
image: prom/node-exporter:v0.15.2
expose:
- 9100
volumes:
- /proc:/host/proc:ro
- /proc/1/net/dev:/host/proc/net/dev:ro
- /sys:/host/sys:ro
- /:/rootfs:ro
# Exec-form argument list: each item is handed to node_exporter verbatim,
# with no shell in between to strip quoting.
command:
  - '--path.procfs=/host/proc'
  - '--path.sysfs=/host/sys'
  # No inner double quotes around the regex: in exec form they would become
  # literal characters of the pattern and stop it from matching any mount
  # point. `$$` is Compose's escape for a single `$`.
  - '--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host|etc)($$|/)'
restart: always
labels:
container_group: monitoring
Expand Down Expand Up @@ -46,7 +55,7 @@ services:

# Storage and search backend. Gets all metrics from cAdvisor and Nodeexporter and is the backend that Grafana runs on.
prometheus:
image: prom/prometheus:v1.7.1
image: prom/prometheus:v2.3.2
expose:
- 9090
volumes:
Expand All @@ -57,14 +66,13 @@ services:
- ./prometheus:/etc/prometheus
restart: always
command:
- "-config.file=/etc/prometheus/prometheus.yml"
- "-storage.local.path=/prometheus"
- "-web.console.libraries=/etc/prometheus/console_libraries"
- "-web.console.templates=/etc/prometheus/consoles"
- "-web.listen-address=:9090"
- "-alertmanager.url=http://alertmanager:9093"
- "-storage.local.memory-chunks=300000"
- "-storage.local.retention=744h"
- "--config.file=/etc/prometheus/prometheus.yml"
- "--storage.tsdb.path=/prometheus"
- "--query.lookback-delta=25s"
- "--web.console.libraries=/etc/prometheus/console_libraries"
- "--web.console.templates=/etc/prometheus/consoles"
- "--web.listen-address=:9090"
- "--storage.tsdb.retention=744h"
labels:
container_group: monitoring
logging:
Expand Down
Empty file modified monitoring/prometheus/prometheus.yml
100644 → 100755
Empty file.
18 changes: 10 additions & 8 deletions monitoring/prometheus/rules/alert.rules_container-groups
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@

ALERT monitoring_container_group_missing_members
IF count(rate(container_last_seen{name=~".+",container_group=~"monitoring"}[5m])) by (container_group,host) < 10
FOR 5m
ANNOTATIONS {
summary = "CONTAINER GROUP WARNING: CONTAINER GROUP '{{ $labels.container_group }}' on '{{ $labels.host }}'",
description = "{{ $labels.container_group }} is missing containers. Container count is {{ $value }}/13.",
}
groups:
  - name: container_groups
    rules:
      # Fires when the number of `container_last_seen` series in the
      # "monitoring" container group, counted per (container_group, host),
      # stays below 6 for 5 minutes.
      - alert: monitoring_container_group_missing_members
        expr: count(rate(container_last_seen{name=~".+",container_group=~"monitoring"}[5m])) by (container_group,host) < 6
        for: 5m
        annotations:
          # Colon after "WARNING" restored so this summary matches the
          # "<KIND> WARNING: ..." shape used by the other alert summaries.
          summary: "CONTAINER GROUP WARNING: CONTAINER GROUP '{{ $labels.container_group }}' on '{{ $labels.host }}'"
          # NOTE(review): the "/13" total is hard-coded while the threshold
          # above is 6 - confirm the expected container count.
          description: "{{ $labels.container_group }} is missing containers. Container count is {{ $value }}/13."

78 changes: 37 additions & 41 deletions monitoring/prometheus/rules/alert.rules_containers
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,45 +1,41 @@
groups:
- name: containers
rules:
# Per-(name, host) sum of the 5-minute CPU-seconds rate, scaled by 100 and
# compared against 50; must hold for 5 minutes before firing.
- alert: high_cpu_usage_on_container
  for: 5m
  expr: >-
    sum(rate(container_cpu_usage_seconds_total{name=~".+"}[5m])) by (name,host) * 100 > 50
  annotations:
    description: >-
      {{ $labels.name }} is using a LOT of CPU. CPU usage is {{ humanize $value}}%.
    summary: >-
      HIGH CPU USAGE WARNING: CONTAINER '{{ $labels.name }}' on '{{ $labels.host }}'

# RSS of the logging_elasticsearch_1 container, summed per (host, name),
# above 1200000000 bytes for 5 minutes.
- alert: elasticsearch_eating_memory
  for: 5m
  expr: >-
    sum(container_memory_rss{name=~"logging_elasticsearch_1"}) by (host, name) > 1200000000
  annotations:
    description: >-
      {{ $labels.name }} is eating up a LOT of memory. Memory consumption of {{ $labels.name }} is at {{ humanize $value}}.
    summary: >-
      HIGH MEMORY USAGE WARNING: CONTAINER '{{ $labels.name }}' on '{{ $labels.host }}'

ALERT high_cpu_usage_on_container
IF sum(rate(container_cpu_usage_seconds_total{name=~".+"}[5m])) by (name,host) * 100 > 50
FOR 5m
ANNOTATIONS {
summary = "HIGH CPU USAGE WARNING: CONTAINER '{{ $labels.name }}' on '{{ $labels.host }}'",
description = "{{ $labels.name }} is using a LOT of CPU. CPU usage is {{ humanize $value}}%.",
}
# RSS of the monitoring_prometheus_1 container, summed per (host, name),
# above 1200000000 bytes for 5 minutes.
- alert: prometheus_eating_memory
  for: 5m
  expr: >-
    sum(container_memory_rss{name=~"monitoring_prometheus_1"}) by (host, name) > 1200000000
  annotations:
    description: >-
      {{ $labels.name }} is eating up a LOT of memory. Memory consumption of {{ $labels.name }} is at {{ humanize $value}}.
    summary: >-
      HIGH MEMORY USAGE WARNING: CONTAINER '{{ $labels.name }}' on '{{ $labels.host }}'

ALERT elasticsearch_eating_memory
IF sum(container_memory_rss{name=~"logging_elasticsearch_1"}) by (host, name) > 1200000000
FOR 5m
ANNOTATIONS {
summary = "HIGH MEMORY USAGE WARNING: CONTAINER '{{ $labels.name }}' on '{{ $labels.host }}'",
description = "{{ $labels.name }} is eating up a LOT of memory. Memory consumption of {{ $labels.name }} is at {{ humanize $value}}.",
}
# Catch-all memory alert: any named container other than
# logging_elasticsearch_1 and monitoring_prometheus_1 (the selector excludes
# both) whose RSS, summed per (host, name), exceeds 700000000 bytes for
# 5 minutes.
- alert: container_eating_memory
  for: 5m
  expr: >-
    sum(container_memory_rss{name=~".+",name!="logging_elasticsearch_1",name!="monitoring_prometheus_1"}) by (host,name) > 700000000
  annotations:
    description: >-
      {{ $labels.name }} is eating up a LOT of memory. Memory consumption of {{ $labels.name }} is at {{ humanize $value}}.
    summary: >-
      HIGH MEMORY USAGE WARNING: CONTAINER '{{ $labels.name }}' on '{{ $labels.host }}'

ALERT prometheus_eating_memory
IF sum(container_memory_rss{name=~"monitoring_prometheus_1"}) by (host, name) > 1200000000
FOR 5m
ANNOTATIONS {
summary = "HIGH MEMORY USAGE WARNING: CONTAINER '{{ $labels.name }}' on '{{ $labels.host }}'",
description = "{{ $labels.name }} is eating up a LOT of memory. Memory consumption of {{ $labels.name }} is at {{ humanize $value}}.",
}

ALERT container_eating_memory
IF sum(container_memory_rss{name=~".+",name!="logging_elasticsearch_1",name!="monitoring_prometheus_1"}) by (host,name) > 700000000
FOR 5m
ANNOTATIONS {
summary = "HIGH MEMORY USAGE WARNING: CONTAINER '{{ $labels.name }}' on '{{ $labels.host }}'",
description = "{{ $labels.name }} is eating up a LOT of memory. Memory consumption of {{ $labels.name }} is at {{ humanize $value}}.",
}


ALERT container_down
IF (absent(container_memory_usage_bytes{name="logging_elasticsearch_1"})
or absent(container_memory_usage_bytes{name="monitoring_prometheus_1"})
)
FOR 5m
LABELS { severity = "Critical" }
ANNOTATIONS {
summary= "CONTAINER '{{ $labels.name }}' down",
description = "container with name '{{ $labels.name }}' is down for more than 5 minutes"
}
# Fires when no container_memory_usage_bytes series exists for either
# logging_elasticsearch_1 or monitoring_prometheus_1 for 5 minutes.
- alert: container_down
  expr: (absent(container_memory_usage_bytes{name="logging_elasticsearch_1"}) or absent(container_memory_usage_bytes{name="monitoring_prometheus_1"}))
  for: 5m
  labels:
    severity: Critical
  annotations:
    summary: "CONTAINER '{{ $labels.name }}' down"
    # "down for more than" - the stray colon ("for:") left behind by the
    # FOR -> for: rule-format conversion has been removed from the message.
    description: "container with name '{{ $labels.name }}' is down for more than 5 minutes"


119 changes: 59 additions & 60 deletions monitoring/prometheus/rules/alert.rules_nodes
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,60 +1,59 @@

ALERT monitoring_service_down
IF up == 0
FOR 5m
ANNOTATIONS {
summary = "MONITORING SERVICE DOWN WARNING: NODE '{{ $labels.host }}'",
description = "The monitoring service '{{ $labels.job }}' is down.",
}

ALERT high_load_on_node
IF node_load5 > 2
FOR 3m
ANNOTATIONS {
summary = "HIGH LOAD WARINING: NODE '{{ $labels.host }}'",
description = "{{ $labels.host}} is under high load. Load is {{ humanize $value }}.",
}

ALERT node_running_out_of_memory
IF node_memory_MemAvailable < 1500000000
FOR 5m
ANNOTATIONS {
summary = "LOW MEMORY WARING: NODE '{{ $labels.host }}'",
description = "Less than 1.5GB of free memory. Free memory at {{ humanize $value }} GB.",
}

ALERT node_running_out_of_disk_space
IF node_filesystem_free{mountpoint="/etc/hostname"} < 40000000000
FOR 5m
ANNOTATIONS {
summary = "LOW DISK SPACE WARING: NODE '{{ $labels.host }}'",
description = "Less than 40GB of free disk space. Free disk space at {{ humanize $value }} GB.",
}

ALERT node_low_disk_space
IF 100 * min(node_filesystem_avail/node_filesystem_size{mountpoint=~"/etc/hostname|^/mnt.*|^/data.*"}) by (device, fstype, host, instance, job) < 10.0
FOR 5m
LABELS { severity = "Warning" }
ANNOTATIONS {
summary = "LOW DISK SPACE WARING: NODE '{{ $labels.host }}'",
description = "Warning, node '{{ $labels.host }}' has less than 10% of free disk space on device {{ $labels.device }} with mount point {{ $labels.mountpoint }}. Available disk space at {{ humanize $value }}%",
}

ALERT node_restarted
IF node_time - node_boot_time < 300
FOR 1m
LABELS { severity = "Warning" }
ANNOTATIONS {
summary = "NODE RESTARTED WARNING: NODE '{{ $labels.host }}'",
description = "Warning: Node '{{ $labels.host }}' was restarted at {{ $value }} seconds ago.",
}

################################### TEST ALERT

ALERT TESTING_high_load_on_node
IF node_load1 > 0
FOR 1s
ANNOTATIONS {
summary = "+++ TESTING ++++ TESTING ++++ TESTING +++ ::: HIGH LOAD WARNING: NODE '{{ $labels.host }}'",
description = "{{ $labels.host}} is under high load. Load is {{ humanize $value }}.",
}
groups:
  - name: nodes
    rules:
      # All annotation templates below use `$labels.<name>`. The previous
      # `$labels:.<name>` spelling is not valid Go template syntax.

      # A scrape target has reported up == 0 for 5 minutes.
      - alert: monitoring_service_down
        expr: up == 0
        for: 5m
        annotations:
          summary: "MONITORING SERVICE DOWN WARNING: NODE '{{ $labels.host }}'"
          description: "The monitoring service '{{ $labels.job }}' is down."

      # 5-minute load average above 2 for 3 minutes.
      - alert: high_load_on_node
        expr: node_load5 > 2
        for: 3m
        annotations:
          summary: "HIGH LOAD WARNING: NODE '{{ $labels.host }}'"
          description: "{{ $labels.host}} is under high load. Load is {{ humanize $value }}."

      # MemAvailable below 1500000000 bytes for 5 minutes.
      - alert: node_running_out_of_memory
        expr: node_memory_MemAvailable < 1500000000
        for: 5m
        annotations:
          summary: "LOW MEMORY WARNING: NODE '{{ $labels.host }}'"
          # `humanize` already appends the SI prefix (e.g. "1.4G"), so only
          # "B" is added here instead of " GB".
          description: "Less than 1.5GB of free memory. Free memory at {{ humanize $value }}B."

      # Free space on the filesystem mounted at /etc/hostname below
      # 40000000000 bytes for 5 minutes.
      - alert: node_running_out_of_disk_space
        expr: node_filesystem_free{mountpoint="/etc/hostname"} < 40000000000
        for: 5m
        annotations:
          summary: "LOW DISK SPACE WARNING: NODE '{{ $labels.host }}'"
          description: "Less than 40GB of free disk space. Free disk space at {{ humanize $value }}B."

      # Less than 10% available space on the selected mount points.
      - alert: node_low_disk_space
        expr: 100 * min(node_filesystem_avail/node_filesystem_size{mountpoint=~"/etc/hostname|^/mnt.*|^/data.*"}) by (device, fstype, host, instance, job) < 10.0
        for: 5m
        # Severity label carried over from the pre-migration rule.
        labels:
          severity: Warning
        annotations:
          summary: "LOW DISK SPACE WARNING: NODE '{{ $labels.host }}'"
          # NOTE(review): `mountpoint` is not in the `by (...)` clause above,
          # so `$labels.mountpoint` will likely render empty - confirm and
          # either add it to the grouping or drop it from the text.
          description: "Warning, node '{{ $labels.host }}' has less than 10% of free disk space on device {{ $labels.device }} with mount point {{ $labels.mountpoint }}. Available disk space at {{ humanize $value }}%"

      # node_time minus node_boot_time is under 300 seconds for 1 minute.
      - alert: node_restarted
        expr: node_time - node_boot_time < 300
        for: 1m
        # Severity label carried over from the pre-migration rule.
        labels:
          severity: Warning
        annotations:
          summary: "NODE RESTARTED WARNING: NODE '{{ $labels.host }}'"
          description: "Warning: Node '{{ $labels.host }}' was restarted at {{ $value }} seconds ago."

      ################################### TEST ALERT
      # Pending whenever node_load1 > 0, with only a 1s hold time.
      # NOTE(review): looks like a pipeline smoke test - confirm it should
      # ship enabled.
      - alert: TESTING_high_load_on_node
        expr: node_load1 > 0
        for: 1s
        annotations:
          summary: "+++ TESTING ++++ TESTING ++++ TESTING +++ ::: HIGH LOAD WARNING: NODE '{{ $labels.host }}'"
          description: "{{ $labels.host}} is under high load. Load is {{ humanize $value }}."

20 changes: 12 additions & 8 deletions monitoring/prometheus/rules/alert.rules_sites
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
ALERT service_endpoint_down
IF probe_success{job='service'} == 0
FOR 3m
LABELS { severity = "Critical" }
ANNOTATIONS {
summary = "ENGOPS SERVICE NON-OPERATIONAL",
description = "Service {{ $labels.service_name }} with endpoint {{ $labels.service_url }} is not reachable. Please check whether the instance is running",
}
groups:
  - name: sites
    rules:
      # A probe in the 'service' job has reported probe_success == 0 for
      # 3 minutes.
      - alert: service_endpoint_down
        for: 3m
        expr: probe_success{job='service'} == 0
        labels:
          severity: "Critical"
        annotations:
          # Templated text is quoted so YAML always reads it as one string.
          description: "Service {{ $labels.service_name }} with endpoint {{ $labels.service_url }} is not reachable. Please check whether the instance is running"
          summary: "ENGOPS SERVICE NON-OPERATIONAL"