uschtwill · VladimirDe · Sep 3, 2018 · Sep 13, 2018 · Sep 13, 2018
diff --git a/monitoring/docker-compose.yml b/monitoring/docker-compose.yml
@@ -9,9 +9,18 @@ services:
 
   # Runs on your node(s) and forwards node(host) metrics to Prometheus.
   master-nodeexporter:
-    image: prom/node-exporter:v0.14.0
+    image: prom/node-exporter:v0.15.2
     expose:
       - 9100
+    volumes:
+      - /proc:/host/proc:ro
+      - /proc/1/net/dev:/host/proc/net/dev:ro
+      - /sys:/host/sys:ro
+      - /:/rootfs:ro
+    command:  # list (exec) form: no shell runs, so quotes inside a value reach node_exporter verbatim
+      - '--path.procfs=/host/proc'
+      - '--path.sysfs=/host/sys'
+      - '--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host)($$|/)'  # "$$" is Compose's escape for "$"; /etc is kept because the disk alerts select mountpoint="/etc/hostname"
     restart: always
     labels:
       container_group: monitoring
@@ -46,7 +55,7 @@ services:
 
   # Storage and search backend. Gets all metrics from cAdvisor and Nodeexporter and is the backend that Grafana runs on.
   prometheus:
-    image: prom/prometheus:v1.7.1
+    image: prom/prometheus:v2.3.2
     expose:
       - 9090
     volumes:
@@ -57,14 +66,13 @@ services:
       - ./prometheus:/etc/prometheus
     restart: always
     command:
-      - "-config.file=/etc/prometheus/prometheus.yml"
-      - "-storage.local.path=/prometheus"
-      - "-web.console.libraries=/etc/prometheus/console_libraries"
-      - "-web.console.templates=/etc/prometheus/consoles"
-      - "-web.listen-address=:9090"
-      - "-alertmanager.url=http://alertmanager:9093"
-      - "-storage.local.memory-chunks=300000"
-      - "-storage.local.retention=744h"
+      - '--config.file=/etc/prometheus/prometheus.yml'  # NOTE(review): the 1.x -alertmanager.url flag is gone; confirm prometheus.yml defines alerting.alertmanagers
+      - '--storage.tsdb.path=/prometheus'
+      - '--storage.tsdb.retention=744h'  # 744h = 31 days
+      - '--query.lookback-delta=25s'
+      - '--web.listen-address=:9090'
+      - '--web.console.libraries=/etc/prometheus/console_libraries'
+      - '--web.console.templates=/etc/prometheus/consoles'
     labels:
       container_group: monitoring
     logging:

diff --git a/monitoring/prometheus/prometheus.yml b/monitoring/prometheus/prometheus.yml
diff --git a/monitoring/prometheus/rules/alert.rules_container-groups b/monitoring/prometheus/rules/alert.rules_container-groups
@@ -1,8 +1,10 @@
-
-ALERT monitoring_container_group_missing_members
-  IF count(rate(container_last_seen{name=~".+",container_group=~"monitoring"}[5m])) by (container_group,host) < 10
-  FOR 5m
-  ANNOTATIONS {
-      summary = "CONTAINER GROUP WARNING: CONTAINER GROUP '{{ $labels.container_group }}' on '{{ $labels.host }}'",
-      description = "{{ $labels.container_group }} is missing containers. Container count is {{ $value }}/13.",
-  }
+groups:
+- name: container_groups
+  rules:
+  # Fires when the per-(container_group, host) count of "monitoring" containers stays below 6 for 5 minutes.
+  - alert: monitoring_container_group_missing_members
+    expr: count(rate(container_last_seen{name=~".+",container_group=~"monitoring"}[5m])) by (container_group,host) < 6
+    for: 5m
+    annotations:
+      summary: "CONTAINER GROUP WARNING: CONTAINER GROUP '{{ $labels.container_group }}' on '{{ $labels.host }}'"
+      description: "{{ $labels.container_group }} is missing containers. Container count is {{ $value }}/13."  # NOTE(review): threshold is 6 but the text says "/13" - confirm the expected total
diff --git a/monitoring/prometheus/rules/alert.rules_containers b/monitoring/prometheus/rules/alert.rules_containers
@@ -1,45 +1,41 @@
+groups:
+- name: containers
+  rules:
+  - alert: high_cpu_usage_on_container  # per-(name, host) CPU-seconds rate x 100 above 50 for 5m
+    expr: sum by (name, host) (rate(container_cpu_usage_seconds_total{name=~".+"}[5m])) * 100 > 50
+    for: 5m
+    annotations:
+      summary: "HIGH CPU USAGE WARNING: CONTAINER '{{ $labels.name }}' on '{{ $labels.host }}'"
+      description: "{{ $labels.name }} is using a LOT of CPU. CPU usage is {{ humanize $value}}%."
+
+  - alert: elasticsearch_eating_memory  # RSS of logging_elasticsearch_1 above 1200000000 for 5m
+    expr: sum by (host, name) (container_memory_rss{name=~"logging_elasticsearch_1"}) > 1200000000
+    for: 5m
+    annotations:
+      summary: "HIGH MEMORY USAGE WARNING: CONTAINER '{{ $labels.name }}' on '{{ $labels.host }}'"
+      description: "{{ $labels.name }} is eating up a LOT of memory. Memory consumption of {{ $labels.name }} is at {{ humanize $value}}."
 
-ALERT high_cpu_usage_on_container
-  IF sum(rate(container_cpu_usage_seconds_total{name=~".+"}[5m])) by (name,host) * 100 > 50
-  FOR 5m
-  ANNOTATIONS {
-      summary = "HIGH CPU USAGE WARNING: CONTAINER '{{ $labels.name }}' on '{{ $labels.host }}'",
-      description = "{{ $labels.name }} is using a LOT of CPU. CPU usage is {{ humanize $value}}%.",
-  }
+  - alert: prometheus_eating_memory  # RSS of monitoring_prometheus_1 above 1200000000 for 5m
+    expr: sum by (host, name) (container_memory_rss{name=~"monitoring_prometheus_1"}) > 1200000000
+    for: 5m
+    annotations:
+      summary: "HIGH MEMORY USAGE WARNING: CONTAINER '{{ $labels.name }}' on '{{ $labels.host }}'"
+      description: "{{ $labels.name }} is eating up a LOT of memory. Memory consumption of {{ $labels.name }} is at {{ humanize $value}}."
 
-ALERT elasticsearch_eating_memory
-  IF sum(container_memory_rss{name=~"logging_elasticsearch_1"}) by (host, name) > 1200000000
-  FOR 5m
-  ANNOTATIONS {
-      summary = "HIGH MEMORY USAGE WARNING: CONTAINER '{{ $labels.name }}' on '{{ $labels.host }}'",
-      description = "{{ $labels.name }} is eating up a LOT of memory. Memory consumption of {{ $labels.name }} is at {{ humanize $value}}.",
-  }
+  - alert: container_eating_memory  # any other named container with RSS above 700000000 for 5m
+    expr: sum by (host, name) (container_memory_rss{name=~".+",name!="logging_elasticsearch_1",name!="monitoring_prometheus_1"}) > 700000000
+    for: 5m
+    annotations:
+      summary: "HIGH MEMORY USAGE WARNING: CONTAINER '{{ $labels.name }}' on '{{ $labels.host }}'"
+      description: "{{ $labels.name }} is eating up a LOT of memory. Memory consumption of {{ $labels.name }} is at {{ humanize $value}}."
 
-ALERT prometheus_eating_memory
-  IF sum(container_memory_rss{name=~"monitoring_prometheus_1"}) by (host, name) > 1200000000
-  FOR 5m
-  ANNOTATIONS {
-      summary = "HIGH MEMORY USAGE WARNING: CONTAINER '{{ $labels.name }}' on '{{ $labels.host }}'",
-      description = "{{ $labels.name }} is eating up a LOT of memory. Memory consumption of {{ $labels.name }} is at {{ humanize $value}}.",
-  }
-
-ALERT container_eating_memory
-  IF sum(container_memory_rss{name=~".+",name!="logging_elasticsearch_1",name!="monitoring_prometheus_1"}) by (host,name) > 700000000
-  FOR 5m
-  ANNOTATIONS {
-      summary = "HIGH MEMORY USAGE WARNING: CONTAINER '{{ $labels.name }}' on '{{ $labels.host }}'",
-      description = "{{ $labels.name }} is eating up a LOT of memory. Memory consumption of {{ $labels.name }} is at {{ humanize $value}}.",
-  }
-
-
-ALERT container_down
-IF (absent(container_memory_usage_bytes{name="logging_elasticsearch_1"})
-  or absent(container_memory_usage_bytes{name="monitoring_prometheus_1"}) 
-  )
-  FOR 5m
-  LABELS { severity = "Critical" }
-  ANNOTATIONS {
-    summary= "CONTAINER '{{ $labels.name }}' down",
-    description =  "container with  name '{{ $labels.name }}' is down for more than 5 minutes"
-  }
+  # Critical alert when either named container has had no container_memory_usage_bytes series for 5 minutes.
+  - alert: container_down
+    expr: (absent(container_memory_usage_bytes{name="logging_elasticsearch_1"}) or absent(container_memory_usage_bytes{name="monitoring_prometheus_1"}))
+    for: 5m
+    labels:
+      severity: Critical
+    annotations:
+      summary: "CONTAINER '{{ $labels.name }}' down"
+      description: "container with  name '{{ $labels.name }}' is down for more than 5 minutes"
 
diff --git a/monitoring/prometheus/rules/alert.rules_nodes b/monitoring/prometheus/rules/alert.rules_nodes
@@ -1,60 +1,59 @@
-
-ALERT monitoring_service_down
-  IF up == 0
-  FOR 5m
-  ANNOTATIONS {
-      summary = "MONITORING SERVICE DOWN WARNING: NODE '{{ $labels.host }}'",
-      description = "The monitoring service '{{ $labels.job }}' is down.",
-  }
-
-ALERT high_load_on_node
-  IF node_load5 > 2
-  FOR 3m
-  ANNOTATIONS {
-      summary = "HIGH LOAD WARINING: NODE '{{ $labels.host }}'",
-      description = "{{ $labels.host}} is under high load. Load is {{ humanize $value }}.",
-  }
-
-ALERT node_running_out_of_memory
-  IF node_memory_MemAvailable < 1500000000
-  FOR 5m
-  ANNOTATIONS {
-      summary = "LOW MEMORY WARING: NODE '{{ $labels.host }}'",
-      description = "Less than 1.5GB of free memory. Free memory at {{ humanize $value }} GB.",
-  }
-
-ALERT node_running_out_of_disk_space
-  IF node_filesystem_free{mountpoint="/etc/hostname"} < 40000000000
-  FOR 5m
-  ANNOTATIONS {
-      summary = "LOW DISK SPACE WARING: NODE '{{ $labels.host }}'",
-      description = "Less than 40GB of free disk space. Free disk space at {{ humanize $value }} GB.",
-  }
-
-ALERT node_low_disk_space
-  IF 100 * min(node_filesystem_avail/node_filesystem_size{mountpoint=~"/etc/hostname|^/mnt.*|^/data.*"}) by (device, fstype, host, instance, job) < 10.0
-  FOR 5m
-  LABELS { severity = "Warning" }
-  ANNOTATIONS {
-      summary = "LOW DISK SPACE WARING: NODE '{{ $labels.host }}'",
-      description = "Warning, node '{{ $labels.host }}' has less than 10% of free disk space on device {{ $labels.device }} with mount point {{ $labels.mountpoint }}. Available disk space at {{ humanize $value }}%",
-  }
-
-ALERT node_restarted
-  IF node_time - node_boot_time < 300
-  FOR 1m
-  LABELS { severity = "Warning" }
-  ANNOTATIONS {
-      summary = "NODE RESTARTED WARNING: NODE '{{ $labels.host }}'",
-      description = "Warning: Node '{{ $labels.host }}' was restarted at {{ $value }} seconds ago.",
-  }
-
-################################### TEST ALERT
-
-ALERT TESTING_high_load_on_node
-  IF node_load1 > 0
-  FOR 1s
-  ANNOTATIONS {
-      summary = "+++ TESTING ++++ TESTING ++++ TESTING +++ ::: HIGH LOAD WARNING: NODE '{{ $labels.host }}'",
-      description = "{{ $labels.host}} is under high load. Load is {{ humanize $value }}.",
-  }
+groups:
+- name: nodes
+  rules:
+  # Any series with up == 0 for 5 minutes; the annotations print its host and job labels.
+  - alert: monitoring_service_down
+    expr: up == 0
+    for: 5m
+    annotations:
+      summary: "MONITORING SERVICE DOWN WARNING: NODE '{{ $labels.host }}'"
+      description: "The monitoring service '{{ $labels.job }}' is down."
+
+  # node_load5 above 2 for 3 minutes.
+  - alert: high_load_on_node
+    expr: node_load5 > 2
+    for: 3m
+    annotations:
+      summary: "HIGH LOAD WARINING: NODE '{{ $labels.host }}'"
+      description: "{{ $labels.host}} is under high load. Load is {{ humanize $value }}."
+
+  # node_memory_MemAvailable below 1500000000 for 5 minutes.
+  - alert: node_running_out_of_memory
+    expr: node_memory_MemAvailable < 1500000000
+    for: 5m
+    annotations:
+      summary: "LOW MEMORY WARING: NODE '{{ $labels.host }}'"
+      description: "Less than 1.5GB of free memory. Free memory at {{ humanize $value }}B."  # humanize already emits the SI prefix (e.g. "1.4G"), so only "B" follows
+
+  # node_filesystem_free for mountpoint /etc/hostname below 40000000000 for 5 minutes.
+  - alert: node_running_out_of_disk_space
+    expr: node_filesystem_free{mountpoint="/etc/hostname"} < 40000000000
+    for: 5m
+    annotations:
+      summary: "LOW DISK SPACE WARING: NODE '{{ $labels.host }}'"
+      description: "Less than 40GB of free disk space. Free disk space at {{ humanize $value }}B."  # humanize already emits the SI prefix, so only "B" follows
+  - alert: node_low_disk_space  # under 10% available for 5m; "by" keeps mountpoint because the description prints it
+    expr: 100 * min(node_filesystem_avail/node_filesystem_size{mountpoint=~"/etc/hostname|^/mnt.*|^/data.*"}) by (device, fstype, host, instance, job, mountpoint) < 10.0
+    for: 5m
+    labels:
+      severity: Warning
+    annotations:
+      summary: "LOW DISK SPACE WARING: NODE '{{ $labels.host }}'"
+      description: "Warning, node '{{ $labels.host }}' has less than 10% of free disk space on device {{ $labels.device }} with mount point {{ $labels.mountpoint }}. Available disk space at {{ humanize $value }}%"
+  - alert: node_restarted  # node_time - node_boot_time below 300 (seconds, per the description) for 1m
+    expr: node_time - node_boot_time < 300
+    for: 1m
+    labels:
+      severity: Warning
+    annotations:
+      summary: "NODE RESTARTED WARNING: NODE '{{ $labels.host }}'"
+      description: "Warning: Node '{{ $labels.host }}' was restarted at {{ $value }} seconds ago."
+################################### TEST ALERT
+
+  # NOTE(review): node_load1 > 0 held for 1s looks like an always-firing pipeline test - confirm it should ship enabled.
+  - alert: TESTING_high_load_on_node
+    expr: node_load1 > 0
+    for: 1s
+    annotations:
+      summary: "+++ TESTING ++++ TESTING ++++ TESTING +++ ::: HIGH LOAD WARNING: NODE '{{ $labels.host }}'"
+      description: "{{ $labels.host}} is under high load. Load is {{ humanize $value }}."
diff --git a/monitoring/prometheus/rules/alert.rules_sites b/monitoring/prometheus/rules/alert.rules_sites
@@ -1,9 +1,13 @@
-ALERT service_endpoint_down
-  IF probe_success{job='service'} == 0
-  FOR 3m
-  LABELS { severity = "Critical" }
-  ANNOTATIONS {
-      summary = "ENGOPS SERVICE NON-OPERATIONAL",
-      description = "Service {{ $labels.service_name }} with endpoint {{ $labels.service_url }} is not reachable. Please check whether the instance is running",
-  }
+groups:
+- name: sites
+  rules:
+  # Critical alert when probe_success for job "service" has been 0 for 3 minutes.
+  - alert: service_endpoint_down
+    expr: probe_success{job="service"} == 0
+    for: 3m
+    labels:
+      severity: Critical
+    annotations:
+      summary: "ENGOPS SERVICE NON-OPERATIONAL"
+      description: "Service {{ $labels.service_name }} with endpoint {{ $labels.service_url }} is not reachable. Please check whether the instance is running"