diff --git a/monitoring/docker-compose.yml b/monitoring/docker-compose.yml index 3c83847..009f56a 100644 --- a/monitoring/docker-compose.yml +++ b/monitoring/docker-compose.yml @@ -9,9 +9,18 @@ services: # Runs on your node(s) and forwards node(host) metrics to Prometheus. master-nodeexporter: - image: prom/node-exporter:v0.14.0 + image: prom/node-exporter:v0.15.2 expose: - 9100 + volumes: + - /proc:/host/proc:ro + - /proc/1/net/dev:/host/proc/net/dev:ro + - /sys:/host/sys:ro + - /:/rootfs:ro + command: + - '--path.procfs=/host/proc' + - '--path.sysfs=/host/sys' + - '--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host|etc)($$|/)' restart: always labels: container_group: monitoring @@ -46,7 +55,7 @@ services: # Storage and search backend. Gets all metrics from cAdvisor and Nodeexporter and is the backend that Grafana runs on. prometheus: - image: prom/prometheus:v1.7.1 + image: prom/prometheus:v2.3.2 expose: - 9090 volumes: @@ -57,14 +66,13 @@ services: - ./prometheus:/etc/prometheus restart: always command: - - "-config.file=/etc/prometheus/prometheus.yml" - - "-storage.local.path=/prometheus" - - "-web.console.libraries=/etc/prometheus/console_libraries" - - "-web.console.templates=/etc/prometheus/consoles" - - "-web.listen-address=:9090" - - "-alertmanager.url=http://alertmanager:9093" - - "-storage.local.memory-chunks=300000" - - "-storage.local.retention=744h" + - "--config.file=/etc/prometheus/prometheus.yml" + - "--storage.tsdb.path=/prometheus" + - "--query.lookback-delta=25s" + - "--web.console.libraries=/etc/prometheus/console_libraries" + - "--web.console.templates=/etc/prometheus/consoles" + - "--web.listen-address=:9090" + - "--storage.tsdb.retention=744h" labels: container_group: monitoring logging: diff --git a/monitoring/prometheus/prometheus.yml b/monitoring/prometheus/prometheus.yml old mode 100644 new mode 100755 diff --git a/monitoring/prometheus/rules/alert.rules_container-groups 
b/monitoring/prometheus/rules/alert.rules_container-groups old mode 100644 new mode 100755 index d8cf81b..754b7db --- a/monitoring/prometheus/rules/alert.rules_container-groups +++ b/monitoring/prometheus/rules/alert.rules_container-groups @@ -1,8 +1,10 @@ - -ALERT monitoring_container_group_missing_members - IF count(rate(container_last_seen{name=~".+",container_group=~"monitoring"}[5m])) by (container_group,host) < 10 - FOR 5m - ANNOTATIONS { - summary = "CONTAINER GROUP WARNING: CONTAINER GROUP '{{ $labels.container_group }}' on '{{ $labels.host }}'", - description = "{{ $labels.container_group }} is missing containers. Container count is {{ $value }}/13.", - } +groups: +- name: container_groups + rules: + - alert: monitoring_container_group_missing_members + expr: count(rate(container_last_seen{name=~".+",container_group=~"monitoring"}[5m])) by (container_group,host) < 6 + for: 5m + annotations: + summary: "CONTAINER GROUP WARNING: CONTAINER GROUP '{{ $labels.container_group }}' on '{{ $labels.host }}'" + description: "{{ $labels.container_group }} is missing containers. Container count is {{ $value }}/13." + diff --git a/monitoring/prometheus/rules/alert.rules_containers b/monitoring/prometheus/rules/alert.rules_containers old mode 100644 new mode 100755 index 41d732f..29d6c4c --- a/monitoring/prometheus/rules/alert.rules_containers +++ b/monitoring/prometheus/rules/alert.rules_containers @@ -1,45 +1,41 @@ +groups: +- name: containers + rules: + - alert: high_cpu_usage_on_container + expr: sum(rate(container_cpu_usage_seconds_total{name=~".+"}[5m])) by (name,host) * 100 > 50 + for: 5m + annotations: + summary: "HIGH CPU USAGE WARNING: CONTAINER '{{ $labels.name }}' on '{{ $labels.host }}'" + description: "{{ $labels.name }} is using a LOT of CPU. CPU usage is {{ humanize $value}}%." 
+ + - alert: elasticsearch_eating_memory + expr: sum(container_memory_rss{name=~"logging_elasticsearch_1"}) by (host, name) > 1200000000 + for: 5m + annotations: + summary: "HIGH MEMORY USAGE WARNING: CONTAINER '{{ $labels.name }}' on '{{ $labels.host }}'" + description: "{{ $labels.name }} is eating up a LOT of memory. Memory consumption of {{ $labels.name }} is at {{ humanize $value}}." -ALERT high_cpu_usage_on_container - IF sum(rate(container_cpu_usage_seconds_total{name=~".+"}[5m])) by (name,host) * 100 > 50 - FOR 5m - ANNOTATIONS { - summary = "HIGH CPU USAGE WARNING: CONTAINER '{{ $labels.name }}' on '{{ $labels.host }}'", - description = "{{ $labels.name }} is using a LOT of CPU. CPU usage is {{ humanize $value}}%.", - } + - alert: prometheus_eating_memory + expr: sum(container_memory_rss{name=~"monitoring_prometheus_1"}) by (host, name) > 1200000000 + for: 5m + annotations: + summary: "HIGH MEMORY USAGE WARNING: CONTAINER '{{ $labels.name }}' on '{{ $labels.host }}'" + description: "{{ $labels.name }} is eating up a LOT of memory. Memory consumption of {{ $labels.name }} is at {{ humanize $value}}." -ALERT elasticsearch_eating_memory - IF sum(container_memory_rss{name=~"logging_elasticsearch_1"}) by (host, name) > 1200000000 - FOR 5m - ANNOTATIONS { - summary = "HIGH MEMORY USAGE WARNING: CONTAINER '{{ $labels.name }}' on '{{ $labels.host }}'", - description = "{{ $labels.name }} is eating up a LOT of memory. Memory consumption of {{ $labels.name }} is at {{ humanize $value}}.", - } + - alert: container_eating_memory + expr: sum(container_memory_rss{name=~".+",name!="logging_elasticsearch_1",name!="monitoring_prometheus_1"}) by (host,name) > 700000000 + for: 5m + annotations: + summary: "HIGH MEMORY USAGE WARNING: CONTAINER '{{ $labels.name }}' on '{{ $labels.host }}'" + description: "{{ $labels.name }} is eating up a LOT of memory. Memory consumption of {{ $labels.name }} is at {{ humanize $value}}." 
-ALERT prometheus_eating_memory - IF sum(container_memory_rss{name=~"monitoring_prometheus_1"}) by (host, name) > 1200000000 - FOR 5m - ANNOTATIONS { - summary = "HIGH MEMORY USAGE WARNING: CONTAINER '{{ $labels.name }}' on '{{ $labels.host }}'", - description = "{{ $labels.name }} is eating up a LOT of memory. Memory consumption of {{ $labels.name }} is at {{ humanize $value}}.", - } - -ALERT container_eating_memory - IF sum(container_memory_rss{name=~".+",name!="logging_elasticsearch_1",name!="monitoring_prometheus_1"}) by (host,name) > 700000000 - FOR 5m - ANNOTATIONS { - summary = "HIGH MEMORY USAGE WARNING: CONTAINER '{{ $labels.name }}' on '{{ $labels.host }}'", - description = "{{ $labels.name }} is eating up a LOT of memory. Memory consumption of {{ $labels.name }} is at {{ humanize $value}}.", - } - - -ALERT container_down -IF (absent(container_memory_usage_bytes{name="logging_elasticsearch_1"}) - or absent(container_memory_usage_bytes{name="monitoring_prometheus_1"}) - ) - FOR 5m - LABELS { severity = "Critical" } - ANNOTATIONS { - summary= "CONTAINER '{{ $labels.name }}' down", - description = "container with name '{{ $labels.name }}' is down for more than 5 minutes" - } + - alert: container_down + expr: (absent(container_memory_usage_bytes{name="logging_elasticsearch_1"}) or absent(container_memory_usage_bytes{name="monitoring_prometheus_1"})) + for: 5m + labels: + severity: Critical + annotations: + summary: "CONTAINER '{{ $labels.name }}' down" + description: "container with name '{{ $labels.name }}' is down for more than 5 minutes" + diff --git a/monitoring/prometheus/rules/alert.rules_nodes b/monitoring/prometheus/rules/alert.rules_nodes old mode 100644 new mode 100755 index 970b1d5..9c40874 --- a/monitoring/prometheus/rules/alert.rules_nodes +++ b/monitoring/prometheus/rules/alert.rules_nodes @@ -1,60 +1,59 @@ - -ALERT monitoring_service_down - IF up == 0 - FOR 5m - ANNOTATIONS { - summary = "MONITORING SERVICE DOWN WARNING: NODE '{{ $labels.host 
}}'", - description = "The monitoring service '{{ $labels.job }}' is down.", - } - -ALERT high_load_on_node - IF node_load5 > 2 - FOR 3m - ANNOTATIONS { - summary = "HIGH LOAD WARINING: NODE '{{ $labels.host }}'", - description = "{{ $labels.host}} is under high load. Load is {{ humanize $value }}.", - } - -ALERT node_running_out_of_memory - IF node_memory_MemAvailable < 1500000000 - FOR 5m - ANNOTATIONS { - summary = "LOW MEMORY WARING: NODE '{{ $labels.host }}'", - description = "Less than 1.5GB of free memory. Free memory at {{ humanize $value }} GB.", - } - -ALERT node_running_out_of_disk_space - IF node_filesystem_free{mountpoint="/etc/hostname"} < 40000000000 - FOR 5m - ANNOTATIONS { - summary = "LOW DISK SPACE WARING: NODE '{{ $labels.host }}'", - description = "Less than 40GB of free disk space. Free disk space at {{ humanize $value }} GB.", - } - -ALERT node_low_disk_space - IF 100 * min(node_filesystem_avail/node_filesystem_size{mountpoint=~"/etc/hostname|^/mnt.*|^/data.*"}) by (device, fstype, host, instance, job) < 10.0 - FOR 5m - LABELS { severity = "Warning" } - ANNOTATIONS { - summary = "LOW DISK SPACE WARING: NODE '{{ $labels.host }}'", - description = "Warning, node '{{ $labels.host }}' has less than 10% of free disk space on device {{ $labels.device }} with mount point {{ $labels.mountpoint }}. Available disk space at {{ humanize $value }}%", - } - -ALERT node_restarted - IF node_time - node_boot_time < 300 - FOR 1m - LABELS { severity = "Warning" } - ANNOTATIONS { - summary = "NODE RESTARTED WARNING: NODE '{{ $labels.host }}'", - description = "Warning: Node '{{ $labels.host }}' was restarted at {{ $value }} seconds ago.", - } - -################################### TEST ALERT - -ALERT TESTING_high_load_on_node - IF node_load1 > 0 - FOR 1s - ANNOTATIONS { - summary = "+++ TESTING ++++ TESTING ++++ TESTING +++ ::: HIGH LOAD WARNING: NODE '{{ $labels.host }}'", - description = "{{ $labels.host}} is under high load. 
Load is {{ humanize $value }}.", - } +groups: +- name: nodes + rules: + - alert: monitoring_service_down + expr: up == 0 + for: 5m + annotations: + summary: "MONITORING SERVICE DOWN WARNING: NODE '{{ $labels.host }}'" + description: "The monitoring service '{{ $labels.job }}' is down." + + + - alert: high_load_on_node + expr: node_load5 > 2 + for: 3m + annotations: + summary: "HIGH LOAD WARNING: NODE '{{ $labels.host }}'" + description: "{{ $labels.host}} is under high load. Load is {{ humanize $value }}." + + + - alert: node_running_out_of_memory + expr: node_memory_MemAvailable < 1500000000 + for: 5m + annotations: + summary: "LOW MEMORY WARNING: NODE '{{ $labels.host }}'" + description: "Less than 1.5GB of free memory. Free memory at {{ humanize $value }} GB." + + + - alert: node_running_out_of_disk_space + expr: node_filesystem_free{mountpoint="/etc/hostname"} < 40000000000 + for: 5m + annotations: + summary: "LOW DISK SPACE WARNING: NODE '{{ $labels.host }}'" + description: "Less than 40GB of free disk space. Free disk space at {{ humanize $value }} GB." + + + - alert: node_low_disk_space + expr: 100 * min(node_filesystem_avail/node_filesystem_size{mountpoint=~"/etc/hostname|^/mnt.*|^/data.*"}) by (device, fstype, host, instance, job) < 10.0 + for: 5m + annotations: + summary: "LOW DISK SPACE WARNING: NODE '{{ $labels.host }}'" + description: "Warning, node '{{ $labels.host }}' has less than 10% of free disk space on device {{ $labels.device }} with mount point {{ $labels.mountpoint }}. Available disk space at {{ humanize $value }}%" + + - alert: node_restarted + expr: node_time - node_boot_time < 300 + for: 1m + annotations: + summary: "NODE RESTARTED WARNING: NODE '{{ $labels.host }}'" + description: "Warning: Node '{{ $labels.host }}' was restarted at {{ $value }} seconds ago." 
+ + +################################### TEST ALERT + + - alert: TESTING_high_load_on_node + expr: node_load1 > 0 + for: 1s + annotations: + summary: "+++ TESTING ++++ TESTING ++++ TESTING +++ ::: HIGH LOAD WARNING: NODE '{{ $labels.host }}'" + description: "{{ $labels.host}} is under high load. Load is {{ humanize $value }}." + diff --git a/monitoring/prometheus/rules/alert.rules_sites b/monitoring/prometheus/rules/alert.rules_sites old mode 100644 new mode 100755 index b362399..8b0c5d8 --- a/monitoring/prometheus/rules/alert.rules_sites +++ b/monitoring/prometheus/rules/alert.rules_sites @@ -1,9 +1,13 @@ -ALERT service_endpoint_down - IF probe_success{job='service'} == 0 - FOR 3m - LABELS { severity = "Critical" } - ANNOTATIONS { - summary = "ENGOPS SERVICE NON-OPERATIONAL", - description = "Service {{ $labels.service_name }} with endpoint {{ $labels.service_url }} is not reachable. Please check whether the instance is running", - } +groups: +- name: sites + rules: + - alert: service_endpoint_down + expr: probe_success{job='service'} == 0 + for: 3m + labels: + severity: Critical + annotations: + summary: "ENGOPS SERVICE NON-OPERATIONAL" + description: "Service {{ $labels.service_name }} with endpoint {{ $labels.service_url }} is not reachable. Please check whether the instance is running" +