From 6fce6098d7afb585e4311c01193b501e1d901bae Mon Sep 17 00:00:00 2001 From: Bartosz Kupidura Date: Mon, 25 Sep 2017 12:52:58 +0200 Subject: [PATCH] Add prometheus alerts * PrometheusRushMode * PrometheusRemoteStorageQueue * AlertmanagerNotificationFailed Change-Id: I5a875e7b9861f860bac501da55f0e8b20e799d52 --- prometheus/map.jinja | 6 ++++++ prometheus/meta/prometheus.yml | 39 +++++++++++++++++++++++++++++++++- 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/prometheus/map.jinja b/prometheus/map.jinja index 33c6c1a..dbb7803 100644 --- a/prometheus/map.jinja +++ b/prometheus/map.jinja @@ -24,6 +24,12 @@ 'sent_vs_received_ratio': 10.0, 'ignored_vs_sent_ratio': 5.0, }, + 'alertmanager': { + 'notification_failed_rate': 0.3 + }, + 'prometheus': { + 'remote_storage_queue_full_percent': 75.0, + }, }, }, grain='os_family', merge=salt['pillar.get']('prometheus:monitoring')) %} diff --git a/prometheus/meta/prometheus.yml b/prometheus/meta/prometheus.yml index 07d76bd..89d5014 100644 --- a/prometheus/meta/prometheus.yml +++ b/prometheus/meta/prometheus.yml @@ -1,5 +1,5 @@ {%- if pillar.prometheus is defined %} -{%- from "prometheus/map.jinja" import server, remote_storage_adapter, monitoring with context %} +{%- from "prometheus/map.jinja" import server, alertmanager, remote_storage_adapter, monitoring with context %} server: alert: {%- if server.get('enabled', False) %} @@ -13,8 +13,45 @@ server: annotations: summary: 'Prometheus endpoint {{ $labels.instance }} down' description: 'The Prometheus target {{ $labels.instance }} is down for the job {{ $labels.job }}.' + PrometheusRushMode: + if: 'prometheus_local_storage_rushed_mode != 0' + for: 10m + labels: + severity: warning + service: prometheus + annotations: + summary: 'Prometheus {{ $labels.instance }} in rush mode' + description: 'The Prometheus {{ $labels.instance }} is in rush mode for 10m.' {% endraw %} {%- endif %} +{%- if alertmanager.get('enabled', False) %} + AlertmanagerNotificationFailed: + {%- set threshold = monitoring.alertmanager.notification_failed_rate|float %} + if: >- + rate(alertmanager_notifications_failed_total[5m]) > {{ threshold }} + for: 2m +{%- raw %} + labels: + severity: warning + service: alertmanager + annotations: + summary: 'Alertmanager {{ $labels.instance }} failed notifications' + description: 'Alertmanager {{ $labels.instance }} failed notifications for {{ $labels.integration }} (current value={{ $value }}, threshold={%- endraw %}{{ threshold }})' +{%- endif %} +{%- if server.get('config', {}).get('remote_write') %} + PrometheusRemoteStorageQueue: + {%- set threshold = monitoring.prometheus.remote_storage_queue_full_percent|float %} + if: >- + prometheus_remote_storage_queue_length / prometheus_remote_storage_queue_capacity * 100 > {{ threshold }} +{%- raw %} + for: 2m + labels: + severity: warning + service: prometheus + annotations: + summary: 'Prometheus {{ $labels.instance }} remote storage queue is filling' + description: 'The Prometheus {{ $labels.instance }} remote storage queue almost full (current value={{ $value }}%, threshold={%- endraw %}{{ threshold }}%)' +{%- endif %} {%- if remote_storage_adapter.get('enabled', False) %} RemoteStorageAdapterSendingTooSlow: {%- set threshold = monitoring.remote_storage_adapter.sent_vs_received_ratio|float %}