From 2959ab41f339b5df786ab7def081bb8ed154a320 Mon Sep 17 00:00:00 2001
From: Simon Pasquier
Date: Mon, 24 Jul 2017 15:38:28 +0200
Subject: [PATCH 1/2] Rename Prometheus alerts for consistency

Change-Id: I96fb789bf73af22d56fc6c6980626647f87409d4
---
 prometheus/meta/prometheus.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/prometheus/meta/prometheus.yml b/prometheus/meta/prometheus.yml
index eb8df8d..07eb726 100644
--- a/prometheus/meta/prometheus.yml
+++ b/prometheus/meta/prometheus.yml
@@ -1,7 +1,7 @@
 {% raw %}
 server:
   alert:
-    PrometheusUP:
+    PrometheusTargetDown:
       if: 'up != 1'
       labels:
         severity: critical

From 5b6b583c427197806564503c07d3cf55689f55c4 Mon Sep 17 00:00:00 2001
From: Olivier Bourdon
Date: Tue, 25 Jul 2017 16:11:43 +0200
Subject: [PATCH 2/2] Add Prometheus alerts

Change-Id: I4ad10555d728d62c8e6504659d30558f95b410ac
---
 prometheus/map.jinja           | 14 ++++++++++++++
 prometheus/meta/prometheus.yml | 31 ++++++++++++++++++++++++++++++-
 2 files changed, 44 insertions(+), 1 deletion(-)

diff --git a/prometheus/map.jinja b/prometheus/map.jinja
index 5a66470..33c6c1a 100644
--- a/prometheus/map.jinja
+++ b/prometheus/map.jinja
@@ -17,3 +17,17 @@
   },
 }, merge=salt['pillar.get']('prometheus:exporters')) %}
 {%- do salt['defaults.merge'](exporters, grains.get('prometheus', {}).get('exporters', {})) %}
+
+{%- set monitoring = salt['grains.filter_by']({
+  'default': {
+    'remote_storage_adapter': {
+      'sent_vs_received_ratio': 10.0,
+      'ignored_vs_sent_ratio': 5.0,
+    },
+  },
+}, grain='os_family', merge=salt['pillar.get']('prometheus:monitoring')) %}
+
+{% set remote_storage_adapter = salt['grains.filter_by']({
+  'default': {
+  },
+}, merge=salt['pillar.get']('prometheus:remote_storage_adapter')) %}
diff --git a/prometheus/meta/prometheus.yml b/prometheus/meta/prometheus.yml
index 07eb726..1ef4d26 100644
--- a/prometheus/meta/prometheus.yml
+++ b/prometheus/meta/prometheus.yml
@@ -1,6 +1,9 @@
-{% raw %}
+{%- if pillar.prometheus is defined %}
+{%- from "prometheus/map.jinja" import server, remote_storage_adapter, monitoring with context %}
 server:
   alert:
+{%- if server.get('enabled', False) %}
+{% raw %}
     PrometheusTargetDown:
       if: 'up != 1'
       labels:
@@ -10,3 +13,29 @@ server:
         summary: 'Prometheus endpoint {{ $labels.instance }} is down'
         description: 'Prometheus endpoint {{ $labels.instance }} is down for job {{ $labels.job }}'
 {% endraw %}
+{%- endif %}
+{%- if remote_storage_adapter.get('enabled', False) %}
+    RemoteStorageAdapterSendingTooSlow:
+      {%- set threshold = monitoring.remote_storage_adapter.sent_vs_received_ratio|float %}
+      if: >-
+        100.0 - (100.0 * sent_samples_total{job="remote_storage_adapter"} / on (job, instance) received_samples_total) > {{ threshold }}
+{% raw %}
+      labels:
+        severity: warning
+        service: remote_storage_adapter
+      annotations:
+        summary: 'Remote storage adapter too slow on {{ $labels.instance }}'
+        description: 'Remote storage adapter can not ingest samples fast enough on {{ $labels.instance }} (current value={{ $value }}%, threshold={%- endraw %}{{ threshold }}%).'
+    RemoteStorageAdapterIgnoredTooHigh:
+      {%- set threshold = monitoring.remote_storage_adapter.ignored_vs_sent_ratio|float %}
+      if: >-
+        100.0 * prometheus_influxdb_ignored_samples_total{job="remote_storage_adapter"} / on (job, instance) sent_samples_total > {{ threshold }}
+{% raw %}
+      labels:
+        severity: warning
+        service: remote_storage_adapter
+      annotations:
+        summary: 'Remote storage adapter receiving too many invalid metrics on {{ $labels.instance }}'
+        description: 'Remote storage adapter is receiving too many invalid metrics on {{ $labels.instance }} (current value={{ $value }}%, threshold={%- endraw %}{{ threshold }}%).'
+{%- endif %}
+{%- endif %}