Add Prometheus alerts
Change-Id: I4ad10555d728d62c8e6504659d30558f95b410ac
This commit is contained in:
parent
2959ab41f3
commit
5b6b583c42
2 changed files with 44 additions and 1 deletion
|
@@ -17,3 +17,17 @@
|
||||||
},
|
},
|
||||||
}, merge=salt['pillar.get']('prometheus:exporters')) %}
|
}, merge=salt['pillar.get']('prometheus:exporters')) %}
|
||||||
{%- do salt['defaults.merge'](exporters, grains.get('prometheus', {}).get('exporters', {})) %}
|
{%- do salt['defaults.merge'](exporters, grains.get('prometheus', {}).get('exporters', {})) %}
|
||||||
|
|
||||||
|
{#- Alert thresholds for the remote storage adapter, expressed as percentages.
    sent_vs_received_ratio: the alert expression compares it against
      100 - (100 * sent_samples_total / received_samples_total).
    ignored_vs_sent_ratio: the alert expression compares it against
      100 * prometheus_influxdb_ignored_samples_total / sent_samples_total.
    The defaults below are merged with the 'prometheus:monitoring' pillar. #}
{%- set monitoring = salt['grains.filter_by']({
|
||||||
|
'default': {
|
||||||
|
'remote_storage_adapter': {
|
||||||
|
'sent_vs_received_ratio': 10.0,
|
||||||
|
'ignored_vs_sent_ratio': 5.0,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}, grain='os_family', merge=salt['pillar.get']('prometheus:monitoring')) %}
|
||||||
|
|
||||||
|
{#- Remote storage adapter settings. The built-in 'default' mapping is empty,
    so every value comes from the 'prometheus:remote_storage_adapter' pillar. #}
{% set remote_storage_adapter = salt['grains.filter_by']({
|
||||||
|
'default': {
|
||||||
|
},
|
||||||
|
}, merge=salt['pillar.get']('prometheus:remote_storage_adapter')) %}
|
||||||
|
|
|
@@ -1,6 +1,9 @@
|
||||||
{% raw %}
|
{%- if pillar.prometheus is defined %}
|
||||||
|
{%- from "prometheus/map.jinja" import server, remote_storage_adapter, monitoring with context %}
|
||||||
server:
|
server:
|
||||||
alert:
|
alert:
|
||||||
|
{%- if server.get('enabled', False) %}
|
||||||
|
{% raw %}
|
||||||
PrometheusTargetDown:
|
PrometheusTargetDown:
|
||||||
if: 'up != 1'
|
if: 'up != 1'
|
||||||
labels:
|
labels:
|
||||||
|
@@ -10,3 +13,29 @@ server:
|
||||||
summary: 'Prometheus endpoint {{ $labels.instance }} is down'
|
summary: 'Prometheus endpoint {{ $labels.instance }} is down'
|
||||||
description: 'Prometheus endpoint {{ $labels.instance }} is down for job {{ $labels.job }}'
|
description: 'Prometheus endpoint {{ $labels.instance }} is down for job {{ $labels.job }}'
|
||||||
{% endraw %}
|
{% endraw %}
|
||||||
|
{%- endif %}
|
||||||
|
{%- if remote_storage_adapter.get('enabled', False) %}
|
||||||
|
{#- Alerts below are emitted only when remote_storage_adapter has 'enabled' set.
    RemoteStorageAdapterSendingTooSlow: condition is true when
    100 - (100 * sent / received) exceeds
    monitoring.remote_storage_adapter.sent_vs_received_ratio.
    The raw/endraw pairs keep the {{ $labels... }} / {{ $value }} placeholders
    from being rendered by Jinja; endraw is closed just before {{ threshold }}
    so that this one value is substituted by Jinja. #}
RemoteStorageAdapterSendingTooSlow:
|
||||||
|
{%- set threshold = monitoring.remote_storage_adapter.sent_vs_received_ratio|float %}
|
||||||
|
if: >-
|
||||||
|
100.0 - (100.0 * sent_samples_total{job="remote_storage_adapter"} / on (job, instance) received_samples_total) > {{ threshold }}
|
||||||
|
{% raw %}
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
service: remote_storage_adapter
|
||||||
|
annotations:
|
||||||
|
summary: 'Remote storage adapter too slow on {{ $labels.instance }}'
|
||||||
|
description: 'Remote storage adapter can not ingest samples fast enough on {{ $labels.instance }} (current value={{ $value }}%, threshold={%- endraw %}{{ threshold }}%).'
|
||||||
|
{#- RemoteStorageAdapterIgnoredTooHigh: condition is true when
    100 * ignored / sent exceeds
    monitoring.remote_storage_adapter.ignored_vs_sent_ratio.
    'threshold' is re-assigned below for this alert. #}
RemoteStorageAdapterIgnoredTooHigh:
|
||||||
|
{%- set threshold = monitoring.remote_storage_adapter.ignored_vs_sent_ratio|float %}
|
||||||
|
if: >-
|
||||||
|
100.0 * prometheus_influxdb_ignored_samples_total{job="remote_storage_adapter"} / on (job, instance) sent_samples_total > {{ threshold }}
|
||||||
|
{% raw %}
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
service: remote_storage_adapter
|
||||||
|
annotations:
|
||||||
|
summary: 'Remote storage adapter receiving too many invalid metrics on {{ $labels.instance }}'
|
||||||
|
description: 'Remote storage adapter is receiving too many invalid metrics on {{ $labels.instance }} (current value={{ $value }}%, threshold={%- endraw %}{{ threshold }}%).'
|
||||||
|
{%- endif %}
|
||||||
|
{%- endif %}
|
||||||
|
|
Loading…
Reference in a new issue