Add prometheus alerts
* PrometheusRushMode
* PrometheusRemoteStorageQueue
* AlertmanagerNotificationFailed

Change-Id: I5a875e7b9861f860bac501da55f0e8b20e799d52
This commit is contained in:
parent
20a437a58e
commit
6fce6098d7
2 changed files with 44 additions and 1 deletions
|
@ -24,6 +24,12 @@
|
|||
'sent_vs_received_ratio': 10.0,
|
||||
'ignored_vs_sent_ratio': 5.0,
|
||||
},
|
||||
'alertmanager': {
|
||||
'notification_failed_rate': 0.3
|
||||
},
|
||||
'prometheus': {
|
||||
'remote_storage_queue_full_percent': 75.0,
|
||||
},
|
||||
},
|
||||
}, grain='os_family', merge=salt['pillar.get']('prometheus:monitoring')) %}
|
||||
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
{%- if pillar.prometheus is defined %}
|
||||
{%- from "prometheus/map.jinja" import server, remote_storage_adapter, monitoring with context %}
|
||||
{%- from "prometheus/map.jinja" import server, alertmanager, remote_storage_adapter, monitoring with context %}
|
||||
server:
|
||||
alert:
|
||||
{%- if server.get('enabled', False) %}
|
||||
|
@ -13,8 +13,45 @@ server:
|
|||
annotations:
|
||||
summary: 'Prometheus endpoint {{ $labels.instance }} down'
|
||||
description: 'The Prometheus target {{ $labels.instance }} is down for the job {{ $labels.job }}.'
|
||||
# PrometheusRushMode: warning-severity alert for the prometheus service.
# Fires once prometheus_local_storage_rushed_mode has been non-zero for 10m.
# The $labels placeholders in the annotations are meant for Prometheus, not Jinja
# (NOTE(review): presumably this sits inside a raw section opened above - confirm).
PrometheusRushMode:
|
||||
if: 'prometheus_local_storage_rushed_mode != 0'
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
service: prometheus
|
||||
annotations:
|
||||
summary: 'Prometheus {{ $labels.instance }} in rush mode'
|
||||
description: 'The Prometheus {{ $labels.instance }} is in rush mode for 10m.'
|
||||
{% endraw %}
|
||||
{%- endif %}
|
||||
{%- if alertmanager.get('enabled', False) %}
|
||||
# AlertmanagerNotificationFailed: rendered only when alertmanager is enabled.
# Warning-severity alert that fires when the 5m rate of
# alertmanager_notifications_failed_total stays above
# monitoring.alertmanager.notification_failed_rate (cast to float) for 2m.
# The raw section shields the $labels/$value placeholders from Jinja; it is closed
# right before the threshold is interpolated into the description.
AlertmanagerNotificationFailed:
|
||||
{%- set threshold = monitoring.alertmanager.notification_failed_rate|float %}
|
||||
if: >-
|
||||
rate(alertmanager_notifications_failed_total[5m]) > {{ threshold }}
|
||||
for: 2m
|
||||
{%- raw %}
|
||||
labels:
|
||||
severity: warning
|
||||
service: alertmanager
|
||||
annotations:
|
||||
summary: 'Alertmanager {{ $labels.instance }} failed notifications'
|
||||
description: 'Alertmanager {{ $labels.instance }} failed notifications for {{ $labels.integration }} (current value={{ $value }}, threshold={%- endraw %}{{ threshold }})'
|
||||
{%- endif %}
|
||||
{%- if server.get('config', {}).get('remote_write') %}
|
||||
# PrometheusRemoteStorageQueue: rendered only when server.config.remote_write is set.
# Warning-severity alert that fires when the remote storage queue fill level
# (queue_length / queue_capacity * 100) stays above
# monitoring.prometheus.remote_storage_queue_full_percent (cast to float) for 2m.
# The raw section shields the $labels/$value placeholders from Jinja; it is closed
# right before the threshold is interpolated into the description.
PrometheusRemoteStorageQueue:
|
||||
{%- set threshold = monitoring.prometheus.remote_storage_queue_full_percent|float %}
|
||||
if: >-
|
||||
prometheus_remote_storage_queue_length / prometheus_remote_storage_queue_capacity * 100 > {{ threshold }}
|
||||
{%- raw %}
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
service: prometheus
|
||||
annotations:
|
||||
summary: 'Prometheus {{ $labels.instance }} remote storage queue is filling'
|
||||
description: 'The Prometheus {{ $labels.instance }} remote storage queue almost full (current value={{ $value }}%, threshold={%- endraw %}{{ threshold }}%)'
|
||||
{%- endif %}
|
||||
{%- if remote_storage_adapter.get('enabled', False) %}
|
||||
RemoteStorageAdapterSendingTooSlow:
|
||||
{%- set threshold = monitoring.remote_storage_adapter.sent_vs_received_ratio|float %}
|
||||
|
|
Loading…
Reference in a new issue