8bdf3ed090
New version changes: * different alerts format * rewritten storage (some config flags removed) Closes-Bug: PROD-16609 Change-Id: I805fa322e4744e98177d6c3e29589ebc6fb917a2
83 lines
3.6 KiB
YAML
83 lines
3.6 KiB
YAML
{#- Alert definitions nested under server -> alert. Nothing is rendered
    unless a "prometheus" pillar is defined. The role settings (server,
    alertmanager, remote_storage_adapter) and the monitoring thresholds are
    imported from prometheus/map.jinja. #}
{%- if pillar.prometheus is defined %}
{%- from "prometheus/map.jinja" import server, alertmanager, remote_storage_adapter, monitoring with context %}
server:
  alert:
{#- Alerts emitted only when server.enabled is truthy.
    The raw/endraw sections keep the $labels placeholders literal so that
    Jinja does not try to evaluate them while rendering this file. #}
{%- if server.get('enabled', False) %}
{#- PrometheusTargetDown: critical, expression "up != 1" held for 2m. #}
{% raw %}
    PrometheusTargetDown:
      if: 'up != 1'
      for: 2m
      labels:
        severity: critical
        service: prometheus
      annotations:
        summary: 'Prometheus endpoint {{ $labels.instance }} down'
        description: 'The Prometheus target {{ $labels.instance }} is down for the job {{ $labels.job }}.'
{% endraw %}
{#- PrometheusRushMode: warning, rendered only for server.version 1.7.
    NOTE(review): the version is compared with the float literal 1.7, so a
    quoted pillar value ('1.7') would not match. Confirm the pillar type. #}
{%- if server.version == 1.7 %}
{% raw %}
    PrometheusRushMode:
      if: 'prometheus_local_storage_rushed_mode != 0'
      for: 10m
      labels:
        severity: warning
        service: prometheus
      annotations:
        summary: 'Prometheus {{ $labels.instance }} in rush mode'
        description: 'The Prometheus {{ $labels.instance }} is in rush mode for 10m.'
{% endraw %}
{%- endif %}
{%- endif %}
{#- AlertmanagerNotificationFailed: warning, emitted only when
    alertmanager.enabled is truthy. The threshold comes from
    monitoring.alertmanager.notification_failed_rate (coerced to float) and
    is substituted both into the expression and into the description text.
    The raw section is closed inline right before that substitution so the
    $labels / $value placeholders stay literal while threshold is rendered. #}
{%- if alertmanager.get('enabled', False) %}
    AlertmanagerNotificationFailed:
      {%- set threshold = monitoring.alertmanager.notification_failed_rate|float %}
      if: >-
        rate(alertmanager_notifications_failed_total[5m]) > {{ threshold }}
      for: 2m
      {%- raw %}
      labels:
        severity: warning
        service: alertmanager
      annotations:
        summary: 'Alertmanager {{ $labels.instance }} failed notifications'
        description: 'Alertmanager {{ $labels.instance }} failed notifications for {{ $labels.integration }} (current value={{ $value }}, threshold={%- endraw %}{{ threshold }})'
{%- endif %}
{#- PrometheusRemoteStorageQueue: warning, emitted only when
    server.config.remote_write is set to a truthy value. Compares queue
    length / capacity * 100 against
    monitoring.prometheus.remote_storage_queue_full_percent (coerced to
    float); the same threshold is echoed in the description text. #}
{%- if server.get('config', {}).get('remote_write') %}
    PrometheusRemoteStorageQueue:
      {%- set threshold = monitoring.prometheus.remote_storage_queue_full_percent|float %}
      if: >-
        prometheus_remote_storage_queue_length / prometheus_remote_storage_queue_capacity * 100 > {{ threshold }}
      {%- raw %}
      for: 2m
      labels:
        severity: warning
        service: prometheus
      annotations:
        summary: 'Prometheus {{ $labels.instance }} remote storage queue is filling'
        description: 'The Prometheus {{ $labels.instance }} remote storage queue almost full (current value={{ $value }}%, threshold={%- endraw %}{{ threshold }}%)'
{%- endif %}
{#- Alerts emitted only when remote_storage_adapter.enabled is truthy.
    Each alert sets its own threshold from monitoring.remote_storage_adapter
    (coerced to float) and reuses it in the expression and the description.
    NOTE(review): neither alert here has a "for:" clause, unlike the other
    alerts in this file. Confirm that is intentional. #}
{%- if remote_storage_adapter.get('enabled', False) %}
{#- RemoteStorageAdapterSendingTooSlow: warning; threshold is
    sent_vs_received_ratio. #}
    RemoteStorageAdapterSendingTooSlow:
      {%- set threshold = monitoring.remote_storage_adapter.sent_vs_received_ratio|float %}
      if: >-
        100.0 - (100.0 * sent_samples_total{job="remote_storage_adapter"} / on (job, instance) received_samples_total) > {{ threshold }}
{% raw %}
      labels:
        severity: warning
        service: remote_storage_adapter
      annotations:
        summary: 'Remote storage adapter too slow on {{ $labels.instance }}'
        description: 'Remote storage adapter can not ingest samples fast enough on {{ $labels.instance }} (current value={{ $value }}%, threshold={%- endraw %}{{ threshold }}%).'
{#- RemoteStorageAdapterIgnoredTooHigh: warning; threshold is
    ignored_vs_sent_ratio. #}
    RemoteStorageAdapterIgnoredTooHigh:
      {%- set threshold = monitoring.remote_storage_adapter.ignored_vs_sent_ratio|float %}
      if: >-
        100.0 * prometheus_influxdb_ignored_samples_total{job="remote_storage_adapter"} / on (job, instance) sent_samples_total > {{ threshold }}
{% raw %}
      labels:
        severity: warning
        service: remote_storage_adapter
      annotations:
        summary: 'Remote storage adapter receiving too many invalid metrics on {{ $labels.instance }}'
        description: 'Remote storage adapter is receiving too many invalid metrics on {{ $labels.instance }} (current value={{ $value }}%, threshold={%- endraw %}{{ threshold }}%).'
{%- endif %}
{#- Closes the "pillar.prometheus is defined" guard opened at the top. #}
{%- endif %}