formula-prometheus/prometheus/meta/prometheus.yml
Simon Pasquier cd90c9f842 Trigger the target down alert after 2 minutes
Otherwise the alert fires as soon as Prometheus can't scrape a target.
This is too aggressive in the case of transient connectivity issues or
an endpoint restart.

Change-Id: Ib3de5b141db7a7f2397bf332844a9c44d38f2d3c
2017-09-12 15:14:21 +02:00

42 lines
1.9 KiB
YAML

{#-
  Prometheus alert rules contributed by this formula, laid out as
  server -> alert -> <rule name>. Nothing is rendered unless the prometheus
  pillar is defined, and each group of rules is rendered only when the
  corresponding component reports itself as enabled.

  The alert annotations use the same double-brace placeholder syntax as Jinja,
  so they are wrapped in raw/endraw sections to reach the output untouched.
  Inside those sections only YAML comments are safe; Jinja comment syntax
  would be emitted literally.
#}
{%- if pillar.prometheus is defined %}
{%- from "prometheus/map.jinja" import server, remote_storage_adapter, monitoring with context %}
server:
  alert:
{%- if server.get('enabled', False) %}
{% raw %}
    # The condition must hold for 2 minutes before the alert fires, so that
    # transient connectivity issues or an endpoint restart do not trigger it.
    PrometheusTargetDown:
      if: 'up != 1'
      for: 2m
      labels:
        severity: critical
        service: prometheus
      annotations:
        summary: 'Prometheus endpoint {{ $labels.instance }} down'
        description: 'The Prometheus target {{ $labels.instance }} is down for the job {{ $labels.job }}.'
{% endraw %}
{%- endif %}
{%- if remote_storage_adapter.get('enabled', False) %}
    {#- Percentage of received samples that were not sent, compared to the configured threshold. #}
    RemoteStorageAdapterSendingTooSlow:
      {%- set threshold = monitoring.remote_storage_adapter.sent_vs_received_ratio|float %}
      if: >-
        100.0 - (100.0 * sent_samples_total{job="remote_storage_adapter"} / on (job, instance) received_samples_total) > {{ threshold }}
{% raw %}
      labels:
        severity: warning
        service: remote_storage_adapter
      annotations:
        summary: 'Remote storage adapter too slow on {{ $labels.instance }}'
        description: 'Remote storage adapter can not ingest samples fast enough on {{ $labels.instance }} (current value={{ $value }}%, threshold={%- endraw %}{{ threshold }}%).'
    {#- Percentage of sent samples that were ignored, compared to the configured threshold. #}
    RemoteStorageAdapterIgnoredTooHigh:
      {%- set threshold = monitoring.remote_storage_adapter.ignored_vs_sent_ratio|float %}
      if: >-
        100.0 * prometheus_influxdb_ignored_samples_total{job="remote_storage_adapter"} / on (job, instance) sent_samples_total > {{ threshold }}
{% raw %}
      labels:
        severity: warning
        service: remote_storage_adapter
      annotations:
        summary: 'Remote storage adapter receiving too many invalid metrics on {{ $labels.instance }}'
        description: 'Remote storage adapter is receiving too many invalid metrics on {{ $labels.instance }} (current value={{ $value }}%, threshold={%- endraw %}{{ threshold }}%).'
{%- endif %}
{%- endif %}