{#-
  Prometheus monitoring metadata rendered only when the `prometheus` pillar is
  defined. Produces, under the `server` key:
    * alert  - alert rules, each gated on the component it monitors
               (server, remote write, remote storage adapter, alertmanager,
               relay, non-container "Long Term Storage" server);
    * target - static scrape targets for the non-container server and relay.

  Text between raw/endraw is emitted verbatim so that the Prometheus
  annotation templates ($labels, $value) are not evaluated by Jinja. The
  whitespace-control markers on the tags are significant: do not change
  them without re-checking the rendered YAML.
#}
{%- if pillar.prometheus is defined %}
{%- from "prometheus/map.jinja" import server, alertmanager, remote_storage_adapter, monitoring, relay with context %}
server:
  alert:
{#- Rules about scraping itself; emitted only when the server is enabled. #}
{%- if server.get('enabled', False) %}
{% raw %}
    PrometheusTargetDown:
      if: >-
        up != 1
      for: 2m
      labels:
        severity: critical
        service: prometheus
      annotations:
        summary: "Prometheus target is down"
        description: "The Prometheus target for the {{ $labels.job }} job on the {{ $labels.host or $labels.instance }} node is down for 2 minutes."
    PrometheusTargetSamplesOrderWarning:
      if: >-
        increase(prometheus_target_scrapes_sample_out_of_order_total[1m]) > 0
      labels:
        severity: warning
        service: prometheus
      annotations:
        summary: "Prometheus samples are out of order"
        description: "{{ $value }} Prometheus samples on the {{ $labels.instance }} instance are out of order (as measured over the last minute)."
    PrometheusTargetSamplesBoundsWarning:
      if: >-
        increase(prometheus_target_scrapes_sample_out_of_bounds_total[1m]) > 0
      labels:
        severity: warning
        service: prometheus
      annotations:
        summary: "Prometheus samples timestamps are out of bounds"
        description: "{{ $value }} Prometheus samples on the {{ $labels.instance }} instance have timestamps out of bounds (as measured over the last minute)."
    PrometheusTargetSamplesDuplicateWarning:
      if: >-
        increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[1m]) > 0
      labels:
        severity: warning
        service: prometheus
      annotations:
        summary: "Prometheus samples have duplicate timestamps"
        description: "{{ $value }} Prometheus samples on the {{ $labels.instance }} instance have duplicate timestamps (as measured over the last minute)."
{% endraw %}
{#- Rushed-mode rule is emitted only when server.version equals 1.7.
    NOTE(review): this is a float comparison; confirm the pillar never
    supplies the version as a string. #}
{%- if server.version == 1.7 %}
{% raw %}
    PrometheusDataIngestionWarning:
      if: >-
        prometheus_local_storage_rushed_mode != 0
      for: 10m
      labels:
        severity: warning
        service: prometheus
      annotations:
        summary: "Prometheus is in the rushed mode"
        description: "The Prometheus service writes on the {{ $labels.instance }} instance do not keep up with data ingestion speed for 10 minutes."
{% endraw %}
{%- endif %}
{%- endif %}
{#- Remote-write queue rule; the threshold is read from the monitoring map
    and interpolated into both the expression and the summary. #}
{%- if server.get('config', {}).get('remote_write') %}
    PrometheusRemoteStorageQueueFullWarning:
      {%- set threshold = monitoring.prometheus.remote_storage_queue_full_percent|float %}
      if: >-
        prometheus_remote_storage_queue_length / prometheus_remote_storage_queue_capacity * 100 > {{ threshold }}
      {%- raw %}
      for: 2m
      labels:
        severity: warning
        service: prometheus
      annotations:
        summary: "Prometheus remote storage queue is {%- endraw %} {{ threshold }}{%- raw %}% full"
        description: "The Prometheus remote storage queue on the {{ $labels.instance }} instance is {{ $value }}% full for 2 minutes."
      {%- endraw %}
{%- endif %}
{#- Remote storage adapter rules; each rule re-sets `threshold` from its own
    monitoring setting before using it. #}
{%- if remote_storage_adapter.get('enabled', False) %}
    RemoteStorageAdapterMetricsSendingWarning:
      {%- set threshold = monitoring.remote_storage_adapter.sent_vs_received_ratio|float %}
      if: >-
        increase(sent_samples_total{job="remote_storage_adapter"}[1m]) / on (job, instance) increase(received_samples_total[1m]) < {{ threshold }}
{% raw %}
      labels:
        severity: warning
        service: remote_storage_adapter
      annotations:
        summary: "Remote storage adapter metrics sent/received ratio reached limit of {%- endraw %} {{ threshold }}{%- raw %}"
        description: "The remote storage adapter metrics on sent to received ratio on the {{ $labels.instance }} instance is {{ $value }}."
{% endraw %}
    RemoteStorageAdapterMetricsIgnoredWarning:
      {%- set threshold = monitoring.remote_storage_adapter.ignored_vs_sent_ratio|float %}
      if: >-
        increase(prometheus_influxdb_ignored_samples_total{job="remote_storage_adapter"}[1m]) / on (job, instance) increase(sent_samples_total[1m]) >= {{ threshold }}
{% raw %}
      labels:
        severity: warning
        service: remote_storage_adapter
      annotations:
        summary: "{%- endraw %}{{ threshold }}{%- raw %}% of remote storage adapter metrics are invalid"
        description: "{{ $value }}% of remote storage adapter metrics on the {{ $labels.instance }} instance are invalid."
{%- endraw %}
{%- endif %}
{#- Alertmanager rules. #}
{%- if alertmanager.get('enabled', False) %}
{%- raw %}
    AlertmanagerNotificationFailureWarning:
      if: >-
        increase(alertmanager_notifications_failed_total[2m]) > 0
      for: 2m
      labels:
        severity: warning
        service: alertmanager
      annotations:
        summary: "Alertmanager notifications fail"
        description: "An average of {{ $value }} Alertmanager {{ $labels.integration }} notifications on the {{ $labels.instance }} instance fail for 2 minutes."
    AlertmanagerAlertsInvalidWarning:
      if: >-
        increase(alertmanager_alerts_invalid_total[2m]) > 0
      for: 2m
      labels:
        severity: warning
        service: alertmanager
      annotations:
        summary: "Alertmanager alerts are invalid"
        description: "An average of {{ $value }} Alertmanager {{ $labels.integration }} alerts on the {{ $labels.instance }} instance are invalid for 2 minutes."
{%- endraw %}
{%- endif %}
{#- Relay process rules: single instance down (minor), at least half down
    (major), all down (critical). #}
{%- if relay.get('enabled', False) %}
{%- raw %}
    PrometheusRelayServiceDown:
      if: >-
        procstat_running{process_name="prometheus-relay"} == 0
      for: 2m
      labels:
        severity: minor
        service: prometheus
      annotations:
        summary: "Prometheus relay service is down"
        description: "The Prometheus relay service on the {{$labels.host}} node is down for 2 minutes."
    PrometheusRelayServiceDownMajor:
      if: >-
        count(procstat_running{process_name="prometheus-relay"} == 0) >= count(procstat_running{process_name="prometheus-relay"}) * 0.5
      for: 2m
      labels:
        severity: major
        service: prometheus
      annotations:
        summary: "50% of Prometheus relay services are down"
        description: "{{ $value }} of Prometheus relay services (>= 50%) are down for 2 minutes."
    PrometheusRelayServiceOutage:
      if: >-
        count(procstat_running{process_name="prometheus-relay"} == 0) == count(procstat_running{process_name="prometheus-relay"})
      for: 2m
      labels:
        severity: critical
        service: prometheus
      annotations:
        summary: "Prometheus relay service outage"
        description: "All Prometheus relay services are down for 2 minutes."
{%- endraw %}
{%- endif %}
{#- Long Term Storage (non-container server) process rules. These carry their
    own PrometheusLTS* names: reusing the relay rule names here would create
    duplicate keys under `alert` whenever the relay is enabled as well, and
    the relay definitions would be silently replaced. #}
{%- if server.get("enabled", False) and not server.get("is_container", True) %}
{%- raw %}
    PrometheusLTSServiceDown:
      if: >-
        procstat_running{process_name="prometheus"} == 0
      for: 2m
      labels:
        severity: minor
        service: prometheus
      annotations:
        summary: "Prometheus Long Term Storage service is down"
        description: "The Prometheus Long Term Storage service on the {{$labels.host}} node is down for 2 minutes."
    PrometheusLTSServiceDownMajor:
      if: >-
        count(procstat_running{process_name="prometheus"} == 0) >= count(procstat_running{process_name="prometheus"}) * 0.5
      for: 2m
      labels:
        severity: major
        service: prometheus
      annotations:
        summary: "50% of Prometheus Long Term Storage services are down"
        description: "{{ $value }} of Prometheus Long Term Storage services (>= 50%) are down for 2 minutes."
    PrometheusLTSServiceOutage:
      if: >-
        count(procstat_running{process_name="prometheus"} == 0) == count(procstat_running{process_name="prometheus"})
      for: 2m
      labels:
        severity: critical
        service: prometheus
      annotations:
        summary: "Prometheus Long Term Storage service outage"
        description: "All Prometheus Long Term Storage services are down for 2 minutes."
{%- endraw %}
{%- endif %}
{#- Static scrape targets for the non-container server and the relay.
    `addresses` collects the configured bind address (unless it is loopback or
    0.0.0.0) followed by every non-loopback entry of the fqdn_ip4 grain; the
    first collected entry is used for both targets.
    NOTE(review): addresses[0] is read without checking that the list is
    non-empty - confirm a non-loopback address is always available. #}
{%- if server.get("enabled", False) and not server.get("is_container", True) and relay.get('enabled', False) %}
  {%- set addresses = [] %}
  {%- if server.get('bind', {}).address is defined and not server.bind.address.startswith('127') and server.bind.address != '0.0.0.0' %}
    {%- do addresses.append(server.bind.address) %}
  {%- endif %}
  {%- for address in grains['fqdn_ip4'] %}
    {%- if not address.startswith('127') %}
      {%- do addresses.append(address) %}
    {%- endif %}
  {%- endfor %}
  target:
    static:
      prometheus_lts:
        enabled: true
        endpoint:
          - address: "{{ addresses[0] }}"
            port: {{ server.bind.port }}
        relabel_configs:
          - regex: "{{ addresses[0] }}:{{ server.bind.port }}"
            replacement: "{{ grains['host'] }}"
            source_labels: "__address__"
            target_label: "host"
      prometheus_relay:
        enabled: true
        endpoint:
          - address: "{{ addresses[0] }}"
            port: {{ relay.bind.port }}
        relabel_configs:
          - regex: "{{ addresses[0] }}:{{ relay.bind.port }}"
            replacement: "{{ grains['host'] }}"
            source_labels: "__address__"
            target_label: "host"
{%- endif %}
{%- endif %}