{#- Source file: formula-prometheus/prometheus/meta/prometheus.yml (YAML rendered through Jinja; 121 lines, 5.3 KiB in the original listing). #}

{%- if pillar.prometheus is defined %}
{%- from "prometheus/map.jinja" import server, alertmanager, remote_storage_adapter, monitoring with context %}
{#- Root of the rendered document: every alert emitted by the conditional
    sections of this template is an entry of the server.alert mapping, so
    alert has to be nested under server and the alert names under alert. #}
server:
  alert:
{#- Alerts about the Prometheus server's own scraping; rendered only when
    server.enabled is set.
      PrometheusTargetDown (critical): the "up" series of a target differs
        from 1 for 2 minutes.
      PrometheusTargetSamples*Warning (warning): the matching
        prometheus_target_scrapes_sample_* counter increased during the last
        minute; these rules have no "for" clause.
    The rule bodies sit inside raw blocks so that Jinja passes the alert
    template placeholders ($labels, $value) through unrendered. #}
{%- if server.get('enabled', False) %}
{% raw %}
    PrometheusTargetDown:
      if: up != 1
      for: 2m
      labels:
        severity: critical
        service: prometheus
      annotations:
        summary: "Prometheus target is down"
        description: "The Prometheus target for the {{ $labels.job }} job on the {{ $labels.host or $labels.instance }} node is down for 2 minutes."
    PrometheusTargetSamplesOrderWarning:
      if: increase(prometheus_target_scrapes_sample_out_of_order_total[1m]) > 0
      labels:
        severity: warning
        service: prometheus
      annotations:
        summary: "Prometheus samples are out of order"
        description: "{{ $value }} Prometheus samples on the {{ $labels.instance }} instance are out of order (as measured over the last minute)."
    PrometheusTargetSamplesBoundsWarning:
      if: increase(prometheus_target_scrapes_sample_out_of_bounds_total[1m]) > 0
      labels:
        severity: warning
        service: prometheus
      annotations:
        summary: "Prometheus samples timestamps are out of bounds"
        description: "{{ $value }} Prometheus samples on the {{ $labels.instance }} instance have timestamps out of bounds (as measured over the last minute)."
    PrometheusTargetSamplesDuplicateWarning:
      if: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[1m]) > 0
      labels:
        severity: warning
        service: prometheus
      annotations:
        summary: "Prometheus samples have duplicate timestamps"
        description: "{{ $value }} Prometheus samples on the {{ $labels.instance }} instance have duplicate timestamps (as measured over the last minute)."
{% endraw %}
{#- The rushed-mode rule is emitted for version 1.7 only. The version is
    compared as text so that both the float 1.7 and the quoted string "1.7"
    select it; an undefined version renders as an empty string and is skipped. #}
{%- if server.version|string == '1.7' %}
{% raw %}
    PrometheusDataIngestionWarning:
      if: prometheus_local_storage_rushed_mode != 0
      for: 10m
      labels:
        severity: warning
        service: prometheus
      annotations:
        summary: "Prometheus is in the rushed mode"
        description: "The Prometheus service writes on the {{ $labels.instance }} instance do not keep up with data ingestion speed for 10 minutes."
{% endraw %}
{%- endif %}
{%- endif %}
{#- Remote-write queue saturation alert; rendered only when the server
    config defines remote_write. The threshold is read from
    monitoring.prometheus.remote_storage_queue_full_percent, cast to float,
    and inlined by Jinja into both the rule expression and the summary.
    The expression scales queue length / capacity by 100, so the threshold
    and $value are percentages. The raw block is closed and reopened around
    the threshold so that only that one value is rendered by Jinja. #}
{%- if server.get('config', {}).get('remote_write') %}
    PrometheusRemoteStorageQueueFullWarning:
{%- set threshold = monitoring.prometheus.remote_storage_queue_full_percent|float %}
      if: >-
        prometheus_remote_storage_queue_length / prometheus_remote_storage_queue_capacity * 100 > {{ threshold }}
{%- raw %}
      for: 2m
      labels:
        severity: warning
        service: prometheus
      annotations:
        summary: "Prometheus remote storage queue is {%- endraw %} {{ threshold }}{%- raw %}% full"
        description: "The Prometheus remote storage queue on the {{ $labels.instance }} instance is {{ $value }}% full for 2 minutes."
{%- endraw %}
{%- endif %}
{#- Remote storage adapter alerts; rendered only when the adapter is enabled.
      RemoteStorageAdapterMetricsSendingWarning: sent / received samples over
        the last minute dropped below
        monitoring.remote_storage_adapter.sent_vs_received_ratio.
      RemoteStorageAdapterMetricsIgnoredWarning: ignored / sent samples over
        the last minute reached
        monitoring.remote_storage_adapter.ignored_vs_sent_ratio.
    Neither expression is scaled by 100, so both thresholds and $value are
    plain ratios; the annotations therefore report ratios, not percentages.
    Each threshold is cast to float and inlined by Jinja; the raw block is
    closed and reopened around it so the alert placeholders stay unrendered. #}
{%- if remote_storage_adapter.get('enabled', False) %}
    RemoteStorageAdapterMetricsSendingWarning:
{%- set threshold = monitoring.remote_storage_adapter.sent_vs_received_ratio|float %}
      if: >-
        increase(sent_samples_total{job="remote_storage_adapter"}[1m]) / on (job, instance) increase(received_samples_total[1m]) < {{ threshold }}
{% raw %}
      labels:
        severity: warning
        service: remote_storage_adapter
      annotations:
        summary: "Remote storage adapter metrics sent/received ratio reached limit of {%- endraw %} {{ threshold }}{%- raw %}"
        description: "The remote storage adapter metrics sent to received ratio on the {{ $labels.instance }} instance is {{ $value }}."
{% endraw %}
    RemoteStorageAdapterMetricsIgnoredWarning:
{%- set threshold = monitoring.remote_storage_adapter.ignored_vs_sent_ratio|float %}
      if: >-
        increase(prometheus_influxdb_ignored_samples_total{job="remote_storage_adapter"}[1m]) / on (job, instance) increase(sent_samples_total[1m]) >= {{ threshold }}
{% raw %}
      labels:
        severity: warning
        service: remote_storage_adapter
      annotations:
        summary: "Remote storage adapter metrics ignored/sent ratio reached limit of {%- endraw %} {{ threshold }}{%- raw %}"
        description: "The remote storage adapter metrics ignored to sent ratio on the {{ $labels.instance }} instance is {{ $value }}."
{%- endraw %}
{%- endif %}
{#- Alertmanager alerts; rendered only when Alertmanager is enabled.
    Both rules have severity warning and fire when the corresponding counter
    (alertmanager_notifications_failed_total, alertmanager_alerts_invalid_total)
    increased over a 2-minute window and the condition held for 2 minutes.
    The whole section is one raw block because it needs no Jinja values.
    NOTE(review): both descriptions read $labels.integration; confirm that
    alertmanager_alerts_invalid_total actually carries that label. #}
{%- if alertmanager.get('enabled', False) %}
{%- raw %}
    AlertmanagerNotificationFailureWarning:
      if: >-
        increase(alertmanager_notifications_failed_total[2m]) > 0
      for: 2m
      labels:
        severity: warning
        service: alertmanager
      annotations:
        summary: "Alertmanager notifications fail"
        description: "An average of {{ $value }} Alertmanager {{ $labels.integration }} notifications on the {{ $labels.instance }} instance fail for 2 minutes."
    AlertmanagerAlertsInvalidWarning:
      if: >-
        increase(alertmanager_alerts_invalid_total[2m]) > 0
      for: 2m
      labels:
        severity: warning
        service: alertmanager
      annotations:
        summary: "Alertmanager alerts are invalid"
        description: "An average of {{ $value }} Alertmanager {{ $labels.integration }} alerts on the {{ $labels.instance }} instance are invalid for 2 minutes."
{%- endraw %}
{%- endif %}
{%- endif %}