formula-prometheus/prometheus/meta/prometheus.yml

{%- if pillar.prometheus is defined %}
{%- from "prometheus/map.jinja" import server, alertmanager, remote_storage_adapter, monitoring, relay with context %}
# Monitoring metadata for Prometheus itself: alert rules are collected under
# server.alert and, when applicable, static scrape targets under
# server.target. Nothing is rendered unless the prometheus pillar is defined.
server:
  alert:
{%- if server.get('enabled', False) %}
{% raw %}
    # Alerts in this section are rendered only when the server is enabled.
    # The surrounding raw block keeps the double-brace annotation placeholders
    # literal so that they reach Prometheus untouched.
    #
    # A scrape target whose `up` series is not 1, held for 2 minutes.
    PrometheusTargetDown:
      if: up != 1
      for: 2m
      labels:
        severity: critical
        service: prometheus
      annotations:
        summary: "Prometheus target is down"
        description: "The Prometheus target for the {{ $labels.job }} job on the {{ $labels.host or $labels.instance }} node is down for 2 minutes."
    # The out-of-order sample counter grew during the last minute. There is no
    # `for` clause on this rule.
    PrometheusTargetSamplesOrderWarning:
      if: increase(prometheus_target_scrapes_sample_out_of_order_total[1m]) > 0
      labels:
        severity: warning
        service: prometheus
      annotations:
        summary: "Prometheus samples are out of order"
        description: "{{ $value }} Prometheus samples on the {{ $labels.instance }} instance are out of order (as measured over the last minute)."
    # The out-of-bounds timestamp counter grew during the last minute. There is
    # no `for` clause on this rule.
    PrometheusTargetSamplesBoundsWarning:
      if: increase(prometheus_target_scrapes_sample_out_of_bounds_total[1m]) > 0
      labels:
        severity: warning
        service: prometheus
      annotations:
        summary: "Prometheus samples timestamps are out of bounds"
        description: "{{ $value }} Prometheus samples on the {{ $labels.instance }} instance have timestamps out of bounds (as measured over the last minute)."
    # The duplicate-timestamp counter grew during the last minute. There is no
    # `for` clause on this rule.
    PrometheusTargetSamplesDuplicateWarning:
      if: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[1m]) > 0
      labels:
        severity: warning
        service: prometheus
      annotations:
        summary: "Prometheus samples have duplicate timestamps"
        description: "{{ $value }} Prometheus samples on the {{ $labels.instance }} instance have duplicate timestamps (as measured over the last minute)."
{% endraw %}
{%- if server.version == 1.7 %}
{% raw %}
    # Rendered only when the template guard `server.version == 1.7` holds.
    # NOTE(review): the guard compares against the number 1.7; presumably a
    # version supplied as a string would not match - confirm against the pillar.
    #
    # The rushed-mode gauge is non-zero for 10 minutes.
    PrometheusDataIngestionWarning:
      if: prometheus_local_storage_rushed_mode != 0
      for: 10m
      labels:
        severity: warning
        service: prometheus
      annotations:
        summary: "Prometheus is in the rushed mode"
        description: "The Prometheus service writes on the {{ $labels.instance }} instance do not keep up with data ingestion speed for 10 minutes."
{% endraw %}
{%- endif %}
{%- endif %}
{%- if server.get('config', {}).get('remote_write') %}
    # Rendered only when the server config contains a remote_write entry.
    # Queue length as a percentage of queue capacity stays above the threshold
    # for 2 minutes. The threshold is taken from
    # monitoring.prometheus.remote_storage_queue_full_percent, cast to float.
    PrometheusRemoteStorageQueueFullWarning:
      {%- set threshold = monitoring.prometheus.remote_storage_queue_full_percent|float %}
      if: >-
        prometheus_remote_storage_queue_length / prometheus_remote_storage_queue_capacity * 100 > {{ threshold }}
{%- raw %}
      for: 2m
      labels:
        severity: warning
        service: prometheus
      annotations:
        # The raw block is closed and reopened inside the summary string so the
        # threshold is substituted at render time while the placeholders in the
        # description stay literal.
        summary: "Prometheus remote storage queue is {%- endraw %} {{ threshold }}{%- raw %}% full"
        description: "The Prometheus remote storage queue on the {{ $labels.instance }} instance is {{ $value }}% full for 2 minutes."
{%- endraw %}
{%- endif %}
{%- if remote_storage_adapter.get('enabled', False) %}
    # Rendered only when the remote storage adapter is enabled.
    # The one-minute increase of sent samples divided by the one-minute increase
    # of received samples (matched on job and instance) is below the threshold
    # from monitoring.remote_storage_adapter.sent_vs_received_ratio.
    RemoteStorageAdapterMetricsSendingWarning:
      {%- set threshold = monitoring.remote_storage_adapter.sent_vs_received_ratio|float %}
      if: >-
        increase(sent_samples_total{job="remote_storage_adapter"}[1m]) / on (job, instance) increase(received_samples_total[1m]) < {{ threshold }}
{% raw %}
      labels:
        severity: warning
        service: remote_storage_adapter
      annotations:
        # The raw block is closed and reopened inline to splice in the threshold.
        summary: "Remote storage adapter metrics sent/received ratio reached limit of {%- endraw %} {{ threshold }}{%- raw %}"
        description: "The remote storage adapter metrics on sent to received ratio on the {{ $labels.instance }} instance is {{ $value }}."
{% endraw %}
    # The one-minute increase of ignored samples divided by the one-minute
    # increase of sent samples (matched on job and instance) is at or above the
    # threshold from monitoring.remote_storage_adapter.ignored_vs_sent_ratio.
    # NOTE(review): the expression compares a plain ratio (it is not multiplied
    # by 100) while the summary and description append a percent sign - confirm
    # the intended unit of the threshold.
    RemoteStorageAdapterMetricsIgnoredWarning:
      {%- set threshold = monitoring.remote_storage_adapter.ignored_vs_sent_ratio|float %}
      if: >-
        increase(prometheus_influxdb_ignored_samples_total{job="remote_storage_adapter"}[1m]) / on (job, instance) increase(sent_samples_total[1m]) >= {{ threshold }}
{% raw %}
      labels:
        severity: warning
        service: remote_storage_adapter
      annotations:
        summary: "{%- endraw %}{{ threshold }}{%- raw %}% of remote storage adapter metrics are invalid"
        description: "{{ $value }}% of remote storage adapter metrics on the {{ $labels.instance }} instance are invalid."
{%- endraw %}
{%- endif %}
{%- if alertmanager.get('enabled', False) %}
{%- raw %}
    # Alertmanager rules, rendered only when Alertmanager is enabled. Both
    # rules look at a counter's increase over a 2-minute window and must hold
    # for a further 2 minutes before firing.
    # NOTE(review): the descriptions say "An average of" although the
    # expressions use increase() - the wording may be imprecise.
    #
    # At least one notification failed to be delivered.
    AlertmanagerNotificationFailureWarning:
      if: 'increase(alertmanager_notifications_failed_total[2m]) > 0'
      for: 2m
      labels:
        severity: warning
        service: alertmanager
      annotations:
        summary: 'Alertmanager notifications fail'
        description: >-
          An average of {{ $value }} Alertmanager {{ $labels.integration }}
          notifications on the {{ $labels.instance }} instance fail for
          2 minutes.
    # At least one received alert was rejected as invalid.
    AlertmanagerAlertsInvalidWarning:
      if: 'increase(alertmanager_alerts_invalid_total[2m]) > 0'
      for: 2m
      labels:
        severity: warning
        service: alertmanager
      annotations:
        summary: 'Alertmanager alerts are invalid'
        description: >-
          An average of {{ $value }} Alertmanager {{ $labels.integration }}
          alerts on the {{ $labels.instance }} instance are invalid for
          2 minutes.
{%- endraw %}
{%- endif %}
{%- if relay.get('enabled', False) %}
{%- raw %}
    # Relay process rules, rendered only when the relay is enabled. All three
    # are based on procstat_running for the "prometheus-relay" process and hold
    # for 2 minutes; severity escalates with the share of processes reporting 0.
    #
    # A single relay process reports 0.
    PrometheusRelayServiceDown:
      if: 'procstat_running{process_name="prometheus-relay"} == 0'
      for: 2m
      labels:
        severity: minor
        service: prometheus
      annotations:
        summary: 'Prometheus relay service is down'
        description: >-
          The Prometheus relay service on the {{$labels.host}} node is down
          for 2 minutes.
    # At least half of the relay processes report 0.
    PrometheusRelayServiceDownMajor:
      if: >-
        count(procstat_running{process_name="prometheus-relay"} == 0) >=
        count(procstat_running{process_name="prometheus-relay"}) * 0.5
      for: 2m
      labels:
        severity: major
        service: prometheus
      annotations:
        summary: '50% of Prometheus relay services are down'
        description: >-
          {{ $value }} of Prometheus relay services (>= 50%) are down for
          2 minutes.
    # Every relay process reports 0.
    PrometheusRelayServiceOutage:
      if: >-
        count(procstat_running{process_name="prometheus-relay"} == 0) ==
        count(procstat_running{process_name="prometheus-relay"})
      for: 2m
      labels:
        severity: critical
        service: prometheus
      annotations:
        summary: 'Prometheus relay service outage'
        description: 'All Prometheus relay services are down for 2 minutes.'
{%- endraw %}
{%- endif %}
{%- if server.get("enabled", False) and not server.get("is_container", True) %}
{%- raw %}
    # Long Term Storage (LTS) process rules, rendered only when the server is
    # enabled and is not running as a container. All three are based on
    # procstat_running for the "prometheus" process and hold for 2 minutes.
    #
    # Every rule here carries the PrometheusLTS prefix. The major and outage
    # rules must not reuse the relay rules' names: all alerts share one
    # mapping, so a repeated key would silently replace the relay rule (or be
    # rejected by a strict YAML loader) when both sections are rendered.
    #
    # A single LTS process reports 0.
    PrometheusLTSServiceDown:
      if: >-
        procstat_running{process_name="prometheus"} == 0
      for: 2m
      labels:
        severity: minor
        service: prometheus
      annotations:
        summary: "Prometheus Long Term Storage service is down"
        description: "The Prometheus Long Term Storage service on the {{$labels.host}} node is down for 2 minutes."
    # At least half of the LTS processes report 0.
    PrometheusLTSServiceDownMajor:
      if: >-
        count(procstat_running{process_name="prometheus"} == 0) >= count(procstat_running{process_name="prometheus"}) * 0.5
      for: 2m
      labels:
        severity: major
        service: prometheus
      annotations:
        summary: "50% of Prometheus Long Term Storage services are down"
        description: "{{ $value }} of Prometheus Long Term Storage services (>= 50%) are down for 2 minutes."
    # Every LTS process reports 0.
    PrometheusLTSServiceOutage:
      if: >-
        count(procstat_running{process_name="prometheus"} == 0) == count(procstat_running{process_name="prometheus"})
      for: 2m
      labels:
        severity: critical
        service: prometheus
      annotations:
        summary: "Prometheus Long Term Storage service outage"
        description: "All Prometheus Long Term Storage services are down for 2 minutes."
{%- endraw %}
{%- endif %}
{%- if server.get("enabled", False) and not server.get("is_container", True) and relay.get('enabled', False) %}
  {#- Candidate scrape addresses in priority order: the explicit bind address
      (unless it is loopback or the 0.0.0.0 wildcard), then every non-loopback
      entry of the fqdn_ip4 grain. A missing grain is treated as an empty list. #}
  {%- set addresses = [] %}
  {%- if server.get('bind', {}).address is defined and not server.bind.address.startswith('127') and server.bind.address != '0.0.0.0' %}
    {%- do addresses.append(server.bind.address) %}
  {%- endif %}
  {%- for address in grains.get('fqdn_ip4', []) %}
    {%- if not address.startswith('127') %}
      {%- do addresses.append(address) %}
    {%- endif %}
  {%- endfor %}
  {#- Emit the targets only when an address was found. Indexing an empty list
      would otherwise yield an undefined value for the endpoint address and
      for the host part of the relabel regex. #}
  {%- if addresses %}
  # Static scrape targets for the non-containerised Prometheus (LTS) server and
  # its relay. Each target relabels its own address:port to the minion's host
  # name in the "host" label.
  target:
    static:
      prometheus_lts:
        enabled: True
        endpoint:
          - address: {{ addresses[0] }}
            port: {{ server.bind.port }}
        relabel_configs:
          - regex: {{ addresses[0] }}:{{ server.bind.port }}
            replacement: {{ grains['host'] }}
            source_labels: "__address__"
            target_label: "host"
      prometheus_relay:
        enabled: True
        endpoint:
          - address: {{ addresses[0] }}
            port: {{ relay.bind.port }}
        relabel_configs:
          - regex: {{ addresses[0] }}:{{ relay.bind.port }}
            replacement: {{ grains['host'] }}
            source_labels: "__address__"
            target_label: "host"
  {%- endif %}
{%- endif %}
{%- endif %}
Add Prometheus alerts Change-Id: I4ad10555d728d62c8e6504659d30558f95b410ac 2017-07-25 10:11:43 -04:00			`{%- if pillar.prometheus is defined %}`
Add alerts and targets for prometheus LTS and relay Closes-Bug: PROD-20724 Change-Id: I1f4839a4900a2d417d85a52ffef6e11e4bb2cac1 2018-06-18 08:32:59 -04:00			`{%- from "prometheus/map.jinja" import server, alertmanager, remote_storage_adapter, monitoring, relay with context %}`
Add support.yml for alerts and recording rules Change-Id: If1927033922c350257999f59ba3031445689e11b 2017-04-11 06:17:08 -04:00			`server:`
			`alert:`
Add Prometheus alerts Change-Id: I4ad10555d728d62c8e6504659d30558f95b410ac 2017-07-25 10:11:43 -04:00			`{%- if server.get('enabled', False) %}`
			`{% raw %}`
Rename Prometheus alerts for consistency Change-Id: I96fb789bf73af22d56fc6c6980626647f87409d4 2017-07-24 09:38:28 -04:00			`PrometheusTargetDown:`
Alerts reworked Change alerts names, severities and descriptions. Change-Id: Ib06f08a6f336d28592d5f70e97aedfeb12eb603c Closes-bug: PROD-19698 2018-05-03 09:28:45 -04:00			`if: up != 1`
Trigger the target down alert after 2 minutes Otherwise the alert fires as soon as Prometheus can't scrape a target. It is too aggressive in case of transient connectivity issues or endpoint restart. Change-Id: Ib3de5b141db7a7f2397bf332844a9c44d38f2d3c 2017-09-12 09:14:21 -04:00			`for: 2m`
Add support.yml for alerts and recording rules Change-Id: If1927033922c350257999f59ba3031445689e11b 2017-04-11 06:17:08 -04:00			`labels:`
			`severity: critical`
			`service: prometheus`
			`annotations:`
Alerts reworked Change alerts names, severities and descriptions. Change-Id: Ib06f08a6f336d28592d5f70e97aedfeb12eb603c Closes-bug: PROD-19698 2018-05-03 09:28:45 -04:00			`summary: "Prometheus target is down"`
Cosmetic changes for alerts Change-Id: I9e6b2f4a5876e7d5697236166b4a6dc30cf4615a Closes-bug: PROD-20466 2018-06-08 09:40:14 -04:00			`description: "The Prometheus target for the {{ $labels.job }} job on the {{ $labels.host or $labels.instance }} node is down for 2 minutes."`
Alerts reworked Change alerts names, severities and descriptions. Change-Id: Ib06f08a6f336d28592d5f70e97aedfeb12eb603c Closes-bug: PROD-19698 2018-05-03 09:28:45 -04:00			`PrometheusTargetSamplesOrderWarning:`
			`if: increase(prometheus_target_scrapes_sample_out_of_order_total[1m]) > 0`
			`labels:`
			`severity: warning`
			`service: prometheus`
			`annotations:`
			`summary: "Prometheus samples are out of order"`
Cosmetic changes for alerts Change-Id: I9e6b2f4a5876e7d5697236166b4a6dc30cf4615a Closes-bug: PROD-20466 2018-06-08 09:40:14 -04:00			`description: "{{ $value }} Prometheus samples on the {{ $labels.instance }} instance are out of order (as measured over the last minute)."`
Alerts reworked Change alerts names, severities and descriptions. Change-Id: Ib06f08a6f336d28592d5f70e97aedfeb12eb603c Closes-bug: PROD-19698 2018-05-03 09:28:45 -04:00			`PrometheusTargetSamplesBoundsWarning:`
			`if: increase(prometheus_target_scrapes_sample_out_of_bounds_total[1m]) > 0`
			`labels:`
			`severity: warning`
			`service: prometheus`
			`annotations:`
			`summary: "Prometheus samples timestamps are out of bounds"`
Cosmetic changes for alerts Change-Id: I9e6b2f4a5876e7d5697236166b4a6dc30cf4615a Closes-bug: PROD-20466 2018-06-08 09:40:14 -04:00			`description: "{{ $value }} Prometheus samples on the {{ $labels.instance }} instance have timestamps out of bounds (as measured over the last minute)."`
Alerts reworked Change alerts names, severities and descriptions. Change-Id: Ib06f08a6f336d28592d5f70e97aedfeb12eb603c Closes-bug: PROD-19698 2018-05-03 09:28:45 -04:00			`PrometheusTargetSamplesDuplicateWarning:`
			`if: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[1m]) > 0`
			`labels:`
			`severity: warning`
			`service: prometheus`
			`annotations:`
			`summary: "Prometheus samples have duplicate timestamps"`
Cosmetic changes for alerts Change-Id: I9e6b2f4a5876e7d5697236166b4a6dc30cf4615a Closes-bug: PROD-20466 2018-06-08 09:40:14 -04:00			`description: "{{ $value }} Prometheus samples on the {{ $labels.instance }} instance have duplicate timestamps (as measured over the last minute)."`
Add support for prometheus 2.0 New version changes: * different alerts format * rewritten storage (some config flags removed) Closes-Bug: PROD-16609 Change-Id: I805fa322e4744e98177d6c3e29589ebc6fb917a2 2017-12-20 05:28:41 -05:00			`{% endraw %}`
			`{%- if server.version == 1.7 %}`
			`{% raw %}`
Alerts reworked Change alerts names, severities and descriptions. Change-Id: Ib06f08a6f336d28592d5f70e97aedfeb12eb603c Closes-bug: PROD-19698 2018-05-03 09:28:45 -04:00			`PrometheusDataIngestionWarning:`
			`if: prometheus_local_storage_rushed_mode != 0`
Add prometheus alerts * PrometheusRushMode * PrometheusRemoteStorageQueue * AlertmanagerNotificationFailed Change-Id: I5a875e7b9861f860bac501da55f0e8b20e799d52 2017-09-25 06:52:58 -04:00			`for: 10m`
			`labels:`
			`severity: warning`
			`service: prometheus`
			`annotations:`
Alerts reworked Change alerts names, severities and descriptions. Change-Id: Ib06f08a6f336d28592d5f70e97aedfeb12eb603c Closes-bug: PROD-19698 2018-05-03 09:28:45 -04:00			`summary: "Prometheus is in the rushed mode"`
Cosmetic changes for alerts Change-Id: I9e6b2f4a5876e7d5697236166b4a6dc30cf4615a Closes-bug: PROD-20466 2018-06-08 09:40:14 -04:00			`description: "The Prometheus service writes on the {{ $labels.instance }} instance do not keep up with data ingestion speed for 10 minutes."`
Add support.yml for alerts and recording rules Change-Id: If1927033922c350257999f59ba3031445689e11b 2017-04-11 06:17:08 -04:00			`{% endraw %}`
Add Prometheus alerts Change-Id: I4ad10555d728d62c8e6504659d30558f95b410ac 2017-07-25 10:11:43 -04:00			`{%- endif %}`
Add support for prometheus 2.0 New version changes: * different alerts format * rewritten storage (some config flags removed) Closes-Bug: PROD-16609 Change-Id: I805fa322e4744e98177d6c3e29589ebc6fb917a2 2017-12-20 05:28:41 -05:00			`{%- endif %}`
Add prometheus alerts * PrometheusRushMode * PrometheusRemoteStorageQueue * AlertmanagerNotificationFailed Change-Id: I5a875e7b9861f860bac501da55f0e8b20e799d52 2017-09-25 06:52:58 -04:00			`{%- if server.get('config', {}).get('remote_write') %}`
Alerts reworked Change alerts names, severities and descriptions. Change-Id: Ib06f08a6f336d28592d5f70e97aedfeb12eb603c Closes-bug: PROD-19698 2018-05-03 09:28:45 -04:00			`PrometheusRemoteStorageQueueFullWarning:`
Add prometheus alerts * PrometheusRushMode * PrometheusRemoteStorageQueue * AlertmanagerNotificationFailed Change-Id: I5a875e7b9861f860bac501da55f0e8b20e799d52 2017-09-25 06:52:58 -04:00			`{%- set threshold = monitoring.prometheus.remote_storage_queue_full_percent\|float %}`
			`if: >-`
			`prometheus_remote_storage_queue_length / prometheus_remote_storage_queue_capacity * 100 > {{ threshold }}`
			`{%- raw %}`
			`for: 2m`
			`labels:`
			`severity: warning`
			`service: prometheus`
			`annotations:`
Cosmetic changes for alerts Change-Id: I9e6b2f4a5876e7d5697236166b4a6dc30cf4615a Closes-bug: PROD-20466 2018-06-08 09:40:14 -04:00			`summary: "Prometheus remote storage queue is {%- endraw %} {{ threshold }}{%- raw %}% full"`
			`description: "The Prometheus remote storage queue on the {{ $labels.instance }} instance is {{ $value }}% full for 2 minutes."`
Alerts reworked Change alerts names, severities and descriptions. Change-Id: Ib06f08a6f336d28592d5f70e97aedfeb12eb603c Closes-bug: PROD-19698 2018-05-03 09:28:45 -04:00			`{%- endraw %}`
Add prometheus alerts * PrometheusRushMode * PrometheusRemoteStorageQueue * AlertmanagerNotificationFailed Change-Id: I5a875e7b9861f860bac501da55f0e8b20e799d52 2017-09-25 06:52:58 -04:00			`{%- endif %}`
Add Prometheus alerts Change-Id: I4ad10555d728d62c8e6504659d30558f95b410ac 2017-07-25 10:11:43 -04:00			`{%- if remote_storage_adapter.get('enabled', False) %}`
Alerts reworked Change alerts names, severities and descriptions. Change-Id: Ib06f08a6f336d28592d5f70e97aedfeb12eb603c Closes-bug: PROD-19698 2018-05-03 09:28:45 -04:00			`RemoteStorageAdapterMetricsSendingWarning:`
Add Prometheus alerts Change-Id: I4ad10555d728d62c8e6504659d30558f95b410ac 2017-07-25 10:11:43 -04:00			`{%- set threshold = monitoring.remote_storage_adapter.sent_vs_received_ratio\|float %}`
			`if: >-`
Alerts reworked Change alerts names, severities and descriptions. Change-Id: Ib06f08a6f336d28592d5f70e97aedfeb12eb603c Closes-bug: PROD-19698 2018-05-03 09:28:45 -04:00			`increase(sent_samples_total{job="remote_storage_adapter"}[1m]) / on (job, instance) increase(received_samples_total[1m]) < {{ threshold }}`
Add Prometheus alerts Change-Id: I4ad10555d728d62c8e6504659d30558f95b410ac 2017-07-25 10:11:43 -04:00			`{% raw %}`
			`labels:`
			`severity: warning`
			`service: remote_storage_adapter`
			`annotations:`
Cosmetic changes for alerts Change-Id: I9e6b2f4a5876e7d5697236166b4a6dc30cf4615a Closes-bug: PROD-20466 2018-06-08 09:40:14 -04:00			`summary: "Remote storage adapter metrics sent/received ratio reached limit of {%- endraw %} {{ threshold }}{%- raw %}"`
Fix typo Change-Id: I52958fe30538491b0c8b150b3a206b8009b537c1 2018-06-13 07:03:05 -04:00			`description: "The remote storage adapter metrics on sent to received ratio on the {{ $labels.instance }} instance is {{ $value }}."`
Alerts reworked Change alerts names, severities and descriptions. Change-Id: Ib06f08a6f336d28592d5f70e97aedfeb12eb603c Closes-bug: PROD-19698 2018-05-03 09:28:45 -04:00			`{% endraw %}`
			`RemoteStorageAdapterMetricsIgnoredWarning:`
Add Prometheus alerts Change-Id: I4ad10555d728d62c8e6504659d30558f95b410ac 2017-07-25 10:11:43 -04:00			`{%- set threshold = monitoring.remote_storage_adapter.ignored_vs_sent_ratio\|float %}`
			`if: >-`
Alerts reworked Change alerts names, severities and descriptions. Change-Id: Ib06f08a6f336d28592d5f70e97aedfeb12eb603c Closes-bug: PROD-19698 2018-05-03 09:28:45 -04:00			`increase(prometheus_influxdb_ignored_samples_total{job="remote_storage_adapter"}[1m]) / on (job, instance) increase(sent_samples_total[1m]) >= {{ threshold }}`
Add Prometheus alerts Change-Id: I4ad10555d728d62c8e6504659d30558f95b410ac 2017-07-25 10:11:43 -04:00			`{% raw %}`
			`labels:`
			`severity: warning`
			`service: remote_storage_adapter`
			`annotations:`
Alerts reworked Change alerts names, severities and descriptions. Change-Id: Ib06f08a6f336d28592d5f70e97aedfeb12eb603c Closes-bug: PROD-19698 2018-05-03 09:28:45 -04:00			`summary: "{%- endraw %}{{ threshold }}{%- raw %}% of remote storage adapter metrics are invalid"`
			`description: "{{ $value }}% of remote storage adapter metrics on the {{ $labels.instance }} instance are invalid."`
			`{%- endraw %}`
			`{%- endif %}`
			`{%- if alertmanager.get('enabled', False) %}`
			`{%- raw %}`
			`AlertmanagerNotificationFailureWarning:`
			`if: >-`
			`increase(alertmanager_notifications_failed_total[2m]) > 0`
			`for: 2m`
			`labels:`
			`severity: warning`
			`service: alertmanager`
			`annotations:`
			`summary: "Alertmanager notifications fail"`
Cosmetic changes for alerts Change-Id: I9e6b2f4a5876e7d5697236166b4a6dc30cf4615a Closes-bug: PROD-20466 2018-06-08 09:40:14 -04:00			`description: "An average of {{ $value }} Alertmanager {{ $labels.integration }} notifications on the {{ $labels.instance }} instance fail for 2 minutes."`
Alerts reworked Change alerts names, severities and descriptions. Change-Id: Ib06f08a6f336d28592d5f70e97aedfeb12eb603c Closes-bug: PROD-19698 2018-05-03 09:28:45 -04:00			`AlertmanagerAlertsInvalidWarning:`
			`if: >-`
			`increase(alertmanager_alerts_invalid_total[2m]) > 0`
			`for: 2m`
			`labels:`
			`severity: warning`
			`service: alertmanager`
			`annotations:`
			`summary: "Alertmanager alerts are invalid"`
Cosmetic changes for alerts Change-Id: I9e6b2f4a5876e7d5697236166b4a6dc30cf4615a Closes-bug: PROD-20466 2018-06-08 09:40:14 -04:00			`description: "An average of {{ $value }} Alertmanager {{ $labels.integration }} alerts on the {{ $labels.instance }} instance are invalid for 2 minutes."`
Alerts reworked Change alerts names, severities and descriptions. Change-Id: Ib06f08a6f336d28592d5f70e97aedfeb12eb603c Closes-bug: PROD-19698 2018-05-03 09:28:45 -04:00			`{%- endraw %}`
Add Prometheus alerts Change-Id: I4ad10555d728d62c8e6504659d30558f95b410ac 2017-07-25 10:11:43 -04:00			`{%- endif %}`
Add alerts and targets for prometheus LTS and relay Closes-Bug: PROD-20724 Change-Id: I1f4839a4900a2d417d85a52ffef6e11e4bb2cac1 2018-06-18 08:32:59 -04:00			`{%- if relay.get('enabled', False) %}`
			`{%- raw %}`
			`PrometheusRelayServiceDown:`
			`if: >-`
			`procstat_running{process_name="prometheus-relay"} == 0`
			`for: 2m`
			`labels:`
			`severity: minor`
			`service: prometheus`
			`annotations:`
			`summary: "Prometheus relay service is down"`
			`description: "The Prometheus relay service on the {{$labels.host}} node is down for 2 minutes."`
			`PrometheusRelayServiceDownMajor:`
			`if: >-`
			`count(procstat_running{process_name="prometheus-relay"} == 0) >= count(procstat_running{process_name="prometheus-relay"}) * 0.5`
			`for: 2m`
			`labels:`
			`severity: major`
			`service: prometheus`
			`annotations:`
			`summary: "50% of Prometheus relay services are down"`
			`description: "{{ $value }} of Prometheus relay services (>= 50%) are down for 2 minutes."`
			`PrometheusRelayServiceOutage:`
			`if: >-`
			`count(procstat_running{process_name="prometheus-relay"} == 0) == count(procstat_running{process_name="prometheus-relay"})`
			`for: 2m`
			`labels:`
			`severity: critical`
			`service: prometheus`
			`annotations:`
			`summary: "Prometheus relay service outage"`
			`description: "All Prometheus relay services are down for 2 minutes."`
			`{%- endraw %}`
			`{%- endif %}`
			`{%- if server.get("enabled", False) and not server.get("is_container", True) %}`
			`{%- raw %}`
			`PrometheusLTSServiceDown:`
			`if: >-`
			`procstat_running{process_name="prometheus"} == 0`
			`for: 2m`
			`labels:`
			`severity: minor`
			`service: prometheus`
			`annotations:`
			`summary: "Prometheus Long Term Storage service is down"`
			`description: "The Prometheus Long Term Storage service on the {{$labels.host}} node is down for 2 minutes."`
			`PrometheusRelayServiceDownMajor:`
			`if: >-`
			`count(procstat_running{process_name="prometheus"} == 0) >= count(procstat_running{process_name="prometheus"}) * 0.5`
			`for: 2m`
			`labels:`
			`severity: major`
			`service: prometheus`
			`annotations:`
			`summary: "50% of Prometheus Long Term Storage services are down"`
			`description: "{{ $value }} of Prometheus Long Term Storage services (>= 50%) are down for 2 minutes."`
			`PrometheusRelayServiceOutage:`
			`if: >-`
			`count(procstat_running{process_name="prometheus"} == 0) == count(procstat_running{process_name="prometheus"})`
			`for: 2m`
			`labels:`
			`severity: critical`
			`service: prometheus`
			`annotations:`
			`summary: "Prometheus Long Term Storage service outage"`
			`description: "All Prometheus Long Term Storage services are down for 2 minutes."`
			`{%- endraw %}`
			`{%- endif %}`
			`{%- if server.get("enabled", False) and not server.get("is_container", True) and relay.get('enabled', False) %}`
			`{%- set addresses = [] %}`
			`{%- if server.get('bind', {}).address is defined and not server.bind.address.startswith('127') and server.bind.address != '0.0.0.0' %}`
			`{%- do addresses.append(server.bind.address) %}`
			`{%- endif %}`
			`{%- for address in grains['fqdn_ip4'] %}`
			`{%- if not address.startswith('127') %}`
			`{%- do addresses.append(address) %}`
			`{%- endif %}`
			`{%- endfor %}`
			`target:`
			`static:`
			`prometheus_lts:`
			`enabled: True`
			`endpoint:`
			`- address: {{ addresses[0] }}`
			`port: {{ server.bind.port }}`
			`relabel_configs:`
			`- regex: {{ addresses[0] }}:{{ server.bind.port }}`
			`replacement: {{ grains['host'] }}`
			`source_labels: "__address__"`
			`target_label: "host"`
			`prometheus_relay:`
			`enabled: True`
			`endpoint:`
			`- address: {{ addresses[0] }}`
			`port: {{ relay.bind.port }}`
			`relabel_configs:`
			`- regex: {{ addresses[0] }}:{{ relay.bind.port }}`
			`replacement: {{ grains['host'] }}`
			`source_labels: "__address__"`
			`target_label: "host"`
			`{%- endif %}`
Add Prometheus alerts Change-Id: I4ad10555d728d62c8e6504659d30558f95b410ac 2017-07-25 10:11:43 -04:00			`{%- endif %}`