Add alerts and targets for prometheus LTS and relay

Closes-Bug: PROD-20724
Change-Id: I1f4839a4900a2d417d85a52ffef6e11e4bb2cac1
This commit is contained in:
Dmitry Kalashnik 2018-06-18 16:32:59 +04:00
parent d257cdbfe2
commit c3716cb1e9
3 changed files with 121 additions and 1 deletions

View file

@ -5,3 +5,5 @@ parameters:
enabled: true enabled: true
grafana: grafana:
enabled: true enabled: true
telegraf:
enabled: true

View file

@ -1,5 +1,5 @@
{%- if pillar.prometheus is defined %} {%- if pillar.prometheus is defined %}
{%- from "prometheus/map.jinja" import server, alertmanager, remote_storage_adapter, monitoring with context %} {%- from "prometheus/map.jinja" import server, alertmanager, remote_storage_adapter, monitoring, relay with context %}
server: server:
alert: alert:
{%- if server.get('enabled', False) %} {%- if server.get('enabled', False) %}
@ -117,4 +117,105 @@ server:
description: "An average of {{ $value }} Alertmanager {{ $labels.integration }} alerts on the {{ $labels.instance }} instance are invalid for 2 minutes." description: "An average of {{ $value }} Alertmanager {{ $labels.integration }} alerts on the {{ $labels.instance }} instance are invalid for 2 minutes."
{%- endraw %} {%- endraw %}
{%- endif %} {%- endif %}
{%- if relay.get('enabled', False) %}
{%- raw %}
    # Availability alerts for the Prometheus relay, rendered only when the
    # relay is enabled. All three rules read the procstat_running series
    # labelled process_name="prometheus-relay" and must hold for 2 minutes.

    # Minor: fires for each series whose value is 0.
    PrometheusRelayServiceDown:
      if: 'procstat_running{process_name="prometheus-relay"} == 0'
      for: 2m
      labels:
        severity: minor
        service: prometheus
      annotations:
        summary: 'Prometheus relay service is down'
        description: 'The Prometheus relay service on the {{$labels.host}} node is down for 2 minutes.'

    # Major: the number of series at 0 is at least half of all relay series.
    PrometheusRelayServiceDownMajor:
      if: >-
        count(procstat_running{process_name="prometheus-relay"} == 0) >=
        count(procstat_running{process_name="prometheus-relay"}) * 0.5
      for: 2m
      labels:
        severity: major
        service: prometheus
      annotations:
        summary: '50% of Prometheus relay services are down'
        description: '{{ $value }} of Prometheus relay services (>= 50%) are down for 2 minutes.'

    # Critical: every relay series is at 0.
    PrometheusRelayServiceOutage:
      if: >-
        count(procstat_running{process_name="prometheus-relay"} == 0) ==
        count(procstat_running{process_name="prometheus-relay"})
      for: 2m
      labels:
        severity: critical
        service: prometheus
      annotations:
        summary: 'Prometheus relay service outage'
        description: 'All Prometheus relay services are down for 2 minutes.'
{%- endraw %}
{%- endif %}
{%- if server.get("enabled", False) and not server.get("is_container", True) %}
{%- raw %}
    # Availability alerts for the Prometheus Long Term Storage (LTS) server,
    # rendered only when the server is enabled and is not flagged as a
    # container. All three rules read the procstat_running series labelled
    # process_name="prometheus" and must hold for 2 minutes.
    #
    # Every rule name carries the LTS prefix so that none of these keys
    # collides with the PrometheusRelay* alerts defined in the same mapping.

    # Minor: fires for each series whose value is 0.
    PrometheusLTSServiceDown:
      if: >-
        procstat_running{process_name="prometheus"} == 0
      for: 2m
      labels:
        severity: minor
        service: prometheus
      annotations:
        summary: "Prometheus Long Term Storage service is down"
        description: "The Prometheus Long Term Storage service on the {{$labels.host}} node is down for 2 minutes."

    # Major: the number of series at 0 is at least half of all LTS series.
    PrometheusLTSServiceDownMajor:
      if: >-
        count(procstat_running{process_name="prometheus"} == 0) >= count(procstat_running{process_name="prometheus"}) * 0.5
      for: 2m
      labels:
        severity: major
        service: prometheus
      annotations:
        summary: "50% of Prometheus Long Term Storage services are down"
        description: "{{ $value }} of Prometheus Long Term Storage services (>= 50%) are down for 2 minutes."

    # Critical: every LTS series is at 0.
    PrometheusLTSServiceOutage:
      if: >-
        count(procstat_running{process_name="prometheus"} == 0) == count(procstat_running{process_name="prometheus"})
      for: 2m
      labels:
        severity: critical
        service: prometheus
      annotations:
        summary: "Prometheus Long Term Storage service outage"
        description: "All Prometheus Long Term Storage services are down for 2 minutes."
{%- endraw %}
{%- endif %}
{#- Static scrape targets for the local Prometheus LTS server and relay.
    Rendered only when the server is enabled, is not flagged as a container,
    and the relay is enabled as well. #}
{%- if server.get("enabled", False) and not server.get("is_container", True) and relay.get('enabled', False) %}
{#- Candidate addresses, in priority order: the configured bind address
    (skipped when it is loopback or the 0.0.0.0 wildcard), then every
    non-loopback entry of the fqdn_ip4 grain. Only the first one is used. #}
{%- set addresses = [] %}
{%- if server.get('bind', {}).address is defined and not server.bind.address.startswith('127') and server.bind.address != '0.0.0.0' %}
{%- do addresses.append(server.bind.address) %}
{%- endif %}
{%- for address in grains['fqdn_ip4'] %}
{%- if not address.startswith('127') %}
{%- do addresses.append(address) %}
{%- endif %}
{%- endfor %}
{#- Emit nothing when no usable address was found; indexing an empty list
    would otherwise render targets with an empty address. #}
{%- if addresses %}
{%- set scrape_address = addresses[0] %}
  target:
    static:
      prometheus_lts:
        enabled: true
        endpoint:
          - address: "{{ scrape_address }}"
            port: {{ server.bind.port }}
        # Rewrite the scraped address:port into the minion host name and store
        # it in the "host" label, which the alert descriptions reference.
        relabel_configs:
          - regex: "{{ scrape_address }}:{{ server.bind.port }}"
            replacement: "{{ grains['host'] }}"
            source_labels: "__address__"
            target_label: "host"
      prometheus_relay:
        enabled: true
        endpoint:
          - address: "{{ scrape_address }}"
            port: {{ relay.bind.port }}
        relabel_configs:
          - regex: "{{ scrape_address }}:{{ relay.bind.port }}"
            replacement: "{{ grains['host'] }}"
            source_labels: "__address__"
            target_label: "host"
{%- endif %}
{%- endif %}
{%- endif %} {%- endif %}

View file

@ -0,0 +1,17 @@
{%- if pillar.prometheus is defined %}
{%- from "prometheus/map.jinja" import server, relay with context %}
{#- Telegraf procstat inputs for the Prometheus processes on this node.
    The procstat section is emitted when at least one of the two processes
    applies; each process entry then has its own guard, so a relay-only or
    server-only node still gets the input it needs. #}
agent:
  input:
{%- if relay.get('enabled', False) or (server.get("enabled", False) and not server.get("is_container", True)) %}
    procstat:
      process:
{%- if relay.get('enabled', False) %}
        prometheus-relay:
          pattern: '/usr/bin/prometheus-relay'
{%- endif %}
{%- if server.get("enabled", False) and not server.get("is_container", True) %}
        # The trailing [^-] requires a non-hyphen character after the binary
        # name, so /usr/bin/prometheus-relay is not matched by this entry.
        prometheus:
          pattern: '/usr/bin/prometheus[^-]'
{%- endif %}
{%- endif %}
{%- endif %}