From c3716cb1e90cac815a9c369c78aa83c6556fe098 Mon Sep 17 00:00:00 2001
From: Dmitry Kalashnik
Date: Mon, 18 Jun 2018 16:32:59 +0400
Subject: [PATCH] Add alerts and targets for prometheus LTS and relay

Closes-Bug: PROD-20724

Change-Id: I1f4839a4900a2d417d85a52ffef6e11e4bb2cac1
---

Review notes (placed after the "---" separator, so they are patch
commentary and not part of the commit message):

  * prometheus/meta/prometheus.yml: the two aggregate Long Term Storage
    alerts were keyed PrometheusRelayServiceDownMajor and
    PrometheusRelayServiceOutage, i.e. the same keys the relay alerts
    already use in the same "alert:" mapping. They are now keyed
    PrometheusLTSServiceDownMajor and PrometheusLTSServiceOutage so both
    sets of rules can coexist.
  * prometheus/meta/telegraf.yml: the outer guard around "procstat"
    combined the relay and the non-container server conditions with
    "and", while the nested guards and the alert sections test them
    independently. It now uses "or", so a node running only one of the
    two services still gets its procstat input.
  * prometheus/meta/telegraf.yml: the closing tag of that guard now uses
    "{%- endif %}" like every other tag in the template.
  * Added/removed line counts are unchanged, so the hunk headers and the
    diffstat below remain valid. The blob ids on the "index" lines were
    computed for the pre-review content.

 metadata/service/support.yml   |   2 +
 prometheus/meta/prometheus.yml | 103 ++++++++++++++++++++++++++++++++-
 prometheus/meta/telegraf.yml   |  17 ++++++
 3 files changed, 121 insertions(+), 1 deletion(-)
 create mode 100644 prometheus/meta/telegraf.yml

diff --git a/metadata/service/support.yml b/metadata/service/support.yml
index df2b7c3..e4d3cb4 100644
--- a/metadata/service/support.yml
+++ b/metadata/service/support.yml
@@ -5,3 +5,5 @@ parameters:
         enabled: true
       grafana:
         enabled: true
+      telegraf:
+        enabled: true
diff --git a/prometheus/meta/prometheus.yml b/prometheus/meta/prometheus.yml
index 80768e2..4451e8f 100644
--- a/prometheus/meta/prometheus.yml
+++ b/prometheus/meta/prometheus.yml
@@ -1,5 +1,5 @@
 {%- if pillar.prometheus is defined %}
-{%- from "prometheus/map.jinja" import server, alertmanager, remote_storage_adapter, monitoring with context %}
+{%- from "prometheus/map.jinja" import server, alertmanager, remote_storage_adapter, monitoring, relay with context %}
 server:
   alert:
 {%- if server.get('enabled', False) %}
@@ -117,4 +117,105 @@ server:
         description: "An average of {{ $value }} Alertmanager {{ $labels.integration }} alerts on the {{ $labels.instance }} instance are invalid for 2 minutes."
 {%- endraw %}
 {%- endif %}
+{%- if relay.get('enabled', False) %}
+{%- raw %}
+    PrometheusRelayServiceDown:
+      if: >-
+        procstat_running{process_name="prometheus-relay"} == 0
+      for: 2m
+      labels:
+        severity: minor
+        service: prometheus
+      annotations:
+        summary: "Prometheus relay service is down"
+        description: "The Prometheus relay service on the {{$labels.host}} node is down for 2 minutes."
+    PrometheusRelayServiceDownMajor:
+      if: >-
+        count(procstat_running{process_name="prometheus-relay"} == 0) >= count(procstat_running{process_name="prometheus-relay"}) * 0.5
+      for: 2m
+      labels:
+        severity: major
+        service: prometheus
+      annotations:
+        summary: "50% of Prometheus relay services are down"
+        description: "{{ $value }} of Prometheus relay services (>= 50%) are down for 2 minutes."
+    PrometheusRelayServiceOutage:
+      if: >-
+        count(procstat_running{process_name="prometheus-relay"} == 0) == count(procstat_running{process_name="prometheus-relay"})
+      for: 2m
+      labels:
+        severity: critical
+        service: prometheus
+      annotations:
+        summary: "Prometheus relay service outage"
+        description: "All Prometheus relay services are down for 2 minutes."
+{%- endraw %}
+{%- endif %}
+{%- if server.get("enabled", False) and not server.get("is_container", True) %}
+{%- raw %}
+    PrometheusLTSServiceDown:
+      if: >-
+        procstat_running{process_name="prometheus"} == 0
+      for: 2m
+      labels:
+        severity: minor
+        service: prometheus
+      annotations:
+        summary: "Prometheus Long Term Storage service is down"
+        description: "The Prometheus Long Term Storage service on the {{$labels.host}} node is down for 2 minutes."
+    PrometheusLTSServiceDownMajor:
+      if: >-
+        count(procstat_running{process_name="prometheus"} == 0) >= count(procstat_running{process_name="prometheus"}) * 0.5
+      for: 2m
+      labels:
+        severity: major
+        service: prometheus
+      annotations:
+        summary: "50% of Prometheus Long Term Storage services are down"
+        description: "{{ $value }} of Prometheus Long Term Storage services (>= 50%) are down for 2 minutes."
+    PrometheusLTSServiceOutage:
+      if: >-
+        count(procstat_running{process_name="prometheus"} == 0) == count(procstat_running{process_name="prometheus"})
+      for: 2m
+      labels:
+        severity: critical
+        service: prometheus
+      annotations:
+        summary: "Prometheus Long Term Storage service outage"
+        description: "All Prometheus Long Term Storage services are down for 2 minutes."
+{%- endraw %}
+{%- endif %}
+{%- if server.get("enabled", False) and not server.get("is_container", True) and relay.get('enabled', False) %}
+  {%- set addresses = [] %}
+  {%- if server.get('bind', {}).address is defined and not server.bind.address.startswith('127') and server.bind.address != '0.0.0.0' %}
+    {%- do addresses.append(server.bind.address) %}
+  {%- endif %}
+  {%- for address in grains['fqdn_ip4'] %}
+    {%- if not address.startswith('127') %}
+      {%- do addresses.append(address) %}
+    {%- endif %}
+  {%- endfor %}
+  target:
+    static:
+      prometheus_lts:
+        enabled: True
+        endpoint:
+          - address: {{ addresses[0] }}
+            port: {{ server.bind.port }}
+        relabel_configs:
+          - regex: {{ addresses[0] }}:{{ server.bind.port }}
+            replacement: {{ grains['host'] }}
+            source_labels: "__address__"
+            target_label: "host"
+      prometheus_relay:
+        enabled: True
+        endpoint:
+          - address: {{ addresses[0] }}
+            port: {{ relay.bind.port }}
+        relabel_configs:
+          - regex: {{ addresses[0] }}:{{ relay.bind.port }}
+            replacement: {{ grains['host'] }}
+            source_labels: "__address__"
+            target_label: "host"
+{%- endif %}
 {%- endif %}
diff --git a/prometheus/meta/telegraf.yml b/prometheus/meta/telegraf.yml
new file mode 100644
index 0000000..3cce8a9
--- /dev/null
+++ b/prometheus/meta/telegraf.yml
@@ -0,0 +1,17 @@
+{%- if pillar.prometheus is defined %}
+{%- from "prometheus/map.jinja" import server, relay with context %}
+agent:
+  input:
+  {%- if relay.get('enabled', False) or (server.get("enabled", False) and not server.get("is_container", True)) %}
+    procstat:
+      process:
+      {%- if relay.get('enabled', False) %}
+        prometheus-relay:
+          pattern: '/usr/bin/prometheus-relay'
+      {%- endif %}
+      {%- if server.get("enabled", False) and not server.get("is_container", True) %}
+        prometheus:
+          pattern: '/usr/bin/prometheus[^-]'
+      {%- endif %}
+  {%- endif %}
+{%- endif %}