Add prometheus, alertmanager, pushgateway configs

Change-Id: I7f146e0207ada58f0723b14d0a625a6bce67ea9f
Author: Bartosz Kupidura, 2017-03-09 12:12:27 +01:00
parent 830b6b2614
commit 2f70396a78
16 changed files with 670 additions and 2 deletions


@@ -5,6 +5,110 @@ Salt Prometheus formula
Power your metrics and alerting with a leading open-source monitoring
solution.
Sample pillars
==============

Configure prometheus server
---------------------------

.. code-block:: yaml

    prometheus:
      server:
        enabled: true
        dir:
          config: /srv/volumes/prometheus-config
          config_in_container: /opt/prometheus/config
        bind:
          port: 9090
          address: 0.0.0.0
        external_port: 15010
        target:
          kubernetes:
            api_ip: 127.0.0.1
            ssl_dir: /opt/prometheus/config
            cert_name: kubelet-client.crt
            key_name: kubelet-client.key
          etcd: ${etcd:server:members}
        alert:
          PrometheusTargetDown:
            if: 'up != 1'
            labels:
              severity: down
            annotations:
              summary: 'Prometheus target down'
        storage:
          local:
            engine: "persisted"
            retention: "360h"
            memory_chunks: 1048576
            max_chunks_to_persist: 524288
            num_fingerprint_mutexes: 4096
        alertmanager:
          notification_queue_capacity: 10000
        config:
          global:
            scrape_interval: "15s"
            scrape_timeout: "15s"
            evaluation_interval: "1m"
            external_labels:
              region: 'region1'
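
Once the server is up, the configuration can be sanity-checked through the
standard Prometheus HTTP API, for example (assuming the ``external_port``
mapping above):

.. code-block:: bash

    curl 'http://127.0.0.1:15010/api/v1/query?query=up'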
Configure alertmanager
----------------------

.. code-block:: yaml

    prometheus:
      alertmanager:
        enabled: true
        dir:
          config: /srv/volumes/prometheus-config
        bind:
          address: 0.0.0.0
          port: 9093
        external_port: 15011
        config:
          global:
            resolve_timeout: 5m
          route:
            group_by: ['alertname', 'region', 'service']
            group_wait: 60s
            group_interval: 5m
            repeat_interval: 3h
            receiver: HTTP-notification
          inhibit_rules:
          - source_match:
              severity: 'down'
            target_match:
              severity: 'critical'
            equal: ['region', 'service']
          - source_match:
              severity: 'down'
            target_match:
              severity: 'warning'
            equal: ['region', 'service']
          - source_match:
              severity: 'critical'
            target_match:
              severity: 'warning'
            equal: ['alertname', 'region', 'service']
          receivers:
          - name: 'HTTP-notification'
            webhook_configs:
            - url: http://127.0.0.1
              send_resolved: true
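
To exercise the routing and the webhook receiver, a synthetic alert can be
posted straight to the Alertmanager API (a smoke test, assuming the
``external_port`` mapping above):

.. code-block:: bash

    curl -XPOST 'http://127.0.0.1:15011/api/v1/alerts' -d '[{
        "labels": {"alertname": "TestAlert", "severity": "warning"},
        "annotations": {"summary": "Synthetic test alert"}
    }]'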
Configure pushgateway
---------------------

.. code-block:: yaml

    prometheus:
      pushgateway:
        enabled: true
        external_port: 15012
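
Batch jobs can then push metrics to the gateway, from where the server
scrapes them via its ``pushgateway`` job (the metric name below is only an
example, assuming the ``external_port`` mapping above):

.. code-block:: bash

    echo "some_job_duration_seconds 42" | \
        curl --data-binary @- http://127.0.0.1:15012/metrics/job/some_job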
Documentation and Bugs
======================


@@ -0,0 +1,39 @@
applications:
- alertmanager
parameters:
  prometheus:
    alertmanager:
      enabled: true
      bind:
        address: 0.0.0.0
        port: 9093
      config:
        global:
          resolve_timeout: 5m
        route:
          group_by: ['alertname', 'region', 'service']
          group_wait: 60s
          group_interval: 5m
          repeat_interval: 3h
          receiver: HTTP-notification
        inhibit_rules:
        - source_match:
            severity: 'down'
          target_match:
            severity: 'critical'
          equal: ['region', 'service']
        - source_match:
            severity: 'down'
          target_match:
            severity: 'warning'
          equal: ['region', 'service']
        - source_match:
            severity: 'critical'
          target_match:
            severity: 'warning'
          equal: ['alertname', 'region', 'service']
        receivers:
        - name: 'HTTP-notification'
          webhook_configs:
          - url: http://127.0.0.1
            send_resolved: true


@ -1,6 +1,6 @@
applications: applications:
- prometheus - pushgateway
parameters: parameters:
prometheus: prometheus:
server: pushgateway:
enabled: true enabled: true


@@ -0,0 +1,27 @@
applications:
- prometheus
classes:
- service.prometheus.support
parameters:
  prometheus:
    server:
      enabled: true
      bind:
        port: 9090
        address: 0.0.0.0
      storage:
        local:
          engine: "persisted"
          retention: "360h"
          memory_chunks: 1048576
          max_chunks_to_persist: 524288
          num_fingerprint_mutexes: 4096
      alertmanager:
        notification_queue_capacity: 10000
      config:
        global:
          scrape_interval: "15s"
          scrape_timeout: "15s"
          evaluation_interval: "1m"
          external_labels:
            region: 'region1'


@@ -0,0 +1,5 @@
parameters:
  prometheus:
    _support:
      grafana:
        enabled: true


@@ -0,0 +1,12 @@
{% from "prometheus/map.jinja" import alertmanager with context %}

{%- if alertmanager.enabled %}
{%- if pillar.docker.host is defined %}

{{alertmanager.dir.config}}/alertmanager.yml:
  file.managed:
  - source: salt://prometheus/files/alertmanager.yml
  - template: jinja

{%- endif %}
{%- endif %}


@@ -0,0 +1,20 @@
{% from "prometheus/map.jinja" import alertmanager with context %}
global:
{%- if alertmanager.config.global is defined %}
{{ alertmanager.config.global | yaml(False) | indent(2, true) }}
{%- endif %}
route:
{%- if alertmanager.config.route is defined %}
{{ alertmanager.config.route | yaml(False) | indent(2, true) }}
{%- endif %}
inhibit_rules:
{%- if alertmanager.config.inhibit_rules is defined %}
{{ alertmanager.config.inhibit_rules | yaml(False) | indent(2, true) }}
{%- endif %}
receivers:
{%- if alertmanager.config.receivers is defined %}
{{ alertmanager.config.receivers | yaml(False) | indent(2, true) }}
{%- endif %}
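
With the alertmanager pillar from the README, this template renders to a
plain Alertmanager configuration along these lines (an abridged sketch;
exact quoting comes from the yaml filter):

global:
  resolve_timeout: 5m
route:
  group_by: ['alertname', 'region', 'service']
  group_wait: 60s
  receiver: HTTP-notification
receivers:
- name: 'HTTP-notification'
  webhook_configs:
  - url: http://127.0.0.1
    send_resolved: true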


@@ -0,0 +1,23 @@
{%- from "prometheus/map.jinja" import server with context %}

{%- if server.alert is defined %}
{%- for alertname, alert in server.alert.iteritems() %}
ALERT {{ alertname }}
  IF {{ alert.if }}
  {%- if alert.for is defined %}FOR {{ alert.for }}{%- endif %}
  {%- if alert.labels is defined %}
  LABELS {
    {%- for name, value in alert.labels.iteritems() %}
    {{ name }} = "{{ value }}"{%- if not loop.last %},{%- endif %}
    {%- endfor %}
  }
  {%- endif %}
  {%- if alert.annotations is defined %}
  ANNOTATIONS {
    {%- for name, value in alert.annotations.iteritems() %}
    {{ name }} = "{{ value }}"{%- if not loop.last %},{%- endif %}
    {%- endfor %}
  }
  {%- endif %}
{%- endfor %}
{%- endif %}
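
Rendered against the PrometheusTargetDown example from the README, the loop
above emits a Prometheus 1.x alert rule of this shape (whitespace is a
sketch; the exact layout comes from the template):

ALERT PrometheusTargetDown
  IF up != 1
  LABELS {
    severity = "down"
  }
  ANNOTATIONS {
    summary = "Prometheus target down"
  }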


@@ -0,0 +1,174 @@
{%- from "prometheus/map.jinja" import server with context %}
global:
{%- if server.get('config', {}).global is defined %}
{{ server.config.global | yaml(False) | indent(2, true) }}
{%- endif %}

rule_files:
  - {{ server.dir.config_in_container }}/alerts.yml

scrape_configs:
{%- set telegraf_nodes = [] %}
{%- for node_name, node_grains in salt['mine.get']('*', 'grains.items').iteritems() %}
{%- if 'telegraf' in node_grains.get('services') %}
{%- set node_ip = node_grains.get('prometheus_client').get('address') %}
{%- set node_port = node_grains.get('prometheus_client').get('port') %}
{%- set telegraf_address = "'%s:%d'" | format(node_ip, node_port) %}
{%- do telegraf_nodes.append(telegraf_address) %}
{%- endif %}
{%- endfor %}
{%- if telegraf_nodes|length > 0 %}
  - job_name: 'telegraf'
    static_configs:
      - targets: [{{ telegraf_nodes | join(',') }}]
{%- endif %}
{% if server.get('target', {}).etcd is defined %}
{%- set etcd_nodes = [] %}
{%- for node in server.target.etcd %}
{%- set etcd_address = "'%s:%d'" | format(node.host, node.port) %}
{%- do etcd_nodes.append(etcd_address) %}
{%- endfor %}
{%- if etcd_nodes|length > 0 %}
  - job_name: 'etcd'
    static_configs:
      - targets: [{{ etcd_nodes | join(',') }}]
{%- endif %}
{%- endif %}
{% if server.get('target', {}).kubernetes is defined %}
  - job_name: 'kubernetes-api'
    scheme: https
    tls_config:
      insecure_skip_verify: true
      {% if server.target.kubernetes.cert_name is defined %}cert_file: {{ server.target.kubernetes.ssl_dir }}/{{ server.target.kubernetes.cert_name }}{%- endif %}
      {% if server.target.kubernetes.key_name is defined %}key_file: {{ server.target.kubernetes.ssl_dir }}/{{ server.target.kubernetes.key_name }}{%- endif %}
    kubernetes_sd_configs:
    - api_server: {{ server.target.kubernetes.api_ip }}
      role: endpoints
      tls_config:
        insecure_skip_verify: true
        {% if server.target.kubernetes.cert_name is defined %}cert_file: {{ server.target.kubernetes.ssl_dir }}/{{ server.target.kubernetes.cert_name }}{%- endif %}
        {% if server.target.kubernetes.key_name is defined %}key_file: {{ server.target.kubernetes.ssl_dir }}/{{ server.target.kubernetes.key_name }}{%- endif %}
    relabel_configs:
    - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
      action: keep
      regex: default;kubernetes;https

  - job_name: 'kubernetes-node'
    scheme: https
    tls_config:
      insecure_skip_verify: true
      {% if server.target.kubernetes.cert_name is defined %}cert_file: {{ server.target.kubernetes.ssl_dir }}/{{ server.target.kubernetes.cert_name }}{%- endif %}
      {% if server.target.kubernetes.key_name is defined %}key_file: {{ server.target.kubernetes.ssl_dir }}/{{ server.target.kubernetes.key_name }}{%- endif %}
    kubernetes_sd_configs:
    - api_server: {{ server.target.kubernetes.api_ip }}
      role: node
      tls_config:
        insecure_skip_verify: true
        {% if server.target.kubernetes.cert_name is defined %}cert_file: {{ server.target.kubernetes.ssl_dir }}/{{ server.target.kubernetes.cert_name }}{%- endif %}
        {% if server.target.kubernetes.key_name is defined %}key_file: {{ server.target.kubernetes.ssl_dir }}/{{ server.target.kubernetes.key_name }}{%- endif %}
    relabel_configs:
    - action: labelmap
      regex: __meta_kubernetes_node_label_(.+)

  - job_name: 'kubernetes-service-endpoint'
    scheme: https
    tls_config:
      insecure_skip_verify: true
      {% if server.target.kubernetes.cert_name is defined %}cert_file: {{ server.target.kubernetes.ssl_dir }}/{{ server.target.kubernetes.cert_name }}{%- endif %}
      {% if server.target.kubernetes.key_name is defined %}key_file: {{ server.target.kubernetes.ssl_dir }}/{{ server.target.kubernetes.key_name }}{%- endif %}
    kubernetes_sd_configs:
    - api_server: {{ server.target.kubernetes.api_ip }}
      role: endpoints
      tls_config:
        insecure_skip_verify: true
        {% if server.target.kubernetes.cert_name is defined %}cert_file: {{ server.target.kubernetes.ssl_dir }}/{{ server.target.kubernetes.cert_name }}{%- endif %}
        {% if server.target.kubernetes.key_name is defined %}key_file: {{ server.target.kubernetes.ssl_dir }}/{{ server.target.kubernetes.key_name }}{%- endif %}
    relabel_configs:
    - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
      action: keep
      regex: true
    - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
      action: replace
      target_label: __scheme__
      regex: (https?)
    - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
      action: replace
      target_label: __metrics_path__
      regex: (.+)
    - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
      action: replace
      target_label: __address__
      regex: (.+)(?::\d+);(\d+)
      replacement: $1:$2
    - action: labelmap
      regex: __meta_kubernetes_service_label_(.+)
    - source_labels: [__meta_kubernetes_service_namespace]
      action: replace
      target_label: kubernetes_namespace
    - source_labels: [__meta_kubernetes_service_name]
      action: replace
      target_label: kubernetes_name
    - source_labels: [__meta_kubernetes_pod_node_name]
      action: replace
      target_label: kubernetes_io_hostname
    - source_labels: [__meta_kubernetes_pod_name]
      action: replace
      target_label: kubernetes_pod_name

  - job_name: 'kubernetes-pod'
    scheme: https
    tls_config:
      insecure_skip_verify: true
      {% if server.target.kubernetes.cert_name is defined %}cert_file: {{ server.target.kubernetes.ssl_dir }}/{{ server.target.kubernetes.cert_name }}{%- endif %}
      {% if server.target.kubernetes.key_name is defined %}key_file: {{ server.target.kubernetes.ssl_dir }}/{{ server.target.kubernetes.key_name }}{%- endif %}
    kubernetes_sd_configs:
    - api_server: {{ server.target.kubernetes.api_ip }}
      role: pod
      tls_config:
        insecure_skip_verify: true
        {% if server.target.kubernetes.cert_name is defined %}cert_file: {{ server.target.kubernetes.ssl_dir }}/{{ server.target.kubernetes.cert_name }}{%- endif %}
        {% if server.target.kubernetes.key_name is defined %}key_file: {{ server.target.kubernetes.ssl_dir }}/{{ server.target.kubernetes.key_name }}{%- endif %}
    relabel_configs:
    - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
      action: keep
      regex: true
    - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
      action: replace
      target_label: __metrics_path__
      regex: (.+)
    - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
      action: replace
      regex: (.+):(?:\d+);(\d+)
      replacement: ${1}:${2}
      target_label: __address__
    - action: labelmap
      regex: __meta_kubernetes_pod_label_(.+)
    - source_labels: [__meta_kubernetes_pod_namespace]
      action: replace
      target_label: kubernetes_namespace
    - source_labels: [__meta_kubernetes_pod_name]
      action: replace
      target_label: kubernetes_pod_name
{%- endif %}

  - job_name: 'pushgateway'
    dns_sd_configs:
      - names:
          - 'tasks.pushgateway'
        type: 'A'
        port: 9091

  - job_name: 'prometheus'
    dns_sd_configs:
      - names:
          - 'tasks.prometheus'
        type: 'A'
        port: {{ server.bind.port }}
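
For illustration, if a single minion published 'telegraf' in its services
grain and a prometheus_client grain of {address: 172.16.10.101, port: 9126}
(hypothetical values), the mine loop above would render:

  - job_name: 'telegraf'
    static_configs:
      - targets: ['172.16.10.101:9126']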

prometheus/init.sls (new file, empty)

prometheus/map.jinja (new file)

@@ -0,0 +1,16 @@
{% set server = salt['grains.filter_by']({
    'default': {
        'dir': {
            'config': '/srv/volumes/prometheus-config',
            'config_in_container': '/opt/prometheus/config'
        },
    },
}, merge=salt['pillar.get']('prometheus:server')) %}

{% set alertmanager = salt['grains.filter_by']({
    'default': {
        'dir': {
            'config': '/srv/volumes/prometheus-config',
        },
    },
}, merge=salt['pillar.get']('prometheus:alertmanager')) %}
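
A minimal usage sketch: grains.filter_by merges the pillar data over these
defaults, so keys absent from pillar keep their default values.

{%- from "prometheus/map.jinja" import server with context %}
{# server.dir.config -> '/srv/volumes/prometheus-config' unless overridden #}
{# server.bind.port  -> whatever prometheus:server:bind:port sets in pillar #}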

prometheus/server.sls (new file)

@@ -0,0 +1,17 @@
{% from "prometheus/map.jinja" import server with context %}

{%- if server.enabled %}
{%- if pillar.docker.host is defined %}

{{server.dir.config}}/prometheus.yml:
  file.managed:
  - source: salt://prometheus/files/prometheus.yml
  - template: jinja

{{server.dir.config}}/alerts.yml:
  file.managed:
  - source: salt://prometheus/files/alerts.yml
  - template: jinja

{%- endif %}
{%- endif %}
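
On a node where the docker host pillar is set, the state can then be
applied with the usual salt invocation, e.g.:

    salt-call state.sls prometheus.server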


@@ -0,0 +1,36 @@
prometheus:
  alertmanager:
    enabled: true
    bind:
      address: 0.0.0.0
      port: 9093
    config:
      global:
        resolve_timeout: 5m
      route:
        group_by: ['alertname', 'region', 'service']
        group_wait: 60s
        group_interval: 5m
        repeat_interval: 3h
        receiver: HTTP-notification
      inhibit_rules:
      - source_match:
          severity: 'down'
        target_match:
          severity: 'critical'
        equal: ['region', 'service']
      - source_match:
          severity: 'down'
        target_match:
          severity: 'warning'
        equal: ['region', 'service']
      - source_match:
          severity: 'critical'
        target_match:
          severity: 'warning'
        equal: ['alertname', 'region', 'service']
      receivers:
      - name: 'HTTP-notification'
        webhook_configs:
        - url: http://127.0.0.1
          send_resolved: true


@@ -0,0 +1,3 @@
prometheus:
  pushgateway:
    enabled: true


@@ -0,0 +1,30 @@
prometheus:
  server:
    enabled: true
    bind:
      port: 9090
      address: 0.0.0.0
    storage:
      local:
        engine: "persisted"
        retention: "360h"
        memory_chunks: 1048576
        max_chunks_to_persist: 524288
        num_fingerprint_mutexes: 4096
    alertmanager:
      notification_queue_capacity: 10000
    config:
      global:
        scrape_interval: "15s"
        scrape_timeout: "15s"
        evaluation_interval: "1m"
        external_labels:
          region: 'region1'
    alert:
      PrometheusTargetDownKubernetesNodes:
        if: 'up{job="kubernetes-nodes"} != 1'
        labels:
          severity: down
          service: prometheus
        annotations:
          summary: 'Prometheus target down'

tests/run_tests.sh (new file, executable)

@@ -0,0 +1,162 @@
#!/usr/bin/env bash

set -e
[ -n "$DEBUG" ] && set -x

CURDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
METADATA=${CURDIR}/../metadata.yml
FORMULA_NAME=$(cat $METADATA | python -c "import sys,yaml; print yaml.load(sys.stdin)['name']")

## Overrideable parameters
PILLARDIR=${PILLARDIR:-${CURDIR}/pillar}
BUILDDIR=${BUILDDIR:-${CURDIR}/build}
VENV_DIR=${VENV_DIR:-${BUILDDIR}/virtualenv}
DEPSDIR=${BUILDDIR}/deps

SALT_FILE_DIR=${SALT_FILE_DIR:-${BUILDDIR}/file_root}
SALT_PILLAR_DIR=${SALT_PILLAR_DIR:-${BUILDDIR}/pillar_root}
SALT_CONFIG_DIR=${SALT_CONFIG_DIR:-${BUILDDIR}/salt}
SALT_CACHE_DIR=${SALT_CACHE_DIR:-${SALT_CONFIG_DIR}/cache}

SALT_OPTS="${SALT_OPTS} --retcode-passthrough --local -c ${SALT_CONFIG_DIR}"

if [ "x${SALT_VERSION}" != "x" ]; then
    PIP_SALT_VERSION="==${SALT_VERSION}"
fi

## Functions
log_info() {
    echo "[INFO] $*"
}

log_err() {
    echo "[ERROR] $*" >&2
}

setup_virtualenv() {
    log_info "Setting up Python virtualenv"
    virtualenv $VENV_DIR
    source ${VENV_DIR}/bin/activate
    pip install salt${PIP_SALT_VERSION}
}

setup_pillar() {
    [ ! -d ${SALT_PILLAR_DIR} ] && mkdir -p ${SALT_PILLAR_DIR}
    echo "base:" > ${SALT_PILLAR_DIR}/top.sls
    for pillar in ${PILLARDIR}/*; do
        state_name=$(basename ${pillar%.sls})
        echo -e "  ${state_name}:\n    - ${state_name}" >> ${SALT_PILLAR_DIR}/top.sls
    done
}

setup_salt() {
    [ ! -d ${SALT_FILE_DIR} ] && mkdir -p ${SALT_FILE_DIR}
    [ ! -d ${SALT_CONFIG_DIR} ] && mkdir -p ${SALT_CONFIG_DIR}
    [ ! -d ${SALT_CACHE_DIR} ] && mkdir -p ${SALT_CACHE_DIR}

    echo "base:" > ${SALT_FILE_DIR}/top.sls
    for pillar in ${PILLARDIR}/*.sls; do
        state_name=$(basename ${pillar%.sls})
        echo -e "  ${state_name}:\n    - ${FORMULA_NAME}" >> ${SALT_FILE_DIR}/top.sls
    done

    cat << EOF > ${SALT_CONFIG_DIR}/minion
file_client: local
cachedir: ${SALT_CACHE_DIR}
verify_env: False

file_roots:
  base:
  - ${SALT_FILE_DIR}
  - ${CURDIR}/..
  - /usr/share/salt-formulas/env

pillar_roots:
  base:
  - ${SALT_PILLAR_DIR}
  - ${PILLARDIR}
EOF
}

fetch_dependency() {
    dep_name="$(echo $1|cut -d : -f 1)"
    dep_source="$(echo $1|cut -d : -f 2-)"
    dep_root="${DEPSDIR}/$(basename $dep_source .git)"
    dep_metadata="${dep_root}/metadata.yml"

    [ -d /usr/share/salt-formulas/env/${dep_name} ] && log_info "Dependency $dep_name already present in system-wide salt env" && return 0
    [ -d $dep_root ] && log_info "Dependency $dep_name already fetched" && return 0

    log_info "Fetching dependency $dep_name"
    [ ! -d ${DEPSDIR} ] && mkdir -p ${DEPSDIR}
    git clone $dep_source ${DEPSDIR}/$(basename $dep_source .git)
    ln -s ${dep_root}/${dep_name} ${SALT_FILE_DIR}/${dep_name}

    METADATA="${dep_metadata}" install_dependencies
}

install_dependencies() {
    grep -E "^dependencies:" ${METADATA} >/dev/null || return 0
    (python - | while read dep; do fetch_dependency "$dep"; done) << EOF
import sys,yaml
for dep in yaml.load(open('${METADATA}'))['dependencies']:
    print '%s:%s' % (dep["name"], dep["source"])
EOF
}

clean() {
    log_info "Cleaning up ${BUILDDIR}"
    [ -d ${BUILDDIR} ] && rm -rf ${BUILDDIR} || exit 0
}

salt_run() {
    [ -e ${VENV_DIR}/bin/activate ] && source ${VENV_DIR}/bin/activate
    salt-call ${SALT_OPTS} $*
}

prepare() {
    [ ! -d ${BUILDDIR} ] && mkdir -p ${BUILDDIR}

    which salt-call || setup_virtualenv
    setup_pillar
    setup_salt
    install_dependencies
}

run() {
    for pillar in ${PILLARDIR}/*.sls; do
        state_name=$(basename ${pillar%.sls})
        salt_run --id=${state_name} state.show_sls ${FORMULA_NAME} || (log_err "Execution of ${FORMULA_NAME}.${state_name} failed"; exit 1)
    done
}

_atexit() {
    RETVAL=$?
    trap true INT TERM EXIT

    if [ $RETVAL -ne 0 ]; then
        log_err "Execution failed"
    else
        log_info "Execution successful"
    fi

    return $RETVAL
}

## Main
trap _atexit INT TERM EXIT

case $1 in
    clean)
        clean
        ;;
    prepare)
        prepare
        ;;
    run)
        run
        ;;
    *)
        prepare
        run
        ;;
esac
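
Typical invocations of the test runner (the pinned salt version below is
only an example):

    ./tests/run_tests.sh                         # prepare + run
    SALT_VERSION=2016.11.4 ./tests/run_tests.sh  # same, against pinned salt
    ./tests/run_tests.sh clean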