Prometheus+Grafana监控平台+钉钉报警
# Minimal nginx image on Alpine with the system timezone set to Asia/Shanghai.
FROM alpine:3.11

# Install nginx and configure the timezone in a single layer.
# - `apk add --no-cache` fetches the package index on the fly and leaves
#   nothing behind in /var/cache/apk, so a separate `apk update` and a
#   trailing `rm -rf /var/cache/apk/*` are unnecessary.
# - tzdata is only needed long enough to copy the zone file, then removed.
# - /run/nginx is created for nginx's runtime files (presumably the pid file
#   of the Alpine nginx package - confirm); `-p` keeps the step from failing
#   if the directory already exists.
RUN apk add --no-cache nginx tzdata \
&& cp /usr/share/zoneinfo/Asia/Shanghai /etc/localtime \
&& echo "Asia/Shanghai" > /etc/timezone \
&& apk del tzdata \
&& mkdir -p /run/nginx/

# Documentation only: ports still have to be published with `docker run -p`.
EXPOSE 80 443

#ENTRYPOINT ["/run.sh"]
# Run nginx in the foreground so it stays the container's main process.
CMD ["nginx", "-g", "daemon off;"]
[root@kp ~]# docker build -t mmp:v1 .
Sending build context to Docker daemon 529.9kB
Step 1/4 : FROM alpine:3.11
---> d2bcc698e2fc
Step 2/4 : RUN apk update && apk add --no-cache nginx tzdata && cp /usr/share/zoneinfo/Asia/Shanghai /etc/localtime && echo "Asia/Shanghai" > /etc/timezone && apk del tzdata && mkdir /run/nginx/ && rm -rf /var/cache/apk/*
---> Using cache
---> d7fa977b40ea
Step 3/4 : EXPOSE 80 443
---> Using cache
---> 1b5140208a9b
Step 4/4 : CMD ["nginx", "-g", "daemon off;"]
---> Using cache
---> 7c36301aee5b
Successfully built 7c36301aee5b
Successfully tagged mmp:v1
[root@kp ~]# docker run -d --name mmp mmp:v1
Prometheus+Grafana监控平台+钉钉报警
提示: 常用变量可放在 .env 里,这里就不用了
准备工作
# Create the directory layout used by docker-compose.yml.
# Every mkdir uses -p so the whole snippet can be rerun without errors
# when some or all of the directories already exist.
mkdir -p /home/prom/{data,services}
mkdir -p /home/prom/services/{alertmanager,prometheus,dingtalk}
mkdir -p /home/prom/data/{prometheus,grafana}
mkdir -p /home/prom/services/prometheus/rules
# The data directories are bind-mounted read-write into the prometheus and
# grafana containers; world-writable permissions let the container users
# write to them (presumably non-root users - a targeted chown would be tighter).
chmod -R 777 /home/prom/data
目录详情
[root@k8s-node01 prom]# tree
.
├── data
│ ├── grafana
│ └── prometheus
├── docker-compose.yml
└── services
├── alertmanager
│ └── alertmanager.yml
├── dingtalk
│ ├── config.yml
│ └── dingding.tmpl
└── prometheus
├── prometheus.yml
└── rules
└── alert-rules.yml
cat services/alertmanager/alertmanager.yml
# Alertmanager configuration: routes every alert to the DingTalk webhook
# receiver; an e-mail receiver is defined but not referenced by the route.
global:
  resolve_timeout: 5m
  # SMTP settings, used by the 'email' receiver's email_configs.
  smtp_smarthost: 'smtp.163.com:25'
  smtp_from: 'xxx@163.com'
  smtp_auth_username: 'xxxx@163.com'
  # Placeholder credential - replace it and keep the real value out of version control.
  smtp_auth_password: 'aabb1122'
  smtp_require_tls: false

route:
  # Default (and only) route: everything goes to DingTalk.
  receiver: 'dingding'
  group_wait: 10s
  group_interval: 1m
  repeat_interval: 1h
  group_by: ['alertname']

inhibit_rules:
  # Mute 'warning' alerts while a 'critical' alert with the same
  # alertname and instance is firing. Matching is case-sensitive.
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'instance']

receivers:
  - name: 'email'
    email_configs:
      - to: 'xxxx@163.com'
        send_resolved: true
  - name: 'dingding'
    webhook_configs:
      # 'dingtalk' is the compose service name; 'webhook' in the path is the
      # target name declared in services/dingtalk/config.yml.
      - url: 'http://dingtalk:8060/dingtalk/webhook/send'
        send_resolved: true
cat services/dingtalk/config.yml
# prometheus-webhook-dingtalk configuration.
templates:
  # Must match the container path the template is mounted at in
  # docker-compose.yml (the file name there is spelled 'dingdiang.tmpl').
  - /etc/prometheus-webhook-dingtalk/templates/dingdiang.tmpl
targets:
  # Target name; it is the 'webhook' segment of the URL Alertmanager posts to
  # (/dingtalk/webhook/send).
  webhook:
    # Placeholder token - replace with the real DingTalk robot access token.
    url: https://oapi.dingtalk.com/robot/send?access_token=xxxaaxxxxaaxxxxxxxxxxxxxx
    mention:
      all: true
cat services/dingtalk/dingding.tmpl
{{/* __subject: "[STATUS:firing-count] <group label values> (<remaining common label values>)". */}}
{{ define "__subject" }}[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .GroupLabels.SortedPairs.Values | join " " }} {{ if gt (len .CommonLabels) (len .GroupLabels) }}({{ with .CommonLabels.Remove .GroupLabels.Names }}{{ .Values | join " " }}{{ end }}){{ end }}{{ end }}
{{/* __alertmanagerURL: link to the alerts view filtered by receiver; defined here but not referenced by the other templates in this file. */}}
{{ define "__alertmanagerURL" }}{{ .ExternalURL }}/#/alerts?receiver={{ .Receiver }}{{ end }}
{{/* __text_alert_list: one entry per firing alert (severity, alert name, instance, summary, description, start time). Expects a list of alerts as its argument. */}}
{{ define "__text_alert_list" }}{{ range . }}
告警程序: prometheus_alert
告警级别: {{ .Labels.severity }}
告警类型: {{ .Labels.alertname }}
告警主机: {{ .Labels.instance }}
告警主题: {{ .Annotations.summary }}
告警详情: {{ .Annotations.description }}
触发时间: {{ .StartsAt.Format "2006-01-02 15:04:05" }}
------------------------
{{ end }}{{ end }}
{{/* __text_resolve_list: one entry per resolved alert (alert name, instance, description, start and end time). Expects a list of alerts as its argument. */}}
{{ define "__text_resolve_list" }}{{ range . }}
恢复程序: {{ .Labels.alertname }}
恢复主机: {{ .Labels.instance }}
恢复详情: {{ .Annotations.description }}
触发时间: {{ .StartsAt.Format "2006-01-02 15:04:05" }}
恢复时间: {{ .EndsAt.Format "2006-01-02 15:04:05" }}
------------------------
{{ end }}{{ end }}
{{/* ding.link.title: message title, delegates to __subject. NOTE(review): presumably one of the template names the dingtalk webhook renders by default - confirm against the image version in use. */}}
{{ define "ding.link.title" }}{{ template "__subject" . }}{{ end }}
{{/* ding.link.content: message body; renders a firing section and/or a resolved section, each only when it has at least one alert. */}}
{{ define "ding.link.content" }}
{{ if gt (len .Alerts.Firing) 0 }}
#### [{{ .Alerts.Firing | len }}]【Centos 报警触发】
![警报 图标](https://ss0.bdstatic.com/70cFuHSh_Q1YnxGkpoWK1HF6hhy/it/u=3626076420,1196179712&fm=15&gp=0.jpg)
**====侦测到故障====**
{{ template "__text_alert_list" .Alerts.Firing }}
{{ end }}
{{ if gt (len .Alerts.Resolved) 0 }}#### [{{ .Alerts.Resolved | len }}]【Centos 报警恢复】
{{ template "__text_resolve_list" .Alerts.Resolved }}{{ end }}
{{ end }}
cat services/prometheus/prometheus.yml
# Prometheus server configuration: scrapes itself, node-exporter and
# Alertmanager, and sends fired alerts to Alertmanager.
global:
  scrape_interval: 15s
  evaluation_interval: 15s

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            # Compose service name and port of Alertmanager.
            - alertmanager:9093

rule_files:
  # Relative glob; matches alert-rules.yml, which docker-compose.yml mounts
  # next to this file in /etc/prometheus.
  - "*rules.yml"

scrape_configs:
  - job_name: 'prometheus'
    static_configs:
      - targets: ['prometheus:9090']
  - job_name: 'node'
    static_configs:
      - targets: ['node-exporter:9100']
  - job_name: 'alertmanager'
    static_configs:
      - targets: ['alertmanager:9093']
cat services/prometheus/rules/alert-rules.yml
# Alerting rules loaded by Prometheus through rule_files.
groups:
  - name: Node_exporter Down
    rules:
      - alert: Node实例已宕机
        # NOTE(review): `up == 0` matches every scrape job, not only
        # node-exporter; narrow it with a job matcher if only node targets
        # should raise this alert.
        expr: up == 0
        # Wait one minute before firing, as the description below states.
        for: 1m
        labels:
          user: root
          # Lowercase so it matches the 'warning' value used by the
          # inhibit rule in alertmanager.yml (label matching is case-sensitive).
          severity: warning
        annotations:
          summary: "{{ $labels.job }}"
          address: "{{ $labels.instance }}"
          description: "Node_exporter 客户端在1分钟内连接失败."
cat docker-compose.yml
# Monitoring stack: node-exporter -> Prometheus -> Alertmanager -> DingTalk
# webhook, with Grafana for dashboards. All services share the 'prom' network
# and reach each other by service name.
#
# Bind-mount sources are written relative to this file (./...), so the stack
# does not depend on $PWD being set or on the directory compose is started from.
version: '3.7'

services:
  node-exporter:
    image: prom/node-exporter:latest
    ports:
      - "9100:9100"
    networks:
      - prom

  dingtalk:
    image: timonwong/prometheus-webhook-dingtalk:latest
    volumes:
      - ./services/dingtalk/config.yml:/etc/prometheus-webhook-dingtalk/config.yml:ro
      # The target file name must match the 'templates' entry in config.yml.
      - ./services/dingtalk/dingding.tmpl:/etc/prometheus-webhook-dingtalk/templates/dingdiang.tmpl:ro
    ports:
      - "8060:8060"
    networks:
      - prom

  alertmanager:
    depends_on:
      - dingtalk
    image: prom/alertmanager:latest
    volumes:
      - ./services/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
    ports:
      - "9093:9093"
      - "9094:9094"
    networks:
      - prom

  prometheus:
    depends_on:
      - alertmanager
    image: prom/prometheus:latest
    volumes:
      - ./services/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      # Mounted next to prometheus.yml so the relative rule_files glob finds it.
      - ./services/prometheus/rules/alert-rules.yml:/etc/prometheus/alert-rules.yml:ro
      - ./data/prometheus/:/prometheus:rw
    ports:
      - "9090:9090"
    networks:
      - prom

  grafana:
    depends_on:
      - prometheus
    image: grafana/grafana:latest
    volumes:
      - ./data/grafana/:/var/lib/grafana:rw
    ports:
      - "3000:3000"
    networks:
      - prom

networks:
  prom:
    driver: bridge
启动
docker-compose up -d
Last updated