Prometheus+Grafana监控平台+钉钉报警

# Small nginx image based on Alpine 3.11 with the timezone set to Asia/Shanghai.
FROM alpine:3.11

# Everything below runs in a single layer:
#   - install nginx and tzdata,
#   - copy the Shanghai zone file to /etc/localtime and write the zone name to
#     /etc/timezone,
#   - remove tzdata again (the copied /etc/localtime file stays in place),
#   - create /run/nginx/ and empty the apk cache directory.
# NOTE(review): /run/nginx/ is presumably the directory the packaged nginx
# expects for its pid file - confirm against the nginx.conf shipped by Alpine.
RUN apk update \
    && apk add --no-cache nginx tzdata \
    && cp /usr/share/zoneinfo/Asia/Shanghai /etc/localtime \
    && echo "Asia/Shanghai" > /etc/timezone \
    && apk del tzdata \
    && mkdir /run/nginx/ \
    && rm -rf  /var/cache/apk/* 

# Declare the HTTP and HTTPS ports.
EXPOSE 80 443

# Disabled alternative entrypoint, kept for reference.
#ENTRYPOINT ["/run.sh"]
# Start nginx in the foreground ("daemon off;").
CMD ["nginx", "-g", "daemon off;"]
[root@kp ~]# docker build -t mmp:v1 .
Sending build context to Docker daemon  529.9kB
Step 1/4 : FROM alpine:3.11
 ---> d2bcc698e2fc
Step 2/4 : RUN apk update     && apk add --no-cache nginx tzdata     && cp /usr/share/zoneinfo/Asia/Shanghai /etc/localtime     && echo "Asia/Shanghai" > /etc/timezone     && apk del tzdata     && mkdir /run/nginx/     && rm -rf  /var/cache/apk/*
 ---> Using cache
 ---> d7fa977b40ea
Step 3/4 : EXPOSE 80 443
 ---> Using cache
 ---> 1b5140208a9b
Step 4/4 : CMD ["nginx", "-g", "daemon off;"]
 ---> Using cache
 ---> 7c36301aee5b
Successfully built 7c36301aee5b
Successfully tagged mmp:v1

[root@kp ~]# docker run -d --name mmp mmp:v1

Prometheus+Grafana监控平台+钉钉报警

提示: 常用变量可放在 .env 里,这里就不用了

准备工作

# Create the directory layout used by the compose stack:
#   /home/prom/services/{alertmanager,dingtalk,prometheus/rules}  - config files
#   /home/prom/data/{prometheus,grafana}                          - persistent data
# -p creates missing parents and does not fail when a directory already
# exists, so these commands can be re-run safely.
mkdir -p /home/prom/services/{alertmanager,dingtalk,prometheus/rules}
mkdir -p /home/prom/data/{prometheus,grafana}
# The data directories are bind-mounted read-write into the prometheus and
# grafana containers (see docker-compose.yml). Mode 777 is presumably there so
# the users inside those containers can write to them - NOTE(review): this is
# world-writable; tighten it if the container UIDs are known.
chmod -R 777 /home/prom/data

目录详情

[root@k8s-node01 prom]# tree
.
├── data
│   ├── grafana
│   └── prometheus
├── docker-compose.yml
└── services
    ├── alertmanager
    │   └── alertmanager.yml
    ├── dingtalk
    │   ├── config.yml
    │   └── dingding.tmpl
    └── prometheus
        ├── prometheus.yml
        └── rules
            └── alert-rules.yml

cat services/alertmanager/alertmanager.yml

# Alertmanager configuration: global SMTP settings, a single route that sends
# everything to DingTalk, one inhibit rule and two receivers.
global:
  resolve_timeout: 5m
  # SMTP settings for email receivers (the values below are placeholders).
  smtp_from: 'xxx@163.com'
  smtp_smarthost: 'smtp.163.com:25'
  smtp_require_tls: false
  smtp_auth_username: 'xxxx@163.com'
  smtp_auth_password: 'aabb1122'

# Root route: all alerts are grouped by alert name and delivered to the
# 'dingding' receiver.
route:
  group_by:
    - 'alertname'
  group_wait: 10s
  group_interval: 1m
  repeat_interval: 1h
  receiver: 'dingding'

# While an alert with severity 'critical' is firing, alerts with severity
# 'warning' that carry the same alertname and instance labels are muted.
inhibit_rules:
  - equal:
      - 'alertname'
      - 'instance'
    source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'

receivers:
  # Defined but not referenced by the route above.
  - name: 'email'
    email_configs:
      - send_resolved: true
        to: 'xxxx@163.com'
  # Forwards alerts (including resolved notifications) to the dingtalk
  # webhook service.
  - name: 'dingding'
    webhook_configs:
      - send_resolved: true
        url: 'http://dingtalk:8060/dingtalk/webhook/send'

cat services/dingtalk/config.yml

# prometheus-webhook-dingtalk configuration.

# Template files to load. This is a path inside the container; the file name
# "dingdiang.tmpl" (sic) has to stay identical to the mount target used for
# dingding.tmpl in docker-compose.yml.
templates:
  - /etc/prometheus-webhook-dingtalk/templates/dingdiang.tmpl

# Named targets. "webhook" is the target name that appears in the Alertmanager
# webhook URL (/dingtalk/webhook/send).
targets:
  webhook:
    # DingTalk robot endpoint; replace the placeholder access_token.
    url: https://oapi.dingtalk.com/robot/send?access_token=xxxaaxxxxaaxxxxxxxxxxxxxx
    mention:
      # presumably @-mentions every member of the group on each message -
      # TODO confirm against the prometheus-webhook-dingtalk documentation
      all: true

cat services/dingtalk/dingding.tmpl

{{/* Message templates for DingTalk notifications. Comments are kept outside the define bodies so the rendered text is unchanged. */}}
{{/* __subject: "[STATUS:firing-count] <group label values> (<common labels that are not group labels>)". The count is only printed while the status is "firing". */}}
{{ define "__subject" }}[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .GroupLabels.SortedPairs.Values | join " " }} {{ if gt (len .CommonLabels) (len .GroupLabels) }}({{ with .CommonLabels.Remove .GroupLabels.Names }}{{ .Values | join " " }}{{ end }}){{ end }}{{ end }}
{{/* __alertmanagerURL: link to the alerts view filtered by receiver. Not referenced by the other templates in this file. */}}
{{ define "__alertmanagerURL" }}{{ .ExternalURL }}/#/alerts?receiver={{ .Receiver }}{{ end }}



{{/* __text_alert_list: for each alert passed in, prints severity, alert name, instance, summary, description and start time, followed by a separator line. */}}
{{ define "__text_alert_list" }}{{ range . }}

告警程序: prometheus_alert

告警级别: {{ .Labels.severity  }} 

告警类型: {{ .Labels.alertname }}

告警主机: {{ .Labels.instance }} 

告警主题: {{ .Annotations.summary }} 

告警详情: {{ .Annotations.description }} 

触发时间: {{ .StartsAt.Format "2006-01-02 15:04:05" }} 

------------------------
{{ end }}{{ end }}



{{/* __text_resolve_list: for each alert passed in, prints alert name, instance, description, start time and end time, followed by a separator line. */}}
{{ define "__text_resolve_list" }}{{ range .  }}

恢复程序: {{ .Labels.alertname }} 

恢复主机: {{ .Labels.instance }} 

恢复详情: {{ .Annotations.description }}

触发时间: {{ .StartsAt.Format "2006-01-02 15:04:05" }}

恢复时间: {{ .EndsAt.Format "2006-01-02 15:04:05" }}

------------------------
{{ end }}{{ end }}




{{/* ding.link.title reuses __subject. ding.link.content renders a "firing" section when at least one alert is firing and a "resolved" section when at least one alert is resolved. */}}
{{ define "ding.link.title" }}{{ template "__subject" . }}{{ end }}
{{ define "ding.link.content" }}


{{ if gt (len .Alerts.Firing) 0 }}
#### [{{ .Alerts.Firing | len }}]【Centos 报警触发】

![警报 图标](https://ss0.bdstatic.com/70cFuHSh_Q1YnxGkpoWK1HF6hhy/it/u=3626076420,1196179712&fm=15&gp=0.jpg)

**====侦测到故障====**

{{ template "__text_alert_list" .Alerts.Firing }}
{{ end }}



{{ if gt (len .Alerts.Resolved) 0 }}#### [{{ .Alerts.Resolved | len }}]【Centos 报警恢复】
{{ template "__text_resolve_list" .Alerts.Resolved }}{{ end }}
{{ end }}

cat services/prometheus/prometheus.yml

# Prometheus server configuration: scrape and rule evaluation every 15s,
# alerts sent to the alertmanager service, three static scrape jobs.
global:
  evaluation_interval: 15s
  scrape_interval: 15s

# Glob for rule files; it matches alert-rules.yml, which docker-compose.yml
# mounts into the same directory as this file.
rule_files:
  - '*rules.yml'

# Where firing alerts are delivered.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 'alertmanager:9093'

# Targets are addressed by their compose service names.
scrape_configs:
  # Prometheus scraping itself.
  - job_name: 'prometheus'
    static_configs:
      - targets:
          - 'prometheus:9090'

  # Host metrics from node-exporter.
  - job_name: 'node'
    static_configs:
      - targets:
          - 'node-exporter:9100'

  # Alertmanager's own metrics.
  - job_name: 'alertmanager'
    static_configs:
      - targets:
          - 'alertmanager:9093'

cat services/prometheus/rules/alert-rules.yml

# Alerting rules loaded by Prometheus.
groups:
- name: Node_exporter Down
  rules:
  # Fires when a scrape target has reported up == 0 for the "for" duration.
  # NOTE(review): the expression has no job selector, so it also fires for the
  # 'prometheus' and 'alertmanager' jobs even though the alert name and the
  # description talk about node_exporter - confirm this is intended.
  - alert: Node实例已宕机
    expr: up == 0
    # NOTE(review): the description below says "1 minute" while the rule waits
    # only 10s - align one with the other.
    for: 10s
    labels:
      user: root
      # Lowercase on purpose: label values are case-sensitive and the
      # Alertmanager inhibit rule matches severity 'critical' / 'warning'.
      severity: warning
    annotations:
      summary: "{{ $labels.job }}"
      address: "{{ $labels.instance }}"
      description: "Node_exporter 客户端在1分钟内连接失败."

docker-compose.yml

# Compose stack: node-exporter, dingtalk webhook bridge, Alertmanager,
# Prometheus and Grafana on one bridge network.
# Host paths are written relative to this file ("./...") instead of $PWD, so
# the stack does not depend on the caller's shell environment or on the
# directory docker-compose is invoked from.
version: '3.7'
services:
  # NOTE(review): no host paths are mounted here, so this exporter presumably
  # reports the container's view rather than the host's - confirm.
  node-exporter:
    image: prom/node-exporter:latest
    ports:
      - "9100:9100"
    networks:
      - prom

  # Receives Alertmanager webhooks and forwards them to DingTalk.
  dingtalk:
    image: timonwong/prometheus-webhook-dingtalk:latest
    volumes:
      - ./services/dingtalk/config.yml:/etc/prometheus-webhook-dingtalk/config.yml:ro
      # The target file name "dingdiang.tmpl" (sic) must stay identical to the
      # path listed under "templates" in services/dingtalk/config.yml.
      - ./services/dingtalk/dingding.tmpl:/etc/prometheus-webhook-dingtalk/templates/dingdiang.tmpl:ro
    ports:
      - "8060:8060"
    networks:
      - prom

  alertmanager:
    depends_on:
      - dingtalk
    image: prom/alertmanager:latest
    volumes:
      - ./services/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
    ports:
      - "9093:9093"
      - "9094:9094"
    networks:
      - prom

  prometheus:
    depends_on:
      - alertmanager
    image: prom/prometheus:latest
    volumes:
      - ./services/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      # Mounted next to prometheus.yml so the "*rules.yml" glob finds it.
      - ./services/prometheus/rules/alert-rules.yml:/etc/prometheus/alert-rules.yml:ro
      # Persistent data directory.
      - ./data/prometheus/:/prometheus:rw
    ports:
      - "9090:9090"
    networks:
      - prom

  grafana:
    depends_on:
      - prometheus
    image: grafana/grafana:latest
    volumes:
      # Persistent Grafana state.
      - ./data/grafana/:/var/lib/grafana:rw
    ports:
      - "3000:3000"
    networks:
      - prom

networks:
  prom:
    driver: bridge

启动

docker-compose up -d

Last updated