告警-钉钉
前提:已部署好 Prometheus、Alertmanager、node_exporter 环境,并且监控(告警)规则已经写好。
依赖 go 环境
wget https://studygolang.com/dl/golang/go1.14.2.linux-amd64.tar.gz
tar xf go1.14.2.linux-amd64.tar.gz
mv go /usr/local/go
cat << EOF >> /etc/profile
export GO_HOME=/usr/local/go
export PATH=\$PATH:\$GO_HOME/bin
EOF
source /etc/profile
依赖 node > 8
yum -y install epel-release
wget https://dl.yarnpkg.com/rpm/yarn.repo -O /etc/yum.repos.d/yarn.repo
yum -y install yarn
wget https://nodejs.org/dist/v10.13.0/node-v10.13.0-linux-x64.tar.xz
tar xf node-v10.13.0-linux-x64.tar.xz
mv node-v10.13.0-linux-x64 /usr/local/nodejs
cat << EOF >> /etc/profile
export NODE_HOME=/usr/local/nodejs
export PATH=\$PATH:\$NODE_HOME/bin
EOF
source /etc/profile
编译 prometheus-webhook-dingtalk
git clone https://github.com/timonwong/prometheus-webhook-dingtalk.git
mv prometheus-webhook-dingtalk /data/dingtalk
cd /data/dingtalk
make
echo $?
# 若返回 0, 编译成功
做成服务
cat > /usr/lib/systemd/system/prometheus-webhook-dingtalk.service << EOF
[Unit]
Description=prometheus-webhook-dingtalk
After=network-online.target
[Service]
Restart=on-failure
ExecStart=/data/dingtalk/prometheus-webhook-dingtalk --template.file=/data/common/template/dingding.tmpl --ding.profile=ops_dingding=https://oapi.dingtalk.com/robot/send?access_token=此处省略,输入自己的Webhook地址
[Install]
WantedBy=multi-user.target
EOF
钉钉告警模板
[root@k8s-node02 template]# cat dingding.tmpl
{{ define "__subject" }}[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .GroupLabels.SortedPairs.Values | join " " }} {{ if gt (len .CommonLabels) (len .GroupLabels) }}({{ with .CommonLabels.Remove .GroupLabels.Names }}{{ .Values | join " " }}{{ end }}){{ end }}{{ end }}
{{ define "__alertmanagerURL" }}{{ .ExternalURL }}/#/alerts?receiver={{ .Receiver }}{{ end }}
{{ define "__text_alert_list" }}{{ range . }}
告警程序: prometheus_alert
告警级别: {{ .Labels.severity }}
告警类型: {{ .Labels.alertname }}
告警主机: {{ .Labels.instance }}
告警主题: {{ .Annotations.summary }}
告警详情: {{ .Annotations.description }}
触发时间: {{ .StartsAt.Format "2006-01-02 15:04:05" }}
------------------------
{{ end }}{{ end }}
{{ define "__text_resolve_list" }}{{ range . }}
恢复程序: {{ .Labels.alertname }}
恢复主机: {{ .Labels.instance }}
恢复详情: {{ .Annotations.description }}
触发时间: {{ .StartsAt.Format "2006-01-02 15:04:05" }}
恢复时间: {{ .EndsAt.Format "2006-01-02 15:04:05" }}
------------------------
{{ end }}{{ end }}
{{ define "ding.link.title" }}{{ template "__subject" . }}{{ end }}
{{ define "ding.link.content" }}
{{ if gt (len .Alerts.Firing) 0 }}
#### [{{ .Alerts.Firing | len }}]【Centos 报警触发】
![警报 图标](https://ss0.bdstatic.com/70cFuHSh_Q1YnxGkpoWK1HF6hhy/it/u=3626076420,1196179712&fm=15&gp=0.jpg)
**====侦测到故障====**
{{ template "__text_alert_list" .Alerts.Firing }}
{{ end }}
{{ if gt (len .Alerts.Resolved) 0 }}#### [{{ .Alerts.Resolved | len }}]【Centos 报警恢复】
{{ template "__text_resolve_list" .Alerts.Resolved }}{{ end }}
{{ end }}
启动服务并查看是否启动成功(默认监听 8060 端口)
systemctl daemon-reload
systemctl start prometheus-webhook-dingtalk
netstat -lntup | grep 8060
指定钉钉告警方式
# 在 alertmanager 的配置文件中指定通过 webhook 发送钉钉告警,修改后需重启 alertmanager
cat alertmanager.yml
global:
  resolve_timeout: 2m
route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'webhook'
receivers:
- name: 'webhook'
  webhook_configs:
  - url: 'http://localhost:8060/dingtalk/ops_dingding/send'
    send_resolved: true
效果图
Last updated