Monitoring: Platform Setup + Email Alerting
Preparation
mkdir -p /data/common/{conf,template,rules}
cd /data
# Preview the custom directory layout
[root@k8s-node02 data]# pwd
/data
[root@k8s-node02 data]# ls
alertmanager common grafana node_exporter prometheus
[root@k8s-node02 data]# tree common/
common/
├── conf
│   ├── alertmanager.yml
│   └── prometheus.yml
├── rules
│   └── node.yml
└── template
    └── email.tmpl

3 directories, 4 files
Prometheus Installation
# Listens on port 8090 (set via --web.listen-address below); reachable at http://IP:8090
wget -c https://github.com/prometheus/prometheus/releases/download/v2.17.1/prometheus-2.17.1.linux-amd64.tar.gz
tar xf prometheus-2.17.1.linux-amd64.tar.gz
mv prometheus-2.17.1.linux-amd64 prometheus
cat > /usr/lib/systemd/system/prometheus.service << EOF
[Unit]
Documentation=Prometheus Monitoring System
Description=Prometheus
After=network.target
[Service]
ExecStart=/data/prometheus/prometheus \
--config.file=/data/common/conf/prometheus.yml \
--web.enable-admin-api \
--web.listen-address=:8090 \
--log.format=json
[Install]
WantedBy=multi-user.target
EOF
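After writing the unit file it is worth making sure systemd can parse it and that the binary runs; a minimal sanity check, assuming the paths above (the service itself is started later, once prometheus.yml exists):
systemctl daemon-reload
systemd-analyze verify /usr/lib/systemd/system/prometheus.service
/data/prometheus/prometheus --version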
node_exporter Installation
# Collects host metrics; default port: 9100; reachable at http://IP:9100/metrics
wget -c https://github.com/prometheus/node_exporter/releases/download/v0.18.1/node_exporter-0.18.1.linux-amd64.tar.gz
tar xf node_exporter-0.18.1.linux-amd64.tar.gz
mv node_exporter-0.18.1.linux-amd64 node_exporter
cat > /usr/lib/systemd/system/node_exporter.service << EOF
[Unit]
Description=node_exporter
[Service]
User=root
ExecStart=/data/node_exporter/node_exporter
[Install]
WantedBy=default.target
EOF
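Once node_exporter is running (it is started in the final section), the metrics endpoint can be spot-checked from the shell; a small sketch assuming the default port 9100:
curl -s http://localhost:9100/metrics | grep -c '^node_'    # should print a large count of node_* series
curl -s http://localhost:9100/metrics | grep '^node_load1'  # e.g. the 1-minute load average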
Grafana Installation
wget -c https://dl.grafana.com/oss/release/grafana-7.5.5.linux-amd64.tar.gz
tar -zxvf grafana-7.5.5.linux-amd64.tar.gz
mv grafana-7.5.5 grafana
cat > /usr/lib/systemd/system/grafana-server.service <<EOF
[Unit]
Description=Grafana
After=network.target
[Service]
Type=notify
ExecStart=/data/grafana/bin/grafana-server -homepath /data/grafana
Restart=on-failure
[Install]
WantedBy=multi-user.target
EOF
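Grafana exposes a simple health endpoint that can be used to confirm the service came up once it is started; a minimal check assuming the default port 3000:
curl -s http://localhost:3000/api/health   # expect a small JSON document containing "database": "ok"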
Alertmanager Installation
wget -c https://github.com/prometheus/alertmanager/releases/download/v0.20.0/alertmanager-0.20.0.linux-amd64.tar.gz
tar xf alertmanager-0.20.0.linux-amd64.tar.gz
mv alertmanager-0.20.0.linux-amd64 alertmanager
cat > /usr/lib/systemd/system/alertmanager.service << EOF
[Unit]
Description=alertmanager
Documentation=https://github.com/prometheus/alertmanager
After=network.target
[Service]
Type=simple
User=root
ExecStart=/data/alertmanager/alertmanager --config.file=/data/common/conf/alertmanager.yml
Restart=on-failure
[Install]
WantedBy=multi-user.target
EOF
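As with the other components, Alertmanager can be sanity-checked once it is running with the config written below; a minimal sketch assuming the default port 9093:
curl -s http://localhost:9093/-/healthy        # liveness check
curl -s http://localhost:9093/api/v2/status    # shows the loaded configuration and version info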
Write the Prometheus configuration file
[root@k8s-node02 data]# cat > common/conf/prometheus.yml << EOF
global:
  scrape_interval: 15s
  evaluation_interval: 15s
alerting:
  alertmanagers:
  - static_configs:
    - targets: ['localhost:9093']
rule_files:
  - "/data/common/rules/*.yml"
scrape_configs:
  - job_name: 'node_exporter'
    static_configs:
    - targets: ['localhost:9100']
EOF
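The prometheus tarball ships promtool, which can validate this file before (re)starting the service; assuming the paths used above:
/data/prometheus/promtool check config /data/common/conf/prometheus.yml
# SUCCESS is expected; any rule files matched under /data/common/rules are checked as well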
Write alertmanager.yml
[root@k8s-node02 data]# cat > common/conf/alertmanager.yml << EOF
global:
  resolve_timeout: 5m
  # SMTP settings for outgoing mail
  smtp_smarthost: 'smtp.163.com:25'
  smtp_from: 'alizabbix@163.com'
  smtp_auth_username: 'alizabbix@163.com'
  smtp_auth_password: 'aabb1122'
  smtp_require_tls: false
# Custom notification templates
templates:
  - '/data/common/template/email.tmpl'
# route defines how alerts are grouped and dispatched
route:
  # Label used to group alerts into a single notification
  group_by: ['alertname']
  # How long to wait after the first alert in a group, so alerts arriving
  # within 10s are sent together
  group_wait: 10s
  # How long to wait before notifying about new alerts added to a group
  # that has already been notified
  group_interval: 10s
  # How long to wait before re-sending a notification for the same alert,
  # to cut down on duplicate emails
  repeat_interval: 1h
  # Default receiver
  receiver: 'email'
  routes: # Sub-routes can direct specific alert groups to specific receivers
  - receiver: 'email'
    continue: true
    group_wait: 10s
receivers:
- name: 'email'
  email_configs:
  - to: 'alizabbix@163.com'
    html: '{{ template "email.to.html" . }}'
    headers: { Subject: "Prometheus [Warning] alert mail" }
    send_resolved: true
# Inhibition rules (alert suppression)
inhibit_rules:
- source_match:
    severity: 'critical'
  target_match:
    severity: 'warning'
  equal: ['instance']
EOF
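Likewise, the alertmanager tarball ships amtool, which can validate the routing and receiver configuration before the service is started; assuming the paths above:
/data/alertmanager/amtool check-config /data/common/conf/alertmanager.yml
# Expected output reports SUCCESS and lists the receivers and templates found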
Alert inhibition: when Alertmanager is holding an alert with severity critical, it suppresses alerts with severity warning. The severity level is whatever you defined when writing your alerting rules. The last line (equal) lists the labels that must match between the two alerts for inhibition to apply; here I kept only instance.

The simplest example: Alertmanager receives one critical alert and one warning alert whose instance values are identical. Say we monitor nginx, where an nginx outage fires a warning-level alert and a host outage fires a critical-level alert. If the server running nginx dies, nginx obviously dies with it. Prometheus notices and sends Alertmanager two alerts, one for the host and one for nginx. Alertmanager sees that one is critical and the other is warning, and that their instance labels match, meaning they happened on the same machine, so it sends out only the critical alert and the warning one is suppressed. What we receive is just the "server is down" notification.
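To see the inhibition rule in action without waiting for a real outage, two fake alerts can be pushed straight into Alertmanager's v2 API. HostDown, NginxDown and the instance value below are made-up names for illustration only, and this assumes Alertmanager is already running on port 9093 with the config above:
curl -s -XPOST http://localhost:9093/api/v2/alerts -H 'Content-Type: application/json' -d '[
  {"labels": {"alertname": "HostDown",  "severity": "critical", "instance": "192.168.1.136:9100"}},
  {"labels": {"alertname": "NginxDown", "severity": "warning",  "instance": "192.168.1.136:9100"}}
]'
# Only the critical HostDown alert should reach the email receiver;
# the warning NginxDown alert is inhibited because its instance label matches.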
Write the alert email template
[root@k8s-node02 data]# cat > common/template/email.tmpl << EOF
{{ define "email.to.html" }}
{{- if gt (len .Alerts.Firing) 0 -}}
{{ range .Alerts.Firing }}
=========start==========<br>
Alerting program: prometheus_alert <br>
Severity: {{ .Labels.severity }} <br>
Alert name: {{ .Labels.alertname }} <br>
Host: {{ .Labels.instance }} <br>
Summary: {{ .Annotations.summary }} <br>
Description: {{ .Annotations.description }} <br>
Fired at: {{ .StartsAt.Format "2006-01-02 15:04:05" }} <br>
=========end==========<br>
{{ end }}{{ end -}}
{{- if gt (len .Alerts.Resolved) 0 -}}
{{ range .Alerts.Resolved }}
=========start==========<br>
Alerting program: prometheus_alert <br>
Severity: {{ .Labels.severity }} <br>
Alert name: {{ .Labels.alertname }} <br>
Host: {{ .Labels.instance }} <br>
Summary: {{ .Annotations.summary }} <br>
Description: {{ .Annotations.description }} <br>
Fired at: {{ .StartsAt.Format "2006-01-02 15:04:05" }} <br>
Resolved at: {{ .EndsAt.Format "2006-01-02 15:04:05" }} <br>
=========end==========<br>
{{ end }}{{ end -}}
{{- end }}
EOF
Write the alert rules
[root@k8s-node02 data]# cat > common/rules/node.yml << 'EOF'
groups:
- name: Node_exporter Down
  rules:
  - alert: NodeInstanceDown
    expr: up == 0
    for: 10s
    labels:
      user: root
      severity: warning
    annotations:
      summary: "{{ $labels.job }}"
      address: "{{ $labels.instance }}"
      description: "The node_exporter target has been unreachable for more than 10 seconds."
EOF
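Rule files can also be validated on their own with promtool; assuming the path above:
/data/prometheus/promtool check rules /data/common/rules/node.yml
# Expected: "SUCCESS: 1 rules found"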
Start the services and enable them at boot
systemctl start prometheus node_exporter grafana-server alertmanager
systemctl enable prometheus node_exporter grafana-server alertmanager
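A quick way to confirm that all four units are actually up after starting them; the output should be four lines reading "active":
systemctl is-active prometheus node_exporter grafana-server alertmanager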
Visit: http://192.168.1.136:8090          # Prometheus web UI
Visit: http://192.168.1.136:9100/metrics  # Metrics collected by node_exporter
Visit: http://192.168.1.136:3000          # Grafana web UI
Test: stop metrics collection
systemctl stop node_exporter
# Check whether the alert email arrives
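Besides the mailbox, the firing alert can be observed through the APIs; a minimal sketch assuming the ports used in this setup:
curl -s http://localhost:8090/api/v1/alerts    # Prometheus: the rule should appear as pending, then firing
curl -s http://localhost:9093/api/v2/alerts    # Alertmanager: the alert it received from Prometheus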
Finding Grafana dashboard templates
https://grafana.com/grafana/dashboards
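Imported dashboards need a Prometheus data source to draw from; one way to add it non-interactively is Grafana's HTTP API. This is a sketch only: it assumes the default admin:admin credentials and Prometheus listening on localhost:8090 as configured above.
curl -s -XPOST http://admin:admin@localhost:3000/api/datasources \
  -H 'Content-Type: application/json' \
  -d '{"name":"Prometheus","type":"prometheus","url":"http://localhost:8090","access":"proxy","isDefault":true}'
# Afterwards, a node_exporter dashboard from the link above can be imported by its ID in the Grafana UI.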