监控-平台搭建+邮件报警

准备工作

# Create the shared config/template/rules tree used by all components below.
# POSIX: options must precede operands ('mkdir -p DIR', not 'mkdir DIR -p';
# the trailing-option form only works with GNU coreutils).
mkdir -p /data/common/{conf,template,rules}
cd /data

# 提前看下自定义目录
[root@k8s-node02 data]# pwd
/data
[root@k8s-node02 data]# ls
alertmanager  common  grafana  node_exporter  prometheus
[root@k8s-node02 data]# tree common/
common/
├── conf
│   ├── alertmanager.yml
│   └── prometheus.yml
├── rules
│   └── node.yml
└── template
    └── email.tmpl

3 directories, 4 files

Prometheus 安装

# 端口: 8090(Prometheus 默认端口为 9090,此处通过 --web.listen-address 自定义为 8090),可访问: http://IP:8090
# Prometheus v2.17.1: download (resumable), unpack, install under /data/prometheus.
wget --continue https://github.com/prometheus/prometheus/releases/download/v2.17.1/prometheus-2.17.1.linux-amd64.tar.gz
tar --extract --file=prometheus-2.17.1.linux-amd64.tar.gz
mv prometheus-2.17.1.linux-amd64 prometheus


# Install Prometheus as a systemd service on custom port 8090.
# The quoted 'EOF' keeps the here-doc literal: with an unquoted delimiter,
# bash would expand $/` and strip the backslash-newline pairs, so the file
# written would not match what is shown here. systemd itself supports the
# trailing-backslash line continuations.
cat > /usr/lib/systemd/system/prometheus.service << 'EOF'
[Unit]
Documentation=Prometheus Monitoring System
Description=Prometheus
After=network.target

[Service]
ExecStart=/data/prometheus/prometheus   \
          --config.file=/data/common/conf/prometheus.yml   \
          --web.enable-admin-api   \
          --web.listen-address=:8090   \
          --log.format=json
Restart=on-failure

[Install]
WantedBy=multi-user.target
EOF

node_exporter 安装

# 负责收集信息, 默认端口: 9100, 可访问 http://IP:9100/metrics
# node_exporter v0.18.1: download (resumable), unpack, install under /data/node_exporter.
# (conventional 'tar xf' instead of the unusual 'tar fx' bundled-flag order — same behavior)
wget --continue https://github.com/prometheus/node_exporter/releases/download/v0.18.1/node_exporter-0.18.1.linux-amd64.tar.gz
tar xf node_exporter-0.18.1.linux-amd64.tar.gz
mv node_exporter-0.18.1.linux-amd64 node_exporter

# Install node_exporter as a systemd service (default port 9100).
# Fixes: order after network.target, restart on failure (consistent with the
# other units in this guide), and a quoted 'EOF' so the here-doc is literal.
cat > /usr/lib/systemd/system/node_exporter.service << 'EOF'
[Unit]
Description=node_exporter
After=network.target

[Service]
User=root
ExecStart=/data/node_exporter/node_exporter
Restart=on-failure

[Install]
WantedBy=default.target
EOF

Grafana 安装

# Grafana 7.5.5: download (resumable), unpack verbosely, install under /data/grafana.
wget --continue https://dl.grafana.com/oss/release/grafana-7.5.5.linux-amd64.tar.gz
tar --extract --gzip --verbose --file=grafana-7.5.5.linux-amd64.tar.gz
mv grafana-7.5.5 grafana

# Install Grafana as a systemd service (default web port 3000).
# BUG FIX: 'multi-user.targe' -> 'multi-user.target' — with the typo,
# 'systemctl enable' cannot link the unit into a non-existent target.
# Quoted 'EOF' keeps the here-doc literal.
cat > /usr/lib/systemd/system/grafana-server.service << 'EOF'
[Unit]
Description=Grafana
After=network.target

[Service]
Type=notify
ExecStart=/data/grafana/bin/grafana-server -homepath /data/grafana
Restart=on-failure

[Install]
WantedBy=multi-user.target
EOF

Alertmanager 安装

# Alertmanager v0.20.0: download (resumable), unpack, install under /data/alertmanager.
wget --continue https://github.com/prometheus/alertmanager/releases/download/v0.20.0/alertmanager-0.20.0.linux-amd64.tar.gz
tar --extract --file=alertmanager-0.20.0.linux-amd64.tar.gz
mv alertmanager-0.20.0.linux-amd64 alertmanager

# Install Alertmanager as a systemd service (default port 9093 —
# this matches the 'localhost:9093' alertmanager target in prometheus.yml).
cat > /usr/lib/systemd/system/alertmanager.service << EOF
[Unit]
Description=alertmanager
Documentation=https://github.com/prometheus/alertmanager
After=network.target
[Service]
Type=simple
User=root
ExecStart=/data/alertmanager/alertmanager --config.file=/data/common/conf/alertmanager.yml
Restart=on-failure
[Install]
WantedBy=multi-user.target
EOF

查看 prometheus 配置文件

# Write the main Prometheus config.
# BUG FIX: the original single job was named 'prometheus' but scraped
# localhost:9100 (node_exporter's port), so Prometheus itself (port 8090
# per the systemd unit) was never scraped. Split into two correctly named
# jobs. Quoted 'EOF' keeps the here-doc literal; absolute path avoids
# depending on the current directory.
cat > /data/common/conf/prometheus.yml << 'EOF'
global:
  scrape_interval:     15s
  evaluation_interval: 15s

alerting:
  alertmanagers:
  - static_configs:
    - targets: ['localhost:9093']

rule_files:
  - "/data/common/rules/*.yml"

scrape_configs:
  # Prometheus itself (custom --web.listen-address=:8090)
  - job_name: 'prometheus'
    static_configs:
    - targets: ['localhost:8090']
  # Host metrics from node_exporter
  - job_name: 'node_exporter'
    static_configs:
    - targets: ['localhost:9100']
EOF

查看 alertmanager.yml

# Write the Alertmanager config (routing, SMTP delivery, inhibition).
# NOTE(review): the SMTP credentials are hardcoded in plain text and port 25
# with smtp_require_tls: false sends them unencrypted — acceptable for a lab,
# but move secrets out of this file before real use.
# NOTE: the template name "email.to.html" must match the {{ define }} block
# in /data/common/template/email.tmpl.
[root@k8s-node02 data]# cat > common/conf/alertmanager.yml << EOF
global:
  resolve_timeout: 5m
  # 邮件SMTP配置
  smtp_smarthost: 'smtp.163.com:25'
  smtp_from: 'alizabbix@163.com'
  smtp_auth_username: 'alizabbix@163.com'
  smtp_auth_password: 'aabb1122'
  smtp_require_tls: false
# 自定义通知模板
templates:
  - '/data/common/template/email.tmpl'
# route用来设置报警的分发策略
route:
  # 采用哪个标签来作为分组依据
  group_by: ['alertname']
  # 组告警等待时间。也就是告警产生后等待10s,如果有同组告警一起发出
  group_wait: 10s
  # 两组告警的间隔时间
  group_interval: 10s
  # 重复告警的间隔时间,减少相同邮件的发送频率
  repeat_interval: 1h
  # 设置默认接收人
  receiver: 'email'
  routes:   # 可以指定哪些组接手哪些消息
  - receiver: 'email'
    continue: true
    group_wait: 10s
receivers:
- name: 'email'
  email_configs:
  - to: 'alizabbix@163.com'
    html: '{{ template "email.to.html" . }}'
    headers: { Subject: "Prometheus [Warning] 报警邮件" }
    send_resolved: true

##配置告警收敛,
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']
EOF

告警收敛 当我收到一个告警级别为 critical 时,他就会抑制掉 warning 这个级别的告警,这个告警等级是在你编写规则的时候定义的,最后一行就是要对哪些告警做抑制,通过标签匹配的,我这里只留了一个 instance,举个最简单的例子,当现在 alertmanager 先收到一条 critical、又收到一条 warning 且 instance 值一致的两条告警他的处理逻辑是怎样的。 例如 在监控 nginx,nginx 宕掉的告警级别为 warning,宿主机宕掉的告警级别为 critical,譬如说现在我跑 nginx 的服务器凉了,这时候 nginx 肯定也凉了,普罗米修斯发现后通知 alertmanager,普罗米修斯发过来的是两条告警信息,一条是宿主机凉了的,一条是 nginx 凉了的,alertmanager 收到之后,发现告警级别一条是 critical,一条是 warning,而且 instance 标签值一致,也就是说这是在一台机器上发生的,所以他就会只发一条 critical 的告警出来,warning 的就被抑制掉了,我们收到的就是服务器凉了的通知

查看告警邮件模板

# Write the e-mail notification template (Go template syntax).
# It defines "email.to.html", which is referenced by the 'html:' field of the
# email receiver in alertmanager.yml — the two names must stay in sync.
# The first section renders firing alerts, the second resolved ones.
[root@k8s-node02 data]# cat > common/template/email.tmpl  << EOF
{{ define "email.to.html" }}
{{- if gt (len .Alerts.Firing) 0 -}}
{{ range .Alerts }}
=========start==========<br>
告警程序: prometheus_alert <br>
告警级别: {{ .Labels.severity }} <br>
告警类型: {{ .Labels.alertname }} <br>
告警主机: {{ .Labels.instance }} <br>
告警主题: {{ .Annotations.summary }}  <br>
告警详情: {{ .Annotations.description }} <br>
触发时间: {{ .StartsAt.Format "2006-01-02 15:04:05" }} <br>
=========end==========<br>
{{ end }}{{ end -}}
 
{{- if gt (len .Alerts.Resolved) 0 -}}
{{ range .Alerts }}
=========start==========<br>
告警程序: prometheus_alert <br>
告警级别: {{ .Labels.severity }} <br>
告警类型: {{ .Labels.alertname }} <br>
告警主机: {{ .Labels.instance }} <br>
告警主题: {{ .Annotations.summary }} <br>
告警详情: {{ .Annotations.description }} <br>
触发时间: {{ .StartsAt.Format "2006-01-02 15:04:05" }} <br>
恢复时间: {{ .EndsAt.Format "2006-01-02 15:04:05" }} <br>
=========end==========<br>
{{ end }}{{ end -}}
 
{{- end }}
EOF

查看告警规则

# Alert rule: fire when any scrape target has been down (up == 0) for 10s.
# BUG FIX: severity must be lowercase 'warning' — label matching is
# case-sensitive, so 'Warning' would never match the severity values used by
# inhibit_rules in alertmanager.yml. The description also said "1 minute"
# while 'for:' is 10s; aligned it.
[root@k8s-node02 data]# cat common/rules/node.yml
groups:
- name: Node_exporter Down
  rules:
  - alert: Node实例已宕机
    expr: up == 0
    for: 10s
    labels:
      user: root
      severity: warning
    annotations:
      summary: "{{ $labels.job }}"
      address: "{{ $labels.instance }}"
      description: "Node_exporter 客户端连接失败已超过10秒."

启动并开机自启

# Start all four services now and enable them at boot.
for svc in prometheus node_exporter grafana-server alertmanager; do
  systemctl start "$svc"
  systemctl enable "$svc"
done
访问: http://192.168.1.136:8090             # Prometheus Web页面
访问: http://192.168.1.136:9100/metrics     # Node 收集的信息
访问: http://192.168.1.136:3000             # Grafana Web 页面

测试停止数据收集

systemct stop node_exporter

# 查看是否收到报警信息

Grafana 模板查找

https://grafana.com/grafana/dashboards

Last updated