Github地址: https://github.com/prometheus/blackbox_exporter/tree/master
前提
已经拥有可用的 prometheus 环境。这里我已经有这个环境,存放目录在 /data
安装blackbox_exporter
具体可以监控哪些: tcp、icmp、http (GET/POST) 等, 具体看 blackbox.yml 配置
# Download the blackbox_exporter v0.16.0 linux-amd64 release, unpack it under
# /data and rename the directory to a version-independent path. The steps are
# chained with && so a failed download or extraction stops the sequence
# instead of continuing with a missing binary.
wget https://github.com/prometheus/blackbox_exporter/releases/download/v0.16.0/blackbox_exporter-0.16.0.linux-amd64.tar.gz \
  && tar -zxvf blackbox_exporter-0.16.0.linux-amd64.tar.gz -C /data \
  && mv /data/blackbox_exporter-0.16.0.linux-amd64 /data/blackbox_exporter

# Write the systemd unit file. The here-doc delimiter is quoted so the shell
# copies the text literally (no backslash or $ processing); the trailing
# backslash on the ExecStart line is then interpreted by systemd itself as a
# line continuation.
cat > /usr/lib/systemd/system/blackbox_exporter.service << 'EOF'
[Unit]
Description=blackbox_exporter
After=network.target

[Service]
WorkingDirectory=/data/blackbox_exporter
ExecStart=/data/blackbox_exporter/blackbox_exporter \
  --config.file=/data/blackbox_exporter/blackbox.yml

[Install]
WantedBy=multi-user.target
EOF

# Pick up the new unit, start the exporter now and enable it at boot.
systemctl daemon-reload && systemctl start blackbox_exporter && systemctl enable blackbox_exporter

# Confirm that something is listening on port 9115.
ss -tunlp | grep 9115
# The exporter's web page can then be viewed at http://<ip>:9115
监控主机存活状态
注意: job_name 必须添加到 scrape_configs 下。说明: labels -----> 打上标签; replacement ----> 黑盒服务 (blackbox_exporter) 地址
# scrape_configs:   # if the file already has a scrape_configs key, add only the job below under it
  # ICMP (ping) liveness probe for hosts, executed through blackbox_exporter.
  - job_name: 'blackbox_icmp_ping'
    metrics_path: /probe
    params:
      module: [icmp]            # module name as defined in blackbox.yml
    static_configs:
      - targets:                # hosts to ping
          - 192.168.1.2
          - 192.168.1.3
        labels:
          group: 'icmp_ping'
    relabel_configs:
      # Pass the listed target to the exporter as the ?target= URL parameter.
      - source_labels: [__address__]
        target_label: __param_target
      # Keep the probed host as the instance label.
      - source_labels: [__param_target]
        target_label: instance
      # Scrape the blackbox_exporter itself instead of the target.
      - target_label: __address__
        replacement: 127.0.0.1:9115
监控网站状态
# scrape_configs:
  # HTTP status probe for web sites, executed through blackbox_exporter.
  - job_name: 'blackbox_web_status'
    metrics_path: /probe
    params:
      module: [http_2xx]        # module name as defined in blackbox.yml
    static_configs:
      - targets:                # URLs to probe
          - http://baidu.com
          - https://baidu.com
          - http://google.com
          - http://google.com:8080
        labels:
          group: 'http_status'
    relabel_configs:
      # Pass the listed URL to the exporter as the ?target= URL parameter.
      - source_labels: [__address__]
        target_label: __param_target
      # Keep the probed URL as the instance label.
      - source_labels: [__param_target]
        target_label: instance
      # Scrape the blackbox_exporter itself instead of the target.
      - target_label: __address__
        replacement: 127.0.0.1:9115
监控主机端口存活状态
# scrape_configs:
  # TCP port liveness probe, executed through blackbox_exporter.
  - job_name: 'blackbox_tcp_port'
    metrics_path: /probe
    params:
      module: [tcp_connect]     # module name as defined in blackbox.yml
    static_configs:
      - targets:                # host:port pairs to connect to
          - 1.1.1.1:8080
          - 1.1.1.1:6379
        labels:
          group: 'tcp_port'
    relabel_configs:
      # Pass the listed host:port to the exporter as the ?target= URL parameter.
      - source_labels: [__address__]
        target_label: __param_target
      # Keep the probed host:port as the instance label.
      - source_labels: [__param_target]
        target_label: instance
      # Scrape the blackbox_exporter itself instead of the target.
      - target_label: __address__
        replacement: 127.0.0.1:9115
接口 post 请求状态检测
首先我们要改一下 post 相关接口的 blackbox.yml 配置,我们自己定义一个模块 (修改后需要重启 blackbox_exporter 使配置生效: systemctl restart blackbox_exporter)
[root@prometheus blackbox]# cat blackbox.yml
modules:
  http_2xx:
    prober: http
  # The module name is free to choose; reference this same name in the
  # `module` parameter of the Prometheus job. The rest of the job
  # configuration is the same as for the other probes.
  http_post_2xx:
    prober: http
    http:
      method: POST
      headers:
        Content-Type: application/json   # request header to add
      # Request payload; a login endpoint is used as the example here.
      body: '{"username":"admin","password":"123456"}'
如果主机上百个?
""" 例如原来 prometheus 配置文件
static_configs:
  - targets:
      - 1.1.1.1:8080
      - 1.1.1.1:6379
    labels:
      group: 'tcp_port'
"""
||
""" 更改后 prometheus 配置文件
# Read this job's targets from a file that Prometheus re-reads periodically.
# Each probe job should point at its own target file: a shared glob such as
# blackbox*.yml would also pull the icmp/http targets into the TCP job.
file_sd_configs:
  - refresh_interval: 10s
    files:
      - "/data/prometheus/rules/blackbox-tcp.yml"
"""
存放位置
ls /data/prometheus/rules/
icmp 存放文件: blackbox-icmp.yml
http 存放文件: blackbox-http.yml
tcp 存放文件: blackbox-tcp.yml
格式例如
# icmp、http、tcp 格式一样, 例如 blackbox-tcp.yml
[root@prometheus blackbox]# cd /data/prometheus/rules/
[root@prometheus rules]# cat blackbox-tcp.yml
# One target group: the endpoints to probe plus the labels attached to them.
- targets:
    - 1.1.1.1:8080
    - xxxx            # placeholder: add further host:port entries here
  labels:
    group: 'tcp_port'
告警规则
icmp、tcp、http、post 监测是否正常可以观察 probe_success 这一指标
probe_success == 0 # 连通性异常
probe_success == 1 # 连通性正常
告警也是判断这个指标是否等于 0,如等于 0 则触发异常报警
定制告警规则
[root@prometheus rules]# cat alert-rules.yml
groups:
  - name: blackbox_network_stats
    rules:
      # Fires when a probe has reported failure (probe_success == 0)
      # continuously for one minute.
      - alert: blackbox_network_stats
        expr: probe_success == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "接口/主机/端口 {{ $labels.instance }} 无法连通"
          description: "请尽快检测"
大屏显示Grafana
使用模板编号: 9965。可以以此模板为参考,自定义漂亮的模板 <个人觉得 9965 不太好看>。更多模板: https://grafana.com/grafana/dashboards
# Note: dashboard 9965 needs the pie chart panel plugin; without it Grafana
# reports "Panel plugin not found: grafana-piechart-panel".
grafana-cli plugins install grafana-piechart-panel
# Grafana loads newly installed plugins only at startup, so restart it.
# NOTE(review): assumes Grafana runs as the systemd service grafana-server.
systemctl restart grafana-server