Prometheus
exporter采集
node
# Run the node-exporter container in the background and publish its
# port 9100 on the host.
docker run \
  --detach \
  --name node-exporter \
  --publish 9100:9100 \
  prom/node-exporter
报警媒介
alertmanager
# Run the Alertmanager container in the background, publish port 9093 on the
# host, and bind-mount the host's alertmanager.yml over the container's
# /etc/alertmanager/alertmanager.yml.
docker run \
  --detach \
  --name alertmanager \
  --publish 9093:9093 \
  --volume /home/chenqionghe/promethues/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml \
  prom/alertmanager
alertmanager.yml
# Alertmanager configuration: routes every alert to a single webhook receiver.
global:
  resolve_timeout: 5m

route:
  group_by: ['alertname']  # alert grouping labels
  group_wait: 30s  # wait time before notifying for a new group
  group_interval: 5m  # interval between notifications for a group
  repeat_interval: 30m  # interval before repeating a notification
  receiver: 'web.hook'

receivers:
  - name: 'web.hook'
    webhook_configs:
      # The robot access token is a secret: it was replaced with a placeholder
      # here. Substitute your own token and keep it out of version control.
      # NOTE(review): Alertmanager posts its own JSON payload; the DingTalk
      # robot API presumably needs an adapter in between - confirm.
      - url: 'https://oapi.dingtalk.com/robot/send?access_token=<YOUR_DINGTALK_ACCESS_TOKEN>'

# Inhibit 'warning' alerts while a 'critical' alert with the same
# alertname, dev and instance labels is active.
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']
Prometheus主配置
# cat prometheus.yml
# Global settings
global:
  scrape_interval: 5s  # scrape targets every 5s (Prometheus default: 1m)
  evaluation_interval: 5s  # evaluate rules every 5s (Prometheus default: 1m)
  external_labels:
    monitor: 'codelab-monitor'

# Rule files are loaded once and then evaluated periodically according to
# the global 'evaluation_interval'.
rule_files:
  - "rules.yml"
  # - "second_rules.yml"

# Prometheus scrape targets
scrape_configs:
  # The job name is added as a label "job=<job_name>" to every time series
  # scraped from this config.
  - job_name: 'prometheus'
    scrape_interval: 5s  # per-job interval; same value as the global setting
    # Targets are requested over http; the default path is '/metrics'.
    static_configs:
      - targets: ['localhost:9090']
      - targets: ['10.11.9.248:9100']
        labels:
          group: 'client-node-exporter'

# Alertmanager address used for sending alerts
alerting:
  alertmanagers:
    - static_configs:
        - targets: ["10.11.9.248:9093"]
Prometheus规则
# cat rules.yml
# Alerting rules: a single group with one rule that fires when a target's
# 'up' metric has been 0 for 30 seconds.
groups:
  - name: alert
    rules:
      - alert: NodeDown
        expr: up == 0
        for: 30s
        labels:
          severity: critical
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "instance: {{ $labels.instance }} down"
          # The duration in this message is kept in sync with 'for: 30s' above
          # (it previously claimed one minute).
          description: "instance: {{ $labels.instance }} 已经宕机超过 30 秒"
          value: "{{ $value }}"
# Start Prometheus with --web.enable-lifecycle so the configuration can be
# hot-reloaded remotely: curl -X POST http://localhost:9090/-/reload
# Arguments placed after the image name replace the image's default command,
# so --config.file and --storage.tsdb.path are passed explicitly as well.
docker run --name=prometheus -d \
  -p 9090:9090 \
  -v /home/promethues/server/prometheus.yml:/etc/prometheus/prometheus.yml \
  -v /home/promethues/server/rules.yml:/etc/prometheus/rules.yml \
  prom/prometheus \
  --config.file=/etc/prometheus/prometheus.yml \
  --storage.tsdb.path=/prometheus \
  --web.enable-lifecycle
# Web UI: http://10.211.55.25:9090/
# Detailed monitoring metrics: http://10.211.55.25:9090/metrics
参考
https://www.e-learn.cn/topic/4007819
https://juejin.cn/post/6923558840285790215
https://blog.51cto.com/u_12965094/2687946
Last updated