Prometheus

exporter采集

node

# Start the Prometheus node exporter as a detached container, publishing
# container port 9100 on host port 9100.
docker run --detach \
  --name node-exporter \
  --publish 9100:9100 \
  prom/node-exporter

报警媒介

alertmanager

# Start Alertmanager as a detached container, publishing port 9093 and
# bind-mounting the host's alertmanager.yml to
# /etc/alertmanager/alertmanager.yml inside the container.
docker run --detach \
  --name alertmanager \
  --publish 9093:9093 \
  --volume /home/chenqionghe/promethues/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml \
  prom/alertmanager
  • alertmanager.yml

    # Alertmanager configuration: one default route that sends every alert
    # group to the 'web.hook' receiver, plus one inhibition rule.
    global:
      resolve_timeout: 5m
    route:
      group_by: ['alertname']  # labels used to group alerts
      group_wait: 30s          # wait time before notifying for a group
      group_interval: 5m       # interval between notifications for a group
      repeat_interval: 30m     # interval before a notification is repeated
      receiver: 'web.hook'
    receivers:
      - name: 'web.hook'
        webhook_configs:
          # NOTE(review): the access_token in this URL is a credential; keep
          # it out of version control and rotate it if it is a real token.
          # NOTE(review): confirm the DingTalk robot endpoint accepts
          # Alertmanager's webhook payload as-is; an adapter may be needed.
          - url: 'https://oapi.dingtalk.com/robot/send?access_token=4df1f47707ea1293c3760f67368a771adc15e90007e7028fc94bfbc217a6c6fd'
    inhibit_rules:
      # While a 'critical' alert is firing, mute 'warning' alerts that carry
      # the same values for the labels listed in `equal`.
      - source_match:
          severity: 'critical'
        target_match:
          severity: 'warning'
        equal: ['alertname', 'dev', 'instance']

Prometheus主配置

# cat prometheus.yml
# Global settings.
global:
  # The default scrape interval is 1m; targets are scraped every 5s here.
  scrape_interval: 5s
  # Rules are evaluated every 1m by default; every 5s here.
  evaluation_interval: 5s
  external_labels:
    monitor: codelab-monitor

# Rule files are loaded once and then evaluated periodically according to
# the global 'evaluation_interval'.
rule_files:
  - 'rules.yml'
  # - "second_rules.yml"

# Scrape configuration.
scrape_configs:
  # The job name is added as a label "job=<job name>" to every time series
  # scraped from this config.
  - job_name: prometheus
    # Per-job scrape interval; it equals the global 5s value set above.
    scrape_interval: 5s
    # Targets are requested over HTTP; the default path is '/metrics'.
    static_configs:
      - targets:
          - 'localhost:9090'
      - targets:
          - '10.11.9.248:9100'
        labels:
          group: client-node-exporter

# Alertmanager address that alerts are sent to.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - '10.11.9.248:9093'

Prometheus规则

# cat rules.yml
groups:
  - name: alert
    rules:
      # Fires once `up == 0` has held for a target for 30s.
      - alert: NodeDown
        expr: up == 0
        for: 30s
        labels:
          severity: critical
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "instance: {{ $labels.instance }} down"
          # The duration wording matches the `for: 30s` threshold above.
          description: "instance: {{ $labels.instance }} 已经宕机超过 30 秒"
          value: "{{ $value }}"
# Hot reload of the configuration (curl -X POST http://localhost:9090/-/reload)
# requires starting Prometheus with --web.enable-lifecycle; the command below
# does not pass that flag.
docker run --detach \
  --name prometheus \
  --publish 9090:9090 \
  --volume /home/promethues/server/prometheus.yml:/etc/prometheus/prometheus.yml \
  --volume /home/promethues/server/rules.yml:/etc/prometheus/rules.yml \
  prom/prometheus

# http://10.211.55.25:9090/
# http://10.211.55.25:9090/metrics  detailed monitoring information

参考

https://www.e-learn.cn/topic/4007819
https://juejin.cn/post/6923558840285790215
https://blog.51cto.com/u_12965094/2687946

Last updated