prometheus监控告警部署

为了方便，这里使用docker-compose进行部署。

1.prometheus部署

docker-compose编排文件：

# Compose file that runs a single Prometheus server container.
version: '3.2'
services:
  prom01:
    image: prom/prometheus:v2.19.3
    container_name: prom01
    hostname: prom01
    # Running as root may be required so the container can write to the
    # data directory that is bind-mounted from the host.
    user: root
    restart: on-failure
    ports:
      - "9090:9090"
    volumes:
      # Configuration directory (prometheus.yml and rule files).
      - ./config:/etc/prometheus
      # Time-series data is persisted on the host.
      - ./data:/prometheus/data:rw
      # Use the host's Asia/Shanghai zone file as the container's local time.
      - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime
    command:
      # Enable the HTTP configuration-reload endpoint.
      - '--web.enable-lifecycle'
      # Configuration file to load.
      - '--config.file=/etc/prometheus/prometheus.yml'
      # How long samples are retained.
      - '--storage.tsdb.retention.time=30d'
      # Where the TSDB stores its data.
      - '--storage.tsdb.path=/prometheus/data'
      - '--web.console.libraries=/usr/share/prometheus/console_libraries'
      - '--web.console.templates=/usr/share/prometheus/consoles'

之前部署 Prometheus 2.20.x 及以上版本时似乎遇到过一个 timeout 问题，因此这里将版本回退到 2.19.3。

config/prometheus.yml配置文件

global:
  # Default scrape interval: collect samples every 30s (Prometheus' built-in
  # default is 1m). Durations accept the units ms, s, m, h, d, w, y.
  scrape_interval: 30s
  # How often alerting/recording rules are evaluated: every 30s (built-in
  # default is 1m).
  evaluation_interval: 30s
  # Default scrape timeout; it must not be larger than scrape_interval.
  scrape_timeout: 10s

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        # Address (host:port) on which Alertmanager listens.
        # NOTE: 'alert01' is the container/hostname used by the Alertmanager
        # compose file. It only resolves if both containers are attached to
        # the same Docker network; otherwise use the host's IP together with
        # the published port 9093.
        - targets:
            - 'alert01:9093'

# Rule files are loaded at startup and on configuration reload; the rules they
# contain are then evaluated every evaluation_interval.
# In a federated setup the child nodes may omit <alerting> and <rule_files>
# and leave alerting to the aggregating server, or they may define alerts of
# their own for the metrics that are filtered out locally.
rule_files:
  - "./rules/*.yml"

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # job_name is attached to every scraped series as the `job` label and can
  # be used in queries.
  - job_name: 'prometheus'
    # Per-job scrape interval; falls back to the global value when omitted.
    scrape_interval: 30s
    # Statically configured targets.
    static_configs:
      # Address to scrape; it becomes the `instance` label. Prometheus scrapes
      # itself here, so the loopback address is used instead of a service name
      # that does not exist in the compose file.
      - targets: ['localhost:9090']

使用docker-compose -f docker-compose.yml up -d启动容器。

2.alertmanager部署

docker-compose编排文件

# Compose file that runs a single Alertmanager container.
version: '3.2'
services:
  alert01:
    image: prom/alertmanager:v0.21.0
    container_name: alert01
    hostname: alert01
    restart: on-failure
    user: root
    ports:
      - "9093:9093"
    volumes:
      # Main configuration file.
      - ./config/alertmanager.yml:/etc/alertmanager/alertmanager.yml
      # Notification templates (*.tmpl) referenced from alertmanager.yml.
      - ./config/tmpl:/etc/alertmanager/config
      # Alertmanager state is persisted on the host.
      - ./data:/etc/alertmanager/data:rw
      # Use the host's Asia/Shanghai zone file as the container's local time.
      - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime
    command:
      # Configuration file to load.
      - '--config.file=/etc/alertmanager/alertmanager.yml'
      # Directory in which Alertmanager stores its data.
      - '--storage.path=/etc/alertmanager/data'

config/alertmanager.yml配置文件

global:
  # An alert that has not been updated for this long is declared resolved.
  resolve_timeout: 5m
  # SMTP settings
  # Sender address.
  smtp_from: 'xxx@163.com'
  # SMTP server as host:port (for QQ mail this would be 'smtp.qq.com:465').
  smtp_smarthost: 'smtp.163.com:25'
  smtp_auth_username: 'xxx@163.com'
  # Mailbox-specific authorization code.
  smtp_auth_password: 'xxxxxx'
  # Do not require TLS for the SMTP connection.
  smtp_require_tls: false
templates:
  - '/etc/alertmanager/config/*.tmpl'
# route: how alerts are grouped and dispatched to receivers.
route:
  # Default receiver for alerts that match no child route.
  receiver: dev-receiver
  # How long to wait before sending the first notification for a new group.
  group_wait: 30s
  # How long to wait before notifying about new alerts added to a group that
  # has already been notified.
  group_interval: 5m
  # How long to wait before re-sending a notification (default 4h). Do not set
  # this too low when using email, or the SMTP server may reject the traffic
  # for being too frequent.
  repeat_interval: 12h
  # Labels by which alerts are grouped.
  group_by: [alertname]
  # Child routes.
  routes:
    # Development team.
    - receiver: dev-receiver
      group_wait: 30s
      # `match` compares label values for exact equality, so 'dev|test' would
      # only match that literal string; `match_re` treats it as a regular
      # expression and matches env=dev or env=test.
      match_re:
        env: dev|test

#    - receiver: database-pager
#      group_wait: 10s
#      match_re:
#        service: mysql|cassandra

receivers:
  - name: dev-receiver
#    email_configs:
#      - to: 'xxx@163.com'
#        send_resolved: true
#        headers: { Subject: '[dev] 报警邮件'}
    webhook_configs:
      # DingTalk robot notifications.
      - url: http://dingding-webhook:8060/dingtalk/webhook_mention_users/send
        send_resolved: true
      # WeChat Work robot notifications.
      # - url: http://wechat-webhook:8060/dingtalk/webhook2/send
      #   send_resolved: true
    wechat_configs:
      # Also notify when an alert is resolved.
      - send_resolved: true
        # Company ID (My Company --> CorpId, at the bottom of the page).
        corp_id: 'xxx'
        # ID of the receiving department.
        to_party: '2'
        # WeChat Work --> custom application --> AgentId.
        agent_id: 'xxx'
        # WeChat Work --> custom application --> Secret.
        api_secret: 'xxx'
        # Message template to use; the name is the one given in the
        # template file's `define` block.
        # message: '{{ template "wechat.default.message" . }}'

使用docker-compose -f docker-compose.yml up -d启动容器。

3.grafana部署

docker-compose编排文件（以下仅为 services 下的服务片段，使用时需像前两个文件一样补全 version 和 services 头部）：

  # Grafana service definition.
  grafana:
    image: grafana/grafana:7.1.0
    container_name: grafana
    hostname: grafana
    user: root
    restart: on-failure
    ports:
      - "3000:3000"
    volumes:
      # Grafana's data directory is persisted on the host.
      - ./data:/var/lib/grafana:rw
    environment:
      # Values are quoted so they stay strings in mapping form.
      # Allow access without logging in.
      GF_AUTH_ANONYMOUS_ENABLED: "true"
      # Allow Grafana pages to be embedded in other pages.
      GF_SECURITY_ALLOW_EMBEDDING: "true"
      # NOTE(review): weak admin password; change it outside of test setups.
      GF_SECURITY_ADMIN_PASSWORD: "admin"
      # Container time zone.
      TZ: "Asia/Shanghai"

使用docker-compose -f docker-compose.yml up -d启动容器。

4.配置

部署成功后，登录grafana页面，在数据源里面添加prometheus就可以查看收集的指标数据了。