1. 部署Alertmanager
安装包下载
地址1:https://prometheus.io/download/
地址2:https://github.com/prometheus/alertmanager/releases
下载部署Linux版Alertmanager
# Unpack the release tarball downloaded above.
tar zxf alertmanager-0.22.2.linux-amd64.tar.gz
# The archive extracts to a directory named after the tarball
# (alertmanager-0.22.2.linux-amd64), so that is the directory to move.
mv alertmanager-0.22.2.linux-amd64 /usr/local/alertmanager
vi /usr/lib/systemd/system/alertmanager.service
# systemd unit that runs Alertmanager from /usr/local/alertmanager.
[Unit]
Description=alertmanager
# Start only after basic networking is up, since Alertmanager listens on a TCP port.
After=network.target

[Service]
# --storage.path is set explicitly: the default is the relative path "data/",
# which would otherwise resolve against the service's working directory.
ExecStart=/usr/local/alertmanager/alertmanager \
    --config.file=/usr/local/alertmanager/alertmanager.yml \
    --storage.path=/usr/local/alertmanager/data
# SIGHUP makes Alertmanager reload its configuration file.
ExecReload=/bin/kill -HUP $MAINPID
KillMode=process
Restart=on-failure

[Install]
WantedBy=multi-user.target
指定告警规则文件路径
vi /usr/local/prometheus/prometheus.yml
# Edit the existing `alerting` and `rule_files` sections of prometheus.yml;
# do not add a second copy of either key.
#
# Prometheus only forwards firing alerts to the Alertmanager instances listed
# here. 9093 is the Alertmanager port used later in this guide.
# NOTE(review): 127.0.0.1 assumes Alertmanager runs on the same host as
# Prometheus - replace with the real address otherwise.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - '127.0.0.1:9093'

# Alerting rule files, resolved relative to the directory of prometheus.yml.
rule_files:
  - "rules/*.yml"
配置告警规则
mkdir /usr/local/prometheus/rules
vi /usr/local/prometheus/rules/general.yml
# General host alerting rules for Prometheus (node_exporter metrics).
#
# Most conditions are defined twice under the same alert name: once with
# severity "warning" and once with severity "critical", so that Alertmanager
# can inhibit the warning while the critical alert for the same
# alertname/instance is firing.
#
# NOTE(review): rules that aggregate with `by (instance)` drop every other
# label, so `{{ $labels.job }}` renders empty in their annotations.
groups:
  - name: general  # rule group name
    rules:
      # Fire when any scrape target has been unreachable for 3 minutes.
      - alert: 主机状态  # alert rule name
        expr: up == 0  # PromQL trigger condition
        for: 3m  # how long the condition must hold before firing
        labels:  # custom labels attached to the alert
          severity: critical
        annotations:  # additional human-readable information
          summary: "{{ $labels.instance }} 停止工作"
          description: "{{ $labels.instance }}:job {{ $labels.job }} 已经停止3分钟以上."

      # CPU usage = 100 - idle percentage, averaged over all cores per instance.
      - alert: CPU使用率
        expr: 100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "{{ $labels.instance }}CPU使用率过高"
          description: "{{ $labels.instance }}:job {{ $labels.job }} CPU使用率大于80%,(当前值: {{ $value }})"
      - alert: CPU使用率
        expr: 100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance) * 100) > 90
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "{{ $labels.instance }}CPU使用率过高"
          description: "{{ $labels.instance }}:job {{ $labels.job }} CPU使用率大于90%,(当前值: {{ $value }})"

      # Memory usage = 100 - (free + buffers + cached) / total, in percent.
      - alert: Memory使用率
        expr: (100 - (((node_memory_MemFree_bytes+node_memory_Buffers_bytes+node_memory_Cached_bytes)/node_memory_MemTotal_bytes) * 100)) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "{{ $labels.instance }}内存使用率过高"
          description: "{{ $labels.instance }}:job {{ $labels.job }} 内存使用率大于80%,(当前值: {{ $value }})"
      - alert: Memory使用率
        expr: (100 - (((node_memory_MemFree_bytes+node_memory_Buffers_bytes+node_memory_Cached_bytes)/node_memory_MemTotal_bytes) * 100)) > 90
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "{{ $labels.instance }}内存使用率过高"
          description: "{{ $labels.instance }}:job {{ $labels.job }} 内存使用率大于90%,(当前值: {{ $value }})"

      # Disk space usage per ext4/xfs filesystem, in percent.
      - alert: Disk使用率
        expr: (1-(node_filesystem_free_bytes{fstype=~"ext4|xfs"} / node_filesystem_size_bytes{fstype=~"ext4|xfs"})) * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "{{ $labels.instance }}磁盘使用率过高"
          description: "{{ $labels.instance }}:job {{ $labels.job }} 磁盘使用率大于80%,(当前值: {{ $value }})"
      - alert: Disk使用率
        expr: (1-(node_filesystem_free_bytes{fstype=~"ext4|xfs"} / node_filesystem_size_bytes{fstype=~"ext4|xfs"})) * 100 > 90
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "{{ $labels.instance }}磁盘使用率过高"
          description: "{{ $labels.instance }}:job {{ $labels.job }} 磁盘使用率大于90%,(当前值: {{ $value }})"

      # Inodes: the expression yields the percentage of FREE inodes on "/",
      # so the alert fires when that value drops below the threshold.
      - alert: Inode使用率
        expr: node_filesystem_files_free{mountpoint ="/"} / node_filesystem_files{mountpoint ="/"} * 100 < 20
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "{{ $labels.instance }}Inode使用率过高"
          description: "{{ $labels.instance }}:job {{ $labels.job }} Inode使用率大于80%,(当前剩余: {{ $value }})"
      - alert: Inode使用率
        expr: node_filesystem_files_free{mountpoint ="/"} / node_filesystem_files{mountpoint ="/"} * 100 < 10
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "{{ $labels.instance }}Inode使用率过高"
          description: "{{ $labels.instance }}:job {{ $labels.job }} Inode使用率大于90%,(当前剩余: {{ $value }})"

      # Disk IO utilisation: share of time the disks were busy, in percent.
      # The previous form `100 - util < 60` fired at only 40% utilisation and
      # made the critical rule (`< 80`) trigger before the warning rule; the
      # comparison now matches the thresholds stated in the descriptions.
      - alert: IO性能
        expr: avg(irate(node_disk_io_time_seconds_total[1m])) by (instance) * 100 > 60
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "{{$labels.instance}} 流入磁盘IO使用率过高!"
          description: "{{$labels.instance}}:job {{ $labels.job }} 流入磁盘IO大于60%(目前使用:{{$value}})"
      - alert: IO性能
        expr: avg(irate(node_disk_io_time_seconds_total[1m])) by (instance) * 100 > 80
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: "{{$labels.instance}} 流入磁盘IO使用率过高!"
          description: "{{$labels.instance}}:job {{ $labels.job }} 流入磁盘IO大于80%(目前使用:{{$value}})"

      # Network throughput per instance in KiB/s, virtual interfaces excluded.
      # Label regexes are fully anchored, so `virbr.*` and `lo` are needed to
      # exclude virbr0 and the loopback device (`virbr*` / `lo*` did not).
      # Thresholds 76800 and 102400 are 75*1024 and 100*1024, hence the
      # division by 1024 (bytes/s -> KiB/s).
      # NOTE(review): the original divided by 100 - confirm the intended unit.
      - alert: 网络流入
        expr: ((sum(rate(node_network_receive_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo'}[5m])) by (instance)) / 1024) > 76800
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "{{$labels.instance}} 流入网络带宽过高!"
          description: "{{$labels.instance}}:job {{ $labels.job }}流入网络带宽持续5分钟高于75M. RX带宽使用率{{$value}}"
      - alert: 网络流入
        expr: ((sum(rate(node_network_receive_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo'}[5m])) by (instance)) / 1024) > 102400
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "{{$labels.instance}} 流入网络带宽过高!"
          description: "{{$labels.instance}}:job {{ $labels.job }}流入网络带宽持续5分钟高于100M. RX带宽使用率{{$value}}"
      - alert: 网络流出
        expr: ((sum(rate(node_network_transmit_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo'}[5m])) by (instance)) / 1024) > 76800
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "{{$labels.instance}} 流出网络带宽过高!"
          description: "{{$labels.instance}}:job {{ $labels.job }}流出网络带宽持续5分钟高于75M. TX带宽使用率{{$value}}"
      # The critical outbound rule must read the transmit counter; it
      # previously queried node_network_receive_bytes_total.
      - alert: 网络流出
        expr: ((sum(rate(node_network_transmit_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo'}[5m])) by (instance)) / 1024) > 102400
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "{{$labels.instance}} 流出网络带宽过高!"
          description: "{{$labels.instance}}:job {{ $labels.job }}流出网络带宽持续5分钟高于100M. TX带宽使用率{{$value}}"

      # Established TCP connections (an absolute count, not a percentage).
      - alert: TCP会话
        expr: node_netstat_Tcp_CurrEstab > 800
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "{{$labels.instance}} TCP_ESTABLISHED太高!"
          description: "{{$labels.instance }}:job {{ $labels.job }} TCP_ESTABLISHED大于800(目前使用:{{$value}})"
      - alert: TCP会话
        expr: node_netstat_Tcp_CurrEstab > 1000
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "{{$labels.instance}} TCP_ESTABLISHED太高!"
          description: "{{$labels.instance }}:job {{ $labels.job }} TCP_ESTABLISHED大于1000(目前使用:{{$value}})"
配置邮件告警规则
vi alertmanager.yml
# Alertmanager configuration: send every alert by e-mail through 163.com SMTP.
global:
  resolve_timeout: 5m
  smtp_smarthost: 'smtp.163.com:25'
  smtp_from: 'willianlacus@163.com'
  smtp_auth_username: 'xxxx@163.com'
  # Use the mailbox's SMTP authorization code here, not the login password.
  smtp_auth_password: 'xxxxxx'
  smtp_require_tls: false

route:
  group_by: ['alertname']  # group alerts by alert rule name
  group_wait: 30s  # wait this long after the first alert of a new group so later ones are batched into one notification
  group_interval: 10s  # wait before notifying about new alerts added to an already-notified group
  repeat_interval: 1h  # re-send interval for alerts that are still firing
  receiver: 'mail'

receivers:
  - name: 'mail'
    email_configs:
      - to: 'xxxx@qq.com'

# Inhibition: when several alerts share the same alertname and instance,
# only the critical one is notified; the matching warning alert is muted.
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'instance']
检查告警规则
cd /usr/local/prometheus
./promtool check config prometheus.yml
Checking prometheus.yml
SUCCESS: 1 rule files found
Checking rules/general.yml
SUCCESS: 17 rules found
---没有额外报错说明没问题
启动alertmanager
# Make systemd read the newly created alertmanager.service unit file.
systemctl daemon-reload
# Start Alertmanager now and enable it at boot.
systemctl start alertmanager
systemctl enable alertmanager
2. 告警状态
• Inactive:这里什么都没有发生。
• Pending:已触发阈值,但未满足告警持续时间
• Firing:已触发阈值且满足告警持续时间。警报发送给接收者。
浏览器访问alertmanager:
IP:9093
浏览器访问prometheus查看,可以看到前面配置的告警规则和告警状态
3. 测试告警
vi /usr/local/prometheus/rules/general.yml ---把内存warning规则降低到30
(100 - (((node_memory_MemFree_bytes+node_memory_Buffers_bytes+node_memory_Cached_bytes)/node_memory_MemTotal_bytes) * 100)) > 30
ps aux | grep prometheus ---查看prometheus进程的PID号
kill -HUP prometheus进程PID ---刷新prometheus的配置
查看Status--Rules,可以看到已经修改了
查看Alerts,可以看到有几个变成PENDING状态了
过5分钟收到告警邮件说明成功了