prometheus采集cadvisor数据:
添加cadvisor
root@master:~# vim /usr/local/prometheus/prometheus.yml
  - job_name: 'prometheus--cadvisor'
    static_configs:
    - targets: ['192.168.200.206:8080','192.168.200.207:8080']
重启prometheus:
root@master:~# systemctl restart prometheus
导入镜像
root@master:~# docker load -i cadvisor_v0.33.0.tar.gz
打标签
root@master:~# docker tag gcr.io/google-containers/cadvisor:v0.33.0 harbor.wyh.net/baseimages/cadvisor:v0.33.0
上传镜像
root@master:~# docker push harbor.wyh.net/baseimages/cadvisor:v0.33.0
启动cadvisor容器:
docker run \
--volume=/:/rootfs:ro \
--volume=/var/run:/var/run:rw \
--volume=/sys:/sys:ro \
--volume=/var/lib/docker/:/var/lib/docker:ro \
--volume=/dev/disk/:/dev/disk:ro \
--publish=8080:8080 \
--detach=true \
--name=cadvisor \
harbor.wyh.net/baseimages/cadvisor:v0.33.0
验证cadvisor web界面:
访问node节点的cadvisor监听端口
grafana添加pod监控模板:
395 893 容器模板ID
395模板
prometheus报警设置:
prometheus触发一条告警的过程:
prometheus--->触发阈值--->超出持续时间--->alertmanager--->分组|抑制|静默--->媒体类型--->邮件|钉钉|微信等。
分组(group): 将类似性质的警报合并为单个通知。
静默(silences): 是一种简单的特定时间静音的机制,例如:服务器要升级维护可以先设置这个时间段告警静默。
抑制(inhibition): 当警报发出后,停止重复发送由此警报引发的其他警报,可以消除冗余告警
解压
root@master2:/usr/local/src# tar xf alertmanager-0.19.0.linux-amd64.tar.gz
做个软连接
root@master2:/usr/local/src# ln -sv /usr/local/src/alertmanager-0.19.0.linux-amd64 /usr/local/alertmanager
设置启动脚本
root@master2:/usr/local/alertmanager# vim /etc/systemd/system/alertmanager.service
[Unit]
Description=Prometheus Alertmanager
Documentation=https://prometheus.io/docs/introduction/overview/
After=network.target
[Service]
Restart=on-failure
ExecStart=/usr/local/alertmanager/alertmanager --config.file=/usr/local/alertmanager/alertmanager.yml
[Install]
WantedBy=multi-user.target
配置alertmanager:
root@master2:/usr/local/alertmanager# grep '^[^#]' alertmanager.yml
global:
  resolve_timeout: 5m
  smtp_smarthost: 'smtp.qq.com:465'
  smtp_from: '50589143@qq.com'
  smtp_auth_username: '50589143@qq.com'
  smtp_auth_password: 'pzjypoauatdvcadh'
  smtp_hello: '@qq.com'
  smtp_require_tls: false
route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 60s
  receiver: 'web.hook'
receivers:
- name: 'web.hook'
  email_configs:
  - to: '2973707860@qq.com'
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']
重启服务
root@master2:/usr/local/alertmanager# systemctl restart alertmanager
查看端口
root@master2:/usr/local/alertmanager# ss -tnl | grep 9093
LISTEN 0 128 *:9093 *:*
验证是否会报警
root@master2:/usr/local/alertmanager# ./amtool alert --alertmanager.url=http://192.168.200.197:9093
Alertname Starts At Summary
配置prometheus报警规则:
root@master:/etc/ansible# vim /usr/local/prometheus/prometheus.yml
8 alerting:
9   alertmanagers:
10   - static_configs:
11     - targets:
12       - 192.168.200.197:9093
13       # - alertmanager:9093
14
15 # Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
16 rule_files:
17   - "/usr/local/prometheus/rule-linux37.yml"
root@master:/etc/ansible# vim /usr/local/prometheus/rule-linux37.yml
groups:
- name: linux37_pod.rules
  rules:
  - alert: Pod_all_cpu_usage
    expr: (sum by(name)(rate(container_cpu_usage_seconds_total{image!=""}[5m]))*100) > 10
    for: 5m
    labels:
      severity: critical
      service: pods
    annotations:
      description: 容器 {{ $labels.name }} CPU 资源利用率大于 75% , (current value is {{ $value }})
      summary: Dev CPU 负载告警
  - alert: Pod_all_memory_usage
    expr: sort_desc(avg by(name)(irate(container_memory_usage_bytes{name!=""}[5m]))*100) > 1024*10^3*2
    for: 10m
    labels:
      severity: critical
    annotations:
      description: 容器 {{ $labels.name }} Memory 资源利用率大于 2G , (current value is {{ $value }})
      summary: Dev Memory 负载告警
  - alert: Pod_all_network_receive_usage
    expr: sum by (name)(irate(container_network_receive_bytes_total{container_name="POD"}[1m])) > 1024*1024*50
    for: 10m
    labels:
      severity: critical
    annotations:
      description: 容器 {{ $labels.name }} network_receive 资源利用率大于 50M , (current value is {{ $value }})
root@master:/etc/ansible# systemctl restart prometheus
修改为25%
root@master:/usr/local/prometheus# vim rule-linux37.yml
11 description: 容器 {{ $labels.name }} CPU 资源利用率大于 25% , (current value is {{ $value }})
root@master:/usr/local/prometheus# systemctl restart prometheus
停止服务
root@master:/usr/local/prometheus# systemctl stop prometheus
root@master2:~# systemctl stop alertmanager.service
prometheus监控haproxy:
部署haproxy_exporter:
root@harbor:/usr/local/src# ln -sv /usr/local/src/haproxy_exporter-0.10.0.linux-amd64 /usr/local/haproxy_exporter
启动服务
./haproxy_exporter --haproxy.scrape-uri=unix:/run/haproxy/admin.sock
root@master:~# vim /usr/local/prometheus/prometheus.yml
  - job_name: 'prometheus--haproxy'
    static_configs:
    - targets: ['192.168.200.200:9101']
root@master:~# systemctl restart prometheus.service