docker-compose.yml配置如下
# Monitoring stack: Prometheus, Alertmanager, Grafana, node-exporter and
# cAdvisor. Every service joins the same user-defined bridge network and
# publishes its web/metrics port on the host.
version: '3.7'

networks:
  monitor:
    driver: bridge

services:
  prometheus:
    image: prom/prometheus
    container_name: prometheus
    hostname: prometheus
    restart: always
    volumes:
      # Main configuration, plus the alert rule file that it lists under
      # rule_files (both end up side by side in /etc/prometheus).
      - /data/conf/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
      - /data/conf/prometheus/node_down.yml:/etc/prometheus/node_down.yml
    ports:
      - "9090:9090"
    networks:
      - monitor

  alertmanager:
    image: prom/alertmanager
    container_name: alertmanager
    hostname: alertmanager
    restart: always
    volumes:
      - /data/conf/prometheus/alertmanager.yml:/etc/alertmanager/alertmanager.yml
    ports:
      - "9093:9093"
    networks:
      - monitor

  grafana:
    image: grafana/grafana
    container_name: grafana
    hostname: grafana
    restart: always
    ports:
      - "3000:3000"
    networks:
      - monitor

  node-exporter:
    # NOTE(review): no host paths (/proc, /sys, /) are mounted here, so the
    # exporter presumably reports the container's view rather than the
    # host's - confirm against the node_exporter Docker instructions.
    image: quay.io/prometheus/node-exporter
    container_name: node-exporter
    hostname: node-exporter
    restart: always
    ports:
      - "9100:9100"
    networks:
      - monitor

  cadvisor:
    # NOTE(review): upstream cAdvisor images are now published as
    # gcr.io/cadvisor/cadvisor; verify whether google/cadvisor:latest is
    # still maintained before relying on it.
    image: google/cadvisor:latest
    container_name: cadvisor
    hostname: cadvisor
    restart: always
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:rw
      - /sys:/sys:ro
      # Host side is the relocated Docker data directory (set with the
      # dockerd --graph flag); adjust it to your own Docker storage path.
      - /data/docker_containers/:/var/lib/docker:ro
    ports:
      - "8080:8080"
    networks:
      - monitor
特别注意下面这个映射关系:/data/docker_containers/ 是我修改后的 Docker 存储目录(默认为 /var/lib/docker),请按自己机器上的实际目录填写。
/data/docker_containers/:/var/lib/docker:ro
在 CentOS 7 上,该存储目录是在 /usr/lib/systemd/system/docker.service 文件里通过 --graph 参数指定的:
ExecStart=/usr/bin/dockerd --graph=/data/docker_containers -H fd:// --containerd=/run/containerd/containerd.sock
prometheus.yml文件内容:
# my global config
global:
  scrape_interval: 15s  # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s  # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        # Alertmanager is addressed through the host IP and its published
        # port rather than through the compose service name.
        - targets: ['172.1.5.220:9093']
          # - alertmanager:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "node_down.yml"
  # - "first_rules.yml"
  # - "second_rules.yml"

# Scrape configurations: Prometheus itself, cAdvisor and node-exporter,
# each reached through the host IP and the port published for it.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'prometheus'
    static_configs:
      - targets: ['172.1.5.220:9090']

  - job_name: 'cadvisor'
    static_configs:
      - targets: ['172.1.5.220:8080']

  - job_name: 'node'
    # Overrides the global 15s scrape_interval for this job only.
    scrape_interval: 8s
    static_configs:
      - targets: ['172.1.5.220:9100']
node_down.yml文件内容:
# Alert rule group loaded by Prometheus through rule_files.
groups:
  - name: node_down
    rules:
      # Fires for any scrape target whose `up` series has been 0 for a
      # full minute.
      - alert: InstanceDown
        expr: up == 0
        for: 1m
        labels:
          user: test
        annotations:
          summary: "Instance {{ $labels.instance }} down"
          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minutes."
alertmanager.yml文件内容,即配置自己的邮件服务器,和接收人邮箱,各位对应修改即可
# Alertmanager configuration: send every alert by e-mail to one receiver.
# The SMTP host, account, password and recipient below are placeholders;
# replace them with your own mail server settings and keep real
# credentials out of version control.
global:
  smtp_smarthost: 'smtp.xxx.com:25'
  smtp_from: 'user@xxx.com'
  smtp_auth_username: 'user@xxx.com'
  smtp_auth_password: 'pwd123456'
  # TLS is not required for the SMTP connection.
  smtp_require_tls: false

route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 10m
  # Must match the name of an entry under `receivers`.
  receiver: live-monitoring

receivers:
  - name: 'live-monitoring'
    email_configs:
      - to: 'xxx@qq.com'
把容器跑起来后,访问 http://172.1.5.220:9090/targets 即可查看各个监控目标(target)的状态。
从图中可以看出当前均为 UP 状态(健康状态)。一开始全是 DOWN 状态,这跟防火墙设置有关,按如下方式放行相关端口即可:
# Console transcript: relax SELinux and open the stack's published ports
# in firewalld so that the scrape targets become reachable.
# NOTE(review): setenforce 0 relaxes SELinux enforcement for the whole
# host; confirm it is really required before copying this step.
[root@172-1-5-220 ~]# setenforce 0
# Show the ports currently open in the public zone (no output was printed).
[root@172-1-5-220 ~]# firewall-cmd --zone=public --list-ports
# Permanently open the ports published by docker-compose.yml:
# 9100 node-exporter, 9090 prometheus, 9093 alertmanager, 3000 grafana,
# 8080 cadvisor.
[root@172-1-5-220 ~]# firewall-cmd --zone=public --add-port=9100/tcp --permanent
success
[root@172-1-5-220 ~]# firewall-cmd --zone=public --add-port=9090/tcp --permanent
success
[root@172-1-5-220 ~]# firewall-cmd --zone=public --add-port=9093/tcp --permanent
success
[root@172-1-5-220 ~]# firewall-cmd --zone=public --add-port=3000/tcp --permanent
success
[root@172-1-5-220 ~]# firewall-cmd --zone=public --add-port=8080/tcp --permanent
success
# Reload firewalld so the permanent rules added above are applied.
[root@172-1-5-220 ~]# firewall-cmd --reload
success
接下来,登录Grafana,添加prometheus数据源
导入看板后,点击左上角的 Grafana 图标,再点击 Home,选择刚导入的 Prometheus 2.0 Stats,就可以看到效果了。
我们来试一下另一个更好看点的看板
简单的安装到此为止,有了环境,接下来就可以好好学习Prometheus了,哈哈