1. Service discovery in Kubernetes with a binary-deployed Prometheus
- Deploy Prometheus outside the Kubernetes cluster
# Create the service account and grant RBAC permissions
[root@deploy-1 case]# pwd
/k8s-data/yaml/prometheus-case-files/prometheus-files/case
[root@deploy-1 case]# kubectl apply -f case4-prom-rbac.yaml
# Check the secret
[root@deploy-1 case]# kubectl get secrets -n monitoring
NAME TYPE DATA AGE
monitoring-token kubernetes.io/service-account-token 3 32s
# Get the token
[root@deploy-1 case]# kubectl describe secrets -n monitoring monitoring-token
# Save the token into a file on the Prometheus server
[root@prometheus ~]# vim /usr/local/prometheus/k8s.token
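As an optional sanity check, the token can be tested directly against the API server before wiring it into Prometheus (this assumes the RBAC applied above allows listing nodes):
# verify the token is accepted by the API server
[root@prometheus ~]# TOKEN=$(cat /usr/local/prometheus/k8s.token)
[root@prometheus ~]# curl -sk -H "Authorization: Bearer ${TOKEN}" https://10.10.20.17:6443/api/v1/nodes | head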
- API Server endpoint discovery
- job_name: 'kubernetes-apiservers-monitor'
  kubernetes_sd_configs:
  - role: endpoints
    api_server: https://10.10.20.17:6443
    tls_config:
      insecure_skip_verify: true
    bearer_token_file: /usr/local/prometheus/k8s.token
  scheme: https
  tls_config:
    insecure_skip_verify: true
  bearer_token_file: /usr/local/prometheus/k8s.token
  relabel_configs:
  - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
    action: keep
    regex: default;kubernetes;https
  - source_labels: [__address__]
    regex: '(.*):6443'
    replacement: '${1}:9100'
    target_label: __address__
    action: replace
  - source_labels: [__scheme__]
    regex: https
    replacement: http
    target_label: __scheme__
    action: replace
- Node discovery
- job_name: 'kubernetes-nodes-monitor'
  scheme: http
  tls_config:
    insecure_skip_verify: true
  bearer_token_file: /usr/local/prometheus/k8s.token
  kubernetes_sd_configs:
  - role: node
    api_server: https://10.10.20.17:6443
    tls_config:
      insecure_skip_verify: true
    bearer_token_file: /usr/local/prometheus/k8s.token
  relabel_configs:
  - source_labels: [__address__]
    regex: '(.*):10250'
    replacement: '${1}:9100'
    target_label: __address__
    action: replace
  - source_labels: [__meta_kubernetes_node_label_failure_domain_beta_kubernetes_io_region]
    regex: '(.*)'
    replacement: '${1}'
    action: replace
    target_label: LOC
  - source_labels: [__meta_kubernetes_node_label_failure_domain_beta_kubernetes_io_region]
    regex: '(.*)'
    replacement: 'NODE'
    action: replace
    target_label: Type
  - source_labels: [__meta_kubernetes_node_label_failure_domain_beta_kubernetes_io_region]
    regex: '(.*)'
    replacement: 'K8S-test'
    action: replace
    target_label: Env
  - action: labelmap
    regex: __meta_kubernetes_node_label_(.+)
- Pods in specific namespaces
- job_name: 'kubernetes-pods-in-selected-namespaces'
  kubernetes_sd_configs:
  - role: pod
    api_server: https://10.10.20.17:6443
    tls_config:
      insecure_skip_verify: true
    bearer_token_file: /usr/local/prometheus/k8s.token
    namespaces:
      names:
      - myserver
      - magedu
  relabel_configs:
  - action: labelmap
    regex: __meta_kubernetes_pod_label_(.+)
  - source_labels: [__meta_kubernetes_namespace]
    action: replace
    target_label: kubernetes_namespace
  - source_labels: [__meta_kubernetes_pod_name]
    action: replace
    target_label: kubernetes_pod_name
- Pod discovery with annotation-based filtering
- job_name: 'kubernetes-annotated-pods'
  kubernetes_sd_configs:
  - role: pod
    api_server: https://10.10.20.17:6443
    tls_config:
      insecure_skip_verify: true
    bearer_token_file: /usr/local/prometheus/k8s.token
  relabel_configs:
  - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
    action: keep
    regex: true
  - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
    action: replace
    target_label: __metrics_path__
    regex: (.+)
  - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
    action: replace
    regex: ([^:]+)(?::\d+)?;(\d+)
    replacement: $1:$2
    target_label: __address__
  - action: labelmap
    regex: __meta_kubernetes_pod_label_(.+)
  - source_labels: [__meta_kubernetes_namespace]
    action: replace
    target_label: kubernetes_namespace
  - source_labels: [__meta_kubernetes_pod_name]
    action: replace
    target_label: kubernetes_pod_name
  - source_labels: [__meta_kubernetes_pod_label_pod_template_hash]
    regex: '(.*)'
    replacement: 'K8S-test'
    action: replace
    target_label: Env
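With all four discovery jobs in place, the merged config can be validated before restarting (promtool ships alongside the prometheus binary; the path below assumes the layout used in this article):
[root@prometheus ~]# /usr/local/prometheus/promtool check config /usr/local/prometheus/prometheus.yml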
2. Prometheus service discovery with Consul and files
- Consul-based discovery
wget https://releases.hashicorp.com/consul/1.15.2/consul_1.15.2_linux_amd64.zip
unzip consul_1.15.2_linux_amd64.zip
mv consul /usr/local/bin/
mkdir /data/consul
vim /etc/systemd/system/consul.service
[Unit]
Description=consul
After=network.target
[Service]
ExecStart=/usr/local/bin/consul agent -server -bind=0.0.0.0 -dev -client 0.0.0.0 -ui -data-dir=/data/consul -advertise=10.10.20.8
[Install]
WantedBy=multi-user.target
systemctl daemon-reload
systemctl start consul
systemctl enable consul
# Register services with Consul
curl -X PUT -d '{"id": "master1","name": "master1","address": "10.10.20.17","port":9100,"tags": ["master1"],"checks": [{"http": "http://10.10.20.17:9100/","interval": "5s"}]}' http://10.10.20.8:8500/v1/agent/service/register
curl -X PUT -d '{"id": "master3","name": "master3","address": "10.10.20.6","port":9100,"tags": ["master3"],"checks": [{"http": "http://10.10.20.6:9100/","interval": "5s"}]}' http://10.10.20.8:8500/v1/agent/service/register
curl -X PUT -d '{"id": "master2","name": "master2","address": "10.10.20.19","port":9100,"tags": ["master2"],"checks": [{"http": "http://10.10.20.19:9100/","interval": "5s"}]}' http://10.10.20.8:8500/v1/agent/service/register
curl -X PUT -d '{"id": "node1","name": "node1","address": "10.10.20.14","port":9100,"tags": ["node1"],"checks": [{"http": "http://10.10.20.14:9100/","interval": "5s"}]}' http://10.10.20.8:8500/v1/agent/service/register
curl -X PUT -d '{"id": "node3","name": "node3","address": "10.10.20.12","port":9100,"tags": ["node3"],"checks": [{"http": "http://10.10.20.12:9100/","interval": "5s"}]}' http://10.10.20.8:8500/v1/agent/service/register
curl -X PUT -d '{"id": "node2","name": "node2","address": "10.10.20.15","port":9100,"tags": ["node2"],"checks": [{"http": "http://10.10.20.15:9100/","interval": "5s"}]}' http://10.10.20.8:8500/v1/agent/service/register
# Edit the Prometheus config
vim prometheus.yml
- job_name: consul
  honor_labels: true
  metrics_path: /metrics
  scheme: http
  consul_sd_configs:
  - server: 10.10.20.8:8500  # additional Consul servers can be listed as further entries
    services: []             # service names to discover; empty means all, or e.g. servicea,serviceb,servicec
    refresh_interval: 5s     # re-check every 5 seconds (default 30s); this key belongs inside the server entry
  relabel_configs:
  - source_labels: ['__meta_consul_tags']
    target_label: 'product'
  - source_labels: ['__meta_consul_dc']
    target_label: 'idc'
  - source_labels: ['__meta_consul_service']  # drop consul's own service
    regex: "consul"
    action: drop
systemctl restart prometheus
- File-based service discovery
# Prepare the JSON target file
local_node.json
[
  {
    "targets": [ "localhost:9100" ],
    "labels": {
      "job": "local-node",
      "service": "localhost"
    }
  }
]
# Prometheus scrape config
- job_name: "prometheus"
  file_sd_configs:
  - files:
    - "/usr/local/prometheus/local_node.json"
systemctl restart prometheus
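Since file_sd watches the listed files, more targets can be added later without another restart; for example (the second node-exporter address below is only illustrative):
# append a target; Prometheus picks the change up automatically
cat > /usr/local/prometheus/local_node.json <<'EOF'
[
  {
    "targets": [ "localhost:9100", "10.10.20.14:9100" ],
    "labels": { "job": "local-node", "service": "localhost" }
  }
]
EOF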
3. Monitoring case: kube-state-metrics
- kube-state-metrics: watches the API server and generates state metrics for resource objects such as Deployments, Nodes, and Pods. Note that kube-state-metrics is not meant for liveness monitoring of these objects; instead it periodically collects state metrics about them (e.g. whether a pod is Running or Terminating, when a pod was created) and exposes them for display on a web UI or for Prometheus to scrape. The metrics it currently collects are listed in the official documentation.
Write the YAML manifest and deploy kube-state-metrics
apiVersion: apps/v1
kind: Deployment
metadata:
  name: kube-state-metrics
  namespace: kube-system
spec:
  replicas: 1
  selector:
    matchLabels:
      app: kube-state-metrics
  template:
    metadata:
      labels:
        app: kube-state-metrics
    spec:
      serviceAccountName: kube-state-metrics
      containers:
      - name: kube-state-metrics
        image: registry.cn-hangzhou.aliyuncs.com/zhangshijie/kube-state-metrics:v2.6.0
        ports:
        - containerPort: 8080
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: kube-state-metrics
  namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: kube-state-metrics
rules:
- apiGroups: [""]
  resources: ["nodes", "pods", "services", "resourcequotas", "replicationcontrollers", "limitranges", "persistentvolumeclaims", "persistentvolumes", "namespaces", "endpoints"]
  verbs: ["list", "watch"]
- apiGroups: ["apps"]  # the old "extensions" API group was removed in k8s 1.16; these resources now live in "apps"
  resources: ["daemonsets", "deployments", "replicasets", "statefulsets"]
  verbs: ["list", "watch"]
- apiGroups: ["batch"]
  resources: ["cronjobs", "jobs"]
  verbs: ["list", "watch"]
- apiGroups: ["autoscaling"]
  resources: ["horizontalpodautoscalers"]
  verbs: ["list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: kube-state-metrics
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: kube-state-metrics
subjects:
- kind: ServiceAccount
  name: kube-state-metrics
  namespace: kube-system
---
apiVersion: v1
kind: Service
metadata:
  annotations:
    prometheus.io/scrape: 'true'
  name: kube-state-metrics
  namespace: kube-system
  labels:
    app: kube-state-metrics
spec:
  type: NodePort
  ports:
  - name: kube-state-metrics
    port: 8080
    targetPort: 8080
    nodePort: 31666
    protocol: TCP
  selector:
    app: kube-state-metrics
kubectl apply -f kube-state-metrics-deploy.yaml
[root@deploy-1 case]# kubectl get pod -A | grep state
kube-system kube-state-metrics-7c4b576569-xk4nz 1/1 Running 0 97m
[root@deploy-1 case]# kubectl get svc -A| grep state
kube-system kube-state-metrics NodePort 10.100.203.178 <none> 8080:31666/TCP 98m
Configure Prometheus to scrape the data
- job_name: 'kube-state-metrics'
  static_configs:
  - targets: ["10.10.20.12:31666"]
systemctl restart prometheus
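A quick check that the NodePort answers with metrics (any node IP works; kube_pod_status_phase is one of the standard kube-state-metrics series):
curl -s http://10.10.20.12:31666/metrics | grep kube_pod_status_phase | head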
4. Monitoring cases: Tomcat, Redis, MySQL, HAProxy, Nginx
- Monitoring Tomcat
# Download the jar/war packages from https://repo1.maven.org/maven2/io/prometheus/
TOMCAT_SIMPLECLIENT_VERSION=0.8.0
TOMCAT_EXPORTER_VERSION=0.0.12
curl -O https://repo1.maven.org/maven2/io/prometheus/simpleclient/${TOMCAT_SIMPLECLIENT_VERSION}/simpleclient-${TOMCAT_SIMPLECLIENT_VERSION}.jar
curl -O https://repo1.maven.org/maven2/io/prometheus/simpleclient_common/${TOMCAT_SIMPLECLIENT_VERSION}/simpleclient_common-${TOMCAT_SIMPLECLIENT_VERSION}.jar
curl -O https://repo1.maven.org/maven2/io/prometheus/simpleclient_hotspot/${TOMCAT_SIMPLECLIENT_VERSION}/simpleclient_hotspot-${TOMCAT_SIMPLECLIENT_VERSION}.jar
curl -O https://repo1.maven.org/maven2/io/prometheus/simpleclient_servlet/${TOMCAT_SIMPLECLIENT_VERSION}/simpleclient_servlet-${TOMCAT_SIMPLECLIENT_VERSION}.jar
curl -O https://repo1.maven.org/maven2/io/prometheus/simpleclient_servlet_common/${TOMCAT_SIMPLECLIENT_VERSION}/simpleclient_servlet_common-${TOMCAT_SIMPLECLIENT_VERSION}.jar
curl -O https://repo1.maven.org/maven2/nl/nlighten/tomcat_exporter_client/${TOMCAT_EXPORTER_VERSION}/tomcat_exporter_client-${TOMCAT_EXPORTER_VERSION}.jar
curl -O https://repo1.maven.org/maven2/nl/nlighten/tomcat_exporter_servlet/${TOMCAT_EXPORTER_VERSION}/tomcat_exporter_servlet-${TOMCAT_EXPORTER_VERSION}.war
# Write the Dockerfile
FROM tomcat:8.5.73
ADD server.xml /usr/local/tomcat/conf/server.xml
RUN mkdir -p /data/tomcat/webapps
ADD myapp /data/tomcat/webapps/myapp
# tomcat_exporter_servlet-0.0.12.war, renamed to metrics.war so it is served at /metrics
ADD metrics.war /data/tomcat/webapps
ADD simpleclient-0.8.0.jar /usr/local/tomcat/lib/
ADD simpleclient_common-0.8.0.jar /usr/local/tomcat/lib/
ADD simpleclient_hotspot-0.8.0.jar /usr/local/tomcat/lib/
ADD simpleclient_servlet-0.8.0.jar /usr/local/tomcat/lib/
ADD simpleclient_servlet_common-0.8.0.jar /usr/local/tomcat/lib/
ADD tomcat_exporter_client-0.0.12.jar /usr/local/tomcat/lib/
EXPOSE 8080 8443 8009
# Build and push the image
docker build -t qj.harbor.com/prometheus/tomcat-app1:v1 .
docker push qj.harbor.com/prometheus/tomcat-app1:v1
# Write the Tomcat deployment YAML
apiVersion: apps/v1
kind: Deployment
metadata:
  name: tomcat-deployment
  namespace: default
spec:
  selector:
    matchLabels:
      app: tomcat
  replicas: 1
  template:  # create pods using the pod definition in this template
    metadata:
      labels:
        app: tomcat
      annotations:  # annotation that allows Prometheus to scrape this pod
        prometheus.io/scrape: 'true'
    spec:
      containers:
      - name: tomcat
        image: qj.harbor.com/prometheus/tomcat-app1:v1
        imagePullPolicy: Always
        ports:
        - containerPort: 8080
        securityContext:
          privileged: true
[root@deploy-1 yaml]# kubectl apply -f tomcat-deploy.yaml
[root@deploy-1 yaml]# kubectl get pod
NAME READY STATUS RESTARTS AGE
tomcat-deployment-555c8594cc-b7hzg 1/1 Running 0 8m33s
[root@deploy-1 yaml]# kubectl get svc
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
kubernetes ClusterIP 10.100.0.1 <none> 443/TCP 28d
tomcat-service NodePort 10.100.26.30 <none> 80:31080/TCP 93s
[root@deploy-1 yaml]# curl http://172.16.100.64:31080/myapp/
<h1>tomcat app1</h1>
# Configure Prometheus to scrape and verify the data
[root@prometheus prometheus]# vim prometheus.yml
- job_name: "tomcat-monitor-metrics"
static_configs:
- targets: ["172.16.100.64:31080"]
[root@prometheus prometheus]# systemctl restart prometheus
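The exporter war serves its metrics under /metrics on the application port, so the target can be verified directly (the exporter's metric names are prefixed tomcat_):
curl -s http://172.16.100.64:31080/metrics | grep '^tomcat_' | head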
Import the Grafana dashboard templates:
https://github.com/nlighten/tomcat_exporter
https://github.com/nlighten/tomcat_exporter/tree/master/dashboard
- Monitoring Redis
# Write the YAML and deploy Redis with a redis_exporter sidecar
apiVersion: apps/v1
kind: Deployment
metadata:
  name: redis
  namespace: studylinux-net
spec:
  replicas: 1
  selector:
    matchLabels:
      app: redis
  template:
    metadata:
      labels:
        app: redis
    spec:
      containers:
      - name: redis
        image: redis:4.0.14
        resources:
          requests:
            cpu: 100m
            memory: 100Mi
        ports:
        - containerPort: 6379
      - name: redis-exporter
        image: oliver006/redis_exporter:latest
        resources:
          requests:
            cpu: 100m
            memory: 100Mi
[root@deploy-1 yaml]# kubectl apply -f redis-deployment.yaml
[root@deploy-1 yaml]# kubectl apply -f redis-exporter-svc.yaml #redis-exporter-svc
[root@deploy-1 yaml]# kubectl apply -f redis-redis-svc.yaml # redis-server-svc
[root@deploy-1 yaml]# kubectl get pod -n studylinux-net
NAME READY STATUS RESTARTS AGE
redis-5b767677fc-74bw5 2/2 Running 0 51s
[root@deploy-1 yaml]# kubectl get svc -n studylinux-net
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
redis-exporter-service NodePort 10.100.51.70 <none> 9121:31082/TCP 49s
redis-redis-service NodePort 10.100.253.21 <none> 6379:31081/TCP 45s
Prometheus scrape config:
- job_name: "redis-monitor-metrics"
static_configs:
- targets: ["172.16.100.64:31082"]
[root@prometheus prometheus]# systemctl restart prometheus
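redis_exporter exposes a redis_up gauge, which gives a quick end-to-end check through the NodePort:
curl -s http://172.16.100.64:31082/metrics | grep '^redis_up'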
- Monitoring MySQL
# Install MySQL (MariaDB)
[root@harproxy ~]# yum -y install mariadb mariadb-server
# Create the monitoring user and grant privileges
[root@harproxy ~]# mysql
MariaDB [(none)]> CREATE USER 'mysql_exporter'@'localhost' IDENTIFIED BY '123456';
MariaDB [(none)]> GRANT PROCESS, REPLICATION CLIENT, SELECT ON *.* TO 'mysql_exporter'@'localhost';
# Set up mysqld_exporter:
[root@harproxy src]# wget https://github.com/prometheus/mysqld_exporter/releases/download/v0.14.0/mysqld_exporter-0.14.0.linux-amd64.tar.gz
[root@harproxy src]# tar -xf mysqld_exporter-0.14.0.linux-amd64.tar.gz
[root@harproxy src]# mv mysqld_exporter-0.14.0.linux-amd64 /usr/local/mysqld_exporter
[root@harproxy ~]# vim /usr/local/mysqld_exporter/.my.cnf
[client]
user=mysql_exporter
password=123456
[root@harproxy ~]# vim /etc/systemd/system/mysqld_exporter.service
[Unit]
Description=Prometheus MySQL Exporter
After=network.target
[Service]
ExecStart=/usr/local/mysqld_exporter/mysqld_exporter --config.my-cnf=/usr/local/mysqld_exporter/.my.cnf
[Install]
WantedBy=multi-user.target
[root@harproxy ~]# systemctl daemon-reload
[root@harproxy ~]# systemctl start mysqld_exporter.service
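mysqld_exporter listens on 9104 by default; mysql_up reports whether it can reach the database:
[root@harproxy ~]# curl -s http://127.0.0.1:9104/metrics | grep '^mysql_up'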
# Prometheus scrape config:
vim prometheus.yml
- job_name: "mysql-monitor-metrics"
static_configs:
- targets: ["172.16.100.80:9104"]
systemctl restart prometheus.service
- Monitoring HAProxy
# Install HAProxy
[root@harproxy ~]# yum -y install haproxy
# Edit the HAProxy config: enable the admin socket and the stats page
[root@harproxy ~]# vim /etc/haproxy/haproxy.cfg
global
    log         127.0.0.1 local0
    log         127.0.0.1 local1 notice
    chroot      /var/lib/haproxy
    pidfile     /var/run/haproxy.pid
    stats socket /var/lib/haproxy/stats.sock mode 660 level admin
    stats timeout 30s
    user        haproxy
    group       haproxy
    daemon
defaults
    mode                    http
    log                     global
    #option                 httplog
    option                  dontlognull
    option                  http-server-close
    option                  redispatch
    retries                 3
    timeout http-request    10s
    timeout queue           1m
    timeout connect         10s
    timeout client          1m
    timeout server          1m
    timeout http-keep-alive 10s
    timeout check           10s
    maxconn                 3000
listen stats
    bind :9999
    stats enable
    stats uri /haproxy-status
    stats auth admin:123456
listen ingress-80
    bind 10.10.20.10:80
    mode tcp
    server 10.10.20.14 10.10.20.14:58561 check inter 3s fall 3 rise 3
    server 10.10.20.15 10.10.20.15:58561 check inter 3s fall 3 rise 3
    server 10.10.20.12 10.10.20.12:58561 check inter 3s fall 3 rise 3
listen ingress-443
    bind 10.10.20.10:443
    mode tcp
    server 10.10.20.14 10.10.20.14:39257 check inter 3s fall 3 rise 3
    server 10.10.20.15 10.10.20.15:39257 check inter 3s fall 3 rise 3
    server 10.10.20.12 10.10.20.12:39257 check inter 3s fall 3 rise 3
[root@harproxy ~]# systemctl start haproxy
[root@harproxy ~]# systemctl enable haproxy
# Install haproxy_exporter
[root@harproxy ~]# wget https://github.com/prometheus/haproxy_exporter/releases/download/v0.15.0/haproxy_exporter-0.15.0.linux-amd64.tar.gz
[root@harproxy ~]# tar -xf haproxy_exporter-0.15.0.linux-amd64.tar.gz
[root@harproxy ~]# mv haproxy_exporter-0.15.0.linux-amd64 /usr/local/haproxy_exporter
[root@harproxy ~]# vim /etc/systemd/system/haproxy_exporter.service
[Unit]
Description=Prometheus Haproxy_exporter
After=network.target
[Service]
# scrape via the admin socket (the path must match the stats socket configured above)
ExecStart=/usr/local/haproxy_exporter/haproxy_exporter --haproxy.scrape-uri=unix:/var/lib/haproxy/stats.sock
# or scrape via the stats page:
#ExecStart=/usr/local/haproxy_exporter/haproxy_exporter --haproxy.scrape-uri=http://admin:123456@127.0.0.1:9999/haproxy-status;csv
[Install]
WantedBy=multi-user.target
[root@harproxy ~]# systemctl daemon-reload
[root@harproxy ~]# systemctl start haproxy_exporter
[root@harproxy ~]# systemctl enable haproxy_exporter
# Add the job to Prometheus:
- job_name: "haproxy-monitor-metrics"
static_configs:
- targets: ["172.16.100.80:9101"]
- Monitoring Nginx
# Build and install nginx with the vts module:
[root@harproxy ~]# git clone https://github.com/vozlt/nginx-module-vts.git
[root@harproxy ~]# wget http://nginx.org/download/nginx-1.20.2.tar.gz
[root@harproxy ~]# tar -xf nginx-1.20.2.tar.gz
[root@harproxy ~]# cd nginx-1.20.2/
[root@harproxy nginx-1.20.2]# ./configure --prefix=/usr/local/nginx --with-http_ssl_module --with-http_v2_module --with-http_realip_module --with-http_stub_status_module --with-http_gzip_static_module --with-pcre --with-file-aio --with-stream --with-stream_ssl_module --with-stream_realip_module --add-module=/root/nginx-module-vts/
[root@harproxy nginx-1.20.2]# make && make install
[root@harproxy nginx-1.20.2]# vim /usr/local/nginx/conf/nginx.conf
# in the http block:
vhost_traffic_status_zone;  # enable traffic status collection
# in the server block:
location / {
    root   html;
    index  index.html index.htm;
    proxy_pass http://10.10.20.8:9090;  # reverse proxy to Prometheus
}
location /status {
    vhost_traffic_status_display;
    vhost_traffic_status_display_format html;
}
[root@harproxy nginx-1.20.2]# /usr/local/nginx/sbin/nginx -t
[root@harproxy nginx-1.20.2]# /usr/local/nginx/sbin/nginx
# Install the nginx vts exporter
[root@harproxy ~]# wget https://github.com/hnlq715/nginx-vts-exporter/releases/download/v0.10.3/nginx-vts-exporter-0.10.3.linux-amd64.tar.gz
[root@harproxy ~]# tar -xf nginx-vts-exporter-0.10.3.linux-amd64.tar.gz
[root@harproxy ~]# mv nginx-vts-exporter-0.10.3.linux-amd64 /usr/local/nginx_exporter
[root@harproxy ~]# vim /etc/systemd/system/nginx_exporter.service
[Unit]
Description=nginx-exporter
After=network.target
[Service]
ExecStart=/usr/local/nginx_exporter/nginx-vts-exporter -nginx.scrape_uri http://10.10.20.10/status/format/json
[Install]
WantedBy=multi-user.target
[root@harproxy ~]# systemctl daemon-reload
[root@harproxy ~]# systemctl start nginx_exporter.service
[root@harproxy ~]# systemctl enable nginx_exporter.service
[root@prometheus prometheus]# vim prometheus.yml
- job_name: "nginx-monitor-metrics"
static_configs:
- targets: ["172.16.100.80:9913"]
[root@prometheus prometheus]# systemctl restart prometheus
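Both the vts status page and the exporter can be checked by hand (9913 is the exporter's default port; its metric names are prefixed nginx_):
[root@harproxy ~]# curl -s http://10.10.20.10/status/format/json | head -c 300; echo
[root@harproxy ~]# curl -s http://127.0.0.1:9913/metrics | grep '^nginx_' | head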
# Import Grafana dashboard 2949
5. Monitoring URL status, IP reachability, port status, and TLS certificate expiry with blackbox_exporter
blackbox_exporter is an official Prometheus exporter that can probe targets over HTTP, HTTPS, DNS, TCP, ICMP and more, providing monitoring and data collection for the probed endpoints.
- Deploy blackbox_exporter
[root@prometheus ~]# wget https://github.com/prometheus/blackbox_exporter/releases/download/v0.23.0/blackbox_exporter-0.23.0.linux-amd64.tar.gz
[root@prometheus ~]# tar xvf blackbox_exporter-0.23.0.linux-amd64.tar.gz
[root@prometheus ~]# ln -s /root/blackbox_exporter-0.23.0.linux-amd64 /usr/local/blackbox_exporter
[root@prometheus ~]# vim /etc/systemd/system/blackbox_exporter.service
[Unit]
Description=blackbox_exporter
Documentation=https://prometheus.io/
After=network.target
[Service]
Type=simple
User=root
Group=root
ExecStart=/usr/local/blackbox_exporter/blackbox_exporter --config.file=/usr/local/blackbox_exporter/blackbox.yml --web.listen-address=:9115
Restart=on-failure
[Install]
WantedBy=multi-user.target
[root@prometheus ~]# systemctl daemon-reload
[root@prometheus ~]# systemctl start blackbox_exporter
[root@prometheus ~]# systemctl enable blackbox_exporter
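A probe can be exercised manually before adding any scrape jobs; probe_success is 1 when the target passes the module's checks:
[root@prometheus ~]# curl -s 'http://127.0.0.1:9115/probe?module=http_2xx&target=http://www.xiaomi.com' | grep '^probe_success'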
- URL status and TLS certificate expiry monitoring
- job_name: 'http_status'
  metrics_path: /probe
  params:
    module: [http_2xx]
  static_configs:
  - targets: ['http://www.xiaomi.com', 'http://www.magedu.com']
    labels:
      instance: http_status
      group: web
  relabel_configs:
  - source_labels: [__address__]
    target_label: __param_target
  - source_labels: [__param_target]  # take the probe target from __param_target
    target_label: url                # and attach it to the series as a "url" label
  - target_label: __address__
    replacement: 127.0.0.1:9115
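For targets probed over HTTPS, the same http module also yields probe_ssl_earliest_cert_expiry; days until expiry can then be pulled from the Prometheus HTTP API (local Prometheus address assumed):
curl -s 'http://127.0.0.1:9090/api/v1/query' --data-urlencode 'query=(probe_ssl_earliest_cert_expiry - time()) / 86400'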
- IP reachability monitoring
- job_name: 'ping_status'
  metrics_path: /probe
  params:
    module: [icmp]
  static_configs:
  - targets: ['10.10.20.12', '10.10.20.14']
    labels:
      instance: 'ping_status'
      group: 'icmp'
  relabel_configs:
  - source_labels: [__address__]
    target_label: __param_target
  - source_labels: [__param_target]
    target_label: ip
  - target_label: __address__
    replacement: 127.0.0.1:9115
- Port status monitoring
- job_name: 'port_status'
  metrics_path: /probe
  params:
    module: [tcp_connect]
  static_configs:
  - targets: ['10.10.20.10:80', '10.10.20.10:5601', '10.10.20.10:3306']
    labels:
      instance: 'port_status'
      group: 'port'
  relabel_configs:
  - source_labels: [__address__]
    target_label: __param_target
  - source_labels: [__param_target]
    target_label: ip
  - target_label: __address__
    replacement: 127.0.0.1:9115
[root@prometheus prometheus]# systemctl restart prometheus
6. Alerting via DingTalk and WeChat Work, alert templates, and routing alerts by category
- Alertmanager
Prometheus --> threshold crossed --> duration exceeded --> Alertmanager --> grouping | inhibition | silencing --> notification channel --> email | DingTalk | WeChat, etc.
Grouping: merge alerts of a similar nature into a single notification, e.g. network, host, or service alerts.
Silencing: a simple mechanism for muting alerts during a specific time window, e.g. around a planned server upgrade or maintenance.
Inhibition: once an alert has fired, stop sending the other alerts it causes, i.e. collapse the multiple alerts triggered by a single failure and eliminate redundant notifications.
Alerting to DingTalk
- Deploy Alertmanager
[root@prometheus src]# wget https://github.com/prometheus/alertmanager/releases/download/v0.24.0/alertmanager-0.24.0.linux-amd64.tar.gz
[root@prometheus src]# tar -xf alertmanager-0.24.0.linux-amd64.tar.gz
[root@prometheus src]# mv alertmanager-0.24.0.linux-amd64 /usr/local/alertmanager
[root@prometheus src]# vim /etc/systemd/system/alertmanager.service
[Unit]
Description=Prometheus Alertmanager
Documentation=https://prometheus.io/docs/introduction/overview/
After=network.target
[Service]
Restart=on-failure
ExecStart=/usr/local/alertmanager/alertmanager --config.file=/usr/local/alertmanager/alertmanager.yml --cluster.advertise-address=0.0.0.0:9093
[Install]
WantedBy=multi-user.target
[root@prometheus src]# systemctl daemon-reload
[root@prometheus src]# systemctl start alertmanager
[root@prometheus src]# systemctl enable alertmanager
- Deploy prometheus-webhook-dingtalk
[root@prometheus src]# wget https://github.com/timonwong/prometheus-webhook-dingtalk/releases/download/v2.1.0/prometheus-webhook-dingtalk-2.1.0.linux-amd64.tar.gz
[root@prometheus src]# tar -xf prometheus-webhook-dingtalk-2.1.0.linux-amd64.tar.gz
[root@prometheus src]# mv prometheus-webhook-dingtalk-2.1.0.linux-amd64 /usr/local/prometheus-webhook-dingtalk
[root@prometheus src]# cd /usr/local/prometheus-webhook-dingtalk
[root@prometheus prometheus-webhook-dingtalk]# cp config.example.yml config.yml
[root@prometheus prometheus-webhook-dingtalk]# vim config.yml
timeout: 5s
templates:
  - contrib/templates/legacy/template.tmpl
targets:
  webhook1:
    url: https://oapi.dingtalk.com/robot/send?access_token=93bc0401a58899e18d665ad1c7885f272df73003302ecc68176a6f358c3e162b  # DingTalk group robot token
    secret: SEC000000000000000000000
  webhook_legacy:
    url: https://oapi.dingtalk.com/robot/send?access_token=93bc0401a58899e18d665ad1c7885f272df73003302ecc68176a6f358c3e162b  # DingTalk group robot token
    message:
      title: '{{ template "legacy.title" . }}'
      text: '{{ template "legacy.content" . }}'
  webhook_mention_all:
    url: https://oapi.dingtalk.com/robot/send?access_token=93bc0401a58899e18d665ad1c7885f272df73003302ecc68176a6f358c3e162b  # DingTalk group robot token
    mention:
      all: true
  webhook_mention_users:
    url: https://oapi.dingtalk.com/robot/send?access_token=93bc0401a58899e18d665ad1c7885f272df73003302ecc68176a6f358c3e162b  # DingTalk group robot token
    mention:
      mobiles: ['13816457159']
[root@prometheus prometheus-webhook-dingtalk]# vim /etc/systemd/system/prometheus-webhook.service
[Unit]
Description=https://github.com/timonwong/prometheus-webhook-dingtalk/releases/
After=network-online.target
[Service]
Restart=on-failure
ExecStart=/usr/local/prometheus-webhook-dingtalk/prometheus-webhook-dingtalk --config.file=/usr/local/prometheus-webhook-dingtalk/config.yml
[Install]
WantedBy=multi-user.target
[root@prometheus prometheus-webhook-dingtalk]# systemctl daemon-reload
[root@prometheus prometheus-webhook-dingtalk]# systemctl start prometheus-webhook.service
[root@prometheus prometheus-webhook-dingtalk]# systemctl enable prometheus-webhook.service
- Configure Prometheus alerting rules
[root@prometheus prometheus]# vim prometheus.yml
alerting:
  alertmanagers:
  - static_configs:
    - targets:
      - 127.0.0.1:9093
rule_files:
  - "first_rules.yml"
[root@prometheus prometheus]# cat first_rules.yml
groups:
- name: Linux
  rules:
  - alert: "Memory usage alert"
    expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 > 85
    for: 60s
    labels:
      severity: warning
    annotations:
      description: "{{ $labels.instance }} memory utilization is above 85%"
      value: "current memory usage: {{ $value }}%"
      summary: "{{ $labels.job }}"
  - alert: "CPU usage alert"
    expr: 100 * (1 - avg(irate(node_cpu_seconds_total{mode="idle"}[2m])) by (instance)) > 90
    for: 60s
    labels:
      severity: warning
    annotations:
      description: "{{ $labels.instance }} CPU utilization is above 90%"
      value: "current CPU usage: {{ $value }}%"
      summary: "{{ $labels.job }}"
  - alert: "Disk usage alert"
    expr: 100 * (node_filesystem_size_bytes{fstype=~"xfs|ext4"} - node_filesystem_avail_bytes) / node_filesystem_size_bytes > 90
    for: 60s
    labels:
      severity: warning
    annotations:
      description: "Disk utilization is above 90%, please expand capacity in time!"
      value: "current disk usage: {{ $value }}%"
      summary: "{{ $labels.job }}"
[root@prometheus prometheus]# systemctl restart prometheus
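Both the rule file and the main config can be validated before restarting (promtool path assumes the layout used in this article):
[root@prometheus prometheus]# /usr/local/prometheus/promtool check rules first_rules.yml
[root@prometheus prometheus]# /usr/local/prometheus/promtool check config prometheus.yml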
- Configure Alertmanager to send to DingTalk
[root@prometheus alertmanager]# cat alertmanager.yml
global:
  resolve_timeout: 5m
  smtp_smarthost: 'smtp.qq.com:25'
  smtp_from: '530387590@qq.com'
  smtp_auth_username: '530387590@qq.com'
  smtp_auth_password: 'isvfwhvzrkugbgjc'
  smtp_require_tls: false
route:
  group_by: ['alertname']
  group_wait: 5s
  group_interval: 5s
  repeat_interval: 5m
  receiver: 'dingding'
receivers:
- name: dingding
  webhook_configs:
  - url: 'http://127.0.0.1:8060/dingtalk/webhook1/send'
inhibit_rules:
- source_match:
    severity: 'critical'
  target_match:
    severity: 'warning'
  equal: ['alertname', 'dev', 'instance']
[root@prometheus alertmanager]# systemctl restart alertmanager.service
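With everything running, a synthetic alert pushed straight into Alertmanager exercises the DingTalk path end to end (amtool ships in the Alertmanager tarball; the label values here are illustrative):
[root@prometheus alertmanager]# ./amtool alert add TestAlert severity=warning instance=test --alertmanager.url=http://127.0.0.1:9093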
Alerting to WeChat Work
cat alertmanager.yml
global:
  resolve_timeout: 5m
  smtp_smarthost: 'smtp.qq.com:25'
  smtp_from: '530387590@qq.com'
  smtp_auth_username: '530387590@qq.com'
  smtp_auth_password: 'isvfwhvzrkugbgjc'
  smtp_require_tls: false
route:
  group_by: [alertname]
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 10m
  receiver: wechat
receivers:
- name: 'wechat'
  wechat_configs:
  - corp_id: ww1fdca58c3d0b1749
    to_party: '1'
    agent_id: '1000004'
    api_secret: Cay2S4ARSotKiznJ4L8umka-j6ec9i-ZP0igNCwEwxM
    send_resolved: true
Using alert templates
[root@prometheus alertmanager]# cat alertmanager.yml
global:
  resolve_timeout: 5m
  smtp_smarthost: 'smtp.qq.com:25'
  smtp_from: '530387590@qq.com'
  smtp_auth_username: '530387590@qq.com'
  smtp_auth_password: 'isvfwhvzrkugbgjc'
  smtp_require_tls: false
templates:
  - '/usr/local/alertmanager/template.templ'  # template file referenced by Alertmanager
route:
  group_by: ['alertname']
  group_wait: 5s
  group_interval: 5s
  repeat_interval: 5m
  receiver: 'dingding'
receivers:
- name: dingding
  webhook_configs:
  - url: 'http://127.0.0.1:8060/dingtalk/webhook1/send'
inhibit_rules:
- source_match:
    severity: 'critical'
  target_match:
    severity: 'warning'
  equal: ['alertname', 'dev', 'instance']
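The referenced template file itself is not shown above; a minimal sketch of what it might contain (the define name and fields are illustrative, not the actual template used here):
cat > /usr/local/alertmanager/template.templ <<'EOF'
{{ define "dingding.default.message" }}
{{ range .Alerts }}
Alert:       {{ .Labels.alertname }}
Severity:    {{ .Labels.severity }}
Instance:    {{ .Labels.instance }}
Description: {{ .Annotations.description }}
{{ end }}
{{ end }}
EOF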
Routing alerts by category
- Prometheus rules config
[root@prometheus prometheus]# cat /usr/local/prometheus/local_linux_local.yml
groups:
- name: localhost-rules
  rules:
  - alert: "Host down alert"
    expr: (up{service="localhost"} == 0) * on(instance) group_left(nodename) (node_uname_info)
    for: 3m
    labels:
      severity: critical
      service: localhost
    annotations:
      description: "The server is down!"
  - alert: "Memory usage alert"
    expr: (100 - node_memory_MemAvailable_bytes{service="localhost"} / node_memory_MemTotal_bytes{service="localhost"} * 100) * on(instance) group_left(nodename) (node_uname_info) > 90
    for: 60s
    labels:
      severity: warning
      service: localhost
    annotations:
      description: "Memory utilization is above 90%"
      value: "current memory usage: {{ $value }}%"
  - alert: "CPU usage alert"
    expr: 100 * (1 - avg(irate(node_cpu_seconds_total{mode="idle",service="localhost"}[2m])) by (instance)) * on(instance) group_left(nodename) (node_uname_info) > 90
    for: 60s
    labels:
      severity: warning
      service: localhost
    annotations:
      description: "CPU utilization is above 90%"
      value: "current CPU usage: {{ $value }}%"
  - alert: "Disk usage alert"
    expr: 100 * (node_filesystem_size_bytes{fstype=~"xfs|ext4",service="localhost"} - node_filesystem_avail_bytes{service="localhost"}) / node_filesystem_size_bytes{service="localhost"} * on(instance) group_left(nodename) (node_uname_info) > 90
    for: 60s
    labels:
      severity: warning
      service: localhost
    annotations:
      description: "Disk utilization is above 90%, please expand capacity in time!"
      value: "current disk usage: {{ $value }}%"
- Alertmanager config:
[root@prometheus alertmanager]# cat /usr/local/alertmanager/alertmanager.yml
global:
  resolve_timeout: 5m
  smtp_smarthost: 'smtp.qq.com:25'
  smtp_from: '530387590@qq.com'
  smtp_auth_username: '530387590@qq.com'
  smtp_auth_password: 'isvfwhvzrkugbgjc'
  smtp_require_tls: false
route:
  group_by: [alertname]
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 10m
  receiver: 'dingding'  # default receiver: DingTalk
  # message routing
  routes:
  - receiver: 'default-receiver'  # critical alerts go to the team leader by email
    group_wait: 10s
    match_re:
      severity: critical  # match critical-severity alerts
  - receiver: 'wechat-k8s-worknode'  # host alerts go to the monitoring team via WeChat Work
    group_wait: 10s
    match_re:
      service: localhost  # match alerts from this host
receivers:
- name: 'default-receiver'
  email_configs:
  - to: '13816457159@163.com'
    send_resolved: true
- name: dingding
  webhook_configs:
  - url: 'http://172.31.2.120:8060/dingtalk/alertname/send'
    send_resolved: true
- name: 'wechat'
  wechat_configs:
  - corp_id: ww4c893118fbf4d07c
    to_party: '1'
    agent_id: '1000004'
    api_secret: Cay2S4ARSotKiznJ4L8umka-j6ec9i-ZP0igNCwEwxM
    send_resolved: true
- name: 'wechat-k8s-worknode'
  wechat_configs:
  - corp_id: ww4c893118fbf4d07c
    to_party: '2'
    agent_id: '1000004'
    api_secret: Cay2S4ARSotKiznJ4L8umka-j6ec9i-ZP0igNCwEwxM
    send_resolved: true
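amtool can validate the routing tree and receivers before reloading:
[root@prometheus alertmanager]# ./amtool check-config /usr/local/alertmanager/alertmanager.yml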