Week 11

Part 1: Service discovery for Kubernetes with a binary-deployed Prometheus

  • Deploying Prometheus outside the Kubernetes cluster
# Create the service account and grant RBAC permissions
[root@deploy-1 case]# pwd
/k8s-data/yaml/prometheus-case-files/prometheus-files/case

[root@deploy-1 case]# kubectl apply -f case4-prom-rbac.yaml

# Check the secret
[root@deploy-1 case]# kubectl get secrets  -n monitoring 
NAME               TYPE                                  DATA   AGE
monitoring-token   kubernetes.io/service-account-token   3      32s

# Retrieve the token
[root@deploy-1 case]# kubectl describe  secrets -n monitoring monitoring-token

# Save the token to a file on the Prometheus server
[root@prometheus ~]# vim /usr/local/prometheus/k8s.token
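To avoid pasting the token by hand, it can be extracted and checked in one step (a sketch; the secret name monitoring-token is taken from the output above, and the curl only succeeds if the RBAC role allows listing nodes):

# On the deploy node: decode the token from the secret
[root@deploy-1 case]# kubectl -n monitoring get secret monitoring-token -o jsonpath='{.data.token}' | base64 -d > k8s.token

# Copy it to /usr/local/prometheus/k8s.token on the Prometheus server, then confirm the API server accepts it:
[root@prometheus ~]# curl -sk -H "Authorization: Bearer $(cat /usr/local/prometheus/k8s.token)" https://10.10.20.17:6443/api/v1/nodes | head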
  • API Server endpoint discovery
  - job_name: 'kubernetes-apiservers-monitor'
    kubernetes_sd_configs:
    - role: endpoints
      api_server: https://10.10.20.17:6443
      tls_config:
        insecure_skip_verify: true
      bearer_token_file: /usr/local/prometheus/k8s.token
    scheme: https
    tls_config:
      insecure_skip_verify: true
    bearer_token_file: /usr/local/prometheus/k8s.token
    relabel_configs:

    - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
      action: keep
      regex: default;kubernetes;https
    - source_labels: [__address__]
      regex: '(.*):6443'
      replacement: '${1}:9100'
      target_label: __address__
      action: replace
    - source_labels: [__scheme__]
      regex: https
      replacement: http
      target_label: __scheme__
      action: replace
  • Node discovery
  - job_name: 'kubernetes-nodes-monitor'
    scheme: http
    tls_config:
      insecure_skip_verify: true
    bearer_token_file: /usr/local/prometheus/k8s.token
    kubernetes_sd_configs:
    - role: node
      api_server: https://10.10.20.17:6443
      tls_config:
        insecure_skip_verify: true
      bearer_token_file: /usr/local/prometheus/k8s.token
    relabel_configs:
      - source_labels: [__address__]
        regex: '(.*):10250'
        replacement: '${1}:9100'
        target_label: __address__
        action: replace
      - source_labels: [__meta_kubernetes_node_label_failure_domain_beta_kubernetes_io_region]
        regex: '(.*)'
        replacement: '${1}'
        action: replace
        target_label: LOC
      - source_labels: [__meta_kubernetes_node_label_failure_domain_beta_kubernetes_io_region]
        regex: '(.*)'
        replacement: 'NODE'
        action: replace
        target_label: Type
      - source_labels: [__meta_kubernetes_node_label_failure_domain_beta_kubernetes_io_region]
        regex: '(.*)'
        replacement: 'K8S-test'
        action: replace
        target_label: Env
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
  • Pods in specified namespaces
  - job_name: 'kubernetes-pods-in-specified-namespaces'
    kubernetes_sd_configs:
    - role: pod
      api_server: https://10.10.20.17:6443
      tls_config:
        insecure_skip_verify: true
      bearer_token_file: /usr/local/prometheus/k8s.token
      namespaces:
        names:
        - myserver
        - magedu
    relabel_configs:
    - action: labelmap
      regex: __meta_kubernetes_pod_label_(.+)
    - source_labels: [__meta_kubernetes_namespace]
      action: replace
      target_label: kubernetes_namespace
    - source_labels: [__meta_kubernetes_pod_name]
      action: replace
      target_label: kubernetes_pod_name
  • Pod discovery with filter conditions
  - job_name: 'kubernetes-pods-matching-conditions'
    kubernetes_sd_configs:
    - role: pod
      api_server: https://10.10.20.17:6443
      tls_config:
        insecure_skip_verify: true
      bearer_token_file: /usr/local/prometheus/k8s.token
    relabel_configs:
    - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
      action: keep
      regex: true
    - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
      action: replace
      target_label: __metrics_path__
      regex: (.+)
    - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
      action: replace
      regex: ([^:]+)(?::\d+)?;(\d+)
      replacement: $1:$2
      target_label: __address__
    - action: labelmap
      regex: __meta_kubernetes_pod_label_(.+)
    - source_labels: [__meta_kubernetes_namespace]
      action: replace
      target_label: kubernetes_namespace
    - source_labels: [__meta_kubernetes_pod_name]
      action: replace
      target_label: kubernetes_pod_name
    - source_labels: [__meta_kubernetes_pod_label_pod_template_hash]
      regex: '(.*)'
      replacement: 'K8S-test'
      action: replace
      target_label: Env
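For the keep rule on prometheus_io_scrape above to match, target pods must carry the corresponding annotations. A minimal hypothetical pod template fragment:

  template:
    metadata:
      annotations:
        prometheus.io/scrape: "true"
        prometheus.io/path: "/metrics"
        prometheus.io/port: "8080"

After assembling the jobs, the whole file can be validated before a restart (promtool ships in the Prometheus tarball):

[root@prometheus ~]# /usr/local/prometheus/promtool check config /usr/local/prometheus/prometheus.yml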

Part 2: Prometheus service discovery via Consul and files

  • Consul-based discovery
wget https://releases.hashicorp.com/consul/1.15.2/consul_1.15.2_linux_amd64.zip

unzip consul_1.15.2_linux_amd64.zip

mv consul /usr/local/bin/

mkdir /data/consul 

vim /etc/systemd/system/consul.service
[Unit]
Description=consul
After=network.target

[Service]
ExecStart=/usr/local/bin/consul agent -server -bind=0.0.0.0 -dev -client 0.0.0.0 -ui -data-dir=/data/consul -advertise=10.10.20.8

[Install]
WantedBy=multi-user.target

systemctl  daemon-reload

systemctl  start consul

systemctl  enable consul

# Register the services with Consul
curl -X PUT -d '{"id": "master1","name": "master1","address": "10.10.20.17","port":9100,"tags": ["master1"],"checks": [{"http": "http://10.10.20.17:9100/","interval": "5s"}]}' http://10.10.20.8:8500/v1/agent/service/register

curl -X PUT -d '{"id": "master3","name": "master3","address": "10.10.20.6","port":9100,"tags": ["master3"],"checks": [{"http": "http://10.10.20.6:9100/","interval": "5s"}]}' http://10.10.20.8:8500/v1/agent/service/register

curl -X PUT -d '{"id": "master2","name": "master2","address": "10.10.20.19","port":9100,"tags": ["master2"],"checks": [{"http": "http://10.10.20.19:9100/","interval": "5s"}]}' http://10.10.20.8:8500/v1/agent/service/register

curl -X PUT -d '{"id": "node1","name": "node1","address": "10.10.20.14","port":9100,"tags": ["node1"],"checks": [{"http": "http://10.10.20.14:9100/","interval": "5s"}]}' http://10.10.20.8:8500/v1/agent/service/register

curl -X PUT -d '{"id": "node3","name": "node3","address": "10.10.20.12","port":9100,"tags": ["node3"],"checks": [{"http": "http://10.10.20.12:9100/","interval": "5s"}]}' http://10.10.20.8:8500/v1/agent/service/register

curl -X PUT -d '{"id": "node2","name": "node2","address": "10.10.20.15","port":9100,"tags": ["node2"],"checks": [{"http": "http://10.10.20.15:9100/","interval": "5s"}]}' http://10.10.20.8:8500/v1/agent/service/register

# Edit the Prometheus configuration
vim prometheus.yml
  - job_name: consul
    honor_labels: true
    metrics_path: /metrics
    scheme: http
    consul_sd_configs:
      - server: 10.10.20.8:8500  # additional Consul servers can be added as further list entries
        services: []  # service names to discover; empty means all services, or list e.g. servicea,serviceb,servicec
        refresh_interval: 5s  # re-check every 5 seconds (default 30s)
    relabel_configs:
    - source_labels: ['__meta_consul_tags']
      target_label: 'product'
    - source_labels: ['__meta_consul_dc']
      target_label: 'idc'
    - source_labels: ['__meta_consul_service']  # drop Consul's own built-in service
      regex: "consul"
      action: drop

systemctl  restart  prometheus
  • File-based service discovery
# Prepare the JSON target file
local_node.json 
[
    {
        "targets": [ "localhost:9100" ],
        "labels": {
          "job": "local-node",
          "service": "localhost"
        }
    }
]
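Prometheus watches file_sd files for changes, so targets can be added without a restart. A sketch appending a second (hypothetical) node to the same file:

[
    {
        "targets": [ "localhost:9100", "10.10.20.14:9100" ],
        "labels": {
          "job": "local-node",
          "service": "localhost"
        }
    }
]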

# Prometheus scrape config
  - job_name: "prometheus"
    file_sd_configs:
    - files:
      - "/usr/local/prometheus/local_node.json"

systemctl  restart  prometheus

Part 3: Prometheus monitoring case: kube-state-metrics

kube-state-metrics listens to the API server and generates state metrics for resource objects such as Deployments, Nodes, and Pods. Note that it is not meant to monitor whether targets are alive; rather, it periodically collects state metrics about the target objects (e.g. whether a pod is running or terminating, when a pod was created) and exposes them on a web endpoint to be displayed or scraped by Prometheus. The full set of collected metrics is listed in the official documentation.
Write the YAML manifests and deploy kube-state-metrics:

apiVersion: apps/v1
kind: Deployment
metadata:
  name: kube-state-metrics
  namespace: kube-system
spec:
  replicas: 1
  selector:
    matchLabels:
      app: kube-state-metrics
  template:
    metadata:
      labels:
        app: kube-state-metrics
    spec:
      serviceAccountName: kube-state-metrics
      containers:
      - name: kube-state-metrics
        image: registry.cn-hangzhou.aliyuncs.com/zhangshijie/kube-state-metrics:v2.6.0 
        ports:
        - containerPort: 8080

---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: kube-state-metrics
  namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: kube-state-metrics
rules:
- apiGroups: [""]
  resources: ["nodes", "pods", "services", "resourcequotas", "replicationcontrollers", "limitranges", "persistentvolumeclaims", "persistentvolumes", "namespaces", "endpoints"]
  verbs: ["list", "watch"]
- apiGroups: ["extensions"]
  resources: ["daemonsets", "deployments", "replicasets"]
  verbs: ["list", "watch"]
- apiGroups: ["apps"]
  resources: ["statefulsets"]
  verbs: ["list", "watch"]
- apiGroups: ["batch"]
  resources: ["cronjobs", "jobs"]
  verbs: ["list", "watch"]
- apiGroups: ["autoscaling"]
  resources: ["horizontalpodautoscalers"]
  verbs: ["list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: kube-state-metrics
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: kube-state-metrics
subjects:
- kind: ServiceAccount
  name: kube-state-metrics
  namespace: kube-system

---
apiVersion: v1
kind: Service
metadata:
  annotations:
    prometheus.io/scrape: 'true'
  name: kube-state-metrics
  namespace: kube-system
  labels:
    app: kube-state-metrics
spec:
  type: NodePort
  ports:
  - name: kube-state-metrics
    port: 8080
    targetPort: 8080
    nodePort: 31666
    protocol: TCP
  selector:
    app: kube-state-metrics

kubectl apply -f kube-state-metrics-deploy.yaml

[root@deploy-1 case]# kubectl get pod -A | grep state
kube-system            kube-state-metrics-7c4b576569-xk4nz              1/1     Running     0            97m

[root@deploy-1 case]# kubectl get svc  -A| grep state
kube-system            kube-state-metrics                   NodePort    10.100.203.178   <none>        8080:31666/TCP                 98m
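Before wiring it into Prometheus, the endpoint can be spot-checked through the NodePort (10.10.20.12 is one of the worker nodes in this environment):

[root@deploy-1 case]# curl -s http://10.10.20.12:31666/metrics | head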

Configure Prometheus to scrape the metrics
  - job_name: 'kube-state-metrics'
    static_configs:
      - targets: ["10.10.20.12:31666"]  

systemctl  restart  prometheus
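Once the target is up, a few example queries against well-known kube-state-metrics series can confirm data is flowing (metric names per the kube-state-metrics docs):

kube_pod_status_phase{phase="Running"}                          # pods per phase
kube_deployment_status_replicas_available                       # available replicas per deployment
kube_node_status_condition{condition="Ready",status="true"}     # nodes reporting Ready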

Part 4: Prometheus monitoring cases: Tomcat, Redis, MySQL, HAProxy, Nginx

  • Monitoring Tomcat
# Download the jar/war packages from https://repo1.maven.org/maven2/io/prometheus/
TOMCAT_SIMPLECLIENT_VERSION=0.8.0
TOMCAT_EXPORTER_VERSION=0.0.12
curl -O https://repo1.maven.org/maven2/io/prometheus/simpleclient/${TOMCAT_SIMPLECLIENT_VERSION}/simpleclient-${TOMCAT_SIMPLECLIENT_VERSION}.jar
curl -O https://repo1.maven.org/maven2/io/prometheus/simpleclient_common/${TOMCAT_SIMPLECLIENT_VERSION}/simpleclient_common-${TOMCAT_SIMPLECLIENT_VERSION}.jar
curl -O https://repo1.maven.org/maven2/io/prometheus/simpleclient_hotspot/${TOMCAT_SIMPLECLIENT_VERSION}/simpleclient_hotspot-${TOMCAT_SIMPLECLIENT_VERSION}.jar
curl -O https://repo1.maven.org/maven2/io/prometheus/simpleclient_servlet/${TOMCAT_SIMPLECLIENT_VERSION}/simpleclient_servlet-${TOMCAT_SIMPLECLIENT_VERSION}.jar
curl -O https://repo1.maven.org/maven2/io/prometheus/simpleclient_servlet_common/${TOMCAT_SIMPLECLIENT_VERSION}/simpleclient_servlet_common-${TOMCAT_SIMPLECLIENT_VERSION}.jar
curl -O https://repo1.maven.org/maven2/nl/nlighten/tomcat_exporter_client/${TOMCAT_EXPORTER_VERSION}/tomcat_exporter_client-${TOMCAT_EXPORTER_VERSION}.jar
curl -O https://repo1.maven.org/maven2/nl/nlighten/tomcat_exporter_servlet/${TOMCAT_EXPORTER_VERSION}/tomcat_exporter_servlet-${TOMCAT_EXPORTER_VERSION}.war

# Write the Dockerfile
FROM tomcat:8.5.73
ADD server.xml /usr/local/tomcat/conf/server.xml 
RUN mkdir /data/tomcat/webapps -p
ADD myapp /data/tomcat/webapps/myapp
ADD metrics.war /data/tomcat/webapps   # tomcat_exporter_servlet-0.0.12.war renamed so it deploys at /metrics
ADD simpleclient-0.8.0.jar  /usr/local/tomcat/lib/
ADD simpleclient_common-0.8.0.jar /usr/local/tomcat/lib/
ADD simpleclient_hotspot-0.8.0.jar /usr/local/tomcat/lib/
ADD simpleclient_servlet-0.8.0.jar /usr/local/tomcat/lib/
ADD tomcat_exporter_client-0.0.12.jar /usr/local/tomcat/lib/
EXPOSE 8080 8443 8009

# Build and push the image
docker  build -t qj.harbor.com/prometheus/tomcat-app1:v1 .
docker  push qj.harbor.com/prometheus/tomcat-app1:v1

# Write the Tomcat deployment YAML
apiVersion: apps/v1
kind: Deployment
metadata:
  name: tomcat-deployment
  namespace: default
spec:
  selector:
    matchLabels:
     app: tomcat
  replicas: 1 # tells deployment to run 1 pod matching the template
  template: # create pods using pod definition in this template
    metadata:
      labels:
        app: tomcat
      annotations: # annotation that allows Prometheus to scrape this pod
        prometheus.io/scrape: 'true'
    spec:
      containers:
      - name: tomcat
        image: qj.harbor.com/prometheus/tomcat-app1:v1
        imagePullPolicy: Always
        ports:
        - containerPort: 8080
        securityContext:
          privileged: true


[root@deploy-1 yaml]#  kubectl apply -f tomcat-deploy.yaml 

[root@deploy-1 yaml]# kubectl get pod 
NAME                                 READY   STATUS        RESTARTS      AGE
tomcat-deployment-555c8594cc-b7hzg   1/1     Running       0             8m33s

[root@deploy-1 yaml]# kubectl get svc
NAME             TYPE        CLUSTER-IP     EXTERNAL-IP   PORT(S)        AGE
kubernetes       ClusterIP   10.100.0.1     <none>        443/TCP        28d
tomcat-service   NodePort    10.100.26.30   <none>        80:31080/TCP   93s

[root@deploy-1 yaml]# curl http://172.16.100.64:31080/myapp/
<h1>tomcat app1</h1>
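Since the exporter servlet was packaged as metrics.war, it should answer on the same NodePort under /metrics (a quick sanity check before configuring Prometheus):

[root@deploy-1 yaml]# curl -s http://172.16.100.64:31080/metrics | grep ^tomcat | head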

# Add the Prometheus job and verify the data
[root@prometheus prometheus]# vim prometheus.yml
 - job_name: "tomcat-monitor-metrics"
    static_configs:
      - targets: ["172.16.100.64:31080"]

[root@prometheus prometheus]# systemctl  restart  prometheus

Import the Grafana dashboards:
https://github.com/nlighten/tomcat_exporter
https://github.com/nlighten/tomcat_exporter/tree/master/dashboard
  • Monitoring Redis
# Write the YAML and deploy Redis with a redis_exporter sidecar
apiVersion: apps/v1
kind: Deployment
metadata:
  name: redis
  namespace: studylinux-net
spec:
  replicas: 1
  selector:
    matchLabels:
      app: redis
  template:
    metadata:
      labels:
        app: redis
    spec:
      containers:
      - name: redis
        image: redis:4.0.14
        resources:
          requests:
            cpu: 100m
            memory: 100Mi
        ports:
        - containerPort: 6379
      - name: redis-exporter
        image: oliver006/redis_exporter:latest
        resources:
          requests:
            cpu: 100m
            memory: 100Mi

[root@deploy-1 yaml]# kubectl apply  -f redis-deployment.yaml 

[root@deploy-1 yaml]# kubectl apply  -f redis-exporter-svc.yaml    #redis-exporter-svc

[root@deploy-1 yaml]# kubectl apply  -f redis-redis-svc.yaml    # redis-server-svc

[root@deploy-1 yaml]# kubectl get pod -n studylinux-net 
NAME                     READY   STATUS    RESTARTS   AGE
redis-5b767677fc-74bw5   2/2     Running   0          51s

[root@deploy-1 yaml]# kubectl get svc -n studylinux-net 
NAME                     TYPE       CLUSTER-IP      EXTERNAL-IP   PORT(S)          AGE
redis-exporter-service   NodePort   10.100.51.70    <none>        9121:31082/TCP   49s
redis-redis-service      NodePort   10.100.253.21   <none>        6379:31081/TCP   45s
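A quick check that the exporter sidecar responds through its NodePort (redis_up is a standard redis_exporter metric, 1 when the exporter can reach Redis):

[root@deploy-1 yaml]# curl -s http://172.16.100.64:31082/metrics | grep redis_up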

Prometheus scrape config:
  - job_name: "redis-monitor-metrics"
    static_configs:
      - targets: ["172.16.100.64:31082"]

[root@prometheus prometheus]# systemctl  restart  prometheus
  • Monitoring MySQL
# Install MySQL (MariaDB)
[root@harproxy ~]# yum -y install mariadb mariadb-server

# Create the monitoring user and grant privileges
[root@harproxy ~]# mysql
MariaDB [(none)]> CREATE USER 'mysql_exporter'@'localhost' IDENTIFIED BY '123456';
MariaDB [(none)]>  GRANT PROCESS, REPLICATION CLIENT, SELECT ON *.* TO 'mysql_exporter'@'localhost';

# Prepare the mysqld_exporter environment:
[root@harproxy src]# wget https://github.com/prometheus/mysqld_exporter/releases/download/v0.14.0/mysqld_exporter-0.14.0.linux-amd64.tar.gz

[root@harproxy src]# tar -xf mysqld_exporter-0.14.0.linux-amd64.tar.gz

[root@harproxy src]#  mv mysqld_exporter-0.14.0.linux-amd64 /usr/local/mysqld_exporter

[root@harproxy ~]# vim /usr/local/mysqld_exporter/.my.cnf
[client]
user=mysql_exporter
password=123456

[root@harproxy ~]#  vim /etc/systemd/system/mysqld_exporter.service
[Unit]
Description=Prometheus MySQL Exporter
After=network.target
[Service]
ExecStart=/usr/local/mysqld_exporter/mysqld_exporter --config.my-cnf=/usr/local/mysqld_exporter/.my.cnf
[Install]
WantedBy=multi-user.target

[root@harproxy ~]# systemctl daemon-reload 

[root@harproxy ~]# systemctl start mysqld_exporter.service
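mysqld_exporter listens on port 9104 by default; mysql_up should read 1 if the credentials in .my.cnf work:

[root@harproxy ~]# curl -s http://127.0.0.1:9104/metrics | grep mysql_up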

# Prometheus scrape config:
vim prometheus.yml 
 - job_name: "mysql-monitor-metrics"
    static_configs:
      - targets: ["172.16.100.80:9104"]

systemctl restart prometheus.service
  • Monitoring HAProxy
# Install HAProxy
[root@harproxy ~]# yum -y install haproxy

# Edit the HAProxy config: set up the stats socket and enable the status page
[root@harproxy ~]# vim /etc/haproxy/haproxy.cfg
global
    log         127.0.0.1 local0
    log         127.0.0.1 local1 notice 
    chroot      /var/lib/haproxy
    pidfile     /var/run/haproxy.pid
    stats socket /var/lib/haproxy/stats.sock  mode 660 level admin 
    stats timeout 30s
    user        haproxy
    group       haproxy
    daemon
defaults
    mode                    http
    log                     global
    #option                  httplog
    option                  dontlognull
    option http-server-close
    option                  redispatch
    retries                 3
    timeout http-request    10s
    timeout queue           1m
    timeout connect         10s
    timeout client          1m
    timeout server          1m
    timeout http-keep-alive 10s
    timeout check           10s
    maxconn                 3000


listen stats
    bind :9999
    stats enable
    stats uri /haproxy-status
    stats auth admin:123456

listen ingress-80
    bind  10.10.20.10:80
        mode  tcp
        server 10.10.20.14 10.10.20.14:58561 check inter 3s fall 3 rise 3
        server 10.10.20.15 10.10.20.15:58561 check inter 3s fall 3 rise 3
        server 10.10.20.12 10.10.20.12:58561 check inter 3s fall 3 rise 3
listen ingress-443
        bind  10.10.20.10:443
        mode  tcp
        server 10.10.20.14 10.10.20.14:39257 check inter 3s fall 3 rise 3
        server 10.10.20.15 10.10.20.15:39257 check inter 3s fall 3 rise 3
        server 10.10.20.12 10.10.20.12:39257 check inter 3s fall 3 rise 3


[root@harproxy ~]# systemctl  start haproxy

[root@harproxy ~]# systemctl  enable haproxy
#安装haproxy_exporter
[root@harproxy ~]# wget https://github.com/prometheus/haproxy_exporter/releases/download/v0.15.0/haproxy_exporter-0.15.0.linux-amd64.tar.gz

[root@harproxy ~]# tar -xf haproxy_exporter-0.15.0.linux-amd64.tar.gz

[root@harproxy ~]# mv  haproxy_exporter-0.15.0.linux-amd64 /usr/local/haproxy_exporter

[root@harproxy ~]# vim /etc/systemd/system/haproxy_exporter.service 
[Unit]
Description=Prometheus Haproxy_exporter
After=network.target

[Service]
# Scrape via the stats socket (the path must match the "stats socket" directive configured above)
ExecStart=/usr/local/haproxy_exporter/haproxy_exporter --haproxy.scrape-uri=unix:/var/lib/haproxy/stats.sock

# Alternatively, scrape via the status page
#ExecStart=/usr/local/haproxy_exporter/haproxy_exporter --haproxy.scrape-uri=http://admin:123456@127.0.0.1:9999/haproxy-status;csv

[Install]
WantedBy=multi-user.target

[root@harproxy ~]# systemctl  daemon-reload 

[root@harproxy ~]# systemctl  start haproxy_exporter

[root@harproxy ~]# systemctl  enable haproxy_exporter
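haproxy_exporter listens on port 9101 by default; haproxy_up indicates whether it can reach the stats socket:

[root@harproxy ~]# curl -s http://127.0.0.1:9101/metrics | grep haproxy_up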
# Add the Prometheus job:
  - job_name: "haproxy-monitor-metrics"
    static_configs:
      - targets: ["172.16.100.80:9101"]
  • Monitoring Nginx
# Compile and install nginx:
[root@harproxy ~]#  git clone https://github.com/vozlt/nginx-module-vts.git

[root@harproxy ~]# wget http://nginx.org/download/nginx-1.20.2.tar.gz

[root@harproxy ~]# tar -xf nginx-1.20.2.tar.gz

[root@harproxy ~]# cd nginx-1.20.2/

[root@harproxy nginx-1.20.2]# ./configure --prefix=/usr/local/nginx --with-http_ssl_module --with-http_v2_module  --with-http_realip_module --with-http_stub_status_module  --with-http_gzip_static_module  --with-pcre --with-file-aio  --with-stream --with-stream_ssl_module --with-stream_realip_module --add-module=/root/nginx-module-vts/

[root@harproxy nginx-1.20.2]#  make && make install

[root@harproxy nginx-1.20.2]# vim /usr/local/nginx/conf/nginx.conf
# In the http block:
    vhost_traffic_status_zone;    ## enable the traffic status zone

# In the server block:
        location / {
            root   html;
            index  index.html index.htm;
            proxy_pass http://10.10.20.8:9090;  # reverse proxy to Prometheus
        }

        location /status {
            vhost_traffic_status_display;
            vhost_traffic_status_display_format html;
        }

[root@harproxy nginx-1.20.2]# /usr/local/nginx/sbin/nginx  -t

[root@harproxy nginx-1.20.2]# /usr/local/nginx/sbin/nginx 
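The VTS module should now serve machine-readable stats at the path the exporter will scrape:

[root@harproxy nginx-1.20.2]# curl -s http://10.10.20.10/status/format/json | head -c 300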
# Install the nginx VTS exporter
[root@harproxy ~]# wget https://github.com/hnlq715/nginx-vts-exporter/releases/download/v0.10.3/nginx-vts-exporter-0.10.3.linux-amd64.tar.gz

[root@harproxy ~]# tar -xf nginx-vts-exporter-0.10.3.linux-amd64.tar.gz

[root@harproxy ~]# mv nginx-vts-exporter-0.10.3.linux-amd64 /usr/local/nginx_exporter

[root@harproxy ~]# vim /etc/systemd/system/nginx_exporter.service
[Unit]
Description=nginx-exporter
After=network.target

[Service]
ExecStart=/usr/local/nginx_exporter/nginx-vts-exporter -nginx.scrape_uri http://10.10.20.10/status/format/json

[Install]
WantedBy=multi-user.target

[root@harproxy ~]# systemctl daemon-reload

[root@harproxy ~]# systemctl start nginx_exporter.service

[root@harproxy ~]# systemctl enable nginx_exporter.service
[root@prometheus prometheus]#  vim prometheus.yml
- job_name: "nginx-monitor-metrics"
    static_configs:
      - targets: ["172.16.100.80:9913"]

[root@prometheus prometheus]# systemctl  restart  prometheus

# Import Grafana dashboard 2949

Part 5: Monitoring URL status, IP reachability, port status, and TLS certificate expiry with blackbox_exporter

blackbox_exporter is an official exporter from the Prometheus project that can probe HTTP, HTTPS, DNS, TCP, and ICMP targets, providing monitoring and data collection for the probed endpoints.

  • Deploying blackbox_exporter
[root@prometheus ~]# wget https://github.com/prometheus/blackbox_exporter/releases/download/v0.23.0/blackbox_exporter-0.23.0.linux-amd64.tar.gz

[root@prometheus ~]# tar xvf blackbox_exporter-0.23.0.linux-amd64.tar.gz

[root@prometheus ~]# ln -s /root/blackbox_exporter-0.23.0.linux-amd64 /usr/local/blackbox_exporter

[root@prometheus ~]# vim /etc/systemd/system/blackbox_exporter.service
[Unit]
Description=blackbox_exporter
Documentation=https://prometheus.io/
After=network.target

[Service]
Type=simple
User=root
Group=root
ExecStart=/usr/local/blackbox_exporter/blackbox_exporter --config.file=/usr/local/blackbox_exporter/blackbox.yml --web.listen-address=:9115
Restart=on-failure

[Install]
WantedBy=multi-user.target

[root@prometheus ~]# systemctl  daemon-reload 

[root@prometheus ~]# systemctl  start blackbox_exporter

[root@prometheus ~]# systemctl  enable blackbox_exporter
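Modules can be exercised manually before any Prometheus config is written; probe_success is 1 on success:

[root@prometheus ~]# curl -s 'http://127.0.0.1:9115/probe?module=http_2xx&target=http://www.xiaomi.com' | grep probe_success
[root@prometheus ~]# curl -s 'http://127.0.0.1:9115/probe?module=icmp&target=10.10.20.12' | grep probe_success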
  • URL status and TLS certificate expiry monitoring
  - job_name: 'http_status'
    metrics_path: /probe
    params:
      module: [http_2xx]
    static_configs:
      - targets: ['http://www.xiaomi.com', 'http://www.magedu.com']
        labels:
          instance: http_status
          group: web
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target] # take the probe target from __param_target
        target_label: url # record the target value in a "url" label
      - target_label: __address__
        replacement: 127.0.0.1:9115
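For HTTPS targets, the http probe also exposes probe_ssl_earliest_cert_expiry, so days-until-expiry is a simple query (useful as an alert rule expression):

(probe_ssl_earliest_cert_expiry - time()) / 86400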
  • IP reachability monitoring
  - job_name: 'ping_status'
    metrics_path: /probe
    params:
      module: [icmp]
    static_configs:
      - targets: ['10.10.20.12',"10.10.20.14"]
        labels:
          instance: 'ping_status'
          group: 'icmp'
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: ip
      - target_label: __address__
        replacement: 127.0.0.1:9115
  • Port status monitoring
  - job_name: 'port_status'
    metrics_path: /probe
    params:
      module: [tcp_connect]
    static_configs:
      - targets: ['10.10.20.10:80', '10.10.20.10:5601','10.10.20.10:3306']
        labels:
          instance: 'port_status'
          group: 'port'
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: ip
      - target_label: __address__
        replacement: 127.0.0.1:9115
[root@prometheus prometheus]# systemctl  restart  prometheus

Part 6: Prometheus alerting via DingTalk, alerting via WeChat Work, using alert templates, and routing alerts by category

  • Alertmanager
    prometheus --> threshold triggered --> duration exceeded --> alertmanager --> grouping|inhibition|silencing --> media type --> email|DingTalk|WeChat etc.

Grouping: merge alerts of a similar nature into a single notification, e.g. network, host, or service alerts.

Silencing: a simple mechanism for muting alerts during a specific time window; e.g. set a silence for the window in which servers are being upgraded or maintained.

Inhibition: once an alert has fired, stop repeatedly sending the other alerts it triggers; the multiple alert events caused by a single failure are merged, eliminating redundant alerts.

Prometheus alerting via DingTalk

  • Deploying Alertmanager
[root@prometheus src]# wget https://github.com/prometheus/alertmanager/releases/download/v0.24.0/alertmanager-0.24.0.linux-amd64.tar.gz

[root@prometheus src]# tar -xf alertmanager-0.24.0.linux-amd64.tar.gz

[root@prometheus src]# mv alertmanager-0.24.0.linux-amd64 /usr/local/alertmanager

[root@prometheus src]# vim /etc/systemd/system/alertmanager.service
[Unit]
Description=Alertmanager
Documentation=https://prometheus.io/docs/introduction/overview/
After=network.target

[Service]
Restart=on-failure
ExecStart=/usr/local/alertmanager/alertmanager --config.file=/usr/local/alertmanager/alertmanager.yml  --cluster.advertise-address=0.0.0.0:9093

[Install]
WantedBy=multi-user.target

[root@prometheus src]# systemctl  daemon-reload 

[root@prometheus src]# systemctl  start alertmanager

[root@prometheus src]# systemctl  enable alertmanager
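Alertmanager exposes standard health endpoints; a quick check that it came up:

[root@prometheus src]# curl -s http://127.0.0.1:9093/-/healthy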
  • Deploying prometheus-webhook-dingtalk
[root@prometheus src]# wget https://github.com/timonwong/prometheus-webhook-dingtalk/releases/download/v2.1.0/prometheus-webhook-dingtalk-2.1.0.linux-amd64.tar.gz

[root@prometheus src]# tar -xf prometheus-webhook-dingtalk-2.1.0.linux-amd64.tar.gz

[root@prometheus src]# mv prometheus-webhook-dingtalk-2.1.0.linux-amd64 /usr/local/prometheus-webhook-dingtalk

[root@prometheus src]#  cd /usr/local/prometheus-webhook-dingtalk

[root@prometheus prometheus-webhook-dingtalk]# cp config.example.yml config.yml

[root@prometheus prometheus-webhook-dingtalk]# vim config.yml
timeout: 5s
templates:
  - contrib/templates/legacy/template.tmpl

targets:
  webhook1:
    url: https://oapi.dingtalk.com/robot/send?access_token=93bc0401a58899e18d665ad1c7885f272df73003302ecc68176a6f358c3e162b   # DingTalk group robot token
    secret: SEC000000000000000000000
  webhook_legacy:
    url: https://oapi.dingtalk.com/robot/send?access_token=93bc0401a58899e18d665ad1c7885f272df73003302ecc68176a6f358c3e162b   # DingTalk group robot token
    message:
      title: '{{ template "legacy.title" . }}'
      text: '{{ template "legacy.content" . }}'
  webhook_mention_all:
    url: https://oapi.dingtalk.com/robot/send?access_token=93bc0401a58899e18d665ad1c7885f272df73003302ecc68176a6f358c3e162b    # DingTalk group robot token
    mention:
      all: true
  webhook_mention_users:
    url: https://oapi.dingtalk.com/robot/send?access_token=93bc0401a58899e18d665ad1c7885f272df73003302ecc68176a6f358c3e162b     # DingTalk group robot token
    mention:
      mobiles: ['13816457159']

[root@prometheus prometheus-webhook-dingtalk]# vim /etc/systemd/system/prometheus-webhook.service           
[Unit]
Description=https://github.com/timonwong/prometheus-webhook-dingtalk/releases/
After=network-online.target
[Service]
Restart=on-failure
ExecStart=/usr/local/prometheus-webhook-dingtalk/prometheus-webhook-dingtalk --config.file=/usr/local/prometheus-webhook-dingtalk/config.yml

[Install]
WantedBy=multi-user.target

[root@prometheus prometheus-webhook-dingtalk]# systemctl daemon-reload

[root@prometheus prometheus-webhook-dingtalk]# systemctl  start  prometheus-webhook.service

[root@prometheus prometheus-webhook-dingtalk]# systemctl enable prometheus-webhook.service  
  • Configuring Prometheus alerting rules
[root@prometheus prometheus]# vim prometheus.yml
alerting:
  alertmanagers:
    - static_configs:
        - targets:
           - 127.0.0.1:9093
rule_files:
   - "first_rules.yml"

[root@prometheus prometheus]# cat first_rules.yml 
groups:
- name: Linux
  rules:
  - alert: "内存报警"
    expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 >85
    for: 60s
    labels:
        severity: warning
    annotations:
        description: "{{ $labels.instance }} 内存资源利用率大于 85%"
        value: "当前内存占用率{{ $value }}%"
        summary: "{{ $labels.job }}"
      
  - alert: "CPU报警"
    expr: 100 * (1 - avg(irate(node_cpu_seconds_total{mode="idle"}[2m])) by(instance)) > 90
    for: 60s
    labels:
        severity: warning
    annotations:
        description: "{{ $labels.instance }} CPU报警资源利用率大于 85%"
        value: "当前CPU占用率{{ $value }}%"
        summary: "{{ $labels.job }}"
      
  - alert: "磁盘报警"
    expr: 100 * (node_filesystem_size_bytes{fstype=~"xfs|ext4"} - node_filesystem_avail_bytes) / node_filesystem_size_bytes > 90
    for: 60s
    labels:
        severity: warning
    annotations:
        description: "磁盘报警资源利用率大于 90%,请及时扩容!"
        value: "当前磁盘占用率{{ $value }}%"
        summary: "{{ $labels.job }}"
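The rule file can be validated before restarting (promtool ships in the Prometheus tarball):

[root@prometheus prometheus]# /usr/local/prometheus/promtool check rules first_rules.yml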
[root@prometheus prometheus]# systemctl  restart  prometheus
  • Configuring Alertmanager for DingTalk alerts
[root@prometheus alertmanager]# cat alertmanager.yml
global:
  resolve_timeout: 5m
  smtp_smarthost: 'smtp.qq.com:25'
  smtp_from: '530387590@qq.com'
  smtp_auth_username: '530387590@qq.com'
  smtp_auth_password: 'isvfwhvzrkugbgjc'
  smtp_require_tls: false

route:
  group_by: ['alertname']
  group_wait: 5s
  group_interval: 5s
  repeat_interval: 5m
  receiver: 'dingding'

receivers:
- name: dingding
  webhook_configs:
  - url: 'http://127.0.0.1:8060/dingtalk/webhook1/send' 

inhibit_rules: 
  - source_match:
      severity: 'critical' 
    target_match:
      severity: 'warning'  
    equal: ['alertname', 'dev', 'instance'] 
[root@prometheus alertmanager]# systemctl  restart alertmanager.service
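The whole chain (Alertmanager -> webhook -> DingTalk) can be exercised without waiting for a real threshold breach by posting a synthetic alert to Alertmanager's v2 API:

curl -XPOST http://127.0.0.1:9093/api/v2/alerts -H 'Content-Type: application/json' \
  -d '[{"labels":{"alertname":"TestAlert","severity":"warning"},"annotations":{"description":"pipeline test"}}]'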

Prometheus alerting via WeChat Work

cat alertmanager.yml
global:
  resolve_timeout: 5m
  smtp_smarthost: 'smtp.qq.com:25'
  smtp_from: '530387590@qq.com'
  smtp_auth_username: '530387590@qq.com'
  smtp_auth_password: 'isvfwhvzrkugbgjc'
  smtp_require_tls: false

route:
  group_by: [alertname]
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 10m
  receiver: wechat

receivers:
- name: 'wechat'
  wechat_configs:
  - corp_id: ww1fdca58c3d0b1749
    to_party: 1
    agent_id: 1000004
    api_secret: Cay2S4ARSotKiznJ4L8umka-j6ec9i-ZP0igNCwEwxM
    send_resolved: true

Using Prometheus alert templates

[root@prometheus alertmanager]# cat alertmanager.yml
global:
  resolve_timeout: 5m
  smtp_smarthost: 'smtp.qq.com:25'
  smtp_from: '530387590@qq.com'
  smtp_auth_username: '530387590@qq.com'
  smtp_auth_password: 'isvfwhvzrkugbgjc'
  smtp_require_tls: false

templates:
- '/usr/local/alertmanager/template.templ' # template file referenced by Alertmanager

route:
  group_by: ['alertname']
  group_wait: 5s
  group_interval: 5s
  repeat_interval: 5m
  receiver: 'dingding'

receivers:
- name: dingding
  webhook_configs:
  - url: 'http://127.0.0.1:8060/dingtalk/webhook1/send' 

inhibit_rules: 
  - source_match:
      severity: 'critical' 
    target_match:
      severity: 'warning'  
    equal: ['alertname', 'dev', 'instance'] 
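A minimal sketch of what the referenced template file could contain (hypothetical define name; the receiver must reference whatever name is defined here):

{{ define "dingding.default.message" }}
{{ range .Alerts }}
Alert:  {{ .Labels.alertname }}
Level:  {{ .Labels.severity }}
Detail: {{ .Annotations.description }}
Value:  {{ .Annotations.value }}
{{ end }}
{{ end }}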

Routing alerts by category

  • Prometheus rules configuration
[root@prometheus prometheus]# cat /usr/local/prometheus/local_linux_local.yml 
groups:
- name: localhost-rules
  rules:
  - alert: "服务器宕机报警"
    expr: (up{service="localhost"} == 0) * on(instance) group_left(nodename) (node_uname_info)
    for: 3m
    labels:
        severity: critical
        service: localhost
    annotations:
        description: "服务器已宕机!"

  - alert: "内存报警"
    expr: (100 - node_memory_MemAvailable_bytes{service="localhost"} / node_memory_MemTotal_bytes{service="localhost"} * 100)* on(instance) group_left(nodename) (node_uname_info) > 90
    for: 60s
    labels:
        severity: warning
        service: localhost
    annotations:
        description: "内存资源利用率大于 90%"
        value: "当前内存占用率{{ $value }}%"
      
  - alert: "CPU报警"
    expr: 100 * (1 - avg(irate(node_cpu_seconds_total{mode="idle",service="localhost"}[2m])) by (instance)) * on(instance) group_left(nodename) (node_uname_info) > 90
    for: 60s
    labels:
        severity: warning
        service: localhost
    annotations:
        description: "CPU报警资源利用率大于90%"
        value: "当前CPU占用率{{ $value }}%"
      
  - alert: "磁盘报警"
    expr: 100 * (node_filesystem_size_bytes{fstype=~"xfs|ext4",service="localhost"} - node_filesystem_avail_bytes{service="localhost"}) / node_filesystem_size_bytes{service="localhost"} * on(instance) group_left(nodename) (node_uname_info) > 90
    for: 60s
    labels:
        severity: warning
        service: localhost
    annotations:
        description: "磁盘报警资源利用率大于 90%,请及时扩容!"
        value: "当前磁盘占用率{{ $value }}%"

  • Alertmanager configuration:
[root@prometheus prometheus]# cat /usr/local/alertmanager/alertmanager.yml
global:
  resolve_timeout: 5m
  smtp_smarthost: 'smtp.qq.com:25'
  smtp_from: '530387590@qq.com'
  smtp_auth_username: '530387590@qq.com'
  smtp_auth_password: 'isvfwhvzrkugbgjc'
  smtp_require_tls: false

route:
  group_by: [alertname]
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 10m
  receiver: 'dingding' # default notification channel: DingTalk

  # message routing: sub-routes are matched first, the default receiver catches the rest
  routes:
  - receiver: 'default-receiver' # critical alerts are emailed to the leader
    group_wait: 10s
    match_re:
      severity: critical # match critical-severity alerts
  - receiver: 'wechat-k8s-worknode' # host alerts go to the monitoring team via WeChat Work
    group_wait: 10s
    match_re:
      service: localhost # match alerts labeled as local-host alerts

receivers:
- name: 'default-receiver'
  email_configs:
  - to: '13816457159@163.com'
    send_resolved: true
- name: 'dingding'
  webhook_configs:
  - url: 'http://172.31.2.120:8060/dingtalk/alertname/send'
    send_resolved: true
- name: 'wechat'
  wechat_configs:
  - corp_id: ww4c893118fbf4d07c
    to_party: 1
    agent_id: 1000004
    api_secret: Cay2S4ARSotKiznJ4L8umka-j6ec9i-ZP0igNCwEwxM
    send_resolved: true
- name: 'wechat-k8s-worknode'
  wechat_configs:
  - corp_id: ww4c893118fbf4d07c
    to_party: 2
    agent_id: 1000004
    api_secret: Cay2S4ARSotKiznJ4L8umka-j6ec9i-ZP0igNCwEwxM
    send_resolved: true
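Routing mistakes in a file this size are easy to make; amtool (shipped alongside Alertmanager) can validate the config and show which route a given label set would take:

[root@prometheus alertmanager]# /usr/local/alertmanager/amtool check-config /usr/local/alertmanager/alertmanager.yml
[root@prometheus alertmanager]# /usr/local/alertmanager/amtool config routes test --config.file=/usr/local/alertmanager/alertmanager.yml severity=critical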
