环境信息
本次环境信息如下
Kubernetes 1.24.3
openebs openebs.io/version: 3.3.0
安装openebs
kubectl apply -f https://openebs.github.io/charts/openebs-operator.yaml
[root@CentOS8 prometheus]# kubectl get sc
NAME PROVISIONER RECLAIMPOLICY VOLUMEBINDINGMODE ALLOWVOLUMEEXPANSION AGE
openebs-device openebs.io/local Delete WaitForFirstConsumer false 150m
openebs-hostpath openebs.io/local Delete WaitForFirstConsumer false 150m
[root@CentOS8 prometheus]# kubectl get all -n openebs
NAME READY STATUS RESTARTS AGE
pod/openebs-localpv-provisioner-dd977fdd5-7h94d 1/1 Running 0 152m
pod/openebs-ndm-85v85 1/1 Running 0 152m
pod/openebs-ndm-cluster-exporter-866c974856-sgksr 1/1 Running 0 152m
pod/openebs-ndm-node-exporter-hxp2q 1/1 Running 0 152m
pod/openebs-ndm-operator-85886744bb-pbt9p 1/1 Running 0 152m
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
service/openebs-ndm-cluster-exporter-service ClusterIP None <none> 9100/TCP 152m
service/openebs-ndm-node-exporter-service ClusterIP None <none> 9101/TCP 152m
NAME DESIRED CURRENT READY UP-TO-DATE AVAILABLE NODE SELECTOR AGE
daemonset.apps/openebs-ndm 1 1 1 1 1 <none> 152m
daemonset.apps/openebs-ndm-node-exporter 1 1 1 1 1 <none> 152m
NAME READY UP-TO-DATE AVAILABLE AGE
deployment.apps/openebs-localpv-provisioner 1/1 1 1 152m
deployment.apps/openebs-ndm-cluster-exporter 1/1 1 1 152m
deployment.apps/openebs-ndm-operator 1/1 1 1 152m
NAME DESIRED CURRENT READY AGE
replicaset.apps/openebs-localpv-provisioner-dd977fdd5 1 1 1 152m
replicaset.apps/openebs-ndm-cluster-exporter-866c974856 1 1 1 152m
replicaset.apps/openebs-ndm-operator-85886744bb 1 1 1 152m
创建PVC持久化目录
[root@CentOS8 prometheus]# kubectl create ns prometheus
namespace/prometheus created
[root@CentOS8 prometheus]# cat prometheus-pvc.yaml
kind: PersistentVolumeClaim
apiVersion: v1
metadata:
name: prometheus-data-db #pvc名称,这里不建议修改
namespace: prometheus
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 50Gi #创建pvc大小
storageClassName: openebs-hostpath #这里是我的storageclass,请根据自己的实际情况修改
#创建pvc
[root@CentOS8 prometheus]# kubectl apply -f prometheus-pvc.yaml
persistentvolumeclaim/prometheus-data-db created
#查看pvc创建情况
[root@CentOS8 prometheus]# kubectl get pvc -n prometheus
NAME STATUS VOLUME CAPACITY ACCESS MODES STORAGECLASS AGE
prometheus-data-db Bound pvc-fad4f75b-b103-4a27-bd73-a5adb64f7308 50Gi RWO openebs-hostpath 142m
安装Prometheus
首先需要配置configmap
[root@CentOS8 prometheus]# cat prometheus.configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: prometheus-config
namespace: prometheus
data:
prometheus.yml: |
global:
scrape_interval: 15s
scrape_timeout: 15s
scrape_configs:
- job_name: 'prometheus'
static_configs:
- targets: ['localhost:9090']
- job_name: 'kubernetes-node'
kubernetes_sd_configs:
- role: node
relabel_configs:
- source_labels: [__address__]
regex: '(.*):10250'
replacement: '${1}:9100'
target_label: __address__
action: replace
- job_name: 'kubernetes-cadvisor'
kubernetes_sd_configs:
- role: node
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- target_label: __address__
replacement: kubernetes.default.svc:443
- source_labels: [__meta_kubernetes_node_name]
regex: (.+)
target_label: __metrics_path__
replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
- job_name: 'kubernetes-apiservers'
kubernetes_sd_configs:
- role: endpoints
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
relabel_configs:
- source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
action: keep
regex: default;kubernetes;https
- job_name: 'kubernetes-service-endpoints'
kubernetes_sd_configs:
- role: endpoints
relabel_configs:
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
action: keep
regex: true
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
action: replace
target_label: __scheme__
regex: (https?)
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
action: replace
target_label: __metrics_path__
regex: (.+)
- source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
action: replace
target_label: __address__
regex: ([^:]+)(?::\d+)?;(\d+)
replacement: $1:$2
- action: labelmap
regex: __meta_kubernetes_service_label_(.+)
- source_labels: [__meta_kubernetes_namespace]
action: replace
target_label: kubernetes_namespace
- source_labels: [__meta_kubernetes_service_name]
action: replace
target_label: kubernetes_name
- job_name: kubernetes-nodes-cadvisor
scrape_interval: 10s
scrape_timeout: 10s
scheme: https # remove if you want to scrape metrics on insecure port
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
kubernetes_sd_configs:
- role: node
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- target_label: __address__
replacement: kubernetes.default.svc:443
- source_labels: [__meta_kubernetes_node_name]
regex: (.+)
target_label: __metrics_path__
replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
metric_relabel_configs:
- action: replace
source_labels: [id]
regex: '^/machine\.slice/machine-rkt\\x2d([^\\]+)\\.+/([^/]+)\.service$'
target_label: rkt_container_name
replacement: '${2}-${1}'
- action: replace
source_labels: [id]
regex: '^/system.slice/(.+).service$'
target_label: systemd_service_name
replacement: '${1}'
- job_name: kube-state-metrics
static_configs:
- targets: ['kube-state-metrics.prometheus.svc.cluster.local:8080']
接下来配置RBAC授权文件,否则Prometheus没有权限通过API Server做服务发现(获取node、pod、endpoints等资源)
[root@CentOS8 prometheus]# cat prometheus-rbac.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
name: prometheus
namespace: prometheus
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: prometheus
rules:
- apiGroups:
- ""
resources:
- nodes
- services
- endpoints
- pods
- nodes/proxy
verbs:
- get
- list
- watch
- apiGroups:
- ""
resources:
- configmaps
- nodes/metrics
verbs:
- get
- nonResourceURLs:
- /metrics
verbs:
- get
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: prometheus
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: prometheus
subjects:
- kind: ServiceAccount
name: prometheus
namespace: prometheus
创建prometheus容器,以Deployment方式运行
[root@CentOS8 prometheus]# cat prometheus.deploy.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: prometheus
namespace: prometheus
labels:
app: prometheus
spec:
  selector:
    matchLabels:
      app: prometheus
  template:
    metadata:
      labels:
        app: prometheus
    spec:
      nodeSelector: # 注意:nodeSelector 属于 Pod 模板的 spec,不能放在 Deployment 的 spec 下
        node-role.kubernetes.io/master: ""
      serviceAccountName: prometheus
containers:
# - image: prom/prometheus:latest #官方镜像地址
- image: registry.cn-hangzhou.aliyuncs.com/urbancabin/prometheus:latest
name: prometheus
command:
- "/bin/prometheus"
args:
- "--config.file=/etc/prometheus/prometheus.yml"
- "--storage.tsdb.path=/prometheus"
- "--storage.tsdb.retention=30d"
- "--web.enable-admin-api" # 控制对admin HTTP API的访问,其中包括删除时间序列等功能
- "--web.enable-lifecycle" # 支持热更新,直接执行localhost:9090/-/reload立即生效
ports:
- containerPort: 9090
protocol: TCP
name: http
volumeMounts:
- mountPath: "/prometheus"
subPath: prometheus
name: data
- mountPath: "/etc/prometheus"
name: config-volume
resources:
requests:
cpu: 100m
memory: 512Mi
limits:
cpu: 100m
memory: 512Mi
securityContext:
runAsUser: 0
volumes:
- name: data
persistentVolumeClaim:
claimName: prometheus-data-db
- configMap:
name: prometheus-config
name: config-volume
接下来我们还需要给prometheus创建svc(我这里后面grafana和prometheus会分开,所以我使用NodePort。如果有ingress可以使用ClusterIP)
[root@CentOS8 prometheus]# cat prometheus-svc.yaml
apiVersion: v1
kind: Service
metadata:
name: prometheus
namespace: prometheus
labels:
app: prometheus
spec:
selector:
app: prometheus
type: NodePort
ports:
- name: web
port: 9090
targetPort: http
接下来再给我们的K8s节点设置node-exporter
[root@CentOS8 prometheus]# cat prometheus-node.yaml
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: node-exporter
namespace: prometheus
labels:
name: node-exporter
spec:
selector:
matchLabels:
name: node-exporter
template:
metadata:
labels:
name: node-exporter
spec:
hostPID: true
hostIPC: true
hostNetwork: true
containers:
- name: node-exporter
#image: prom/node-exporter:v0.16.0 #官方地址
image: registry.cn-hangzhou.aliyuncs.com/urbancabin/node-exporter:v0.16.0
ports:
- containerPort: 9100
resources:
requests:
cpu: 0.15
securityContext:
privileged: true
args:
- --path.procfs
- /host/proc
- --path.sysfs
- /host/sys
- --collector.filesystem.ignored-mount-points
- '^/(sys|proc|dev|host|etc)($|/)'
volumeMounts:
- name: dev
mountPath: /host/dev
- name: proc
mountPath: /host/proc
- name: sys
mountPath: /host/sys
- name: rootfs
mountPath: /rootfs
tolerations:
- key: "node-role.kubernetes.io/master"
operator: "Exists"
effect: "NoSchedule"
volumes:
- name: proc
hostPath:
path: /proc
- name: dev
hostPath:
path: /dev
- name: sys
hostPath:
path: /sys
- name: rootfs
hostPath:
path: /
当我们所有配置文件都设置好后,直接执行 kubectl apply -f . 即可
检查pod运行状态
[root@CentOS8 prometheus]# kubectl get all -n prometheus
NAME READY STATUS RESTARTS AGE
pod/grafana-7556f7d7df-nzkmt 1/1 Running 0 153m
pod/kube-state-metrics-5f8785787d-cw8ln 1/1 Running 0 170m
pod/node-exporter-pln47 1/1 Running 0 18h
pod/prometheus-584b95655f-h4krl 1/1 Running 9 (4m6s ago) 3h32m
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
service/grafana NodePort 10.108.237.208 <none> 3000:30820/TCP 162m
service/kube-state-metrics ClusterIP 10.96.242.162 <none> 8080/TCP,8081/TCP 170m
service/prometheus NodePort 10.96.81.46 <none> 9090:32440/TCP 18h
NAME DESIRED CURRENT READY UP-TO-DATE AVAILABLE NODE SELECTOR AGE
daemonset.apps/node-exporter 1 1 1 1 1 <none> 18h
NAME READY UP-TO-DATE AVAILABLE AGE
deployment.apps/grafana 1/1 1 1 162m
deployment.apps/kube-state-metrics 1/1 1 1 170m
deployment.apps/prometheus 1/1 1 1 18h
NAME DESIRED CURRENT READY AGE
replicaset.apps/grafana-7556f7d7df 1 1 1 162m
replicaset.apps/kube-state-metrics-5f8785787d 1 1 1 170m
replicaset.apps/prometheus-584b95655f 1 1 1 18h
访问prometheus测试
kube-state-metrics
Kube-State-Metrics简介
kube-state-metrics 通过监听 API Server 生成有关资源对象的状态指标,比如 Deployment 、Node 、 Pod ,需要注意的是 kube-state-metrics 只是简单的提供一个 metrics 数据,并不会存储这些指标数据,所以我们可以使用 Prometheus 来抓取这些数据然后存储,主要关注的是业务相关的一些元数据,比如 Deployment 、 Pod 、副本状态等;调度了多少个 replicas ?现在可用的有几个?多少个 Pod 是running/stopped/terminated 状态? Pod 重启了多少次?我有多少 job 在运行中
yaml文件如下
本地命名空间为prometheus
[root@CentOS8 prometheus]# cat kube-metrics-deployment.yaml
apiVersion: v1
automountServiceAccountToken: false
kind: ServiceAccount
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/version: 2.4.2
name: kube-state-metrics
namespace: prometheus
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/version: 2.4.2
name: kube-state-metrics
rules:
- apiGroups:
- ""
resources:
- configmaps
- secrets
- nodes
- pods
- services
- resourcequotas
- replicationcontrollers
- limitranges
- persistentvolumeclaims
- persistentvolumes
- namespaces
- endpoints
verbs:
- list
- watch
- apiGroups:
- apps
resources:
- statefulsets
- daemonsets
- deployments
- replicasets
verbs:
- list
- watch
- apiGroups:
- batch
resources:
- cronjobs
- jobs
verbs:
- list
- watch
- apiGroups:
- autoscaling
resources:
- horizontalpodautoscalers
verbs:
- list
- watch
- apiGroups:
- authentication.k8s.io
resources:
- tokenreviews
verbs:
- create
- apiGroups:
- authorization.k8s.io
resources:
- subjectaccessreviews
verbs:
- create
- apiGroups:
- policy
resources:
- poddisruptionbudgets
verbs:
- list
- watch
- apiGroups:
- certificates.k8s.io
resources:
- certificatesigningrequests
verbs:
- list
- watch
- apiGroups:
- storage.k8s.io
resources:
- storageclasses
- volumeattachments
verbs:
- list
- watch
- apiGroups:
- admissionregistration.k8s.io
resources:
- mutatingwebhookconfigurations
- validatingwebhookconfigurations
verbs:
- list
- watch
- apiGroups:
- networking.k8s.io
resources:
- networkpolicies
- ingresses
verbs:
- list
- watch
- apiGroups:
- coordination.k8s.io
resources:
- leases
verbs:
- list
- watch
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/version: 2.4.2
name: kube-state-metrics
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: kube-state-metrics
subjects:
- kind: ServiceAccount
name: kube-state-metrics
namespace: prometheus
---
apiVersion: v1
kind: Service
metadata:
annotations:
prometheus.io/scrape: "true" # 设置能被prometheus抓取到;注意key必须是 prometheus.io/scrape,否则 kubernetes-service-endpoints 的 keep 规则不会匹配到这个service
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/version: 2.4.2
name: kube-state-metrics
namespace: prometheus
spec:
# clusterIP: None # 允许通过svc来进行访问
ports:
- name: http-metrics
port: 8080
targetPort: http-metrics
- name: telemetry
port: 8081
targetPort: telemetry
selector:
app.kubernetes.io/name: kube-state-metrics
---
apiVersion: apps/v1
kind: Deployment
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/version: 2.4.2
name: kube-state-metrics
namespace: prometheus
spec:
replicas: 1
selector:
matchLabels:
app.kubernetes.io/name: kube-state-metrics
template:
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/version: 2.4.2
spec:
nodeName: centos8 # 设置在k8s-master-1上运行
tolerations: # 设置能容忍在master节点运行
- key: "node-role.kubernetes.io/master"
operator: "Exists"
effect: "NoSchedule"
automountServiceAccountToken: true
containers:
# - image: k8s.gcr.io/kube-state-metrics/kube-state-metrics:v2.4.2
- image: anjia0532/google-containers.kube-state-metrics.kube-state-metrics:v2.4.2
livenessProbe:
httpGet:
path: /healthz
port: 8080
initialDelaySeconds: 5
timeoutSeconds: 5
name: kube-state-metrics
ports:
- containerPort: 8080
name: http-metrics
- containerPort: 8081
name: telemetry
readinessProbe:
httpGet:
path: /
port: 8081
initialDelaySeconds: 5
timeoutSeconds: 5
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
readOnlyRootFilesystem: true
runAsUser: 65534
serviceAccountName: kube-state-metrics
[root@CentOS8 prometheus]# kubectl get pod -n prometheus
NAME READY STATUS RESTARTS AGE
grafana-7556f7d7df-nzkmt 1/1 Running 0 156m
kube-state-metrics-5f8785787d-cw8ln 1/1 Running 0 173m
node-exporter-pln47 1/1 Running 0 18h
prometheus-584b95655f-h4krl 1/1 Running 9 (7m14s ago) 3h35m
检查svc是否有metric数据
[root@CentOS8 prometheus]# curl http://xxx:32440/metrics
# HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles.
# TYPE go_gc_duration_seconds summary
go_gc_duration_seconds{quantile="0"} 3.5208e-05
go_gc_duration_seconds{quantile="0.25"} 0.000213094
go_gc_duration_seconds{quantile="0.5"} 0.094531627
go_gc_duration_seconds{quantile="0.75"} 0.105158749
go_gc_duration_seconds{quantile="1"} 0.303282049
go_gc_duration_seconds_sum 1.715240602
go_gc_duration_seconds_count 23
# HELP go_goroutines Number of goroutines that currently exist.
# TYPE go_goroutines gauge
go_goroutines 93
# HELP go_info Information about the Go environment.
# TYPE go_info gauge
go_info{version="go1.17.5"} 1
# HELP go_memstats_alloc_bytes Number of bytes allocated and still in use.
......
Grafana
接下来安装Grafana,同样也是基于原来的文章,只不过将Grafana做持久化存储
[root@CentOS8 prometheus]# cat grafana-pvc.yaml
kind: PersistentVolumeClaim
apiVersion: v1
metadata:
name: grafana-data
namespace: prometheus
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 50Gi
storageClassName: openebs-hostpath
创建完pvc后,我们创建deployment和svc
[root@CentOS8 prometheus]# cat grafana-svc.yaml
apiVersion: v1
kind: Service
metadata:
name: grafana
namespace: prometheus
labels:
app: grafana
spec:
type: NodePort
ports:
- port: 3000
selector:
app: grafana
#Deployment
[root@CentOS8 prometheus]# cat grafana-deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: grafana
namespace: prometheus
labels:
app: grafana
spec:
revisionHistoryLimit: 10
selector:
matchLabels:
app: grafana
template:
metadata:
labels:
app: grafana
spec:
containers:
- name: grafana
# image: grafana/grafana #官方镜像
image: registry.cn-hangzhou.aliyuncs.com/urbancabin/grafana:latest
imagePullPolicy: IfNotPresent
ports:
- containerPort: 3000
name: grafana
env:
- name: GF_SECURITY_ADMIN_USER
value: admin
- name: GF_SECURITY_ADMIN_PASSWORD
value: abcdocker
readinessProbe:
failureThreshold: 10
httpGet:
path: /api/health
port: 3000
scheme: HTTP
initialDelaySeconds: 60
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 30
livenessProbe:
failureThreshold: 3
httpGet:
path: /api/health
port: 3000
scheme: HTTP
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 1
resources:
limits:
cpu: 300m
memory: 1024Mi
requests:
cpu: 300m
memory: 1024Mi
volumeMounts:
- mountPath: /var/lib/grafana
subPath: grafana
name: storage
securityContext:
fsGroup: 472
runAsUser: 472
volumes:
- name: storage
persistentVolumeClaim:
claimName: grafana-data
创建grafana所有yaml
[root@CentOS8 grafana]# kubectl apply -f .
检查grafana
[root@CentOS8 prometheus]# kubectl get all -n prometheus |grep grafana
pod/grafana-7556f7d7df-nzkmt 1/1 Running 0 163m
service/grafana NodePort 10.108.237.208 <none> 3000:30820/TCP 172m
deployment.apps/grafana 1/1 1 1 172m
replicaset.apps/grafana-7556f7d7df 1 1 1 172m
配置grafana
这里grafana只讲几个重点配置部分
设置Prometheus源
http://prometheus.prometheus.svc.cluster.local:9090
这里需要修改中间的prometheus,这个prometheus是namespace名称。不懂的可以看一下svc域名解析这块的知识
推荐K8s监控模板
https://grafana.com/grafana/dashboards/15661
https://grafana.com/grafana/dashboards/6417
https://grafana.com/grafana/dashboards/16098