环境信息
本次环境信息如下
Kubernetes 1.24.3
openebs openebs.io/version: 3.3.0
安装openebs
kubectl apply -f https://openebs.github.io/charts/openebs-operator.yaml
[root@CentOS8 prometheus]# kubectl get sc
NAME PROVISIONER RECLAIMPOLICY VOLUMEBINDINGMODE ALLOWVOLUMEEXPANSION AGE
openebs-device openebs.io/local Delete WaitForFirstConsumer false 150m
openebs-hostpath openebs.io/local Delete WaitForFirstConsumer false 150m
[root@CentOS8 prometheus]# kubectl get all -n openebs
NAME READY STATUS RESTARTS AGE
pod/openebs-localpv-provisioner-dd977fdd5-7h94d 1/1 Running 0 152m
pod/openebs-ndm-85v85 1/1 Running 0 152m
pod/openebs-ndm-cluster-exporter-866c974856-sgksr 1/1 Running 0 152m
pod/openebs-ndm-node-exporter-hxp2q 1/1 Running 0 152m
pod/openebs-ndm-operator-85886744bb-pbt9p 1/1 Running 0 152m
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
service/openebs-ndm-cluster-exporter-service ClusterIP None <none> 9100/TCP 152m
service/openebs-ndm-node-exporter-service ClusterIP None <none> 9101/TCP 152m
NAME DESIRED CURRENT READY UP-TO-DATE AVAILABLE NODE SELECTOR AGE
daemonset.apps/openebs-ndm 1 1 1 1 1 <none> 152m
daemonset.apps/openebs-ndm-node-exporter 1 1 1 1 1 <none> 152m
NAME READY UP-TO-DATE AVAILABLE AGE
deployment.apps/openebs-localpv-provisioner 1/1 1 1 152m
deployment.apps/openebs-ndm-cluster-exporter 1/1 1 1 152m
deployment.apps/openebs-ndm-operator 1/1 1 1 152m
NAME DESIRED CURRENT READY AGE
replicaset.apps/openebs-localpv-provisioner-dd977fdd5 1 1 1 152m
replicaset.apps/openebs-ndm-cluster-exporter-866c974856 1 1 1 152m
replicaset.apps/openebs-ndm-operator-85886744bb 1 1 1 152m
创建PVC持久化目录
[root@CentOS8 prometheus]# kubectl create ns prometheus
namespace/prometheus created
[root@CentOS8 prometheus]# cat prometheus-pvc.yaml
kind: PersistentVolumeClaim
apiVersion: v1
metadata:
name: prometheus-data-db #pvc名称,这里不建议修改
namespace: prometheus
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 50Gi #创建pvc大小
storageClassName: openebs-hostpath #这里是我的storageclass,请根据自己的实际情况修改
#创建pvc
[root@CentOS8 prometheus]# kubectl apply -f prometheus-pvc.yaml
persistentvolumeclaim/prometheus-data-db created
#查看pvc创建情况
[root@CentOS8 prometheus]# kubectl get pvc -n prometheus
NAME STATUS VOLUME CAPACITY ACCESS MODES STORAGECLASS AGE
prometheus-data-db Bound pvc-fad4f75b-b103-4a27-bd73-a5adb64f7308 50Gi RWO openebs-hostpath 142m
安装Prometheus
首先需要配置configmap
[root@CentOS8 prometheus]# cat prometheus.configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: prometheus-config
namespace: prometheus
data:
prometheus.yml: |
global:
scrape_interval: 15s
scrape_timeout: 15s
scrape_configs:
- job_name: 'prometheus'
static_configs:
- targets: ['localhost:9090']
- job_name: 'kubernetes-node'
kubernetes_sd_configs:
- role: node
relabel_configs:
- source_labels: [__address__]
regex: '(.*):10250'
replacement: '${1}:9100'
target_label: __address__
action: replace
- job_name: 'kubernetes-cadvisor'
kubernetes_sd_configs:
- role: node
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- target_label: __address__
replacement: kubernetes.default.svc:443
- source_labels: [__meta_kubernetes_node_name]
regex: (.+)
target_label: __metrics_path__
replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
- job_name: 'kubernetes-apiservers'
kubernetes_sd_configs:
- role: endpoints
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
relabel_configs:
- source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
action: keep
regex: default;kubernetes;https
- job_name: 'kubernetes-service-endpoints'
kubernetes_sd_configs:
- role: endpoints
relabel_configs:
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
action: keep
regex: true
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
action: replace
target_label: __scheme__
regex: (https?)
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
action: replace
target_label: __metrics_path__
regex: (.+)
- source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
action: replace
target_label: __address__
regex: ([^:]+)(?::\d+)?;(\d+)
replacement: $1:$2
- action: labelmap
regex: __meta_kubernetes_service_label_(.+)
- source_labels: [__meta_kubernetes_namespace]
action: replace
target_label: kubernetes_namespace
- source_labels: [__meta_kubernetes_service_name]
action: replace
target_label: kubernetes_name
- job_name: kubernetes-nodes-cadvisor
scrape_interval: 10s
scrape_timeout: 10s
scheme: https # remove if you want to scrape metrics on insecure port
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
kubernetes_sd_configs:
- role: node
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- target_label: __address__
replacement: kubernetes.default.svc:443
- source_labels: [__meta_kubernetes_node_name]
regex: (.+)
target_label: __metrics_path__
replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
metric_relabel_configs:
- action: replace
source_labels: [id]
regex: '^/machine\.slice/machine-rkt\\x2d([^\\]+)\\.+/([^/]+)\.service$'
target_label: rkt_container_name
replacement: '${2}-${1}'
- action: replace
source_labels: [id]
regex: '^/system.slice/(.+).service$'
target_label: systemd_service_name
replacement: '${1}'
- job_name: kube-state-metrics
static_configs:
- targets: ['kube-state-metrics.prometheus.svc.cluster.local:8080']
接下来配置RBAC授权文件,否则Prometheus没有权限通过API Server做服务发现(获取node、pod、endpoints等资源)
[root@CentOS8 prometheus]# cat prometheus-rbac.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
name: prometheus
namespace: prometheus
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: prometheus
rules:
- apiGroups:
- ""
resources:
- nodes
- services
- endpoints
- pods
- nodes/proxy
verbs:
- get
- list
- watch
- apiGroups:
- ""
resources:
- configmaps
- nodes/metrics
verbs:
- get
- nonResourceURLs:
- /metrics
verbs:
- get
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: prometheus
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: prometheus
subjects:
- kind: ServiceAccount
name: prometheus
namespace: prometheus
创建prometheus容器,以Deployment方式运行
[root@CentOS8 prometheus]# cat prometheus.deploy.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: prometheus
namespace: prometheus
labels:
app: prometheus
spec:
  selector:
    matchLabels:
      app: prometheus
  template:
    metadata:
      labels:
        app: prometheus
    spec:
      nodeSelector: # 注意:nodeSelector 属于 Pod 模板的 spec,不能放在 Deployment 的 spec 下
        node-role.kubernetes.io/master: ""
      serviceAccountName: prometheus
containers:
# - image: prom/prometheus:latest #官方镜像地址
- image: registry.cn-hangzhou.aliyuncs.com/urbancabin/prometheus:latest
name: prometheus
command:
- "/bin/prometheus"
args:
- "--config.file=/etc/prometheus/prometheus.yml"
- "--storage.tsdb.path=/prometheus"
- "--storage.tsdb.retention=30d"
- "--web.enable-admin-api" # 控制对admin HTTP API的访问,其中包括删除时间序列等功能
- "--web.enable-lifecycle" # 支持热更新,直接执行localhost:9090/-/reload立即生效
ports:
- containerPort: 9090
protocol: TCP
name: http
volumeMounts:
- mountPath: "/prometheus"
subPath: prometheus
name: data
- mountPath: "/etc/prometheus"
name: config-volume
resources:
requests:
cpu: 100m
memory: 512Mi
limits:
cpu: 100m
memory: 512Mi
securityContext:
runAsUser: 0
volumes:
- name: data
persistentVolumeClaim:
claimName: prometheus-data-db
- configMap:
name: prometheus-config
name: config-volume
接下来我们还需要给prometheus创建svc(我这里后面grafana和prometheus会分开,所以我使用NodePort。如果有ingress可以使用ClusterIP)
[root@CentOS8 prometheus]# cat prometheus-svc.yaml
apiVersion: v1
kind: Service
metadata:
name: prometheus
namespace: prometheus
labels:
app: prometheus
spec:
selector:
app: prometheus
type: NodePort
ports:
- name: web
port: 9090
targetPort: http
接下来再给我们的K8s节点设置node-exporter
[root@CentOS8 prometheus]# cat prometheus-node.yaml
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: node-exporter
namespace: prometheus
labels:
name: node-exporter
spec:
selector:
matchLabels:
name: node-exporter
template:
metadata:
labels:
name: node-exporter
spec:
hostPID: true
hostIPC: true
hostNetwork: true
containers:
- name: node-exporter
#image: prom/node-exporter:v0.16.0 #官方地址
image: registry.cn-hangzhou.aliyuncs.com/urbancabin/node-exporter:v0.16.0
ports:
- containerPort: 9100
resources:
requests:
cpu: 0.15
securityContext:
privileged: true
args:
- --path.procfs
- /host/proc
- --path.sysfs
- /host/sys
- --collector.filesystem.ignored-mount-points
- '^/(sys|proc|dev|host|etc)($|/)'
volumeMounts:
- name: dev
mountPath: /host/dev
- name: proc
mountPath: /host/proc
- name: sys
mountPath: /host/sys
- name: rootfs
mountPath: /rootfs
tolerations:
- key: "node-role.kubernetes.io/master"
operator: "Exists"
effect: "NoSchedule"
volumes:
- name: proc
hostPath:
path: /proc
- name: dev
hostPath:
path: /dev
- name: sys
hostPath:
path: /sys
- name: rootfs
hostPath:
path: /
当我们所有配置文件都设置好后,直接执行 kubectl apply -f . 即可
检查pod运行状态
[root@CentOS8 prometheus]# kubectl get all -n prometheus
NAME READY STATUS RESTARTS AGE
pod/grafana-7556f7d7df-nzkmt 1/1 Running 0 153m
pod/kube-state-metrics-5f8785787d-cw8ln 1/1 Running 0 170m
pod/node-exporter-pln47 1/1 Running 0 18h
pod/prometheus-584b95655f-h4krl 1/1 Running 9 (4m6s ago) 3h32m
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
service/grafana NodePort 10.108.237.208 <none> 3000:30820/TCP 162m
service/kube-state-metrics ClusterIP 10.96.242.162 <none> 8080/TCP,8081/TCP 170m
service/prometheus NodePort 10.96.81.46 <none> 9090:32440/TCP 18h
NAME DESIRED CURRENT READY UP-TO-DATE AVAILABLE NODE SELECTOR AGE
daemonset.apps/node-exporter 1 1 1 1 1 <none> 18h
NAME READY UP-TO-DATE AVAILABLE AGE
deployment.apps/grafana 1/1 1 1 162m
deployment.apps/kube-state-metrics 1/1 1 1 170m
deployment.apps/prometheus 1/1 1 1 18h
NAME DESIRED CURRENT READY AGE
replicaset.apps/grafana-7556f7d7df 1 1 1 162m
replicaset.apps/kube-state-metrics-5f8785787d 1 1 1 170m
replicaset.apps/prometheus-584b95655f 1 1 1 18h
访问prometheus测试
kube-state-metrics
Kube-State-Metrics简介
kube-state-metrics 通过监听 API Server 生成有关资源对象的状态指标,比如 Deployment 、Node 、 Pod ,需要注意的是 kube-state-metrics 只是简单的提供一个 metrics 数据,并不会存储这些指标数据,所以我们可以使用 Prometheus 来抓取这些数据然后存储,主要关注的是业务相关的一些元数据,比如 Deployment 、 Pod 、副本状态等;调度了多少个 replicas ?现在可用的有几个?多少个 Pod 是running/stopped/terminated 状态? Pod 重启了多少次?我有多少 job 在运行中
yaml文件如下
本地命名空间为prometheus
[root@CentOS8 prometheus]# cat kube-metrics-deployment.yaml
apiVersion: v1
automountServiceAccountToken: false
kind: ServiceAccount
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/version: 2.4.2
name: kube-state-metrics
namespace: prometheus
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/version: 2.4.2
name: kube-state-metrics
rules:
- apiGroups:
- ""
resources:
- configmaps
- secrets
- nodes
- pods
- services
- resourcequotas
- replicationcontrollers
- limitranges
- persistentvolumeclaims
- persistentvolumes
- namespaces
- endpoints
verbs:
- list
- watch
- apiGroups:
- apps
resources:
- statefulsets
- daemonsets
- deployments
- replicasets
verbs:
- list
- watch
- apiGroups:
- batch
resources:
- cronjobs
- jobs
verbs:
- list
- watch
- apiGroups:
- autoscaling
resources:
- horizontalpodautoscalers
verbs:
- list
- watch
- apiGroups:
- authentication.k8s.io
resources:
- tokenreviews
verbs:
- create
- apiGroups:
- authorization.k8s.io
resources:
- subjectaccessreviews
verbs:
- create
- apiGroups:
- policy
resources:
- poddisruptionbudgets
verbs:
- list
- watch
- apiGroups:
- certificates.k8s.io
resources:
- certificatesigningrequests
verbs:
- list
- watch
- apiGroups:
- storage.k8s.io
resources:
- storageclasses
- volumeattachments
verbs:
- list
- watch
- apiGroups:
- admissionregistration.k8s.io
resources:
- mutatingwebhookconfigurations
- validatingwebhookconfigurations
verbs:
- list
- watch
- apiGroups:
- networking.k8s.io
resources:
- networkpolicies
- ingresses
verbs:
- list
- watch
- apiGroups:
- coordination.k8s.io
resources:
- leases
verbs:
- list
- watch
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/version: 2.4.2
name: kube-state-metrics
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: kube-state-metrics
subjects:
- kind: ServiceAccount
name: kube-state-metrics
namespace: prometheus
---
apiVersion: v1
kind: Service
metadata:
annotations:
prometheus.io/scrape: "true" # 设置能被prometheus抓取到;注意key必须是 prometheus.io/scrape,否则 kubernetes-service-endpoints 的 keep 规则不会匹配到这个service
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/version: 2.4.2
name: kube-state-metrics
namespace: prometheus
spec:
# clusterIP: None # 允许通过svc来进行访问
ports:
- name: http-metrics
port: 8080
targetPort: http-metrics
- name: telemetry
port: 8081
targetPort: telemetry
selector:
app.kubernetes.io/name: kube-state-metrics
---
apiVersion: apps/v1
kind: Deployment
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/version: 2.4.2
name: kube-state-metrics
namespace: prometheus
spec:
replicas: 1
selector:
matchLabels:
app.kubernetes.io/name: kube-state-metrics
template:
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/version: 2.4.2
spec:
nodeName: centos8 # 设置在k8s-master-1上运行
tolerations: # 设置能容忍在master节点运行
- key: "node-role.kubernetes.io/master"
operator: "Exists"
effect: "NoSchedule"
automountServiceAccountToken: true
containers:
# - image: k8s.gcr.io/kube-state-metrics/kube-state-metrics:v2.4.2
- image: anjia0532/google-containers.kube-state-metrics.kube-state-metrics:v2.4.2
livenessProbe:
httpGet:
path: /healthz
port: 8080
initialDelaySeconds: 5
timeoutSeconds: 5
name: kube-state-metrics
ports:
- containerPort: 8080
name: http-metrics
- containerPort: 8081
name: telemetry
readinessProbe:
httpGet:
path: /
port: 8081
initialDelaySeconds: 5
timeoutSeconds: 5
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
readOnlyRootFilesystem: true
runAsUser: 65534
serviceAccountName: kube-state-metrics
[root@CentOS8 prometheus]# kubectl get pod -n prometheus
NAME READY STATUS RESTARTS AGE
grafana-7556f7d7df-nzkmt 1/1 Running 0 156m
kube-state-metrics-5f8785787d-cw8ln 1/1 Running 0 173m
node-exporter-pln47 1/1 Running 0 18h
prometheus-584b95655f-h4krl 1/1 Running 9 (7m14s ago) 3h35m
检查svc是否有metric数据
[root@CentOS8 prometheus]# curl http://xxx:32440/metrics
# HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles.
# TYPE go_gc_duration_seconds summary
go_gc_duration_seconds{quantile="0"} 3.5208e-05
go_gc_duration_seconds{quantile="0.25"} 0.000213094
go_gc_duration_seconds{quantile="0.5"} 0.094531627
go_gc_duration_seconds{quantile="0.75"} 0.105158749
go_gc_duration_seconds{quantile="1"} 0.303282049
go_gc_duration_seconds_sum 1.715240602
go_gc_duration_seconds_count 23
# HELP go_goroutines Number of goroutines that currently exist.
# TYPE go_goroutines gauge
go_goroutines 93
# HELP go_info Information about the Go environment.
# TYPE go_info gauge
go_info{version="go1.17.5"} 1
# HELP go_memstats_alloc_bytes Number of bytes allocated and still in use.
......
Grafana
接下来安装Grafana,同样也是基于原来的文章,只不过将Grafana做持久化存储
[root@CentOS8 prometheus]# cat grafana-pvc.yaml
kind: PersistentVolumeClaim
apiVersion: v1
metadata:
name: grafana-data
namespace: prometheus
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 50Gi
storageClassName: openebs-hostpath
创建完pvc后,我们创建deployment和svc
[root@CentOS8 prometheus]# cat grafana-svc.yaml
apiVersion: v1
kind: Service
metadata:
name: grafana
namespace: prometheus
labels:
app: grafana
spec:
type: NodePort
ports:
- port: 3000
selector:
app: grafana
#Deployment
[root@CentOS8 prometheus]# cat grafana-deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: grafana
namespace: prometheus
labels:
app: grafana
spec:
revisionHistoryLimit: 10
selector:
matchLabels:
app: grafana
template:
metadata:
labels:
app: grafana
spec:
containers:
- name: grafana
# image: grafana/grafana #官方镜像
image: registry.cn-hangzhou.aliyuncs.com/urbancabin/grafana:latest
imagePullPolicy: IfNotPresent
ports:
- containerPort: 3000
name: grafana
env:
- name: GF_SECURITY_ADMIN_USER
value: admin
- name: GF_SECURITY_ADMIN_PASSWORD
value: abcdocker
readinessProbe:
failureThreshold: 10
httpGet:
path: /api/health
port: 3000
scheme: HTTP
initialDelaySeconds: 60
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 30
livenessProbe:
failureThreshold: 3
httpGet:
path: /api/health
port: 3000
scheme: HTTP
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 1
resources:
limits:
cpu: 300m
memory: 1024Mi
requests:
cpu: 300m
memory: 1024Mi
volumeMounts:
- mountPath: /var/lib/grafana
subPath: grafana
name: storage
securityContext:
fsGroup: 472
runAsUser: 472
volumes:
- name: storage
persistentVolumeClaim:
claimName: grafana-data
创建grafana所有yaml
[root@CentOS8 grafana]# kubectl apply -f .
检查grafana
[root@CentOS8 prometheus]# kubectl get all -n prometheus |grep grafana
pod/grafana-7556f7d7df-nzkmt 1/1 Running 0 163m
service/grafana NodePort 10.108.237.208 <none> 3000:30820/TCP 172m
deployment.apps/grafana 1/1 1 1 172m
replicaset.apps/grafana-7556f7d7df 1 1 1 172m
配置grafana
这里grafana只讲几个重点配置部分
设置Prometheus源
http://prometheus.prometheus.svc.cluster.local:9090
这里需要修改中间的prometheus,这个prometheus是namespace名称。不懂的可以看一下svc域名解析这块的知识
推荐K8s监控模板
https://grafana.com/grafana/dashboards/15661
https://grafana.com/grafana/dashboards/6417
https://grafana.com/grafana/dashboards/16098