1. Logging
Both Kubernetes and Docker let you view container logs, but doing so directly on the command line is tedious: you keep retyping commands and hunting for container names.
In non-containerized projects the ELK stack is the usual combination for working with logs, and Kubernetes can of course use the same tools for collecting and viewing logs.
1.1 Viewing logs
1.1.1 Command line
1.1.1.1 Docker
- docker ps: list containers and find the container ID
- docker logs <container-id>: view that container's logs
1.1.1.2 Kubernetes commands
- Container logs via kubectl (example commands follow this list)
  - kubectl logs -f <pod-name> [-c <container-name>]
- Pod details
  - kubectl describe pod <pod-name>
  - Besides a Pod's detailed status and events, kubectl describe can also inspect objects such as Nodes, ReplicationControllers, Services, and Namespaces.
  - Note: to target a specific namespace, add -n <namespace>.
- Component (system service) level
  - Components such as kube-apiserver, kube-scheduler, kubelet, kube-proxy, and kube-controller-manager can be inspected with journalctl.
  - When something goes wrong in the cluster, these commands are usually the first stop for checking whether a component is misbehaving.
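For reference, a few example invocations of the commands above (the Pod, container, and namespace names are placeholders; on kubeadm-based clusters most control-plane components run as static Pods, so their output is also available via kubectl logs -n kube-system, while journalctl applies to components managed by systemd such as the kubelet):

kubectl logs -f <pod-name> -c <container-name> -n <namespace>   # follow one container's log
kubectl describe pod <pod-name> -n <namespace>                  # status and events for a Pod
kubectl logs -n kube-system kube-apiserver-<node-name>          # control-plane static Pod logs (kubeadm)
journalctl -u kubelet -f                                        # kubelet logs via systemd
journalctl -u docker --since "1 hour ago"                       # container runtime logs, if run under systemd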
1.1.2 ELK
The Elasticsearch search engine was developed together with Logstash, a data-collection and log-parsing engine, and Kibana, an analytics and visualization platform. The three products are designed to be used as an integrated solution, known as the Elastic Stack (ELK Stack).
There are several options for log collection: the Logstash role in ELK can be filled by other components with the same function, such as Log-Pilot or Fluentd. In this section we use Log-Pilot for the demonstration.
1.1.2.1 Architecture
- Each Pod's logs are mounted to a directory on the host
- Log-Pilot collects the logs from that directory and ships them to Elasticsearch for indexing and search
- Kibana then provides visualization on top of Elasticsearch (a sample Pod spec showing how logs are declared to Log-Pilot follows)
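A minimal sketch of how an application Pod can declare its logs for Log-Pilot, based on Log-Pilot's documented aliyun_logs_<name> environment-variable convention; the tomcat image, the log names ("catalina", "access") and the file path are illustrative only:

apiVersion: v1
kind: Pod
metadata:
  name: tomcat-demo
spec:
  containers:
  - name: tomcat
    image: tomcat
    env:
    - name: aliyun_logs_catalina      # collect this container's stdout under the name "catalina"
      value: "stdout"
    - name: aliyun_logs_access        # collect files matching this path under the name "access"
      value: "/usr/local/tomcat/logs/localhost_access_log.*.txt"
    volumeMounts:
    - name: accesslogs                # file-based collection requires the log directory to sit on a volume
      mountPath: /usr/local/tomcat/logs
  volumes:
  - name: accesslogs
    emptyDir: {}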
1.1.2.2 Deploying ELK
1.1.2.2.1 Deploy Log-Pilot
- Prepare the YAML file
  - log-pilot.yaml
    Because I broke my previous environment, I reinstalled the Kubernetes cluster and used the latest version, 1.18.2.
    The YAML below may not be compatible with older versions; if it is not, find a copy online and adjust it.
apiVersion: apps/v1
kind: DaemonSet              # Log collection must run on every node, so a DaemonSet is used
metadata:
  name: log-pilot
  namespace: kube-system
  labels:
    k8s-app: log-pilot
    kubernetes.io/cluster-service: "true"
spec:
  selector:
    matchLabels:
      k8s-app: log-es
      kubernetes.io/cluster-service: "true"
      version: v1.22
  template:
    metadata:
      labels:
        k8s-app: log-es
        kubernetes.io/cluster-service: "true"
        version: v1.22
    spec:
      tolerations:           # Allow scheduling onto master nodes as well
      - key: node-role.kubernetes.io/master
        effect: NoSchedule
      containers:
      - name: log-pilot
        image: registry.cn-hangzhou.aliyuncs.com/acs/log-pilot:0.9.5-filebeat
        resources:           # Resource limits
          limits:
            memory: 200Mi
          requests:
            cpu: 100m
            memory: 200Mi
        env:                 # Environment variables for talking to Elasticsearch
          - name: "FILEBEAT_OUTPUT"
            value: "elasticsearch"
          - name: "ELASTICSEARCH_HOST"
            value: "elasticsearch-api"
          - name: "ELASTICSEARCH_PORT"
            value: "9200"
          - name: "ELASTICSEARCH_USER"
            value: "elastic"
          - name: "ELASTICSEARCH_PASSWORD"
            value: "elastic"
        volumeMounts:        # Mount the log directories
        - name: sock
          mountPath: /var/run/docker.sock
        - name: root
          mountPath: /host
          readOnly: true
        - name: varlib
          mountPath: /var/lib/filebeat
        - name: varlog
          mountPath: /var/log/filebeat
        securityContext:
          capabilities:
            add:
            - SYS_ADMIN
      terminationGracePeriodSeconds: 30
      volumes:
      - name: sock
        hostPath:
          path: /var/run/docker.sock
      - name: root
        hostPath:
          path: /
      - name: varlib
        hostPath:
          path: /var/lib/filebeat
          type: DirectoryOrCreate
      - name: varlog
        hostPath:
          path: /var/log/filebeat
          type: DirectoryOrCreate
- Create the resources
[root@master-kubeadm-k8s log]# kubectl apply -f log-pilot.yaml
daemonset.extensions/log-pilot created
- Check the resources
[root@master-kubeadm-k8s log]# kubectl get pods -n kube-system -o wide | grep log
log-pilot-8f4nv   1/1   Running   0   2m4s   192.168.221.88   worker02-kubeadm-k8s   <none>   <none>
log-pilot-h25fc   1/1   Running   0   2m4s   192.168.16.250   master-kubeadm-k8s     <none>   <none>

[root@master-kubeadm-k8s log]# kubectl get daemonset -n kube-system
NAME          DESIRED   CURRENT   READY   UP-TO-DATE   AVAILABLE   NODE SELECTOR                 AGE
calico-node   3         3         3       3            3           beta.kubernetes.io/os=linux   41d
kube-proxy    3         3         3       3            3           <none>                        41d
log-pilot     2         2         2       2            2           <none>                        26s
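To confirm Log-Pilot itself came up cleanly before wiring in Elasticsearch, you can tail one of the Pods listed above (the Pod name is taken from that output; --tail just limits the amount printed):

kubectl logs -n kube-system log-pilot-8f4nv --tail=20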
1.1.2.2.2 Deploy Elasticsearch
- Prepare the YAML file
  - elasticsearch.yaml
    Note that the resource requirements here are fairly high, so check that your machines have enough capacity.
apiVersion: v1
kind: Service            # This Service exposes Elasticsearch to clients
metadata:
  name: elasticsearch-api    # Must match the ELASTICSEARCH_HOST environment variable in Log-Pilot
  namespace: kube-system
  labels:
    name: elasticsearch
spec:
  selector:
    app: es
  ports:
  - name: transport
    port: 9200
    protocol: TCP
---
apiVersion: v1
kind: Service            # This Service is used for communication between the Elasticsearch cluster members
metadata:
  name: elasticsearch-discovery
  namespace: kube-system
  labels:
    name: elasticsearch
spec:
  selector:
    app: es
  ports:
  - name: transport
    port: 9300
    protocol: TCP
---
apiVersion: apps/v1
kind: StatefulSet        # A StatefulSet is used so the Elasticsearch Pods start in order
metadata:
  name: elasticsearch
  namespace: kube-system
  labels:
    kubernetes.io/cluster-service: "true"
spec:
  replicas: 3
  serviceName: "elasticsearch-service"
  selector:
    matchLabels:
      app: es
  template:
    metadata:
      labels:
        app: es
    spec:
      tolerations:       # Elasticsearch may also be scheduled onto master nodes
      - effect: NoSchedule
        key: node-role.kubernetes.io/master
      initContainers:    # Initialization that must run before the main container starts
      - name: init-sysctl
        image: busybox:1.27
        command:
        - sysctl
        - -w
        - vm.max_map_count=262144
        securityContext:
          privileged: true
      containers:
      - name: elasticsearch
        image: registry.cn-hangzhou.aliyuncs.com/log-monitor/elasticsearch:v5.5.1
        ports:
        - containerPort: 9200
          protocol: TCP
        - containerPort: 9300
          protocol: TCP
        securityContext:
          capabilities:
            add:
            - IPC_LOCK
            - SYS_RESOURCE
        resources:
          limits:
            memory: 4000Mi
          requests:
            cpu: 100m
            memory: 2000Mi
        env:
          - name: "http.host"
            value: "0.0.0.0"
          - name: "network.host"
            value: "_eth0_"
          - name: "cluster.name"
            value: "docker-cluster"
          - name: "bootstrap.memory_lock"
            value: "false"
          - name: "discovery.zen.ping.unicast.hosts"
            value: "elasticsearch-discovery"
          - name: "discovery.zen.ping.unicast.hosts.resolve_timeout"
            value: "10s"
          - name: "discovery.zen.ping_timeout"
            value: "6s"
          - name: "discovery.zen.minimum_master_nodes"
            value: "2"
          - name: "discovery.zen.fd.ping_interval"
            value: "2s"
          - name: "discovery.zen.no_master_block"
            value: "write"
          - name: "gateway.expected_nodes"
            value: "2"
          - name: "gateway.expected_master_nodes"
            value: "1"
          - name: "transport.tcp.connect_timeout"
            value: "60s"
          - name: "ES_JAVA_OPTS"
            value: "-Xms2g -Xmx2g"
        livenessProbe:   # Health check
          tcpSocket:
            port: transport
          initialDelaySeconds: 20
          periodSeconds: 10
        volumeMounts:
        - name: es-data
          mountPath: /data
      terminationGracePeriodSeconds: 30
      volumes:
      - name: es-data
        hostPath:
          path: /es-data
- Create the resources
[root@master-kubeadm-k8s log]# kubectl apply -f elasticsearch.yaml
service/elasticsearch-api created
service/elasticsearch-discovery created
statefulset.apps/elasticsearch created
- Check the resources
# The Pods are created in order
[root@master-kubeadm-k8s log]# kubectl get pods -n kube-system -o wide | grep elastic
elasticsearch-0   1/1   Running           0   4m36s   10.244.221.69   worker02-kubeadm-k8s   <none>   <none>
elasticsearch-1   1/1   Running           0   4m33s   10.244.14.4     worker01-kubeadm-k8s   <none>   <none>
elasticsearch-2   0/1   PodInitializing   0   101s    10.244.16.194   master-kubeadm-k8s     <none>   <none>

[root@master-kubeadm-k8s log]# kubectl get svc -n kube-system -o wide | grep elastic
elasticsearch-api         ClusterIP   10.104.144.183   <none>   9200/TCP   5m2s   app=es
elasticsearch-discovery   ClusterIP   10.109.137.36    <none>   9300/TCP   5m2s   app=es

[root@master-kubeadm-k8s log]# kubectl get statefulset -n kube-system
NAME            READY   AGE
elasticsearch   3/3     6m14s
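As an optional sanity check before installing Kibana, you can ask Elasticsearch for its cluster health through the elasticsearch-api Service. This is a sketch run from a cluster node: the ClusterIP is the one shown above, and the elastic/elastic credentials are only needed if this image has X-Pack security enabled.

curl -u elastic:elastic http://10.104.144.183:9200/_cluster/health?pretty
# expect "status" : "green" (or "yellow") and "number_of_nodes" : 3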
1.1.2.2.3 Deploy Kibana
Kibana is the component that is accessed from outside the cluster, so it needs both a Service and an Ingress.
Prerequisite: an Ingress controller (for example the NGINX Ingress Controller) must already be running in the cluster.
- Prepare the YAML file
  - kibana.yaml
# Deployment
apiVersion: apps/v1
kind: Deployment
metadata:
  name: kibana
  namespace: kube-system
  labels:
    component: kibana
spec:
  replicas: 1
  selector:
    matchLabels:
      component: kibana
  template:
    metadata:
      labels:
        component: kibana
    spec:
      containers:
      - name: kibana
        image: registry.cn-hangzhou.aliyuncs.com/log-monitor/kibana:v5.5.1
        env:
        - name: CLUSTER_NAME
          value: docker-cluster
        - name: ELASTICSEARCH_URL      # Elasticsearch address
          value: http://elasticsearch-api:9200/
        resources:
          limits:
            cpu: 1000m
          requests:
            cpu: 100m
        ports:
        - containerPort: 5601
          name: http
---
# Service
apiVersion: v1
kind: Service
metadata:
  name: kibana
  namespace: kube-system
  labels:
    component: kibana
spec:
  selector:
    component: kibana
  ports:
  - name: http
    port: 80
    targetPort: http
---
# Ingress
apiVersion: extensions/v1beta1
kind: Ingress
metadata:
  name: kibana
  namespace: kube-system
spec:
  rules:
  - host: log.k8s.sunny.com       # Configure this domain in your local hosts file
    http:
      paths:
      - path: /
        backend:
          serviceName: kibana
          servicePort: 80
- Create the resources
[root@master-kubeadm-k8s log]# kubectl apply -f kibana.yaml
deployment.apps/kibana created
service/kibana created
ingress.extensions/kibana created
- Check the resources
[root@master-kubeadm-k8s log]# kubectl get pods -n kube-system | grep kibana
kibana-8747dff7d-l627g   1/1   Running   0   2m2s

[root@master-kubeadm-k8s log]# kubectl get svc -n kube-system | grep kibana
kibana   ClusterIP   10.109.177.214   <none>   80/TCP   2m40s

[root@master-kubeadm-k8s log]# kubectl get ingress -n kube-system | grep kibana
kibana   <none>   log.k8s.sunny.com   80   2m43s
- Test
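A sketch of the test step (the node IP is a placeholder for any node running the Ingress controller): add a hosts entry for the demo domain on the machine you browse from, then open Kibana and create an index pattern for the indices shipped by Log-Pilot.

echo "<node-ip>  log.k8s.sunny.com" >> /etc/hosts
curl -I http://log.k8s.sunny.com/    # should be answered by Kibana through the Ingress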
2. Monitoring
2.1 Introduction to Prometheus
Monitoring here means watching the health of the Kubernetes cluster itself: the nodes, the Kubernetes components, and the Pods are all monitored.
Prometheus is an open-source monitoring and alerting system. It scrapes metrics directly from agents running on the target hosts and stores the collected samples centrally on its server.
In 2016 Prometheus became the second member project of the CNCF (Cloud Native Computing Foundation), after Kubernetes.
2.1.2 Key features
- A multi-dimensional data model (a time series is identified by a metric name plus a set of key/value labels).
- A flexible query language (PromQL).
- No dependency on external storage; supports both local and remote storage models.
- Pull-based collection over HTTP, which is simple and easy to reason about.
- Monitoring targets can be defined through service discovery or static configuration.
- Support for multiple statistical data models, with good graphing support.
2.1.3 Prometheus architecture
As the architecture diagram shows, the main Prometheus modules are the Server, Exporters, the Pushgateway, PromQL, the Alertmanager, and the Web UI.
The overall workflow is roughly as follows:
- The Prometheus server periodically pulls data from statically configured targets or from targets found through service discovery.
- When newly pulled data exceeds the configured in-memory buffer, Prometheus persists it to disk (or to remote storage, if configured).
- Prometheus can be configured with rules that it evaluates on a schedule; when a rule's condition fires, an alert is pushed to the configured Alertmanager.
- When the Alertmanager receives alerts, it can aggregate, deduplicate, and silence them according to its configuration, and then send out notifications.
- Data can be queried and aggregated through the API, the Prometheus console, or Grafana.
2.1.4 Prometheus basics
- Supports both pull and push ways of ingesting data
- Supports Kubernetes service discovery
- Provides the PromQL query language (query examples follow this list)
- A time series is a data type defined by a metric name and a set of key/value labels
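A couple of hedged PromQL examples, runnable against the HTTP API of the Prometheus server deployed later in this chapter (the host is a placeholder; also note that node-exporter v0.16+, used below, renamed the CPU metric to node_cpu_seconds_total, so queries written against the older node_cpu name may need adjusting):

curl 'http://<prometheus-host>:9090/api/v1/query' --data-urlencode 'query=up'   # which scrape targets are up
curl 'http://<prometheus-host>:9090/api/v1/query' \
  --data-urlencode 'query=100 - avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100'   # CPU usage per node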
2.2 Data collection
- Node (server) metrics
  - Deploy the Node-Exporter tool on every node
- Kubernetes component metrics
  - Available from the /metrics endpoints exposed on the components' ports (see the curl examples after this list), for example:
    - IP:2379/metrics for etcd
    - IP:6443/metrics for the kube-apiserver
    - IP:10252/metrics for the controller-manager
    - IP:10251/metrics for the scheduler
- Container metrics
  - The kubelet embeds the cAdvisor component, which exposes container metrics out of the box
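Hedged examples of hitting these endpoints by hand from a master node. The ports listed above were the defaults at the time; newer releases only serve some of them on secure ports, $TOKEN stands for a ServiceAccount token allowed to read /metrics, and the certificate paths are the usual kubeadm locations:

curl -s http://127.0.0.1:10252/metrics | head -n 5   # controller-manager (insecure port, where enabled)
curl -s http://127.0.0.1:10251/metrics | head -n 5   # scheduler (insecure port, where enabled)
# the apiserver requires authentication:
curl -sk -H "Authorization: Bearer $TOKEN" https://127.0.0.1:6443/metrics | head -n 5
# etcd normally requires its client certificates:
curl -s --cacert /etc/kubernetes/pki/etcd/ca.crt \
     --cert /etc/kubernetes/pki/etcd/healthcheck-client.crt \
     --key /etc/kubernetes/pki/etcd/healthcheck-client.key \
     https://127.0.0.1:2379/metrics | head -n 5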
2.2.1 Collecting node metrics
Prometheus can collect data from the various Kubernetes components, such as the cAdvisor built into the kubelet and the kube-apiserver; Node-Exporter is one more such source.
Exporter is the general term for Prometheus data-collection components: an exporter gathers data from its target and converts it into a format Prometheus can scrape.
Node metrics such as CPU, memory, disk, and I/O are collected by Node-Exporter.
2.2.1.1 Deploy Node-Exporter
- Prepare the YAML files
  - namespace.yaml
    All monitoring resources go into this namespace to keep them easy to manage.
apiVersion: v1
kind: Namespace
metadata:
  name: ns-monitor
  labels:
    name: ns-monitor
  - node-exporter.yaml
kind: DaemonSet
apiVersion: apps/v1
metadata:
  labels:
    app: node-exporter
  name: node-exporter
  namespace: ns-monitor
spec:
  revisionHistoryLimit: 10
  selector:
    matchLabels:
      app: node-exporter
  template:
    metadata:
      labels:
        app: node-exporter
    spec:
      containers:
      - name: node-exporter
        image: prom/node-exporter:v0.16.0
        ports:
        - containerPort: 9100
          protocol: TCP
          name: http
      hostNetwork: true
      hostPID: true
      tolerations:
      - effect: NoSchedule
        operator: Exists
---
# Node-Exporter does not strictly need to be reachable from outside the cluster,
# but we expose it here so each step can be verified before moving on
kind: Service
apiVersion: v1
metadata:
  labels:
    app: node-exporter
  name: node-exporter-service
  namespace: ns-monitor
spec:
  ports:
  - name: http
    port: 9100
    nodePort: 31672
    protocol: TCP
  type: NodePort
  selector:
    app: node-exporter
- Create the resources
[root@master-kubeadm-k8s prometheus]# kubectl apply -f node-exporter.yaml
daemonset.apps/node-exporter created
service/node-exporter-service created
- Check the resources
[root@master-kubeadm-k8s prometheus]# kubectl get pods -n ns-monitor
NAME                  READY   STATUS    RESTARTS   AGE
node-exporter-dsjbq   1/1     Running   0          2m32s
node-exporter-mdnrj   1/1     Running   0          2m32s
node-exporter-sxwxx   1/1     Running   0          2m32s

[root@master-kubeadm-k8s prometheus]# kubectl get svc -n ns-monitor
NAME                    TYPE       CLUSTER-IP     EXTERNAL-IP   PORT(S)          AGE
node-exporter-service   NodePort   10.109.226.6   <none>        9100:31672/TCP   2m46s
- Test
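A quick check that Node-Exporter answers on its NodePort (because of hostNetwork plus the NodePort Service, any node IP works; <node-ip> is a placeholder):

curl -s http://<node-ip>:31672/metrics | head -n 10   # should print node_* metrics in Prometheus text format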
2.2.2 Deploy Prometheus
- Prepare the YAML file
  - prometheus.yaml
    The Prometheus YAML is well worth studying; it brings together most of the commonly used resource types (RBAC objects, ConfigMaps, a PV/PVC, a Deployment, and a Service).
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
rules:
- apiGroups: [""] # "" indicates the core API group
  resources:
  - nodes
  - nodes/proxy
  - services
  - endpoints
  - pods
  verbs:
  - get
  - watch
  - list
- apiGroups:
  - extensions
  resources:
  - ingresses
  verbs:
  - get
  - watch
  - list
- nonResourceURLs: ["/metrics"]
  verbs:
  - get
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus
  namespace: ns-monitor
  labels:
    app: prometheus
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus
subjects:
- kind: ServiceAccount
  name: prometheus
  namespace: ns-monitor
roleRef:
  kind: ClusterRole
  name: prometheus
  apiGroup: rbac.authorization.k8s.io
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-conf
  namespace: ns-monitor
  labels:
    app: prometheus
data:
  prometheus.yml: |-
    # my global config
    global:
      scrape_interval: 15s     # Set the scrape interval to every 15 seconds. Default is every 1 minute.
      evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
      # scrape_timeout is set to the global default (10s).

    # Alertmanager configuration
    alerting:
      alertmanagers:
      - static_configs:
        - targets:
          # - alertmanager:9093

    # Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
    rule_files:
      # - "first_rules.yml"
      # - "second_rules.yml"

    # A scrape configuration containing exactly one endpoint to scrape:
    # Here it's Prometheus itself.
    scrape_configs:
      # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
      - job_name: 'prometheus'
        # metrics_path defaults to '/metrics'
        # scheme defaults to 'http'.
        static_configs:
          - targets: ['localhost:9090']

      - job_name: 'grafana'
        static_configs:
          - targets:
            - 'grafana-service.ns-monitor:3000'

      - job_name: 'kubernetes-apiservers'
        kubernetes_sd_configs:
        - role: endpoints
        # Default to scraping over https. If required, just disable this or change to
        # `http`.
        scheme: https
        # This TLS & bearer token file config is used to connect to the actual scrape
        # endpoints for cluster components. This is separate to discovery auth
        # configuration because discovery & scraping are two separate concerns in
        # Prometheus. The discovery auth config is automatic if Prometheus runs inside
        # the cluster. Otherwise, more config options have to be provided within the
        # <kubernetes_sd_config>.
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
          # If your node certificates are self-signed or use a different CA to the
          # master CA, then disable certificate verification below. Note that
          # certificate verification is an integral part of a secure infrastructure
          # so this should only be disabled in a controlled environment. You can
          # disable certificate verification by uncommenting the line below.
          #
          # insecure_skip_verify: true
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        # Keep only the default/kubernetes service endpoints for the https port. This
        # will add targets for each API server which Kubernetes adds an endpoint to
        # the default/kubernetes service.
        relabel_configs:
        - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
          action: keep
          regex: default;kubernetes;https

      # Scrape config for nodes (kubelet).
      #
      # Rather than connecting directly to the node, the scrape is proxied though the
      # Kubernetes apiserver. This means it will work if Prometheus is running out of
      # cluster, or can't connect to nodes for some other reason (e.g. because of
      # firewalling).
      - job_name: 'kubernetes-nodes'
        # Default to scraping over https. If required, just disable this or change to
        # `http`.
        scheme: https
        # This TLS & bearer token file config is used to connect to the actual scrape
        # endpoints for cluster components. This is separate to discovery auth
        # configuration because discovery & scraping are two separate concerns in
        # Prometheus. The discovery auth config is automatic if Prometheus runs inside
        # the cluster. Otherwise, more config options have to be provided within the
        # <kubernetes_sd_config>.
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        kubernetes_sd_configs:
        - role: node
        relabel_configs:
        - action: labelmap
          regex: __meta_kubernetes_node_label_(.+)
        - target_label: __address__
          replacement: kubernetes.default.svc:443
        - source_labels: [__meta_kubernetes_node_name]
          regex: (.+)
          target_label: __metrics_path__
          replacement: /api/v1/nodes/${1}/proxy/metrics

      # Scrape config for Kubelet cAdvisor.
      #
      # This is required for Kubernetes 1.7.3 and later, where cAdvisor metrics
      # (those whose names begin with 'container_') have been removed from the
      # Kubelet metrics endpoint. This job scrapes the cAdvisor endpoint to
      # retrieve those metrics.
      #
      # In Kubernetes 1.7.0-1.7.2, these metrics are only exposed on the cAdvisor
      # HTTP endpoint; use "replacement: /api/v1/nodes/${1}:4194/proxy/metrics"
      # in that case (and ensure cAdvisor's HTTP server hasn't been disabled with
      # the --cadvisor-port=0 Kubelet flag).
      #
      # This job is not necessary and should be removed in Kubernetes 1.6 and
      # earlier versions, or it will cause the metrics to be scraped twice.
      - job_name: 'kubernetes-cadvisor'
        # Default to scraping over https. If required, just disable this or change to
        # `http`.
        scheme: https
        # This TLS & bearer token file config is used to connect to the actual scrape
        # endpoints for cluster components. This is separate to discovery auth
        # configuration because discovery & scraping are two separate concerns in
        # Prometheus. The discovery auth config is automatic if Prometheus runs inside
        # the cluster. Otherwise, more config options have to be provided within the
        # <kubernetes_sd_config>.
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        kubernetes_sd_configs:
        - role: node
        relabel_configs:
        - action: labelmap
          regex: __meta_kubernetes_node_label_(.+)
        - target_label: __address__
          replacement: kubernetes.default.svc:443
        - source_labels: [__meta_kubernetes_node_name]
          regex: (.+)
          target_label: __metrics_path__
          replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor

      # Scrape config for service endpoints.
      #
      # The relabeling allows the actual service scrape endpoint to be configured
      # via the following annotations:
      #
      # * `prometheus.io/scrape`: Only scrape services that have a value of `true`
      # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need
      #   to set this to `https` & most likely set the `tls_config` of the scrape config.
      # * `prometheus.io/path`: If the metrics path is not `/metrics` override this.
      # * `prometheus.io/port`: If the metrics are exposed on a different port to the
      #   service then set this appropriately.
      - job_name: 'kubernetes-service-endpoints'
        kubernetes_sd_configs:
        - role: endpoints
        relabel_configs:
        - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
          action: keep
          regex: true
        - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
          action: replace
          target_label: __scheme__
          regex: (https?)
        - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
          action: replace
          target_label: __metrics_path__
          regex: (.+)
        - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
          action: replace
          target_label: __address__
          regex: ([^:]+)(?::\d+)?;(\d+)
          replacement: $1:$2
        - action: labelmap
          regex: __meta_kubernetes_service_label_(.+)
        - source_labels: [__meta_kubernetes_namespace]
          action: replace
          target_label: kubernetes_namespace
        - source_labels: [__meta_kubernetes_service_name]
          action: replace
          target_label: kubernetes_name

      # Example scrape config for probing services via the Blackbox Exporter.
      #
      # The relabeling allows the actual service scrape endpoint to be configured
      # via the following annotations:
      #
      # * `prometheus.io/probe`: Only probe services that have a value of `true`
      - job_name: 'kubernetes-services'
        metrics_path: /probe
        params:
          module: [http_2xx]
        kubernetes_sd_configs:
        - role: service
        relabel_configs:
        - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe]
          action: keep
          regex: true
        - source_labels: [__address__]
          target_label: __param_target
        - target_label: __address__
          replacement: blackbox-exporter.example.com:9115
        - source_labels: [__param_target]
          target_label: instance
        - action: labelmap
          regex: __meta_kubernetes_service_label_(.+)
        - source_labels: [__meta_kubernetes_namespace]
          target_label: kubernetes_namespace
        - source_labels: [__meta_kubernetes_service_name]
          target_label: kubernetes_name

      # Example scrape config for probing ingresses via the Blackbox Exporter.
      #
      # The relabeling allows the actual ingress scrape endpoint to be configured
      # via the following annotations:
      #
      # * `prometheus.io/probe`: Only probe services that have a value of `true`
      - job_name: 'kubernetes-ingresses'
        metrics_path: /probe
        params:
          module: [http_2xx]
        kubernetes_sd_configs:
        - role: ingress
        relabel_configs:
        - source_labels: [__meta_kubernetes_ingress_annotation_prometheus_io_probe]
          action: keep
          regex: true
        - source_labels: [__meta_kubernetes_ingress_scheme,__address__,__meta_kubernetes_ingress_path]
          regex: (.+);(.+);(.+)
          replacement: ${1}://${2}${3}
          target_label: __param_target
        - target_label: __address__
          replacement: blackbox-exporter.example.com:9115
        - source_labels: [__param_target]
          target_label: instance
        - action: labelmap
          regex: __meta_kubernetes_ingress_label_(.+)
        - source_labels: [__meta_kubernetes_namespace]
          target_label: kubernetes_namespace
        - source_labels: [__meta_kubernetes_ingress_name]
          target_label: kubernetes_name

      # Example scrape config for pods
      #
      # The relabeling allows the actual pod scrape endpoint to be configured via the
      # following annotations:
      #
      # * `prometheus.io/scrape`: Only scrape pods that have a value of `true`
      # * `prometheus.io/path`: If the metrics path is not `/metrics` override this.
      # * `prometheus.io/port`: Scrape the pod on the indicated port instead of the
      #   pod's declared ports (default is a port-free target if none are declared).
      - job_name: 'kubernetes-pods'
        kubernetes_sd_configs:
        - role: pod
        relabel_configs:
        - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
          action: keep
          regex: true
        - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
          action: replace
          target_label: __metrics_path__
          regex: (.+)
        - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
          action: replace
          regex: ([^:]+)(?::\d+)?;(\d+)
          replacement: $1:$2
          target_label: __address__
        - action: labelmap
          regex: __meta_kubernetes_pod_label_(.+)
        - source_labels: [__meta_kubernetes_namespace]
          action: replace
          target_label: kubernetes_namespace
        - source_labels: [__meta_kubernetes_pod_name]
          action: replace
          target_label: kubernetes_pod_name
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-rules
  namespace: ns-monitor
  labels:
    app: prometheus
data:
  cpu-usage.rule: |
    groups:
      - name: NodeCPUUsage
        rules:
          - alert: NodeCPUUsage
            expr: (100 - (avg by (instance) (irate(node_cpu{name="node-exporter",mode="idle"}[5m])) * 100)) > 75
            for: 2m
            labels:
              severity: "page"
            annotations:
              summary: "{{$labels.instance}}: High CPU usage detected"
              description: "{{$labels.instance}}: CPU usage is above 75% (current value is: {{ $value }})"
---
apiVersion: v1
kind: PersistentVolume
metadata:
  name: "prometheus-data-pv"
  labels:
    name: prometheus-data-pv
    release: stable
spec:
  capacity:
    storage: 5Gi
  accessModes:
    - ReadWriteOnce
  persistentVolumeReclaimPolicy: Recycle
  nfs:
    path: /nfs/data/prometheus   # Directory used for persistent storage
    server: 192.168.50.111       # NFS server
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: prometheus-data-pvc
  namespace: ns-monitor
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 5Gi
  selector:
    matchLabels:
      name: prometheus-data-pv
      release: stable
---
kind: Deployment
apiVersion: apps/v1
metadata:
  labels:
    app: prometheus
  name: prometheus
  namespace: ns-monitor
spec:
  replicas: 1
  revisionHistoryLimit: 10
  selector:
    matchLabels:
      app: prometheus
  template:
    metadata:
      labels:
        app: prometheus
    spec:
      serviceAccountName: prometheus
      securityContext:
        runAsUser: 0
      containers:
      - name: prometheus
        image: prom/prometheus:latest
        imagePullPolicy: IfNotPresent
        volumeMounts:
        - mountPath: /prometheus
          name: prometheus-data-volume
        - mountPath: /etc/prometheus/prometheus.yml
          name: prometheus-conf-volume
          subPath: prometheus.yml
        - mountPath: /etc/prometheus/rules
          name: prometheus-rules-volume
        ports:
        - containerPort: 9090
          protocol: TCP
      volumes:
      - name: prometheus-data-volume
        persistentVolumeClaim:
          claimName: prometheus-data-pvc
      - name: prometheus-conf-volume
        configMap:
          name: prometheus-conf
      - name: prometheus-rules-volume
        configMap:
          name: prometheus-rules
      tolerations:
      - key: node-role.kubernetes.io/master
        effect: NoSchedule
---
kind: Service
apiVersion: v1
metadata:
  annotations:
    prometheus.io/scrape: 'true'
  labels:
    app: prometheus
  name: prometheus-service
  namespace: ns-monitor
spec:
  ports:
  - port: 9090
    targetPort: 9090
  selector:
    app: prometheus
  type: NodePort
- Create the resources
[root@master-kubeadm-k8s prometheus]# kubectl apply -f prometheus.yaml
clusterrole.rbac.authorization.k8s.io/prometheus created
serviceaccount/prometheus created
clusterrolebinding.rbac.authorization.k8s.io/prometheus created
configmap/prometheus-conf created
configmap/prometheus-rules created
persistentvolume/prometheus-data-pv created
persistentvolumeclaim/prometheus-data-pvc created
deployment.apps/prometheus created
service/prometheus-service created
- Check the resources
[root@master-kubeadm-k8s prometheus]# kubectl get pods -n ns-monitor
NAME                          READY   STATUS    RESTARTS   AGE
node-exporter-dsjbq           1/1     Running   1          26m
node-exporter-mdnrj           1/1     Running   1          26m
node-exporter-sxwxx           1/1     Running   2          26m
prometheus-5f7cb6d955-mm8d2   1/1     Running   1          28s

[root@master-kubeadm-k8s prometheus]# kubectl get pv -n ns-monitor
NAME                 CAPACITY   ACCESS MODES   RECLAIM POLICY   STATUS   CLAIM                            STORAGECLASS   REASON   AGE
prometheus-data-pv   5Gi        RWO            Recycle          Bound    ns-monitor/prometheus-data-pvc                           53s

[root@master-kubeadm-k8s prometheus]# kubectl get pvc -n ns-monitor
NAME                  STATUS   VOLUME               CAPACITY   ACCESS MODES   STORAGECLASS   AGE
prometheus-data-pvc   Bound    prometheus-data-pv   5Gi        RWO                           60s

[root@master-kubeadm-k8s prometheus]# kubectl get svc -n ns-monitor
NAME                    TYPE       CLUSTER-IP       EXTERNAL-IP   PORT(S)          AGE
node-exporter-service   NodePort   10.109.226.6     <none>        9100:31672/TCP   27m
prometheus-service      NodePort   10.101.128.125   <none>        9090:31615/TCP   64s
- Test
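Prometheus should now be reachable on the NodePort shown above (31615 in this run; <node-ip> is a placeholder):

curl -s http://<node-ip>:31615/-/healthy   # the built-in health endpoint replies that Prometheus is healthy
# or open http://<node-ip>:31615 in a browser and check Status -> Targets;
# the kubernetes-apiservers, kubernetes-nodes and kubernetes-cadvisor jobs should show as UP once discovery settles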
2.2.4 Deploy the Grafana monitoring UI
- Prepare the YAML files
  - grafana.yaml
apiVersion: v1
kind: PersistentVolume
metadata:
  name: "grafana-data-pv"
  labels:
    name: grafana-data-pv
    release: stable
spec:
  capacity:
    storage: 5Gi
  accessModes:
    - ReadWriteOnce
  persistentVolumeReclaimPolicy: Recycle
  nfs:
    path: /nfs/data/grafana
    server: 192.168.50.111
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: grafana-data-pvc
  namespace: ns-monitor
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 5Gi
  selector:
    matchLabels:
      name: grafana-data-pv
      release: stable
---
kind: Deployment
apiVersion: apps/v1
metadata:
  labels:
    app: grafana
  name: grafana
  namespace: ns-monitor
spec:
  replicas: 1
  revisionHistoryLimit: 10
  selector:
    matchLabels:
      app: grafana
  template:
    metadata:
      labels:
        app: grafana
    spec:
      securityContext:
        runAsUser: 0
      containers:
      - name: grafana
        image: grafana/grafana:latest
        imagePullPolicy: IfNotPresent
        env:
        - name: GF_AUTH_BASIC_ENABLED
          value: "true"
        - name: GF_AUTH_ANONYMOUS_ENABLED
          value: "false"
        readinessProbe:
          httpGet:
            path: /login
            port: 3000
        volumeMounts:
        - mountPath: /var/lib/grafana
          name: grafana-data-volume
        ports:
        - containerPort: 3000
          protocol: TCP
      volumes:
      - name: grafana-data-volume
        persistentVolumeClaim:
          claimName: grafana-data-pvc
---
kind: Service
apiVersion: v1
metadata:
  labels:
    app: grafana
  name: grafana-service
  namespace: ns-monitor
spec:
  ports:
  - port: 3000
    targetPort: 3000
  selector:
    app: grafana
  type: NodePort
  - grafana-ingress.yaml
# Ingress
apiVersion: extensions/v1beta1
kind: Ingress
metadata:
  name: grafana-ingress
  namespace: ns-monitor
spec:
  rules:
  - host: monitor.k8s.sunny.com
    http:
      paths:
      - path: /
        backend:
          serviceName: grafana-service
          servicePort: 3000
- Create the resources
[root@master-kubeadm-k8s prometheus]# kubectl apply -f grafana.yaml
persistentvolume/grafana-data-pv created
persistentvolumeclaim/grafana-data-pvc created
deployment.apps/grafana created
service/grafana-service created

[root@master-kubeadm-k8s prometheus]# kubectl apply -f grafana-ingress.yaml
ingress.extensions/grafana-ingress created
- Check the resources
[root@master-kubeadm-k8s prometheus]# kubectl get deploy -n ns-monitor
NAME         READY   UP-TO-DATE   AVAILABLE   AGE
grafana      1/1     1            1           2m52s
prometheus   1/1     1            1           6m41s

[root@master-kubeadm-k8s prometheus]# kubectl get pv -n ns-monitor
NAME                 CAPACITY   ACCESS MODES   RECLAIM POLICY   STATUS   CLAIM                            STORAGECLASS   REASON   AGE
grafana-data-pv      5Gi        RWO            Recycle          Bound    ns-monitor/grafana-data-pvc                              3m10s
prometheus-data-pv   5Gi        RWO            Recycle          Bound    ns-monitor/prometheus-data-pvc                           7m

[root@master-kubeadm-k8s prometheus]# kubectl get pvc -n ns-monitor
NAME                  STATUS   VOLUME               CAPACITY   ACCESS MODES   STORAGECLASS   AGE
grafana-data-pvc      Bound    grafana-data-pv      5Gi        RWO                           3m14s
prometheus-data-pvc   Bound    prometheus-data-pv   5Gi        RWO                           7m4s

[root@master-kubeadm-k8s prometheus]# kubectl get svc -n ns-monitor
NAME                    TYPE       CLUSTER-IP       EXTERNAL-IP   PORT(S)          AGE
grafana-service         NodePort   10.111.192.206   <none>        3000:31828/TCP   3m5s
node-exporter-service   NodePort   10.109.226.6     <none>        9100:31672/TCP   33m
prometheus-service      NodePort   10.101.128.125   <none>        9090:31615/TCP   7m4s

[root@master-kubeadm-k8s prometheus]# kubectl get ingress -n ns-monitor
NAME              CLASS    HOSTS                   ADDRESS   PORTS   AGE
grafana-ingress   <none>   monitor.k8s.sunny.com             80      2m15s
- Test
  The default username and password are both admin.
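A sketch of the first-time setup (the node IP is a placeholder; the Ingress host comes from grafana-ingress.yaml and the data-source URL is the in-cluster Prometheus Service defined earlier):

echo "<node-ip>  monitor.k8s.sunny.com" >> /etc/hosts          # on the machine you browse from
# open http://monitor.k8s.sunny.com (or http://<node-ip>:31828 via the NodePort), log in with admin/admin,
# add a Prometheus data source pointing at http://prometheus-service.ns-monitor:9090,
# then import a node-exporter dashboard to chart the node metrics collected above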