基于文件的服务发现
prometheus配置
# File-based service discovery: Prometheus reads targets from the listed
# files (globs allowed) and hot-reloads them on change.
- job_name: 'file_sd'
  file_sd_configs:
  - files: ['/opt/monitor/prometheus/sd_config/*.yml']
文件配置
[root@prometheus sd_config]# vi test.yml
- targets: ['192.168.153.17:8080','192.168.153.20:8080']
基于Consul的服务发现
架构图
启动consul服务
[root@prometheus ~]# docker run --name consul -d -p 8500:8500 consul
http://192.168.153.20:8500
在consul中注册服务
# Register service "Linux" (instance Linux-1) in Consul, with an HTTP health
# check on the node_exporter port polled every 5s. The original was wrapped
# across lines without continuations and could not be pasted as-is.
curl -X PUT \
  -d '{"id": "Linux-1","name": "Linux","address": "192.168.153.20","port": 9100,"tags": ["service"],"checks": [{"http": "http://192.168.153.20:9100","interval": "5s"}]}' \
  http://192.168.153.20:8500/v1/agent/service/register
prometheus配置
# Consul-based service discovery: scrape every instance registered under the
# listed Consul services. (job_name typo 'conful_sd' fixed.)
- job_name: 'consul_sd'
  consul_sd_configs:
  - server: 192.168.153.20:8500
    services: ['Linux']
Ansible+Consul实现100台主机自动监控
基本步骤
ansible批量部署node_exporter
基于consul的服务发现
将node_exporter所在机器的IP和端口注册到consul里
prometheus从consul获取所有ip和端口自动加入监控
分组样式
id:web1,name:webservers
id:web2,name:webservers
id:db1,name:dbservers
id:db2,name:dbservers
# One scrape job per Consul service name, so web and db hosts land in
# separate target groups (matching the id/name registration scheme above).
- job_name: 'webservers'
  consul_sd_configs:
  - server: 192.168.153.20:8500
    services: ['webservers']
- job_name: 'dbservers'
  consul_sd_configs:
  - server: 192.168.153.20:8500
    services: ['dbservers']
node_exporter.service
[root@ansible ansible]# vi node_exporter.service
[Unit]
# Prometheus node_exporter host-metrics agent.
# NOTE(review): no After=network-online.target — unit may start before the
# network is up; confirm that is acceptable for this environment.
Description=node_exporter
[Service]
# Binary is unpacked to /usr/local/node_exporter by the Ansible playbook below.
ExecStart=/usr/local/node_exporter/node_exporter
# Reload by sending SIGHUP to the main process.
ExecReload=/bin/kill -HUP $MAINPID
# Only the main process is killed on stop; children are left running.
KillMode=process
# Restart automatically on abnormal exit.
Restart=on-failure
[Install]
WantedBy=multi-user.target
hosts
[root@ansible ansible]# vi hosts
# Ansible inventory. The per-host 'name' variable becomes the Consul instance
# id via the playbook's registration task.
# NOTE(review): plaintext SSH passwords in an inventory are a security risk —
# prefer SSH keys or ansible-vault for real deployments.
[webservers]
192.168.153.17 name=web1 ansible_ssh_user=root ansible_ssh_pass='root'
[dbservers]
192.168.153.20 name=db1 ansible_ssh_user=root ansible_ssh_pass='root'
consul-register.sh
[root@ansible ansible]# vi consul-register.sh
#!/bin/bash
# Register a service instance (e.g. a node_exporter endpoint) in Consul,
# with an HTTP health check polled every 5s.
# Usage: consul-register.sh <service_name> <instance_id> <ip> <port>
# The Consul agent address defaults to 192.168.153.20:8500 and can be
# overridden via the CONSUL_ADDR environment variable.
set -euo pipefail

# Refuse to register a broken/empty service when arguments are missing.
if [ $# -ne 4 ]; then
  echo "Usage: $0 <service_name> <instance_id> <ip> <port>" >&2
  exit 2
fi

service_name=$1
instance_id=$2
ip=$3
port=$4
consul_addr=${CONSUL_ADDR:-192.168.153.20:8500}

curl -s -X PUT \
  -d '{"id": "'"$instance_id"'","name": "'"$service_name"'","address": "'"$ip"'","port": '"$port"',"tags": ["'"$service_name"'"],"checks": [{"http": "http://'"$ip"':'"$port"'","interval": "5s"}]}' \
  "http://$consul_addr/v1/agent/service/register"
playbook.yaml
[root@ansible ansible]# vi playbook.yaml
# Deploy node_exporter to all dbservers and register each host in Consul.
- hosts: dbservers
  gather_facts: no
  tasks:
  - name: 推送二进制文件
    unarchive: src=node_exporter-1.0.1.linux-amd64.tar.gz dest=/usr/local
  - name: 重命名
    # Fixes two bugs in the original: '[! -d node_exporter]' was a shell
    # syntax error (missing spaces inside the test brackets), and the mv
    # renamed the tarball instead of the directory that unarchive extracted.
    shell: |
      cd /usr/local
      if [ ! -d node_exporter ]; then
        mv node_exporter-1.0.1.linux-amd64 node_exporter
      fi
  - name: 复制systemd文件
    copy: src=node_exporter.service dest=/usr/lib/systemd/system
  - name: 启动服务
    systemd: name=node_exporter state=restarted daemon_reload=yes
  - name: 推送注册脚本
    copy: src=consul-register.sh dest=/usr/local/bin/
  - name: 注册当前节点
    # Service name = inventory group, instance id = per-host 'name' var,
    # address = inventory hostname (IP), port = node_exporter default 9100.
    shell: /bin/bash /usr/local/bin/consul-register.sh {{group_names[0]}} {{name}} {{inventory_hostname}} 9100
[root@ansible ansible]# vi playbook.yaml
# Deploy node_exporter to all webservers and register each host in Consul.
- hosts: webservers
  gather_facts: no
  tasks:
  - name: 推送二进制文件
    unarchive: src=node_exporter-1.0.1.linux-amd64.tar.gz dest=/usr/local
  - name: 重命名
    # Fixes two bugs in the original: '[! -d node_exporter]' was a shell
    # syntax error (missing spaces inside the test brackets), and the mv
    # renamed the tarball instead of the directory that unarchive extracted.
    shell: |
      cd /usr/local
      if [ ! -d node_exporter ]; then
        mv node_exporter-1.0.1.linux-amd64 node_exporter
      fi
  - name: 复制systemd文件
    copy: src=node_exporter.service dest=/usr/lib/systemd/system
  - name: 启动服务
    systemd: name=node_exporter state=restarted daemon_reload=yes
  - name: 推送注册脚本
    copy: src=consul-register.sh dest=/usr/local/bin/
  - name: 注册当前节点
    # Service name = inventory group, instance id = per-host 'name' var,
    # address = inventory hostname (IP), port = node_exporter default 9100.
    shell: /bin/bash /usr/local/bin/consul-register.sh {{group_names[0]}} {{name}} {{inventory_hostname}} 9100
执行playbook(webservers,dbservers)
[root@ansible ansible]# ansible-playbook playbook.yaml -i hosts
监控Kubernetes
架构图
K8s RBAC授权
[root@k8smaster prometheus]# kubectl apply -f rbac.yaml
获取Token并保存到文件
#获取prometheus的secrets:- name
[root@k8smaster prometheus]# kubectl get sa prometheus -n kube-system -o yaml
...
secrets:
- name: prometheus-token-r8xrb
#生成token密钥文件
[root@k8smaster prometheus]# kubectl describe secret prometheus-token-r8xrb -n kube-system
[root@k8smaster prometheus]# kubectl describe secret prometheus-token-r8xrb -n kube-system > token.k8s
#修改token.k8s,只保留token
#将文件复制到prometheus文件下
[root@k8smaster prometheus]# cp -r token.k8s /opt/monitor/prometheus/
获取k8s的master地址
[root@k8smaster ~]# kubectl get ep
NAME ENDPOINTS AGE
kubernetes 192.168.153.21:6443 11h
监控K8s集群Pod
整体流程
#prometheus从APIServer获取信息
prometheus - APIServer(192.168.153.21:6443)
#APIServer从kubelet自带的cadvisor获取相应指标
APIServer(192.168.153.21:6443) - kubelet(cadvisor)
-----------------------------------------------------------------------
prometheus - APIServer(192.168.153.21:6443) - kubelet(cadvisor)
创建Job和kubernetes_sd_configs
# Scrape kubelet's built-in cadvisor metrics for every node, proxied through
# the APIServer. (Original snippet had its YAML indentation flattened.)
- job_name: kubernetes-nodes-cadvisor
  metrics_path: /metrics
  scheme: https
  kubernetes_sd_configs:
  - role: node
    # Credentials for the service-discovery calls to the APIServer.
    api_server: https://192.168.153.21:6443
    bearer_token_file: /opt/monitor/prometheus/token.k8s
    tls_config:
      insecure_skip_verify: true
  # Credentials for the scrape requests themselves (also via the APIServer).
  bearer_token_file: /opt/monitor/prometheus/token.k8s
  tls_config:
    insecure_skip_verify: true
  relabel_configs:
  # Copy every node label onto the target as a new label, values unchanged.
  - action: labelmap
    regex: __meta_kubernetes_node_label_(.*)
  # Rewrite NodeIP:10250 to the APIServer address 192.168.153.21:6443.
  - action: replace
    regex: (.*)
    source_labels: ["__address__"]
    target_label: __address__
    replacement: 192.168.153.21:6443
  # The real endpoint https://NodeIP:10250/metrics/cadvisor is only reachable
  # by the APIServer, so rewrite the path to use the APIServer node proxy.
  - action: replace
    source_labels: [__meta_kubernetes_node_name]
    target_label: __metrics_path__
    regex: (.*)
    replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
---------------------------------------------------------------------------------
注意:master的ip地址
token.k8s的位置
Grafana导入仪表盘
监控K8s资源对象状态
整体流程
prometheus - [kube-state-metrics]（采集K8s资源对象状态指标）
安装部署kube-state-metrics
#启动kube-state-metrics服务
[root@k8smaster prometheus]# kubectl apply -f kube-state-metrics.yaml
#查看服务状态
[root@k8smaster prometheus]# kubectl get pods -n kube-system
kube-state-metrics-5d8b77488c-9hb5j 2/2 Running 0 112s
[root@k8smaster prometheus]# kubectl describe pod kube-state-metrics-5d8b77488c-9hb5j -n kube-system
[root@k8smaster prometheus]# kubectl get pods -n kube-system -o wide
kube-state-metrics-5d8b77488c-9hb5j 2/2 Running 0 7m39s 10.244.249.6 k8snode1 <none> <none>
[root@k8smaster prometheus]# curl 10.244.249.6:8080/metrics
---------------------------------------------------------------------------
注意:10.244.249.6:8080中的8080是kube-state-metrics.yaml文件里设置的http-metrics
创建Job和kubernetes_sd_configs
# Scrape annotated service endpoints (e.g. kube-state-metrics).
# (Original snippet had its YAML indentation flattened.)
- job_name: kubernetes-service-endpoints
  kubernetes_sd_configs:
  - role: endpoints
    # Credentials for the service-discovery calls to the APIServer.
    api_server: https://192.168.153.21:6443
    bearer_token_file: /opt/monitor/prometheus/token.k8s
    tls_config:
      insecure_skip_verify: true
  # Credentials for the scrape requests themselves.
  bearer_token_file: /opt/monitor/prometheus/token.k8s
  tls_config:
    insecure_skip_verify: true
  relabel_configs:
  # Drop services that do not carry the prometheus.io/scrape annotation.
  - action: keep
    regex: true
    source_labels:
    - __meta_kubernetes_service_annotation_prometheus_io_scrape
  # Honour the prometheus.io/scheme annotation (http or https).
  - action: replace
    regex: (https?)
    source_labels:
    - __meta_kubernetes_service_annotation_prometheus_io_scheme
    target_label: __scheme__
  # Honour the prometheus.io/path annotation for the metrics URL path.
  - action: replace
    regex: (.+)
    source_labels:
    - __meta_kubernetes_service_annotation_prometheus_io_path
    target_label: __metrics_path__
  # Rewrite the target address to use the prometheus.io/port annotation.
  - action: replace
    regex: ([^:]+)(?::\d+)?;(\d+)
    replacement: $1:$2
    source_labels:
    - __address__
    - __meta_kubernetes_service_annotation_prometheus_io_port
    target_label: __address__
  # Copy every K8s service label onto the target, values unchanged.
  - action: labelmap
    regex: __meta_kubernetes_service_label_(.+)
  # Add the namespace as a label.
  - action: replace
    source_labels:
    - __meta_kubernetes_namespace
    target_label: kubernetes_namespace
  # Add the service name as a label.
  - action: replace
    source_labels:
    - __meta_kubernetes_service_name
    target_label: kubernetes_service_name
--------------------------------------------------------------------------
注意:master的ip地址
token.k8s的位置
Grafana导入仪表盘