1、下载并解压安装包
# Prometheus server v2.41.0 release tarball (linux-amd64)
wget https://github.com/prometheus/prometheus/releases/download/v2.41.0/prometheus-2.41.0.linux-amd64.tar.gz
# Alertmanager v0.25.0 release tarball (linux-amd64)
wget https://github.com/prometheus/alertmanager/releases/download/v0.25.0/alertmanager-0.25.0.linux-amd64.tar.gz
# node_exporter v1.5.0 release tarball (linux-amd64)
wget https://github.com/prometheus/node_exporter/releases/download/v1.5.0/node_exporter-1.5.0.linux-amd64.tar.gz
2、安装prometheus-server
2.1、配置prometheus-server
# Global defaults applied to every scrape job and rule group.
global:
  # Scrape targets every 10 seconds (Prometheus default: 1 minute).
  scrape_interval: 10s
  # Evaluate alerting/recording rules every 10 seconds (default: 1 minute).
  evaluation_interval: 10s

# Address of the Alertmanager instance that receives fired alerts.
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['127.0.0.1:9093']

# Alerting rule files; every *.yml file in this directory is loaded.
rule_files:
  - "/opt/prometheus/rules_conf/*.yml"

scrape_configs:
  # The Prometheus server scrapes its own metrics endpoint.
  - job_name: "prometheus"
    static_configs:
      - targets: ["localhost:9090"]

  # Service discovery can be done in several ways; this job discovers
  # hosts from a target file that is re-read every minute.
  - job_name: "node-exporter-discovery"
    file_sd_configs:
      - refresh_interval: 1m
        files:
          - /opt/prometheus/node_conf/node_exporter.yaml
node_exporter.yaml 配置文件说明(即上面 file_sd_configs 引用的 /opt/prometheus/node_conf/node_exporter.yaml)
# File-based service discovery target group: each list entry is one group
# of targets sharing the labels below.
- targets:
    - '127.0.0.1:9100'
  labels:
    # Extra label attached to every series scraped from these targets.
    idc: prd
2.3、启动prometheus-server
2.3.1、添加服务
# Write the systemd unit for the Prometheus server.
# NOTE(review): assumes the prometheus user/group already exist and can write
# to /data1/prometheus/data - confirm before starting the service.
# The size-based retention uses the documented unit suffix "GB"
# (supported units: B, KB, MB, GB, TB, PB, EB).
cat > /usr/lib/systemd/system/prometheus.service << 'EOF'
[Unit]
Description=Prometheus
After=network.target
[Service]
User=prometheus
Group=prometheus
ExecStart=/opt/prometheus/prometheus --config.file=/opt/prometheus/prometheus.yml --web.enable-lifecycle --storage.tsdb.path=/data1/prometheus/data --storage.tsdb.retention.time=7d --storage.tsdb.retention.size=10GB --log.level=info
[Install]
WantedBy=multi-user.target
EOF
备注说明
--web.enable-lifecycle #开启url刷新配置功能
--storage.tsdb.max-block-duration=2d #配置tsdb最大文件块时长2d
--storage.tsdb.min-block-duration=2h #配置tsdb最小文件块时长
--storage.tsdb.retention.time=15d # tsdb 保存的数据时长,默认15d(旧参数 --storage.tsdb.retention 已废弃,请使用 --storage.tsdb.retention.time)
这一套参数解决了tsdb文件块mmap 内存不够的问题
2.3.2、更新启动服务
# Reload systemd so it picks up the newly written unit file.
systemctl daemon-reload
# Enable the service at boot; --now also starts it immediately.
systemctl enable prometheus.service --now
# Redundant after --now; harmless if the service is already running.
systemctl start prometheus.service
3、安装alertManager
3.1、配置alertManager
# Root route: every alert is grouped and sent to the single webhook receiver.
route:
  # Group alerts by label. This is grouping only: no severity tiers and
  # no inhibition rules are configured.
  group_by: ['alertname']
  # How long to buffer alerts of a new group before the first notification.
  group_wait: 10s
  # How long to wait before notifying about new alerts added to a group
  # that has already been notified.
  group_interval: 10s
  # How long to wait before re-sending a notification that is still firing.
  # Original author's note: 10 minutes is enough for production.
  repeat_interval: 2d
  # Receiver used for notifications (defined under "receivers" below).
  receiver: 'web.hook'
receivers:
  - name: 'web.hook'
    webhook_configs:
      # Endpoint of the local notification relay script.
      - url: 'http://127.0.0.1:5210/'
3.2、添加服务
# Write the systemd unit for Alertmanager; it runs as the prometheus user
# with the config file under /opt/prometheus-alertmanager.
# NOTE(review): no --storage.path is passed, so Alertmanager uses its default
# data directory relative to the service's working directory - confirm that
# location is writable by the prometheus user.
cat > /usr/lib/systemd/system/alertmanager.service << 'EOF'
[Unit]
Description=alertmanager
After=network.target
[Service]
User=prometheus
Group=prometheus
ExecStart=/opt/prometheus-alertmanager/alertmanager --config.file=/opt/prometheus-alertmanager/alertmanager.yml
[Install]
WantedBy=multi-user.target
EOF
3.3、更新启动服务
# Reload systemd so it picks up the newly written unit file.
systemctl daemon-reload
# Enable the service at boot; --now also starts it immediately.
systemctl enable alertmanager.service --now
# Redundant after --now; harmless if the service is already running.
systemctl start alertmanager.service
4、告警方式服务说明
需要python3环境,安装参考linux安装miniconda3
这里使用了一个简单的python 脚本,依赖说明: https://github.com/keijack/python-simple-http-server
安装说明:https://pypi.org/project/simple-http-server
脚本内容:主要功能是接收 alertManager 的 webhook 请求,将请求转换成通知中心可接受的格式后转发。alertManager 不支持直接调用脚本的方式
from simple_http_server import route, server
from simple_http_server import Request
import json
import requests
@route("/", method=["GET", "POST", "PUT"])
def index(req=Request()):
    """Relay an Alertmanager webhook call to the notification center.

    Parses the JSON request body, collects the host part of each alert's
    ``instance`` label, takes the ``summary`` annotation of the first alert,
    and POSTs both as SMS template parameters to the notification endpoint.

    Args:
        req: Incoming request injected by simple_http_server; its ``body``
            is expected to hold the UTF-8 encoded JSON payload.

    Returns:
        ``{"msg": "alert success"}`` when the notification endpoint answers
        with HTTP 200, otherwise ``{"msg": "alert failed"}``.

    Raises:
        requests.RequestException: if the notification endpoint cannot be
            reached or does not answer within the timeout. This is not
            caught here, so it propagates to the HTTP framework.
    """
    alert_url = "http://xxxxxx.com/noticeSend"
    headers = {"Content-Type": "application/json"}

    request_data = json.loads(str(req.body, "utf-8"))
    alerts_data = request_data.get("alerts") or []
    if not alerts_data:
        # Nothing to forward; avoid indexing into an empty alert list.
        return {"msg": "alert failed"}

    # "host:port" -> "host"; alerts without an instance label (for example
    # aggregated rules) are reported as "unknown" instead of raising KeyError.
    ip_str = ",".join(
        item.get("labels", {}).get("instance", "unknown").split(":")[0]
        for item in alerts_data
    )
    # All alerts in one notification share a group, so the first summary
    # is used as the message text.
    content = alerts_data[0].get("annotations", {}).get("summary", "")

    payload = {
        "userIds": "123123",
        "sms": {"templateParamList": [ip_str, content]},
    }
    body = json.dumps(payload)
    print(body)

    # A timeout keeps a hung notification endpoint from blocking the
    # webhook handler indefinitely.
    alert_req = requests.post(alert_url, body, headers=headers, timeout=10)
    if alert_req.status_code == 200:
        return {"msg": "alert success"}
    return {"msg": "alert failed"}
def main(*args):
    """Start the webhook HTTP server on port 5210.

    The port matches the webhook URL configured in alertmanager.yml.
    Positional arguments are accepted but ignored.
    """
    server.start(port=5210)


if __name__ == "__main__":
    main()
5、nodeExporter安装
5.1、解压nodeExport到指定位置
5.2、添加nodeExport服务
# Write the systemd unit for node_exporter; it runs as the prometheus user
# from /opt/node_exporter with default command-line options.
cat > /usr/lib/systemd/system/nodeExporter.service << 'EOF'
[Unit]
Description=nodeExporter
After=network.target
[Service]
User=prometheus
Group=prometheus
ExecStart=/opt/node_exporter/node_exporter
[Install]
WantedBy=multi-user.target
EOF
5.3、更新启动服务
# Reload systemd so it picks up the newly written unit file.
systemctl daemon-reload
# Enable the service at boot; --now also starts it immediately.
systemctl enable nodeExporter.service --now
# Redundant after --now; harmless if the service is already running.
systemctl start nodeExporter.service
6、告警规则配置
6.1、修改告警规则
根据prometheus server的配置修改。这里用的general.yml配置文件
这个文档比较全https://awesome-prometheus-alerts.grep.to/rules.html#host-and-hardware
demo:这是nodeExporter的主机告警规则,可以自行删减
# Host-level alerting rules based on node_exporter metrics.
# Each rule fires once its expression has been true for the "for" duration.
groups:
  - name: NodeExporter
    rules:
      # --- Memory ---
      - alert: HostOutOfMemory
        expr: 'node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host out of memory
          description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # --- Network throughput ---
      - alert: HostUnusualNetworkThroughputIn
        expr: 'sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Hostnetworkthroughput
          description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: HostUnusualNetworkThroughputOut
        expr: 'sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: unusual network throughput out
          description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # --- Disk throughput ---
      - alert: HostUnusualDiskReadRate
        expr: 'sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: unusual disk read rate
          description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: HostUnusualDiskWriteRate
        expr: 'sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: unusual disk write rate
          description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # --- Disk space and inodes (read-only filesystems are excluded) ---
      - alert: HostOutOfDiskSpace
        expr: '(node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: out of disk space
          description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: HostDiskWillFillIn24Hours
        expr: '(node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: disk will fill
          description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: HostOutOfInodes
        expr: 'node_filesystem_files_free / node_filesystem_files * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: out of inodes
          description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: HostInodesWillFillIn24Hours
        expr: 'node_filesystem_files_free / node_filesystem_files * 100 < 10 and predict_linear(node_filesystem_files_free[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: inodes will fill
          description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # --- Disk latency ---
      - alert: HostUnusualDiskReadLatency
        expr: 'rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: unusual disk read latency
          description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: HostUnusualDiskWriteLatency
        expr: 'rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0'
        for: 20m
        labels:
          severity: warning
        annotations:
          summary: unusual disk write latency
          description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # --- CPU ---
      - alert: HostHighCpuLoad
        expr: '100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: high CPU load
          description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: HostCpuIsUnderUtilized
        expr: '100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20'
        for: 1w
        labels:
          severity: info
        annotations:
          summary: CPU is under utilized
          description: "CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: HostCpuStealNoisyNeighbor
        expr: 'avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: CPU steal noisy neighbor
          description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: HostCpuHighIowait
        expr: 'avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 5'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host CPU high iowait
          description: "CPU iowait > 5%. A high iowait means that you are disk or network bound.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: HostContextSwitching
        expr: '(rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 1000'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host context switching
          description: "Context switching is growing on node (> 1000 / s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # --- Swap, RAID, OOM, ECC memory, reboot ---
      - alert: HostSwapIsFillingUp
        expr: '(1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host swap is filling up
          description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: HostRaidDiskFailure
        expr: 'node_md_disks{state="failed"} > 0'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host RAID disk failure
          description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: HostOomKillDetected
        expr: 'increase(node_vmstat_oom_kill[1m]) > 0'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: OOM kill detected
          description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: HostEdacCorrectableErrorsDetected
        # The description states the same 1-minute window the expression uses.
        expr: 'increase(node_edac_correctable_errors_total[1m]) > 0'
        for: 0m
        labels:
          severity: info
        annotations:
          summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
          description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last minute.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: HostRequiresReboot
        expr: 'node_reboot_required > 0'
        for: 4h
        labels:
          severity: info
        annotations:
          summary: Host requires reboot (instance {{ $labels.instance }})
          description: "{{ $labels.instance }} requires a reboot.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # --- Target availability ---
      # The expression has no job selector, so it fires for any scrape
      # target that is down, not only node_exporter instances.
      - alert: node-exporter-down
        expr: up == 0
        for: 1m
        labels:
          severity: info
        annotations:
          summary: " {{ $labels.instance }} 宕机了"
          description: "instance: {{ $labels.instance }} \n- job: {{ $labels.job }} 关机了, 时间已经1分钟了。"
          value: "{{ $value }}"
          instance: "{{ $labels.instance }}"
6.2、更新告警规则
这个有个前提是开启了 --web.enable-lifecycle 刷新更新参数,如果没有该参数必须要重启prometheus-server更新
# Ask the local Prometheus server to reload its configuration and rule files
# over HTTP (only works when it was started with --web.enable-lifecycle).
curl -XPOST http://127.0.0.1:9090/-/reload
end,成功收到磁盘满的告警
附录: grafana安装https://www.jianshu.com/p/bd8293455c1d