1. Prometheus download URL: https://realinstall-package.oss-cn-beijing.aliyuncs.com/package/prometheus-2.21.0-rc.0.linux-amd64.tar.gz
2. Pushgateway download URL: https://realinstall-package.oss-cn-beijing.aliyuncs.com/package/pushgateway-0.4.0.linux-amd64.tar.gz
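Both packages can be fetched directly on the server with wget, assuming it has outbound access to the OSS bucket above:
wget https://realinstall-package.oss-cn-beijing.aliyuncs.com/package/prometheus-2.21.0-rc.0.linux-amd64.tar.gz
wget https://realinstall-package.oss-cn-beijing.aliyuncs.com/package/pushgateway-0.4.0.linux-amd64.tar.gz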
Prometheus installation:
tar zxvf prometheus-2.21.0-rc.0.linux-amd64.tar.gz
mv prometheus-2.21.0-rc.0.linux-amd64 prometheus   # the tarball extracts into a versioned directory; rename it to match the paths used below
cd prometheus
mv prometheus.yml prometheus.yml.backup
Download prometheus.yml and the alert rules (run these in the Prometheus root directory):
wget https://raw.githubusercontent.com/milvus-io/docs/master/v0.10.2/assets/monitoring/prometheus.yml -O prometheus.yml
wget -P rules https://raw.githubusercontent.com/milvus-io/docs/master/v0.10.2/assets/monitoring/alert_rules.yml
Pushgateway installation via Docker image:
docker pull prom/pushgateway
docker run -d -p 9091:9091 prom/pushgateway
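To verify Pushgateway is reachable before wiring it into Prometheus, push a throwaway metric with curl; the job name test_job and the metric test_metric below are placeholders, not part of the real setup:
echo "test_metric 42" | curl --data-binary @- http://localhost:9091/metrics/job/test_job
curl -s http://localhost:9091/metrics | grep test_metric   # the pushed value should show up here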
Edit the configuration file prometheus.yml:
global:
  scrape_interval: 2s     # Set the scrape interval to every 2 seconds. The default is every 1 minute.
  evaluation_interval: 2s # Evaluate rules every 2 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
  - static_configs:
    - targets: ['localhost:9093']
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "rules/alert_rules.yml" # alerting rules downloaded into the rules/ directory above
# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any time series scraped from this config.
  - job_name: 'prometheus'
    static_configs:
    - targets: ['localhost:9090']

  # Allows ephemeral and batch jobs to expose their metrics to Prometheus
  - job_name: 'pushgateway'
    honor_labels: true
    static_configs:
    - targets: ['localhost:9091']

  # Other services (port 58080 is the custom metrics-reporting endpoint implemented in our own code)
  - job_name: 'min01'
    honor_labels: true
    static_configs:
    - targets: ['10.*.0.17:58080']

  - job_name: 'min02'
    honor_labels: true
    static_configs:
    - targets: ['10.*.0.18:58080']
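Before starting Prometheus, the edited configuration (and the rule file it references) can be validated with promtool, which ships in the same tarball; run it from the Prometheus root directory:
./promtool check config prometheus.yml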
Start Prometheus:
[root@milvus01 prometheus]# pwd
/data/software/prometheus
./prometheus --config.file=prometheus.yml &
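Once Prometheus is up, the scrape targets defined above (prometheus, pushgateway, min01, min02) can be checked through its HTTP API; a quick sanity check, assuming the default port 9090:
curl -s http://localhost:9090/-/ready
curl -s http://localhost:9090/api/v1/targets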
Grafana installation:
# first run a temporary container only to copy out the default grafana.ini
chmod 777 /data/software/grafana/
docker run -d --name grafana -p 3000:3000 -v /data/software/grafana/:/var/lib/grafana grafana/grafana
mkdir -p /data/software/grafana-data/etc
chmod 777 /data/software/grafana-data/
docker cp grafana:/etc/grafana/grafana.ini /data/software/grafana-data/etc
docker kill grafana
docker rm grafana
# then run the real container with the config and data directories mounted from the host
docker run --user root -d --name grafana -p 3000:3000 -v /data/software/grafana-data/etc:/etc/grafana/ -v /data/software/grafana-data/grafana:/var/lib/grafana grafana/grafana
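A quick way to confirm the final Grafana container came up with the mounted configuration is its health endpoint on the mapped port 3000, plus the container logs:
curl -s http://localhost:3000/api/health
docker logs grafana | tail -n 20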
grafana.ini [smtp] configuration (we have not yet found out why this did not work and sending kept failing, so we used the next method, Alertmanager, instead):
[smtp]
enabled = true
host = smtp.exmail.qq.com:465
user = Grafana@*****.com
# If the password contains # or ; you have to wrap it with triple quotes. Ex """#password;"""
password = ****123
;cert_file =
;key_file =
skip_verify = true
from_address = Grafana@*****.com
from_name = Grafana
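After editing the [smtp] section, restart the container so grafana.ini is re-read, and watch the logs for the send failure mentioned above; this assumes the container is still named grafana:
docker restart grafana
docker logs -f grafana 2>&1 | grep -i smtp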
Configure Alertmanager (alertmanager.yml):
global:
  resolve_timeout: 5m
  smtp_smarthost: 'smtp.exmail.qq.com:465'   # SMTP server address
  smtp_from: 'grafana@******.com'            # sender address
  smtp_auth_username: 'grafana@*******.com'  # mailbox account
  smtp_auth_password: '******'               # mailbox password
  smtp_require_tls: false

route:
  group_by: ['alertname']
  group_wait: 10s       # After a new alert group is created, wait at least group_wait before sending the first notification, so multiple alerts for the same group can be collected and fired together.
  group_interval: 10s   # After the first notification has been sent, wait group_interval before sending a new batch of notifications for this group.
  repeat_interval: 1h   # If a notification has already been sent successfully, wait repeat_interval before sending it again.
  receiver: 'web.hook'

receivers:
- name: 'web.hook'
  webhook_configs:
  - url: 'https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=cbe4*********9d-d17f3d60990f'
- name: 'mail'
  email_configs:
  - to: '*****@******.com'

inhibit_rules:
- source_match:
    severity: 'critical'
  target_match:
    severity: 'warning'
  equal: ['alertname', 'dev', 'instance']
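Before starting Alertmanager, the file can be validated with amtool, which is shipped alongside the alertmanager binary in the tarball:
./amtool check-config alertmanager.yml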
Start Alertmanager:
./alertmanager --config.file=alertmanager.yml &
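To confirm the route and the web.hook receiver work end to end, a test alert can be posted to the Alertmanager v2 API; alertname TestAlert and the warning severity below are placeholder values:
curl -s -X POST http://localhost:9093/api/v2/alerts \
  -H 'Content-Type: application/json' \
  -d '[{"labels":{"alertname":"TestAlert","severity":"warning"}}]'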