keepalived 双机高可用完整详细方案


1. 工作流程图


2. 服务器环境


服务器 ip 配置
NMS Node01 Cpu:4core Ram:8Gb Storage:256Gb
NMS Node02 Cpu:4core Ram:8Gb Storage:256Gb
Mariadb VIP
Mariadb Node01 Cpu:4core Ram:8Gb Storage:512Gb
Mariadb Node02 Cpu:4core Ram:8Gb Storage:512Gb


3. 高可用策略

3.1 NMS Node

3.1.1 简述

在NMS Node01和NMS Node02上分别部署NMS应用,两个节点之间通过keepalived进行管理,并向外暴露虚拟IP(VIP)供程序访问。

3.1.2 keepalived策略

目前NMS Node提供以下两种故障转移策略,具体采用哪一种可参考客户意见。



3.2 Mariadb Node

3.2.1 简述

在Mariadb Node01和Mariadb Node02上分别部署Mariadb数据库服务,两台服务器间采用Mariadb自带的主从机制进行双向主从配置,以实现数据库同步;通过keepalived管理虚拟IP(Mariadb VIP),并向NMS Node节点提供数据库服务。

3.2.2 keepalived策略


4. 文件同步

4.1 简述


4.2 解决方案





Ubuntu安装keepalived:sudo apt-get install keepalived

SuseLinux安装keepalived:sudo zypper in keepalived


1. Mariadb Node

1.1 keepalived配置



! Configuration File for keepalived

global_defs {
router_id db01

vrrp_instance VI_1 {
state BACKUP
interface eth0
virtual_router_id 45
priority 110
advert_int 1
authentication {
auth_type PASS
auth_pass 1111
virtual_ipaddress {
virtual_server 3306 {
delay_loop 6
persistence_timeout 50
protocol TCP
real_server 3306 { #根据当前实际宿主机IP进行配置
notify_down /etc/keepalived/
connect_timeout 3
nb_get_retry 3
connect_port 3306
delay_before_retry 3

1.2 shell脚本



#!/bin/bash
kill -9 $(cat /var/run/</pre>

1.3 keepalived启动


chmod 777 /etc/keepalived/


systemctl restart keepalived

注:以上步骤需要在两台Mariadb Node上都做配置。对于keepalived.conf配置文件,属性real_server 3306需要根据当前实际宿主机IP进行配置,其他配置、脚本文件保持一致。

2. Mariadb主从

2.1 容器创建:


docker run --name madb01 --restart=always -p 3306:3306 -v $HOME/docker/volumes/madb01:/var/lib/mysql -v /etc/localtime:/etc/localtime -e MYSQL_ROOT_PASSWORD=root_pwd -e MYSQL_USER=nms9000 -e MYSQL_PASSWORD=nms9000_pwd -d mariadb:10.5.8

2.2 文件配置



docker cp madb01:/etc/mysql/mariadb.conf.d/50-server.cnf ./

2.修改拷贝出来的文件:vim 50-server.cnf ,加入以下内容

server-id = 96 #两台服务设置成不同值
log_bin = /var/log/mysql/mysql-bin.log
expire_logs_days = 10
max_binlog_size = 100M

slave_skip_errors = 1062
auto-increment-offset = 2
log_bin = /var/log/mysql/mysql-bin.log
expire_logs_days = 10

max_binlog_size = 100M




slave_skip_errors = 1062
auto-increment-offset = 2</pre>


docker cp 50-server.cnf madb01:/etc/mysql/mariadb.conf.d/


docker restart madb01

2.3 数据初始化



docker exec -i madb01 /usr/bin/mysql -uroot -p123456 < initDB.sql

2.4 主从连接


#1.进入容器
docker exec -it madb01 /bin/bash

mysql -uroot -proot_pwd
docker exec -it madb01 /bin/bash


mysql -uroot -proot_pwd</pre>

2.4.1 生成slave连接用户

Mariadb Node01:

#生成连接用户并授权
create user 'slave02'@'' identified by 's123456';
grant replication slave on *.* to 'slave02'@'';
flush privileges;

flush tables with read lock;
create user 'slave02'@'' identified by 's123456';
grant replication slave on *.* to 'slave02'@'';
flush privileges;


flush tables with read lock;</pre>

Mariadb Node02:

#生成连接用户并授权
create user 'slave01'@'' identified by 's123456';
grant replication slave on *.* to 'slave01'@'';
flush privileges;

flush tables with read lock;
create user 'slave01'@'' identified by 's123456';
grant replication slave on *.* to 'slave01'@'';
flush privileges;


flush tables with read lock;</pre>

2.4.2 主从连接建立



#1.查看master状态
show master status\G;

MariaDB [(none)]> show master status\G;
*************************** 1. row ***************************
File: mysql-bin.000001
Position: 328
Binlog_Do_DB: nms9000
1 row in set (0.000 sec)

ERROR: No query specified
show master status\G;


MariaDB [(none)]> show master status\G;
*************************** 1. row ***************************
File: mysql-bin.000001
Position: 328
Binlog_Do_DB: nms9000
1 row in set (0.000 sec)

ERROR: No query specified</pre>



#1.主从连接建立

start slave;


start slave;</pre>



#1.查看master状态
show master status\G;

MariaDB [(none)]> show master status\G;
*************************** 1. row ***************************
File: mysql-bin.000001
Position: 328
Binlog_Do_DB: nms9000
1 row in set (0.000 sec)

ERROR: No query specified
show master status\G;


MariaDB [(none)]> show master status\G;
*************************** 1. row ***************************
File: mysql-bin.000001
Position: 328
Binlog_Do_DB: nms9000
1 row in set (0.000 sec)

ERROR: No query specified</pre>



#1.主从连接建立

start slave;


start slave;</pre>

2.4.3 连接验证


unlock tables;

两台节点都需要进行验证:执行命令 show slave status\G,返回结果中Slave_IO_Running和Slave_SQL_Running都为Yes即代表已经互联成功。

MariaDB [(none)]> show slave status\G;
*************************** 1. row ***************************
Slave_IO_State: Waiting for master to send event
Master_User: slave01
Master_Port: 3306
Connect_Retry: 60
Master_Log_File: mysql-bin.000007
Read_Master_Log_Pos: 342
Relay_Log_File: mysql-relay-bin.000005
Relay_Log_Pos: 641
Relay_Master_Log_File: mysql-bin.000007
Slave_IO_Running: Yes
Slave_SQL_Running: Yes
Replicate_Do_DB: nms9000
*************************** 1. row ***************************
Slave_IO_State: Waiting for master to send event
Master_User: slave01
Master_Port: 3306
Connect_Retry: 60
Master_Log_File: mysql-bin.000007
Read_Master_Log_Pos: 342
Relay_Log_File: mysql-relay-bin.000005
Relay_Log_Pos: 641
Relay_Master_Log_File: mysql-bin.000007
Slave_IO_Running: Yes
Slave_SQL_Running: Yes
Replicate_Do_DB: nms9000</pre>

3. NMS Node


3.1 keepalived配置



! Configuration File for keepalived

global_defs {
router_id nms01

script_user root

vrrp_script check_run {
script "/etc/keepalived/"
interval 60
weight 10
user root
pass nms1234

vrrp_instance VI_1 {
state BACKUP
interface ens33 #根据自己的工作网卡决定
virtual_router_id 41
priority 110
advert_int 1
authentication {
auth_type PASS
auth_pass 1111

track_script {

virtual_ipaddress { #虚拟IP地址

global_defs {
router_id nms01


script_user root

vrrp_script check_run {
script "/etc/keepalived/"
interval 60

weight 10

user root

pass nms1234

vrrp_instance VI_1 {
state BACKUP
interface ens33 #根据自己的工作网卡决定
virtual_router_id 41
priority 110
advert_int 1
authentication {
auth_type PASS
auth_pass 1111

track_script {

virtual_ipaddress { #虚拟IP地址

3.2 shell脚本



#!/bin/bash

curPath=(readlink -f "(dirname "$0")")
echo $curPath
filePath=curPath/nmssvc.txt echofilePath
cat $filePath | while read line

pIDa=lsof -i :$line | grep -v "PID" | awk '{print $1}'
cID=docker ps | grep $line |awk -F' ' '{printf $1}'
if [ "cID" == "" ]; then echo "port[line ] : false"
kill -9 $(cat /var/run/
exit 1

exit 0

curPath=(readlink -f "(dirname "$0")")

echo $curPath

filePath=curPath/nmssvc.txt echofilePath

cat $filePath | while read line

pIDa=lsof -i :$line | grep -v "PID" | awk '{print $1}'

cID=docker ps | grep $line |awk -F' ' '{printf $1}'
if [ "cID" == "" ]; then echo "port[line ] : false"
kill -9 $(cat /var/run/
exit 1

exit 0</pre>


server

3.3 keepalived启动


chmod 744 /etc/keepalived/



systemctl restart keepalived

注:因为keepalived服务依赖于NMS程序,所以需要确保NMS程序已经成功启动。由于数据库单独维护,在上述Mariadb连接配置好之后,需要将docker-compose.yml中的数据库连接地址修改为Mariadb VIP。

注:以上步骤需要在两台NMS Node上都做配置。对于keepalived.conf配置文件,属性priority 110需要根据ip地址来决定,其他脚本文件、服务列表文件保持一致。

3.4 配置文件同步

3.4.1 syncdir.py脚本



#!/usr/bin/python3

from datetime import datetime
from apscheduler.schedulers.blocking import BlockingScheduler
import time
import paramiko
import sys,os
from scp import SCPClient

server = ""
port = 22
sourceDir = "/home/nms/sunam/docker-volumes/malaysia/backup/database"
absSourceFiles = []
destDir = "/home/nms/sunam/docker-volumes/malaysia/backup/database"

def createSSHClient(server, port, user, password):
client = paramiko.SSHClient()
client.connect(server, port, user, password)
return client

def getLocalFiles():
for root,dirs,files in os.walk(sourceDir):
for file in files:
return sourceFiles

def getUpdateFiles():

def syncDir():
ssh = createSSHClient(server, port, user, password)
absSourceFiles = []
scp = SCPClient(ssh.get_transport())
localFiles = getLocalFiles()

ftp = ssh.open_sftp()
destFiles = ftp.listdir(destDir)
for file in destFiles:
if file in localFiles:

if len(localFiles)>0:
for file in localFiles:

for file in absSourceFiles:

def dojob():

ssh = createSSHClient(server, port, user, password)

scheduler = BlockingScheduler()

scheduler.add_job(syncDir, 'interval', seconds=30, id='test_job1',args=[ssh])
scheduler.add_job(syncDir, 'interval', seconds=60, id='test_job1')

if __name__=='__main__':
len_argv = len(sys.argv)
if len_argv == 1:
server = ""
user = "nms"
elif len_argv == 4:
server = sys.argv[1]
user = sys.argv[2]

sudo python3

from datetime import datetime
from apscheduler.schedulers.blocking import BlockingScheduler
import time
import paramiko
import sys,os
from scp import SCPClient

server = ""
port = 22
sourceDir = "/home/nms/sunam/docker-volumes/malaysia/backup/database"
absSourceFiles = []
destDir = "/home/nms/sunam/docker-volumes/malaysia/backup/database"

def createSSHClient(server, port, user, password):
client = paramiko.SSHClient()
client.connect(server, port, user, password)
return client

def getLocalFiles():
for root,dirs,files in os.walk(sourceDir):
for file in files:
return sourceFiles

def getUpdateFiles():


def syncDir():
ssh = createSSHClient(server, port, user, password)
absSourceFiles = []
scp = SCPClient(ssh.get_transport())
localFiles = getLocalFiles()

ftp = ssh.open_sftp()
destFiles = ftp.listdir(destDir)
for file in destFiles:
if file in localFiles:

if len(localFiles)>0:
for file in localFiles:

for file in absSourceFiles:


def dojob():

ssh = createSSHClient(server, port, user, password)


scheduler = BlockingScheduler()


scheduler.add_job(syncDir, 'interval', seconds=30, id='test_job1',args=[ssh])

scheduler.add_job(syncDir, 'interval', seconds=60, id='test_job1')

if __name__=='__main__':
len_argv = len(sys.argv)
if len_argv == 1:
server = ""
user = "nms"
elif len_argv == 4:
server = sys.argv[1]
user = sys.argv[2]




sudo python3

注:执行 syncdir.py 脚本需要python相关依赖环境,首次执行可根据报错提示安装缺失的依赖。

3.4.2 自启配置

将 syncdir.py 脚本的执行作为Linux服务加入到系统服务中,并配置为自启服务。


[Unit]
Description=<short description>

[Service]
Type=simple
#脚本文件所在目录
ExecStart=/home/nms/sunam/deploy/malaysia/output/syncdir/

[Install]


#1.设置自启
systemctl enable syncDir.service
#2.以服务形式启动同步文件脚本
systemctl start syncDir.service

