Redis源码研究之哨兵Sentinel

本文主要说明Redis中哨兵Sentinel的设计与实现。

建议阅读：
1、Sentinel的理论部分见：Redis之Sentinel

I、上帝视角

1、Sentinel也是Redis服务器，只是与普通服务器职责不同，其负责监视Redis服务器，以提高服务器集群的可靠性。Sentinel与普通服务器共用一套框架（网络框架，底层数据结构，订阅与发布机制），但又有其独立的运行代码。

为维护Sentinel系统的正常运行，我们先来看Redis为Sentinel维护了怎样的数据结构：

/* Main state. */
/* Sentinel 的状态结构 */
/*src/sentinel.csentinelState*/
struct sentinelState {

    // 当前纪元
    uint64_t current_epoch;     /* Current epoch. */

    // 保存了所有被这个 sentinel 监视的主服务器
    // 字典的key是主服务器的名字
    // 字典的value则是一个指向 sentinelRedisInstance 结构的指针
    dict *masters;      /* Dictionary of master sentinelRedisInstances.
                           Key is the instance name, value is the
                           sentinelRedisInstance structure pointer. */

    // 是否进入了 TILT 模式？
    int tilt;           /* Are we in TILT mode? */

    // 目前正在执行的脚本的数量
    int running_scripts;    /* Number of scripts in execution right now. */

    // 进入 TILT 模式的时间
    mstime_t tilt_start_time;   /* When TITL started. */

    // 最后一次执行时间处理器的时间
    mstime_t previous_time;     /* Last time we ran the time handler. */

    // 一个 FIFO 队列，包含了所有需要执行的用户脚本
    list *scripts_queue;    /* Queue of user scripts to execute. */

} sentinel;

2、从主函数main中可以看到服务器是如何向Sentinel转化的：

/*src/redis.c/main*/
int main(int argc, char **argv) {
    
    // 随机种子，一般rand() 产生随机数的函数会用到
    srand(time(NULL)^getpid());
    gettimeofday(&tv,NULL);
    dictSetHashFunctionSeed(tv.tv_sec^tv.tv_usec^getpid());
    // 通过命令行参数确认是否启动哨兵模式
    server.sentinel_mode = checkForSentinelMode(argc,argv);
    // 初始化服务器配置，主要是填充redisServer 结构体中的各种参数
    initServerConfig();
    // 将服务器配置为哨兵模式，与普通的redis 服务器不同
    /* We need to init sentinel right now as parsing the configuration file
    * in sentinel mode will have the effect of populating the sentinel
    * data structures with master nodes to monitor. */
    if (server.sentinel_mode) {
        // initSentinelConfig() 只指定哨兵服务器的端口
        initSentinelConfig();
        initSentinel();
    }
    ......
    // 普通redis 服务器模式
    if (!server.sentinel_mode) {
    ......
    // 哨兵服务器模式
    } else {
    // 检测哨兵模式是否正常配置
    sentinelIsRunning();
    }
    ......
    // 进入事件循环
    aeMain(server.el);
    // 去除事件循环系统
    aeDeleteEventLoop(server.el);
    return 0;
}

II、Sentinel的初始化

1、在上面的程序中，可以看出，如果检查到需要使用Sentinel模式时，会调用initSentinel函数对Sentinel服务器进行特有的初始化：

/* Perform the Sentinel mode initialization. */
// 以 Sentinel 模式初始化服务器
/*src/sentinel.c/initSentinel*/
void initSentinel(void) {
    int j;

    /* Remove usual Redis commands from the command table, then just add
     * the SENTINEL command. */

    // 清空 Redis 服务器的命令表（该表用于普通模式）
    dictEmpty(server.commands,NULL);
    // 将 SENTINEL 模式所用的命令添加进命令表
    for (j = 0; j < sizeof(sentinelcmds)/sizeof(sentinelcmds[0]); j++) {
        int retval;
        struct redisCommand *cmd = sentinelcmds+j;

        retval = dictAdd(server.commands, sdsnew(cmd->name), cmd);
        redisAssert(retval == DICT_OK);
    }

    /* Initialize various data structures. */
    /* 初始化 Sentinel 的状态 */
    // 初始化纪元
    sentinel.current_epoch = 0;

    // 初始化保存主服务器信息的字典
    sentinel.masters = dictCreate(&instancesDictType,NULL);

    // 初始化 TILT 模式的相关选项
    sentinel.tilt = 0;
    sentinel.tilt_start_time = 0;
    sentinel.previous_time = mstime();

    // 初始化脚本相关选项
    sentinel.running_scripts = 0;
    sentinel.scripts_queue = listCreate();
}

2、为了能让Sentinel自动管理Redis服务器，在serverCorn函数中添加了一个定时程序：

/*src/redis.c/serverCorn*/
int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
    ......
    run_with_period(100) {
        //sentinelTimer即为sentinel主函数
         if (server.sentinel_mode) sentinelTimer();
    }
}

III、Sentinel主函数：sentinelTimer

sentinelTimer所做的工作包括：监视普通Redis服务器，执行故障转移，执行脚本命令。

// sentinel 模式的主函数，由 redis.c/serverCron 函数调用
/*src/sentinel.c/sentinelTimer*/
void sentinelTimer(void) {

    // 记录本次 sentinel 调用的事件，
    // 并判断是否需要进入 TITL 模式
    sentinelCheckTiltCondition();

    // 执行定期操作
    // 比如 PING 实例、分析主服务器和从服务器的 INFO 命令
    // 向其他监视相同主服务器的 sentinel 发送问候信息
    // 并接收其他 sentinel 发来的问候信息
    // 执行故障转移操作，等等
    sentinelHandleDictOfRedisInstances(sentinel.masters);

    // 运行等待执行的脚本
    sentinelRunPendingScripts();

    // 清理已执行完毕的脚本，并重试出错的脚本
    sentinelCollectTerminatedScripts();

    // 杀死运行超时的脚本
    sentinelKillTimedoutScripts();

    /* We continuously change the frequency of the Redis "timer interrupt"
     * in order to desynchronize every Sentinel from every other.
     * This non-determinism avoids that Sentinels started at the same time
     * exactly continue to stay synchronized asking to be voted at the
     * same time again and again (resulting in nobody likely winning the
     * election because of split brain voting). */
    server.hz = REDIS_DEFAULT_HZ + rand() % REDIS_DEFAULT_HZ;
}

IV、Sentinel与Redis服务器的连接

1、每个Sentinel都可以与多个Redis服务器连接，其为每个Redis服务器都维护了一个struct sentinelRedisInstance：

// Sentinel 会为每个被监视的 Redis 实例创建相应的 sentinelRedisInstance 实例
// （被监视的实例可以是主服务器、从服务器、或者其他 Sentinel ）
typedef struct sentinelRedisInstance {
    ......
    /* Master specific. */
    // 其他正在监视此主机的哨兵
    dict *sentinels; /* Other sentinels monitoring the same master. */
    // 次主机的从机列表
    dict *slaves; /* Slaves for this master instance. */
    ......
    // 如果是从机，master 则指向它的主机
    struct sentinelRedisInstance *master; /* Master instance if it's slave. */
    ......
} sentinelRedisInstance;

可见，Sentinel可监视的实例可以是主服务器，从服务器，或者其他Sentinel，下图表示了一个完整的sentinel.masters结构：

2、Sentinel要想对某个Redis服务器进行监视，则首先要做的就是先对Redis服务器进行连接，在连接之前需要完成配置工作（如IP，port）

假如需要对一个Redis服务器进行监视，则需要在配置文件中写入：
sentinel monitor <master-name> <ip> <redis-port> <quorum>

上述命令中quorum参数是Sentinel用来判断Redis服务器是否下线的参数，对以上命令的解析与配置是通过调用函数sentinelHandleConfiguration完成的：

// 哨兵配置文件解析和处理
/*src/sentinel.c/sentinelHandleConfiguration*/
char *sentinelHandleConfiguration(char **argv, int argc) {
    sentinelRedisInstance *ri;
    if (!strcasecmp(argv[0],"monitor") && argc == 5) {
        /* monitor <name> <host> <port> <quorum> */
        int quorum = atoi(argv[4]);
        // quorum >= 0
    if (quorum <= 0) return "Quorum must be 1 or greater.";
    if (createSentinelRedisInstance(argv[1],SRI_MASTER,argv[2],
        atoi(argv[3]),quorum,NULL) == NULL)
    {
    switch(errno) {
        case EBUSY: return "Duplicated master name.";
        case ENOENT: return "Can't resolve master instance hostname.";
        case EINVAL: return "Invalid port number";
        }
    }
    ......
}

sentinelHandleConfiguration主要调用了createSentinelRedisInstance函数，这个函数的工作就是初始化sentinelRedisInstance结构体。

/* ========================== sentinelRedisInstance ========================= */

/* Create a redis instance, the following fields must be populated by the
 * caller if needed:
 *
 * 创建一个 Redis 实例，在有需要时，以下两个域需要从调用者提取：
 *
 * runid: set to NULL but will be populated once INFO output is received.
 *        设置为 NULL ，并在接收到 INFO 命令的回复时设置
 *
 * info_refresh: is set to 0 to mean that we never received INFO so far.
 *               如果这个值为 0 ，那么表示我们未收到过 INFO 信息。
 *
 * If SRI_MASTER is set into initial flags the instance is added to
 * sentinel.masters table.
 *
 * 如果 flags 参数为 SRI_MASTER ，
 * 那么这个实例会被添加到 sentinel.masters 表。
 *
 * if SRI_SLAVE or SRI_SENTINEL is set then 'master' must be not NULL and the
 * instance is added into master->slaves or master->sentinels table.
 *
 * 如果 flags 为 SRI_SLAVE 或者 SRI_SENTINEL ，
 * 那么 master 参数不能为 NULL ，
 * SRI_SLAVE 类型的实例会被添加到 master->slaves 表中，
 * 而 SRI_SENTINEL 类型的实例则会被添加到 master->sentinels 表中。
 *
 * If the instance is a slave or sentinel, the name parameter is ignored and
 * is created automatically as hostname:port.
 *
 * 如果实例是从服务器或者 sentinel ，那么 name 参数会被自动忽略，
 * 实例的名字会被自动设置为 hostname:port 。
 *
 * The function fails if hostname can't be resolved or port is out of range.
 * When this happens NULL is returned and errno is set accordingly to the
 * createSentinelAddr() function.
 *
 * 当 hostname 不能被解释，或者超出范围时，函数将失败。
 * 函数将返回 NULL ，并设置 errno 变量，
 * 具体的出错值请参考 createSentinelAddr() 函数。
 *
 * The function may also fail and return NULL with errno set to EBUSY if
 * a master or slave with the same name already exists. 
 *
 * 当相同名字的主服务器或者从服务器已经存在时，函数返回 NULL ，
 * 并将 errno 设为 EBUSY 。
 */
sentinelRedisInstance *createSentinelRedisInstance(char *name, int flags, char *hostname, int port, int quorum, sentinelRedisInstance *master) {
    sentinelRedisInstance *ri;
    sentinelAddr *addr;
    dict *table = NULL;
    char slavename[128], *sdsname;

    redisAssert(flags & (SRI_MASTER|SRI_SLAVE|SRI_SENTINEL));
    redisAssert((flags & SRI_MASTER) || master != NULL);

    /* Check address validity. */
    // 保存 IP 地址和端口号到 addr
    addr = createSentinelAddr(hostname,port);
    if (addr == NULL) return NULL;

    /* For slaves and sentinel we use ip:port as name. */
    // 如果实例是从服务器或者 sentinel ，那么使用 ip:port 格式为实例设置名字
    if (flags & (SRI_SLAVE|SRI_SENTINEL)) {
        snprintf(slavename,sizeof(slavename),
            strchr(hostname,':') ? "[%s]:%d" : "%s:%d",
            hostname,port);
        name = slavename;
    }

    /* Make sure the entry is not duplicated. This may happen when the same
     * name for a master is used multiple times inside the configuration or
     * if we try to add multiple times a slave or sentinel with same ip/port
     * to a master. */
    // 配置文件中添加了重复的主服务器配置
    // 或者尝试添加一个相同 ip 或者端口号的从服务器或者 sentinel 时
    // 就可能出现重复添加同一个实例的情况
    // 为了避免这种现象，程序在添加新实例之前，需要先检查实例是否已存在
    // 只有不存在的实例会被添加

    // 选择要添加的表
    // 注意主服务会被添加到 sentinel.masters 表
    // 而从服务器和 sentinel 则会被添加到 master 所属的 slaves 表和 sentinels 表中
    if (flags & SRI_MASTER) table = sentinel.masters;
    else if (flags & SRI_SLAVE) table = master->slaves;
    else if (flags & SRI_SENTINEL) table = master->sentinels;
    sdsname = sdsnew(name);
    if (dictFind(table,sdsname)) {

        // 实例已存在，函数直接返回

        sdsfree(sdsname);
        errno = EBUSY;
        return NULL;
    }

    /* Create the instance object. */
    // 创建实例对象
    ri = zmalloc(sizeof(*ri));
    /* Note that all the instances are started in the disconnected state,
     * the event loop will take care of connecting them. */
    // 所有连接都已断线为起始状态，sentinel 会在需要时自动为它创建连接
    ri->flags = flags | SRI_DISCONNECTED;
    ri->name = sdsname;
    ri->runid = NULL;
    ri->config_epoch = 0;
    ri->addr = addr;
    ri->cc = NULL;
    ri->pc = NULL;
    ri->pending_commands = 0;
    ri->cc_conn_time = 0;
    ri->pc_conn_time = 0;
    ri->pc_last_activity = 0;
    /* We set the last_ping_time to "now" even if we actually don't have yet
     * a connection with the node, nor we sent a ping.
     * This is useful to detect a timeout in case we'll not be able to connect
     * with the node at all. */
    ri->last_ping_time = mstime();
    ri->last_avail_time = mstime();
    ri->last_pong_time = mstime();
    ri->last_pub_time = mstime();
    ri->last_hello_time = mstime();
    ri->last_master_down_reply_time = mstime();
    ri->s_down_since_time = 0;
    ri->o_down_since_time = 0;
    ri->down_after_period = master ? master->down_after_period :
                            SENTINEL_DEFAULT_DOWN_AFTER;
    ri->master_link_down_time = 0;
    ri->auth_pass = NULL;
    ri->slave_priority = SENTINEL_DEFAULT_SLAVE_PRIORITY;
    ri->slave_reconf_sent_time = 0;
    ri->slave_master_host = NULL;
    ri->slave_master_port = 0;
    ri->slave_master_link_status = SENTINEL_MASTER_LINK_STATUS_DOWN;
    ri->slave_repl_offset = 0;
    ri->sentinels = dictCreate(&instancesDictType,NULL);
    ri->quorum = quorum;
    ri->parallel_syncs = SENTINEL_DEFAULT_PARALLEL_SYNCS;
    ri->master = master;
    ri->slaves = dictCreate(&instancesDictType,NULL);
    ri->info_refresh = 0;

    /* Failover state. */
    ri->leader = NULL;
    ri->leader_epoch = 0;
    ri->failover_epoch = 0;
    ri->failover_state = SENTINEL_FAILOVER_STATE_NONE;
    ri->failover_state_change_time = 0;
    ri->failover_start_time = 0;
    ri->failover_timeout = SENTINEL_DEFAULT_FAILOVER_TIMEOUT;
    ri->failover_delay_logged = 0;
    ri->promoted_slave = NULL;
    ri->notification_script = NULL;
    ri->client_reconfig_script = NULL;

    /* Role */
    ri->role_reported = ri->flags & (SRI_MASTER|SRI_SLAVE);
    ri->role_reported_time = mstime();
    ri->slave_conf_change_time = mstime();

    /* Add into the right table. */
    // 将实例添加到适当的表中
    dictAdd(table, ri->name, ri);

    // 返回实例
    return ri;
}

3、在这里Sentinel并没有马上去连接Redis服务器，而只是将sentinelRedisInstance.flag状态标记为了SRI_DISCONNECT，真正的连接工作其实在定时程序中，因为无论是主从服务器之间的连接，还是Sentinel与Redis服务器之间的连接，要想保持其连接状态，就需要定期检查，所以就直接将连接放到了定时程序中统一处理。

调用过程如下：
sentinelTimer()->sentinelHandleDictOfRedisInstance()->sentinelHandleRedisInstance()->sentinelReconnectInstance()

sentinelReconnectInstance()函数的作用就是连接标记为SRI_DISCONNECT的服务器，其对Redis发起了两种连接：
· 普通连接：用于向主服务器发布Sentinel的命令，并接收回复（这里Sentinel是主服务器的客户端）。
· 订阅与发布专用连接：用于订阅主服务器的__sentinel__:hello频道。这是因为Redis的发布与订阅功能中，被发布的信息不会保存在Redis服务器里面，因此，为了不丢失__sentinel__:hello频道的任何信息，Sentinel专门用一个连接来接收。

/* Create the async connections for the specified instance if the instance
 * is disconnected. Note that the SRI_DISCONNECTED flag is set even if just
 * one of the two links (commands and pub/sub) is missing. */
// 如果 sentinel 与实例处于断线（未连接）状态，那么创建连向实例的异步连接。
/*src/sentinel.c/sentinelReconnectInstance*/
void sentinelReconnectInstance(sentinelRedisInstance *ri) {

    // 示例未断线（已连接），返回
    if (!(ri->flags & SRI_DISCONNECTED)) return;

    /* Commands connection. */
    // 对所有实例创建一个用于发送 Redis 命令的连接, 包括主服务器，从服务器，和其他Sentinel
    if (ri->cc == NULL) {

        // 连接实例
        ri->cc = redisAsyncConnect(ri->addr->ip,ri->addr->port);

        // 连接出错
        if (ri->cc->err) {
            sentinelEvent(REDIS_DEBUG,"-cmd-link-reconnection",ri,"%@ #%s",
                ri->cc->errstr);
            sentinelKillLink(ri,ri->cc);

        // 连接成功
        } else {
            // 设置连接属性
            ri->cc_conn_time = mstime();
            ri->cc->data = ri;
            redisAeAttach(server.el,ri->cc);
            // 设置连线 callback
            redisAsyncSetConnectCallback(ri->cc,
                                            sentinelLinkEstablishedCallback);
            // 设置断线 callback
            redisAsyncSetDisconnectCallback(ri->cc,
                                            sentinelDisconnectCallback);
            // 发送 AUTH 命令，验证身份
            sentinelSendAuthIfNeeded(ri,ri->cc);
            sentinelSetClientName(ri,ri->cc,"cmd");

            /* Send a PING ASAP when reconnecting. */
            sentinelSendPing(ri);
        }
    }

    /* Pub / Sub */
    // 对主服务器和从服务器，创建一个用于订阅频道的连接
    if ((ri->flags & (SRI_MASTER|SRI_SLAVE)) && ri->pc == NULL) {

        // 连接实例
        ri->pc = redisAsyncConnect(ri->addr->ip,ri->addr->port);

        // 连接出错
        if (ri->pc->err) {
            sentinelEvent(REDIS_DEBUG,"-pubsub-link-reconnection",ri,"%@ #%s",
                ri->pc->errstr);
            sentinelKillLink(ri,ri->pc);

        // 连接成功
        } else {
            int retval;

            // 设置连接属性
            ri->pc_conn_time = mstime();
            ri->pc->data = ri;
            redisAeAttach(server.el,ri->pc);
            // 设置连接 callback
            redisAsyncSetConnectCallback(ri->pc,
                                            sentinelLinkEstablishedCallback);
            // 设置断线 callback
            redisAsyncSetDisconnectCallback(ri->pc,
                                            sentinelDisconnectCallback);
            // 发送 AUTH 命令，验证身份
            sentinelSendAuthIfNeeded(ri,ri->pc);

            // 为客户但设置名字 "pubsub"
            sentinelSetClientName(ri,ri->pc,"pubsub");

            /* Now we subscribe to the Sentinels "Hello" channel. */
            // 发送 SUBSCRIBE __sentinel__:hello 命令，订阅频道
            retval = redisAsyncCommand(ri->pc,
                sentinelReceiveHelloMessages, NULL, "SUBSCRIBE %s",
                    SENTINEL_HELLO_CHANNEL);
            
            // 订阅出错，断开连接
            if (retval != REDIS_OK) {
                /* If we can't subscribe, the Pub/Sub connection is useless
                 * and we can simply disconnect it and try again. */
                sentinelKillLink(ri,ri->pc);
                return;
            }
        }
    }

    /* Clear the DISCONNECTED flags only if we have both the connections
     * (or just the commands connection if this is a sentinel instance). */
    // 如果实例是主服务器或者从服务器，那么当 cc 和 pc 两个连接都创建成功时，关闭 DISCONNECTED 标识
    // 如果实例是 Sentinel ，那么当 cc 连接创建成功时，关闭 DISCONNECTED 标识
    if (ri->cc && (ri->flags & SRI_SENTINEL || ri->pc))
        ri->flags &= ~SRI_DISCONNECTED;
}

4、上述代码中可以看出，Sentinel对主从服务器需要维护两个连接，而对其他Sentinel只需要维护命令连接，这是因为订阅连接的作用其实是为了自动发现：
一个Sentinel可以通过分析接收到的订阅频道信息来获知其他Sentinel的存在，并通过发送频道信息来让其他Sentinel知道自己的存在（将信息发送给主从服务器，主从服务器发布信息，使得所有监视服务器的Sentinel获知信息），所以用户在使用Sentinel的时候不需要提供各个Sentinel的地址信息，监视同一个服务器的多个Sentinel可以自动发现对方，只需要维护一个命令连接进行通信就足够了。

V、HELLO

1、从上面的sentinelReconnectInstance中可以看出，Sentinel初始化订阅连接的时候进行了两个操作，易格斯想服务器发送了HELLO命令，二是注册了回调函数sentinelReceiveHelloMessages，这个函数的功能就是处理订阅频道的返回值，从而完成自动发现。

2、在定时程序中sentinelTimer()->sentinelHandleDictOfRedisInstance()->sentinelHandleRedisInstance()->SentinelSendPeriodicCommand()中，Sentinel会向服务器的hello频道发布数据，其中由sentinelSendHello函数实现：

/*src/sentinel.c/sentinelSendHello*/
/* Send an "Hello" message via Pub/Sub to the specified 'ri' Redis
 * instance in order to broadcast the current configuraiton for this
 * master, and to advertise the existence of this Sentinel at the same time.
 *
 * 向给定 ri 实例的频道发送信息，
 * 从而传播关于给定主服务器的配置，
 * 并向其他 Sentinel 宣告本 Sentinel 的存在。
 *
 * The message has the following format:
 *
 * 发送信息的格式如下： 
 *
 * sentinel_ip,sentinel_port,sentinel_runid,current_epoch,
 * master_name,master_ip,master_port,master_config_epoch.
 *
 * Sentinel IP,Sentinel 端口号,Sentinel 的运行 ID,Sentinel 当前的纪元,
 * 主服务器的名称,主服务器的 IP,主服务器的端口号,主服务器的配置纪元.
 *
 * Returns REDIS_OK if the PUBLISH was queued correctly, otherwise
 * REDIS_ERR is returned. 
 *
 * PUBLISH 命令成功入队时返回 REDIS_OK ，
 * 否则返回 REDIS_ERR 。
 */
int sentinelSendHello(sentinelRedisInstance *ri) {
    char ip[REDIS_IP_STR_LEN];
    char payload[REDIS_IP_STR_LEN+1024];
    int retval;

    // 如果实例是主服务器，那么使用此实例的信息
    // 如果实例是从服务器，那么使用这个从服务器的主服务器的信息
    sentinelRedisInstance *master = (ri->flags & SRI_MASTER) ? ri : ri->master;

    // 获取地址信息
    sentinelAddr *master_addr = sentinelGetCurrentMasterAddress(master);

    /* Try to obtain our own IP address. */
    // 获取实例自身的地址
    if (anetSockName(ri->cc->c.fd,ip,sizeof(ip),NULL) == -1) return REDIS_ERR;
    if (ri->flags & SRI_DISCONNECTED) return REDIS_ERR;

    /* Format and send the Hello message. */
    // 格式化信息
    snprintf(payload,sizeof(payload),
        "%s,%d,%s,%llu," /* Info about this sentinel. */
        "%s,%s,%d,%llu", /* Info about current master. */
        ip, server.port, server.runid,
        (unsigned long long) sentinel.current_epoch,
        /* --- */
        master->name,master_addr->ip,master_addr->port,
        (unsigned long long) master->config_epoch);
    
    // 发送信息
    retval = redisAsyncCommand(ri->cc,
        sentinelPublishReplyCallback, NULL, "PUBLISH %s %s",
            SENTINEL_HELLO_CHANNEL,payload);

    if (retval != REDIS_OK) return REDIS_ERR;

    ri->pending_commands++;

    return REDIS_OK;
}

2、当Redis收到来自Sentinel的发布信息时，就会想所有订阅hello频道的Sentinel发布数据，于是刚才所注册的回调函数sentinelReceiveHelloMessage就被调用，其主要做了两方面的工作：

· 发现了其他监视此服务器的Sentinel；
· 更新配置信息；

VI、INFO

1、Sentinel会以十秒一次的频率首先向所监视的主机发送INFO命令：

其调用过程如下：
sentinelTimer()->sentinelHandleDictOfRedisInstances()->sentinelHandleRedisInstance()->sentinelSendPeriodicCommands()

这其中，Sentinel同样做了两件事，一个是发送了INFO命令，另一个是注册了sentinelInfoReplyCallback()回调函数。

当INFO命令返回时，收到了来自服务器的回复（包括主机的相关信息，以及主机所连接的从服务器），回调函数被调用，主要是完成对服务器回复信息的处理（这其中包括，主从复制信息，存储的键值对数量，Sentinel判断是否下线等），并根据获取到所的从服务器信息实现对从服务器的监视。这也是Sentinel自动发现的部分。

VII、心跳检测

1、心跳检测是判断两台机器是否连接正常的常用手段，接收方在收到心跳包之后，会更新收到心跳的时间，在某个事件点如果检测到心跳包多久没有收到（超时），则证明网络状况不好，或对方很忙，也为接下来的行动提供指导，如延迟所需要进行的后续操作，指导心跳检测正常。

VIII、在线状态监测

1、Sentinel根据主观判断与客观判断来完成在线状态监测：
主观下线：是根据Sentinel自己观测某个服务器的信息；
客观下线：是通过综合所有监测某服务器的Sentinel的信息；

这同样是通过心跳检测发送PING实现的。

2、主观下线判断

/*src/sentinel.c/sentinelCheckSubjectivelyDown*/
/* Is this instance down from our point of view? */
// 检查实例是否以下线（从本 Sentinel 的角度来看）
void sentinelCheckSubjectivelyDown(sentinelRedisInstance *ri) {

    mstime_t elapsed = 0;

    if (ri->last_ping_time)
        elapsed = mstime() - ri->last_ping_time;

    /* Check if we are in need for a reconnection of one of the 
     * links, because we are detecting low activity.
     *
     * 如果检测到连接的活跃度（activity）很低，那么考虑重断开连接，并进行重连
     *
     * 1) Check if the command link seems connected, was connected not less
     *    than SENTINEL_MIN_LINK_RECONNECT_PERIOD, but still we have a
     *    pending ping for more than half the timeout. */
    // 考虑断开实例的 cc 连接
    if (ri->cc &&
        (mstime() - ri->cc_conn_time) > SENTINEL_MIN_LINK_RECONNECT_PERIOD &&
        ri->last_ping_time != 0 && /* Ther is a pending ping... */
        /* The pending ping is delayed, and we did not received
         * error replies as well. */
        (mstime() - ri->last_ping_time) > (ri->down_after_period/2) &&
        (mstime() - ri->last_pong_time) > (ri->down_after_period/2))
    {
        sentinelKillLink(ri,ri->cc);
    }

    /* 2) Check if the pubsub link seems connected, was connected not less
     *    than SENTINEL_MIN_LINK_RECONNECT_PERIOD, but still we have no
     *    activity in the Pub/Sub channel for more than
     *    SENTINEL_PUBLISH_PERIOD * 3.
     */
    // 考虑断开实例的 pc 连接
    if (ri->pc &&
        (mstime() - ri->pc_conn_time) > SENTINEL_MIN_LINK_RECONNECT_PERIOD &&
        (mstime() - ri->pc_last_activity) > (SENTINEL_PUBLISH_PERIOD*3))
    {
        sentinelKillLink(ri,ri->pc);
    }

    /* Update the SDOWN flag. We believe the instance is SDOWN if:
     *
     * 更新 SDOWN 标识。如果以下条件被满足，那么 Sentinel 认为实例已下线：
     *
     * 1) It is not replying.
     *    它没有回应命令
     * 2) We believe it is a master, it reports to be a slave for enough time
     *    to meet the down_after_period, plus enough time to get two times
     *    INFO report from the instance. 
     *    Sentinel 认为实例是主服务器，这个服务器向 Sentinel 报告它将成为从服务器，
     *    但在超过给定时限之后，服务器仍然没有完成这一角色转换。
     */
    if (elapsed > ri->down_after_period ||
        (ri->flags & SRI_MASTER &&
         ri->role_reported == SRI_SLAVE &&
         mstime() - ri->role_reported_time >
          (ri->down_after_period+SENTINEL_INFO_PERIOD*2)))
    {
        /* Is subjectively down */
        if ((ri->flags & SRI_S_DOWN) == 0) {
            // 发送事件
            sentinelEvent(REDIS_WARNING,"+sdown",ri,"%@");
            // 记录进入 SDOWN 状态的时间
            ri->s_down_since_time = mstime();
            // 打开 SDOWN 标志
            ri->flags |= SRI_S_DOWN;
        }
    } else {
        // 移除（可能有的） SDOWN 状态
        /* Is subjectively up */
        if (ri->flags & SRI_S_DOWN) {
            // 发送事件
            sentinelEvent(REDIS_WARNING,"-sdown",ri,"%@");
            // 移除相关标志
            ri->flags &= ~(SRI_S_DOWN|SRI_SCRIPT_KILL_SENT);
        }
    }
}

3、客观下线判断

/*src/sentinel.c/sentinelCheckObjectiveDown*/
/* Is this instance down according to the configured quorum?
 *
 * 根据给定数量的 Sentinel 投票，判断实例是否已下线。
 *
 * Note that ODOWN is a weak quorum, it only means that enough Sentinels
 * reported in a given time range that the instance was not reachable.
 *
 * 注意 ODOWN 是一个 weak quorum ，它只意味着有足够多的 Sentinel 
 * 在**给定的时间范围内**报告实例不可达。
 *
 * However messages can be delayed so there are no strong guarantees about
 * N instances agreeing at the same time about the down state. 
 *
 * 因为 Sentinel 对实例的检测信息可能带有延迟，
 * 所以实际上 N 个 Sentinel **不可能在同一时间内**判断主服务器进入了下线状态。
 */
void sentinelCheckObjectivelyDown(sentinelRedisInstance *master) {
    dictIterator *di;
    dictEntry *de;
    int quorum = 0, odown = 0;

    // 如果当前 Sentinel 将主服务器判断为主观下线
    // 那么检查是否有其他 Sentinel 同意这一判断
    // 当同意的数量足够时，将主服务器判断为客观下线
    if (master->flags & SRI_S_DOWN) {
        /* Is down for enough sentinels? */

        // 统计同意的 Sentinel 数量（起始的 1 代表本 Sentinel）
        quorum = 1; /* the current sentinel. */

        /* Count all the other sentinels. */
        // 统计其他认为 master 进入下线状态的 Sentinel 的数量
        di = dictGetIterator(master->sentinels);
        while((de = dictNext(di)) != NULL) {
            sentinelRedisInstance *ri = dictGetVal(de);
                
            // 该 SENTINEL 也认为 master 已下线
            if (ri->flags & SRI_MASTER_DOWN) quorum++;
        }
        dictReleaseIterator(di);
        
        // 如果投票得出的支持数目大于等于判断 ODOWN 所需的票数
        // 那么进入 ODOWN 状态
        if (quorum >= master->quorum) odown = 1;
    }

    /* Set the flag accordingly to the outcome. */
    if (odown) {

        // master 已 ODOWN

        if ((master->flags & SRI_O_DOWN) == 0) {
            // 发送事件
            sentinelEvent(REDIS_WARNING,"+odown",master,"%@ #quorum %d/%d",
                quorum, master->quorum);
            // 打开 ODOWN 标志
            master->flags |= SRI_O_DOWN;
            // 记录进入 ODOWN 的时间
            master->o_down_since_time = mstime();
        }
    } else {

        // 未进入 ODOWN

        if (master->flags & SRI_O_DOWN) {

            // 如果 master 曾经进入过 ODOWN 状态，那么移除该状态

            // 发送事件
            sentinelEvent(REDIS_WARNING,"-odown",master,"%@");
            // 移除 ODOWN 标志
            master->flags &= ~SRI_O_DOWN;
        }
    }
}

IX、故障修复

1、一般在Redis服务器集群中，只有主机同时肩负着读请求和写请求两个功能，而从机只负责读请求（从机的写是通过主从复制中主机的命令传播完成的）。所以当主机出现宕几是需要进行故障修复。

同样是来源于sentinelTimer()定时函数：

sentinelTimer()->sentinelHandleDictOfRedisInstance()->sentinelHandleRedisInstance()->sentinelStartFailoverIfNeeded() & sentinelFailoverStateMachine()

sentinelStartFailoverIfNeed()函数在判断主机主观下线之后，决定是否执行古装转移操作，sentinelFailoverStateMachine()函数开始执行故障转移操作：

/*src/sentinel.c/sentinelFailoverStateMachine*/
 // 故障修复状态机，依据被标记的状态执行相应的动作
void sentinelFailoverStateMachine(sentinelRedisInstance *ri) {
    redisAssert(ri->flags & SRI_MASTER);
    if (!(ri->flags & SRI_FAILOVER_IN_PROGRESS)) return;
        switch(ri->failover_state) {
            case SENTINEL_FAILOVER_STATE_WAIT_START:
            sentinelFailoverWaitStart(ri);
            break;
            case SENTINEL_FAILOVER_STATE_SELECT_SLAVE:
            sentinelFailoverSelectSlave(ri);
            break;
            case SENTINEL_FAILOVER_STATE_SEND_SLAVEOF_NOONE:
            sentinelFailoverSendSlaveOfNoOne(ri);
            break;
            case SENTINEL_FAILOVER_STATE_WAIT_PROMOTION:
            sentinelFailoverWaitPromotion(ri);
            break;
            case SENTINEL_FAILOVER_STATE_RECONF_SLAVES:
            sentinelFailoverReconfNextSlave(ri);
            break;
    }
}

上面的case是Sentinel故障转移中的六种状态：

sentinelFailoverStateMachine就是根据这些状态判断故障转移进行到了哪一步从而执行相应的函数，下面我们分别看着六个状态对应需要完成的工作是什么。

9.1 WAIT_START

1、当一个主服务器被判断为客观下线时，监视这个主服务器的各个Sentinel会进行协商，选举出一个领头Sentinel，并由领头Sentinel对主服务器进行故障转移操作。

此状态下调用函数sentinelFailoverWaitStart所进行的工作主要是判断自己是否为领头Sentinel：

// 准备执行故障转移
/*src/sentinel.c/sentinelFailoverWaitStart*/
void sentinelFailoverWaitStart(sentinelRedisInstance *ri) {
    char *leader;
    int isleader;

    /* Check if we are the leader for the failover epoch. */
    // 获取给定纪元的领头 Sentinel
    leader = sentinelGetLeader(ri, ri->failover_epoch);
    // 本 Sentinel 是否为领头 Sentinel ？
    isleader = leader && strcasecmp(leader,server.runid) == 0;
    sdsfree(leader);

    /* If I'm not the leader, and it is not a forced failover via
     * SENTINEL FAILOVER, then I can't continue with the failover. */
    // 如果本 Sentinel 不是领头，并且这次故障迁移不是一次强制故障迁移操作
    // 那么本 Sentinel 不做动作
    if (!isleader && !(ri->flags & SRI_FORCE_FAILOVER)) {
        int election_timeout = SENTINEL_ELECTION_TIMEOUT;

        /* The election timeout is the MIN between SENTINEL_ELECTION_TIMEOUT
         * and the configured failover timeout. */
        // 当选的时长（类似于任期）是 SENTINEL_ELECTION_TIMEOUT
        // 和 Sentinel 设置的故障迁移时长之间的较小那个值
        if (election_timeout > ri->failover_timeout)
            election_timeout = ri->failover_timeout;

        /* Abort the failover if I'm not the leader after some time. */
        // Sentinel 的当选时间已过，取消故障转移计划
        if (mstime() - ri->failover_start_time > election_timeout) {
            sentinelEvent(REDIS_WARNING,"-failover-abort-not-elected",ri,"%@");
            // 取消故障转移
            sentinelAbortFailover(ri);
        }
        return;
    }

    // 本 Sentinel 作为领头，开始执行故障迁移操作...

    sentinelEvent(REDIS_WARNING,"+elected-leader",ri,"%@");

    // 进入选择从服务器状态
    ri->failover_state = SENTINEL_FAILOVER_STATE_SELECT_SLAVE;
    ri->failover_state_change_time = mstime();

    sentinelEvent(REDIS_WARNING,"+failover-state-select-slave",ri,"%@");
}

如果是领头Sentinel则将状态更新为SELECT_SLAVE。

9.2 SELECT_SLAVE

这个状态即为选取从服务器作为新的主服务器：

// 选择合适的从服务器作为新的主服务器
void sentinelFailoverSelectSlave(sentinelRedisInstance *ri) {

    // 在旧主服务器所属的从服务器中，选择新服务器
    sentinelRedisInstance *slave = sentinelSelectSlave(ri);

    /* We don't handle the timeout in this state as the function aborts
     * the failover or go forward in the next state. */
    // 没有合适的从服务器，直接终止故障转移操作
    if (slave == NULL) {

        // 没有可用的从服务器可以提升为新主服务器，故障转移操作无法执行
        sentinelEvent(REDIS_WARNING,"-failover-abort-no-good-slave",ri,"%@");

        // 中止故障转移
        sentinelAbortFailover(ri);

    } else {

        // 成功选定新主服务器

        // 发送事件
        sentinelEvent(REDIS_WARNING,"+selected-slave",slave,"%@");

        // 打开实例的升级标记
        slave->flags |= SRI_PROMOTED;

        // 记录被选中的从服务器
        ri->promoted_slave = slave;

        // 更新故障转移状态
        ri->failover_state = SENTINEL_FAILOVER_STATE_SEND_SLAVEOF_NOONE;

        // 更新状态改变时间
        ri->failover_state_change_time = mstime();

        // 发送事件
        sentinelEvent(REDIS_NOTICE,"+failover-state-send-slaveof-noone",
            slave, "%@");
    }
}

此时状态更新为SLAVEOF_NOONE。

9.3 SLAVEOF_NOONE

此状态的工作是向选出来的新的主服务器发送SLAVEOF no one命令，使其成为真正的主服务器：

// 向被选中的从服务器发送 SLAVEOF no one 命令
// 将它升级为新的主服务器
/*src/sentinel.c/sentinelFailoverSendSlaveOfNoOne*/
void sentinelFailoverSendSlaveOfNoOne(sentinelRedisInstance *ri) {
    int retval;

    /* We can't send the command to the promoted slave if it is now
     * disconnected. Retry again and again with this state until the timeout
     * is reached, then abort the failover. */
    // 如果选中的从服务器断线了，那么在给定的时间内重试
    // 如果给定时间内选中的从服务器也没有上线，那么终止故障迁移操作
    // （一般来说出现这种情况的机会很小，因为在选择新的主服务器时，
    // 已经断线的从服务器是不会被选中的，所以这种情况只会出现在
    // 从服务器被选中，并且发送 SLAVEOF NO ONE 命令之前的这段时间内）
    if (ri->promoted_slave->flags & SRI_DISCONNECTED) {

        // 如果超过时限，就不再重试
        if (mstime() - ri->failover_state_change_time > ri->failover_timeout) {
            sentinelEvent(REDIS_WARNING,"-failover-abort-slave-timeout",ri,"%@");
            sentinelAbortFailover(ri);
        }
        return;
    }

    /* Send SLAVEOF NO ONE command to turn the slave into a master.
     *
     * 向被升级的从服务器发送 SLAVEOF NO ONE 命令，将它变为一个主服务器。
     *
     * We actually register a generic callback for this command as we don't
     * really care about the reply. We check if it worked indirectly observing
     * if INFO returns a different role (master instead of slave). 
     *
     * 这里没有为命令回复关联一个回调函数，因为从服务器是否已经转变为主服务器可以
     * 通过向从服务器发送 INFO 命令来确认
     */
    retval = sentinelSendSlaveOf(ri->promoted_slave,NULL,0);
    if (retval != REDIS_OK) return;
    sentinelEvent(REDIS_NOTICE, "+failover-state-wait-promotion",
        ri->promoted_slave,"%@");

    // 更新状态
    // 这个状态会让 Sentinel 等待被选中的从服务器升级为主服务器
    ri->failover_state = SENTINEL_FAILOVER_STATE_WAIT_PROMOTION;

    // 更新状态改变的时间
    ri->failover_state_change_time = mstime();
}

9.4 WAIT_PROMOTION

负责检查时限，调用函数sentinelFailoverWaitPromotion只做了超时判断，如果超时则停止故障修复：

/* We actually wait for promotion indirectly checking with INFO when the
 * slave turns into a master. */
// Sentinel 会通过 INFO 命令的回复检查从服务器是否已经转变为主服务器
// 这里只负责检查时限
/*src/sentinel.c/sentinelFailoverWaitPromotion*/
void sentinelFailoverWaitPromotion(sentinelRedisInstance *ri) {
    /* Just handle the timeout. Switching to the next state is handled
     * by the function parsing the INFO command of the promoted slave. */
    if (mstime() - ri->failover_state_change_time > ri->failover_timeout) {
        sentinelEvent(REDIS_WARNING,"-failover-abort-slave-timeout",ri,"%@");
        sentinelAbortFailover(ri);
    }
}

9.5 RECONF_SLAVE

主要做的是向其他候选从服务器发送slaveof promote_slave，使其成为他们的主机：

/* Send SLAVE OF <new master address> to all the remaining slaves that
 * still don't appear to have the configuration updated. */
// 向所有尚未同步新主服务器的从服务器发送 SLAVEOF <new-master-address> 命令
void sentinelFailoverReconfNextSlave(sentinelRedisInstance *master) {
    dictIterator *di;
    dictEntry *de;
    int in_progress = 0;

    // 计算正在同步新主服务器的从服务器数量
    di = dictGetIterator(master->slaves);
    while((de = dictNext(di)) != NULL) {
        sentinelRedisInstance *slave = dictGetVal(de);

        // SLAVEOF 命令已发送，或者同步正在进行
        if (slave->flags & (SRI_RECONF_SENT|SRI_RECONF_INPROG))
            in_progress++;
    }
    dictReleaseIterator(di);

    // 如果正在同步的从服务器的数量少于 parallel-syncs 选项的值
    // 那么继续遍历从服务器，并让从服务器对新主服务器进行同步
    di = dictGetIterator(master->slaves);
    while(in_progress < master->parallel_syncs &&
          (de = dictNext(di)) != NULL)
    {
        sentinelRedisInstance *slave = dictGetVal(de);
        int retval;

        /* Skip the promoted slave, and already configured slaves. */
        // 跳过新主服务器，以及已经完成了同步的从服务器
        if (slave->flags & (SRI_PROMOTED|SRI_RECONF_DONE)) continue;

        /* If too much time elapsed without the slave moving forward to
         * the next state, consider it reconfigured even if it is not.
         * Sentinels will detect the slave as misconfigured and fix its
         * configuration later. */
        if ((slave->flags & SRI_RECONF_SENT) &&
            (mstime() - slave->slave_reconf_sent_time) >
            SENTINEL_SLAVE_RECONF_TIMEOUT)
        {
            // 发送重拾同步事件
            sentinelEvent(REDIS_NOTICE,"-slave-reconf-sent-timeout",slave,"%@");
            // 清除已发送 SLAVEOF 命令的标记
            slave->flags &= ~SRI_RECONF_SENT;
            slave->flags |= SRI_RECONF_DONE;
        }

        /* Nothing to do for instances that are disconnected or already
         * in RECONF_SENT state. */
        // 如果已向从服务器发送 SLAVEOF 命令，或者同步正在进行
        // 又或者从服务器已断线，那么略过该服务器
        if (slave->flags & (SRI_DISCONNECTED|SRI_RECONF_SENT|SRI_RECONF_INPROG))
            continue;

        /* Send SLAVEOF <new master>. */
        // 向从服务器发送 SLAVEOF 命令，让它同步新主服务器
        retval = sentinelSendSlaveOf(slave,
                master->promoted_slave->addr->ip,
                master->promoted_slave->addr->port);
        if (retval == REDIS_OK) {

            // 将状态改为 SLAVEOF 命令已发送
            slave->flags |= SRI_RECONF_SENT;
            // 更新发送 SLAVEOF 命令的时间
            slave->slave_reconf_sent_time = mstime();
            sentinelEvent(REDIS_NOTICE,"+slave-reconf-sent",slave,"%@");
            // 增加当前正在同步的从服务器的数量
            in_progress++;
        }
    }
    dictReleaseIterator(di);

    /* Check if all the slaves are reconfigured and handle timeout. */
    // 判断是否所有从服务器的同步都已经完成
    sentinelFailoverDetectEnd(master);
}

9.6 UPDATE_CONFIG

故障转移结束后，将进入这一状态，会调用sentinelFailoverSwitchToPromotedSlave函数，将之前的下线master移除master表格，并由新的主服务器代替：

/* This function is called when the slave is in
 * SENTINEL_FAILOVER_STATE_UPDATE_CONFIG state. In this state we need
 * to remove it from the master table and add the promoted slave instead. */
// 这个函数在 master 已下线，并且对这个 master 的故障迁移操作已经完成时调用
// 这个 master 会被移除出 master 表格，并由新的主服务器代替
void sentinelFailoverSwitchToPromotedSlave(sentinelRedisInstance *master) {

    /// 选出要添加的 master
    sentinelRedisInstance *ref = master->promoted_slave ?
                                 master->promoted_slave : master;

    // 发送更新 master 事件
    sentinelEvent(REDIS_WARNING,"+switch-master",master,"%s %s %d %s %d",
        // 原 master 信息
        master->name, master->addr->ip, master->addr->port,
        // 新 master 信息
        ref->addr->ip, ref->addr->port);

    // 用新主服务器的信息代替原 master 的信息
    sentinelResetMasterAndChangeAddress(master,ref->addr->ip,ref->addr->port);
}

至此，故障转移操作完成。

【参考】
[1] 《Redis设计与实现》
[2] 《Redis源码日志》

Redis源码研究之哨兵Sentinel