从redis的rehash谈低延时设计

Redis作为KV的缓存系统，它的数据存储是如何设计的？面临庞大的KV时，是如何做到低延时的？这篇我们从低延时的角度谈谈redis的实现，以下代码基于redis7.2版本。

一、redis的hash表
redis的db的基本构成是dict，定义如下：

typedef struct dictEntry {

    void *key;

    union {
        void *val;
        uint64_t u64;
        int64_t s64;
        double d;
    } v;

    struct dictEntry *next; /* Next entry in the same hash bucket. */
    void *metadata[];       /* An arbitrary number of bytes (starting at a
                             * pointer-aligned address) of size as returned
                             * by dictType's dictEntryMetadataBytes(). */
} dictEntry;

typedef struct dictType {

    uint64_t (*hashFunction)(const void *key);
    void *(*keyDup)(dict *d, const void *key);
    void *(*valDup)(dict *d, const void *obj);
    int (*keyCompare)(dict *d, const void *key1, const void *key2);
    void (*keyDestructor)(dict *d, void *key);
    void (*valDestructor)(dict *d, void *obj);
    int (*expandAllowed)(size_t moreMem, double usedRatio);

    /* Allow a dictEntry to carry extra caller-defined metadata. The
     * extra memory is initialized to 0 when a dictEntry is allocated. */
    size_t (*dictEntryMetadataBytes)(dict *d);
} dictType;

#define DICTHT_SIZE(exp) ((exp) == -1 ? 0 : (unsigned long)1<<(exp))
#define DICTHT_SIZE_MASK(exp) ((exp) == -1 ? 0 : (DICTHT_SIZE(exp))-1)

struct dict {
    dictType *type;

    dictEntry **ht_table[2];
    unsigned long ht_used[2];

    long rehashidx; /* rehashing not in progress if rehashidx == -1 */

    /* Keep small vars at end for optimal (minimal) struct padding */
    int16_t pauserehash; /* If >0 rehashing is paused (<0 indicates coding error) */
    signed char ht_size_exp[2]; /* exponent of size. (size = 1<<exp) */
};

ht_table[2]表示两个hash表，这个表的结构用图表示如下，ht_table[1]指向rehash的表，当rehash完了后，释放ht_table[0]，并将ht_table[0]指向ht_table[0];

Image.png

ht_used[2]：表示两个ht_table已经占有的空间；
rehashidx：rehash时，当前已经rehash到的ht_table位置；
pauserehash：rehash的迭代器，每rehash完一个hash表节点就步进1；
ht_size_exp[2]：两个hash表的空间大小（1<<exp，即hash表是以指数增长空间的）；

dictEntry的v可以是hset、list、zset等等其他任意结构，有些地方也把每一个hash表的节点叫做桶（bucket），也就是用一个链表来解决hash冲突。

对于dict结构，首先定义个type字段，type表示ht_table使用的hash算法，关于redis的hash算法在不同版本有不同实现，具体描述如下：
1. Redis2.8和3.0 版本：这两个版本的Redis使用的是MurmurHash2算法。MurmurHash是一种非加密哈希算法，它具有高性能和良好的哈希分布特性。MurmurHash2是MurmurHash的改进版本，它在速度和分布质量方面有所提升。

Redis3.2版本：在Redis3.2版本中，也使用了MurmurHash2算法。然而，这个版本对MurmurHash2进行了一些调整，使其在处理字符串类型的键时具有更好的性能。
Redis4.0和5.0版本：从Redis4.0开始，哈希算法被替换为SipHash。SipHash是一种加密哈希算法，它的设计目的是在速度和安全性之间达到一个平衡。相比MurmurHash2，SipHash更能抵抗哈希洪范攻击，从而提高了Redis的安全性。
Redis6.0 及以后的版本：在Redis6.0及后续版本中，依然使用SipHash作为哈希算法。

Redis在不同版本中使用了两种哈希算法，分别是MurmurHash2和SipHash。从Redis4.0开始，SipHash成为了默认的哈希算法。具体实现代码如下：

NO_SANITIZE("alignment")
uint64_t siphash(const uint8_t *in, const size_t inlen, const uint8_t *k) {
#ifndef UNALIGNED_LE_CPU
    uint64_t hash;
    uint8_t *out = (uint8_t*) &hash;
#endif
    uint64_t v0 = 0x736f6d6570736575ULL;
    uint64_t v1 = 0x646f72616e646f6dULL;
    uint64_t v2 = 0x6c7967656e657261ULL;
    uint64_t v3 = 0x7465646279746573ULL;
    uint64_t k0 = U8TO64_LE(k);
    uint64_t k1 = U8TO64_LE(k + 8);
    uint64_t m;
    const uint8_t *end = in + inlen - (inlen % sizeof(uint64_t));
    const int left = inlen & 7;
    uint64_t b = ((uint64_t)inlen) << 56;
    v3 ^= k1;
    v2 ^= k0;
    v1 ^= k1;
    v0 ^= k0;

    for (; in != end; in += 8) {
        m = U8TO64_LE(in);
        v3 ^= m;

        SIPROUND;

        v0 ^= m;
    }

    switch (left) {
    case 7: b |= ((uint64_t)in[6]) << 48; /* fall-thru */
    case 6: b |= ((uint64_t)in[5]) << 40; /* fall-thru */
    case 5: b |= ((uint64_t)in[4]) << 32; /* fall-thru */
    case 4: b |= ((uint64_t)in[3]) << 24; /* fall-thru */
    case 3: b |= ((uint64_t)in[2]) << 16; /* fall-thru */
    case 2: b |= ((uint64_t)in[1]) << 8; /* fall-thru */
    case 1: b |= ((uint64_t)in[0]); break;
    case 0: break;
    }

    v3 ^= b;

    SIPROUND;

    v0 ^= b;

    v2 ^= 0xff;

    SIPROUND;
    SIPROUND;

    b = v0 ^ v1 ^ v2 ^ v3;

#ifndef UNALIGNED_LE_CPU
    U64TO8_LE(out, b);
    return hash;
#else
    return b;
#endif
}


NO_SANITIZE("alignment")
uint64_t siphash_nocase(const uint8_t *in, const size_t inlen, const uint8_t *k) {
#ifndef UNALIGNED_LE_CPU
    uint64_t hash;
    uint8_t *out = (uint8_t*) &hash;
#endif
    uint64_t v0 = 0x736f6d6570736575ULL;
    uint64_t v1 = 0x646f72616e646f6dULL;
    uint64_t v2 = 0x6c7967656e657261ULL;
    uint64_t v3 = 0x7465646279746573ULL;
    uint64_t k0 = U8TO64_LE(k);
    uint64_t k1 = U8TO64_LE(k + 8);
    uint64_t m;
    const uint8_t *end = in + inlen - (inlen % sizeof(uint64_t));
    const int left = inlen & 7;
    uint64_t b = ((uint64_t)inlen) << 56;
    v3 ^= k1;
    v2 ^= k0;
    v1 ^= k1;
    v0 ^= k0;

    for (; in != end; in += 8) {
        m = U8TO64_LE_NOCASE(in);
        v3 ^= m;

        SIPROUND;

        v0 ^= m;
    }

    switch (left) {
    case 7: b |= ((uint64_t)siptlw(in[6])) << 48; /* fall-thru */
    case 6: b |= ((uint64_t)siptlw(in[5])) << 40; /* fall-thru */
    case 5: b |= ((uint64_t)siptlw(in[4])) << 32; /* fall-thru */
    case 4: b |= ((uint64_t)siptlw(in[3])) << 24; /* fall-thru */
    case 3: b |= ((uint64_t)siptlw(in[2])) << 16; /* fall-thru */
    case 2: b |= ((uint64_t)siptlw(in[1])) << 8; /* fall-thru */
    case 1: b |= ((uint64_t)siptlw(in[0])); break;
    case 0: break;
    }

    v3 ^= b;

    SIPROUND;

    v0 ^= b;
    v2 ^= 0xff;

    SIPROUND;
    SIPROUND;

    b = v0 ^ v1 ^ v2 ^ v3;

#ifndef UNALIGNED_LE_CPU
    U64TO8_LE(out, b);
    return hash;
#else
    return b;
#endif
}

[2023.08.16更新]2016 新出的 HighwayHash，它宣称可以达到 SipHash 一样的效果，并且凭借 SIMD 的加持，在运算速度上它是 SipHash 的5.2倍（见论文https://arxiv.org/abs/1612.06257）

二、redis的rehash算法
什么情况下进行rehash，分扩容和缩容两种情况：
redis的负载因子：
loadfactor = ht_used[0] / DICTHT_SIZE(d->ht_size_exp[0]);

扩容情况：

未创建子进程去进行bgsave或者bgrewriteaof时，如果loadfactor 大于1，则进行扩容；
[2023.08.16更新]专业的测试给出的loadfactor设置为0.75时，性能是较佳的，超过0.75时性能会下降。
创建了子进程去进行bgsave或者bgrewriteaof时，如果loadfactor 大于5，则进行扩容，这是为了尽量让子进程处理完持久化的工作；

/* Using dictEnableResize() / dictDisableResize() we make possible to
 * enable/disable resizing of the hash table as needed. This is very important
 * for Redis, as we use copy-on-write and don't want to move too much memory
 * around when there is a child performing saving operations.
 *
 * Note that even when dict_can_resize is set to 0, not all resizes are
 * prevented: a hash table is still allowed to grow if the ratio between
 * the number of elements and the buckets > dict_force_resize_ratio. */
static int dict_can_resize = 1;
static unsigned int dict_force_resize_ratio = 5;

/* Expand the hash table if needed */
static int _dictExpandIfNeeded(dict *d) {

    /* Incremental rehashing already in progress. Return. */
    if (dictIsRehashing(d)) return DICT_OK;

    /* If the hash table is empty expand it to the initial size. */
    if (DICTHT_SIZE(d->ht_size_exp[0]) == 0) return dictExpand(d, DICT_HT_INITIAL_SIZE);

    /* If we reached the 1:1 ratio, and we are allowed to resize the hash
     * table (global setting) or we should avoid it but the ratio between
     * elements/buckets is over the "safe" threshold, we resize doubling
     * the number of buckets. */
    if (d->ht_used[0] >= DICTHT_SIZE(d->ht_size_exp[0]) &&
        (dict_can_resize ||
         d->ht_used[0]/ DICTHT_SIZE(d->ht_size_exp[0]) > dict_force_resize_ratio) &&
         dictTypeExpandAllowed(d))
    {
        return dictExpand(d, d->ht_used[0] + 1);
    }

    return DICT_OK;
}

/* This function is called once a background process of some kind terminates,
 * as we want to avoid resizing the hash tables when there is a child in order
 * to play well with copy-on-write (otherwise when a resize happens lots of
 * memory pages are copied). The goal of this function is to update the ability
 * for dict.c to resize the hash tables accordingly to the fact we have an
 * active fork child running. */
void updateDictResizePolicy(void) {
    if (!hasActiveChildProcess())
        dictEnableResize();
    else
        dictDisableResize();
}

void dictEnableResize(void) {
    dict_can_resize = 1;
}

缩容情况，当已占用的节点数少于空间大小的10%时触发：

/* Hash table parameters */
#define HASHTABLE_MIN_FILL 10 /* Minimal hash table fill 10% */

int htNeedsResize(dict *dict) {
    long long size, used;

    size = dictSlots(dict);
    used = dictSize(dict);

    return (size > DICT_HT_INITIAL_SIZE &&
        (used*100/size < HASHTABLE_MIN_FILL));
}

/* If the percentage of used slots in the HT reaches HASHTABLE_MIN_FILL
 * we resize the hash table to save memory */
void tryResizeHashTables(int dbid) {
    if (htNeedsResize(server.db[dbid].dict))
        dictResize(server.db[dbid].dict);
    if (htNeedsResize(server.db[dbid].expires))
        dictResize(server.db[dbid].expires);
}

上文解释了触发rehash的时机，那redis是如何进行rehash降低系统的整个延时？

当系统空闲的时候，才会做一些background的工作，比如rehash、过期key处理、resizing等等，进行rehash时不是一次性干完，而是每个执行周期仅仅干1毫秒。那么如果hash表特别大，比如几千万个key时，那可能要阻塞很久，但是如果是分步执行rehash，那么就不会阻塞正常的io请求。

/* Our hash table implementation performs rehashing incrementally while
 * we write/read from the hash table. Still if the server is idle, the hash
 * table will use two tables for a long time. So we try to use 1 millisecond
 * of CPU time at every call of this function to perform some rehashing.
 *
 * The function returns 1 if some rehashing was performed, otherwise 0
 * is returned. */
int incrementallyRehash(int dbid) {

    /* Keys dictionary */
    if (dictIsRehashing(server.db[dbid].dict)) {
        dictRehashMilliseconds(server.db[dbid].dict,1);
        return 1; /* already used our millisecond for this loop... */
     }

     /* Expires */
     if (dictIsRehashing(server.db[dbid].expires)) {
         dictRehashMilliseconds(server.db[dbid].expires,1);
         return 1; /* already used our millisecond for this loop... */
     }

     return 0;
}

/* Rehash in ms+"delta" milliseconds. The value of "delta" is larger
* than 0, and is smaller than 1 in most cases. The exact upper bound
* depends on the running time of dictRehash(d,100).*/
int dictRehashMilliseconds(dict *d, int ms) {
    if (d->pauserehash > 0) return 0;

    long long start = timeInMilliseconds();
    int rehashes = 0;

    while(dictRehash(d,100)) {
        rehashes += 100;
        if (timeInMilliseconds()-start > ms) break;
    }

    return rehashes;
}

三、低延时的设计
在redis中有很多设计值得我们借鉴，比如这种低延时的设计，这里还包含：

利用空闲时间进行key的过期处理，也不是一蹴而就，而是每次执行一个很少的时间片；
持久化时创建一个子进程，利用进程的copy-on-write机制，将持久化等耗时的操作放在子进程处理这样就不会当前服务进程去响应用户的IO请求；

从redis的rehash谈低延时设计

推荐阅读更多精彩内容