Redis源码阅读:字典dict的实现

代码版本:Branch 5.0
Github地址:戳我


字典主要数据结构

总体来说,Redis的字典使用哈希表作为底层实现,一个字典包含多个哈希表节点,哈希表节点中存放有键值对。具体的结构,自底层至顶层的定义如下

typedef struct dictEntry {
    void *key;
    union {
        void *val;
        uint64_t u64;
        int64_t s64;
        double d;
    } v;
    struct dictEntry *next;
} dictEntry;

typedef struct dictType {
    uint64_t (*hashFunction)(const void *key);
    void *(*keyDup)(void *privdata, const void *key);
    void *(*valDup)(void *privdata, const void *obj);
    int (*keyCompare)(void *privdata, const void *key1, const void *key2);
    void (*keyDestructor)(void *privdata, void *key);
    void (*valDestructor)(void *privdata, void *obj);
} dictType;

/* This is our hash table structure. Every dictionary has two of this as we
 * implement incremental rehashing, for the old to the new table. */
typedef struct dictht {
    dictEntry **table;
    unsigned long size;
    unsigned long sizemask;
    unsigned long used;
} dictht;

typedef struct dict {
    dictType *type;
    void *privdata;
    dictht ht[2];
    long rehashidx; /* rehashing not in progress if rehashidx == -1 */
    unsigned long iterators; /* number of iterators currently running */
} dict;

typedef struct dictIterator {
    dict *d;
    long index;
    int table, safe;
    dictEntry *entry, *nextEntry;
    /* unsafe iterator fingerprint for misuse detection. */
    long long fingerprint;
} dictIterator;
  • dictEntry实际上定义了一种用来存放键值对的数据结构,含有一个可以转换为任意类型指针的void指针用于存放key值,而v属性可以是一个指针,一个64位整形数或者一个双精度浮点数。next指针则指向另一个哈希表节点,当键值冲突时,将相同键值的节点构成一个链表即可。

  • 数据结构dictType里面存放了与特定字典相关的属性,即一些处理键值的函数指针

  • dictht数据结构定义了哈希表的结构

    • dictEntry **table : 指向一组哈希节点
    • size : 哈希表的大小
    • sizemask : 掩码,总是等于size-1,用于和哈希值一起计算,得出键的位置
    • used : 表示当前已经使用的数量
  • dict即字典结构
    • type : 指向一个dictType指针,存放处理键的函数
    • privdata : 被type中的函数所使用
    • ht[2] : 每个字典结构包含两个哈希表,用于rehashing,一个是新表一个是旧表
    • rehashidx : rehash过程中的索引,可以表征当前的进度,-1代表还没开始
    • iterators表示当前运行的迭代器的数量

创建字典以及哈希算法

创建字典

创建字典有关的几个函数如下所示,即给dict分配空间,传入dictType与privData参数,并依次初始化结构体中的数值。

/* Create a new hash table */
dict *dictCreate(dictType *type,
        void *privDataPtr)
{
    dict *d = zmalloc(sizeof(*d));

    _dictInit(d,type,privDataPtr);
    return d;
}

/* Initialize the hash table */
int _dictInit(dict *d, dictType *type,
        void *privDataPtr)
{
    _dictReset(&d->ht[0]);
    _dictReset(&d->ht[1]);
    d->type = type;
    d->privdata = privDataPtr;
    d->rehashidx = -1;
    d->iterators = 0;
    return DICT_OK;
}

/* Reset a hash table already initialized with ht_init().
 * NOTE: This function should only be called by ht_destroy(). */
static void _dictReset(dictht *ht)
{
    ht->table = NULL;
    ht->size = 0;
    ht->sizemask = 0;
    ht->used = 0;
}

哈希算法

在字典中添加一对键值对时,Redis根据key值生成了与之对应的哈希值,哈希值与dictht中的sizemask一起决定了键值对应当被存放的位置。值得注意的是,当前版本的Redis所采用的哈希算法为Siphash算法,相比MurmurHash2算法,性能更具优势,具体详见:Siphash Wiki,Redis中Siphash的实现在siphash.c文件中,这里只需要关心如何使用即可,故没有深入研究该算法的实现。

/* Add an element to the target hash table */
int dictAdd(dict *d, void *key, void *val)
{
    dictEntry *entry = dictAddRaw(d,key,NULL);

    if (!entry) return DICT_ERR;
    dictSetVal(d, entry, val);
    return DICT_OK;
}

/* Low level add or find:
 * This function adds the entry but instead of setting a value returns the
 * dictEntry structure to the user, that will make sure to fill the value
 * field as he wishes.
 *
 * This function is also directly exposed to the user API to be called
 * mainly in order to store non-pointers inside the hash value, example:
 *
 * entry = dictAddRaw(dict,mykey,NULL);
 * if (entry != NULL) dictSetSignedIntegerVal(entry,1000);
 *
 * Return values:
 *
 * If key already exists NULL is returned, and "*existing" is populated
 * with the existing entry if existing is not NULL.
 *
 * If key was added, the hash entry is returned to be manipulated by the caller.
 */
dictEntry *dictAddRaw(dict *d, void *key, dictEntry **existing)
{
    long index;
    dictEntry *entry;
    dictht *ht;

    if (dictIsRehashing(d)) _dictRehashStep(d);

    /* Get the index of the new element, or -1 if
     * the element already exists. */
    if ((index = _dictKeyIndex(d, key, dictHashKey(d,key), existing)) == -1)
        return NULL;

    /* Allocate the memory and store the new entry.
     * Insert the element in top, with the assumption that in a database
     * system it is more likely that recently added entries are accessed
     * more frequently. */
    ht = dictIsRehashing(d) ? &d->ht[1] : &d->ht[0];
    entry = zmalloc(sizeof(*entry));
    entry->next = ht->table[index];
    ht->table[index] = entry;
    ht->used++;

    /* Set the hash entry fields. */
    dictSetKey(d, entry, key);
    return entry;
}

#define dictSetVal(d, entry, _val_) do { \
    if ((d)->type->valDup) \
        (entry)->v.val = (d)->type->valDup((d)->privdata, _val_); \
    else \
        (entry)->v.val = (_val_); \
} while(0)

#define dictSetKey(d, entry, _key_) do { \
    if ((d)->type->keyDup) \
        (entry)->key = (d)->type->keyDup((d)->privdata, _key_); \
    else \
        (entry)->key = (_key_); \
} while(0)

/* Returns the index of a free slot that can be populated with
 * a hash entry for the given 'key'.
 * If the key already exists, -1 is returned
 * and the optional output parameter may be filled.
 *
 * Note that if we are in the process of rehashing the hash table, the
 * index is always returned in the context of the second (new) hash table. */
static long _dictKeyIndex(dict *d, const void *key, uint64_t hash, dictEntry **existing)
{
    unsigned long idx, table;
    dictEntry *he;
    if (existing) *existing = NULL;

    /* Expand the hash table if needed */
    if (_dictExpandIfNeeded(d) == DICT_ERR)
        return -1;
    for (table = 0; table <= 1; table++) {
        idx = hash & d->ht[table].sizemask;
        /* Search if this slot does not already contain the given key */
        he = d->ht[table].table[idx];
        while(he) {
            if (key==he->key || dictCompareKeys(d, key, he->key)) {
                if (existing) *existing = he;
                return -1;
            }
            he = he->next;
        }
        if (!dictIsRehashing(d)) break;
    }
    return idx;
}

#define dictHashKey(d, key) (d)->type->hashFunction(key)

在dict.c/_dictKeyIndex函数中idx = hash & d->ht[table].sizemask可以得到键值对应当存放的序列号,得到序列后,即可向字典中添加元素,添加的时候会检测是否正在rehash以及是否序列号重复的情况,如果正在rehash,那么向第一个dictht中添加,否则向第二个添加。序列号冲突即哈希冲突的解决方案使用的是开链法,将同一个序列号的dictEntry构成一个链表,并且每次都是在链表头部添加元素,所以添加元素的复杂度是*O(1)*。

    ht = dictIsRehashing(d) ? &d->ht[1] : &d->ht[0];
    entry = zmalloc(sizeof(*entry));
    entry->next = ht->table[index];
    ht->table[index] = entry;
    ht->used++;

rehash过程

rehash的过程比较复杂,主要分为几个主要部分:

  • 触发rehash的条件:受限于size的大小,当哈希表内元素达到一定程度,不可避免的会遇到很多哈希冲突的情况,开链法的使用实际上降低了键值对访问的效率,负载因子可以对哈希值冲突程度做一个量化的评价,负载因子定义为:load_factor = ht[0].used / ht[0].size。根据《Redis设计与实现》一书中的描述,当满足以下任一条件时,启动对字典的扩容或者收缩:
    • 扩容:服务器没有在执行BGSAVE或者BGREWRITEAOF,并且哈希表的负载因子大于等于1
    • 扩容:服务器正在执行BGSAVE或者BGREWRITEAOF,并且哈希表的负载银子大于等于5
    • 收缩:负载因子小于0.1
  • 渐进式rehash:之前有提到,dict结构体中含有两个dictht结构,在rehash开始前,一个作为原有的待复制的哈希表,另一个为空白哈希表;执行rehash时,为了避免长时间的hang在这一个过程中,Redis采取的方式时渐进式的rehash,每次将n个序列转移到ht[1]中,直到最终将ht[0]上的键值完整转移到ht[1]上之后,调换ht[0]与ht[1],并重新设置dict的rehashidx为-1;

  • 执行rehash过程中,对字典其他操作的影响:在执行rehash的过程中,添加新的键值对的时候,只对ht[1]进行操作,而删除、查找、更新则同时对两个表进行操作

字典的遍历

这里还有一个函数,值得花时间讨论一下,那就是dictScan函数,dict在操作的过程中,任何操作,都要注意rehash过程,如何在rehash过程中安全完全的遍历哈希表,是需要花费时间考虑的,我们可以看下redis的实现:

unsigned long dictScan(dict *d,
                       unsigned long v,
                       dictScanFunction *fn,
                       dictScanBucketFunction* bucketfn,
                       void *privdata)
{
    dictht *t0, *t1;
    const dictEntry *de, *next;
    unsigned long m0, m1;

    if (dictSize(d) == 0) return 0;

    if (!dictIsRehashing(d)) {
        t0 = &(d->ht[0]);
        m0 = t0->sizemask;

        /* Emit entries at cursor */
        if (bucketfn) bucketfn(privdata, &t0->table[v & m0]);
        de = t0->table[v & m0];
        while (de) {
            next = de->next;
            fn(privdata, de);
            de = next;
        }

        /* Set unmasked bits so incrementing the reversed cursor
         * operates on the masked bits */
        v |= ~m0;

        /* Increment the reverse cursor */
        v = rev(v);
        v++;
        v = rev(v);

    } else {
        t0 = &d->ht[0];
        t1 = &d->ht[1];

        /* Make sure t0 is the smaller and t1 is the bigger table */
        if (t0->size > t1->size) {
            t0 = &d->ht[1];
            t1 = &d->ht[0];
        }

        m0 = t0->sizemask;
        m1 = t1->sizemask;

        /* Emit entries at cursor */
        if (bucketfn) bucketfn(privdata, &t0->table[v & m0]);
        de = t0->table[v & m0];
        while (de) {
            next = de->next;
            fn(privdata, de);
            de = next;
        }

        /* Iterate over indices in larger table that are the expansion
         * of the index pointed to by the cursor in the smaller table */
        do {
            /* Emit entries at cursor */
            if (bucketfn) bucketfn(privdata, &t1->table[v & m1]);
            de = t1->table[v & m1];
            while (de) {
                next = de->next;
                fn(privdata, de);
                de = next;
            }

            /* Increment the reverse cursor not covered by the smaller mask.*/
            v |= ~m1;
            v = rev(v);
            v++;
            v = rev(v);

            /* Continue while bits covered by mask difference is non-zero */
        } while (v & (m0 ^ m1));
    }

    return v;
}

关于这一函数,源码上光是注释就有几十行了,详细分析的话,其实可以单独写一篇博客了,这是我看到的一篇比较好的分析博客:Redis字典的遍历;简单来说就是,区别于顺序遍历,Redis的遍历方式为reverse binary iteration(反向二进制法),这篇文章估计是写不下了,总之这个方法可以保证没有被修改的数据一定能被正确遍历,被修改的数据尽可能的减小重复遍历或者漏掉的概率。

其他API函数

其他还有一些字典常见的函数,很好理解,就不多说了。