Contents

Redis源码阅读：字典dict的实现

代码版本：Branch 5.0
Github地址：戳我

字典主要数据结构

总体来说，Redis的字典使用哈希表作为底层实现，一个字典包含多个哈希表节点，哈希表节点中存放有键值对。具体的结构，自底层至顶层的定义如下

typedef struct dictEntry {
    void *key;
    union {
        void *val;
        uint64_t u64;
        int64_t s64;
        double d;
    } v;
    struct dictEntry *next;
} dictEntry;

typedef struct dictType {
    uint64_t (*hashFunction)(const void *key);
    void *(*keyDup)(void *privdata, const void *key);
    void *(*valDup)(void *privdata, const void *obj);
    int (*keyCompare)(void *privdata, const void *key1, const void *key2);
    void (*keyDestructor)(void *privdata, void *key);
    void (*valDestructor)(void *privdata, void *obj);
} dictType;

/* This is our hash table structure. Every dictionary has two of this as we
 * implement incremental rehashing, for the old to the new table. */
typedef struct dictht {
    dictEntry **table;
    unsigned long size;
    unsigned long sizemask;
    unsigned long used;
} dictht;

typedef struct dict {
    dictType *type;
    void *privdata;
    dictht ht[2];
    long rehashidx; /* rehashing not in progress if rehashidx == -1 */
    unsigned long iterators; /* number of iterators currently running */
} dict;

typedef struct dictIterator {
    dict *d;
    long index;
    int table, safe;
    dictEntry *entry, *nextEntry;
    /* unsafe iterator fingerprint for misuse detection. */
    long long fingerprint;
} dictIterator;

dictEntry实际上定义了一种用来存放键值对的数据结构，含有一个可以转换为任意类型指针的void指针用于存放key值，而v属性可以是一个指针，一个64位整形数或者一个双精度浮点数。next指针则指向另一个哈希表节点，当键值冲突时，将相同键值的节点构成一个链表即可。
数据结构dictType里面存放了与特定字典相关的属性，即一些处理键值的函数指针
dictht数据结构定义了哈希表的结构
- dictEntry **table ：指向一组哈希节点
- size : 哈希表的大小
- sizemask : 掩码，总是等于size-1，用于和哈希值一起计算，得出键的位置
- used : 表示当前已经使用的数量
dict即字典结构
- type : 指向一个dictType指针，存放处理键的函数
- privdata : 被type中的函数所使用
- ht[2] ：每个字典结构包含两个哈希表，用于rehashing，一个是新表一个是旧表
- rehashidx : rehash过程中的索引，可以表征当前的进度，-1代表还没开始
- iterators表示当前运行的迭代器的数量

创建字典以及哈希算法

创建字典

创建字典有关的几个函数如下所示，即给dict分配空间，传入dictType与privData参数，并依次初始化结构体中的数值。

/* Create a new hash table */
dict *dictCreate(dictType *type,
        void *privDataPtr)
{
    dict *d = zmalloc(sizeof(*d));

    _dictInit(d,type,privDataPtr);
    return d;
}

/* Initialize the hash table */
int _dictInit(dict *d, dictType *type,
        void *privDataPtr)
{
    _dictReset(&d->ht[0]);
    _dictReset(&d->ht[1]);
    d->type = type;
    d->privdata = privDataPtr;
    d->rehashidx = -1;
    d->iterators = 0;
    return DICT_OK;
}

/* Reset a hash table already initialized with ht_init().
 * NOTE: This function should only be called by ht_destroy(). */
static void _dictReset(dictht *ht)
{
    ht->table = NULL;
    ht->size = 0;
    ht->sizemask = 0;
    ht->used = 0;
}

哈希算法

在字典中添加一对键值对时，Redis根据key值生成了与之对应的哈希值，哈希值与dictht中的sizemask一起决定了键值对应当被存放的位置。值得注意的是，当前版本的Redis所采用的哈希算法为Siphash算法，相比MurmurHash2算法，性能更具优势，具体详见：Siphash Wiki,Redis中Siphash的实现在siphash.c文件中，这里只需要关心如何使用即可，故没有深入研究该算法的实现。

/* Add an element to the target hash table */
int dictAdd(dict *d, void *key, void *val)
{
    dictEntry *entry = dictAddRaw(d,key,NULL);

    if (!entry) return DICT_ERR;
    dictSetVal(d, entry, val);
    return DICT_OK;
}

/* Low level add or find:
 * This function adds the entry but instead of setting a value returns the
 * dictEntry structure to the user, that will make sure to fill the value
 * field as he wishes.
 *
 * This function is also directly exposed to the user API to be called
 * mainly in order to store non-pointers inside the hash value, example:
 *
 * entry = dictAddRaw(dict,mykey,NULL);
 * if (entry != NULL) dictSetSignedIntegerVal(entry,1000);
 *
 * Return values:
 *
 * If key already exists NULL is returned, and "*existing" is populated
 * with the existing entry if existing is not NULL.
 *
 * If key was added, the hash entry is returned to be manipulated by the caller.
 */
dictEntry *dictAddRaw(dict *d, void *key, dictEntry **existing)
{
    long index;
    dictEntry *entry;
    dictht *ht;

    if (dictIsRehashing(d)) _dictRehashStep(d);

    /* Get the index of the new element, or -1 if
     * the element already exists. */
    if ((index = _dictKeyIndex(d, key, dictHashKey(d,key), existing)) == -1)
        return NULL;

    /* Allocate the memory and store the new entry.
     * Insert the element in top, with the assumption that in a database
     * system it is more likely that recently added entries are accessed
     * more frequently. */
    ht = dictIsRehashing(d) ? &d->ht[1] : &d->ht[0];
    entry = zmalloc(sizeof(*entry));
    entry->next = ht->table[index];
    ht->table[index] = entry;
    ht->used++;

    /* Set the hash entry fields. */
    dictSetKey(d, entry, key);
    return entry;
}

#define dictSetVal(d, entry, _val_) do { \
    if ((d)->type->valDup) \
        (entry)->v.val = (d)->type->valDup((d)->privdata, _val_); \
    else \
        (entry)->v.val = (_val_); \
} while(0)

#define dictSetKey(d, entry, _key_) do { \
    if ((d)->type->keyDup) \
        (entry)->key = (d)->type->keyDup((d)->privdata, _key_); \
    else \
        (entry)->key = (_key_); \
} while(0)

/* Returns the index of a free slot that can be populated with
 * a hash entry for the given 'key'.
 * If the key already exists, -1 is returned
 * and the optional output parameter may be filled.
 *
 * Note that if we are in the process of rehashing the hash table, the
 * index is always returned in the context of the second (new) hash table. */
static long _dictKeyIndex(dict *d, const void *key, uint64_t hash, dictEntry **existing)
{
    unsigned long idx, table;
    dictEntry *he;
    if (existing) *existing = NULL;

    /* Expand the hash table if needed */
    if (_dictExpandIfNeeded(d) == DICT_ERR)
        return -1;
    for (table = 0; table <= 1; table++) {
        idx = hash & d->ht[table].sizemask;
        /* Search if this slot does not already contain the given key */
        he = d->ht[table].table[idx];
        while(he) {
            if (key==he->key || dictCompareKeys(d, key, he->key)) {
                if (existing) *existing = he;
                return -1;
            }
            he = he->next;
        }
        if (!dictIsRehashing(d)) break;
    }
    return idx;
}

#define dictHashKey(d, key) (d)->type->hashFunction(key)

在dict.c/_dictKeyIndex函数中idx = hash & d->ht[table].sizemask可以得到键值对应当存放的序列号，得到序列后，即可向字典中添加元素,添加的时候会检测是否正在rehash以及是否序列号重复的情况，如果正在rehash，那么向第一个dictht中添加，否则向第二个添加。序列号冲突即哈希冲突的解决方案使用的是开链法，将同一个序列号的dictEntry构成一个链表，并且每次都是在链表头部添加元素，所以添加元素的复杂度是*O(1)*。

    ht = dictIsRehashing(d) ? &d->ht[1] : &d->ht[0];
    entry = zmalloc(sizeof(*entry));
    entry->next = ht->table[index];
    ht->table[index] = entry;
    ht->used++;

rehash过程

rehash的过程比较复杂，主要分为几个主要部分：

触发rehash的条件：受限于size的大小，当哈希表内元素达到一定程度，不可避免的会遇到很多哈希冲突的情况，开链法的使用实际上降低了键值对访问的效率，负载因子可以对哈希值冲突程度做一个量化的评价，负载因子定义为：$load_factor = ht[0].used / ht[0].size$。根据《Redis设计与实现》一书中的描述，当满足以下任一条件时，启动对字典的扩容或者收缩：
- 扩容：服务器没有在执行BGSAVE或者BGREWRITEAOF，并且哈希表的负载因子大于等于1
- 扩容：服务器正在执行BGSAVE或者BGREWRITEAOF，并且哈希表的负载银子大于等于5
- 收缩：负载因子小于0.1
渐进式rehash：之前有提到，dict结构体中含有两个dictht结构，在rehash开始前，一个作为原有的待复制的哈希表，另一个为空白哈希表；执行rehash时，为了避免长时间的hang在这一个过程中，Redis采取的方式时渐进式的rehash，每次将n个序列转移到ht[1]中，直到最终将ht[0]上的键值完整转移到ht[1]上之后，调换ht[0]与ht[1]，并重新设置dict的rehashidx为-1；
执行rehash过程中，对字典其他操作的影响：在执行rehash的过程中，添加新的键值对的时候，只对ht[1]进行操作，而删除、查找、更新则同时对两个表进行操作

字典的遍历

这里还有一个函数，值得花时间讨论一下，那就是dictScan函数，dict在操作的过程中，任何操作，都要注意rehash过程，如何在rehash过程中安全完全的遍历哈希表，是需要花费时间考虑的，我们可以看下redis的实现：

unsigned long dictScan(dict *d,
                       unsigned long v,
                       dictScanFunction *fn,
                       dictScanBucketFunction* bucketfn,
                       void *privdata)
{
    dictht *t0, *t1;
    const dictEntry *de, *next;
    unsigned long m0, m1;

    if (dictSize(d) == 0) return 0;

    if (!dictIsRehashing(d)) {
        t0 = &(d->ht[0]);
        m0 = t0->sizemask;

        /* Emit entries at cursor */
        if (bucketfn) bucketfn(privdata, &t0->table[v & m0]);
        de = t0->table[v & m0];
        while (de) {
            next = de->next;
            fn(privdata, de);
            de = next;
        }

        /* Set unmasked bits so incrementing the reversed cursor
         * operates on the masked bits */
        v |= ~m0;

        /* Increment the reverse cursor */
        v = rev(v);
        v++;
        v = rev(v);

    } else {
        t0 = &d->ht[0];
        t1 = &d->ht[1];

        /* Make sure t0 is the smaller and t1 is the bigger table */
        if (t0->size > t1->size) {
            t0 = &d->ht[1];
            t1 = &d->ht[0];
        }

        m0 = t0->sizemask;
        m1 = t1->sizemask;

        /* Emit entries at cursor */
        if (bucketfn) bucketfn(privdata, &t0->table[v & m0]);
        de = t0->table[v & m0];
        while (de) {
            next = de->next;
            fn(privdata, de);
            de = next;
        }

        /* Iterate over indices in larger table that are the expansion
         * of the index pointed to by the cursor in the smaller table */
        do {
            /* Emit entries at cursor */
            if (bucketfn) bucketfn(privdata, &t1->table[v & m1]);
            de = t1->table[v & m1];
            while (de) {
                next = de->next;
                fn(privdata, de);
                de = next;
            }

            /* Increment the reverse cursor not covered by the smaller mask.*/
            v |= ~m1;
            v = rev(v);
            v++;
            v = rev(v);

            /* Continue while bits covered by mask difference is non-zero */
        } while (v & (m0 ^ m1));
    }

    return v;
}

关于这一函数，源码上光是注释就有几十行了，详细分析的话，其实可以单独写一篇博客了，这是我看到的一篇比较好的分析博客：Redis字典的遍历;简单来说就是，区别于顺序遍历，Redis的遍历方式为reverse binary iteration(反向二进制法)，这篇文章估计是写不下了，总之这个方法可以保证没有被修改的数据一定能被正确遍历，被修改的数据尽可能的减小重复遍历或者漏掉的概率。

其他API函数

其他还有一些字典常见的函数，很好理解，就不多说了。

浏览量: 3,540