Contents
Redis源码阅读:字典dict的实现
代码版本:Branch 5.0
Github地址:戳我
字典主要数据结构
总体来说,Redis的字典使用哈希表作为底层实现,一个字典包含多个哈希表节点,哈希表节点中存放有键值对。具体的结构,自底层至顶层的定义如下
typedef struct dictEntry {
void *key;
union {
void *val;
uint64_t u64;
int64_t s64;
double d;
} v;
struct dictEntry *next;
} dictEntry;
typedef struct dictType {
uint64_t (*hashFunction)(const void *key);
void *(*keyDup)(void *privdata, const void *key);
void *(*valDup)(void *privdata, const void *obj);
int (*keyCompare)(void *privdata, const void *key1, const void *key2);
void (*keyDestructor)(void *privdata, void *key);
void (*valDestructor)(void *privdata, void *obj);
} dictType;
/* This is our hash table structure. Every dictionary has two of this as we
* implement incremental rehashing, for the old to the new table. */
typedef struct dictht {
dictEntry **table;
unsigned long size;
unsigned long sizemask;
unsigned long used;
} dictht;
typedef struct dict {
dictType *type;
void *privdata;
dictht ht[2];
long rehashidx; /* rehashing not in progress if rehashidx == -1 */
unsigned long iterators; /* number of iterators currently running */
} dict;
typedef struct dictIterator {
dict *d;
long index;
int table, safe;
dictEntry *entry, *nextEntry;
/* unsafe iterator fingerprint for misuse detection. */
long long fingerprint;
} dictIterator;
- dictEntry实际上定义了一种用来存放键值对的数据结构,含有一个可以转换为任意类型指针的void指针用于存放key值,而v属性可以是一个指针,一个64位整形数或者一个双精度浮点数。next指针则指向另一个哈希表节点,当键值冲突时,将相同键值的节点构成一个链表即可。
-
数据结构dictType里面存放了与特定字典相关的属性,即一些处理键值的函数指针
-
dictht数据结构定义了哈希表的结构
- dictEntry **table : 指向一组哈希节点
- size : 哈希表的大小
- sizemask : 掩码,总是等于size-1,用于和哈希值一起计算,得出键的位置
- used : 表示当前已经使用的数量
- dict即字典结构
- type : 指向一个dictType指针,存放处理键的函数
- privdata : 被type中的函数所使用
- ht[2] : 每个字典结构包含两个哈希表,用于rehashing,一个是新表一个是旧表
- rehashidx : rehash过程中的索引,可以表征当前的进度,-1代表还没开始
- iterators表示当前运行的迭代器的数量
创建字典以及哈希算法
创建字典
创建字典有关的几个函数如下所示,即给dict分配空间,传入dictType与privData参数,并依次初始化结构体中的数值。
/* Create a new hash table */
dict *dictCreate(dictType *type,
void *privDataPtr)
{
dict *d = zmalloc(sizeof(*d));
_dictInit(d,type,privDataPtr);
return d;
}
/* Initialize the hash table */
int _dictInit(dict *d, dictType *type,
void *privDataPtr)
{
_dictReset(&d->ht[0]);
_dictReset(&d->ht[1]);
d->type = type;
d->privdata = privDataPtr;
d->rehashidx = -1;
d->iterators = 0;
return DICT_OK;
}
/* Reset a hash table already initialized with ht_init().
* NOTE: This function should only be called by ht_destroy(). */
static void _dictReset(dictht *ht)
{
ht->table = NULL;
ht->size = 0;
ht->sizemask = 0;
ht->used = 0;
}
哈希算法
在字典中添加一对键值对时,Redis根据key值生成了与之对应的哈希值,哈希值与dictht中的sizemask一起决定了键值对应当被存放的位置。值得注意的是,当前版本的Redis所采用的哈希算法为Siphash算法,相比MurmurHash2算法,性能更具优势,具体详见:Siphash Wiki,Redis中Siphash的实现在siphash.c文件中,这里只需要关心如何使用即可,故没有深入研究该算法的实现。
/* Add an element to the target hash table */
int dictAdd(dict *d, void *key, void *val)
{
dictEntry *entry = dictAddRaw(d,key,NULL);
if (!entry) return DICT_ERR;
dictSetVal(d, entry, val);
return DICT_OK;
}
/* Low level add or find:
* This function adds the entry but instead of setting a value returns the
* dictEntry structure to the user, that will make sure to fill the value
* field as he wishes.
*
* This function is also directly exposed to the user API to be called
* mainly in order to store non-pointers inside the hash value, example:
*
* entry = dictAddRaw(dict,mykey,NULL);
* if (entry != NULL) dictSetSignedIntegerVal(entry,1000);
*
* Return values:
*
* If key already exists NULL is returned, and "*existing" is populated
* with the existing entry if existing is not NULL.
*
* If key was added, the hash entry is returned to be manipulated by the caller.
*/
dictEntry *dictAddRaw(dict *d, void *key, dictEntry **existing)
{
long index;
dictEntry *entry;
dictht *ht;
if (dictIsRehashing(d)) _dictRehashStep(d);
/* Get the index of the new element, or -1 if
* the element already exists. */
if ((index = _dictKeyIndex(d, key, dictHashKey(d,key), existing)) == -1)
return NULL;
/* Allocate the memory and store the new entry.
* Insert the element in top, with the assumption that in a database
* system it is more likely that recently added entries are accessed
* more frequently. */
ht = dictIsRehashing(d) ? &d->ht[1] : &d->ht[0];
entry = zmalloc(sizeof(*entry));
entry->next = ht->table[index];
ht->table[index] = entry;
ht->used++;
/* Set the hash entry fields. */
dictSetKey(d, entry, key);
return entry;
}
#define dictSetVal(d, entry, _val_) do { \
if ((d)->type->valDup) \
(entry)->v.val = (d)->type->valDup((d)->privdata, _val_); \
else \
(entry)->v.val = (_val_); \
} while(0)
#define dictSetKey(d, entry, _key_) do { \
if ((d)->type->keyDup) \
(entry)->key = (d)->type->keyDup((d)->privdata, _key_); \
else \
(entry)->key = (_key_); \
} while(0)
/* Returns the index of a free slot that can be populated with
* a hash entry for the given 'key'.
* If the key already exists, -1 is returned
* and the optional output parameter may be filled.
*
* Note that if we are in the process of rehashing the hash table, the
* index is always returned in the context of the second (new) hash table. */
static long _dictKeyIndex(dict *d, const void *key, uint64_t hash, dictEntry **existing)
{
unsigned long idx, table;
dictEntry *he;
if (existing) *existing = NULL;
/* Expand the hash table if needed */
if (_dictExpandIfNeeded(d) == DICT_ERR)
return -1;
for (table = 0; table <= 1; table++) {
idx = hash & d->ht[table].sizemask;
/* Search if this slot does not already contain the given key */
he = d->ht[table].table[idx];
while(he) {
if (key==he->key || dictCompareKeys(d, key, he->key)) {
if (existing) *existing = he;
return -1;
}
he = he->next;
}
if (!dictIsRehashing(d)) break;
}
return idx;
}
#define dictHashKey(d, key) (d)->type->hashFunction(key)
在dict.c/_dictKeyIndex函数中idx = hash & d->ht[table].sizemask可以得到键值对应当存放的序列号,得到序列后,即可向字典中添加元素,添加的时候会检测是否正在rehash以及是否序列号重复的情况,如果正在rehash,那么向第一个dictht中添加,否则向第二个添加。序列号冲突即哈希冲突的解决方案使用的是开链法,将同一个序列号的dictEntry构成一个链表,并且每次都是在链表头部添加元素,所以添加元素的复杂度是*O(1)*。
ht = dictIsRehashing(d) ? &d->ht[1] : &d->ht[0];
entry = zmalloc(sizeof(*entry));
entry->next = ht->table[index];
ht->table[index] = entry;
ht->used++;
rehash过程
rehash的过程比较复杂,主要分为几个主要部分:
- 触发rehash的条件:受限于size的大小,当哈希表内元素达到一定程度,不可避免的会遇到很多哈希冲突的情况,开链法的使用实际上降低了键值对访问的效率,负载因子可以对哈希值冲突程度做一个量化的评价,负载因子定义为:$load_factor = ht[0].used / ht[0].size$。根据《Redis设计与实现》一书中的描述,当满足以下任一条件时,启动对字典的扩容或者收缩:
- 扩容:服务器没有在执行BGSAVE或者BGREWRITEAOF,并且哈希表的负载因子大于等于1
- 扩容:服务器正在执行BGSAVE或者BGREWRITEAOF,并且哈希表的负载银子大于等于5
- 收缩:负载因子小于0.1
- 渐进式rehash:之前有提到,dict结构体中含有两个dictht结构,在rehash开始前,一个作为原有的待复制的哈希表,另一个为空白哈希表;执行rehash时,为了避免长时间的hang在这一个过程中,Redis采取的方式时渐进式的rehash,每次将n个序列转移到ht[1]中,直到最终将ht[0]上的键值完整转移到ht[1]上之后,调换ht[0]与ht[1],并重新设置dict的rehashidx为-1;
- 执行rehash过程中,对字典其他操作的影响:在执行rehash的过程中,添加新的键值对的时候,只对ht[1]进行操作,而删除、查找、更新则同时对两个表进行操作
字典的遍历
这里还有一个函数,值得花时间讨论一下,那就是dictScan函数,dict在操作的过程中,任何操作,都要注意rehash过程,如何在rehash过程中安全完全的遍历哈希表,是需要花费时间考虑的,我们可以看下redis的实现:
unsigned long dictScan(dict *d,
unsigned long v,
dictScanFunction *fn,
dictScanBucketFunction* bucketfn,
void *privdata)
{
dictht *t0, *t1;
const dictEntry *de, *next;
unsigned long m0, m1;
if (dictSize(d) == 0) return 0;
if (!dictIsRehashing(d)) {
t0 = &(d->ht[0]);
m0 = t0->sizemask;
/* Emit entries at cursor */
if (bucketfn) bucketfn(privdata, &t0->table[v & m0]);
de = t0->table[v & m0];
while (de) {
next = de->next;
fn(privdata, de);
de = next;
}
/* Set unmasked bits so incrementing the reversed cursor
* operates on the masked bits */
v |= ~m0;
/* Increment the reverse cursor */
v = rev(v);
v++;
v = rev(v);
} else {
t0 = &d->ht[0];
t1 = &d->ht[1];
/* Make sure t0 is the smaller and t1 is the bigger table */
if (t0->size > t1->size) {
t0 = &d->ht[1];
t1 = &d->ht[0];
}
m0 = t0->sizemask;
m1 = t1->sizemask;
/* Emit entries at cursor */
if (bucketfn) bucketfn(privdata, &t0->table[v & m0]);
de = t0->table[v & m0];
while (de) {
next = de->next;
fn(privdata, de);
de = next;
}
/* Iterate over indices in larger table that are the expansion
* of the index pointed to by the cursor in the smaller table */
do {
/* Emit entries at cursor */
if (bucketfn) bucketfn(privdata, &t1->table[v & m1]);
de = t1->table[v & m1];
while (de) {
next = de->next;
fn(privdata, de);
de = next;
}
/* Increment the reverse cursor not covered by the smaller mask.*/
v |= ~m1;
v = rev(v);
v++;
v = rev(v);
/* Continue while bits covered by mask difference is non-zero */
} while (v & (m0 ^ m1));
}
return v;
}
关于这一函数,源码上光是注释就有几十行了,详细分析的话,其实可以单独写一篇博客了,这是我看到的一篇比较好的分析博客:Redis字典的遍历;简单来说就是,区别于顺序遍历,Redis的遍历方式为reverse binary iteration(反向二进制法),这篇文章估计是写不下了,总之这个方法可以保证没有被修改的数据一定能被正确遍历,被修改的数据尽可能的减小重复遍历或者漏掉的概率。
其他API函数
其他还有一些字典常见的函数,很好理解,就不多说了。