Dict对象 PyDictObject
PyDictObject
对象是Python提供的关联式容器。由于Python中大量使用着PyDictObject
,因此,其实现采用了散列表(hash table)来实现。理论上,在最优的情况下,其搜索效率为O(1)
。
散列表简介
散列表的基本思想,是通过一定的函数将要搜索的键值映射为一个整数,并且通过这个整数去访问内存。
但是,随着装入数据的增加,可能存在对不同键值计算的散列值相同的冲突。为解决这一问题,python采用了开放地址的策略。即,当发生冲突时,python会通过一个二次探测函数f
来计算下一个候选位置addr
,如果这个addr
可用,那么就将值放到这个位置,如果不可用,则继续利用f
来探测,以此重复,直至找到可用位置。
同时,使用开放地址策略,还存在一个问题。即,当对数据进行删除时,如果直接删除,将打破由二次探测函数f
生成的探测序列,这导致后续数据不能正确访问。针对这一问题,Python提供了自己的解决方案(将在下文中介绍)。
Dict对象
关联容器的 entry
这里entry
代表着关联容器的一个(键,值)元素
//[dictobject.h]
typedef struct {
/* Cached hash code of me_key. Note that hash codes are C longs.
* We have to use Py_ssize_t instead because dict_popitem() abuses
* me_hash to hold a search finger.
*/
Py_ssize_t me_hash; // me_key的散列值
PyObject *me_key; // 键
PyObject *me_value; // 值
} PyDictEntry;
PyDictObject
中的entry
存在三个状态,即
- Unused态,这时
me_key
和me_value
均为NULL
- Active态,这时
entry
存储了一个(键,值)元素,其me_key
和me_value
都不为NULL
- Dummy态,这是
entry
存储的(键,值)元素被删除的状态。entry
被删除后,状态不能从Active态直接转到Unused态,这是(上文提到)不能打破由二次探测函数f
生成的探测序列。
关联容器的实现
PyDictObject
对象是Python提供的关联式容器。而PyDictObject
对象实际上就是一系列entry
的集合
//[dictobject.h]
#define PyDict_MINSIZE 8
typedef struct _dictobject PyDictObject;
struct _dictobject {
PyObject_HEAD
// 元素个数
Py_ssize_t ma_fill; /* # Active + # Dummy */
// 元素个数
Py_ssize_t ma_used; /* # Active */
/* The table contains ma_mask + 1 slots, and that's a power of 2.
* We store the mask instead of the size because the mask is more
* frequently needed.
*/
// 拥有 entry的数量
Py_ssize_t ma_mask;
/* ma_table points to ma_smalltable for small tables, else to
* additional malloc'ed memory. ma_table is never NULL! This rule
* saves repeated runtime null-tests in the workhorse getitem and
* setitem calls.
*/
// ma_table指向一堆PyDictEntry集合的内存开始地址
// ma_table这个永远不会为NULL
PyDictEntry *ma_table;
PyDictEntry *(*ma_lookup)(PyDictObject *mp, PyObject *key, long hash);
// 默认存在PyDict_MINSIZE个entry
PyDictEntry ma_smalltable[PyDict_MINSIZE];
};
PyDictObject
的创建
//[dictobject.c]
#define INIT_NONZERO_DICT_SLOTS(mp) do { \
(mp)->ma_table = (mp)->ma_smalltable; \
(mp)->ma_mask = PyDict_MINSIZE - 1; \
} while(0)
#define EMPTY_TO_MINSIZE(mp) do { \
memset((mp)->ma_smalltable, 0, sizeof((mp)->ma_smalltable)); \
(mp)->ma_used = (mp)->ma_fill = 0; \
INIT_NONZERO_DICT_SLOTS(mp); \
} while(0)
PyObject *
PyDict_New(void)
{
register PyDictObject *mp;
// 创建 dummy 对象
if (dummy == NULL) { /* Auto-initialize dummy */
dummy = PyString_FromString("<dummy key>");
if (dummy == NULL)
return NULL;
#ifdef SHOW_CONVERSION_COUNTS
Py_AtExit(show_counts);
#endif
#ifdef SHOW_ALLOC_COUNT
Py_AtExit(show_alloc);
#endif
#ifdef SHOW_TRACK_COUNT
Py_AtExit(show_track);
#endif
}
if (numfree) {
// 使用缓存池中的对象
mp = free_list[--numfree];
assert (mp != NULL);
assert (Py_TYPE(mp) == &PyDict_Type);
_Py_NewReference((PyObject *)mp);
if (mp->ma_fill) {
EMPTY_TO_MINSIZE(mp);
} else {
/* At least set ma_table and ma_mask; these are wrong
if an empty but presized dict is added to freelist */
INIT_NONZERO_DICT_SLOTS(mp);
}
assert (mp->ma_used == 0);
assert (mp->ma_table == mp->ma_smalltable);
assert (mp->ma_mask == PyDict_MINSIZE - 1);
#ifdef SHOW_ALLOC_COUNT
count_reuse++;
#endif
} else {
// 新建对象
mp = PyObject_GC_New(PyDictObject, &PyDict_Type);
if (mp == NULL)
return NULL;
// 将 ma_smalltable 清零
// 将 ma_used = ma_fill = 0
// 将 ma_table 指向 ma_smalltable
// 将 ma_mask = PyDict_MINSIZE - 1
EMPTY_TO_MINSIZE(mp);
#ifdef SHOW_ALLOC_COUNT
count_alloc++;
#endif
}
// 设定 ma_lookup 为 lookdict_string(搜索算法)
mp->ma_lookup = lookdict_string;
#ifdef SHOW_TRACK_COUNT
count_untracked++;
#endif
#ifdef SHOW_CONVERSION_COUNTS
++created;
#endif
return (PyObject *)mp;
}
元素搜索
Python 为PyDictObject
对象提供了两种搜索策略,lookdict
和lookdict_string
。这两种策略使用了相同的算法,只是lookdict_string
是lookdict
的专门针对键为PyStringObject
的特殊形式。
//[dictobject.c]
static PyDictEntry *
lookdict(PyDictObject *mp, PyObject *key, register long hash)
{
register size_t i;
register size_t perturb;
register PyDictEntry *freeslot;
register size_t mask = (size_t)mp->ma_mask;
PyDictEntry *ep0 = mp->ma_table;
register PyDictEntry *ep;
register int cmp;
PyObject *startkey;
// 散列,定位探测序列的第一个entry
i = (size_t)hash & mask;
ep = &ep0[i];
// ok
if (ep->me_key == NULL || ep->me_key == key)
return ep;
// 第一个entry的状态处于dummy态设置freeslot
if (ep->me_key == dummy)
// 后续,如果找不到,则可直接使用这个freeslot
freeslot = ep;
else {
// 此时,entry的状态为Active
// 比较hash值
if (ep->me_hash == hash) {
startkey = ep->me_key;
Py_INCREF(startkey);
// 比较=>(引用相同,值相同)
cmp = PyObject_RichCompareBool(startkey, key, Py_EQ);
Py_DECREF(startkey);
if (cmp < 0)
return NULL;
if (ep0 == mp->ma_table && ep->me_key == startkey) {
if (cmp > 0)
return ep;
}
else {
/* The compare did major nasty stuff to the
* dict: start over.
* XXX A clever adversary could prevent this
* XXX from terminating.
*/
return lookdict(mp, key, hash);
}
}
freeslot = NULL;
}
// 进入第二阶段,在冲突链上检查其他entry
/* In the loop, me_key == dummy is by far (factor of 100s) the
least likely outcome, so test for that last. */
for (perturb = hash; ; perturb >>= PERTURB_SHIFT) {
// 链上的下一个entry
// 二次探测函数
i = (i << 2) + i + perturb + 1;
ep = &ep0[i & mask];
// 状态判断(Unused?)
if (ep->me_key == NULL)
return freeslot == NULL ? ep : freeslot;
// 引用相同
if (ep->me_key == key)
return ep;
// 值相同
if (ep->me_hash == hash && ep->me_key != dummy) {
startkey = ep->me_key;
Py_INCREF(startkey);
cmp = PyObject_RichCompareBool(startkey, key, Py_EQ);
Py_DECREF(startkey);
if (cmp < 0)
return NULL;
if (ep0 == mp->ma_table && ep->me_key == startkey) {
if (cmp > 0)
return ep;
}
else {
/* The compare did major nasty stuff to the
* dict: start over.
* XXX A clever adversary could prevent this
* XXX from terminating.
*/
return lookdict(mp, key, hash);
}
}
// 设置 freeslot
else if (ep->me_key == dummy && freeslot == NULL)
freeslot = ep;
}
assert(0); /* NOT REACHED */
return 0;
}
需要注意的是,如果搜索成功,那么返回的是一个有效的entry
。如果不成功,此时的ep
指向一个Unused状态的entry
。这时,程序不能直接返回这个entry
,因为,有可能在这之前程序已经检测到过状态为Dummy的entry
了。所以,检测程序中,加入了变量freeslot
来保存检测到的第一个状态为Dummy的entry
,以备检查失败时返回。
元素插入
PyDictObject
对象的元素插入操作是建立在元素搜索的基础上的。正如,上节中所述,正常情况下,无论搜索是否成功,lookdict
都将返回一个可用的entry
,这是基于这个返回值,实现了元素的插入
//[dictobject.c]
static int
insertdict(register PyDictObject *mp, PyObject *key, long hash, PyObject *value)
{
register PyDictEntry *ep;
assert(mp->ma_lookup != NULL);
ep = mp->ma_lookup(mp, key, hash);
// 检索异常
if (ep == NULL) {
Py_DECREF(key);
Py_DECREF(value);
return -1;
}
return insertdict_by_entry(mp, key, hash, ep, value);
}
//[dictobject.c]
static int
insertdict_by_entry(register PyDictObject *mp, PyObject *key, long hash,
PyDictEntry *ep, PyObject *value)
{
PyObject *old_value;
MAINTAIN_TRACKING(mp, key, value);
// 判断使用的entry的状态 Active?
// 检索成功
if (ep->me_value != NULL) {
// 赋新值
old_value = ep->me_value;
ep->me_value = value;
Py_DECREF(old_value); /* which **CAN** re-enter */
Py_DECREF(key);
}
else {
// 检索失败
if (ep->me_key == NULL)
// 使用的entry的状态为Unused
mp->ma_fill++;
else {
assert(ep->me_key == dummy);
// 使用的entry的状态为Dummy
Py_DECREF(dummy);
}
// 赋新值
ep->me_key = key;
ep->me_hash = (Py_ssize_t)hash;
ep->me_value = value;
mp->ma_used++;
}
return 0;
}
实际,由python直接调用的插入操作,并不是insertdict
,而是PyDict_SetItem
。这里,我们注意到在调用insertdict
前,程序已经计算过了hash
值了,而这个hash
值也正是在PyDict_SetItem
中计算的。
//[dictobject.c]
int
PyDict_SetItem(register PyObject *op, PyObject *key, PyObject *value)
{
register long hash;
// 对象检查
if (!PyDict_Check(op)) {
PyErr_BadInternalCall();
return -1;
}
assert(key);
assert(value);
// 计算hash值
if (PyString_CheckExact(key)) {
hash = ((PyStringObject *)key)->ob_shash;
if (hash == -1)
hash = PyObject_Hash(key);
}
else {
hash = PyObject_Hash(key);
if (hash == -1)
return -1;
}
return dict_set_item_by_hash_or_entry(op, key, hash, NULL, value);
}
//[dictobject.c]
static int
dict_set_item_by_hash_or_entry(register PyObject *op, PyObject *key,
long hash, PyDictEntry *ep, PyObject *value)
{
register PyDictObject *mp;
register Py_ssize_t n_used;
mp = (PyDictObject *)op;
assert(mp->ma_fill <= mp->ma_mask); /* at least one empty slot */
n_used = mp->ma_used;
Py_INCREF(value);
Py_INCREF(key);
if (ep == NULL) {
// 插入
if (insertdict(mp, key, hash, value) != 0)
return -1;
}
else {
if (insertdict_by_entry(mp, key, hash, ep, value) != 0)
return -1;
}
/* If we added a key, we can safely resize. Otherwise just return!
* If fill >= 2/3 size, adjust size. Normally, this doubles or
* quaduples the size, but it's also possible for the dict to shrink
* (if ma_fill is much larger than ma_used, meaning a lot of dict
* keys have been * deleted).
*
* Quadrupling the size improves average dictionary sparseness
* (reducing collisions) at the cost of some memory and iteration
* speed (which loops over every possible entry). It also halves
* the number of expensive resize operations in a growing dictionary.
*
* Very large dictionaries (over 50K items) use doubling instead.
* This may help applications with severe memory constraints.
*/
// 是否需要调整dict大小
// 阈值 装载率2/3
if (!(mp->ma_used > n_used && mp->ma_fill*3 >= (mp->ma_mask+1)*2))
return 0;
// 调整大小 (调大或调小)
return dictresize(mp, (mp->ma_used > 50000 ? 2 : 4) * mp->ma_used);
}
调整大小实现如下:
//[dictobject.c]
static int
dictresize(PyDictObject *mp, Py_ssize_t minused)
{
Py_ssize_t newsize;
PyDictEntry *oldtable, *newtable, *ep;
Py_ssize_t i;
int is_oldtable_malloced;
PyDictEntry small_copy[PyDict_MINSIZE];
assert(minused >= 0);
/* Find the smallest table size > minused. */
// 确定新table的大小
for (newsize = PyDict_MINSIZE;
newsize <= minused && newsize > 0;
newsize <<= 1)
;
if (newsize <= 0) {
PyErr_NoMemory();
return -1;
}
/* Get space for a new table. */
oldtable = mp->ma_table;
assert(oldtable != NULL);
is_oldtable_malloced = oldtable != mp->ma_smalltable;
// 可以使用 mp->ma_smalltable(这里时为解决调小时,其值小于了PyDict_MINSIZE的情况)
if (newsize == PyDict_MINSIZE) {
/* A large table is shrinking, or we can't get any smaller. */
newtable = mp->ma_smalltable;
if (newtable == oldtable) {
if (mp->ma_fill == mp->ma_used) {
/* No dummies, so no point doing anything. */
return 0;
}
/* We're not going to resize it, but rebuild the
table anyway to purge old dummy entries.
Subtle: This is *necessary* if fill==size,
as lookdict needs at least one virgin slot to
terminate failing searches. If fill < size, it's
merely desirable, as dummies slow searches. */
assert(mp->ma_fill > mp->ma_used);
// 备份旧的table
memcpy(small_copy, oldtable, sizeof(small_copy));
oldtable = small_copy;
}
}
else {
// 不可以使用 mp->ma_smalltable,生成新的table
newtable = PyMem_NEW(PyDictEntry, newsize);
if (newtable == NULL) {
PyErr_NoMemory();
return -1;
}
}
/* Make the dict empty, using the new table. */
// 设置新的table
assert(newtable != oldtable);
mp->ma_table = newtable;
mp->ma_mask = newsize - 1;
memset(newtable, 0, sizeof(PyDictEntry) * newsize);
mp->ma_used = 0;
i = mp->ma_fill;
mp->ma_fill = 0;
/* Copy the data over; this is refcount-neutral for active entries;
dummy entries aren't copied over, of course */
// 将entry拷贝到新的table中
for (ep = oldtable; i > 0; ep++) {
if (ep->me_value != NULL) { /* active entry */
--i;
insertdict_clean(mp, ep->me_key, (long)ep->me_hash,
ep->me_value);
}
else if (ep->me_key != NULL) { /* dummy entry */
--i;
assert(ep->me_key == dummy);
// 旧table中状态为Dummy的entry可以直接丢弃,而不需要加到新table中
// 调整状态为Dummy的entry的引用计数
Py_DECREF(ep->me_key);
}
/* else key == value == NULL: nothing to do */
}
// 必要时,释放旧table
if (is_oldtable_malloced)
PyMem_DEL(oldtable);
return 0;
}
元素删除
同元素的插入类似,元素删除操作是建立在元素搜索的基础上的。通过lookdict
进行检索,如果检索成功,返回对应的entry
,这时,只需将这个entry
的状态修改为Dummy。
//[dictobject.c]
int
PyDict_DelItem(PyObject *op, PyObject *key)
{
register PyDictObject *mp;
register long hash;
register PyDictEntry *ep;
PyObject *old_value, *old_key;
if (!PyDict_Check(op)) {
PyErr_BadInternalCall();
return -1;
}
assert(key);
// 计算hash值
if (!PyString_CheckExact(key) ||
(hash = ((PyStringObject *) key)->ob_shash) == -1) {
hash = PyObject_Hash(key);
if (hash == -1)
return -1;
}
mp = (PyDictObject *)op;
// 调用lookdict进行检索
ep = (mp->ma_lookup)(mp, key, hash);
if (ep == NULL)
return -1;
if (ep->me_value == NULL) {
// 没有检索到对应的key(即返回的entry不是Active)
set_key_error(key);
return -1;
}
// 检索成功
// 将返回的entry状态由Active置为Dummy
old_key = ep->me_key;
Py_INCREF(dummy);
ep->me_key = dummy;
old_value = ep->me_value;
ep->me_value = NULL;
mp->ma_used--;
Py_DECREF(old_value);
Py_DECREF(old_key);
return 0;
}
PyDictObject
对象的缓存池
其缓存池的定义
//[dictobject.c]
#ifndef PyDict_MAXFREELIST
#define PyDict_MAXFREELIST 80
#endif
static PyDictObject *free_list[PyDict_MAXFREELIST];
static int numfree = 0;
PyDictObject
对象的缓存池的使用与PyListObject
类似,在刚开始时,缓存池中并没有缓冲有对象,其缓存的对象都是在PyDictObject
对象释放时加入到缓存池中的。
//[dictobject.c]
static void
dict_dealloc(register PyDictObject *mp)
{
register PyDictEntry *ep;
Py_ssize_t fill = mp->ma_fill;
PyObject_GC_UnTrack(mp);
Py_TRASHCAN_SAFE_BEGIN(mp)
// 释放掉ma_table中的entry
for (ep = mp->ma_table; fill > 0; ep++) {
if (ep->me_key) {
--fill;
Py_DECREF(ep->me_key);
Py_XDECREF(ep->me_value);
}
}
if (mp->ma_table != mp->ma_smalltable)
// 释放掉堆上的table
PyMem_DEL(mp->ma_table);
// 缓冲池是否已被填满
if (numfree < PyDict_MAXFREELIST && Py_TYPE(mp) == &PyDict_Type)
// 加入缓冲池
free_list[numfree++] = mp;
else
// 直接释放掉
Py_TYPE(mp)->tp_free((PyObject *)mp);
Py_TRASHCAN_SAFE_END(mp)
}
参考
《Python 源码剖析》