字符串对象 PyStringObject
PyStringObject
是对字符串对象的实现,其具有可变长度的内存空间。即,不同字符串的PyStringObject
对象其内存长度可能不同,例如“ni”,“hao”这两个字符串就具有不同的内存长度。同时,PyStringObject
又是一个值不可变对象,即一旦创建后,其值一直将保持创建时的值。
定义
[stringobject.h]
/* In-memory layout of a string object. The character data is stored inline
   at the end of the object, so objects for different strings differ in size. */
typedef struct {
PyObject_VAR_HEAD // variable-size object header; its ob_size field records the size of the variable part (the string length)
long ob_shash; // cached hash value of the string (-1 while it has not been computed yet)
int ob_sstate; // interning state, e.g. SSTATE_NOT_INTERNED or SSTATE_INTERNED_MORTAL
char ob_sval[1]; // start of the inline character data (an array, not a pointer); ob_sval[ob_size] holds the terminating '\0'
} PyStringObject;
实际上,ob_sval[1]只是一个占位的声明,字符数据紧跟在对象头之后连续存放;其实际的字符长度是由ob_size
保存的,并且满足ob_sval[ob_size] == '\0'
计算字符串对象的hash值如下
[stringobject.c]
/* Return the hash of string object `a`, computing and caching it on first use.
 *
 * The cached value lives in a->ob_shash, where -1 means "not computed yet".
 * A computed hash is therefore never allowed to be -1; it is remapped to -2.
 */
static long
string_hash(PyStringObject *a)
{
    Py_ssize_t remaining;
    const unsigned char *cursor;
    long hash;

#ifdef Py_DEBUG
    assert(_Py_HashSecret_Initialized);
#endif

    /* Fast path: the hash was already computed and cached. */
    hash = a->ob_shash;
    if (hash != -1) {
        return hash;
    }

    remaining = Py_SIZE(a);
    if (remaining == 0) {
        /* The empty string hashes to 0 instead of (prefix ^ suffix),
           which slightly obfuscates the hash secret. */
        a->ob_shash = 0;
        return 0;
    }

    cursor = (const unsigned char *)a->ob_sval;

    /* Seed with the secret prefix and the first byte shifted left by 7. */
    hash = _Py_HashSecret.prefix;
    hash ^= cursor[0] << 7;

    /* Multiply-and-xor over every byte of the string, first byte included. */
    for (; remaining > 0; remaining--) {
        hash = (1000003 * hash) ^ *cursor;
        cursor++;
    }

    /* Mix in the length and the secret suffix. */
    hash ^= Py_SIZE(a);
    hash ^= _Py_HashSecret.suffix;

    /* -1 is the "not computed" marker in ob_shash, so never produce it. */
    if (hash == -1) {
        hash = -2;
    }

    a->ob_shash = hash;
    return hash;
}
PyStringObject
对象所属的类型,由类型对象PyString_Type表示
创建PyStringObject
对象
python提供了两种方法从C中原生的字符串创建PyStringObject
对象
- 利用
PyString_FromString
[stringobject.c]
/* Create a string object from the NUL-terminated C string `str`.
 *
 * `str` must not be NULL. The empty string and one-character strings are
 * shared: the first one created is interned and remembered in `nullstring`
 * or `characters[]`, and later requests return that same object with its
 * reference count incremented.
 *
 * Returns the string object, or NULL with an exception set if the string is
 * too long or memory cannot be allocated.
 */
PyObject *
PyString_FromString(const char *str)
{
    size_t length;
    PyStringObject *result;
    PyObject *tmp;

    assert(str != NULL);
    length = strlen(str);

    /* Reject lengths whose allocation size would exceed PY_SSIZE_T_MAX. */
    if (length > PY_SSIZE_T_MAX - PyStringObject_SIZE) {
        PyErr_SetString(PyExc_OverflowError,
            "string is too long for a Python string");
        return NULL;
    }

    /* Hand out an already-created shared object when there is one. */
    if (length == 0) {
        result = nullstring;
        if (result != NULL) {
#ifdef COUNT_ALLOCS
            null_strings++;
#endif
            Py_INCREF(result);
            return (PyObject *)result;
        }
    }
    else if (length == 1) {
        result = characters[*str & UCHAR_MAX];
        if (result != NULL) {
#ifdef COUNT_ALLOCS
            one_strings++;
#endif
            Py_INCREF(result);
            return (PyObject *)result;
        }
    }

    /* Inline PyObject_NewVar: the block is PyStringObject_SIZE + length
       bytes, so the characters are stored inside the object itself. */
    result = (PyStringObject *)PyObject_MALLOC(PyStringObject_SIZE + length);
    if (result == NULL) {
        return PyErr_NoMemory();
    }
    (void)PyObject_INIT_VAR(result, &PyString_Type, length);
    result->ob_shash = -1;                      /* hash not computed yet */
    result->ob_sstate = SSTATE_NOT_INTERNED;
    Py_MEMCPY(result->ob_sval, str, length + 1);  /* +1 copies the '\0' too */

    if (length > 1) {
        return (PyObject *)result;
    }

    /* First empty or one-character string: intern it and keep it in the
       matching cache slot so later calls can reuse it. */
    tmp = (PyObject *)result;
    PyString_InternInPlace(&tmp);
    result = (PyStringObject *)tmp;
    if (length == 0) {
        nullstring = result;
    }
    else {
        characters[*str & UCHAR_MAX] = result;
    }
    Py_INCREF(result);  /* extra reference for the cache slot */
    return (PyObject *)result;
}
- 利用
PyString_FromStringAndSize
[stringobject.c]
/* Create a string object of `size` bytes.
 *
 * When `str` is not NULL, `size` bytes are copied from it; when it is NULL
 * the character data is left uninitialised apart from the terminating '\0'.
 * The empty string and one-character strings built from a non-NULL `str`
 * are shared through `nullstring` and `characters[]`.
 *
 * Returns the string object, or NULL with an exception set if `size` is
 * negative or too large, or if memory cannot be allocated.
 */
PyObject *
PyString_FromStringAndSize(const char *str, Py_ssize_t size)
{
    PyStringObject *result;
    PyObject *tmp;

    if (size < 0) {
        PyErr_SetString(PyExc_SystemError,
            "Negative size passed to PyString_FromStringAndSize");
        return NULL;
    }

    /* Hand out an already-created shared object when there is one. */
    if (size == 0) {
        result = nullstring;
        if (result != NULL) {
#ifdef COUNT_ALLOCS
            null_strings++;
#endif
            Py_INCREF(result);
            return (PyObject *)result;
        }
    }
    else if (size == 1 && str != NULL) {
        result = characters[*str & UCHAR_MAX];
        if (result != NULL) {
#ifdef COUNT_ALLOCS
            one_strings++;
#endif
            Py_INCREF(result);
            return (PyObject *)result;
        }
    }

    /* Reject sizes whose allocation size would exceed PY_SSIZE_T_MAX. */
    if (size > PY_SSIZE_T_MAX - PyStringObject_SIZE) {
        PyErr_SetString(PyExc_OverflowError, "string is too large");
        return NULL;
    }

    /* Inline PyObject_NewVar */
    result = (PyStringObject *)PyObject_MALLOC(PyStringObject_SIZE + size);
    if (result == NULL) {
        return PyErr_NoMemory();
    }
    (void)PyObject_INIT_VAR(result, &PyString_Type, size);
    result->ob_shash = -1;                      /* hash not computed yet */
    result->ob_sstate = SSTATE_NOT_INTERNED;
    if (str != NULL) {
        Py_MEMCPY(result->ob_sval, str, size);
    }
    result->ob_sval[size] = '\0';  /* terminate explicitly; str need not be */

    /* First empty string, or first one-character string built from real
       data: intern it and keep it in the matching cache slot. */
    if (size == 0 || (size == 1 && str != NULL)) {
        tmp = (PyObject *)result;
        PyString_InternInPlace(&tmp);
        result = (PyStringObject *)tmp;
        if (size == 0) {
            nullstring = result;
        }
        else {
            characters[*str & UCHAR_MAX] = result;
        }
        Py_INCREF(result);  /* extra reference for the cache slot */
    }
    return (PyObject *)result;
}
PyString_FromString
传入的参数必须是以 NUL('\0')
结尾的字符数组的指针,而 PyString_FromStringAndSize
不会有这样的要求,因为通过传入的 size 参数就可以确定需要拷贝的字符的个数。
字符串对象的intern机制
在PyString_FromString
与PyString_FromStringAndSize
中当size == 0
或size==1
时,都使用了函数PyString_InternInPlace
。其作用是保存常用字符串的对象,以备下次直接使用
[stringobject.c]
/* Intern the string referenced by *p, in place.
 *
 * If an equal string is already stored in the `interned` dictionary, *p is
 * redirected to that existing object. Otherwise the string itself is added
 * to the dictionary and marked SSTATE_INTERNED_MORTAL.
 *
 * String subclasses and already-interned strings are left untouched.
 * If the dictionary cannot be created or updated, the exception is cleared
 * and the function returns without changing *p.
 * A NULL or non-string argument is a fatal error.
 */
void
PyString_InternInPlace(PyObject **p)
{
register PyStringObject *s = (PyStringObject *)(*p);
PyObject *t;
if (s == NULL || !PyString_Check(s))
Py_FatalError("PyString_InternInPlace: strings only please!");
/* If it's a string subclass, we don't really know what putting
it in the interned dict might do. */
// Type and state checks: only exact strings that are not interned yet go on.
if (!PyString_CheckExact(s))
return;
if (PyString_CHECK_INTERNED(s))
return;
// Lazily create the dictionary that backs the intern mechanism.
if (interned == NULL) {
interned = PyDict_New();
if (interned == NULL) {
PyErr_Clear(); /* Don't leave an exception */
return;
}
}
// Look for an equal string that is already in the dictionary.
t = PyDict_GetItem(interned, (PyObject *)s);
if (t) {
// Found: take a reference to the existing object and make *p refer to it.
// NOTE(review): Py_SETREF presumably also releases the reference that was
// held through *p - confirm against its definition.
Py_INCREF(t);
Py_SETREF(*p, t);
return;
}
// Not found: add the string to the dictionary as both key and value.
if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
PyErr_Clear();
return;
}
/* The two references in interned are not counted by refcnt.
The string deallocator will take care of this */
// Take the dictionary's key and value references back out of the count.
Py_REFCNT(s) -= 2;
// Record that the string is now interned.
PyString_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
}
字符串的连接问题
虽然,Python的PyStringObject
提供了+
来实现字符串的连接,但是,其实现是通过创建新的PyStringObject
来容纳这个和,也就是,如果要实现N个对象的连接,利用+
将分配N-1次内存。
官方推荐的是利用PyStringObject
的join
操作来实现list
或tuple
的连接,这样只需要一次内存分配
例如
# Concatenate the pieces with str.join rather than repeated '+'.
a = 'abc'
b = 'def'
c = ''.join([a, b])
参考
《Python 源码剖析》