CPPYY 与 Cython / Pypy / numba的速度对比 PART III
以vec_dot(两个等长向量逐元素相乘)为例,测试以下几种不同方法的性能
1. 纯python
def vec_dot(vec1, vec2):
    """Return the element-wise product of two equal-length sequences.

    Despite the name, this is not a scalar dot product: the result is a new
    list whose i-th item is ``vec1[i] * vec2[i]``.

    Args:
        vec1: Indexable sequence supporting ``len()``.
        vec2: Indexable sequence supporting ``len()``.

    Returns:
        A list of per-element products, or an empty list when the two inputs
        differ in length.
    """
    if len(vec1) != len(vec2):
        return []
    size = len(vec1)
    ret = []
    # The explicit index loop is kept as-is: this exact code is reused for the
    # untyped Cython and the PyPy variants so the timings stay comparable.
    for i in range(size):
        ret.append(vec1[i] * vec2[i])
    return ret
import random
import numpy as np
import array
# Benchmark inputs: two random 1000-element vectors, each kept in three
# container flavours (numpy array, plain list, array.array of doubles) so the
# same data can be fed to every implementation under test.
vec1_arr, vec2_arr = np.random.rand(1000), np.random.rand(1000)
vec1, vec2 = vec1_arr.tolist(), vec2_arr.tolist()
vec1_parr, vec2_parr = array.array("d", vec1), array.array("d", vec2)

# Result table; the first row holds the column headers (method, time in us).
result = [("测试方法", "耗费时间us")]
%timeit vec_dot(vec1, vec2)
111 µs ± 1.02 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
result.append(("纯python", 111))
# Feeding np.ndarray inputs to the same pure-Python loop carries extra overhead.
%timeit vec_dot(vec1_arr, vec2_arr)
420 µs ± 28.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
%timeit vec_dot(vec1_parr, vec2_parr)
164 µs ± 12 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
2. Cython
%load_ext cython
%%cython --cplus
import numpy as np
import cython
# same code as python version
def vec_dot_cython(vec1, vec2):
    """Return the element-wise product of two equal-length sequences.

    The body is intentionally the same untyped code as the pure-Python
    version; only the compiler differs.

    Args:
        vec1: Indexable sequence supporting ``len()``.
        vec2: Indexable sequence supporting ``len()``.

    Returns:
        A list with ``vec1[i] * vec2[i]`` for every index, or an empty list
        when the two inputs differ in length.
    """
    if len(vec1) != len(vec2):
        return []
    size = len(vec1)
    ret = []
    for i in range(size):
        ret.append(vec1[i] * vec2[i])
    return ret
# A Cython typed memoryview (double[:]) can be initialised from a numpy array,
# or from a Python memoryview / array.array object.
@cython.boundscheck(False)
@cython.wraparound(False)
cdef _vec_dot_cython2(double[:] vec1, double[:] vec2, double[:] ret):
    """Write the element-wise product of vec1 and vec2 into ret.

    Does nothing when vec1 and vec2 differ in length, or when ret is too
    short to hold the result.
    """
    cdef Py_ssize_t size = vec1.shape[0]
    cdef Py_ssize_t i
    # Bounds checking is switched off for this function, so every buffer
    # length (including the output buffer) must be validated before indexing.
    if vec2.shape[0] != size or ret.shape[0] < size:
        return
    for i in range(size):
        ret[i] = vec1[i] * vec2[i]


def vec_dot_cython2(vec1, vec2, ret):
    """Python-callable wrapper around the typed implementation.

    Args:
        vec1: 1-D buffer of doubles (first operand).
        vec2: 1-D buffer of doubles (second operand).
        ret: Writable 1-D buffer of doubles that receives the products; it
            is left untouched when the lengths are incompatible.
    """
    _vec_dot_cython2(vec1, vec2, ret)
结果:
使用cython 而未对python代码做任何改动,就获得了数倍的加速效果
%timeit vec_dot_cython(vec1, vec2)
34.4 µs ± 606 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
result.append(("cython没有类型注释", 34.4))
添加变量类型注释之后的cython结果
ret = np.empty_like(vec1_arr)
%timeit vec_dot_cython2(vec1_arr, vec2_arr, ret)
3.54 µs ± 99.5 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
result.append(("cython有类型注释", 3.54))
3. Pypy
使用pypy不改变任何代码
%%pypy
def vec_dot(vec1, vec2):
    """Return the element-wise product of two equal-length sequences.

    Same code as the CPython baseline, run unchanged under PyPy.

    Args:
        vec1: Indexable sequence supporting ``len()``.
        vec2: Indexable sequence supporting ``len()``.

    Returns:
        A list with ``vec1[i] * vec2[i]`` for every index, or an empty list
        when the two inputs differ in length.
    """
    if len(vec1) != len(vec2):
        return []
    size = len(vec1)
    ret = []
    for i in range(size):
        ret.append(vec1[i] * vec2[i])
    return ret
from random import random
from timeit import Timer
# Two random 1000-element input lists for the PyPy run.
vec1 = [random() for _ in range(1000)]
vec2 = [random() for _ in range(1000)]

# One untimed call before measuring (presumably to warm up the JIT).
vec_dot(vec1, vec2)

timer1 = Timer(
    stmt="vec_dot(vec1, vec2)",
    setup="from __main__ import vec_dot, vec1, vec2",
)
n_runs = 1000
total_seconds = timer1.timeit(n_runs)
# total seconds for 1000 calls, times 1000, equals microseconds per call.
print("pypy3: %s us" % (1000 * total_seconds))
pypy3: 12.6938 us
# Rounded from the 12.6938 us printed by the PyPy run.
result.append(("pypy", 12.69))
4. numba
numba 简直要超神啊
import numba
@numba.jit(nopython=True)
def vec_dot_numba(vec1, vec2, ret):
    """Write the element-wise product of vec1 and vec2 into ret.

    Args:
        vec1: 1-D array (first operand).
        vec2: 1-D array (second operand).
        ret: 1-D output array that receives ``vec1[i] * vec2[i]``.

    Returns:
        None. ``ret`` is left untouched when vec1 and vec2 differ in length
        or when ret is too short to hold the result.
    """
    size = vec1.shape[0]
    # Array indexing is not bounds-checked in nopython mode by default, so the
    # output length has to be validated here as well as the input lengths.
    if vec2.shape[0] != size or ret.shape[0] < size:
        return
    for i in range(size):
        ret[i] = vec1[i] * vec2[i]
%timeit vec_dot_numba(vec1_arr, vec2_arr, ret)
887 ns ± 13.8 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
result.append(("numbaJIT", 0.887))
5. cppyy
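出乎意料,竟然最慢。不过需要注意:下面的调用直接传入了Python list(vec1, vec2),而不是已经构造好的vec1_cpp / vec2_cpp,因此记录的耗时很可能主要来自list到std::vector的转换,而不一定是std::vector本身慢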
import cppyy
cppyy.cppdef("""
// Element-wise product: ret[i] = vec1[i] * vec2[i].
// Does nothing when the inputs differ in length or when ret is too small to
// hold the result, so the loop never indexes past the end of any vector.
void vec_dot_cppyy(const std::vector<double> &vec1, const std::vector<double> &vec2, std::vector<double> &ret){
    const std::size_t size = vec1.size();
    if (size != vec2.size() || ret.size() < size) return;
    for (std::size_t i = 0; i < size; i++){
        ret[i] = vec1[i] * vec2[i];
    }
}
""")
True
vec_dot_cppyy = cppyy.gbl.vec_dot_cppyy
Vector = cppyy.gbl.std.vector
vec1_cpp = Vector[float](vec1)
vec2_cpp = Vector[float](vec2)
%timeit vec_dot_cppyy(vec1, vec2)
140 µs ± 35.7 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
result.append(("cppyy c++ vector", 140))
6. 看看numpy的速度
%timeit np.dot(vec1_arr, vec2_arr)
1.23 µs ± 8.6 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
result.append(("numpy", 1.23))
总结
import pandas as pd
# Build the summary table: the first entry of result holds the column
# headers, the remaining entries are (method, time) rows.
df = pd.DataFrame(result[1:], columns=result[0])
# Order rows by the time column (second header), ascending.
df.sort_values(by=result[0][1], inplace=True)
# 1-based rank as index, derived from the actual row count so adding or
# removing a benchmark does not raise a length-mismatch error.
df.index = range(1, len(df) + 1)
df
- 可以看出来numbaJIT和numpy是一个量级的,速度相差不大(注意:numpy记录的1.23µs是用np.dot测得的,它计算的是标量内积,而其他方法计算的是逐元素乘积,两者并不完全等价)
- cython即便没有类型注释,使用原封不动的python代码也可以加速很多
- cython加了类型注释之后,速度也几乎可以达到c的水平
- cppyy这种动态的模块编译方式,在本次测试中性能有损失;但记录的140µs很可能包含了每次调用时Python list到std::vector的转换开销,因此不能据此断定是cppyy本身或std::vector慢