最近看某技术论坛,看到同是后台开发的同学,面试腾讯并回忆了一题关于是否知道memcpy优化相关的。
因为工作原因,是好久没有使用到类似这些库函数,并思考着优化方面,然后带着兴趣去研究下。本身优化这件事情,是需要测试评估的,不过早优化,还是先以程序功能正确和稳定,后面有瓶颈再通过一些工具去压测,找出热点,8-2原则,使用时间复杂度更低的数据结构或算法去代替并回归测试,或以空间换时间。
见过网上大部分资料,包括笔试面试题,有关memcpy/memmove等的主要实现是一个字符一个字符拷贝的,然后早些年工作中使用到dpdk,后者中重新实现了memcpy,大概就是根据特定业务场景优化,所以这里是准备分析下相关的代码实现。
memcpy和memmove的接口描述:
SYNOPSIS
#include <string.h>
void *
memcpy(void *restrict dst, const void *restrict src, size_t n);
DESCRIPTION
The memcpy() function copies n bytes from memory area src to memory area dst. If dst
and src overlap, behavior is undefined. Applications in which dst and src might
overlap should use memmove(3) instead.
SYNOPSIS
#include <string.h>
void *
memmove(void *dst, const void *src, size_t len);
DESCRIPTION
The memmove() function copies len bytes from string src to string dst. The two
strings may overlap; the copy is always done in a non-destructive manner.
这里以glibc-2.27中的实现为说明。以上两个实现,前者没有考虑内存重叠的情况,后者更安全些。
memcpy
26 void *
27 memcpy (void *dstpp, const void *srcpp, size_t len)
28 {
29 unsigned long int dstp = (long int) dstpp;
30 unsigned long int srcp = (long int) srcpp;
31
32 /* Copy from the beginning to the end. */
33
34 /* If there not too few bytes to copy, use word copy. */
35 if (len >= OP_T_THRES)
36 {
37 /* Copy just a few bytes to make DSTP aligned. */
38 len -= (-dstp) % OPSIZ;
39 BYTE_COPY_FWD (dstp, srcp, (-dstp) % OPSIZ);
40
41 /* Copy whole pages from SRCP to DSTP by virtual address manipulation,
42 as much as possible. */
43
44 PAGE_COPY_FWD_MAYBE (dstp, srcp, len, len);
45
46 /* Copy from SRCP to DSTP taking advantage of the known alignment of
47 DSTP. Number of bytes remaining is put in the third argument,
48 i.e. in LEN. This number may vary from machine to machine. */
49
50 WORD_COPY_FWD (dstp, srcp, len, len);
51
52 /* Fall out and copy the tail. */
53 }
54
55 /* There are just a few bytes to copy. Use byte memory operations. */
56 BYTE_COPY_FWD (dstp, srcp, len);
57
58 return dstpp;
59 }
75 /* Copy exactly NBYTES bytes from SRC_BP to DST_BP,
76 without any assumptions about alignment of the pointers. */
77 #define BYTE_COPY_FWD(dst_bp, src_bp, nbytes) \
78 do \
79 { \
80 size_t __nbytes = (nbytes); \
81 while (__nbytes > 0) \
82 { \
83 byte __x = ((byte *) src_bp)[0]; \
84 src_bp += 1; \
85 __nbytes -= 1; \
86 ((byte *) dst_bp)[0] = __x; \
87 dst_bp += 1; \
88 } \
89 } while (0)
这里宏OP_T_THRES以16为例,dpdk中也是16。WORD_COPY_FWD和PAGE_COPY_FWD_MAYBE这里不贴代码,感觉太多了,如注释说明,主要还是对要拷贝的地址进行边界对齐,并利用x86的movsl实现多字节拷贝。
memmove
42 rettype
43 inhibit_loop_to_libcall
44 MEMMOVE (a1const void *a1, a2const void *a2, size_t len)
45 {
46 unsigned long int dstp = (long int) dest;
47 unsigned long int srcp = (long int) src;
48
49 /* This test makes the forward copying code be used whenever possible.
50 Reduces the working set. */
51 if (dstp - srcp >= len) /* *Unsigned* compare! */
52 {
53 /* Copy from the beginning to the end. */
54
55 #if MEMCPY_OK_FOR_FWD_MEMMOVE
56 dest = memcpy (dest, src, len);
57 #else
58 /* If there not too few bytes to copy, use word copy. */
59 if (len >= OP_T_THRES)
61 /* Copy just a few bytes to make DSTP aligned. */
62 len -= (-dstp) % OPSIZ;
63 BYTE_COPY_FWD (dstp, srcp, (-dstp) % OPSIZ);
64
65 /* Copy whole pages from SRCP to DSTP by virtual address
66 manipulation, as much as possible. */
67
68 PAGE_COPY_FWD_MAYBE (dstp, srcp, len, len);
69
70 /* Copy from SRCP to DSTP taking advantage of the known
71 alignment of DSTP. Number of bytes remaining is put
72 in the third argument, i.e. in LEN. This number may
73 vary from machine to machine. */
74
75 WORD_COPY_FWD (dstp, srcp, len, len);
76
77 /* Fall out and copy the tail. */
78 }
79
80 /* There are just a few bytes to copy. Use byte memory operations. */
81 BYTE_COPY_FWD (dstp, srcp, len);
82 #endif /* MEMCPY_OK_FOR_FWD_MEMMOVE */
83 }
84 else
85 {
86 /* Copy from the end to the beginning. */
87 srcp += len;
88 dstp += len;
89
90 /* If there not too few bytes to copy, use word copy. */
91 if (len >= OP_T_THRES)
92 {
93 /* Copy just a few bytes to make DSTP aligned. */
94 len -= dstp % OPSIZ;
95 BYTE_COPY_BWD (dstp, srcp, dstp % OPSIZ);
96
97 /* Copy from SRCP to DSTP taking advantage of the known
98 alignment of DSTP. Number of bytes remaining is put
99 in the third argument, i.e. in LEN. This number may
100 vary from machine to machine. */
101
102 WORD_COPY_BWD (dstp, srcp, len, len);
103
104 /* Fall out and copy the tail. */
105 }
106
107 /* There are just a few bytes to copy. Use byte memory operations. */
108 BYTE_COPY_BWD (dstp, srcp, len);
109 }
110
111 RETURN (dest);
112 }
memmove中处理了地址可能重叠的情况,不过这里的实现是可以简化一些。
rte_memcpy
下面贴上dpdk中关于memcpy相关的优化,借用官方的描述:
“不存在一个“最优”的适用于任何场景(硬件+软件+数据)的memcpy实现。这也是DPDK中rte_memcpy存在的原因:不是glibc中的memcpy不够优秀,而是它和DPDK中的核心应用场景之间不合适,有没有觉得这种说法很耳熟?本文将着重探讨如何针对具体应用进行memcpy(或其他任何程序)的性能优化。”
“通常memcpy的性能开销包含:
- 数据的Load/Store
- 附加计算任务(例如地址对齐处理)
- 分支预测
通用的memcpy优化方向:
- 最大限度使用memory/cache带宽(Vector指令、指令级并行)
- Load/Store地址对齐
- 集中顺序访问
- 适当使用non-temporal访存执令
- 适当使用String指令来加速较大的拷贝
最后,所有的指令都经过CPU的流水线执行,因此对流水线效率的分析至关重要,需要优化指令顺序以避免造成流水线阻塞。”
191 static inline void *
192 rte_memcpy(void *dst, const void *src, size_t n)
193 {
194 uintptr_t dstu = (uintptr_t)dst;
195 uintptr_t srcu = (uintptr_t)src;
196 void *ret = dst;
197 size_t dstofss;
198 size_t bits;
199
200 /**
201 * Copy less than 16 bytes
202 */
203 if (n < 16) {
204 if (n & 0x01) {
205 *(uint8_t *)dstu = *(const uint8_t *)srcu;
206 srcu = (uintptr_t)((const uint8_t *)srcu + 1);
207 dstu = (uintptr_t)((uint8_t *)dstu + 1);
208 }
209 if (n & 0x02) {
210 *(uint16_t *)dstu = *(const uint16_t *)srcu;
211 srcu = (uintptr_t)((const uint16_t *)srcu + 1);
212 dstu = (uintptr_t)((uint16_t *)dstu + 1);
213 }
214 if (n & 0x04) {
215 *(uint32_t *)dstu = *(const uint32_t *)srcu;
216 srcu = (uintptr_t)((const uint32_t *)srcu + 1);
217 dstu = (uintptr_t)((uint32_t *)dstu + 1);
218 }
219 if (n & 0x08)
220 *(uint64_t *)dstu = *(const uint64_t *)srcu;
221 return ret;
222 }
当要复制的数据长度小于16字节的时候,是直接通过类型转换赋值。
224 /**
225 * Fast way when copy size doesn't exceed 512 bytes
226 */
227 if (n <= 32) {
228 rte_mov16((uint8_t *)dst, (const uint8_t *)src);
229 rte_mov16((uint8_t *)dst - 16 + n,
230 (const uint8_t *)src - 16 + n);
231 return ret;
232 }
233 if (n <= 64) {
234 rte_mov32((uint8_t *)dst, (const uint8_t *)src);
235 rte_mov32((uint8_t *)dst - 32 + n,
236 (const uint8_t *)src - 32 + n);
237 return ret;
238 }
239 if (n <= 512) {
240 if (n >= 256) {
241 n -= 256;
242 rte_mov256((uint8_t *)dst, (const uint8_t *)src);
243 src = (const uint8_t *)src + 256;
244 dst = (uint8_t *)dst + 256;
245 }
246 if (n >= 128) {
247 n -= 128;
248 rte_mov128((uint8_t *)dst, (const uint8_t *)src);
249 src = (const uint8_t *)src + 128;
250 dst = (uint8_t *)dst + 128;
251 }
252 COPY_BLOCK_128_BACK63:
253 if (n > 64) {
254 rte_mov64((uint8_t *)dst, (const uint8_t *)src);
255 rte_mov64((uint8_t *)dst - 64 + n,
256 (const uint8_t *)src - 64 + n);
257 return ret;
258 }
259 if (n > 0)
260 rte_mov64((uint8_t *)dst - 64 + n,
261 (const uint8_t *)src - 64 + n);
262 return ret;
263 }
76 /**
77 * Copy 16 bytes from one location to another,
78 * locations should not overlap.
79 */
80 static inline void
81 rte_mov16(uint8_t *dst, const uint8_t *src)
82 {
83 __m128i xmm0;
84
85 xmm0 = _mm_loadu_si128((const __m128i *)src);
86 _mm_storeu_si128((__m128i *)dst, xmm0);
87 }
88
93 static inline void
94 rte_mov32(uint8_t *dst, const uint8_t *src)
95 {
96 __m256i ymm0;
97
98 ymm0 = _mm256_loadu_si256((const __m256i *)src);
99 _mm256_storeu_si256((__m256i *)dst, ymm0);
100 }
106 static inline void
107 rte_mov64(uint8_t *dst, const uint8_t *src)
108 {
109 __m512i zmm0;
110
111 zmm0 = _mm512_loadu_si512((const void *)src);
112 _mm512_storeu_si512((void *)dst, zmm0);
113 }
119 static inline void
120 rte_mov128(uint8_t *dst, const uint8_t *src)
121 {
122 rte_mov64(dst + 0 * 64, src + 0 * 64);
123 rte_mov64(dst + 1 * 64, src + 1 * 64);
124 }
当要复制的数据长度小于等于512字节的时候,这里以rte_mov16为例说明SSE2指令的意义。
__m128i表示为128bits的整数;
_mm_loadu_si128表示:Loads 128-bit value;即加载128位值;
_mm_storeu_si128表示:Stores 128-bit value;
Address dst must be 16-byte aligned.
不过这里有个疑问,比如当n=300的时候,先执行rte_mov256复制,然后n=4,然后再执行rte_mov64,此时(uint8_t *)dst - 64 + n
语句把dst往前移(64 - n)个字节单位,那么可能造成重复加载(赋值)(64 - n)*8位?
277 /**
278 * Copy 512-byte blocks.
279 * Use copy block function for better instruction order control,
280 * which is important when load is unaligned.
281 */
282 rte_mov512blocks((uint8_t *)dst, (const uint8_t *)src, n);
283 bits = n;
284 n = n & 511;
285 bits -= n;
286 src = (const uint8_t *)src + bits;
287 dst = (uint8_t *)dst + bits;
288
289 /**
290 * Copy 128-byte blocks.
291 * Use copy block function for better instruction order control,
292 * which is important when load is unaligned.
293 */
294 if (n >= 128) {
295 rte_mov128blocks((uint8_t *)dst, (const uint8_t *)src, n);
296 bits = n;
297 n = n & 127;
298 bits -= n;
299 src = (const uint8_t *)src + bits;
300 dst = (uint8_t *)dst + bits;
301 }
303 /**
304 * Copy whatever left
305 */
306 goto COPY_BLOCK_128_BACK63;
307 }
当超过512字节时,这里先对齐,即先加载64字节,然后把src/dst往后移dstofss个字节单位,从(uint8_t *)dst + dstofss
到(uint8_t *)dst+64
会在后面重新赋值。然后以512个字节为block进行赋值,接着以128字节为单位等。
139 /**
140 * Copy 128-byte blocks from one location to another,
141 * locations should not overlap.
142 */
143 static inline void
144 rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
145 {
146 __m512i zmm0, zmm1;
147
148 while (n >= 128) {
149 zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
150 n -= 128;
151 zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
152 src = src + 128;
153 _mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
154 _mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
155 dst = dst + 128;
156 }
157 }
163 static inline void
164 rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
165 {
166 __m512i zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7;
167
168 while (n >= 512) {
169 zmm0 = _mm512_loadu_si512((const void *)(src + 0 * 64));
170 n -= 512;
171 zmm1 = _mm512_loadu_si512((const void *)(src + 1 * 64));
172 zmm2 = _mm512_loadu_si512((const void *)(src + 2 * 64));
173 zmm3 = _mm512_loadu_si512((const void *)(src + 3 * 64));
174 zmm4 = _mm512_loadu_si512((const void *)(src + 4 * 64));
175 zmm5 = _mm512_loadu_si512((const void *)(src + 5 * 64));
176 zmm6 = _mm512_loadu_si512((const void *)(src + 6 * 64));
177 zmm7 = _mm512_loadu_si512((const void *)(src + 7 * 64));
178 src = src + 512;
179 _mm512_storeu_si512((void *)(dst + 0 * 64), zmm0);
180 _mm512_storeu_si512((void *)(dst + 1 * 64), zmm1);
181 _mm512_storeu_si512((void *)(dst + 2 * 64), zmm2);
182 _mm512_storeu_si512((void *)(dst + 3 * 64), zmm3);
183 _mm512_storeu_si512((void *)(dst + 4 * 64), zmm4);
184 _mm512_storeu_si512((void *)(dst + 5 * 64), zmm5);
185 _mm512_storeu_si512((void *)(dst + 6 * 64), zmm6);
186 _mm512_storeu_si512((void *)(dst + 7 * 64), zmm7);
187 dst = dst + 512;
188 }
189 }
以上是dpdk中针对此业务场景针对性的对memory进行了优化,另外实现中没有处理overlap地址重叠的情况。
参考
大并发服务器内存转换的灵活运用,memcpy的思考
glibc--memcpy源码分析
DPDK中的memcpy性能优化及思考
_mm_loadu_si128