内存拷贝优化(1)-小内存拷贝优化
source link: https://www.skywind.me/blog/archives/143
Go to the source link to view the article. You can view the picture content, updated content and better typesetting reading experience. If the link is broken, please click the button below to view the snapshot at that time.
内存拷贝优化(1)-小内存拷贝优化
相信大家代码里有很多地方用到memcpy这个函数,相信这个函数的占用是不小的,有时优化了memcpy,能使整个项目的运行效率提升。通过适当的编码技巧,让我们的内存拷贝速度超过memcpy两倍,是可以实现的。
有人说memcpy还能优化么?不就是rep movsd么?CPU和内存之间的带宽是固定的,怎么可能优化呢?其实是普通的内存拷贝并没有发挥全部的带宽,很多被浪费掉了,比如要等到数据完全读取成功后再去写入,然后要写入成功后再去读取新的。而优化本身就是使这两者尽量的并行。发挥最大的带宽。
现代的内存拷贝都需要判断内存大小,并按照大小选择不同策略进行拷贝,比如大内存拷贝(超过cache大小),那么最好使用并行若干读取指令和写入指令,然后再并行写入,使得CPU前后结果依赖得以大大降低,并且使用缓冲预取,再CPU处理数据之前,就把数据放到离CPU最近的CACHE。这样已经可以比memcpy快很多了,如果再加上一些新指令的帮助,大内存拷贝会大大提速。
但是用同样的代码去拷贝小内存,因为额外的开销,难对齐的内存,准备工作一大堆,如果实际要拷贝的内存很小的话,这样的工作开销还比直接按照 dword复制慢很多。在VC10的memcpy实现中将内存按照128个字节大小进行区分,小于该大小的使用普通拷贝,大于该大小的使用普通SSE指令拷贝,而现在我们就要来挑战VC10的memcpy,本篇先叙述小内存拷贝部分。
适合拷贝64字节以内的数据量。原理很简单,LOOP UNROLL。rep movsb/movsd是靠不住的,小内存拷贝还是得展开循环。废话不多说,代码贴上:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <windows.h>
#include <mmsystem.h>
#ifdef _MSC_VER
#pragma comment(lib, "winmm.lib")
#endif
void *memcpy_tiny(void *dst, const void *src, size_t len)
{
if (len <= 63) {
register unsigned char *dd = (unsigned char*)dst + len;
register const unsigned char *ss = (const unsigned char*)src + len;
switch (len)
{
case 68: *((int*)(dd - 68)) = *((int*)(ss - 68));
case 64: *((int*)(dd - 64)) = *((int*)(ss - 64));
case 60: *((int*)(dd - 60)) = *((int*)(ss - 60));
case 56: *((int*)(dd - 56)) = *((int*)(ss - 56));
case 52: *((int*)(dd - 52)) = *((int*)(ss - 52));
case 48: *((int*)(dd - 48)) = *((int*)(ss - 48));
case 44: *((int*)(dd - 44)) = *((int*)(ss - 44));
case 40: *((int*)(dd - 40)) = *((int*)(ss - 40));
case 36: *((int*)(dd - 36)) = *((int*)(ss - 36));
case 32: *((int*)(dd - 32)) = *((int*)(ss - 32));
case 28: *((int*)(dd - 28)) = *((int*)(ss - 28));
case 24: *((int*)(dd - 24)) = *((int*)(ss - 24));
case 20: *((int*)(dd - 20)) = *((int*)(ss - 20));
case 16: *((int*)(dd - 16)) = *((int*)(ss - 16));
case 12: *((int*)(dd - 12)) = *((int*)(ss - 12));
case 8: *((int*)(dd - 8)) = *((int*)(ss - 8));
case 4: *((int*)(dd - 4)) = *((int*)(ss - 4));
break;
case 67: *((int*)(dd - 67)) = *((int*)(ss - 67));
case 63: *((int*)(dd - 63)) = *((int*)(ss - 63));
case 59: *((int*)(dd - 59)) = *((int*)(ss - 59));
case 55: *((int*)(dd - 55)) = *((int*)(ss - 55));
case 51: *((int*)(dd - 51)) = *((int*)(ss - 51));
case 47: *((int*)(dd - 47)) = *((int*)(ss - 47));
case 43: *((int*)(dd - 43)) = *((int*)(ss - 43));
case 39: *((int*)(dd - 39)) = *((int*)(ss - 39));
case 35: *((int*)(dd - 35)) = *((int*)(ss - 35));
case 31: *((int*)(dd - 31)) = *((int*)(ss - 31));
case 27: *((int*)(dd - 27)) = *((int*)(ss - 27));
case 23: *((int*)(dd - 23)) = *((int*)(ss - 23));
case 19: *((int*)(dd - 19)) = *((int*)(ss - 19));
case 15: *((int*)(dd - 15)) = *((int*)(ss - 15));
case 11: *((int*)(dd - 11)) = *((int*)(ss - 11));
case 7: *((int*)(dd - 7)) = *((int*)(ss - 7));
*((int*)(dd - 4)) = *((int*)(ss - 4));
break;
case 3: *((short*)(dd - 3)) = *((short*)(ss - 3));
dd[-1] = ss[-1];
break;
case 66: *((int*)(dd - 66)) = *((int*)(ss - 66));
case 62: *((int*)(dd - 62)) = *((int*)(ss - 62));
case 58: *((int*)(dd - 58)) = *((int*)(ss - 58));
case 54: *((int*)(dd - 54)) = *((int*)(ss - 54));
case 50: *((int*)(dd - 50)) = *((int*)(ss - 50));
case 46: *((int*)(dd - 46)) = *((int*)(ss - 46));
case 42: *((int*)(dd - 42)) = *((int*)(ss - 42));
case 38: *((int*)(dd - 38)) = *((int*)(ss - 38));
case 34: *((int*)(dd - 34)) = *((int*)(ss - 34));
case 30: *((int*)(dd - 30)) = *((int*)(ss - 30));
case 26: *((int*)(dd - 26)) = *((int*)(ss - 26));
case 22: *((int*)(dd - 22)) = *((int*)(ss - 22));
case 18: *((int*)(dd - 18)) = *((int*)(ss - 18));
case 14: *((int*)(dd - 14)) = *((int*)(ss - 14));
case 10: *((int*)(dd - 10)) = *((int*)(ss - 10));
case 6: *((int*)(dd - 6)) = *((int*)(ss - 6));
case 2: *((short*)(dd - 2)) = *((short*)(ss - 2));
break;
case 65: *((int*)(dd - 65)) = *((int*)(ss - 65));
case 61: *((int*)(dd - 61)) = *((int*)(ss - 61));
case 57: *((int*)(dd - 57)) = *((int*)(ss - 57));
case 53: *((int*)(dd - 53)) = *((int*)(ss - 53));
case 49: *((int*)(dd - 49)) = *((int*)(ss - 49));
case 45: *((int*)(dd - 45)) = *((int*)(ss - 45));
case 41: *((int*)(dd - 41)) = *((int*)(ss - 41));
case 37: *((int*)(dd - 37)) = *((int*)(ss - 37));
case 33: *((int*)(dd - 33)) = *((int*)(ss - 33));
case 29: *((int*)(dd - 29)) = *((int*)(ss - 29));
case 25: *((int*)(dd - 25)) = *((int*)(ss - 25));
case 21: *((int*)(dd - 21)) = *((int*)(ss - 21));
case 17: *((int*)(dd - 17)) = *((int*)(ss - 17));
case 13: *((int*)(dd - 13)) = *((int*)(ss - 13));
case 9: *((int*)(dd - 9)) = *((int*)(ss - 9));
case 5: *((int*)(dd - 5)) = *((int*)(ss - 5));
case 1: dd[-1] = ss[-1];
break;
case 0:
default: break;
}
return dd;
} else {
return memcpy(dst, src, len);
}
}
typedef void *(*MemCpyProc)(void *dst, const void *src, size_t size);
void benchmark(MemCpyProc memcpy1, MemCpyProc memcpy2, int x1, int x2)
{
static unsigned char card1[0x20015];
static unsigned char card2[0x20015];
char *align1 = (char*)card1 + ((16 - ((size_t)card1 & 15)) & 15);
char *align2 = (char*)card2 + ((16 - ((size_t)card2 & 15)) & 15);
long totalsize = 0x20000000;
int size, t1, t2, i;
for (size = 64; size >= 8; size = size * 2 / 3) {
int times = totalsize / size;
printf("memcpy: size=%6d\t times=%8d:\t ", size, times);
Sleep(50);
t1 = timeGetTime();
for (i = 0; i < times; i++)
memcpy1(align1 + x1, align2 + x2, size);
t1 = timeGetTime() - t1;
Sleep(50);
t2 = timeGetTime();
for (i = 0; i < times; i++)
memcpy2(align1 + x1, align2 + x2, size);
t2 = timeGetTime() - t2;
printf("t1=%4d\t t2=%4d\n", t1, t2);
}
}
//! flag: -O3
//! exe:
int main(void)
{
printf("\ndst: aligned src: aligned\n");
benchmark(memcpy, memcpy_tiny, 0, 0);
printf("\ndst: un-aligned src: aligned\n");
benchmark(memcpy, memcpy_tiny, 1, 0);
printf("\ndst: aligned src: un-aligned\n");
benchmark(memcpy, memcpy_tiny, 0, 1);
printf("\ndst: un-aligned src: un-aligned\n");
benchmark(memcpy, memcpy_tiny, 3, 1);
return 0;
}
该代码展开了循环,效率比rep movsd之类的高2-3倍,比memcpy平均快2倍,下面是测试数据,t1代表memcpy, t2代表memcpy_tiny,使用VC2010最大优化模式编译:
dst: aligned src: aligned
memcpy: size= 64 times= 8388608: t1= 239 t2= 243
memcpy: size= 42 times=12782640: t1= 341 t2= 170
memcpy: size= 28 times=19173961: t1= 225 t2= 205
memcpy: size= 18 times=29826161: t1= 363 t2= 232
memcpy: size= 12 times=44739242: t1= 448 t2= 294
memcpy: size= 8 times=67108864: t1= 699 t2= 406
dst: un-aligned src: aligned
memcpy: size= 64 times= 8388608: t1= 255 t2= 256
memcpy: size= 42 times=12782640: t1= 369 t2= 269
memcpy: size= 28 times=19173961: t1= 289 t2= 165
memcpy: size= 18 times=29826161: t1= 421 t2= 203
memcpy: size= 12 times=44739242: t1= 493 t2= 279
memcpy: size= 8 times=67108864: t1= 956 t2= 354
dst: aligned src: un-aligned
memcpy: size= 64 times= 8388608: t1= 241 t2= 257
memcpy: size= 42 times=12782640: t1= 303 t2= 218
memcpy: size= 28 times=19173961: t1= 258 t2= 181
memcpy: size= 18 times=29826161: t1= 319 t2= 261
memcpy: size= 12 times=44739242: t1= 448 t2= 325
memcpy: size= 8 times=67108864: t1= 646 t2= 401
dst: un-aligned src: un-aligned
memcpy: size= 64 times= 8388608: t1= 243 t2= 257
memcpy: size= 42 times=12782640: t1= 329 t2= 273
memcpy: size= 28 times=19173961: t1= 276 t2= 193
memcpy: size= 18 times=29826161: t1= 336 t2= 249
memcpy: size= 12 times=44739242: t1= 529 t2= 332
memcpy: size= 8 times=67108864: t1= 915 t2= 368
可见平均在64,42,28,18,12,8等几个大小的测试中,平均速度是memcpy的两倍。主要是依靠跳转表来实现的(VC INLINE ASM不能描述跳转表,因此该部分无法写成inline asm模式,索幸switch /case在大部分C编译器中都是用跳转表来实现的,因此我们可以用switch/case来模拟基于汇编指令跳转表的内存拷贝)。该部分反编译的结果:
; 18 : switch (len)
00016 8d 71 ff lea esi, DWORD PTR [ecx-1]
00019 03 c1 add eax, ecx
0001b 83 fe 43 cmp esi, 67 ; 00000043H
0001e 0f 87 fd 01 00
00 ja $LN2@memcpy_tin
00024 ff 24 b5 00 00
00 00 jmp DWORD PTR $LN77@memcpy_tin[esi*4]
$LN70@memcpy_tin:
; 19 : {
; 20 : case 68: *((int*)(dd - 68)) = *((int*)(ss - 68));
0002b 8b 74 0a bc mov esi, DWORD PTR [edx+ecx-68]
0002f 89 70 bc mov DWORD PTR [eax-68], esi
$LN69@memcpy_tin:
; 21 : case 64: *((int*)(dd - 64)) = *((int*)(ss - 64));
00032 8b 74 0a c0 mov esi, DWORD PTR [edx+ecx-64]
00036 89 70 c0 mov DWORD PTR [eax-64], esi
$LN68@memcpy_tin:
; 22 : case 60: *((int*)(dd - 60)) = *((int*)(ss - 60));
00039 8b 74 0a c4 mov esi, DWORD PTR [edx+ecx-60]
0003d 89 70 c4 mov DWORD PTR [eax-60], esi
$LN67@memcpy_tin:
; 23 : case 56: *((int*)(dd - 56)) = *((int*)(ss - 56));
00040 8b 74 0a c8 mov esi, DWORD PTR [edx+ecx-56]
00044 89 70 c8 mov DWORD PTR [eax-56], esi
$LN66@memcpy_tin:
.................................
$LN77@memcpy_tin:
; 105 : }
; 106 : }
0022c 00 00 00 00 DD $LN3@memcpy_tin
00230 00 00 00 00 DD $LN20@memcpy_tin
00234 00 00 00 00 DD $LN37@memcpy_tin
00238 00 00 00 00 DD $LN54@memcpy_tin
0023c 00 00 00 00 DD $LN4@memcpy_tin
00240 00 00 00 00 DD $LN21@memcpy_tin
00244 00 00 00 00 DD $LN38@memcpy_tin
00248 00 00 00 00 DD $LN55@memcpy_tin
0024c 00 00 00 00 DD $LN5@memcpy_tin
00250 00 00 00 00 DD $LN22@memcpy_tin
00254 00 00 00 00 DD $LN39@memcpy_tin
00258 00 00 00 00 DD $LN56@memcpy_tin
0025c 00 00 00 00 DD $LN6@memcpy_tin
00260 00 00 00 00 DD $LN23@memcpy_tin
00264 00 00 00 00 DD $LN40@memcpy_tin
00268 00 00 00 00 DD $LN57@memcpy_tin
0026c 00 00 00 00 DD $LN7@memcpy_tin
00270 00 00 00 00 DD $LN24@memcpy_tin
00274 00 00 00 00 DD $LN41@memcpy_tin
00278 00 00 00 00 DD $LN58@memcpy_tin
0027c 00 00 00 00 DD $LN8@memcpy_tin
00280 00 00 00 00 DD $LN25@memcpy_tin
00284 00 00 00 00 DD $LN42@memcpy_tin
00288 00 00 00 00 DD $LN59@memcpy_tin
大家发现诀窍没有?好了,其实在大部分情况下,比如脚本处理等,很多是64个字节以内的内存拷贝,所以小内存拷贝的优化是比较实在的,请大家继续关注“内存拷贝优化(2)-大内存拷贝优化”。
1170 total views, 1 view today
I like thisUnlike LikeI dislike thisUndislike 2
Recommend
About Joyk
Aggregate valuable and interesting links.
Joyk means Joy of geeK