9

内存拷贝优化(1)-小内存拷贝优化

 3 years ago
source link: https://www.skywind.me/blog/archives/143
Go to the source link to view the article. You can view the picture content, updated content and better typesetting reading experience. If the link is broken, please click the button below to view the snapshot at that time.
neoserver,ios ssh client

内存拷贝优化(1)-小内存拷贝优化

相信大家代码里有很多地方用到memcpy这个函数,相信这个函数的占用是不小的,有时优化了memcpy,能使整个项目的运行效率提升。通过适当的编码技巧,让我们的内存拷贝速度超过memcpy两倍,是可以实现的。

有人说memcpy还能优化么?不就是rep movsd么?CPU和内存之间的带宽是固定的,怎么可能优化呢?其实是普通的内存拷贝并没有发挥全部的带宽,很多被浪费掉了,比如要等到数据完全读取成功后再去写入,然后要写入成功后再去读取新的。而优化本身就是使这两者尽量的并行。发挥最大的带宽。

现代的内存拷贝都需要判断内存大小,并按照大小选择不同策略进行拷贝,比如大内存拷贝(超过cache大小),那么最好使用并行若干读取指令和写入指令,然后再并行写入,使得CPU前后结果依赖得以大大降低,并且使用缓冲预取,再CPU处理数据之前,就把数据放到离CPU最近的CACHE。这样已经可以比memcpy快很多了,如果再加上一些新指令的帮助,大内存拷贝会大大提速。

但是用同样的代码去拷贝小内存,因为额外的开销,难对齐的内存,准备工作一大堆,如果实际要拷贝的内存很小的话,这样的工作开销还比直接按照 dword复制慢很多。在VC10的memcpy实现中将内存按照128个字节大小进行区分,小于该大小的使用普通拷贝,大于该大小的使用普通SSE指令拷贝,而现在我们就要来挑战VC10的memcpy,本篇先叙述小内存拷贝部分。

适合拷贝64字节以内的数据量。原理很简单,LOOP UNROLL。rep movsb/movsd是靠不住的,小内存拷贝还是得展开循环。废话不多说,代码贴上:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <windows.h>
#include <mmsystem.h>

#ifdef _MSC_VER
#pragma comment(lib, "winmm.lib")
#endif

void *memcpy_tiny(void *dst, const void *src, size_t len)
{
    if (len <= 63) {
        register unsigned char *dd = (unsigned char*)dst + len;
        register const unsigned char *ss = (const unsigned char*)src + len;
        switch (len)
        {
        case 68:	*((int*)(dd - 68)) = *((int*)(ss - 68));
        case 64:	*((int*)(dd - 64)) = *((int*)(ss - 64));
        case 60:	*((int*)(dd - 60)) = *((int*)(ss - 60));
        case 56:	*((int*)(dd - 56)) = *((int*)(ss - 56));
        case 52:	*((int*)(dd - 52)) = *((int*)(ss - 52));
        case 48:	*((int*)(dd - 48)) = *((int*)(ss - 48));
        case 44:	*((int*)(dd - 44)) = *((int*)(ss - 44));
        case 40:	*((int*)(dd - 40)) = *((int*)(ss - 40));
        case 36:	*((int*)(dd - 36)) = *((int*)(ss - 36));
        case 32:	*((int*)(dd - 32)) = *((int*)(ss - 32));
        case 28:	*((int*)(dd - 28)) = *((int*)(ss - 28));
        case 24:	*((int*)(dd - 24)) = *((int*)(ss - 24));
        case 20:	*((int*)(dd - 20)) = *((int*)(ss - 20));
        case 16:	*((int*)(dd - 16)) = *((int*)(ss - 16));
        case 12:	*((int*)(dd - 12)) = *((int*)(ss - 12));
        case  8:	*((int*)(dd - 8)) = *((int*)(ss - 8));
        case  4:	*((int*)(dd - 4)) = *((int*)(ss - 4));
                    break;
        case 67:    *((int*)(dd - 67)) = *((int*)(ss - 67));
        case 63:    *((int*)(dd - 63)) = *((int*)(ss - 63));
        case 59:    *((int*)(dd - 59)) = *((int*)(ss - 59));
        case 55:    *((int*)(dd - 55)) = *((int*)(ss - 55));
        case 51:    *((int*)(dd - 51)) = *((int*)(ss - 51));
        case 47:    *((int*)(dd - 47)) = *((int*)(ss - 47));
        case 43:    *((int*)(dd - 43)) = *((int*)(ss - 43));
        case 39:    *((int*)(dd - 39)) = *((int*)(ss - 39));
        case 35:    *((int*)(dd - 35)) = *((int*)(ss - 35));
        case 31:    *((int*)(dd - 31)) = *((int*)(ss - 31));
        case 27:    *((int*)(dd - 27)) = *((int*)(ss - 27));
        case 23:    *((int*)(dd - 23)) = *((int*)(ss - 23));
        case 19:    *((int*)(dd - 19)) = *((int*)(ss - 19));
        case 15:    *((int*)(dd - 15)) = *((int*)(ss - 15));
        case 11:    *((int*)(dd - 11)) = *((int*)(ss - 11));
        case  7:    *((int*)(dd - 7)) = *((int*)(ss - 7));
                    *((int*)(dd - 4)) = *((int*)(ss - 4));
                    break;
        case  3:    *((short*)(dd - 3)) = *((short*)(ss - 3));
                    dd[-1] = ss[-1];
                    break;
        case 66:    *((int*)(dd - 66)) = *((int*)(ss - 66));
        case 62:    *((int*)(dd - 62)) = *((int*)(ss - 62));
        case 58:    *((int*)(dd - 58)) = *((int*)(ss - 58));
        case 54:    *((int*)(dd - 54)) = *((int*)(ss - 54));
        case 50:    *((int*)(dd - 50)) = *((int*)(ss - 50));
        case 46:    *((int*)(dd - 46)) = *((int*)(ss - 46));
        case 42:    *((int*)(dd - 42)) = *((int*)(ss - 42));
        case 38:    *((int*)(dd - 38)) = *((int*)(ss - 38));
        case 34:    *((int*)(dd - 34)) = *((int*)(ss - 34));
        case 30:    *((int*)(dd - 30)) = *((int*)(ss - 30));
        case 26:    *((int*)(dd - 26)) = *((int*)(ss - 26));
        case 22:    *((int*)(dd - 22)) = *((int*)(ss - 22));
        case 18:    *((int*)(dd - 18)) = *((int*)(ss - 18));
        case 14:    *((int*)(dd - 14)) = *((int*)(ss - 14));
        case 10:    *((int*)(dd - 10)) = *((int*)(ss - 10));
        case  6:    *((int*)(dd - 6)) = *((int*)(ss - 6));
        case  2:    *((short*)(dd - 2)) = *((short*)(ss - 2));
                    break;
        case 65:    *((int*)(dd - 65)) = *((int*)(ss - 65));
        case 61:    *((int*)(dd - 61)) = *((int*)(ss - 61));
        case 57:    *((int*)(dd - 57)) = *((int*)(ss - 57));
        case 53:    *((int*)(dd - 53)) = *((int*)(ss - 53));
        case 49:    *((int*)(dd - 49)) = *((int*)(ss - 49));
        case 45:    *((int*)(dd - 45)) = *((int*)(ss - 45));
        case 41:    *((int*)(dd - 41)) = *((int*)(ss - 41));
        case 37:    *((int*)(dd - 37)) = *((int*)(ss - 37));
        case 33:    *((int*)(dd - 33)) = *((int*)(ss - 33));
        case 29:    *((int*)(dd - 29)) = *((int*)(ss - 29));
        case 25:    *((int*)(dd - 25)) = *((int*)(ss - 25));
        case 21:    *((int*)(dd - 21)) = *((int*)(ss - 21));
        case 17:    *((int*)(dd - 17)) = *((int*)(ss - 17));
        case 13:    *((int*)(dd - 13)) = *((int*)(ss - 13));
        case  9:    *((int*)(dd - 9)) = *((int*)(ss - 9));
        case  5:    *((int*)(dd - 5)) = *((int*)(ss - 5));
        case  1:    dd[-1] = ss[-1];
                    break;
        case 0:
        default: break;
        }
        return dd;
    }	else {
        return memcpy(dst, src, len);
    }
}

typedef void *(*MemCpyProc)(void *dst, const void *src, size_t size);

void benchmark(MemCpyProc memcpy1, MemCpyProc memcpy2, int x1, int x2)
{
    static unsigned char card1[0x20015];
    static unsigned char card2[0x20015];
    char *align1 = (char*)card1 + ((16 - ((size_t)card1 & 15)) & 15);
    char *align2 = (char*)card2 + ((16 - ((size_t)card2 & 15)) & 15);
    long totalsize = 0x20000000;
    int size, t1, t2, i;
    for (size = 64; size >= 8; size = size * 2 / 3) {
        int times = totalsize / size;
        printf("memcpy: size=%6d\t times=%8d:\t ", size, times);
        Sleep(50);
        t1 = timeGetTime();
        for (i = 0; i < times; i++)
            memcpy1(align1 + x1, align2 + x2, size);
        t1 = timeGetTime() - t1;
        Sleep(50);
        t2 = timeGetTime();
        for (i = 0; i < times; i++)
            memcpy2(align1 + x1, align2 + x2, size);
        t2 = timeGetTime() - t2;
        printf("t1=%4d\t t2=%4d\n", t1, t2);
    }
}
//! flag: -O3
//! exe:
int main(void)
{
    printf("\ndst: aligned  src: aligned\n");
    benchmark(memcpy, memcpy_tiny, 0, 0);
    printf("\ndst: un-aligned  src: aligned\n");
    benchmark(memcpy, memcpy_tiny, 1, 0);
    printf("\ndst: aligned  src: un-aligned\n");
    benchmark(memcpy, memcpy_tiny, 0, 1);
    printf("\ndst: un-aligned  src: un-aligned\n");
    benchmark(memcpy, memcpy_tiny, 3, 1);
    return 0;
}

该代码展开了循环,效率比rep movsd之类的高2-3倍,比memcpy平均快2倍,下面是测试数据,t1代表memcpy, t2代表memcpy_tiny,使用VC2010最大优化模式编译:

dst: aligned src: aligned
memcpy: size= 64 times= 8388608: t1= 239 t2= 243
memcpy: size= 42 times=12782640: t1= 341 t2= 170
memcpy: size= 28 times=19173961: t1= 225 t2= 205
memcpy: size= 18 times=29826161: t1= 363 t2= 232
memcpy: size= 12 times=44739242: t1= 448 t2= 294
memcpy: size= 8 times=67108864: t1= 699 t2= 406

dst: un-aligned src: aligned
memcpy: size= 64 times= 8388608: t1= 255 t2= 256
memcpy: size= 42 times=12782640: t1= 369 t2= 269
memcpy: size= 28 times=19173961: t1= 289 t2= 165
memcpy: size= 18 times=29826161: t1= 421 t2= 203
memcpy: size= 12 times=44739242: t1= 493 t2= 279
memcpy: size= 8 times=67108864: t1= 956 t2= 354

dst: aligned src: un-aligned
memcpy: size= 64 times= 8388608: t1= 241 t2= 257
memcpy: size= 42 times=12782640: t1= 303 t2= 218
memcpy: size= 28 times=19173961: t1= 258 t2= 181
memcpy: size= 18 times=29826161: t1= 319 t2= 261
memcpy: size= 12 times=44739242: t1= 448 t2= 325
memcpy: size= 8 times=67108864: t1= 646 t2= 401

dst: un-aligned src: un-aligned
memcpy: size= 64 times= 8388608: t1= 243 t2= 257
memcpy: size= 42 times=12782640: t1= 329 t2= 273
memcpy: size= 28 times=19173961: t1= 276 t2= 193
memcpy: size= 18 times=29826161: t1= 336 t2= 249
memcpy: size= 12 times=44739242: t1= 529 t2= 332
memcpy: size= 8 times=67108864: t1= 915 t2= 368

可见平均在64,42,28,18,12,8等几个大小的测试中,平均速度是memcpy的两倍。主要是依靠跳转表来实现的(VC INLINE ASM不能描述跳转表,因此该部分无法写成inline asm模式,索幸switch /case在大部分C编译器中都是用跳转表来实现的,因此我们可以用switch/case来模拟基于汇编指令跳转表的内存拷贝)。该部分反编译的结果:

; 18   : 		switch (len)
  00016	8d 71 ff	 lea	 esi, DWORD PTR [ecx-1]
  00019	03 c1		 add	 eax, ecx
  0001b	83 fe 43	 cmp	 esi, 67			; 00000043H
  0001e	0f 87 fd 01 00
    00		 ja	 $LN2@memcpy_tin
  00024	ff 24 b5 00 00
    00 00		 jmp	 DWORD PTR $LN77@memcpy_tin[esi*4]
$LN70@memcpy_tin:
; 19   : 		{
; 20   : 		case 68:	*((int*)(dd - 68)) = *((int*)(ss - 68));
  0002b	8b 74 0a bc	 mov	 esi, DWORD PTR [edx+ecx-68]
  0002f	89 70 bc	 mov	 DWORD PTR [eax-68], esi
$LN69@memcpy_tin:
; 21   : 		case 64:	*((int*)(dd - 64)) = *((int*)(ss - 64));
  00032	8b 74 0a c0	 mov	 esi, DWORD PTR [edx+ecx-64]
  00036	89 70 c0	 mov	 DWORD PTR [eax-64], esi
$LN68@memcpy_tin:
; 22   : 		case 60:	*((int*)(dd - 60)) = *((int*)(ss - 60));
  00039	8b 74 0a c4	 mov	 esi, DWORD PTR [edx+ecx-60]
  0003d	89 70 c4	 mov	 DWORD PTR [eax-60], esi
$LN67@memcpy_tin:
; 23   : 		case 56:	*((int*)(dd - 56)) = *((int*)(ss - 56));
  00040	8b 74 0a c8	 mov	 esi, DWORD PTR [edx+ecx-56]
  00044	89 70 c8	 mov	 DWORD PTR [eax-56], esi
$LN66@memcpy_tin:

.................................
$LN77@memcpy_tin:
; 105  : 	}
; 106  : }
  0022c	00 00 00 00	 DD	 $LN3@memcpy_tin
  00230	00 00 00 00	 DD	 $LN20@memcpy_tin
  00234	00 00 00 00	 DD	 $LN37@memcpy_tin
  00238	00 00 00 00	 DD	 $LN54@memcpy_tin
  0023c	00 00 00 00	 DD	 $LN4@memcpy_tin
  00240	00 00 00 00	 DD	 $LN21@memcpy_tin
  00244	00 00 00 00	 DD	 $LN38@memcpy_tin
  00248	00 00 00 00	 DD	 $LN55@memcpy_tin
  0024c	00 00 00 00	 DD	 $LN5@memcpy_tin
  00250	00 00 00 00	 DD	 $LN22@memcpy_tin
  00254	00 00 00 00	 DD	 $LN39@memcpy_tin
  00258	00 00 00 00	 DD	 $LN56@memcpy_tin
  0025c	00 00 00 00	 DD	 $LN6@memcpy_tin
  00260	00 00 00 00	 DD	 $LN23@memcpy_tin
  00264	00 00 00 00	 DD	 $LN40@memcpy_tin
  00268	00 00 00 00	 DD	 $LN57@memcpy_tin
  0026c	00 00 00 00	 DD	 $LN7@memcpy_tin
  00270	00 00 00 00	 DD	 $LN24@memcpy_tin
  00274	00 00 00 00	 DD	 $LN41@memcpy_tin
  00278	00 00 00 00	 DD	 $LN58@memcpy_tin
  0027c	00 00 00 00	 DD	 $LN8@memcpy_tin
  00280	00 00 00 00	 DD	 $LN25@memcpy_tin
  00284	00 00 00 00	 DD	 $LN42@memcpy_tin
  00288	00 00 00 00	 DD	 $LN59@memcpy_tin

大家发现诀窍没有?好了,其实在大部分情况下,比如脚本处理等,很多是64个字节以内的内存拷贝,所以小内存拷贝的优化是比较实在的,请大家继续关注“内存拷贝优化(2)-大内存拷贝优化”。

1170 total views, 1 view today

I like thisUnlike LikeI dislike thisUndislike 2


About Joyk


Aggregate valuable and interesting links.
Joyk means Joy of geeK