; NASM assembly language code for PAQ7.
; (C) 2005, Matt Mahoney.
; train - written by wowtiger, Jan. 30, 2007
;
; This is free software under GPL, http://www.gnu.org/licenses/gpl.txt
;
; This code is a replacement for paq7asm.asm for newer processors
; supporting SSE2 instructions.  It is about 1% faster than the
; equivalent MMX code.  It can be linked with any version of paq7*
; or paq8*.  Assemble as below, then link following the instructions
; in the C++ source code, replacing paq7asm.obj with paq7asmsse.obj.
; No C++ code changes are needed.
;
;   MINGW g++:     nasm paq7asmsse.asm -f win32 --prefix _
;   DJGPP g++:     nasm paq7asmsse.asm -f coff  --prefix _
;   Borland, Mars: nasm paq7asmsse.asm -f obj   --prefix _
;   Linux:         nasm paq7asmsse.asm -f elf
;
; Syntax: NASM (Intel operand order), 32-bit code.  Both routines read
; their arguments from the stack ([esp+4], [esp+8], ...) and return with
; a plain `ret`, so the caller removes the arguments.

section .text use32 class=CODE

;-----------------------------------------------------------------------
; int dot_product(short* a, short* b, int n)
;
; Vector product a*b of n signed words, returning a signed dword scaled
; down by 8 bits.  n is rounded up to a multiple of 8.
;
; In:    [esp+4]  = a   (must be 16-byte aligned: accessed with movdqa)
;        [esp+8]  = b   (must be 16-byte aligned: memory operand of pmaddwd)
;        [esp+12] = n   (signed element count)
; Out:   eax = sum over word pairs of ((a[i]*b[i] + a[i+1]*b[i+1]) >> 8);
;        0 when n <= 0
; Clobb: eax, ecx, edx, xmm0, xmm1, flags
;
; Because n is rounded up, both arrays must be readable up to the next
; multiple of 8 elements; the padding contributes to the sum, so it
; should hold zeros.
;-----------------------------------------------------------------------
global dot_product                      ; (short* a, short* b, int n)
        align   16
dot_product:
        mov     ecx, [esp+12]           ; n
        add     ecx, 7                  ; round n up ...
        and     ecx, -8                 ; ... to a multiple of 8 (sets ZF/SF, clears OF)
        jle     .empty                  ; n <= 0: the sum of nothing is 0
        mov     eax, [esp+4]            ; a
        mov     edx, [esp+8]            ; b
        sub     eax, 16                 ; bias bases so index ecx*2 runs from
        sub     edx, 16                 ;   the last 16-byte group down to the first
        pxor    xmm0, xmm0              ; four partial sums = 0

.loop:                                  ; each iteration multiplies 8 word pairs
        movdqa  xmm1, [eax+ecx*2]       ; 8 words of a
        pmaddwd xmm1, [edx+ecx*2]       ; 4 dwords: a[i]*b[i] + a[i+1]*b[i+1]
        psrad   xmm1, 8                 ; scale each pair sum down by 8 bits
        paddd   xmm0, xmm1              ; accumulate partial sums in xmm0
        sub     ecx, 8                  ; ecx = elements remaining (multiple of 8)
        jg      .loop

        movdqa  xmm1, xmm0              ; add the 4 parts of xmm0, result in eax
        psrldq  xmm1, 8
        paddd   xmm0, xmm1              ; lanes 0,1 += lanes 2,3
        movdqa  xmm1, xmm0
        psrldq  xmm1, 4
        paddd   xmm0, xmm1              ; lane 0 += lane 1
        movd    eax, xmm0
        ret

.empty:
        xor     eax, eax                ; return 0 rather than a stale register value
        ret

; Train n neural network weights w[n] on inputs t[n] and err.
; w[i] += ((t[i]*err*2 >> 16) + 1) >> 1, bounded to +- 32K.
; n is rounded up to a multiple of 8.
; Train for SSE2
; Use this code to get some performance...
;-----------------------------------------------------------------------
; void train(short* t, short* w, int n, int err)
;
; Adjusts 8 weights per iteration:
;   w[i] = sat16( w[i] + (( sat16(t[i]*2) * err >> 16 ) + 1 >> 1) )
; where sat16 is signed 16-bit saturation (paddsw) and only the low
; 16 bits of err are used (they are broadcast to all 8 word lanes).
;
; In:    [esp+4]  = t    (must be 16-byte aligned: accessed with movdqa)
;        [esp+8]  = w    (must be 16-byte aligned: loaded/stored with movdqa)
;        [esp+12] = n    (signed element count, rounded up to a multiple of 8)
;        [esp+16] = err
; Out:   w[] updated in place; eax holds no defined value on return.
;        Nothing is written when n <= 0.
; Clobb: eax, ecx, edx, xmm0-xmm3, flags
;
; Because n is rounded up, t and w must be valid (and w writable) up to
; the next multiple of 8 elements.
;-----------------------------------------------------------------------
global train                            ; (short* t, short* w, int n, int err)
        align   16
train:
        mov     ecx, [esp+12]           ; n
        add     ecx, 7                  ; round n up ...
        and     ecx, -8                 ; ... to a multiple of 8 (sets ZF/SF, clears OF)
        jle     .done                   ; n <= 0: no weights to adjust
        mov     eax, [esp+4]            ; t
        mov     edx, [esp+8]            ; w
        sub     eax, 16                 ; bias bases so index ecx*2 runs from
        sub     edx, 16                 ;   the last 16-byte group down to the first
        movd    xmm0, [esp+16]          ; err in the low dword
        pshuflw xmm0, xmm0, 0           ; low word of err -> words 0..3
        punpcklqdq xmm0, xmm0           ; ... -> all 8 words
        movdqa  xmm1, [_mask]           ; 8 copies of 1 (loop-invariant rounding term)

.loop:                                  ; each iteration adjusts 8 weights
        movdqa  xmm3, [eax+ecx*2]       ; t[i]
        movdqa  xmm2, [edx+ecx*2]       ; w[i]
        paddsw  xmm3, xmm3              ; t[i]*2 (saturating)
        pmulhw  xmm3, xmm0              ; t[i]*err*2 >> 16
        paddsw  xmm3, xmm1              ; (t[i]*err*2 >> 16)+1
        psraw   xmm3, 1                 ; (t[i]*err*2 >> 16)+1 >> 1
        paddsw  xmm2, xmm3              ; w[i] + delta, saturated to 16 bits
        movdqa  [edx+ecx*2], xmm2
        sub     ecx, 8                  ; ecx = elements remaining (multiple of 8)
        jg      .loop

.done:
        ret

        align   16
_mask:  dd      10001h, 10001h, 10001h, 10001h   ; 8 words, each equal to 1