; NASM assembly language code for PAQ7. ; (C) 2005, Matt Mahoney. ; This is free software under GPL, http://www.gnu.org/licenses/gpl.txt ; ; MINGW g++: nasm paq7asm.asm -f win32 --prefix _ ; DJGPP g++: nasm paq7asm.asm -f coff --prefix _ ; Borland, Mars: nasm paq7asm.asm -f obj --prefix _ ; Linux: nasm paq7asm.asm -f elf ; ; For other Windows compilers try -f win32 or -f obj. Some old versions ; of Linux should use -f aout instead of -f elf. ; ; This code will only work on a Pentium-MMX or higher. It doesn't ; use extended (Katmai/SSE) instructions. It won't work ; in 64-bit mode. section .text use32 class=CODE ; Reset after MMX global do_emms do_emms: emms ret ; Vector product a*b of n signed words, returning signed dword scaled ; down by 8 bits. n is rounded up to a multiple of 8. global dot_product ; (short* a, short* b, int n) align 16 dot_product: mov eax, [esp+4] ; a mov edx, [esp+8] ; b mov ecx, [esp+12] ; n add ecx, 7 ; n rounding up and ecx, -8 jz .done sub eax, 8 sub edx, 8 pxor mm0, mm0 ; sum = 0 .loop: ; each loop sums 4 products movq mm1, [eax+ecx*2] ; put halves of vector product in mm0 pmaddwd mm1, [edx+ecx*2] movq mm2, [eax+ecx*2-8] pmaddwd mm2, [edx+ecx*2-8] psrad mm1, 8 psrad mm2, 8 paddd mm0, mm1 paddd mm0, mm2 sub ecx, 8 ja .loop movq mm1, mm0 ; add 2 halves of mm0 and return in eax psrlq mm1, 32 paddd mm0, mm1 movd eax, mm0 emms .done ret ; This should work on a Pentium 4 or higher in 32-bit mode, ; but it isn't much faster than the MMX version so I don't use it. global dot_product_sse2 ; (short* a, short* b, int n) align 16 dot_product_sse2: mov eax, [esp+4] ; a mov edx, [esp+8] ; b mov ecx, [esp+12] ; n add ecx, 7 ; n rounding up and ecx, -8 jz .done sub eax, 16 sub edx, 16 pxor xmm0, xmm0 ; sum = 0 .loop: ; each loop sums 4 products movdqa xmm1, [eax+ecx*2] ; put parital sums of vector product in xmm0 pmaddwd xmm1, [edx+ecx*2] psrad xmm1, 8 paddd xmm0, xmm1 sub ecx, 8 ja .loop movdqa xmm1, xmm0 ; add 4 parts of xmm0 and return in eax psrldq xmm1, 8 paddd xmm0, xmm1 movdqa xmm1, xmm0 psrldq xmm1, 4 paddd xmm0, xmm1 movd eax, xmm0 .done ret ; Train n neural network weights w[n] on inputs t[n] and err. ; w[i] += t[i]*err*2+1 >> 17 bounded to +- 32K. ; n is rounded up to a multiple of 8. global train ; (short* t, short* w, int n, int err) align 16 train: mov eax, [esp+16] ; err and eax, 0xffff ; put 4 copies of err in mm0 movd mm0, eax movd mm1, eax psllq mm1, 16 por mm0, mm1 movq mm1, mm0 psllq mm1, 32 por mm0, mm1 pcmpeqb mm1, mm1 ; 4 copies of 1 in mm1 psrlw mm1, 15 mov eax, [esp+4] ; t mov edx, [esp+8] ; w mov ecx, [esp+12] ; n add ecx, 7 ; n/8 rounding up and ecx, -8 sub eax, 8 sub edx, 8 jz .done .loop: ; each iteration adjusts 8 weights movq mm2, [edx+ecx*2] ; w[i] movq mm3, [eax+ecx*2] ; t[i] movq mm4, [edx+ecx*2-8] ; w[i] movq mm5, [eax+ecx*2-8] ; t[i] paddsw mm3, mm3 paddsw mm5, mm5 pmulhw mm3, mm0 pmulhw mm5, mm0 paddsw mm3, mm1 paddsw mm5, mm1 psraw mm3, 1 psraw mm5, 1 paddsw mm2, mm3 paddsw mm4, mm5 movq [edx+ecx*2], mm2 movq [edx+ecx*2-8], mm4 sub ecx, 8 ja .loop .done: emms ret