; NASM assembly language code for PAQ7.
; (C) 2005, Matt Mahoney.
; This is free software under GPL, http://www.gnu.org/licenses/gpl.txt
;
; MINGW g++:     nasm paq7asm.asm -f win32 --prefix _
; DJGPP g++:     nasm paq7asm.asm -f coff  --prefix _
; Borland, Mars: nasm paq7asm.asm -f obj   --prefix _
; Linux:         nasm paq7asm.asm -f elf
;
; For other Windows compilers try -f win32 or -f obj.  Some old versions
; of Linux should use -f aout instead of -f elf.
;
; This code will only work on a Pentium-MMX or higher.  Apart from
; dot_product_sse2 (which needs SSE2, i.e. a Pentium 4 or higher), it
; doesn't use extended (Katmai/SSE) instructions.  It won't work
; in 64-bit mode.

section .text use32 class=CODE

;-----------------------------------------------------------------------
; do_emms()
; Reset after MMX: executes EMMS so the x87 FPU can be used again after
; MMX instructions have run.
; In:    nothing (no stack arguments are read)
; Out:   nothing (no register is set)
;-----------------------------------------------------------------------
global do_emms
do_emms:
        emms
        ret

;-----------------------------------------------------------------------
; dot_product(short* a, short* b, int n)
; Vector product a*b of n signed words, returning a signed dword scaled
; down by 8 bits.  Each pair sum a[2i]*b[2i] + a[2i+1]*b[2i+1] is shifted
; right (arithmetically) by 8 before it is accumulated.
; n is rounded up to a multiple of 8, so both arrays must be readable up
; to that rounded length.  Returns 0 when n <= 0.
; In:    [esp+4] = a, [esp+8] = b, [esp+12] = n
; Out:   eax = scaled sum
; Clobb: ecx, edx, mm0-mm2, flags (EMMS is executed before returning
;        whenever MMX registers were touched)
;-----------------------------------------------------------------------
global dot_product              ; (short* a, short* b, int n)
align 16
dot_product:
        mov     eax, [esp+4]            ; a
        mov     edx, [esp+8]            ; b
        mov     ecx, [esp+12]           ; n
        add     ecx, 7                  ; round n up to a multiple of 8
        and     ecx, -8
        jle     .empty                  ; AND clears OF, so this is "n <= 0"
        sub     eax, 8                  ; bias: [eax+ecx*2] = last 4 words of a
        sub     edx, 8                  ; bias: [edx+ecx*2] = last 4 words of b
        pxor    mm0, mm0                ; sum = 0 (two dword partial sums)
.loop:                                  ; each iteration sums 8 products
        movq    mm1, [eax+ecx*2]        ; upper 4 words of this group of 8
        pmaddwd mm1, [edx+ecx*2]        ; 2 dwords, each a sum of 2 products
        movq    mm2, [eax+ecx*2-8]      ; lower 4 words of this group of 8
        pmaddwd mm2, [edx+ecx*2-8]
        psrad   mm1, 8                  ; scale down before accumulating
        psrad   mm2, 8
        paddd   mm0, mm1
        paddd   mm0, mm2
        sub     ecx, 8                  ; ecx = words still to process
        ja      .loop                   ; stop when it reaches 0
        movq    mm1, mm0                ; add 2 halves of mm0 and return in eax
        psrlq   mm1, 32
        paddd   mm0, mm1
        movd    eax, mm0
        emms
        ret
.empty:                                 ; nothing to sum; no MMX state touched
        xor     eax, eax
        ret

;-----------------------------------------------------------------------
; dot_product_sse2(short* a, short* b, int n)
; This should work on a Pentium 4 or higher in 32-bit mode,
; but it isn't much faster than the MMX version so I don't use it.
;
; Same contract as dot_product: sums a[i]*b[i] over n signed words, with
; each pair sum shifted right (arithmetically) by 8 before accumulation.
; n is rounded up to a multiple of 8, so both arrays must be readable up
; to that rounded length.  Returns 0 when n <= 0.
; a and b must be 16-byte aligned: movdqa and the memory operand of
; pmaddwd fault on unaligned addresses.
; In:    [esp+4] = a, [esp+8] = b, [esp+12] = n
; Out:   eax = scaled sum
; Clobb: ecx, edx, xmm0, xmm1, flags
;-----------------------------------------------------------------------
global dot_product_sse2         ; (short* a, short* b, int n)
align 16
dot_product_sse2:
        mov     eax, [esp+4]            ; a
        mov     edx, [esp+8]            ; b
        mov     ecx, [esp+12]           ; n
        add     ecx, 7                  ; round n up to a multiple of 8
        and     ecx, -8
        jle     .empty                  ; AND clears OF, so this is "n <= 0"
        sub     eax, 16                 ; bias: [eax+ecx*2] = last 8 words of a
        sub     edx, 16                 ; bias: [edx+ecx*2] = last 8 words of b
        pxor    xmm0, xmm0              ; sum = 0 (four dword partial sums)
.loop:                                  ; each iteration sums 8 products
        movdqa  xmm1, [eax+ecx*2]       ; put partial sums of vector product in xmm0
        pmaddwd xmm1, [edx+ecx*2]       ; 4 dwords, each a sum of 2 products
        psrad   xmm1, 8                 ; scale down before accumulating
        paddd   xmm0, xmm1
        sub     ecx, 8                  ; ecx = words still to process
        ja      .loop                   ; stop when it reaches 0
        movdqa  xmm1, xmm0              ; add 4 parts of xmm0 and return in eax
        psrldq  xmm1, 8                 ; fold upper 2 dwords onto lower 2
        paddd   xmm0, xmm1
        movdqa  xmm1, xmm0
        psrldq  xmm1, 4                 ; fold dword 1 onto dword 0
        paddd   xmm0, xmm1
        movd    eax, xmm0
        ret
.empty:                                 ; nothing to sum
        xor     eax, eax
        ret

;-----------------------------------------------------------------------
; train(short* t, short* w, int n, int err)
; Train n neural network weights w[n] on inputs t[n] and err.
; Per element, using signed saturating word arithmetic:
;     w[i] += (((t[i]*2) * err >> 16) + 1) >> 1
; i.e. roughly w[i] += t[i]*err*2+1 >> 17, bounded to +- 32K.
; Only the low 16 bits of err are used.
; n is rounded up to a multiple of 8, so both arrays must be accessible
; up to that rounded length.  Nothing is read or written when n <= 0.
; In:    [esp+4] = t, [esp+8] = w, [esp+12] = n, [esp+16] = err
; Out:   w[] updated in place; eax holds no meaningful result
; Clobb: eax, ecx, edx, mm0-mm5, flags (EMMS is executed before returning)
;-----------------------------------------------------------------------
global train                    ; (short* t, short* w, int n, int err)
align 16
train:
        movd    mm0, dword [esp+16]     ; err in the low dword
        punpcklwd mm0, mm0              ; low word duplicated into words 0,1
        punpckldq mm0, mm0              ; ...and into all 4 words of mm0
        pcmpeqb mm1, mm1                ; 4 copies of 1 in mm1
        psrlw   mm1, 15
        mov     eax, [esp+4]            ; t
        mov     edx, [esp+8]            ; w
        mov     ecx, [esp+12]           ; n
        add     ecx, 7                  ; round n up to a multiple of 8
        and     ecx, -8
        jle     .done                   ; n <= 0: must test BEFORE the pointer
                                        ; bias below overwrites the flags
        sub     eax, 8                  ; bias: [eax+ecx*2] = last 4 words of t
        sub     edx, 8                  ; bias: [edx+ecx*2] = last 4 words of w
.loop:                                  ; each iteration adjusts 8 weights
        movq    mm2, [edx+ecx*2]        ; w[i], upper 4 of this group
        movq    mm3, [eax+ecx*2]        ; t[i], upper 4 of this group
        movq    mm4, [edx+ecx*2-8]      ; w[i], lower 4 of this group
        movq    mm5, [eax+ecx*2-8]      ; t[i], lower 4 of this group
        paddsw  mm3, mm3                ; t*2 (saturating)
        paddsw  mm5, mm5
        pmulhw  mm3, mm0                ; (t*2*err) >> 16
        pmulhw  mm5, mm0
        paddsw  mm3, mm1                ; +1 so the final shift rounds
        paddsw  mm5, mm1
        psraw   mm3, 1
        psraw   mm5, 1
        paddsw  mm2, mm3                ; w += delta (saturating)
        paddsw  mm4, mm5
        movq    [edx+ecx*2], mm2
        movq    [edx+ecx*2-8], mm4
        sub     ecx, 8                  ; ecx = words still to process
        ja      .loop                   ; stop when it reaches 0
.done:
        emms
        ret