94 lines
2.7 KiB
NASM
Executable File
94 lines
2.7 KiB
NASM
Executable File
; NASM assembly language code for PAQ7.
; (C) 2005, Matt Mahoney.
; train - written by wowtiger, Jan. 30, 2007
;
; This is free software under GPL, http://www.gnu.org/licenses/gpl.txt
;
; This code is a replacement for paq7asm.asm for newer processors
; supporting SSE2 instructions. It is about 1% faster than the
; equivalent MMX code. It can be linked with any version of paq7*
; or paq8*. Assemble as below, then link following the instructions
; in the C++ source code, replacing paq7asm.obj with paq7asmsse.obj.
; No C++ code changes are needed.
;
; MINGW g++:     nasm paq7asmsse.asm -f win32 --prefix _
; DJGPP g++:     nasm paq7asmsse.asm -f coff  --prefix _
; Borland, Mars: nasm paq7asmsse.asm -f obj   --prefix _
; Linux:         nasm paq7asmsse.asm -f elf
;
|
|
|
|
; Everything below is emitted into the .text section as 32-bit code
; (use32). class=CODE is a section qualifier used by the obj output
; format listed in the build instructions above.
section .text use32 class=CODE
|
|
|
|
;-----------------------------------------------------------------------
; dot_product(short* a, short* b, int n)
;
; Vector product a*b of n signed 16-bit words, returned as a signed dword.
; Adjacent products are summed in pairs (pmaddwd), each pair sum is
; shifted right arithmetically by 8 bits, and the shifted pair sums are
; accumulated. n is rounded up to a multiple of 8, so both arrays must be
; readable for the rounded-up length.
;
; In:    [esp+4] = a, [esp+8] = b, [esp+12] = n (arguments on the stack)
; Out:   eax = result; 0 when n <= 0 (nothing is read in that case)
; Clobb: ecx, edx, xmm0, xmm1, flags
; Align: a and b are accessed with movdqa / an aligned SSE memory
;        operand, so both must be 16-byte aligned.
;-----------------------------------------------------------------------

global dot_product                      ; (short* a, short* b, int n)
align 16
dot_product:
        mov     ecx, [esp+12]           ; n
        add     ecx, 7                  ; round n up to a multiple of 8
        and     ecx, -8                 ; and clears OF, so jle tests "<= 0"
        jle     .empty                  ; zero or negative count: return 0
        mov     eax, [esp+4]            ; a
        mov     edx, [esp+8]            ; b
        sub     eax, 16                 ; bias both bases by one 16-byte block so
        sub     edx, 16                 ; [base+ecx*2] hits block 0 when ecx == 8
        pxor    xmm0, xmm0              ; four dword partial sums = 0

.loop:                                  ; each iteration consumes 8 word pairs,
                                        ; walking from the end of the arrays down
        movdqa  xmm1, [eax+ecx*2]       ; 8 words of a
        pmaddwd xmm1, [edx+ecx*2]       ; 4 dwords: a[2k]*b[2k] + a[2k+1]*b[2k+1]
        psrad   xmm1, 8                 ; scale each pair sum down by 8 bits
        paddd   xmm0, xmm1              ; accumulate partial sums
        sub     ecx, 8                  ; ecx = words remaining (multiple of 8)
        ja      .loop                   ; continue while ecx > 0

        movdqa  xmm1, xmm0              ; horizontal add of the 4 partial sums
        psrldq  xmm1, 8                 ; bring dwords 2,3 down to 0,1
        paddd   xmm0, xmm1
        movdqa  xmm1, xmm0
        psrldq  xmm1, 4                 ; bring dword 1 down to 0
        paddd   xmm0, xmm1
        movd    eax, xmm0               ; result = low dword
        ret

.empty:
        xor     eax, eax                ; empty vector: product is 0
        ret
|
|
|
|
|
|
;-----------------------------------------------------------------------
; train(short* t, short* w, int n, int err)
;
; Train n neural network weights w[n] on inputs t[n] and error err.
; For every i the code performs, on signed 16-bit lanes:
;     w[i] += (((t[i]*2) * err >> 16) + 1) >> 1
; i.e. approximately t[i]*err*2+1 >> 17. The doubling, the +1 and the
; final add all saturate (paddsw), so results stay within +-32K.
; Only the low 16 bits of err are used (word 0 is broadcast to all lanes).
; n is rounded up to a multiple of 8, so both arrays must be accessible
; for the rounded-up length.
;
; In:    [esp+4] = t, [esp+8] = w, [esp+12] = n, [esp+16] = err
; Out:   none; w is updated in place. Nothing is touched when n <= 0.
; Clobb: eax, ecx, edx, xmm0, xmm2, xmm3, flags
; Align: t and w are accessed with movdqa, so both must be 16-byte
;        aligned.
;-----------------------------------------------------------------------

global train                            ; (short* t, short* w, int n, int err)
align 16
train:
        mov     ecx, [esp+12]           ; n
        add     ecx, 7                  ; round n up to a multiple of 8
        and     ecx, -8                 ; and clears OF, so jle tests "<= 0"
        jle     .done                   ; zero or negative count: nothing to do
        mov     eax, [esp+4]            ; t
        mov     edx, [esp+8]            ; w
        sub     eax, 16                 ; bias both bases by one 16-byte block so
        sub     edx, 16                 ; [base+ecx*2] hits block 0 when ecx == 8
        movd    xmm0, [esp+16]          ; err in the low dword
        pshuflw xmm0, xmm0, 0           ; copy word 0 into words 0..3
        punpcklqdq xmm0, xmm0           ; duplicate low qword: err in all 8 words

.loop:                                  ; each iteration adjusts 8 weights,
                                        ; walking from the end of the arrays down
        movdqa  xmm3, [eax+ecx*2]       ; t[i]
        movdqa  xmm2, [edx+ecx*2]       ; w[i]
        paddsw  xmm3, xmm3              ; t[i]*2 (saturating)
        pmulhw  xmm3, xmm0              ; t[i]*2*err >> 16
        paddsw  xmm3, [_mask]           ; (t[i]*2*err >> 16) + 1
        psraw   xmm3, 1                 ; ((t[i]*2*err >> 16) + 1) >> 1
        paddsw  xmm2, xmm3              ; w[i] + delta (saturating)
        movdqa  [edx+ecx*2], xmm2       ; store updated weights
        sub     ecx, 8                  ; ecx = words remaining (multiple of 8)
        ja      .loop                   ; continue while ecx > 0

.done:
        ret

align 16
_mask   dd      10001h, 10001h, 10001h, 10001h  ; eight 16-bit words, each = 1 (rounding term)
|
|
|
|
|