# SIMD SSE2 dot product
# Equivalent to the following C code:
# long dotprod(signed short *a,signed short *b,int cnt)
# {
#         long sum = 0;
#         cnt *= 8;
#         while(cnt--)
#                 sum += *a++ * *b++;
#         return sum;
# }
# a and b must be 128-bit aligned
# Copyright 2001, Phil Karn KA9Q
# May be used under the terms of the GNU Lesser General Public License (LGPL)

        .text
        .global dotprod_sse2_assist
        .type dotprod_sse2_assist,@function
dotprod_sse2_assist:
        pushl %ebp
        movl %esp,%ebp
        pushl %esi
        pushl %edi
        pushl %ecx
        pushl %ebx
        movl 8(%ebp),%esi       # a
        movl 12(%ebp),%edi      # b
        movl 16(%ebp),%ecx      # cnt
        pxor %xmm0,%xmm0        # clear running sum (four 32-bit partial sums)

# SSE2 dot product loop unrolled 4 times, crunching 32 terms per loop
# Each PMADDWD forms eight 16x16-bit products and adds adjacent pairs
# into the four 32-bit lanes accumulated by PADDD
        .align 16
.Loop1:
        subl $4,%ecx
        jl .Loop1Done

        movdqa (%esi),%xmm1
        pmaddwd (%edi),%xmm1
        paddd %xmm1,%xmm0

        movdqa 16(%esi),%xmm1
        pmaddwd 16(%edi),%xmm1
        paddd %xmm1,%xmm0

        movdqa 32(%esi),%xmm1
        pmaddwd 32(%edi),%xmm1
        paddd %xmm1,%xmm0

        movdqa 48(%esi),%xmm1
        addl $64,%esi
        pmaddwd 48(%edi),%xmm1
        addl $64,%edi
        paddd %xmm1,%xmm0

        jmp .Loop1
.Loop1Done:
        addl $4,%ecx            # restore count of blocks left over from unrolled loop

# SSE2 dot product loop, not unrolled, crunching 8 terms per loop
# This could be redone as Duff's Device on the unrolled loop above
.Loop2:
        subl $1,%ecx
        jl .Loop2Done

        movdqa (%esi),%xmm1
        addl $16,%esi
        pmaddwd (%edi),%xmm1
        addl $16,%edi
        paddd %xmm1,%xmm0
        jmp .Loop2
.Loop2Done:
# Fold the four 32-bit partial sums in %xmm0 into a single result in %eax
        movdqa %xmm0,%xmm1
        psrldq $8,%xmm0         # add high quadword into low quadword
        paddd %xmm1,%xmm0
        movd %xmm0,%eax         # low 32-bit word to eax
        psrldq $4,%xmm0
        movd %xmm0,%ebx         # next 32-bit word to ebx
        addl %ebx,%eax          # eax = sum of all four partial sums

        popl %ebx
        popl %ecx
        popl %edi
        popl %esi
        movl %ebp,%esp
        popl %ebp
        ret
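
# What the inner loop and the final fold compute, in scalar C terms (an
# illustrative sketch, not part of the original source; the variable names
# lane, i and k are assumptions). Each PMADDWD multiplies eight pairs of
# signed 16-bit samples and adds adjacent products into four 32-bit lanes,
# which PADDD accumulates into %xmm0; the PSRLDQ/PADDD/MOVD sequence then
# folds those four lanes into one 32-bit result in %eax.
#
# int lane[4] = {0, 0, 0, 0};              /* the four 32-bit lanes of %xmm0 */
# int i, k;
#
# for (k = 0; k < cnt; k++, a += 8, b += 8)        /* one 128-bit block */
#         for (i = 0; i < 4; i++)                  /* pmaddwd + paddd */
#                 lane[i] += a[2*i] * b[2*i] + a[2*i+1] * b[2*i+1];
#
# return lane[0] + lane[1] + lane[2] + lane[3];    /* horizontal fold */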
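
# Example caller (a sketch, not part of the original source; the function
# name dotprod_example, the NBLOCKS constant, and the use of GCC's aligned
# attribute are illustrative assumptions). It shows the interface implied
# above: a and b must be 16-byte (128-bit) aligned, and cnt counts 8-sample
# blocks rather than individual samples.
#
# #define NBLOCKS 64                       /* arbitrary example size */
#
# extern long dotprod_sse2_assist(signed short *a, signed short *b, int cnt);
#
# static signed short a[NBLOCKS * 8] __attribute__((aligned(16)));
# static signed short b[NBLOCKS * 8] __attribute__((aligned(16)));
#
# long dotprod_example(void)
# {
#         /* ... fill a[] and b[] with samples ... */
#         return dotprod_sse2_assist(a, b, NBLOCKS);
# }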