aboutsummaryrefslogtreecommitdiff
path: root/mmxbfly27.s
blob: 4abbf482c1eeb37cd63dd8f3aaaa6f12ca12d5af (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
/* Intel SIMD MMX implementation of Viterbi ACS butterflies
   for 64-state (k=7) convolutional code
   Copyright 2004 Phil Karn, KA9Q
   This code may be used under the terms of the GNU Lesser General Public License (LGPL)

   int update_viterbi27_blk_mmx(struct v27 *vp, unsigned char *syms, int nbits);
*/
	# MMX (64-bit SIMD) version
	# requires Pentium-MMX, Pentium-II or better

	# Byte offsets of the pointer fields inside struct v27
	# (the struct itself is defined in viterbi27_mmx.c)
	.equ DP,128			# pointer to the next decision slot
	.equ OLDMETRICS,132		# pointer to the old path metrics
	.equ NEWMETRICS,136		# pointer to the new path metrics

	.text
	# Entry point exported by this file
	.globl update_viterbi27_blk_mmx
	# Branch metric tables: referenced below but not defined in this file
	.globl Mettab27_1
	.globl Mettab27_2
	.type update_viterbi27_blk_mmx,@function
	.align 16
	
/* int update_viterbi27_blk_mmx(struct v27 *vp, unsigned char *syms, int nbits)

   Runs nbits add-compare-select steps, 32 butterflies per step.

   Arguments are read from the stack at 8/12/16(%ebp) (32-bit code):
     vp    - decoder state; a NULL pointer is rejected
     syms  - input symbols, two bytes consumed per step
     nbits - number of steps; a value <= 0 performs no steps

   Returns in %eax: 0 on success, -1 if vp is NULL.

   Side effects:
     - writes 64 bytes of decisions per step through the pointer at
       DP(vp) and stores the advanced pointer back on exit
     - exchanges the roles of the old/new metric buffers once per step
       and stores the resulting pointers back on exit
     - the syms and nbits stack slots are reused as loop state and are
       therefore modified

   Register roles inside the loop:
     %esi -> old metrics   %edi -> new metrics   %edx -> decisions
     %eax = table offset for the first symbol
     %ebx = table offset for the second symbol
*/
update_viterbi27_blk_mmx:
	pushl %ebp
	movl %esp,%ebp
	pushl %esi
	pushl %edi
	pushl %edx
	pushl %ebx

	movl 8(%ebp),%edx	# edx = vp
	testl %edx,%edx
	jnz  0f
	movl $-1,%eax		# vp == NULL: return -1 (immediate, hence the $)
	jmp  err		# no MMX state touched yet, so emms is skipped
0:	movl OLDMETRICS(%edx),%esi	# esi -> old metrics
	movl NEWMETRICS(%edx),%edi	# edi -> new metrics
	movl DP(%edx),%edx	# edx -> decisions

	# ---- top of the per-bit loop ----
1:	movl 16(%ebp),%eax	# eax = nbits
	decl %eax
	jl   2f			# went below zero: all bits processed
	movl %eax,16(%ebp)	# keep the remaining count in the argument slot

	movl 12(%ebp),%ebx	# ebx = syms
	movw (%ebx),%ax		# ax = second symbol : first symbol
	addl $2,%ebx
	movl %ebx,12(%ebp)	# keep the advanced pointer in the argument slot

	movb %ah,%bl		# bl = second symbol
	andl $255,%eax		# eax = first symbol, zero-extended
	andl $255,%ebx		# ebx = second symbol, zero-extended

	# Scale each symbol by 32 to form the byte offset of its table row
	# (a row is 4 groups of 8 bytes, see the 8*GROUP displacements below)
	shll $5,%eax
	shll $5,%ebx

	# Each invocation of this macro does 8 butterflies in parallel.
	# GROUP (0..3) selects which 8-byte slice of the tables and of the
	# old metrics is processed. Clobbers mm0-mm7.
	.macro butterfly GROUP
	# Compute branch metrics
	movq (Mettab27_1+8*\GROUP)(%eax),%mm3
	movq fifteens,%mm0

	paddb (Mettab27_2+8*\GROUP)(%ebx),%mm3
	paddb ones,%mm3  # emulate pavgb - this may not be necessary
	psrlq $1,%mm3	 # halve; this is a 64-bit shift, bits cross byte lanes
	pand %mm0,%mm3	 # keep the low 4 bits of every byte (drops crossed bits)

	movq (8*\GROUP)(%esi),%mm6	# Incoming path metric, high bit = 0
	movq ((8*\GROUP)+32)(%esi),%mm2 # Incoming path metric, high bit = 1
	movq %mm6,%mm1
	movq %mm2,%mm7

	paddb %mm3,%mm6
	paddb %mm3,%mm2
	pxor  %mm0,%mm3		 # invert branch metric (15 - metric)
	paddb %mm3,%mm7		 # path metric for inverted symbols
	paddb %mm3,%mm1

	# live registers 1 2 6 7
	# Compare mm6 and mm7;  mm1 and mm2
	pxor %mm3,%mm3		# mm3 = 0, the comparison reference
	movq %mm6,%mm4
	movq %mm1,%mm5
	psubb %mm7,%mm4		# mm4 = mm6 - mm7
	psubb %mm2,%mm5		# mm5 = mm1 - mm2
	pcmpgtb %mm3,%mm4	# mm4 = first set of decisions (ff = 1 better)
	pcmpgtb %mm3,%mm5	# mm5 = second set of decisions

	# live registers 1 2 4 5 6 7
	# select survivors: where the decision byte is ff take mm7/mm2,
	# elsewhere take mm6/mm1
	movq %mm4,%mm0
	pand %mm4,%mm7
	movq %mm5,%mm3
	pand %mm5,%mm2
	pandn %mm6,%mm0
	pandn %mm1,%mm3
	por %mm0,%mm7		# mm7 = first set of survivors
	por %mm3,%mm2		# mm2 = second set of survivors

	# live registers 2 4 5 7
	# interleave & store decisions in mm4, mm5
	# interleave & store new path metrics in mm2, mm7
	movq %mm4,%mm3
	movq %mm7,%mm0
	punpckhbw %mm5,%mm4	# decisions from the high 4 bytes
	punpcklbw %mm5,%mm3	# decisions from the low 4 bytes
	punpcklbw %mm2,%mm7	# new metrics from the low 4 bytes (stored first)
	punpckhbw %mm2,%mm0	# new metrics from the high 4 bytes (stored second)
	movq %mm4,(16*\GROUP+8)(%edx)
	movq %mm3,(16*\GROUP)(%edx)
	movq %mm7,(16*\GROUP)(%edi)
	movq %mm0,(16*\GROUP+8)(%edi)

	.endm

# invoke macro 4 times for a total of 32 butterflies
	butterfly GROUP=0
	butterfly GROUP=1
	butterfly GROUP=2
	butterfly GROUP=3

	addl $64,%edx		# bump decision pointer

	# swap metrics: the buffer just written becomes the old one
	movl %esi,%eax
	movl %edi,%esi
	movl %eax,%edi
	jmp 1b

	# ---- normal exit ----
2:	emms			# leave MMX state before returning to the caller
	movl 8(%ebp),%ebx	# ebx = vp
	# stash metric pointers
	movl %esi,OLDMETRICS(%ebx)
	movl %edi,NEWMETRICS(%ebx)
	movl %edx,DP(%ebx)	# stash incremented value of vp->dp
	xorl %eax,%eax		# return 0
	# ---- common epilogue; the NULL-vp path arrives with eax = -1 ----
err:	popl %ebx
	popl %edx
	popl %edi
	popl %esi
	popl %ebp
	ret
	.size update_viterbi27_blk_mmx,.-update_viterbi27_blk_mmx

	# Per-byte constants loaded by the butterfly code. They are only ever
	# read, so they are placed in the read-only data section.
	.section .rodata
	.align 8
fifteens:			# 0x0f in every byte: 4-bit mask / inversion value
	.byte 15,15,15,15,15,15,15,15

	.align 8
ones:				# 1 in every byte: rounding term added before the halving shift
	.byte 1,1,1,1,1,1,1,1