cpu_ref/rsCpuIntrinsics_advsimd_3DLUT.S


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250

/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* ENTRY(f): select the .text section, apply ".align 4", export f as a global
 * symbol typed as a function, and define the label f at the current location.
 * The statements are joined with ';' because the whole expansion lands on a
 * single assembler line.
 */
#define ENTRY(f) \
            .text; \
            .align 4; \
            .globl f; \
            .type f,#function; \
            f:

/* END(f): record the size of f as the distance from its label to here. */
#define END(f) \
            .size f, .-f;


/* lanepair: interpolate two pixels out of the lookup table and pack the two
 * 4-byte results into one 64-bit half of a destination vector.
 *
 * Operands:
 *   dst        register written by the final saturating narrow.  When it is
 *              spelled exactly v20.16b or v21.16b the upper half is written
 *              (uqrshrn2); any other spelling is written with uqrshrn.
 *   src0,src1  vector lanes holding the table byte offset of each pixel; they
 *              are read with smov (sign-extended) and added to x3.
 *   xr0,xr1    halfword lanes used as the by-element x weights.
 *   yr0,yr1    byte lanes holding the y weights (broadcast with dup).
 *   zr0,zr1    halfword lanes used as the by-element z weights.
 *
 * Register expectations (set up by the code that invokes the macro):
 *   x3 = table base address
 *   x4 = address step applied after the first load  (pitchy in the caller)
 *   x5 = address step applied after the second load (pitchz in the caller)
 *   v5 = all zeroes; zip1 against it turns each table byte b into the
 *        halfword b << 8
 *
 * Clobbers: x6, x7, v8-v19.
 *
 * Each 8-byte load fetches two adjacent 4-byte table entries (x and x+1), so
 * the four loads per pixel cover the eight entries surrounding the sample.
 * Every blend has the form  a * 256 - a * w + b * w, a linear interpolation
 * between a and b with weight w scaled by 256.
 */
.macro lanepair dst, src0, src1, xr0, xr1, yr0, yr1, zr0, zr1

            /* x6/x7 = address of the first table entry for each pixel */
            smov        x6, \src0
            smov        x7, \src1

            add         x6, x6, x3
            add         x7, x7, x3

            /* v16/v17 = entries at the base address; then step by x4 */
            ld1         {v16.2s}, [x6], x4
            ld1         {v17.2s}, [x7], x4

            /* v18/v19 = entries at base + x4; then step by x5 */
            ld1         {v18.2s}, [x6], x5
            ld1         {v19.2s}, [x7], x5

            dup         v8.8b, \yr0
            dup         v9.8b, \yr1
            /* Y interpolate, front, lanes 0 and 1 -> v12 and v13 */
            zip1        v12.16b, v5.16b, v16.16b
            zip1        v13.16b, v5.16b, v17.16b
            umlsl       v12.8h, v16.8b, v8.8b
            umlsl       v13.8h, v17.8b, v9.8b
            umlal       v12.8h, v18.8b, v8.8b
            umlal       v13.8h, v19.8b, v9.8b

            /* x6/x7 are now at base + x4 + x5: load the rear row there... */
            ld1         {v18.2s}, [x6]
            ld1         {v19.2s}, [x7]

            /* ...then step back by x4 to base + x5 for its partner row */
            sub         x6, x6, x4
            sub         x7, x7, x4

            ld1         {v16.2s}, [x6]
            ld1         {v17.2s}, [x7]

            /* Y interpolate, rear, lanes 0 and 1 -> v14 and v15 */
            zip1        v14.16b, v5.16b, v16.16b
            zip1        v15.16b, v5.16b, v17.16b
            umlsl       v14.8h, v16.8b, v8.8b
            umlsl       v15.8h, v17.8b, v9.8b
            umlal       v14.8h, v18.8b, v8.8b
            umlal       v15.8h, v19.8b, v9.8b

            /* Z interpolate, lane 0 v12/v14 -> v10 */
            /* widened to 32 bits, blended, then rounded back down to 16 bits */
            ushll       v8.4s, v12.4h, #8
            ushll2      v9.4s, v12.8h, #8
            umlsl       v8.4s, v12.4h, \zr0
            umlsl2      v9.4s, v12.8h, \zr0
            umlal       v8.4s, v14.4h, \zr0
            umlal2      v9.4s, v14.8h, \zr0
            rshrn       v10.4h, v8.4s, #8
            rshrn2      v10.8h, v9.4s, #8

            /* Z interpolate, lane 1 v13/v15 -> v11 */
            ushll       v8.4s, v13.4h, #8
            ushll2      v9.4s, v13.8h, #8
            umlsl       v8.4s, v13.4h, \zr1
            umlsl2      v9.4s, v13.8h, \zr1
            umlal       v8.4s, v15.4h, \zr1
            umlal2      v9.4s, v15.8h, \zr1
            rshrn       v11.4h, v8.4s, #8
            rshrn2      v11.8h, v9.4s, #8

            /* X interpolate, lanes 0 and 1 v10,v11 -> v14 */
            /* the low half of v10/v11 is the entry at x, the high half the
             * entry at x+1; the narrowing shift here truncates (shrn) */
            ushll       v8.4s, v10.4h, #8
            ushll       v9.4s, v11.4h, #8
            umlsl       v8.4s, v10.4h, \xr0
            umlsl       v9.4s, v11.4h, \xr1
            umlal2      v8.4s, v10.8h, \xr0
            umlal2      v9.4s, v11.8h, \xr1
            shrn        v14.4h, v8.4s, #8
            shrn2       v14.8h, v9.4s, #8

            /* pack both pixels (4 bytes each) into the dst operand with a
             * saturating, rounding narrow; the .16b spellings select the
             * upper-half form so the lower half written earlier is kept */
.ifc \dst, v20.16b
            uqrshrn2    \dst, v14.8h, #8
.else ; .ifc \dst, v21.16b
            uqrshrn2    \dst, v14.8h, #8
.else
            uqrshrn     \dst, v14.8h, #8
.endif ; .endif
.endm

/* void rsdIntrinsic3DLUT_K(
 *          void *dst,          // x0
 *          void const *in,     // x1
 *          size_t count,       // x2
 *          void const *lut,    // x3
 *          int32_t pitchy,     // w4
 *          int32_t pitchz,     // w5
 *          int dimx,           // w6
 *          int dimy,           // w7
 *          int dimz);          // [sp]
 *
 * Maps count 4-byte pixels from in to dst through the 3D table at lut.
 * Bytes 0, 1 and 2 of each pixel are multiplied by dimx, dimy and dimz to form
 * fixed-point table coordinates (integer part in the high byte, fraction in
 * the low byte); the surrounding table entries are blended by the lanepair
 * macro.  Byte 3 of each pixel is copied from the source unchanged.
 *
 * Pixels are processed eight at a time.  A final group of 1..7 pixels is
 * loaded and stored lane by lane, so only count pixels are read from in and
 * written to dst.  count == 0 returns without touching memory.
 *
 * NOTE(review): the table reads always fetch the entry at index + 1 as well,
 * so dimx/dimy/dimz presumably have to be chosen by the caller such that the
 * largest index still has a valid neighbour -- confirm against the caller.
 *
 * ABI: AAPCS64.  d8-d15 are callee-saved and are clobbered by lanepair, so
 * they are spilled to a 64-byte stack frame.  x6/x7 are reused as scratch once
 * the dimensions have been copied into v4.
 */
ENTRY(rsdIntrinsic3DLUT_K)
            ldr         w8, [sp]
            /* pitchy/pitchz are 32-bit arguments, so the upper halves of x4
             * and x5 are unspecified on entry.  lanepair uses the full 64-bit
             * registers for address arithmetic, so widen them here.  Sign
             * extension matches the smov used for the table offsets.
             */
            sxtw        x4, w4
            sxtw        x5, w5
            stp         d8, d9, [sp, #-64]!
            stp         d10, d11, [sp, #16]
            stp         d12, d13, [sp, #32]
            stp         d14, d15, [sp, #48]
            /* v4 = { dimx, dimy, dimz (halfwords 0-2), pitchy, pitchz (words 2-3) } */
            movi        v4.8b, #1
            ins         v4.h[0], w6
            ins         v4.h[1], w7
            ins         v4.h[2], w8
            ins         v4.s[2], w4
            ins         v4.s[3], w5
            /* v5 = 0, required by lanepair */
            movi        v5.16b, #0

            /* x2 = count - 8; from here on it is the remaining count biased by -8 */
            subs        x2, x2, #8
            bge         2f              /* at least eight pixels: full load */
            cmp         x2, #-8
            ble         9f              /* count == 0: nothing to do */
            b           4f              /* 1..7 pixels: partial load */

            .align 6
            /* main loop: store the previous group, then fall into the next load */
1:          st4         {v20.8b,v21.8b,v22.8b,v23.8b}, [x0], #32
/* x0  = dst
 * x1  = src
 * x2  = count
 * x3  = lut
 * x4  = pitchy
 * x5  = pitchz
 * x6 = offset0
 * x7 = offset1
 */
2:          ld4         {v0.8b-v3.8b}, [x1], #32
/* v0,v1,v2,v3 source data
 * v4 dimensions and pitches
 */
3:          uxtl        v0.8h, v0.8b
            uxtl        v1.8h, v1.8b
            uxtl        v2.8h, v2.8b
            mul         v0.8h, v0.8h, v4.h[0]
            mul         v1.8h, v1.8h, v4.h[1]
            mul         v2.8h, v2.8h, v4.h[2]

/* ursra below would be more accurate, but this can result in a dim.0 case
 * where we try to read from the limit of the array and the limit +1 to
 * interpolate, even though the fractional component is zero.  Strictly this is
 * correct, except for the illegal access problem.
 */
            usra        v0.8h, v0.8h, #8
            usra        v1.8h, v1.8h, #8
            usra        v2.8h, v2.8h, #8

            /* split each coordinate: high byte -> integer index (v12-v14),
             * low byte -> interpolation fraction (v0, v1, v2) */
            ushr        v12.8h, v0.8h, #8
            ushr        v13.8h, v1.8h, #8
            ushr        v14.8h, v2.8h, #8
            bic         v0.8h, #0xff, LSL #8
            xtn         v1.8b, v1.8h
            bic         v2.8h, #0xff, LSL #8

/* v0.8h,v1.8b,v2.8h fractional offset
 * v12.8h,v13.8h,v14.8h integer offset
 */

            /* byte offset = x * 4 + y * pitchy + z * pitchz, 32 bits per pixel */
            ushll       v6.4s, v12.4h, #2
            ushll2      v7.4s, v12.8h, #2
            uxtl        v8.4s, v13.4h
            uxtl2       v9.4s, v13.8h
            uxtl        v10.4s, v14.4h
            uxtl2       v11.4s, v14.8h
            mla         v6.4s, v8.4s,  v4.s[2]
            mla         v7.4s, v9.4s,  v4.s[2]
            mla         v6.4s, v10.4s, v4.s[3]
            mla         v7.4s, v11.4s, v4.s[3]

/* v6,v7 list of table offsets */

        /* lanes 0 and 1 */
            lanepair    dst=v20.8b,  src0=v6.s[0], src1=v6.s[1], xr0=v0.h[0], xr1=v0.h[1], yr0=v1.b[0], yr1=v1.b[1], zr0=v2.h[0], zr1=v2.h[1]

        /* lanes 2 and 3 */
            lanepair    dst=v20.16b, src0=v6.s[2], src1=v6.s[3], xr0=v0.h[2], xr1=v0.h[3], yr0=v1.b[2], yr1=v1.b[3], zr0=v2.h[2], zr1=v2.h[3]

        /* lanes 4 and 5 */
            lanepair    dst=v21.8b,  src0=v7.s[0], src1=v7.s[1], xr0=v0.h[4], xr1=v0.h[5], yr0=v1.b[4], yr1=v1.b[5], zr0=v2.h[4], zr1=v2.h[5]

        /* lanes 6 and 7 */
            lanepair    dst=v21.16b, src0=v7.s[2], src1=v7.s[3], xr0=v0.h[6], xr1=v0.h[7], yr0=v1.b[6], yr1=v1.b[7], zr0=v2.h[6], zr1=v2.h[7]

            /* v20:v21 hold eight interleaved 4-byte pixels; two rounds of
             * unzipping separate them into per-channel vectors for st4:
             * v20 = channel 0, v21 = channel 1, v22 = channel 2 */
            uzp1        v6.16b, v20.16b, v21.16b
            uzp2        v7.16b, v20.16b, v21.16b
            uzp1        v20.16b, v6.16b, v7.16b
            uzp2        v22.16b, v6.16b, v7.16b
            mov         v21.d[0], v20.d[1]

            subs        x2, x2, #8
            /* channel 3 is passed through from the source */
            mov         v23.8b, v3.8b

            bge         1b              /* another full group follows */

            cmp         x2, #-8
            blt         1f              /* this group was partial: lane stores */

            /* st4 leaves the flags from the cmp above intact for the beq */
            st4         {v20.8b,v21.8b,v22.8b,v23.8b}, [x0], #32
            beq         9f              /* exactly finished */

            /* fill the vector  with a safe value */
            /* (the first remaining pixel is replicated to every lane, then the
             * low three bits of x2 select how many real pixels are loaded) */
4:          ld4r        {v0.8b-v3.8b}, [x1]
            tbz         x2, #2, 2f
            ld4         {v0.b-v3.b}[0], [x1], #4
            ld4         {v0.b-v3.b}[1], [x1], #4
            ld4         {v0.b-v3.b}[2], [x1], #4
            ld4         {v0.b-v3.b}[3], [x1], #4
2:          tbz         x2, #1, 2f
            ld4         {v0.b-v3.b}[4], [x1], #4
            ld4         {v0.b-v3.b}[5], [x1], #4
2:          tbz         x2, #0, 2f
            ld4         {v0.b-v3.b}[6], [x1], #4
2:          b           3b

            /* partial store: mirrors the lane layout of the partial load */
1:          tst         x2, #4
            beq         2f
            st4         {v20.b-v23.b}[0], [x0], #4
            st4         {v20.b-v23.b}[1], [x0], #4
            st4         {v20.b-v23.b}[2], [x0], #4
            st4         {v20.b-v23.b}[3], [x0], #4
2:          tst         x2, #2
            beq         2f
            st4         {v20.b-v23.b}[4], [x0], #4
            st4         {v20.b-v23.b}[5], [x0], #4
2:          tst         x2, #1
            beq         9f
            st4         {v20.b-v23.b}[6], [x0], #4

            /* restore callee-saved SIMD registers and release the frame */
9:          ldp         d14, d15, [sp, #48]
            ldp         d12, d13, [sp, #32]
            ldp         d10, d11, [sp, #16]
            ldp         d8, d9, [sp], #64
            ret
END(rsdIntrinsic3DLUT_K)