1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
|
/*
* Copyright (C) 2014 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* ENTRY(f): switch to the text section, align to 2^4 bytes, export the symbol
 * f, mark it as a function for the ELF symbol table and open its label.
 */
#define ENTRY(f) \
    .text; \
    .p2align 4; \
    .globl f; \
    .type f, %function; \
    f:
/* END(f): record the size of f as the distance from its label to this point. */
#define END(f) .size f, . - f;
/* lanepair dst, src0, src1, xr0, xr1, yr0, yr1, zr0, zr1
 *
 * Fetches the table entries surrounding two pixels ("lane 0" and "lane 1"),
 * interpolates them along Y, then Z, then X, and packs the two 4-byte results
 * into dst.
 *
 *   dst        destination vector operand.  For v20.16b and v21.16b the result
 *              is written to bytes 8-15 with uqrshrn2; for any other operand
 *              it is written to bytes 0-7 with uqrshrn.  uqrshrn clears the
 *              upper half of the register, so the .8b use of a register has
 *              to come before the .16b use of the same register.
 *   src0/src1  32-bit vector lanes holding each pixel's byte offset from x3.
 *   xr0/xr1    halfword lanes holding the X fraction of each pixel.
 *   yr0/yr1    byte lanes holding the Y fraction of each pixel.
 *   zr0/zr1    halfword lanes holding the Z fraction of each pixel.
 *
 * Every interpolation step computes a * (256 - f) + b * f, i.e. the fraction
 * f is a weight out of 256.
 *
 * Reads:     x3 (table base), x4 (byte step to the next Y row), x5 (byte step
 *            used to reach the next Z plane), v5 (must be all zero).
 * Clobbers:  x6, x7, v8-v19.
 */
.macro lanepair dst, src0, src1, xr0, xr1, yr0, yr1, zr0, zr1
/* x6/x7 = address of the base table entry for lane 0 / lane 1 (the offset is
 * sign-extended from 32 bits before the table base is added).
 */
smov x6, \src0
smov x7, \src1
add x6, x6, x3
add x7, x7, x3
/* Each 8-byte load picks up two neighbouring 4-byte entries (x and x+1).
 * v16/v17 = row y, then the pointers advance by x4 to row y+1.
 */
ld1 {v16.2s}, [x6], x4
ld1 {v17.2s}, [x7], x4
/* v18/v19 = row y+1, then the pointers advance by x5 to the rear plane. */
ld1 {v18.2s}, [x6], x5
ld1 {v19.2s}, [x7], x5
/* Broadcast each lane's Y fraction across all eight bytes. */
dup v8.8b, \yr0
dup v9.8b, \yr1
/* Y interpolate, front, lanes 0 and 1 -> v12 and v13 */
/* Interleaving with the zero vector v5 puts every table byte into the high
 * byte of a 16-bit element, i.e. a * 256.  Subtracting a * yr and adding
 * b * yr then yields a * (256 - yr) + b * yr.
 */
zip1 v12.16b, v5.16b, v16.16b
zip1 v13.16b, v5.16b, v17.16b
umlsl v12.8h, v16.8b, v8.8b
umlsl v13.8h, v17.8b, v9.8b
umlal v12.8h, v18.8b, v8.8b
umlal v13.8h, v19.8b, v9.8b
/* Rear plane: the pointers currently address row y+1, so load that row first,
 * then step back by x4 to load row y.
 */
ld1 {v18.2s}, [x6]
ld1 {v19.2s}, [x7]
sub x6, x6, x4
sub x7, x7, x4
ld1 {v16.2s}, [x6]
ld1 {v17.2s}, [x7]
/* Y interpolate, rear, lanes 0 and 1 -> v14 and v15 */
zip1 v14.16b, v5.16b, v16.16b
zip1 v15.16b, v5.16b, v17.16b
umlsl v14.8h, v16.8b, v8.8b
umlsl v15.8h, v17.8b, v9.8b
umlal v14.8h, v18.8b, v8.8b
umlal v15.8h, v19.8b, v9.8b
/* Z interpolate, lane 0 v12/v14 -> v10 */
/* Widen to 32 bits as front * 256, subtract front * zr, add rear * zr, then
 * round and narrow back to 16 bits (v8/v9 are reused as accumulators here).
 */
ushll v8.4s, v12.4h, #8
ushll2 v9.4s, v12.8h, #8
umlsl v8.4s, v12.4h, \zr0
umlsl2 v9.4s, v12.8h, \zr0
umlal v8.4s, v14.4h, \zr0
umlal2 v9.4s, v14.8h, \zr0
rshrn v10.4h, v8.4s, #8
rshrn2 v10.8h, v9.4s, #8
/* Z interpolate, lane 1 v13/v15 -> v11 */
ushll v8.4s, v13.4h, #8
ushll2 v9.4s, v13.8h, #8
umlsl v8.4s, v13.4h, \zr1
umlsl2 v9.4s, v13.8h, \zr1
umlal v8.4s, v15.4h, \zr1
umlal2 v9.4s, v15.8h, \zr1
rshrn v11.4h, v8.4s, #8
rshrn2 v11.8h, v9.4s, #8
/* X interpolate, lanes 0 and 1 v10,v11 -> v14 */
/* The low four halfwords of v10/v11 are the entry at x, the high four the
 * entry at x+1.  This step narrows with a truncating shift (shrn).
 */
ushll v8.4s, v10.4h, #8
ushll v9.4s, v11.4h, #8
umlsl v8.4s, v10.4h, \xr0
umlsl v9.4s, v11.4h, \xr1
umlal2 v8.4s, v10.8h, \xr0
umlal2 v9.4s, v11.8h, \xr1
shrn v14.4h, v8.4s, #8
shrn2 v14.8h, v9.4s, #8
/* pack lanes 0-1 -> dst */
/* Saturating, rounding narrow from 16-bit to 8-bit: four channels for each of
 * the two pixels, eight bytes in total.
 */
.ifc \dst, v20.16b
uqrshrn2 \dst, v14.8h, #8
.else ; .ifc \dst, v21.16b
uqrshrn2 \dst, v14.8h, #8
.else
uqrshrn \dst, v14.8h, #8
.endif ; .endif
.endm
/* void rsdIntrinsic3DLUT_K(
 *          void *dst,          // x0
 *          void const *in,     // x1
 *          size_t count,       // x2
 *          void const *lut,    // x3
 *          int32_t pitchy,     // w4
 *          int32_t pitchz,     // w5
 *          int dimx,           // w6
 *          int dimy,           // w7
 *          int dimz);          // [sp]
 *
 * Applies a 3D lookup table, with interpolation between neighbouring entries,
 * to count 4-byte pixels read from in and written to dst.
 *
 * Bytes 0, 1 and 2 of every input pixel are scaled by dimx, dimy and dimz to
 * form the X, Y and Z table coordinates (8-bit integer part, 8-bit fraction).
 * Byte 3 of the input pixel is copied to the output unchanged.
 *
 * The byte offset of a table entry is 4 * ix + pitchy * iy + pitchz * iz.
 *
 * Pixels are processed eight at a time.  A remainder of 1-7 pixels is loaded
 * and stored one pixel at a time so that no bytes beyond count pixels are
 * read from in or written to dst.  A count that is zero (or negative when
 * viewed as signed) returns without accessing in, dst or lut.
 *
 * d8-d15 are saved in a 64-byte stack frame and restored on exit.
 * NOTE(review): the coordinate maths is 16-bit; presumably the dims are small
 * enough that byte * dim cannot overflow -- confirm against the caller.
 */
ENTRY(rsdIntrinsic3DLUT_K)
            ldr         w8, [sp]                /* dimz, read before sp moves */
            stp         d8, d9, [sp, #-64]!
            stp         d10, d11, [sp, #16]
            stp         d12, d13, [sp, #32]
            stp         d14, d15, [sp, #48]

            /* pitchy/pitchz are 32-bit arguments, so bits 63:32 of x4/x5 are
             * unspecified on entry.  lanepair uses x4/x5 as 64-bit post-index
             * registers, so extend them to the full width first.
             */
            sxtw        x4, w4
            sxtw        x5, w5

            /* v4.h[0..2] = dimx, dimy, dimz; v4.s[2] = pitchy; v4.s[3] = pitchz */
            movi        v4.8b, #1
            ins         v4.h[0], w6
            ins         v4.h[1], w7
            ins         v4.h[2], w8
            ins         v4.s[2], w4
            ins         v4.s[3], w5
            /* v5 stays zero for the whole function; lanepair relies on it */
            movi        v5.16b, #0

            /* x2 = count - 8.  >= 0: at least one full block of 8 pixels.
             * <= -8: nothing to do.  Otherwise only a partial block of 1-7.
             */
            subs        x2, x2, #8
            bge         2f
            cmp         x2, #-8
            ble         9f
            b           4f

            .align 6
            /* Main loop: store the previous block, then load the next one. */
1:          st4         {v20.8b,v21.8b,v22.8b,v23.8b}, [x0], #32
/* x0  = dst
 * x1  = src
 * x2  = count
 * x3  = lut
 * x4  = pitchy
 * x5  = pitchz
 * x6  = offset0
 * x7  = offset1
 */
2:          ld4         {v0.8b-v3.8b}, [x1], #32
/* v0,v1,v2,v3 source data
 * v4 dimensions and pitches
 */
3:          uxtl        v0.8h, v0.8b
            uxtl        v1.8h, v1.8b
            uxtl        v2.8h, v2.8b
            mul         v0.8h, v0.8h, v4.h[0]
            mul         v1.8h, v1.8h, v4.h[1]
            mul         v2.8h, v2.8h, v4.h[2]

/* v += v >> 8, so that the later shift by 8 approximates a division by 255.
 * The rounding form (ursra) would be more accurate, but it can produce a
 * coordinate that lands exactly on the last table entry with a zero fraction.
 * The interpolation would then still read that entry and the one after it.
 * Strictly this is correct (the extra entry has zero weight), except for the
 * illegal access problem.
 */
            usra        v0.8h, v0.8h, #8
            usra        v1.8h, v1.8h, #8
            usra        v2.8h, v2.8h, #8

            /* Split into integer part (high byte) and fraction (low byte). */
            ushr        v12.8h, v0.8h, #8
            ushr        v13.8h, v1.8h, #8
            ushr        v14.8h, v2.8h, #8
            bic         v0.8h, #0xff, LSL #8
            xtn         v1.8b, v1.8h
            bic         v2.8h, #0xff, LSL #8

/* v0.8h,v1.8b,v2.8h fractional offset
 * v12.8h,v13.8h,v14.8h integer offset
 */

            /* offset = 4 * ix + pitchy * iy + pitchz * iz, in 32 bits */
            ushll       v6.4s, v12.4h, #2
            ushll2      v7.4s, v12.8h, #2
            uxtl        v8.4s, v13.4h
            uxtl2       v9.4s, v13.8h
            uxtl        v10.4s, v14.4h
            uxtl2       v11.4s, v14.8h
            mla         v6.4s, v8.4s, v4.s[2]
            mla         v7.4s, v9.4s, v4.s[2]
            mla         v6.4s, v10.4s, v4.s[3]
            mla         v7.4s, v11.4s, v4.s[3]

/* v6,v7 list of table offsets */

            /* lanes 0 and 1 */
            lanepair dst=v20.8b, src0=v6.s[0], src1=v6.s[1], xr0=v0.h[0], xr1=v0.h[1], yr0=v1.b[0], yr1=v1.b[1], zr0=v2.h[0], zr1=v2.h[1]

            /* lanes 2 and 3 */
            lanepair dst=v20.16b, src0=v6.s[2], src1=v6.s[3], xr0=v0.h[2], xr1=v0.h[3], yr0=v1.b[2], yr1=v1.b[3], zr0=v2.h[2], zr1=v2.h[3]

            /* lanes 4 and 5 */
            lanepair dst=v21.8b, src0=v7.s[0], src1=v7.s[1], xr0=v0.h[4], xr1=v0.h[5], yr0=v1.b[4], yr1=v1.b[5], zr0=v2.h[4], zr1=v2.h[5]

            /* lanes 6 and 7 */
            lanepair dst=v21.16b, src0=v7.s[2], src1=v7.s[3], xr0=v0.h[6], xr1=v0.h[7], yr0=v1.b[6], yr1=v1.b[7], zr0=v2.h[6], zr1=v2.h[7]

            /* v20/v21 hold eight interleaved 4-byte results.  Two rounds of
             * unzipping separate them into one vector per byte position
             * (v20, v21, v22) ready for the interleaving st4.
             */
            uzp1        v6.16b, v20.16b, v21.16b
            uzp2        v7.16b, v20.16b, v21.16b
            uzp1        v20.16b, v6.16b, v7.16b
            uzp2        v22.16b, v6.16b, v7.16b
            mov         v21.d[0], v20.d[1]

            subs        x2, x2, #8
            /* byte 3 of every pixel passes through from the input */
            mov         v23.8b, v3.8b

            bge         1b

            /* x2 < -8 here means the block just computed was a partial one. */
            cmp         x2, #-8
            blt         1f

            st4         {v20.8b,v21.8b,v22.8b,v23.8b}, [x0], #32

            /* st4 leaves the flags alone: x2 == -8 means no pixels remain. */
            beq         9f

            /* fill the vector with a safe value */
            /* Replicate the next pixel into all eight lanes so that unused
             * lanes still produce in-range table offsets, then load the
             * remaining 1-7 pixels individually.  The low three bits of x2
             * equal the number of pixels left.
             */
4:          ld4r        {v0.8b-v3.8b}, [x1]
            tbz         x2, #2, 2f
            ld4         {v0.b-v3.b}[0], [x1], #4
            ld4         {v0.b-v3.b}[1], [x1], #4
            ld4         {v0.b-v3.b}[2], [x1], #4
            ld4         {v0.b-v3.b}[3], [x1], #4
2:          tbz         x2, #1, 2f
            ld4         {v0.b-v3.b}[4], [x1], #4
            ld4         {v0.b-v3.b}[5], [x1], #4
2:          tbz         x2, #0, 2f
            ld4         {v0.b-v3.b}[6], [x1], #4
2:          b           3b

            /* Partial store, mirroring the partial load above. */
1:          tst         x2, #4
            beq         2f
            st4         {v20.b-v23.b}[0], [x0], #4
            st4         {v20.b-v23.b}[1], [x0], #4
            st4         {v20.b-v23.b}[2], [x0], #4
            st4         {v20.b-v23.b}[3], [x0], #4
2:          tst         x2, #2
            beq         2f
            st4         {v20.b-v23.b}[4], [x0], #4
            st4         {v20.b-v23.b}[5], [x0], #4
2:          tst         x2, #1
            beq         9f
            st4         {v20.b-v23.b}[6], [x0], #4

            /* Restore the callee-saved SIMD registers and return. */
9:          ldp         d14, d15, [sp, #48]
            ldp         d12, d13, [sp, #32]
            ldp         d10, d11, [sp, #16]
            ldp         d8, d9, [sp], #64
            ret
END(rsdIntrinsic3DLUT_K)
|