Diffstat (limited to 'renderscript-toolkit/src/main/cpp/Convolve_neon.S')
-rw-r--r--  renderscript-toolkit/src/main/cpp/Convolve_neon.S  287
1 file changed, 287 insertions(+), 0 deletions(-)
diff --git a/renderscript-toolkit/src/main/cpp/Convolve_neon.S b/renderscript-toolkit/src/main/cpp/Convolve_neon.S
new file mode 100644
index 0000000..ee10884
--- /dev/null
+++ b/renderscript-toolkit/src/main/cpp/Convolve_neon.S
@@ -0,0 +1,287 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ r0       = dst
+ r1       = y0 base pointer
+ r2       = y1 base pointer
+ r3       = y2 base pointer
+ [sp]     = coeffs pointer
+ [sp, #4] = count (length / 2; two output pixels are produced per iteration)
+*/
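+
+/*
+  For reference, a hedged sketch of the C-side declaration this kernel is
+  expected to match (types and parameter names are illustrative, not taken
+  from this file):
+
+    extern "C" void rsdIntrinsicConvolve3x3_K(
+        void* dst, const void* y0, const void* y1, const void* y2,
+        const int16_t* coef, uint32_t count);
+
+  dst receives RGBA8888 output, y0-y2 point at the three source rows, coef
+  holds the nine 16-bit fixed-point coefficients, and count is decremented
+  once per loop iteration below.
+*/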
+
+#define ENTRY(f) .text; .align 0; .globl f; .type f,#function; f: .fnstart
+#define END(f) .fnend; .size f, .-f;
+
+ENTRY(rsdIntrinsicConvolve3x3_K)
+ push {r4-r8, r10, r11, lr}
+ vpush {q4-q7}
+
+ /* Get the coeffs pointer from the stack and load the nine
+ coefficients into the q0, q1 NEON registers */
+ ldr r4, [sp, #32+64]
+ vld1.16 {q0, q1}, [r4] @ taps 0-7 in d0-d1, tap 8 in d2[0]
+
+ /* Get count from the stack */
+ ldr r4, [sp, #36+64]
+
+ /* Load the frequently used immediate into a register */
+ mov r5, #8 @ advance two RGBA pixels per row per iteration
+
+1:
+ /* Load 16 bytes from each source row and post-increment the addresses by r5 = #8 */
+ vld1.8 {q13}, [r1], r5
+ vld1.8 {q14}, [r2], r5
+ vld1.8 {q15}, [r3], r5
+
+ /* Prefetch the data that will be needed in the iteration after the next */
+ pld [r1, r5]
+ pld [r2, r5]
+ pld [r3, r5]
+
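+ /* Expand the 8-bit channels to 16 bit: one RGBA pixel per d register afterwards */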
+ vmovl.u8 q2, d26
+ vmovl.u8 q3, d27
+ vmovl.u8 q4, d28
+ vmovl.u8 q5, d29
+ vmovl.u8 q6, d30
+ vmovl.u8 q7, d31
+
+/*
+ Each d register now holds one RGBA pixel expanded to 4 x 16 bit.
+ The source window for the two output pixels is:
+ d4,  d5,  d6,  d7      row y0
+ d8,  d9,  d10, d11     row y1
+ d12, d13, d14, d15     row y2
+*/
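+
+/*
+  Scalar sketch (illustrative C only, not part of this file) of what the
+  vmull/vmlal chain below computes for the first output pixel; the second
+  output pixel uses the same taps shifted right by one source pixel:
+
+    int32_t acc[4];                       // one accumulator per RGBA channel
+    for (int c = 0; c < 4; c++) {
+        acc[c] = coef[0]*row0[0][c] + coef[1]*row0[1][c] + coef[2]*row0[2][c]
+               + coef[3]*row1[0][c] + coef[4]*row1[1][c] + coef[5]*row1[2][c]
+               + coef[6]*row2[0][c] + coef[7]*row2[1][c] + coef[8]*row2[2][c];
+    }
+*/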
+
+ vmull.s16 q8, d4, d0[0]
+ vmlal.s16 q8, d5, d0[1]
+ vmlal.s16 q8, d6, d0[2]
+ vmlal.s16 q8, d8, d0[3]
+ vmlal.s16 q8, d9, d1[0]
+ vmlal.s16 q8, d10, d1[1]
+ vmlal.s16 q8, d12, d1[2]
+ vmlal.s16 q8, d13, d1[3]
+ vmlal.s16 q8, d14, d2[0]
+
+ vmull.s16 q9, d5, d0[0]
+ vmlal.s16 q9, d6, d0[1]
+ vmlal.s16 q9, d7, d0[2]
+ vmlal.s16 q9, d9, d0[3]
+ vmlal.s16 q9, d10, d1[0]
+ vmlal.s16 q9, d11, d1[1]
+ vmlal.s16 q9, d13, d1[2]
+ vmlal.s16 q9, d14, d1[3]
+ vmlal.s16 q9, d15, d2[0]
+
+ /* Narrow 32 -> 16 bit, shifting out the 8 fractional coefficient bits */
+ vshrn.i32 d16, q8, #8
+ vshrn.i32 d17, q9, #8
+
+ /* Saturate 16 -> 8 bit and store the two output pixels, post-incrementing r0 */
+ vqmovun.s16 d16, q8
+ vst1.8 d16, [r0]!
+
+ /* Are we done yet? */
+ subs r4, r4, #1
+ bne 1b
+
+ /* We're done, bye! */
+ vpop {q4-q7}
+ pop {r4-r8, r10, r11, lr}
+ bx lr
+END(rsdIntrinsicConvolve3x3_K)
+
+
+/* Convolve 5x5 */
+
+/*
+ r0 = dst
+ r1 = y0 base pointer
+ r2 = y1 base pointer
+ r3 = y2 base pointer
+ The y3 base pointer, y4 base pointer, coeffs pointer and count are passed on
+ the stack ([sp], [sp, #4], [sp, #8], [sp, #12] at entry) and loaded below.
+ One loop iteration produces two output pixels.
+*/
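+
+/*
+  Hedged sketch of the matching C-side declaration (types and parameter names
+  are illustrative, not taken from this file):
+
+    extern "C" void rsdIntrinsicConvolve5x5_K(
+        void* dst, const void* y0, const void* y1, const void* y2,
+        const void* y3, const void* y4, const int16_t* coef, uint32_t count);
+*/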
+ENTRY(rsdIntrinsicConvolve5x5_K)
+ push {r4-r7, lr}
+ vpush {q4-q7}
+
+ /* Load y3 into r4 */
+ ldr r4, [sp, #20 + 64]
+
+ /* Load y4 into r5 */
+ ldr r5, [sp, #24 + 64]
+
+ /* Load the coefficients pointer */
+ ldr r6, [sp, #28 + 64]
+
+ /* Load the 25 coefficients into d0-d6 */
+ vld1.16 {d0, d1, d2, d3}, [r6]! @ coefficients 0-15
+ vld1.16 {d4, d5, d6}, [r6] @ coefficients 16-24 (remaining lanes unused)
+
+ vmov.u32 q15, #0x7f @ 0x7f bias added to the accumulators before the final shift
+
+ /* load the count */
+ ldr r6, [sp, #32 + 64]
+
+ /* Load the frequently used immediate into a register */
+ mov r7, #8 @ advance two RGBA pixels per row per iteration
+
+1:
+ /* Load 24 bytes from each row into d24-d29 and post-increment the addresses by r7 = #8 */
+ vld1.8 {d24, d25, d26}, [r1], r7 @ y0 ( y - 2 )
+ vld1.8 {d27, d28, d29}, [r2], r7 @ y1 ( y - 1 )
+
+ /* Prefetch the data that will be needed in the iteration after the next */
+ pld [r1, r7]
+ pld [r2, r7]
+
+ /* Promote the 8-bit channels to 16 bit */
+ vmovl.u8 q9, d24
+ vmovl.u8 q10, d25
+ vmovl.u8 q11, d26
+ vmovl.u8 q12, d27
+ vmovl.u8 q13, d28
+ vmovl.u8 q14, d29
+
+/*
+ Each d register now holds one RGBA pixel expanded to 4 x 16 bit:
+ d18, d19, d20, d21, d22, d23   row y0 ( y - 2 )
+ d24, d25, d26, d27, d28, d29   row y1 ( y - 1 )
+*/
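+
+/*
+  Full 5x5 accumulation for one output pixel, as an illustrative scalar C
+  sketch (the code below performs it incrementally, two source rows per
+  block, for two output pixels at a time; names are not from this file):
+
+    int32_t acc[4] = {0, 0, 0, 0};        // one accumulator per RGBA channel
+    for (int r = 0; r < 5; r++)
+        for (int x = 0; x < 5; x++)
+            for (int c = 0; c < 4; c++)
+                acc[c] += coef[r * 5 + x] * row[r][x][c];
+*/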
+ vmull.s16 q4, d18, d0[0]
+ vmlal.s16 q4, d19, d0[1]
+ vmlal.s16 q4, d20, d0[2]
+ vmlal.s16 q4, d21, d0[3]
+ vmlal.s16 q4, d22, d1[0]
+
+ vmlal.s16 q4, d24, d1[1]
+ vmlal.s16 q4, d25, d1[2]
+ vmlal.s16 q4, d26, d1[3]
+ vmlal.s16 q4, d27, d2[0]
+ vmlal.s16 q4, d28, d2[1]
+
+ vmull.s16 q5, d19, d0[0]
+ vmlal.s16 q5, d20, d0[1]
+ vmlal.s16 q5, d21, d0[2]
+ vmlal.s16 q5, d22, d0[3]
+ vmlal.s16 q5, d23, d1[0]
+
+ vmlal.s16 q5, d25, d1[1]
+ vmlal.s16 q5, d26, d1[2]
+ vmlal.s16 q5, d27, d1[3]
+ vmlal.s16 q5, d28, d2[0]
+ vmlal.s16 q5, d29, d2[1]
+
+
+ /* Next 2 rows */
+ /* Load 24 bytes from each row into d24-d29 and post-increment the addresses by r7 = #8 */
+ vld1.8 {d24, d25, d26}, [r3], r7 @ y2 ( y )
+ vld1.8 {d27, d28, d29}, [r4], r7 @ y3 ( y + 1 )
+
+ /* Prefetch the data that will be needed in the iteration after the next */
+ pld [r3, r7]
+ pld [r4, r7]
+
+ /* Promote the 8-bit channels to 16 bit */
+ vmovl.u8 q9, d24
+ vmovl.u8 q10, d25
+ vmovl.u8 q11, d26
+ vmovl.u8 q12, d27
+ vmovl.u8 q13, d28
+ vmovl.u8 q14, d29
+
+/*
+ Each d register now holds one RGBA pixel expanded to 4 x 16 bit:
+ d18, d19, d20, d21, d22, d23   row y2 ( y )
+ d24, d25, d26, d27, d28, d29   row y3 ( y + 1 )
+*/
+ vmlal.s16 q4, d18, d2[2]
+ vmlal.s16 q4, d19, d2[3]
+ vmlal.s16 q4, d20, d3[0]
+ vmlal.s16 q4, d21, d3[1]
+ vmlal.s16 q4, d22, d3[2]
+
+ vmlal.s16 q4, d24, d3[3]
+ vmlal.s16 q4, d25, d4[0]
+ vmlal.s16 q4, d26, d4[1]
+ vmlal.s16 q4, d27, d4[2]
+ vmlal.s16 q4, d28, d4[3]
+
+ vmlal.s16 q5, d19, d2[2]
+ vmlal.s16 q5, d20, d2[3]
+ vmlal.s16 q5, d21, d3[0]
+ vmlal.s16 q5, d22, d3[1]
+ vmlal.s16 q5, d23, d3[2]
+
+ vmlal.s16 q5, d25, d3[3]
+ vmlal.s16 q5, d26, d4[0]
+ vmlal.s16 q5, d27, d4[1]
+ vmlal.s16 q5, d28, d4[2]
+ vmlal.s16 q5, d29, d4[3]
+
+ /* Last row */
+ /* Load 24 bytes into d24-d26 and post-increment the address by r7 = #8 */
+ vld1.8 {d24, d25, d26}, [r5], r7 @ y4 ( y + 2 )
+
+ /* Prefetch the data that will be needed in the iteration after the next */
+ pld [r5, r7]
+
+ /* Promote the 8-bit channels to 16 bit */
+ vmovl.u8 q9, d24
+ vmovl.u8 q10, d25
+ vmovl.u8 q11, d26
+
+/*
+ Each d register now holds one RGBA pixel expanded to 4 x 16 bit:
+ d18, d19, d20, d21, d22, d23   row y4 ( y + 2 )
+*/
+
+ vmlal.s16 q4, d18, d5[0]
+ vmlal.s16 q4, d19, d5[1]
+ vmlal.s16 q4, d20, d5[2]
+ vmlal.s16 q4, d21, d5[3]
+ vmlal.s16 q4, d22, d6[0]
+
+ vmlal.s16 q5, d19, d5[0]
+ vmlal.s16 q5, d20, d5[1]
+ vmlal.s16 q5, d21, d5[2]
+ vmlal.s16 q5, d22, d5[3]
+ vmlal.s16 q5, d23, d6[0]
+
+
+ /* Add the 0x7f bias to the accumulators before the narrowing shift */
+ vadd.i32 q4, q4, q15
+ vadd.i32 q5, q5, q15
+
+/* Narrow 32 -> 16 bit into d registers, shifting out the 8 fractional coefficient bits */
+ vrshrn.i32 d8, q4, #8
+ vrshrn.i32 d9, q5, #8
+
+
+/* Pack 16 -> 8 bit, saturate, put two pixels into D reg */
+ vqmovun.s16 d8, q4
+
+ vst1.8 d8, [r0]! @ store the two output pixels and post-increment r0
+
+ /* Are we done? */
+ subs r6, r6, #1
+ bne 1b
+
+ /* Yup, bye */
+ vpop {q4-q7}
+ pop {r4-r7, lr}
+ bx lr
+
+END(rsdIntrinsicConvolve5x5_K)
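+
+/*
+  Hedged usage sketch (illustrative C only; the variable names, edge handling
+  and the exact meaning of the count argument are assumptions, not taken from
+  this file). A caller is expected to invoke the kernel once per output row,
+  passing the vertically clamped neighbour rows and handling the horizontal
+  borders separately in C; src and dst are uint8_t* RGBA8888 buffers, stride
+  is in bytes, coef is the int16_t coefficient array:
+
+    for (uint32_t y = 0; y < height; y++) {
+        const uint8_t* y0 = src + (y == 0 ? 0 : y - 1) * stride;
+        const uint8_t* y1 = src + y * stride;
+        const uint8_t* y2 = src + (y + 1 < height ? y + 1 : height - 1) * stride;
+        rsdIntrinsicConvolve3x3_K(dst + y * stride, y0, y1, y2, coef, width / 2);
+    }
+*/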