1 files changed, 287 insertions, 0 deletions
diff --git a/renderscript-toolkit/src/main/cpp/Convolve_neon.S b/renderscript-toolkit/src/main/cpp/Convolve_neon.S
new file mode 100644
index 0000000..ee10884
--- /dev/null
+++ b/renderscript-toolkit/src/main/cpp/Convolve_neon.S
@@ -0,0 +1,287 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
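+/*
+        r0 = dst
+        r1 = y0 base pointer
+        r2 = y1 base pointer
+        r3 = y2 base pointer
+        [sp, #0] = coeffs pointer (16 halfwords are read from it)
+        [sp, #4] = length / 2 (loop count; 8 output bytes are stored per iteration)
+*/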
+
+#define ENTRY(f) .text; .align 0; .globl f; .type f,#function; f: .fnstart /* open f: .text section, exported function symbol, label, start of unwind region */
+#define END(f) .fnend; .size f, .-f; /* close f: end of unwind region, then record the symbol size as (here - f) */
+
+ENTRY(rsdIntrinsicConvolve3x3_K)
+        push            {r4-r8, r10, r11, lr}   /* 8 regs = 32 bytes; only r4, r5 are used below */
+        vpush           {q4-q7}                 /* 64 bytes; q4-q7 hold widened source bytes below */
+
+        /* The stack arguments now sit 32+64 bytes above sp. Get the coeffs pointer
+           and load 16 halfwords into q0, q1 (only d0[0]..d2[0] are used below) */
+        ldr r4, [sp, #32+64]
+        vld1.16 {q0, q1}, [r4]
+
+        /* Get count from the stack. The loop below tests its counter only after
+           the first store, so a zero count has to skip it */
+        ldr r4, [sp, #36+64]
+        cmp r4, #0
+        beq 2f
+
+        /* Load the frequently used immediate in a register */
+        mov r5, #8
+
+1:
+        /* Load 16 bytes from each row, but post-increase the address by r5=#8 only */
+        vld1.8 {q13}, [r1], r5
+        vld1.8 {q14}, [r2], r5
+        vld1.8 {q15}, [r3], r5
+
+        /* Signal memory for data that will be used in the loop after the next */
+        pld         [r1, r5]
+        pld         [r2, r5]
+        pld         [r3, r5]
+
+        vmovl.u8 q2, d26                /* widen each 8 bit value to 16 bit */
+        vmovl.u8 q3, d27
+        vmovl.u8 q4, d28
+        vmovl.u8 q5, d29
+        vmovl.u8 q6, d30
+        vmovl.u8 q7, d31
+
+/*      The widened source array is (4 x 16 bit per d register):
+        row y0: d4,  d5,  d6,  d7
+        row y1: d8,  d9,  d10, d11
+        row y2: d12, d13, d14, d15      */
+        vmull.s16 q8, d4, d0[0]         /* q8: sums for the first 4 output bytes, columns 0..2 */
+        vmlal.s16 q8, d5, d0[1]
+        vmlal.s16 q8, d6, d0[2]
+        vmlal.s16 q8, d8, d0[3]
+        vmlal.s16 q8, d9, d1[0]
+        vmlal.s16 q8, d10, d1[1]
+        vmlal.s16 q8, d12, d1[2]
+        vmlal.s16 q8, d13, d1[3]
+        vmlal.s16 q8, d14, d2[0]
+
+        vmull.s16 q9, d5, d0[0]         /* q9: sums for the next 4 output bytes, columns 1..3 */
+        vmlal.s16 q9, d6, d0[1]
+        vmlal.s16 q9, d7, d0[2]
+        vmlal.s16 q9, d9, d0[3]
+        vmlal.s16 q9, d10, d1[0]
+        vmlal.s16 q9, d11, d1[1]
+        vmlal.s16 q9, d13, d1[2]
+        vmlal.s16 q9, d14, d1[3]
+        vmlal.s16 q9, d15, d2[0]
+
+        vshrn.i32 d16, q8, #8           /* truncating shift right by 8, narrow 32 -> 16 bit */
+        vshrn.i32 d17, q9, #8           /* d16 and d17 together form q8 */
+        vqmovun.s16 d16, q8             /* saturate signed 16 bit to unsigned 8 bit */
+        vst1.8 d16, [r0]!               /* store 8 bytes and advance dst by 8 */
+
+        /* Are we done yet? */
+        subs r4, r4, #1
+        bne 1b
+
+2:
+        /* We're done, bye! */
+        vpop            {q4-q7}
+        pop             {r4-r8, r10, r11, lr}
+        bx              lr
+END(rsdIntrinsicConvolve3x3_K)
+
+
+/* Convolve 5x5 */
+
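+/*
+        r0 = dst
+        r1 = y0 base pointer
+        r2 = y1 base pointer
+        r3 = y2 base pointer
+        [sp, #0]  = y3 base pointer (loaded into r4)
+        [sp, #4]  = y4 base pointer (loaded into r5)
+        [sp, #8]  = coeffs pointer (loaded into r6; 28 halfwords are read from it)
+        [sp, #12] = length (loop count, reloaded into r6; 8 output bytes per iteration)
+*/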
+ENTRY(rsdIntrinsicConvolve5x5_K)
+        push        {r4-r7, lr}         /* 5 regs = 20 bytes */
+        vpush       {q4-q7}             /* 64 bytes; q4, q5 are the accumulators below */
+
+        /* load y3 in r4 (the stack arguments now start 20+64 bytes above sp) */
+        ldr     r4, [sp, #20 + 64]
+
+        /* load y4 in r5 */
+        ldr     r5, [sp, #24 + 64]
+
+        /* Load the coefficients pointer */
+        ldr     r6, [sp, #28 + 64]
+
+        /* Load 16 + 12 halfwords of coefficients into d0-d6; d0[0]..d6[0] are used */
+        vld1.16     {d0, d1, d2, d3}, [r6]!
+        vld1.16     {d4, d5, d6}, [r6]
+
+        vmov.u32  q15, #0x7f            /* bias added to both sums before narrowing */
+
+        /* load the count into r6 (reused); a zero count must skip the do-while loop */
+        ldr     r6, [sp, #32 + 64]
+        cmp     r6, #0
+        beq     2f
+
+        /* Load the frequently used immediate in a register */
+        mov     r7, #8
+
+1:
+        /* Load 24 bytes of rows y0, y1 and post-increase the address by r7=#8 only */
+        vld1.8  {d24, d25, d26}, [r1], r7      /*  y0 ( y - 2 ) */
+        vld1.8  {d27, d28, d29}, [r2], r7      /*  y1 ( y - 1 ) */
+
+        /* Signal memory for data that will be used in the loop after the next */
+        pld         [r1, r7]
+        pld         [r2, r7]
+
+        /* Promoting the 8bit channels to 16bit */
+        vmovl.u8 q9,  d24
+        vmovl.u8 q10, d25
+        vmovl.u8 q11, d26
+        vmovl.u8 q12, d27
+        vmovl.u8 q13, d28
+        vmovl.u8 q14, d29
+
+/*      Widened rows (4 x 16 bit per d register). Widening is done in place, so
+        the vmovl order above matters (each source d reg is read before reuse):
+        row y0: d18, d19, d20, d21, d22, d23
+        row y1: d24, d25, d26, d27, d28, d29                                  */
+        vmull.s16 q4, d18, d0[0]        /* q4: first 4 output bytes, columns 0..4 */
+        vmlal.s16 q4, d19, d0[1]
+        vmlal.s16 q4, d20, d0[2]
+        vmlal.s16 q4, d21, d0[3]
+        vmlal.s16 q4, d22, d1[0]
+
+        vmlal.s16 q4, d24, d1[1]
+        vmlal.s16 q4, d25, d1[2]
+        vmlal.s16 q4, d26, d1[3]
+        vmlal.s16 q4, d27, d2[0]
+        vmlal.s16 q4, d28, d2[1]
+
+        vmull.s16 q5, d19, d0[0]        /* q5: next 4 output bytes, columns 1..5 */
+        vmlal.s16 q5, d20, d0[1]
+        vmlal.s16 q5, d21, d0[2]
+        vmlal.s16 q5, d22, d0[3]
+        vmlal.s16 q5, d23, d1[0]
+
+        vmlal.s16 q5, d25, d1[1]
+        vmlal.s16 q5, d26, d1[2]
+        vmlal.s16 q5, d27, d1[3]
+        vmlal.s16 q5, d28, d2[0]
+        vmlal.s16 q5, d29, d2[1]
+
+        /* Next 2 rows */
+        /* Load 24 bytes of rows y2, y3 and post-increase the address by r7=#8 only */
+        vld1.8  {d24, d25, d26}, [r3], r7      /*  y2 ( y ) */
+        vld1.8  {d27, d28, d29}, [r4], r7      /*  y3 ( y + 1 ) */
+
+        /* Signal memory for data that will be used in the loop after the next */
+        pld         [r3, r7]
+        pld         [r4, r7]
+
+        /* Promoting the 8bit channels to 16bit */
+        vmovl.u8 q9,  d24
+        vmovl.u8 q10, d25
+        vmovl.u8 q11, d26
+        vmovl.u8 q12, d27
+        vmovl.u8 q13, d28
+        vmovl.u8 q14, d29
+
+/*      Widened rows, same register layout as for the first two rows:
+        row y2: d18, d19, d20, d21, d22, d23
+        row y3: d24, d25, d26, d27, d28, d29
+        They use coeffs 10..19, i.e. d2[2]..d4[3].                            */
+        vmlal.s16 q4, d18, d2[2]
+        vmlal.s16 q4, d19, d2[3]
+        vmlal.s16 q4, d20, d3[0]
+        vmlal.s16 q4, d21, d3[1]
+        vmlal.s16 q4, d22, d3[2]
+
+        vmlal.s16 q4, d24, d3[3]
+        vmlal.s16 q4, d25, d4[0]
+        vmlal.s16 q4, d26, d4[1]
+        vmlal.s16 q4, d27, d4[2]
+        vmlal.s16 q4, d28, d4[3]
+
+        vmlal.s16 q5, d19, d2[2]
+        vmlal.s16 q5, d20, d2[3]
+        vmlal.s16 q5, d21, d3[0]
+        vmlal.s16 q5, d22, d3[1]
+        vmlal.s16 q5, d23, d3[2]
+
+        vmlal.s16 q5, d25, d3[3]
+        vmlal.s16 q5, d26, d4[0]
+        vmlal.s16 q5, d27, d4[1]
+        vmlal.s16 q5, d28, d4[2]
+        vmlal.s16 q5, d29, d4[3]
+
+        /* Last row */
+        /* Load 24 bytes of row y4 and post-increase the address by r7=#8 only */
+        vld1.8  {d24, d25, d26}, [r5], r7      /*  y4 ( y + 2 ) */
+
+        /* Signal memory for data that will be used in the loop after the next */
+        pld         [r5, r7]
+
+        /* Promoting the 8bit channels to 16bit */
+        vmovl.u8 q9,  d24
+        vmovl.u8 q10, d25
+        vmovl.u8 q11, d26
+
+/*      Widened row y4: d18, d19, d20, d21, d22, d23.
+        q4 takes columns 0..4 (d18..d22) with coeffs 20..24 (d5[0]..d6[0]);
+        q5 takes columns 1..5 (d19..d23) with the same coeffs.
+*/
+
+        vmlal.s16 q4, d18, d5[0]
+        vmlal.s16 q4, d19, d5[1]
+        vmlal.s16 q4, d20, d5[2]
+        vmlal.s16 q4, d21, d5[3]
+        vmlal.s16 q4, d22, d6[0]
+
+        vmlal.s16 q5, d19, d5[0]
+        vmlal.s16 q5, d20, d5[1]
+        vmlal.s16 q5, d21, d5[2]
+        vmlal.s16 q5, d22, d5[3]
+        vmlal.s16 q5, d23, d6[0]
+/*      Add the 0x7f bias held in q15 to both accumulators */
+        vadd.i32 q4, q4, q15
+        vadd.i32 q5, q5, q15
+
+/*      Narrow it to a d-reg 32 -> 16 bit; vrshrn adds 0x80 before shifting by 8 */
+        vrshrn.i32 d8, q4, #8
+        vrshrn.i32 d9, q5, #8
+/*      NOTE(review): with the 0x7f bias this adds 0xff in total before the shift,
+        while the 3x3 kernel truncates with no bias - confirm this is intended. */
+/*      Pack 16 -> 8 bit, saturate, put two pixels into D reg */
+        vqmovun.s16 d8, q4
+
+        vst1.8 d8, [r0]!           /* store 8 output bytes and advance r0 by 8 */
+
+        /* Are we done? */
+        subs r6, r6, #1
+        bne 1b
+
+2:
+        /* Yup, bye */
+        vpop        {q4-q7}
+        pop         {r4-r7, lr}
+        bx          lr
+
+END(rsdIntrinsicConvolve5x5_K)