diff options
Diffstat (limited to 'renderscript-toolkit/src/main/cpp/Convolve_neon.S')
-rw-r--r-- | renderscript-toolkit/src/main/cpp/Convolve_neon.S | 287 |
1 files changed, 287 insertions, 0 deletions
/*
 * Copyright (C) 2012 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * 32-bit ARM NEON convolution kernels (GNU as syntax, run through the
 * C preprocessor).
 *
 * Both routines take their first four arguments in r0-r3 and the rest on
 * the stack, and both preserve q4-q7 and every core register they modify.
 * Each loop iteration stores 8 bytes to dst and advances every source row
 * pointer by 8 bytes.
 */

/*
 * ENTRY/END wrap a global function symbol with its type, size and
 * .fnstart/.fnend unwind markers.  NOTE: GNU as for ARM documents
 * `.align 0` as padding to a 4-byte boundary.
 */
#define ENTRY(f) .text; .align 0; .globl f; .type f,#function; f: .fnstart
#define END(f) .fnend; .size f, .-f;

/*
 * rsdIntrinsicConvolve3x3_K
 *
 * In:
 *   r0       = dst
 *   r1       = y0 base pointer
 *   r2       = y1 base pointer
 *   r3       = y2 base pointer
 *   [sp, #0] = coeffs: pointer to signed 16-bit coefficients.  32 bytes are
 *              loaded (q0, q1); lanes d0[0]..d2[0] (9 values) are used, three
 *              per row in y0, y1, y2 order.
 *   [sp, #4] = count: number of loop iterations (the original header calls
 *              this "length / 2").  A count of 0 returns without touching
 *              dst.
 *
 * Per iteration: 16 bytes are read from each row pointer, 8 bytes are written
 * to dst.  Every byte is widened to 16 bits, so each d register below holds
 * four 16-bit lanes (one 4-channel pixel, per the original "two pixel"
 * comment).
 *
 * Result per lane: the 32-bit sum of products is shifted right by 8 without
 * rounding, narrowed to 16 bits, then saturated to an unsigned byte.
 *
 * NEON registers written: q0-q9, q13-q15 (q4-q7 are saved and restored).
 */
ENTRY(rsdIntrinsicConvolve3x3_K)
        /*
         * 8 core registers (32 bytes) + q4-q7 (64 bytes) are pushed, so the
         * stack arguments sit at sp + 32 + 64 and sp + 36 + 64 afterwards.
         * Only r4 and r5 are modified below; the push list is kept as-is
         * because the offsets depend on it.
         */
        push            {r4-r8, r10, r11, lr}
        vpush           {q4-q7}

        /* Fetch the coeffs pointer from the stack and load the
           coefficients into q0, q1 */
        ldr             r4, [sp, #32+64]
        vld1.16         {q0, q1}, [r4]

        /* Fetch the iteration count from the stack */
        ldr             r4, [sp, #36+64]

        /* The loop below always runs its body once before testing the
           counter, so a zero count must be filtered out here; otherwise the
           counter would wrap around and the loop would overrun all buffers. */
        cmp             r4, #0
        beq             2f

        /* Row pointer stride, kept in a register for the post-indexed loads */
        mov             r5, #8

1:
        /* Load 16 bytes from each row and post-increment the pointer by r5 = 8 */
        vld1.8          {q13}, [r1], r5
        vld1.8          {q14}, [r2], r5
        vld1.8          {q15}, [r3], r5

        /* Prefetch hint for data used in a later iteration */
        pld             [r1, r5]
        pld             [r2, r5]
        pld             [r3, r5]

        /* Widen the unsigned bytes to 16 bits */
        vmovl.u8        q2, d26
        vmovl.u8        q3, d27
        vmovl.u8        q4, d28
        vmovl.u8        q5, d29
        vmovl.u8        q6, d30
        vmovl.u8        q7, d31

/*
        Source window after widening (one d register per pixel):
                y0:  d4,  d5,  d6,  d7
                y1:  d8,  d9,  d10, d11
                y2:  d12, d13, d14, d15
*/

        /* First output pixel: columns 0..2 of each row */
        vmull.s16       q8, d4,  d0[0]
        vmlal.s16       q8, d5,  d0[1]
        vmlal.s16       q8, d6,  d0[2]
        vmlal.s16       q8, d8,  d0[3]
        vmlal.s16       q8, d9,  d1[0]
        vmlal.s16       q8, d10, d1[1]
        vmlal.s16       q8, d12, d1[2]
        vmlal.s16       q8, d13, d1[3]
        vmlal.s16       q8, d14, d2[0]

        /* Second output pixel: columns 1..3 of each row */
        vmull.s16       q9, d5,  d0[0]
        vmlal.s16       q9, d6,  d0[1]
        vmlal.s16       q9, d7,  d0[2]
        vmlal.s16       q9, d9,  d0[3]
        vmlal.s16       q9, d10, d1[0]
        vmlal.s16       q9, d11, d1[1]
        vmlal.s16       q9, d13, d1[2]
        vmlal.s16       q9, d14, d1[3]
        vmlal.s16       q9, d15, d2[0]

        /* Drop the low 8 bits and narrow 32 -> 16 bit; both halves land
           in q8 (d16, d17) */
        vshrn.i32       d16, q8, #8
        vshrn.i32       d17, q9, #8

        /* Saturate signed 16 -> unsigned 8 bit and store the 8 result bytes */
        vqmovun.s16     d16, q8
        vst1.8          d16, [r0]!

        /* Are we done yet? */
        subs            r4, r4, #1
        bne             1b

2:
        /* Restore the saved registers and return */
        vpop            {q4-q7}
        pop             {r4-r8, r10, r11, lr}
        bx              lr
END(rsdIntrinsicConvolve3x3_K)


/* Convolve 5x5 */

/*
 * rsdIntrinsicConvolve5x5_K
 *
 * In:
 *   r0        = dst
 *   r1        = y0 base pointer
 *   r2        = y1 base pointer
 *   r3        = y2 base pointer
 *   [sp, #0]  = y3 base pointer                       (kept in r4)
 *   [sp, #4]  = y4 base pointer                       (kept in r5)
 *   [sp, #8]  = coeffs: pointer to signed 16-bit coefficients (via r6).
 *               56 bytes are loaded (d0-d6); lanes d0[0]..d6[0] (25 values)
 *               are used, five per row in y0..y4 order.
 *   [sp, #12] = count: number of loop iterations      (kept in r6).
 *               A count of 0 returns without touching dst.
 *
 * Per iteration: 24 bytes are read from each row pointer, 8 bytes are written
 * to dst.  Every byte is widened to 16 bits, so each d register below holds
 * four 16-bit lanes.
 *
 * Result per lane: 0x7f is added to the 32-bit sum of products, which is then
 * shifted right by 8 with rounding, narrowed to 16 bits and saturated to an
 * unsigned byte.
 *
 * NEON registers written: d0-d6, q4, q5, q9-q15 (q4-q7 are saved and
 * restored).
 */
ENTRY(rsdIntrinsicConvolve5x5_K)
        /*
         * 5 core registers (20 bytes) + q4-q7 (64 bytes) are pushed, so the
         * stack arguments start at sp + 20 + 64 afterwards.
         */
        push            {r4-r7, lr}
        vpush           {q4-q7}

        /* load y3 in r4 */
        ldr             r4, [sp, #20 + 64]

        /* load y4 in r5 */
        ldr             r5, [sp, #24 + 64]

        /* Load the coefficients pointer */
        ldr             r6, [sp, #28 + 64]

        /* Load the coefficients vector into d0-d6 */
        vld1.16         {d0, d1, d2, d3}, [r6]!
        vld1.16         {d4, d5, d6}, [r6]

        /* Bias added to every 32-bit accumulator lane before narrowing */
        vmov.u32        q15, #0x7f

        /* load the count (r6 is reused; the coeffs pointer is no longer needed) */
        ldr             r6, [sp, #32 + 64]

        /* The loop below always runs its body once before testing the
           counter, so a zero count must be filtered out here; otherwise the
           counter would wrap around and the loop would overrun all buffers. */
        cmp             r6, #0
        beq             2f

        /* Row pointer stride, kept in a register for the post-indexed loads */
        mov             r7, #8

1:
        /* Rows y0 and y1: load 24 bytes each, post-increment by r7 = 8 */
        vld1.8          {d24, d25, d26}, [r1], r7
        vld1.8          {d27, d28, d29}, [r2], r7

        /* Prefetch hint for data used in a later iteration */
        pld             [r1, r7]
        pld             [r2, r7]

        /* Widen the 8-bit channels to 16 bit.  q12-q14 overlap d24-d29, so
           the order matters: each source d register is consumed before the
           widening that overwrites it. */
        vmovl.u8        q9,  d24
        vmovl.u8        q10, d25
        vmovl.u8        q11, d26
        vmovl.u8        q12, d27
        vmovl.u8        q13, d28
        vmovl.u8        q14, d29

/*
        After widening (one d register per pixel):
                first row:   d18, d19, d20, d21, d22, d23
                second row:  d24, d25, d26, d27, d28, d29
*/
        /* First output pixel, rows y0 and y1: columns 0..4 */
        vmull.s16       q4, d18, d0[0]
        vmlal.s16       q4, d19, d0[1]
        vmlal.s16       q4, d20, d0[2]
        vmlal.s16       q4, d21, d0[3]
        vmlal.s16       q4, d22, d1[0]

        vmlal.s16       q4, d24, d1[1]
        vmlal.s16       q4, d25, d1[2]
        vmlal.s16       q4, d26, d1[3]
        vmlal.s16       q4, d27, d2[0]
        vmlal.s16       q4, d28, d2[1]

        /* Second output pixel, rows y0 and y1: columns 1..5 */
        vmull.s16       q5, d19, d0[0]
        vmlal.s16       q5, d20, d0[1]
        vmlal.s16       q5, d21, d0[2]
        vmlal.s16       q5, d22, d0[3]
        vmlal.s16       q5, d23, d1[0]

        vmlal.s16       q5, d25, d1[1]
        vmlal.s16       q5, d26, d1[2]
        vmlal.s16       q5, d27, d1[3]
        vmlal.s16       q5, d28, d2[0]
        vmlal.s16       q5, d29, d2[1]


        /* Next 2 rows */
        /* Rows y2 and y3: load 24 bytes each, post-increment by r7 = 8 */
        vld1.8          {d24, d25, d26}, [r3], r7
        vld1.8          {d27, d28, d29}, [r4], r7

        /* Prefetch hint for data used in a later iteration */
        pld             [r3, r7]
        pld             [r4, r7]

        /* Widen the 8-bit channels to 16 bit (same ordering constraint as above) */
        vmovl.u8        q9,  d24
        vmovl.u8        q10, d25
        vmovl.u8        q11, d26
        vmovl.u8        q12, d27
        vmovl.u8        q13, d28
        vmovl.u8        q14, d29

/*
        After widening (one d register per pixel):
                first row:   d18, d19, d20, d21, d22, d23
                second row:  d24, d25, d26, d27, d28, d29
*/
        /* First output pixel, rows y2 and y3 */
        vmlal.s16       q4, d18, d2[2]
        vmlal.s16       q4, d19, d2[3]
        vmlal.s16       q4, d20, d3[0]
        vmlal.s16       q4, d21, d3[1]
        vmlal.s16       q4, d22, d3[2]

        vmlal.s16       q4, d24, d3[3]
        vmlal.s16       q4, d25, d4[0]
        vmlal.s16       q4, d26, d4[1]
        vmlal.s16       q4, d27, d4[2]
        vmlal.s16       q4, d28, d4[3]

        /* Second output pixel, rows y2 and y3 */
        vmlal.s16       q5, d19, d2[2]
        vmlal.s16       q5, d20, d2[3]
        vmlal.s16       q5, d21, d3[0]
        vmlal.s16       q5, d22, d3[1]
        vmlal.s16       q5, d23, d3[2]

        vmlal.s16       q5, d25, d3[3]
        vmlal.s16       q5, d26, d4[0]
        vmlal.s16       q5, d27, d4[1]
        vmlal.s16       q5, d28, d4[2]
        vmlal.s16       q5, d29, d4[3]

        /* Last row */
        /* Row y4: load 24 bytes, post-increment by r7 = 8 */
        vld1.8          {d24, d25, d26}, [r5], r7

        /* Prefetch hint for data used in a later iteration */
        pld             [r5, r7]

        /* Widen the 8-bit channels to 16 bit */
        vmovl.u8        q9,  d24
        vmovl.u8        q10, d25
        vmovl.u8        q11, d26

/*
        After widening (one d register per pixel):
                row:  d18, d19, d20, d21, d22, d23
*/

        /* First output pixel, row y4 */
        vmlal.s16       q4, d18, d5[0]
        vmlal.s16       q4, d19, d5[1]
        vmlal.s16       q4, d20, d5[2]
        vmlal.s16       q4, d21, d5[3]
        vmlal.s16       q4, d22, d6[0]

        /* Second output pixel, row y4 */
        vmlal.s16       q5, d19, d5[0]
        vmlal.s16       q5, d20, d5[1]
        vmlal.s16       q5, d21, d5[2]
        vmlal.s16       q5, d22, d5[3]
        vmlal.s16       q5, d23, d6[0]

        /* Add the 0x7f bias to both accumulators */
        vadd.i32        q4, q4, q15
        vadd.i32        q5, q5, q15

        /* Rounding shift right by 8 and narrow 32 -> 16 bit; both halves
           land in q4 (d8, d9) */
        vrshrn.i32      d8, q4, #8
        vrshrn.i32      d9, q5, #8

        /* Pack 16 -> 8 bit with unsigned saturation: two pixels in one d register */
        vqmovun.s16     d8, q4

        /* Store the 8 result bytes and advance dst */
        vst1.8          d8, [r0]!

        /* Are we done? */
        subs            r6, r6, #1
        bne             1b

2:
        /* Restore the saved registers and return */
        vpop            {q4-q7}
        pop             {r4-r7, lr}
        bx              lr

END(rsdIntrinsicConvolve5x5_K)