Diffstat (limited to 'renderscript-toolkit/src/main/cpp/Blur_neon.S')
-rw-r--r-- | renderscript-toolkit/src/main/cpp/Blur_neon.S | 1824 |
1 files changed, 1824 insertions, 0 deletions
diff --git a/renderscript-toolkit/src/main/cpp/Blur_neon.S b/renderscript-toolkit/src/main/cpp/Blur_neon.S new file mode 100644 index 0000000..241af5f --- /dev/null +++ b/renderscript-toolkit/src/main/cpp/Blur_neon.S @@ -0,0 +1,1824 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f: .fnstart +#define PRIVATE(f) .text; .align 4; .type f,#function; f: .fnstart +#define END(f) .fnend; .size f, .-f; + +#define ARCH_ARM_USE_BLUR_PRELOAD + +.eabi_attribute 25,1 @Tag_ABI_align8_preserved +.arm + +/* Number of fractional bits to preserve in intermediate results. The + * intermediate storage is 16-bit, and we started with 8 bit data (the integer + * part), so this should be between 0 and 8. + */ +.set FRACTION_BITS, 7 + +.set MAX_R, 25 + + +/* A quick way of making a line of code conditional on some other condition. + * Use `.set cc, 1` or `.set cc, 0` to enable or disable lines prefixed with + * `ifcc`: + */ +.macro ifcc zzz:vararg +.if cc + \zzz +.endif +.endm + +/* It's not always clear that prefetching is beneficial and this needs further + * testing on different cores, so it's made switchable here. + */ +#if defined(ARCH_ARM_USE_BLUR_PRELOAD) +#define VERTPLD(...) pld [__VA_ARGS__] +#else +#define VERTPLD(...) nop +#endif + +/* Fetch 16 columns of bytes (regardless of image format), convolve these + * vertically, and leave them in the register file. If working near the top or + * bottom of an image then clamp the addressing while loading the data in. + * + * The convolution is fully unrolled for windows up to max_r, with the + * outermost edges calculated first. This way it's possible to branch directly + * into the relevant part of the code for an arbitrary convolution radius. Two + * variants of the loop are produced; one eliminates the clamping code for a + * slight speed advantage. + * + * Where the macro is called with reg=x, the specified register is taken to + * contain a pre-calculated pointer into one of the two loops. 
+ * + * Input: + * r1 -- src + * r2 -- pitch + * r5 -- r + * r6 -- rup (r, unless clipped to top of source image) + * r7 -- rdn (r, unless clipped to bottom of source image) + * r12 -- switch index + * q0-q3 -- coefficient table + * Output: + * r1 += 16 + * q10,q11 -- 16 convolved columns + * Modifies: + * r10 = upper row pointer + * r11 = lower row pointer + * q12-q15 = temporary sums + */ +.macro fetch, max_r=MAX_R, labelc=1, labelnc=2, reg=r12 /*{{{*/ + .ifc \reg,r12 ; .set cc, 1 ; .else ; .set cc, 0 ; .endif + + vld1.8 {d30,d31}, [r1] + mls r10, r2, r6, r1 + + vmovl.u8 q14, d30 + VERTPLD(r1, #32) + vmovl.u8 q15, d31 + .if \max_r < 16 // approximate + ifcc adr \reg, 1f + .else + ifcc ldr \reg, 2f +1: ifcc add \reg, \reg, pc + .endif + + vmull.u16 q12, d28, d0[0] + ifcc sub \reg, r5, LSL #6 + vmull.u16 q13, d29, d0[0] + mla r11, r2, r7, r1 + vmull.u16 q14, d30, d0[0] + add r1, r1, #16 + vmull.u16 q15, d31, d0[0] + bx \reg + + ifcc .align 2 + 2: ifcc .word 1f-1b-8 + + /* This version of the vertical fetch loop body is used away from the edges + * of the source image. The pointers start at the top and bottom source rows + * and work their way towards the centre on each iteration. This way the + * number of taps used can be controlled by jumping directly into the middle + * of the loop and running to completion. + * If the loop body changes size then the code which calculates the address of + * the initial iteration must be updated accordingly. + */ + .macro vertfetch_noclamp i, dreg + .if 0 < \i && \i <= \max_r + vld1.8 {d20,d21}, [r10], r2 + vld1.8 {d22,d23}, [r11] + sub r11, r11, r2 + vswp d21, d22 + VERTPLD(r10, #32) + vaddl.u8 q10, d20, d21 + vaddl.u8 q11, d22, d23 + vmlal.u16 q12, d20, \dreg + VERTPLD(r11, #32) + vmlal.u16 q13, d21, \dreg + vmlal.u16 q14, d22, \dreg + vmlal.u16 q15, d23, \dreg + .endif + .endm + + /* This version of the vertical fetch loop body is used near the edges of the + * source image, where one or both of the accesses may start with a clamped + * value, and the row addresses only begin to change after some number of + * iterations before the end. + * If the loop body changes size then the code which calculates the address of + * the initial iteration must be updated accordingly. + */ + .macro vertfetch_clamped i, dreg + .if 0 < \i && \i <= \max_r + vld1.8 {d20,d21}, [r10] + vld1.8 {d22,d23}, [r11] + cmp r6, #\i + vswp d21, d22 + VERTPLD(r10, #32) + vaddl.u8 q10, d20, d21 + addhs r10, r10, r2 + vaddl.u8 q11, d22, d23 + cmp r7, #\i + vmlal.u16 q12, d20, \dreg + VERTPLD(r11, #32) + vmlal.u16 q13, d21, \dreg + subhs r11, r11, r2 + vmlal.u16 q14, d22, \dreg + nop + vmlal.u16 q15, d23, \dreg + .endif + .endm + + /* Entry into this unrolled loop is computed as a negative index from + * \labelc at the end of the block.
+ */ + .align 4 + vertfetch_clamped 27, d6[3] + vertfetch_clamped 26, d6[2] + vertfetch_clamped 25, d6[1] + vertfetch_clamped 24, d6[0] + vertfetch_clamped 23, d5[3] + vertfetch_clamped 22, d5[2] + vertfetch_clamped 21, d5[1] + vertfetch_clamped 20, d5[0] + vertfetch_clamped 19, d4[3] + vertfetch_clamped 18, d4[2] + vertfetch_clamped 17, d4[1] + vertfetch_clamped 16, d4[0] + vertfetch_clamped 15, d3[3] + vertfetch_clamped 14, d3[2] + vertfetch_clamped 13, d3[1] + vertfetch_clamped 12, d3[0] + vertfetch_clamped 11, d2[3] + vertfetch_clamped 10, d2[2] + vertfetch_clamped 9, d2[1] + vertfetch_clamped 8, d2[0] + vertfetch_clamped 7, d1[3] + vertfetch_clamped 6, d1[2] + vertfetch_clamped 5, d1[1] + vertfetch_clamped 4, d1[0] + vertfetch_clamped 3, d0[3] + vertfetch_clamped 2, d0[2] + vertfetch_clamped 1, d0[1] + vertfetch_clamped 0, d0[0] + 1: + \labelc : b 2f /* done with clamped loop, skip over non-clamped loop */ + + /* Entry into this unrolled loop is computed as a negative index from + * \labelnc at the end of the block. + */ + .align 4 + vertfetch_noclamp 27, d6[3] + vertfetch_noclamp 26, d6[2] + vertfetch_noclamp 25, d6[1] + vertfetch_noclamp 24, d6[0] + vertfetch_noclamp 23, d5[3] + vertfetch_noclamp 22, d5[2] + vertfetch_noclamp 21, d5[1] + vertfetch_noclamp 20, d5[0] + vertfetch_noclamp 19, d4[3] + vertfetch_noclamp 18, d4[2] + vertfetch_noclamp 17, d4[1] + vertfetch_noclamp 16, d4[0] + vertfetch_noclamp 15, d3[3] + vertfetch_noclamp 14, d3[2] + vertfetch_noclamp 13, d3[1] + vertfetch_noclamp 12, d3[0] + vertfetch_noclamp 11, d2[3] + vertfetch_noclamp 10, d2[2] + vertfetch_noclamp 9, d2[1] + vertfetch_noclamp 8, d2[0] + vertfetch_noclamp 7, d1[3] + vertfetch_noclamp 6, d1[2] + vertfetch_noclamp 5, d1[1] + vertfetch_noclamp 4, d1[0] + vertfetch_noclamp 3, d0[3] + vertfetch_noclamp 2, d0[2] + vertfetch_noclamp 1, d0[1] + vertfetch_noclamp 0, d0[0] + \labelnc : + + .purgem vertfetch_clamped + .purgem vertfetch_noclamp + + 2: vqrshrn.u32 d20, q12, #16 - FRACTION_BITS + vqrshrn.u32 d21, q13, #16 - FRACTION_BITS + vqrshrn.u32 d22, q14, #16 - FRACTION_BITS + vqrshrn.u32 d23, q15, #16 - FRACTION_BITS +.endm /*}}}*/ + +/* Some portion of the convolution window (as much as will fit, and all of it + * for the uchar1 cases) is kept in the register file to avoid unnecessary + * memory accesses. This forces the horizontal loops to be unrolled because + * there's no indexed addressing into the register file. + * + * As in the fetch macro, the operations are ordered from outside to inside, so + * that jumping into the middle of the block bypasses the unwanted window taps. + * + * There are several variants of the macro because of the fixed offsets of the + * taps -- the wider the maximum radius the further the centre tap is from the + * most recently fetched data. This means that pre-filling the window requires + * more data that won't be used and it means that rotating the window involves + * more mov operations. + * + * When the window gets too big the buffer at [r9] is used. + * + * Input: + * q4-q11 -- convolution window + * r9 -- pointer to additional convolution window data + * Output: + * r9 -- updated buffer pointer (if used) + * d31 -- result to be stored + * Modifies: + * r12 -- temp buffer pointer + * q12-q13 -- temporaries for load and vext operations.
+ * q14-q15 -- intermediate sums + */ +#define TUNED_LIST1 8, 16 +.macro hconv1_8/*{{{*/ + vmull.u16 q14, d18, d0[0] + vmull.u16 q15, d19, d0[0] + + ldr r12, [pc, r5, LSL #2] + add pc, pc, r12 + bkpt + 100: .word 101f-100b + .word 102f-100b + .word 103f-100b + .word 104f-100b + .word 105f-100b + .word 106f-100b + .word 107f-100b + .word 108f-100b + 108: vmlal.u16 q14, d16, d2[0] + vmlal.u16 q15, d17, d2[0] + vmlal.u16 q14, d20, d2[0] + vmlal.u16 q15, d21, d2[0] + 107: vext.u16 q12, q8, q9, #1 + vext.u16 q13, q9, q10, #7 + vmlal.u16 q14, d24, d1[3] + vmlal.u16 q15, d25, d1[3] + vmlal.u16 q14, d26, d1[3] + vmlal.u16 q15, d27, d1[3] + 106: vext.u16 q12, q8, q9, #2 + vext.u16 q13, q9, q10, #6 + vmlal.u16 q14, d24, d1[2] + vmlal.u16 q15, d25, d1[2] + vmlal.u16 q14, d26, d1[2] + vmlal.u16 q15, d27, d1[2] + 105: vext.u16 q12, q8, q9, #3 + vext.u16 q13, q9, q10, #5 + vmlal.u16 q14, d24, d1[1] + vmlal.u16 q15, d25, d1[1] + vmlal.u16 q14, d26, d1[1] + vmlal.u16 q15, d27, d1[1] + 104: //vext.u16 q12, q8, q9, #4 + //vext.u16 q13, q9, q10, #4 + vmlal.u16 q14, d17, d1[0] + vmlal.u16 q15, d18, d1[0] + vmlal.u16 q14, d19, d1[0] + vmlal.u16 q15, d20, d1[0] + 103: vext.u16 q12, q8, q9, #5 + vext.u16 q13, q9, q10, #3 + vmlal.u16 q14, d24, d0[3] + vmlal.u16 q15, d25, d0[3] + vmlal.u16 q14, d26, d0[3] + vmlal.u16 q15, d27, d0[3] + 102: vext.u16 q12, q8, q9, #6 + vext.u16 q13, q9, q10, #2 + vmlal.u16 q14, d24, d0[2] + vmlal.u16 q15, d25, d0[2] + vmlal.u16 q14, d26, d0[2] + vmlal.u16 q15, d27, d0[2] + 101: vext.u16 q12, q8, q9, #7 + vext.u16 q13, q9, q10, #1 + vmlal.u16 q14, d24, d0[1] + vmlal.u16 q15, d25, d0[1] + vmlal.u16 q14, d26, d0[1] + vmlal.u16 q15, d27, d0[1] + + vqrshrn.u32 d28, q14, #16 + vqrshrn.u32 d29, q15, #16 + vqrshrn.u16 d31, q14, #FRACTION_BITS + + vmov q8, q9 + vmov q9, q10 + vmov q10, q11 +.endm/*}}}*/ + +.macro hconv1_16/*{{{*/ + vmull.u16 q14, d16, d0[0] + vmull.u16 q15, d17, d0[0] + + ldr r12, [pc, r5, LSL #2] + add pc, pc, r12 + bkpt + 100: .word 101f-100b + .word 102f-100b + .word 103f-100b + .word 104f-100b + .word 105f-100b + .word 106f-100b + .word 107f-100b + .word 108f-100b + .word 109f-100b + .word 110f-100b + .word 111f-100b + .word 112f-100b + .word 113f-100b + .word 114f-100b + .word 115f-100b + .word 116f-100b + 116: //vext.u16 q12, q6, q7, #0 + //vext.u16 q13, q10, q11, #0 + vmlal.u16 q14, d12, d4[0] + vmlal.u16 q15, d13, d4[0] + vmlal.u16 q14, d20, d4[0] + vmlal.u16 q15, d21, d4[0] + 115: vext.u16 q12, q6, q7, #1 + vext.u16 q13, q9, q10, #7 + vmlal.u16 q14, d24, d3[3] + vmlal.u16 q15, d25, d3[3] + vmlal.u16 q14, d26, d3[3] + vmlal.u16 q15, d27, d3[3] + 114: vext.u16 q12, q6, q7, #2 + vext.u16 q13, q9, q10, #6 + vmlal.u16 q14, d24, d3[2] + vmlal.u16 q15, d25, d3[2] + vmlal.u16 q14, d26, d3[2] + vmlal.u16 q15, d27, d3[2] + 113: vext.u16 q12, q6, q7, #3 + vext.u16 q13, q9, q10, #5 + vmlal.u16 q14, d24, d3[1] + vmlal.u16 q15, d25, d3[1] + vmlal.u16 q14, d26, d3[1] + vmlal.u16 q15, d27, d3[1] + 112: //vext.u16 q12, q6, q7, #4 + //vext.u16 q13, q9, q10, #4 + vmlal.u16 q14, d13, d3[0] + vmlal.u16 q15, d14, d3[0] + vmlal.u16 q14, d19, d3[0] + vmlal.u16 q15, d20, d3[0] + 111: vext.u16 q12, q6, q7, #5 + vext.u16 q13, q9, q10, #3 + vmlal.u16 q14, d24, d2[3] + vmlal.u16 q15, d25, d2[3] + vmlal.u16 q14, d26, d2[3] + vmlal.u16 q15, d27, d2[3] + 110: vext.u16 q12, q6, q7, #6 + vext.u16 q13, q9, q10, #2 + vmlal.u16 q14, d24, d2[2] + vmlal.u16 q15, d25, d2[2] + vmlal.u16 q14, d26, d2[2] + vmlal.u16 q15, d27, d2[2] + 109: vext.u16 q12, q6, q7, #7 + vext.u16 q13, q9, q10, #1 + vmlal.u16 q14, 
d24, d2[1] + vmlal.u16 q15, d25, d2[1] + vmlal.u16 q14, d26, d2[1] + vmlal.u16 q15, d27, d2[1] + 108: //vext.u16 q12, q7, q8, #0 + //vext.u16 q13, q9, q10, #0 + vmlal.u16 q14, d14, d2[0] + vmlal.u16 q15, d15, d2[0] + vmlal.u16 q14, d18, d2[0] + vmlal.u16 q15, d19, d2[0] + 107: vext.u16 q12, q7, q8, #1 + vext.u16 q13, q8, q9, #7 + vmlal.u16 q14, d24, d1[3] + vmlal.u16 q15, d25, d1[3] + vmlal.u16 q14, d26, d1[3] + vmlal.u16 q15, d27, d1[3] + 106: vext.u16 q12, q7, q8, #2 + vext.u16 q13, q8, q9, #6 + vmlal.u16 q14, d24, d1[2] + vmlal.u16 q15, d25, d1[2] + vmlal.u16 q14, d26, d1[2] + vmlal.u16 q15, d27, d1[2] + 105: vext.u16 q12, q7, q8, #3 + vext.u16 q13, q8, q9, #5 + vmlal.u16 q14, d24, d1[1] + vmlal.u16 q15, d25, d1[1] + vmlal.u16 q14, d26, d1[1] + vmlal.u16 q15, d27, d1[1] + 104: //vext.u16 q12, q7, q8, #4 + //vext.u16 q13, q8, q9, #4 + vmlal.u16 q14, d15, d1[0] + vmlal.u16 q15, d16, d1[0] + vmlal.u16 q14, d17, d1[0] + vmlal.u16 q15, d18, d1[0] + 103: vext.u16 q12, q7, q8, #5 + vext.u16 q13, q8, q9, #3 + vmlal.u16 q14, d24, d0[3] + vmlal.u16 q15, d25, d0[3] + vmlal.u16 q14, d26, d0[3] + vmlal.u16 q15, d27, d0[3] + 102: vext.u16 q12, q7, q8, #6 + vext.u16 q13, q8, q9, #2 + vmlal.u16 q14, d24, d0[2] + vmlal.u16 q15, d25, d0[2] + vmlal.u16 q14, d26, d0[2] + vmlal.u16 q15, d27, d0[2] + 101: vext.u16 q12, q7, q8, #7 + vext.u16 q13, q8, q9, #1 + vmlal.u16 q14, d24, d0[1] + vmlal.u16 q15, d25, d0[1] + vmlal.u16 q14, d26, d0[1] + vmlal.u16 q15, d27, d0[1] + + vqrshrn.u32 d28, q14, #16 + vqrshrn.u32 d29, q15, #16 + vqrshrn.u16 d31, q14, #FRACTION_BITS + + vmov q6, q7 + vmov q7, q8 + vmov q8, q9 + vmov q9, q10 + vmov q10, q11 +.endm/*}}}*/ + +.macro hconv1_25/*{{{*/ + vext.u16 q12, q6, q7, #7 + vmull.u16 q14, d24, d0[0] + vmull.u16 q15, d25, d0[0] + + ldr r12, [pc, r5, LSL #2] + add pc, pc, r12 + bkpt + 100: .word 101f-100b + .word 102f-100b + .word 103f-100b + .word 104f-100b + .word 105f-100b + .word 106f-100b + .word 107f-100b + .word 108f-100b + .word 109f-100b + .word 110f-100b + .word 111f-100b + .word 112f-100b + .word 113f-100b + .word 114f-100b + .word 115f-100b + .word 116f-100b + .word 117f-100b + .word 118f-100b + .word 119f-100b + .word 120f-100b + .word 121f-100b + .word 122f-100b + .word 123f-100b + .word 124f-100b + .word 125f-100b + 125: vext.u16 q12, q3, q4, #6 + vext.u16 q13, q10, q11, #0 + vmlal.u16 q14, d24, d6[1] + vmlal.u16 q15, d25, d6[1] + vmlal.u16 q14, d26, d6[1] + vmlal.u16 q15, d27, d6[1] + 124: vext.u16 q12, q3, q4, #7 + vext.u16 q13, q9, q10, #7 + vmlal.u16 q14, d24, d6[0] + vmlal.u16 q15, d25, d6[0] + vmlal.u16 q14, d26, d6[0] + vmlal.u16 q15, d27, d6[0] + 123: vext.u16 q12, q4, q5, #0 + vext.u16 q13, q9, q10, #6 + vmlal.u16 q14, d24, d5[3] + vmlal.u16 q15, d25, d5[3] + vmlal.u16 q14, d26, d5[3] + vmlal.u16 q15, d27, d5[3] + 122: vext.u16 q12, q4, q5, #1 + vext.u16 q13, q9, q10, #5 + vmlal.u16 q14, d24, d5[2] + vmlal.u16 q15, d25, d5[2] + vmlal.u16 q14, d26, d5[2] + vmlal.u16 q15, d27, d5[2] + 121: vext.u16 q12, q4, q5, #2 + vext.u16 q13, q9, q10, #4 + vmlal.u16 q14, d24, d5[1] + vmlal.u16 q15, d25, d5[1] + vmlal.u16 q14, d26, d5[1] + vmlal.u16 q15, d27, d5[1] + 120: vext.u16 q12, q4, q5, #3 + vext.u16 q13, q9, q10, #3 + vmlal.u16 q14, d24, d5[0] + vmlal.u16 q15, d25, d5[0] + vmlal.u16 q14, d26, d5[0] + vmlal.u16 q15, d27, d5[0] + 119: vext.u16 q12, q4, q5, #4 + vext.u16 q13, q9, q10, #2 + vmlal.u16 q14, d24, d4[3] + vmlal.u16 q15, d25, d4[3] + vmlal.u16 q14, d26, d4[3] + vmlal.u16 q15, d27, d4[3] + 118: vext.u16 q12, q4, q5, #5 + vext.u16 q13, q9, q10, #1 + vmlal.u16 
q14, d24, d4[2] + vmlal.u16 q15, d25, d4[2] + vmlal.u16 q14, d26, d4[2] + vmlal.u16 q15, d27, d4[2] + 117: vext.u16 q12, q4, q5, #6 + vext.u16 q13, q9, q10, #0 + vmlal.u16 q14, d24, d4[1] + vmlal.u16 q15, d25, d4[1] + vmlal.u16 q14, d26, d4[1] + vmlal.u16 q15, d27, d4[1] + 116: vext.u16 q12, q4, q5, #7 + vext.u16 q13, q8, q9, #7 + vmlal.u16 q14, d24, d4[0] + vmlal.u16 q15, d25, d4[0] + vmlal.u16 q14, d26, d4[0] + vmlal.u16 q15, d27, d4[0] + 115: vext.u16 q12, q5, q6, #0 + vext.u16 q13, q8, q9, #6 + vmlal.u16 q14, d24, d3[3] + vmlal.u16 q15, d25, d3[3] + vmlal.u16 q14, d26, d3[3] + vmlal.u16 q15, d27, d3[3] + 114: vext.u16 q12, q5, q6, #1 + vext.u16 q13, q8, q9, #5 + vmlal.u16 q14, d24, d3[2] + vmlal.u16 q15, d25, d3[2] + vmlal.u16 q14, d26, d3[2] + vmlal.u16 q15, d27, d3[2] + 113: vext.u16 q12, q5, q6, #2 + vext.u16 q13, q8, q9, #4 + vmlal.u16 q14, d24, d3[1] + vmlal.u16 q15, d25, d3[1] + vmlal.u16 q14, d26, d3[1] + vmlal.u16 q15, d27, d3[1] + 112: vext.u16 q12, q5, q6, #3 + vext.u16 q13, q8, q9, #3 + vmlal.u16 q14, d24, d3[0] + vmlal.u16 q15, d25, d3[0] + vmlal.u16 q14, d26, d3[0] + vmlal.u16 q15, d27, d3[0] + 111: vext.u16 q12, q5, q6, #4 + vext.u16 q13, q8, q9, #2 + vmlal.u16 q14, d24, d2[3] + vmlal.u16 q15, d25, d2[3] + vmlal.u16 q14, d26, d2[3] + vmlal.u16 q15, d27, d2[3] + 110: vext.u16 q12, q5, q6, #5 + vext.u16 q13, q8, q9, #1 + vmlal.u16 q14, d24, d2[2] + vmlal.u16 q15, d25, d2[2] + vmlal.u16 q14, d26, d2[2] + vmlal.u16 q15, d27, d2[2] + 109: vext.u16 q12, q5, q6, #6 + vext.u16 q13, q8, q9, #0 + vmlal.u16 q14, d24, d2[1] + vmlal.u16 q15, d25, d2[1] + vmlal.u16 q14, d26, d2[1] + vmlal.u16 q15, d27, d2[1] + 108: vext.u16 q12, q5, q6, #7 + vext.u16 q13, q7, q8, #7 + vmlal.u16 q14, d24, d2[0] + vmlal.u16 q15, d25, d2[0] + vmlal.u16 q14, d26, d2[0] + vmlal.u16 q15, d27, d2[0] + 107: vext.u16 q12, q6, q7, #0 + vext.u16 q13, q7, q8, #6 + vmlal.u16 q14, d24, d1[3] + vmlal.u16 q15, d25, d1[3] + vmlal.u16 q14, d26, d1[3] + vmlal.u16 q15, d27, d1[3] + 106: vext.u16 q12, q6, q7, #1 + vext.u16 q13, q7, q8, #5 + vmlal.u16 q14, d24, d1[2] + vmlal.u16 q15, d25, d1[2] + vmlal.u16 q14, d26, d1[2] + vmlal.u16 q15, d27, d1[2] + 105: vext.u16 q12, q6, q7, #2 + vext.u16 q13, q7, q8, #4 + vmlal.u16 q14, d24, d1[1] + vmlal.u16 q15, d25, d1[1] + vmlal.u16 q14, d26, d1[1] + vmlal.u16 q15, d27, d1[1] + 104: vext.u16 q12, q6, q7, #3 + vext.u16 q13, q7, q8, #3 + vmlal.u16 q14, d24, d1[0] + vmlal.u16 q15, d25, d1[0] + vmlal.u16 q14, d26, d1[0] + vmlal.u16 q15, d27, d1[0] + 103: vext.u16 q12, q6, q7, #4 + vext.u16 q13, q7, q8, #2 + vmlal.u16 q14, d24, d0[3] + vmlal.u16 q15, d25, d0[3] + vmlal.u16 q14, d26, d0[3] + vmlal.u16 q15, d27, d0[3] + 102: vext.u16 q12, q6, q7, #5 + vext.u16 q13, q7, q8, #1 + vmlal.u16 q14, d24, d0[2] + vmlal.u16 q15, d25, d0[2] + vmlal.u16 q14, d26, d0[2] + vmlal.u16 q15, d27, d0[2] + 101: vext.u16 q12, q6, q7, #6 + vext.u16 q13, q7, q8, #0 + vmlal.u16 q14, d24, d0[1] + vmlal.u16 q15, d25, d0[1] + vmlal.u16 q14, d26, d0[1] + vmlal.u16 q15, d27, d0[1] + + vqrshrn.u32 d28, q14, #16 + vqrshrn.u32 d29, q15, #16 + vqrshrn.u16 d31, q14, #FRACTION_BITS + + vmov d7, d9 + vmov q4, q5 + vmov q5, q6 + vmov q6, q7 + vmov q7, q8 + vmov q8, q9 + vmov q9, q10 + vmov q10, q11 +.endm/*}}}*/ + +#define TUNED_LIST4 6, 12 +.macro hconv4_6/*{{{*/ + vmull.u16 q14, d14, d0[0] + vmull.u16 q15, d15, d0[0] + + ldr r12, [pc, r5, LSL #2] + add pc, pc, r12 + bkpt + 100: .word 101f-100b + .word 102f-100b + .word 103f-100b + .word 104f-100b + .word 105f-100b + .word 106f-100b + 106: vmlal.u16 q14, d8, d1[2] + 
vmlal.u16 q15, d9, d1[2] + vmlal.u16 q14, d20, d1[2] + vmlal.u16 q15, d21, d1[2] + 105: vmlal.u16 q14, d9, d1[1] + vmlal.u16 q15, d10, d1[1] + vmlal.u16 q14, d19, d1[1] + vmlal.u16 q15, d20, d1[1] + 104: vmlal.u16 q14, d10, d1[0] + vmlal.u16 q15, d11, d1[0] + vmlal.u16 q14, d18, d1[0] + vmlal.u16 q15, d19, d1[0] + 103: vmlal.u16 q14, d11, d0[3] + vmlal.u16 q15, d12, d0[3] + vmlal.u16 q14, d17, d0[3] + vmlal.u16 q15, d18, d0[3] + 102: vmlal.u16 q14, d12, d0[2] + vmlal.u16 q15, d13, d0[2] + vmlal.u16 q14, d16, d0[2] + vmlal.u16 q15, d17, d0[2] + 101: vmlal.u16 q14, d13, d0[1] + vmlal.u16 q15, d14, d0[1] + vmlal.u16 q14, d15, d0[1] + vmlal.u16 q15, d16, d0[1] + + vqrshrn.u32 d28, q14, #16 + vqrshrn.u32 d29, q15, #16 + vqrshrn.u16 d31, q14, #FRACTION_BITS + + vmov q4, q5 + vmov q5, q6 + vmov q6, q7 + vmov q7, q8 + vmov q8, q9 + vmov q9, q10 + vmov q10, q11 +.endm/*}}}*/ + +.macro hconv4_12/*{{{*/ + vmull.u16 q14, d8, d0[0] + vmull.u16 q15, d9, d0[0] + + ldr r12, [pc, r5, LSL #2] + add pc, pc, r12 + bkpt + 100: .word 101f-100b + .word 102f-100b + .word 103f-100b + .word 104f-100b + .word 105f-100b + .word 106f-100b + .word 107f-100b + .word 108f-100b + .word 109f-100b + .word 110f-100b + .word 111f-100b + .word 112f-100b + 112: add r12, r9, #0x1a0 + bic r12, r12, #0x200 + vld1.u16 {d24,d25}, [r12:128] + vmlal.u16 q14, d24, d3[0] + vmlal.u16 q15, d25, d3[0] + vmlal.u16 q14, d20, d3[0] + vmlal.u16 q15, d21, d3[0] + 111: add r12, r9, #0x1a8 + bic r12, r12, #0x200 + vld1.u16 {d24}, [r12:64]! + bic r12, r12, #0x200 + vld1.u16 {d25}, [r12:64] + vmlal.u16 q14, d24, d2[3] + vmlal.u16 q15, d25, d2[3] + vmlal.u16 q14, d19, d2[3] + vmlal.u16 q15, d20, d2[3] + 110: add r12, r9, #0x1b0 + bic r12, r12, #0x200 + vld1.u16 {d24,d25}, [r12:128] + vmlal.u16 q14, d24, d2[2] + vmlal.u16 q15, d25, d2[2] + vmlal.u16 q14, d18, d2[2] + vmlal.u16 q15, d19, d2[2] + 109: add r12, r9, #0x1b8 + bic r12, r12, #0x200 + vld1.u16 {d24}, [r12:64]! + bic r12, r12, #0x200 + vld1.u16 {d25}, [r12:64] + vmlal.u16 q14, d24, d2[1] + vmlal.u16 q15, d25, d2[1] + vmlal.u16 q14, d17, d2[1] + vmlal.u16 q15, d18, d2[1] + 108: add r12, r9, #0x1c0 + bic r12, r12, #0x200 + vld1.u16 {d24,d25}, [r12:128] + vmlal.u16 q14, d24, d2[0] + vmlal.u16 q15, d25, d2[0] + vmlal.u16 q14, d16, d2[0] + vmlal.u16 q15, d17, d2[0] + 107: add r12, r9, #0x1c8 + bic r12, r12, #0x200 + vld1.u16 {d24}, [r12:64]! + bic r12, r12, #0x200 + vld1.u16 {d25}, [r12:64] + vmlal.u16 q14, d24, d1[3] + vmlal.u16 q15, d25, d1[3] + vmlal.u16 q14, d15, d1[3] + vmlal.u16 q15, d16, d1[3] + 106: add r12, r9, #0x1d0 + bic r12, r12, #0x200 + vld1.u16 {d24,d25}, [r12:128] + vmlal.u16 q14, d24, d1[2] + vmlal.u16 q15, d25, d1[2] + vmlal.u16 q14, d14, d1[2] + vmlal.u16 q15, d15, d1[2] + 105: add r12, r9, #0x1d8 + bic r12, r12, #0x200 + vld1.u16 {d24}, [r12:64]! + bic r12, r12, #0x200 + vld1.u16 {d25}, [r12:64] + vmlal.u16 q14, d24, d1[1] + vmlal.u16 q15, d25, d1[1] + vmlal.u16 q14, d13, d1[1] + vmlal.u16 q15, d14, d1[1] + 104: add r12, r9, #0x1e0 + bic r12, r12, #0x200 + vld1.u16 {d24,d25}, [r12:128] + vmlal.u16 q14, d24, d1[0] + vmlal.u16 q15, d25, d1[0] + vmlal.u16 q14, d12, d1[0] + vmlal.u16 q15, d13, d1[0] + 103: add r12, r9, #0x1e8 + bic r12, r12, #0x200 + vld1.u16 {d24}, [r12:64]! 
+ bic r12, r12, #0x200 + vld1.u16 {d25}, [r12:64] + vmlal.u16 q14, d24, d0[3] + vmlal.u16 q15, d25, d0[3] + vmlal.u16 q14, d11, d0[3] + vmlal.u16 q15, d12, d0[3] + 102: add r12, r9, #0x1f0 + bic r12, r12, #0x200 + vld1.u16 {d24,d25}, [r12:128] + vmlal.u16 q14, d24, d0[2] + vmlal.u16 q15, d25, d0[2] + vmlal.u16 q14, d10, d0[2] + vmlal.u16 q15, d11, d0[2] + 101: add r12, r9, #0x1f8 + bic r12, r12, #0x200 + vld1.u16 {d24}, [r12:64] + vmlal.u16 q14, d24, d0[1] + vmlal.u16 q15, d8, d0[1] + vmlal.u16 q14, d9, d0[1] + vmlal.u16 q15, d10, d0[1] + + vqrshrn.u32 d28, q14, #16 + vqrshrn.u32 d29, q15, #16 + vqrshrn.u16 d31, q14, #FRACTION_BITS + + vst1.u8 {q4}, [r9:128]! + bic r9, r9, #0x200 + vmov q4, q5 + vmov q5, q6 + vmov q6, q7 + vmov q7, q8 + vmov q8, q9 + vmov q9, q10 + vmov q10, q11 +.endm/*}}}*/ + +.macro hconv4_25/*{{{*/ + add r12, r9, #0x198 + bic r12, r12, #0x200 + vld1.u16 {d24}, [r12:64]! + bic r12, r12, #0x200 + vld1.u16 {d25}, [r12:64] + vmull.u16 q14, d24, d0[0] + vmull.u16 q15, d25, d0[0] + + ldr r12, [pc, r5, LSL #2] + add pc, pc, r12 + bkpt + 100: .word 101f-100b + .word 102f-100b + .word 103f-100b + .word 104f-100b + .word 105f-100b + .word 106f-100b + .word 107f-100b + .word 108f-100b + .word 109f-100b + .word 110f-100b + .word 111f-100b + .word 112f-100b + .word 113f-100b + .word 114f-100b + .word 115f-100b + .word 116f-100b + .word 117f-100b + .word 118f-100b + .word 119f-100b + .word 120f-100b + .word 121f-100b + .word 122f-100b + .word 123f-100b + .word 124f-100b + .word 125f-100b + 125: add r12, r9, #0x0d0 + bic r12, r12, #0x200 + vld1.u16 {d24,d25}, [r12:128] + vmlal.u16 q14, d24, d6[1] + vmlal.u16 q15, d25, d6[1] + vmlal.u16 q14, d20, d6[1] + vmlal.u16 q15, d21, d6[1] + 124: add r12, r9, #0x0d8 + bic r12, r12, #0x200 + vld1.u16 {d24}, [r12:64]! + bic r12, r12, #0x200 + vld1.u16 {d25}, [r12] + vmlal.u16 q14, d24, d6[0] + vmlal.u16 q15, d25, d6[0] + vmlal.u16 q14, d19, d6[0] + vmlal.u16 q15, d20, d6[0] + 123: add r12, r9, #0x0e0 + bic r12, r12, #0x200 + vld1.u16 {d24,d25}, [r12:128] + vmlal.u16 q14, d24, d5[3] + vmlal.u16 q15, d25, d5[3] + vmlal.u16 q14, d18, d5[3] + vmlal.u16 q15, d19, d5[3] + 122: add r12, r9, #0x0e8 + bic r12, r12, #0x200 + vld1.u16 {d24}, [r12:64]! + bic r12, r12, #0x200 + vld1.u16 {d25}, [r12] + vmlal.u16 q14, d24, d5[2] + vmlal.u16 q15, d25, d5[2] + vmlal.u16 q14, d17, d5[2] + vmlal.u16 q15, d18, d5[2] + 121: add r12, r9, #0x0f0 + bic r12, r12, #0x200 + vld1.u16 {d24,d25}, [r12:128] + vmlal.u16 q14, d24, d5[1] + vmlal.u16 q15, d25, d5[1] + vmlal.u16 q14, d16, d5[1] + vmlal.u16 q15, d17, d5[1] + 120: add r12, r9, #0x0f8 + bic r12, r12, #0x200 + vld1.u16 {d24}, [r12:64]! + bic r12, r12, #0x200 + vld1.u16 {d25}, [r12] + vmlal.u16 q14, d24, d5[0] + vmlal.u16 q15, d25, d5[0] + vmlal.u16 q14, d15, d5[0] + vmlal.u16 q15, d16, d5[0] + 119: add r12, r9, #0x100 + bic r12, r12, #0x200 + vld1.u16 {d24,d25}, [r12:128] + vmlal.u16 q14, d24, d4[3] + vmlal.u16 q15, d25, d4[3] + vmlal.u16 q14, d14, d4[3] + vmlal.u16 q15, d15, d4[3] + 118: add r12, r9, #0x108 + bic r12, r12, #0x200 + vld1.u16 {d24}, [r12:64]! + bic r12, r12, #0x200 + vld1.u16 {d25}, [r12] + vmlal.u16 q14, d24, d4[2] + vmlal.u16 q15, d25, d4[2] + vmlal.u16 q14, d13, d4[2] + vmlal.u16 q15, d14, d4[2] + 117: add r12, r9, #0x110 + bic r12, r12, #0x200 + vld1.u16 {d24,d25}, [r12:128] + vmlal.u16 q14, d24, d4[1] + vmlal.u16 q15, d25, d4[1] + vmlal.u16 q14, d12, d4[1] + vmlal.u16 q15, d13, d4[1] + 116: add r12, r9, #0x118 + bic r12, r12, #0x200 + vld1.u16 {d24}, [r12:64]! 
+ bic r12, r12, #0x200 + vld1.u16 {d25}, [r12] + vmlal.u16 q14, d24, d4[0] + vmlal.u16 q15, d25, d4[0] + vmlal.u16 q14, d11, d4[0] + vmlal.u16 q15, d12, d4[0] + 115: add r12, r9, #0x120 + bic r12, r12, #0x200 + vld1.u16 {d24,d25}, [r12:128] + vmlal.u16 q14, d24, d3[3] + vmlal.u16 q15, d25, d3[3] + vmlal.u16 q14, d10, d3[3] + vmlal.u16 q15, d11, d3[3] + 114: add r12, r9, #0x128 + bic r12, r12, #0x200 + vld1.u16 {d24}, [r12:64]! + bic r12, r12, #0x200 + vld1.u16 {d25}, [r12] + vmlal.u16 q14, d24, d3[2] + vmlal.u16 q15, d25, d3[2] + vmlal.u16 q14, d9, d3[2] + vmlal.u16 q15, d10, d3[2] + 113: add r12, r9, #0x130 + bic r12, r12, #0x200 + vld1.u16 {d24,d25}, [r12:128] + vmlal.u16 q14, d24, d3[1] + vmlal.u16 q15, d25, d3[1] + vmlal.u16 q14, d8, d3[1] + vmlal.u16 q15, d9, d3[1] + 112: add r12, r9, #0x138 + bic r12, r12, #0x200 + vld1.u16 {d24}, [r12:64]! + bic r12, r12, #0x200 + vld1.u16 {d25}, [r12] + add r12, r9, #0x1f8 + bic r12, r12, #0x200 + vld1.u16 {d26}, [r12:64] + vmlal.u16 q14, d24, d3[0] + vmlal.u16 q15, d25, d3[0] + vmlal.u16 q14, d26, d3[0] @ Could be d7, without the load, right? + vmlal.u16 q15, d8, d3[0] + 111: add r12, r9, #0x140 + bic r12, r12, #0x200 + vld1.u16 {d24,d25}, [r12:128] + add r12, r9, #0x1f0 + bic r12, r12, #0x200 + vld1.u16 {d26,d27}, [r12:128] + vmlal.u16 q14, d24, d2[3] + vmlal.u16 q15, d25, d2[3] + vmlal.u16 q14, d26, d2[3] + vmlal.u16 q15, d27, d2[3] + 110: add r12, r9, #0x148 + bic r12, r12, #0x200 + vld1.u16 {d24}, [r12:64]! + bic r12, r12, #0x200 + vld1.u16 {d25}, [r12] + add r12, r9, #0x1e8 + bic r12, r12, #0x200 + vld1.u16 {d26}, [r12:64]! + bic r12, r12, #0x200 + vld1.u16 {d27}, [r12:64] + vmlal.u16 q14, d24, d2[2] + vmlal.u16 q15, d25, d2[2] + vmlal.u16 q14, d26, d2[2] + vmlal.u16 q15, d27, d2[2] + 109: add r12, r9, #0x150 + bic r12, r12, #0x200 + vld1.u16 {d24,d25}, [r12:128] + add r12, r9, #0x1e0 + bic r12, r12, #0x200 + vld1.u16 {d26,d27}, [r12:128] + vmlal.u16 q14, d24, d2[1] + vmlal.u16 q15, d25, d2[1] + vmlal.u16 q14, d26, d2[1] + vmlal.u16 q15, d27, d2[1] + 108: add r12, r9, #0x158 + bic r12, r12, #0x200 + vld1.u16 {d24}, [r12:64]! + bic r12, r12, #0x200 + vld1.u16 {d25}, [r12] + add r12, r9, #0x1d8 + bic r12, r12, #0x200 + vld1.u16 {d26}, [r12:64]! + bic r12, r12, #0x200 + vld1.u16 {d27}, [r12:64] + vmlal.u16 q14, d24, d2[0] + vmlal.u16 q15, d25, d2[0] + vmlal.u16 q14, d26, d2[0] + vmlal.u16 q15, d27, d2[0] + 107: add r12, r9, #0x160 + bic r12, r12, #0x200 + vld1.u16 {d24,d25}, [r12:128] + add r12, r9, #0x1d0 + bic r12, r12, #0x200 + vld1.u16 {d26,d27}, [r12:128] + vmlal.u16 q14, d24, d1[3] + vmlal.u16 q15, d25, d1[3] + vmlal.u16 q14, d26, d1[3] + vmlal.u16 q15, d27, d1[3] + 106: add r12, r9, #0x168 + bic r12, r12, #0x200 + vld1.u16 {d24}, [r12:64]! + bic r12, r12, #0x200 + vld1.u16 {d25}, [r12] + add r12, r9, #0x1c8 + bic r12, r12, #0x200 + vld1.u16 {d26}, [r12:64]! + bic r12, r12, #0x200 + vld1.u16 {d27}, [r12:64] + vmlal.u16 q14, d24, d1[2] + vmlal.u16 q15, d25, d1[2] + vmlal.u16 q14, d26, d1[2] + vmlal.u16 q15, d27, d1[2] + 105: add r12, r9, #0x170 + bic r12, r12, #0x200 + vld1.u16 {d24,d25}, [r12:128] + add r12, r9, #0x1c0 + bic r12, r12, #0x200 + vld1.u16 {d26,d27}, [r12:128] + vmlal.u16 q14, d24, d1[1] + vmlal.u16 q15, d25, d1[1] + vmlal.u16 q14, d26, d1[1] + vmlal.u16 q15, d27, d1[1] + 104: add r12, r9, #0x178 + bic r12, r12, #0x200 + vld1.u16 {d24}, [r12:64]! + bic r12, r12, #0x200 + vld1.u16 {d25}, [r12] + add r12, r9, #0x1b8 + bic r12, r12, #0x200 + vld1.u16 {d26}, [r12:64]! 
+ bic r12, r12, #0x200 + vld1.u16 {d27}, [r12:64] + vmlal.u16 q14, d24, d1[0] + vmlal.u16 q15, d25, d1[0] + vmlal.u16 q14, d26, d1[0] + vmlal.u16 q15, d27, d1[0] + 103: add r12, r9, #0x180 + bic r12, r12, #0x200 + vld1.u16 {d24,d25}, [r12:128] + add r12, r9, #0x1b0 + bic r12, r12, #0x200 + vld1.u16 {d26,d27}, [r12:128] + vmlal.u16 q14, d24, d0[3] + vmlal.u16 q15, d25, d0[3] + vmlal.u16 q14, d26, d0[3] + vmlal.u16 q15, d27, d0[3] + 102: add r12, r9, #0x188 + bic r12, r12, #0x200 + vld1.u16 {d24}, [r12:64]! + bic r12, r12, #0x200 + vld1.u16 {d25}, [r12] + add r12, r9, #0x1a8 + bic r12, r12, #0x200 + vld1.u16 {d26}, [r12:64]! + bic r12, r12, #0x200 + vld1.u16 {d27}, [r12:64] + vmlal.u16 q14, d24, d0[2] + vmlal.u16 q15, d25, d0[2] + vmlal.u16 q14, d26, d0[2] + vmlal.u16 q15, d27, d0[2] + 101: add r12, r9, #0x190 + bic r12, r12, #0x200 + vld1.u16 {d24,d25}, [r12:128]! + bic r12, r12, #0x200 + vld1.u16 {d26,d27}, [r12:128] + vmlal.u16 q14, d24, d0[1] + vmlal.u16 q15, d25, d0[1] + vmlal.u16 q14, d26, d0[1] + vmlal.u16 q15, d27, d0[1] + + vqrshrn.u32 d28, q14, #16 + vqrshrn.u32 d29, q15, #16 + vqrshrn.u16 d31, q14, #FRACTION_BITS + + vst1.u8 {q4}, [r9:128]! + bic r9, r9, #0x200 + vmov q4, q5 + vmov q5, q6 + vmov q6, q7 + vmov q7, q8 + vmov q8, q9 + vmov q9, q10 + vmov q10, q11 +.endm/*}}}*/ + +/* Dedicated function wrapper for the fetch macro, for the cases where + * performance isn't that important, to keep code size down. + */ +PRIVATE(fetch_generic_asm) + push {r10,r11} + fetch + pop {r10,r11} + bx lr +END(fetch_generic_asm) + + +/* Fetch the next (16 - (r10 & 15)) columns of data, avoiding reading memory + * beyond that limit, and filling the rest of the vector with the last legal + * pixel. + * Result is in q10 and q11. q8 and q9 are filled with the first legal pixel. + * Note: This function can read beyond the right edge of input if the image is + * narrower than 16 bytes. + */ +PRIVATE(fetch_clampleft1) + push {r12,lr} + bl fetch_generic_asm + vdup.u16 q8, d20[0] + vdup.u16 q9, d20[0] + ands r12, r10, #15 + beq 1f + sub r1, r1, r12 + sub r10, r10, r12 + sub sp, sp, #32 + vst1.u16 {q10,q11}, [sp] + sub r12, sp, r12, LSL #1 + sub sp, sp, #32 + vst1.u16 {q8,q9}, [sp] + vld1.u16 {q10,q11}, [r12] + add sp, sp, #64 +1: pop {r12,pc} +END(fetch_clampleft1) + +PRIVATE(fetch_clampleft4) + push {r12,lr} + bl fetch_generic_asm + vmov.u16 d16, d20 + vmov.u16 d17, d20 + vmov.u16 d18, d20 + vmov.u16 d19, d20 + ands r12, r10, #15 + beq 1f + sub r1, r1, r12 + sub r10, r10, r12 + sub sp, sp, #32 + vst1.u16 {q10-q11}, [sp] + sub r12, sp, r12, LSL #1 + sub sp, sp, #32 + vst1.u16 {q8,q9}, [sp] + vld1.u16 {q10,q11}, [r12] + add sp, sp, #64 +1: pop {r12,pc} +END(fetch_clampleft4) + +/* Fetch only the next (r11 & 15) (where 0 means 16) columns of data, avoiding + * reading memory beyond that limit, and filling the rest of the vector with + * the last legal pixel. + * Result is in q10 and q11. q12 and q13 are filled with the last legal pixel. + * Note: This function can read beyond the left edge of input if the image is + * narrower than 16 bytes. 
+ */ +PRIVATE(fetch_clampright1) + push {r12, lr} + rsb r12, r11, #0 + ands r12, r12, #15 + beq 1f + sub r1, r1, r12 + bl fetch_generic_asm + vdup.u16 q12, d23[3] + vdup.u16 q13, d23[3] + rsb r12, r11, #0 + and r12, r12, #15 + sub sp, sp, #32 + vst1.u16 {q12,q13}, [sp] + sub sp, sp, #32 + add r12, sp, r12, LSL #1 + vst1.u16 {q10,q11}, [sp] + vld1.u16 {q10,q11}, [r12] + add sp, sp, #64 + pop {r12,pc} +1: bl fetch_generic_asm + vdup.u16 q12, d23[3] + vdup.u16 q13, d23[3] + pop {r12,pc} +END(fetch_clampright1) + +PRIVATE(fetch_clampright4) + push {r12, lr} + rsb r12, r11, #0 + ands r12, r12, #15 + beq 1f + sub r1, r1, r12 + bl fetch_generic_asm + vmov.u16 d24, d23 + vmov.u16 d25, d23 + vmov.u16 d26, d23 + vmov.u16 d27, d23 + rsb r12, r11, #0 + and r12, r12, #15 + sub sp, sp, #32 + vst1.u16 {q12-q13}, [sp] + sub sp, sp, #32 + add r12, sp, r12, LSL #1 + vst1.u16 {q10,q11}, [sp] + vld1.u16 {q10,q11}, [r12] + add sp, sp, #64 + pop {r12,pc} +1: bl fetch_generic_asm + vmov.u16 d24, d23 + vmov.u16 d25, d23 + vmov.u16 d26, d23 + vmov.u16 d27, d23 + pop {r12,pc} +END(fetch_clampright4) + +/* Given values in q10 and q11, and an index in r11, sweep the (r11 & 15)th + * value across to fill the rest of the register pair. Used for filling the + * right hand edge of the window when reading too close to the right hand edge + * of the image. + * Also returns a dup-ed copy of the last element in q12 for the tail-fill + * case (this happens incidentally in the common path, but must be done + * deliberately in the fast-out path). + */ +PRIVATE(prefill_sweepright1) + ands r12, r11, #15 + beq 1f + sub r12, r12, #1 + sub sp, sp, #64 + vst1.u16 {q10,q11}, [sp] + add r12, sp, r12, LSL #1 + vld1.u16 {d24[],d25[]}, [r12] + vld1.u16 {d26[],d27[]}, [r12] + vst1.u16 {q12,q13}, [r12] + vld1.u16 {q10,q11}, [sp] + add sp, sp, #64 + bx lr +1: vdup.u16 q12, d23[3] + vdup.u16 q13, d23[3] + bx lr +END(prefill_sweepright1) + +PRIVATE(prefill_sweepright4) + ands r12, r11, #15 + beq 1f + sub r12, r12, #4 + sub sp, sp, #64 + vst1.u16 {q10,q11}, [sp] + add r12, sp, r12, LSL #1 + vld1.u64 {d24}, [r12] + vld1.u64 {d25}, [r12] + vld1.u64 {d26}, [r12] + vld1.u64 {d27}, [r12] + vst1.u16 {q12,q13}, [r12] + vld1.u16 {q10,q11}, [sp] + add sp, sp, #64 + bx lr +1: vmov.u16 d24, d23 + vmov.u16 d25, d23 + vmov.u16 d26, d23 + vmov.u16 d27, d23 + bx lr +END(prefill_sweepright4) + +/* The main loop keeps a sliding window of data that has already been convolved + * in the vertical axis for the current line. This usually stays in the + * register file, but spills to memory for large windows. The first thing that + * needs to be done at start-up is to fill this window with image data, taking + * into account the padding needed if the left or right edges of the image fall + * within this window. + */ + +/* Because the window is in the register file, writes to it cannot be indexed + * by another register. Consequently the fill loops are unrolled to address + * the registers directly. This macro distinguishes between writes to the + * register file and writes to the spill buffer (indicated by a destination + * register named xx). + */ +.macro prefill_out ra, rb, sra, srb, srb_hi + .ifc \ra,xx + .ifc \rb,xx + vst1.u16 {\sra,\srb}, [r9:128]!
+ .else + /* this case is used only for the last tap of uchar1 r=25 */ + /* discard \sra */ + vmov.u16 \rb, \srb_hi + .endif + .else + .ifnc \ra,\sra + vmov.u16 \ra, \sra + .endif + .ifnc \rb,\srb + vmov.u16 \rb, \srb + .endif + .endif +.endm + +/* This macro provides the list of registers representing the window, and the + * cases where the register file is too small and a spill buffer is used + * instead. + * Since several specialisations of each function are generated, this also + * culls superfluous iterations, and sets the variable `i` for subsequent + * macros indicating the current index into the window. + */ +.macro prefill_list, macro, nextmacro, max_r, step, label + .macro ifneeded macro, nextmacro, line, nextline, ra, rb, step, label + .if windowsize >= (\line * 16) + .set i, windowsize - (\line * 16) +\label\macro\line: + prefill_\macro \label\nextmacro\line, \label\nextmacro\nextline, \ra, \rb, \step + .endif + .endm + .if \step > 1 + ifneeded \macro \nextmacro, 13, 12, xx, xx, \step, \label + ifneeded \macro \nextmacro, 12, 11, xx, xx, \step, \label + ifneeded \macro \nextmacro, 11, 10, xx, xx, \step, \label + ifneeded \macro \nextmacro, 10, 9, xx, xx, \step, \label + ifneeded \macro \nextmacro, 9, 8, xx, xx, \step, \label + ifneeded \macro \nextmacro, 8, 7, xx, xx, \step, \label + ifneeded \macro \nextmacro, 7, 6, xx, xx, \step, \label + ifneeded \macro \nextmacro, 6, 5, xx, xx, \step, \label + ifneeded \macro \nextmacro, 5, 4, xx, xx, \step, \label + ifneeded \macro \nextmacro, 4, 3, xx, xx, \step, \label + .else + /* q3 normally contains the coefficient table, but it's not fully + * used. In the uchar1, r=25 case the other half of q3 is used for + * the last two window taps to avoid falling out to memory. + */ + ifneeded \macro \nextmacro, 4, 3, xx, d7, \step, \label + .endif + ifneeded \macro \nextmacro, 3, 2, q4, q5, \step, \label + ifneeded \macro \nextmacro, 2, 1, q6, q7, \step, \label + ifneeded \macro \nextmacro, 1, 0, q8, q9, \step, \label + +\label\macro\()0: + b \label\()_end + .purgem ifneeded +.endm + +/* These macros represent the possible stages of filling the window. + * Each macro is unrolled enough times that it can fill the entire window + * itself, but normally it will have to hand control to subsequent macros + * part-way through and this is done using labels named \next and \after, where + * \next is the next macro starting at the same window position and \after is + * the next macro starting after the current window position. + */ + +/* leftfill: q8 and q9 contain the left padding value. While the window + * extends outside of the image on the left-hand side, and at least 16 more + * padding values are needed in the window, store q8 and q9 into the window. + * Otherwise skip forward to storing image data. + */ +.macro prefill_leftfill, next, after, ra, rb, step + cmp r10, #i+16 + blo \next + prefill_out \ra, \rb, q8, q9, d19 +.endm + +/* leftedge: The very first non-fill or partial-fill chunk from the image is + * already loaded (as it was used to calculate the left padding value), so + * store it here, and then drop into the regular load/store cycle in the next + * macro. + */ +.macro prefill_leftedge, next, after, ra, rb, step +1: prefill_out \ra, \rb, q10, q11, d23 + b \after +.endm + +/* dofetch: Copy chunks of the image into the window without any complications + * from edge conditions.
+ */ +.macro prefill_dofetch, next, after, ra, rb, step + cmp r11, #i+16 + bls \next + bl fetch_generic_asm + prefill_out \ra, \rb, q10, q11, d23 +.endm + +/* rightedge: The last fetch (currently in q10 and q11) may have gone beyond + * the right-hand edge of the image. In that case sweep the last valid pixel + * across the rest of the chunk, and in either case prepare padding data in q12 + * and q13 for the next macro. This is done in fetch_clampright. + * This only happens once before going on to the next macro. + * Sometimes leftedge also covers the rightedge case, in which case this has + * to be skipped altogether. + */ +.macro prefill_rightedge, next, after, ra, rb, step + cmp r11, #i + bls \next + bl fetch_clampright\step + prefill_out \ra, \rb, q10, q11, d23 + b \after +.endm + +/* rightfill: The rest of the window is simply filled with right padding from + * q12 and q13. + */ +.macro prefill_rightfill, next, after, ra, rb, step + prefill_out \ra, \rb, q12, q13, d25 +.endm + +/* Here all of the macros above are unrolled and laid out in the proper order. + */ +.macro prefill_body, max_r, step, label + prefill_list leftfill, leftedge, \max_r, \step, \label + prefill_list leftedge, dofetch, \max_r, \step, \label + prefill_list dofetch, rightedge, \max_r, \step, \label + prefill_list rightedge, rightfill, \max_r, \step, \label + prefill_list rightfill, oops, \max_r, \step, \label +\label\()_end: +.endm + +/* Fill the convolution window with context data. The aim here is to load + * exactly 2*r columns, and in the main loop to read as many columns as will be + * written. This is complicated by the window being divided into chunks at + * register boundaries, and the need to handle cases when the input starts very + * close to the left or right (or both) edges of the image and the need to fill + * the spaces that leaves with left and right edge padding values. + * + * Input: + * r1 -- src + * r2 -- pitch + * r3 -- count + * r4 -- available image data right of src pointer + * r5 -- r + * r6 -- rup + * r7 -- rdn + * r8 -- available image data left of src pointer + * r9 -- buffer (if needed) + * Output: + * r4 -= min(inlen, count + windowsize - centertap) + * r1 += min(inlen, count + windowsize - centertap) + * Modifies: + * r10 -- fill start index in the window + * r11 -- fill stop index in the window + * r12 -- scratch + */ +.macro prefill step=1, max_r=25, label=xx +.set windowsize, (((\max_r + \max_r) * \step + 15) & ~15) +.set centertap, (windowsize - \max_r * \step) + mov r10, #centertap + subs r10, r10, r8 + movlo r10, #0 + + subs r11, r4, #windowsize - centertap + movhs r11, #0 + add r11, r11, #windowsize + + /* r10 indicates where in the window legal image data begins. + * r11 indicates where in the window legal image data ends. + * When starting near the centre of a large image these would be + * zero and windowsize respectively, but when starting near the + * edges this can change. + * When starting on the leftmost pixel, r10 will be centertap. + * When starting on the rightmost pixel, r11 will be centertap+1. + */ + + /* r4 indicates how much data there is between the current pointers + * and the right edge of the image. The pointers currently point + * to the data needed at centertap. The subsequent code will + * consume (windowsize - r10) data, but only the data from + * centertap to windowsize comes out of r4's budget. + */ +1: subs r4, r4, #windowsize - centertap + movlo r4, #0 + + /* And the pointers need to rewind to the start of the window.
+ */ + sub r1, r1, #centertap + + /* Unless r8 indicated that there wasn't that much data available. + */ + add r1, r1, r10 + + + /* Get the first chunk, and add padding to align it to the window + * if necessary. + */ + bl fetch_clampleft\step + + /* Sometimes the start and the end of the window are in the same + * chunk. In that case both ends need filler at the outset. + */ + sub r12, r11, #1 + eor r12, r10, r12 + cmp r12, #16 + bllo prefill_sweepright\step + + /* Iterate through all the points in the window and fill them in + * with padding or image data as needed. + */ + prefill_body \max_r, \step, \label +.endm + +/* The main body of the convolve functions. Having already pre-filled the + * convolution window with 2*r input values, the logic settles into a regular + * pattern of reading and writing at a 1:1 rate until either input or output + * expires. The input leads the output by r values, so when processing all the + * way to the right-hand edge, or within r pixels of that edge, the input will + * run out first. In the case of very narrow images, or sub-windows starting + * near the right edge, the input may already have run out while the + * convolution window was being filled and this loop will start with a + * zero-length input. + * + * Once the input runs out, the rest of the output must be processed by padding + * the remainder of the window with pad value from the last valid pixel from + * the source. + * + * Input: + * r0 = dst + * r1 = src + * r2 = pitch + * r3 = count + * r4 = inlen + * r5 = r + * r6 = rup + * r7 = rdn + * r9 = buffer + * Modifies: + * r8 = fetch code pointer + */ +.macro conv_body core, step=1, max_r=25, labelc="", labelnc="" + + /* If r4 >= r3 then there's no need for clipping. The main loop + * needs to exit when either r3 or r4 runs out, so clamp r4 to be + * no greater than r3 and use r4 for the loop. + * However, if r4 comes out of the loop with less than 16 bytes + * left, a partial read would be necessary to avoid reading beyond + * the end of the image. To avoid this, clamp r4 to the next + * multiple of 16, which is still sufficient to force it out of the + * loop but doesn't imply a rewind. + */ + add r12, r3, #15 + bic r12, r12, #15 + cmp r4, r12 + movhi r4, r12 + + /* First calculate the entry-point into the internal fetch logic. + * This is done so the same function can service several kernel + * sizes. + */ + ldr r8, 3f +1: add r8, r8, pc + sub r8, r5, LSL #5 + sub r8, r5, LSL #4 + cmp r5, r6 + cmpeq r5, r7 + beq 5f + + /* if (r != rup || r != rdn) then the address-clamping table should + * be used rather than the short-cut version. + */ + ldr r8, 3f+4 +2: add r8, r8, pc + sub r8, r5, LSL #6 + b 5f + .align 3 +3: .word \labelnc-1b-8 + .word \labelc-2b-8 + + /* Main loop: ... */ + .align 4 +3: /* first perform a vertical convolution from memory to get the next + * 16 taps of the horizontal window into the register file... + */ + fetch max_r=\max_r, labelc=\labelc, labelnc=\labelnc, reg=r8 + + /* ...then perform a horizontal convolution on that window to + * produce eight output bytes, and slide the window along. + * This has to be done twice to match the 16-way vertical pass. + * It would be preferable to have twice the work done in \core, but + * that would demand yet another variant on those macros and would + * perturb the register allocation severely. + */ + \core + vst1.u8 {d31}, [r0]! + \core + vst1.u8 {d31}, [r0]!
+ + sub r3, r3, #16 +5: subs r4, r4, #16 + bhi 3b + /* Here there's 16 or fewer bytes available before the edge of the + * source image. r4 holds that count minus 16 (because it was + * decremented before the first iteration ran). The last read may + * not be a whole chunk, and beyond that a fill value must be used. + * + * Of course, none of that matters if there's no more output to + * produce... + */ + cmp r3, #0 + beq 5f + + /* Oh well. */ + adds r4, r4, #16 + bne 1f + .if \step==1 + vdup.u16 q10, d19[3] + vdup.u16 q11, d19[3] + .else + vmov.u64 d20, d19 + vmov.u64 d21, d19 + vmov.u64 d22, d19 + vmov.u64 d23, d19 + .endif + b 3f + + /* To avoid reading past the end of input, rewind pointers by (16-r4) + * to ensure that they're exactly 16 bytes from the edge. + */ +1: mov r11, r4 + bl fetch_clampright\step + /* Now to put this padding to use, perform any remaining + * iterations. This is done at half the rate of the main loop, + * because there's no longer pressure from a 16-lane window filler. + */ +3: \core + .if \step==1 + vdup.u16 q11, d23[3] + .else + vmov.u64 d22, d23 + .endif + subs r3, r3, #8 + blo 4f + vst1.u8 {d31}, [r0]! + bne 3b + b 5f + + /* If the final iteration contained 0 < l < 8 values, then perform + * a piecewise store of the final vector. + */ +4: tst r3, #4 + beq 1f + vst1.u32 {d31[0]}, [r0]! + vext.u8 d31, d31, d31, #4 +1: tst r3, #2 + beq 1f + vst1.u16 {d31[0]}, [r0]! + vext.u8 d31, d31, d31, #2 +1: tst r3, #1 + beq 5f + vst1.u8 {d31[0]}, [r0]! + vext.u8 d31, d31, d31, #1 +5: mov r0, #0 +.endm + +.irp r, TUNED_LIST1, 25 +PRIVATE(convolve1_\r) + push {r12,lr} + + prefill step=1, max_r=\r, label=.Lcnv1_\r + + conv_body core=hconv1_\r, step=1, max_r=\r, labelc=.Lcnv1_\r, labelnc=.Lcnvnc1_\r + + pop {r12,pc} +END(convolve1_\r) +.endr + +.irp r, TUNED_LIST4, 25 +PRIVATE(convolve4_\r) + push {r12,lr} + sub r9, sp, #0x200 + sub sp, sp, #0x200 + 0x400 + bic r9, r9, #0x3fc + + /* r9 now points to a 0x200 byte buffer on the stack whose address + * has the low 10 bits clear. This allows easy address calculation + * in the wrap-around cases. + */ + + prefill step=4, max_r=\r, label=.Lcnv4_\r + + conv_body core=hconv4_\r, step=4, max_r=\r, labelc=.Lcnv4_\r, labelnc=.Lcnvnc4_\r + + add sp, sp, #0x200 + 0x400 + pop {r12,pc} +END(convolve4_\r) +.endr + +/* void rsdIntrinsicBlurU1_K( + * void *out, // r0 + * void *in, // r1 + * size_t w, // r2 + * size_t h, // r3 + * size_t p, // [sp] + * size_t x, // [sp,#4] + * size_t y, // [sp,#8] + * size_t count, // [sp,#12] + * size_t r, // [sp,#16] + * uint16_t *tab); // [sp,#20] + */ +ENTRY(rsdIntrinsicBlurU1_K) + push {r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} + vpush {d8-d15} + ldr r6, [sp,#112] // y + ldr r8, [sp,#108] // x + ldr r5, [sp,#120] // r + sub r4, r2, r8 // inlen = w - x + sub r7, r3, r6 // h - y + ldr r2, [sp,#104] // pitch + ldr r3, [sp,#116] // count + sub r7, r7, #1 // h - y - 1 + + ldr r12, [sp,#124] + + add r1, r1, r8 // src += x + + cmp r6, r5 + movhi r6, r5 // rup = min(r, y) + cmp r7, r5 + movhi r7, r5 // rdn = min(r, h - y - 1) + + vld1.u16 {d0,d1,d2,d3}, [r12]! + vld1.u16 {d4,d5,d6}, [r12]!
+ + adr lr, 1f + .irp r, TUNED_LIST1 + cmp r5, #\r + bls convolve1_\r + .endr + b convolve1_25 + +1: vpop {d8-d15} + pop {r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} +END(rsdIntrinsicBlurU1_K) + +/* void rsdIntrinsicBlurU4_K( + * void *out, // r0 + * void *in, // r1 + * size_t w, // r2 + * size_t h, // r3 + * size_t p, // [sp] + * size_t x, // [sp,#4] + * size_t y, // [sp,#8] + * size_t count, // [sp,#12] + * size_t r, // [sp,#16] + * uint16_t *tab); // [sp,#20] + */ +ENTRY(rsdIntrinsicBlurU4_K) + push {r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} + vpush {d8-d15} + ldr r6, [sp,#112] // y + ldr r8, [sp,#108] // x + ldr r5, [sp,#120] // r + lsl r8, r8, #2 + rsb r4, r8, r2, LSL #2 // inlen = (w - x) * 4 + sub r7, r3, r6 // h - y + ldr r2, [sp,#104] // pitch + ldr r3, [sp,#116] // count + sub r7, r7, #1 // h - y - 1 + lsl r3, r3, #2 // count * 4 + + ldr r12, [sp,#124] + + add r1, r1, r8 // in += x * 4 + + cmp r6, r5 + movhi r6, r5 // rup = min(r, y) + cmp r7, r5 + movhi r7, r5 // rdn = min(r, h - y - 1) + + vld1.u16 {d0,d1,d2,d3}, [r12]! + vld1.u16 {d4,d5,d6}, [r12]! + + adr lr, 1f + .irp r, TUNED_LIST4 + cmp r5, #\r + bls convolve4_\r + .endr + b convolve4_25 + +1: vpop {d8-d15} + pop {r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} +END(rsdIntrinsicBlurU4_K)
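
The fixed-point scheme in the diff above is easier to follow in scalar form. Below is a minimal C sketch (not part of the file) of the uchar1 pipeline for one output pixel: coefficients are Q0.16 with tab[0] as the centre tap and tab[i] serving offsets ±i, the vertical pass narrows to a Q8.7 intermediate (the #16 - FRACTION_BITS shift), and the horizontal pass shifts by #16 then #FRACTION_BITS to return to u8. The names gaussian_table() and blur_pixel_u1() and the radius-to-sigma mapping are assumptions for illustration, not the toolkit's actual API, and vqrshrn's saturation is omitted.

#include <stdint.h>
#include <stdlib.h>
#include <math.h>

enum { FRACTION_BITS = 7, MAX_R = 25 };

/* Q0.16 weights, normalised so centre + 2 * sides ~= 65536.
 * The sigma mapping below is an assumption for the sketch. */
static void gaussian_table(uint16_t *tab, int r) {
    double sigma = 0.4 * r + 0.6;
    double w[MAX_R + 1], total = 0.0;
    for (int i = 0; i <= r; i++) {
        w[i] = exp(-(double)(i * i) / (2.0 * sigma * sigma));
        total += (i == 0) ? w[i] : 2.0 * w[i];
    }
    for (int i = 0; i <= r; i++) {
        uint32_t q = (uint32_t)(w[i] / total * 65536.0 + 0.5);
        tab[i] = q > 65535 ? 65535 : (uint16_t)q; /* keep r=0 in range */
    }
}

/* Rounding right shift: the core of vqrshrn (saturation omitted). */
static uint32_t rshr(uint32_t v, int bits) {
    return (v + (1u << (bits - 1))) >> bits;
}

static uint8_t blur_pixel_u1(const uint8_t *in, int w, int h, int pitch,
                             int x, int y, const uint16_t *tab, int r) {
    uint16_t mid[2 * MAX_R + 1];
    /* Vertical pass: one Q8.7 column sum per window position.  The
     * assembly folds the symmetric top and bottom rows together first
     * (vaddl.u8), which is arithmetically the same thing. */
    for (int dx = -r; dx <= r; dx++) {
        int cx = x + dx;
        cx = cx < 0 ? 0 : cx >= w ? w - 1 : cx;     /* clamp columns */
        uint32_t acc = 0;
        for (int dy = -r; dy <= r; dy++) {
            int cy = y + dy;
            cy = cy < 0 ? 0 : cy >= h ? h - 1 : cy; /* the rup/rdn clamping */
            acc += (uint32_t)tab[abs(dy)] * in[cy * pitch + cx];
        }
        mid[dx + r] = (uint16_t)rshr(acc, 16 - FRACTION_BITS);
    }
    /* Horizontal pass over the intermediate window, then down to u8. */
    uint32_t acc = 0;
    for (int dx = -r; dx <= r; dx++)
        acc += (uint32_t)tab[abs(dx)] * mid[dx + r];
    return (uint8_t)rshr(rshr(acc, 16), FRACTION_BITS);
}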
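
The hconv4_12 and hconv4_25 spill buffer relies on one addressing trick: the ring is 0x200 bytes at a base whose low ten bits are clear (arranged by the sub/bic sequence in convolve4_\r on an 8-byte-aligned stack), so a pointer inside the ring plus any offset up to 0x1f8 can be wrapped by clearing bit 9 -- the recurring `bic r12, r12, #0x200` -- with no compare or subtract. A minimal C model of that wrap, with ring_at() as an illustrative name:

#include <stdint.h>

static uint16_t *ring_at(uint16_t *cursor, uintptr_t byte_offset) {
    /* Precondition: cursor lies inside a 0x200-byte ring whose base has
     * its low 10 bits clear, and byte_offset <= 0x1f8.  The sum then has
     * bit 9 set exactly when it has stepped past the end of the ring. */
    uintptr_t p = (uintptr_t)cursor + byte_offset;
    return (uint16_t *)(p & ~(uintptr_t)0x200);   /* clear bit 9 to wrap */
}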