diff options
author | Simon Hosie <simon.hosie@arm.com> | 2014-03-16 12:24:44 -0700 |
---|---|---|
committer | Simon Hosie <simon.hosie@arm.com> | 2014-03-25 11:49:37 -0700 |
commit | ea76eb386a2d851d50be69ebeb7ae593f84a5be9 (patch) | |
tree | 4230fb10e6584ccc44c29e2004d1f178f33674f6 | |
parent | 0462a39371659d1eeed5eb48dd6d507760301c22 (diff) | |
download | rs-ea76eb386a2d851d50be69ebeb7ae593f84a5be9.tar.gz |
Make Blur AArch64 assembly position-independent.
Change-Id: I426fba9fff3ac165f5be5f78e2458dbc3b59ab02
-rw-r--r-- | cpu_ref/rsCpuIntrinsics_advsimd_Blur.S | 303 |
1 file changed, 150 insertions, 153 deletions
diff --git a/cpu_ref/rsCpuIntrinsics_advsimd_Blur.S b/cpu_ref/rsCpuIntrinsics_advsimd_Blur.S index 202f903a..c4a85c2c 100644 --- a/cpu_ref/rsCpuIntrinsics_advsimd_Blur.S +++ b/cpu_ref/rsCpuIntrinsics_advsimd_Blur.S @@ -15,6 +15,7 @@ */ #define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f: +#define PRIVATE(f) .text; .align 4; .type f,#function; f: #define END(f) .size f, .-f; .set FRACTION_BITS, 7 @@ -54,7 +55,7 @@ * q0-q3 -- coefficient table * x13 = -pitch * x15 = top-row in - * x16 = bottom-row in + * x19 = bottom-row in * Output: * x1 += 16 * q10,q11 -- 16 convolved columns @@ -82,7 +83,7 @@ umull v12.4s, v14.4h, v0.h[0] ifcc sub \reg, \reg, x5, LSL #6 umull2 v13.4s, v14.8h, v0.h[0] - mov x11, x16 + mov x11, x19 umull v14.4s, v15.4h, v0.h[0] ifcc add \reg, \reg, x5, LSL #3 umull2 v15.4s, v15.8h, v0.h[0] @@ -101,7 +102,7 @@ uaddl v16.8h, v10.8b, v11.8b ifcc cmp x7, #i uaddl2 v11.8h, v10.16b, v11.16b - ifcc csel x11, x16, x11, lo + ifcc csel x11, x19, x11, lo umlal v12.4s, v16.4h, v\dreg\doth[\lane] umlal2 v13.4s, v16.8h, v\dreg\doth[\lane] // prfm PLDL1KEEP,[x10, #32] // TODO: confirm @@ -123,7 +124,7 @@ nop uqrshrn v10.4h, v12.4s, #16 - FRACTION_BITS add x15, x15, #16 uqrshrn2 v10.8h, v13.4s, #16 - FRACTION_BITS - add x16, x16, #16 + add x19, x19, #16 uqrshrn v11.4h, v14.4s, #16 - FRACTION_BITS uqrshrn2 v11.8h, v15.4s, #16 - FRACTION_BITS .endm /*}}}*/ @@ -142,16 +143,16 @@ nop * more data that won't be used and it means that rotating the window involves * more mov operations. * - * When the buffer gets too big the buffer at [r9] is used. + * When the buffer gets too big the buffer at [x9] is used. 
* * Input: * q4-q11 -- convoltion window - * r9 -- pointer to additional convolution window data + * x9 -- pointer to additional convolution window data * Output: - * r9 -- updated buffer pointer (if used) + * x9 -- updated buffer pointer (if used) * d31 -- result to be stored * Modifies: - * r12 -- temp buffer pointer + * x12 -- temp buffer pointer * q12-q13 -- temporaries for load and vext operations. * q14-q15 -- intermediate sums */ @@ -160,17 +161,19 @@ nop umull v14.4s, v9.4h, v0.h[0] umull2 v15.4s, v9.8h, v0.h[0] - adr x12, 199f-8 - ldr x12, [x12, x5, LSL #3] + adr x16, 100f + ldrsh x12, [x16, x5, LSL #1] + add x12, x12, x16 br x12 - 199: .xword 101f - .xword 102f - .xword 103f - .xword 104f - .xword 105f - .xword 106f - .xword 107f - .xword 108f + 100: .hword -4 + .hword 101f-100b + .hword 102f-100b + .hword 103f-100b + .hword 104f-100b + .hword 105f-100b + .hword 106f-100b + .hword 107f-100b + .hword 108f-100b .align 4 108: umlal v14.4s, v8.4h, v1.h[0] umlal2 v15.4s, v8.8h, v1.h[0] @@ -232,25 +235,27 @@ nop umull v14.4s, v8.4h, v0.h[0] umull2 v15.4s, v8.8h, v0.h[0] - adr x12, 199f-8 - ldr x12, [x12, x5, LSL #3] + adr x16, 100f + ldrsh x12, [x16, x5, LSL #1] + add x12, x12, x16 br x12 - 199: .xword 101f - .xword 102f - .xword 103f - .xword 104f - .xword 105f - .xword 106f - .xword 107f - .xword 108f - .xword 109f - .xword 110f - .xword 111f - .xword 112f - .xword 113f - .xword 114f - .xword 115f - .xword 116f + 100: .hword -4 + .hword 101f-100b + .hword 102f-100b + .hword 103f-100b + .hword 104f-100b + .hword 105f-100b + .hword 106f-100b + .hword 107f-100b + .hword 108f-100b + .hword 109f-100b + .hword 110f-100b + .hword 111f-100b + .hword 112f-100b + .hword 113f-100b + .hword 114f-100b + .hword 115f-100b + .hword 116f-100b .align 4 116: //ext v12.16b, v6.16b, v7.16b, #0*2 //ext v13.16b, v10.16b, v11.16b, #0*2 @@ -365,34 +370,36 @@ nop umull v14.4s, v12.4h, v0.h[0] umull2 v15.4s, v12.8h, v0.h[0] - adr x12, 199f-8 - ldr x12, [x12, x5, LSL #3] + adr x16, 100f 
+ ldrsh x12, [x16, x5, LSL #1] + add x12, x12, x16 br x12 - 199: .xword 101f - .xword 102f - .xword 103f - .xword 104f - .xword 105f - .xword 106f - .xword 107f - .xword 108f - .xword 109f - .xword 110f - .xword 111f - .xword 112f - .xword 113f - .xword 114f - .xword 115f - .xword 116f - .xword 117f - .xword 118f - .xword 119f - .xword 120f - .xword 121f - .xword 122f - .xword 123f - .xword 124f - .xword 125f + 100: .hword -4 + .hword 101f-100b + .hword 102f-100b + .hword 103f-100b + .hword 104f-100b + .hword 105f-100b + .hword 106f-100b + .hword 107f-100b + .hword 108f-100b + .hword 109f-100b + .hword 110f-100b + .hword 111f-100b + .hword 112f-100b + .hword 113f-100b + .hword 114f-100b + .hword 115f-100b + .hword 116f-100b + .hword 117f-100b + .hword 118f-100b + .hword 119f-100b + .hword 120f-100b + .hword 121f-100b + .hword 122f-100b + .hword 123f-100b + .hword 124f-100b + .hword 125f-100b .align 4 125: ext v12.16b, v3.16b, v4.16b, #6*2 ext v13.16b, v10.16b, v11.16b, #0*2 @@ -564,15 +571,17 @@ nop umull v14.4s, v7.4h, v0.h[0] umull2 v15.4s, v7.8h, v0.h[0] - adr x12, 199f-8 - ldr x12, [x12, x5, LSL #3] + adr x16, 100f + ldrsh x12, [x16, x5, LSL #1] + add x12, x12, x16 br x12 - 199: .xword 101f - .xword 102f - .xword 103f - .xword 104f - .xword 105f - .xword 106f + 100: .hword -4 + .hword 101f-100b + .hword 102f-100b + .hword 103f-100b + .hword 104f-100b + .hword 105f-100b + .hword 106f-100b .align 4 106: umlal v14.4s, v4.4h, v0.h[6] umlal2 v15.4s, v4.8h, v0.h[6] @@ -616,21 +625,23 @@ nop umull v14.4s, v4.4h, v0.h[0] umull2 v15.4s, v4.8h, v0.h[0] - adr x12, 199f-8 - ldr x12, [x12, x5, LSL #3] + adr x16, 100f + ldrsh x12, [x16, x5, LSL #1] + add x12, x12, x16 br x12 - 199: .xword 101f - .xword 102f - .xword 103f - .xword 104f - .xword 105f - .xword 106f - .xword 107f - .xword 108f - .xword 109f - .xword 110f - .xword 111f - .xword 112f + 100: .hword -4 + .hword 101f-100b + .hword 102f-100b + .hword 103f-100b + .hword 104f-100b + .hword 105f-100b + .hword 106f-100b + 
.hword 107f-100b + .hword 108f-100b + .hword 109f-100b + .hword 110f-100b + .hword 111f-100b + .hword 112f-100b .align 4 112: add x12, x9, #0x1a0 bic x12, x12, #0x200 @@ -751,34 +762,36 @@ nop umull v14.4s, v12.4h, v0.h[0] umull v15.4s, v13.4h, v0.h[0] - adr x12, 199f-8 - ldr x12, [x12, x5, LSL #3] + adr x16, 100f + ldrsh x12, [x16, x5, LSL #1] + add x12, x12, x16 br x12 - 199: .xword 101f - .xword 102f - .xword 103f - .xword 104f - .xword 105f - .xword 106f - .xword 107f - .xword 108f - .xword 109f - .xword 110f - .xword 111f - .xword 112f - .xword 113f - .xword 114f - .xword 115f - .xword 116f - .xword 117f - .xword 118f - .xword 119f - .xword 120f - .xword 121f - .xword 122f - .xword 123f - .xword 124f - .xword 125f + 100: .hword -4 + .hword 101f-100b + .hword 102f-100b + .hword 103f-100b + .hword 104f-100b + .hword 105f-100b + .hword 106f-100b + .hword 107f-100b + .hword 108f-100b + .hword 109f-100b + .hword 110f-100b + .hword 111f-100b + .hword 112f-100b + .hword 113f-100b + .hword 114f-100b + .hword 115f-100b + .hword 116f-100b + .hword 117f-100b + .hword 118f-100b + .hword 119f-100b + .hword 120f-100b + .hword 121f-100b + .hword 122f-100b + .hword 123f-100b + .hword 124f-100b + .hword 125f-100b .align 4 125: add x12, x9, #0x0d0 bic x12, x12, #0x200 @@ -1043,7 +1056,7 @@ nop /* Dedicated function wrapper for the fetch macro, for the cases where * performance isn't that important, to keep code size down. */ -ENTRY(fetch_generic_asm) +PRIVATE(fetch_generic_asm) stp x10, x11, [sp, #-16]! fetch ldp x10, x11, [sp], #16 @@ -1055,10 +1068,10 @@ END(fetch_generic_asm) * hand edge of the window when starting too close to the right hand edge of * the image. 
*/ -ENTRY(prefetch_clamp1) +PRIVATE(prefetch_clamp1) sub x11, xzr, x11 sub x15, x15, x1 - sub x16, x16, x1 + sub x19, x19, x1 tbz x11, #3, 1f mov v11.16b, v10.16b sub x1, x1, #16 @@ -1084,14 +1097,14 @@ ENTRY(prefetch_clamp1) mov v11.16b, v12.16b 1: sub x11, xzr, x11 add x15, x15, x1 - add x16, x16, x1 + add x19, x19, x1 ret END(prefetch_clamp1) -ENTRY(prefetch_clamp4) +PRIVATE(prefetch_clamp4) sub x11, xzr, x11 sub x15, x15, x1 - sub x16, x16, x1 + sub x19, x19, x1 tbz x11, #3, 1f sub x1, x1, #16 // what's this? mov v11.16b, v10.16b @@ -1105,7 +1118,7 @@ ENTRY(prefetch_clamp4) mov v11.16b, v12.16b 1: sub x11, xzr, x11 add x15, x15, x1 - add x16, x16, x1 + add x19, x19, x1 ret END(prefetch_clamp4) @@ -1174,7 +1187,7 @@ END(prefetch_clamp4) * x9 -- buffer (if needed) * x13 = -pitch * x15 = top-row in - * x16 = bottom-row in + * x19 = bottom-row in * Output: * x1 += rlf + min(count, rrt) * Modifies: @@ -1221,11 +1234,11 @@ END(prefetch_clamp4) .endif 1: sub x1, x1, x10 sub x15, x15, x10 - sub x16, x16, x10 + sub x19, x19, x10 bic x10, x10, #15 add x1, x1, x10 add x15, x15, x10 - add x16, x16, x10 + add x19, x19, x10 2: .if \step > 1 /* it's only in the uchar2 and uchar4 cases where the register file @@ -1276,7 +1289,7 @@ END(prefetch_clamp4) * x9 = buffer * x13 = -pitch * x15 = top-row in - * x16 = bottom-row in + * x19 = bottom-row in * Modifies * x8 = fetch code pointer */ @@ -1324,10 +1337,10 @@ END(prefetch_clamp4) 1: sub x1, x1, #16 sub x15, x15, #16 - sub x16, x16, #16 + sub x19, x19, #16 add x1, x1, x4 add x15, x15, x4 - add x16, x16, x4 + add x19, x19, x4 bl fetch_generic_asm .if \step==1 @@ -1373,7 +1386,7 @@ END(prefetch_clamp4) .endm .irep r, TUNED_LIST1, 25 -ENTRY(convolve1_\r) +PRIVATE(convolve1_\r) stp x29,x30, [sp, #-16]! 
prefetch step=1, max_r=\r @@ -1386,7 +1399,7 @@ END(convolve1_\r) .endr .irep r, TUNED_LIST4, 25 -ENTRY(convolve4_\r) +PRIVATE(convolve4_\r) sub x12, sp, #0x200 bic x9, x12, #0x3fc mov sp, x9 @@ -1421,17 +1434,13 @@ END(convolve4_\r) * uint16_t *tab); // [sp,#8] */ ENTRY(rsdIntrinsicBlurU1_K) - stp x16,x30, [sp, #-80]! - stp x14,x15, [sp, #16] - stp x12,x13, [sp, #32] - stp x10,x11, [sp, #48] - stp x8,x9, [sp, #64] + stp x19,x30, [sp, #-16]! sub x8, sp, #32 sub sp, sp, #64 st1 {v8.1d - v11.1d}, [sp] st1 {v12.1d - v15.1d}, [x8] mov x8, x5 // x - ldr w5, [sp,#144] // r + ldr w5, [sp,#80] // r sub x9, x2, x8 sub x10, x3, x6 mov x2, x4 // pitch @@ -1439,7 +1448,7 @@ ENTRY(rsdIntrinsicBlurU1_K) sub x7, x10, #1 sub x9, x9, x3 - ldr x12, [sp, #152] // tab + ldr x12, [sp, #88] // tab add x0, x0, x8 add x1, x1, x8 @@ -1460,7 +1469,7 @@ ENTRY(rsdIntrinsicBlurU1_K) sub x13, xzr, x2 msub x15, x2, x6, x1 - madd x16, x2, x7, x1 + madd x19, x2, x7, x1 ld1 {v0.8h,v1.8h}, [x12], #32 ld1 {v2.8h,v3.8h}, [x12], #32 @@ -1474,11 +1483,7 @@ ENTRY(rsdIntrinsicBlurU1_K) 1: ld1 {v8.1d - v11.1d}, [sp], #32 ld1 {v12.1d - v15.1d}, [sp], #32 - ldp x8,x9, [sp, #64] - ldp x10,x11, [sp, #48] - ldp x12,x13, [sp, #32] - ldp x14,x15, [sp, #16] - ldp x12,x30, [sp], #80 + ldp x19,x30, [sp], #16 ret END(rsdIntrinsicBlurU1_K) @@ -1495,17 +1500,13 @@ END(rsdIntrinsicBlurU1_K) * uint16_t *tab); // [sp,#8] */ ENTRY(rsdIntrinsicBlurU4_K) - stp x16,x30, [sp, #-80]! - stp x14,x15, [sp, #16] - stp x12,x13, [sp, #32] - stp x10,x11, [sp, #48] - stp x8,x9, [sp, #64] + stp x19,x30, [sp, #-16]! 
sub x8, sp, #32 sub sp, sp, #64 st1 {v8.1d - v11.1d}, [sp] st1 {v12.1d - v15.1d}, [x8] mov x8, x5 // x - ldr w5, [sp,#144] // r + ldr w5, [sp,#80] // r sub x9, x2, x8 sub x10, x3, x6 mov x2, x4 // pitch @@ -1513,7 +1514,7 @@ ENTRY(rsdIntrinsicBlurU4_K) sub x7, x10, #1 sub x9, x9, x3 - ldr x12, [sp, #152] + ldr x12, [sp, #88] add x0, x0, x8, LSL #2 add x1, x1, x8, LSL #2 @@ -1535,7 +1536,7 @@ ENTRY(rsdIntrinsicBlurU4_K) sub x13, xzr, x2 msub x15, x2, x6, x1 - madd x16, x2, x7, x1 + madd x19, x2, x7, x1 ld1 {v0.8h,v1.8h}, [x12], #32 ld1 {v2.8h,v3.8h}, [x12], #32 @@ -1549,10 +1550,6 @@ ENTRY(rsdIntrinsicBlurU4_K) 1: ld1 {v8.1d - v11.1d}, [sp], #32 ld1 {v12.1d - v15.1d}, [sp], #32 - ldp x8,x9, [sp, #64] - ldp x10,x11, [sp, #48] - ldp x12,x13, [sp, #32] - ldp x14,x15, [sp, #16] - ldp x12,x30, [sp], #80 + ldp x19,x30, [sp], #16 ret END(rsdIntrinsicBlurU4_K) |