author | Simon Hosie <simon.hosie@arm.com> | 2014-02-01 01:35:11 -0800 |
---|---|---|
committer | Simon Hosie <simon.hosie@arm.com> | 2014-03-04 10:15:10 -0800 |
commit | ccd7a46d0c0052209bf3ab8657f40622065d1d1f (patch) | |
tree | d00b94dfa2ec17dc6bd1bcf7a6cef04918c939ac /cpu_ref/rsCpuIntrinsics_advsimd_YuvToRGB.S | |
parent | 5d06919bc8019322180ea34768a7a4137fa64d11 (diff) | |
download | rs-ccd7a46d0c0052209bf3ab8657f40622065d1d1f.tar.gz |
Optimise YuvToRGB using 16-bit arithmetic.
Reimplement the YuvToRGB intrinsic using 16-bit SIMD arithmetic to increase
throughput, with implementations for both AArch32 and AArch64 NEON.
Change-Id: Idd43e383f5147c33b0b546fa822c970de432c19d
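
For orientation, the per-pixel arithmetic that the AArch64 kernel in this diff performs can be written in scalar form. The coefficients appear to be the usual BT.601 video-range factors scaled by 128 (1.164 ≈ 149/128, ~1.6 ≈ 204.5/128, 0.391 ≈ 50/128, 0.813 ≈ 104/128, ~2.02 ≈ 258/128). A full-precision product such as 149 * 255 + 204 * 255 ≈ 90,000 does not fit in 16 bits, which is why the NEON code below halves and saturates intermediates before the final narrowing shift. The following is a minimal scalar sketch, not part of the commit; the function and helper names are invented, and the vector code may differ from it by a least-significant bit in edge cases because of its halved, saturating intermediates.

```c
#include <stdint.h>

/* Hypothetical scalar reference for one pixel of the conversion that the
 * yuvkern macro performs sixteen pixels at a time.  Constants mirror the
 * comments in the assembly below. */
static inline uint8_t clamp_u8(int32_t x)
{
    return (uint8_t)(x < 0 ? 0 : (x > 255 ? 255 : x));
}

static void yuv_to_rgba_ref(uint8_t out[4], uint8_t y, uint8_t u, uint8_t v)
{
    /* All terms are scaled by 128: Y contributes 149, V contributes 204.5 to
     * red (204 * v plus v >> 1), U contributes 258 to blue (254 * u plus
     * u << 2), and green subtracts 50 * u + 104 * v.  Each constant bias
     * folds in the -16 offset on Y and the -128 offsets on U and V. */
    int32_t r = 149 * y + 204 * v + (v >> 1) - (16 * 149 + (128 >> 1) + 128 * 204);
    int32_t g = 149 * y - 50 * u - 104 * v + (-16 * 149 + 128 * 50 + 128 * 104);
    int32_t b = 149 * y + 254 * u + (u << 2) - (16 * 149 + (128 << 2) + 128 * 254);

    out[0] = clamp_u8((r + 64) >> 7);   /* round, then drop the x128 scale */
    out[1] = clamp_u8((g + 64) >> 7);
    out[2] = clamp_u8((b + 64) >> 7);
    out[3] = 0xff;                      /* constant alpha, as in v3 below */
}
```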
Diffstat (limited to 'cpu_ref/rsCpuIntrinsics_advsimd_YuvToRGB.S')
-rw-r--r-- | cpu_ref/rsCpuIntrinsics_advsimd_YuvToRGB.S | 292 |
1 file changed, 292 insertions, 0 deletions
diff --git a/cpu_ref/rsCpuIntrinsics_advsimd_YuvToRGB.S b/cpu_ref/rsCpuIntrinsics_advsimd_YuvToRGB.S
new file mode 100644
index 00000000..9232a796
--- /dev/null
+++ b/cpu_ref/rsCpuIntrinsics_advsimd_YuvToRGB.S
@@ -0,0 +1,292 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
+#define END(f) .size f, .-f;
+
+/* Perform the actual YuvToRGB conversion in a macro, from register to
+ * register.  This macro will be called from within several different wrapper
+ * variants for different data layouts.  Y data starts with the even and odd
+ * bytes split into the low parts of v8 and v9 respectively.  U and V are in
+ * v16 and v17.  Working constants are pre-loaded into v13-v15, and v3 is
+ * pre-loaded with a constant 0xff alpha channel.
+ *
+ * The complicated arithmetic is the result of refactoring the original
+ * equations to avoid 16-bit overflow without losing any precision.
+ */
+.macro yuvkern
+        movi        v7.8b, #149
+
+        umull       v1.8h, v8.8b, v7.8b         // g0 = y0 * 149
+        umull       v5.8h, v9.8b, v7.8b         // g1 = y1 * 149
+
+        movi        v7.8b, #50
+        movi        v10.8b, #104
+        umull       v8.8h, v16.8b, v7.8b        // g2 = u * 50 + v * 104
+        umlal       v8.8h, v17.8b, v10.8b
+
+        ushr        v7.8b, v17.8b, #1
+        uaddw       v0.8h, v1.8h, v7.8b         // r0 = y0 * 149 + (v >> 1)
+        uaddw       v4.8h, v5.8h, v7.8b         // r1 = y1 * 149 + (v >> 1)
+
+        ushll       v7.8h, v16.8b, #2
+        add         v2.8h, v1.8h, v7.8h         // b0 = y0 * 149 + (u << 2)
+        add         v6.8h, v5.8h, v7.8h         // b1 = y1 * 149 + (u << 2)
+
+        movi        v7.16b, #204
+        movi        v10.8b, #254
+        umull       v11.8h, v17.8b, v7.8b       // r2 = v * 204
+        umull       v12.8h, v16.8b, v10.8b      // b2 = u * 254
+
+        uhadd       v0.8h, v0.8h, v11.8h        // r0 = (r0 + r2) >> 1
+        uhadd       v4.8h, v4.8h, v11.8h        // r1 = (r1 + r2) >> 1
+        uqadd       v1.8h, v1.8h, v14.8h        // g0 = satu16(g0 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
+        uqadd       v5.8h, v5.8h, v14.8h        // g1 = satu16(g1 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
+        uhadd       v2.8h, v2.8h, v12.8h        // b0 = (b0 + b2) >> 1
+        uhadd       v6.8h, v6.8h, v12.8h        // b1 = (b1 + b2) >> 1
+
+        uqsub       v0.8h, v0.8h, v13.8h        // r0 = satu16(r0 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
+        uqsub       v4.8h, v4.8h, v13.8h        // r1 = satu16(r1 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
+        uqsub       v1.8h, v1.8h, v8.8h         // g0 = satu16(g0 - g2)
+        uqsub       v5.8h, v5.8h, v8.8h         // g1 = satu16(g1 - g2)
+        uqsub       v2.8h, v2.8h, v15.8h        // b0 = satu16(b0 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
+        uqsub       v6.8h, v6.8h, v15.8h        // b1 = satu16(b1 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
+
+        uqrshrn     v0.8b, v0.8h, #6
+        uqrshrn     v4.8b, v4.8h, #6
+        uqrshrn     v1.8b, v1.8h, #7
+        uqrshrn     v5.8b, v5.8h, #7
+        uqrshrn     v2.8b, v2.8h, #6
+        uqrshrn     v6.8b, v6.8h, #6
+
+        zip1        v0.16b, v0.16b, v4.16b
+        zip1        v1.16b, v1.16b, v5.16b
+        zip1        v2.16b, v2.16b, v6.16b
+.endm
+
+/* Define the wrapper code which will load and store the data, iterate the
+ * correct number of times, and safely handle the remainder at the end of the
+ * loop.  Some sections of code are switched out depending on the data packing
+ * being handled.
+ */
+.macro wrap_line kernel, interleaved=0, swapuv=0
+
+        mov         w5, #((16 * 149 + (128 >> 1) + 128 * 204) >> 1)
+        dup         v13.8h, w5
+        mov         w5, #((-16 * 149 + 128 * 50 + 128 * 104) >> 0)
+        dup         v14.8h, w5
+        mov         w5, #((16 * 149 + (128 << 2) + 128 * 254) >> 1)
+        dup         v15.8h, w5
+
+        movi        v3.16b, #0xff
+
+        subs        x2, x2, #16
+        bhs         1f
+        b           2f
+
+        .align 4
+1:      ld2         {v8.8b,v9.8b}, [x1], #16
+//      prfm        PLDL1STRM, [x1, #256]
+  .if \interleaved
+    .if \swapuv
+        ld2         {v17.8b,v18.8b}, [x3], #16
+        mov         v16.8b, v18.8b
+    .else
+        ld2         {v16.8b,v17.8b}, [x3], #16
+    .endif
+//      prfm        PLDL1STRM, [x3, #256]
+  .else
+        ld1         {v16.8b}, [x3], #8
+        ld1         {v17.8b}, [x4], #8
+//      prfm        PLDL1STRM, [x3, #128]
+//      prfm        PLDL1STRM, [x4, #128]
+  .endif
+
+        \kernel
+
+        subs        x2, x2, #16
+
+        st4         {v0.16b,v1.16b,v2.16b,v3.16b}, [x0], #64
+
+        bhs         1b
+
+2:      adds        x2, x2, #16
+        beq         2f
+
+        /* To handle the tail portion of the data (something less than 16
+         * bytes) load small power-of-two chunks into working registers.  It
+         * doesn't matter where they end up in the register; the same process
+         * will store them back out using the same positions and the
+         * interaction between neighbouring pixels is constrained to odd
+         * boundaries where the load operations don't interfere.
+         */
+        movi        v8.8b, #0
+        movi        v9.8b, #0
+        movi        v16.8b, #0
+        movi        v17.8b, #0
+
+        tbz         x2, #3, 1f
+        ld1         {v9.8b}, [x1], #8
+  .if \interleaved
+        ld1         {v17.8b}, [x3], #8
+  .else
+        ld1         {v16.s}[1], [x3], #4
+        ld1         {v17.s}[1], [x4], #4
+  .endif
+1:      tbz         x2, #2, 1f
+        ld1         {v8.s}[1], [x1], #4
+  .if \interleaved
+        ld1         {v16.s}[1], [x3], #4
+  .else
+        ld1         {v16.h}[1], [x3], #2
+        ld1         {v17.h}[1], [x4], #2
+  .endif
+1:      tbz         x2, #1, 1f
+        ld1         {v8.h}[1], [x1], #2
+  .if \interleaved
+        ld1         {v16.h}[1], [x3], #2
+  .else
+        ld1         {v16.b}[1], [x3], #1
+        ld1         {v17.b}[1], [x4], #1
+  .endif
+1:      tbz         x2, #0, 1f
+        ld1         {v8.b}[1], [x1], #1
+  .if \interleaved
+        ld1         {v16.b}[1], [x3], #1
+  .else
+        ld1         {v16.b}[0], [x3], #1
+        ld1         {v17.b}[0], [x4], #1
+  .endif
+
+        /* One small impediment in the process above is that some of the load
+         * operations can't perform byte-wise structure deinterleaving at the
+         * same time as loading only part of a register.  So the data is loaded
+         * linearly and unpacked manually at this point if necessary.
+         */
+1:      uzp1        v8.16b, v8.16b, v9.16b
+  .if \interleaved
+    .if \swapuv
+        uzp1        v16.16b, v17.16b, v16.16b
+    .else
+        uzp1        v16.16b, v16.16b, v17.16b
+    .endif
+  .endif
+
+        \kernel
+
+        /* As above but with the output; structured stores for partial vectors
+         * aren't available, so the data is re-packed first and stored linearly.
+         */
+        zip1        v4.16b, v0.16b, v2.16b
+        zip2        v6.16b, v0.16b, v2.16b
+        zip1        v5.16b, v1.16b, v3.16b
+        zip2        v7.16b, v1.16b, v3.16b
+        zip1        v0.16b, v4.16b, v5.16b
+        zip2        v1.16b, v4.16b, v5.16b
+        zip1        v2.16b, v6.16b, v7.16b
+        zip2        v3.16b, v6.16b, v7.16b
+
+1:      tbz         x2, #3, 1f
+        st1         {v2.16b,v3.16b}, [x0], #32
+1:      tbz         x2, #2, 1f
+        st1         {v1.16b}, [x0], #16
+1:      tbz         x2, #1, 1f
+        st1         {v0.d}[1], [x0], #8
+1:      tbz         x2, #0, 2f
+        st1         {v0.s}[1], [x0], #4
+2:
+.endm
+
+
+/* void rsdIntrinsicYuv2_K(
+ *          void *out,          // x0
+ *          void const *yin,    // x1
+ *          void const *uin,    // x2
+ *          void const *vin,    // x3
+ *          size_t xstart,      // x4
+ *          size_t xend);       // x5
+ */
+ENTRY(rsdIntrinsicYuv2_K)
+        lsr         x6, x4, #1
+        add         x0, x0, x4, LSL #2
+        add         x1, x1, x4
+        add         x4, x3, x6
+        add         x3, x2, x6
+        sub         x2, x5, x6, LSL #2
+
+        sub         x6, sp, #32
+        sub         sp, sp, #64
+        st1         {v8.1d - v11.1d}, [sp]
+        st1         {v12.1d - v15.1d}, [x6]
+
+        wrap_line   yuvkern, 0
+
+        ld1         {v8.1d - v11.1d}, [sp], #32
+        ld1         {v12.1d - v15.1d}, [sp], #32
+        ret
+END(rsdIntrinsicYuv2_K)
+
+/* void rsdIntrinsicYuv_K(
+ *          void *out,          // x0
+ *          void const *yin,    // x1
+ *          void const *uvin,   // x2
+ *          size_t xstart,      // x3
+ *          size_t xend);       // x4
+ */
+ENTRY(rsdIntrinsicYuv_K)
+        bic         x5, x4, #1
+        add         x0, x0, x5, LSL #2
+        add         x1, x1, x5
+        add         x3, x2, x5
+        sub         x2, x4, x5
+
+        sub         x5, sp, #32
+        sub         sp, sp, #64
+        st1         {v8.1d - v11.1d}, [sp]
+        st1         {v12.1d - v15.1d}, [x5]
+
+        wrap_line   yuvkern, 1, 1
+
+        ld1         {v8.1d - v11.1d}, [sp], #32
+        ld1         {v12.1d - v15.1d}, [sp], #32
+        ret
+END(rsdIntrinsicYuv_K)
+
+/* void rsdIntrinsicYuvR_K(
+ *          void *out,          // x0
+ *          void const *yin,    // x1
+ *          void const *uvin,   // x2
+ *          size_t xstart,      // x3
+ *          size_t xend);       // x4
+ */
+ENTRY(rsdIntrinsicYuvR_K)
+        bic         x5, x4, #1
+        add         x0, x0, x5, LSL #2
+        add         x1, x1, x5
+        add         x3, x2, x5
+        sub         x2, x4, x5
+
+        sub         x5, sp, #32
+        sub         sp, sp, #64
+        st1         {v8.1d - v11.1d}, [sp]
+        st1         {v12.1d - v15.1d}, [x5]
+
+        wrap_line   yuvkern, 1
+
+        ld1         {v8.1d - v11.1d}, [sp], #32
+        ld1         {v12.1d - v15.1d}, [sp], #32
+        ret
+END(rsdIntrinsicYuvR_K)
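
The tail handling in the wrap_line macro above (the tbz ladders) is the familiar trick of consuming a remainder smaller than the vector width in power-of-two chunks selected by the bits of the remaining count, so no load or store runs past the end of a buffer. A minimal sketch of the same idea in C, with invented names and plain byte copies standing in for the partial vector loads and stores:

```c
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Hypothetical illustration only: copy a tail of n < 16 bytes in chunks of
 * 8, 4, 2 and 1, chosen by the bits of n, mirroring the tbz tests on x2. */
static void copy_tail(uint8_t *dst, const uint8_t *src, size_t n /* < 16 */)
{
    if (n & 8) { memcpy(dst, src, 8); dst += 8; src += 8; }
    if (n & 4) { memcpy(dst, src, 4); dst += 4; src += 4; }
    if (n & 2) { memcpy(dst, src, 2); dst += 2; src += 2; }
    if (n & 1) { *dst = *src; }
}
```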
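
The entry points advance out, yin and the chroma pointers by xstart themselves (the add instructions after each ENTRY), so a caller appears to pass whole-plane or whole-row base pointers plus an [xstart, xend) span. The following hypothetical caller for the semi-planar variant assumes that interpretation and invented stride parameters; the real RenderScript CPU driver wires these kernels up from its own launch parameters.

```c
#include <stddef.h>
#include <stdint.h>

/* Prototype as documented in the comment block above ENTRY(rsdIntrinsicYuv_K). */
extern void rsdIntrinsicYuv_K(void *out, void const *yin, void const *uvin,
                              size_t xstart, size_t xend);

/* Hypothetical frame loop: semi-planar YUV (interleaved U/V bytes, one pair
 * per 2x2 block) converted to 32-bit RGBA, one row per call. */
static void convert_frame(uint32_t *rgba, const uint8_t *y_plane,
                          const uint8_t *uv_plane, size_t width, size_t height,
                          size_t y_stride, size_t uv_stride)
{
    for (size_t row = 0; row < height; row++) {
        /* The chroma plane is vertically subsampled: one UV row per two Y rows. */
        rsdIntrinsicYuv_K(rgba + row * width,
                          y_plane + row * y_stride,
                          uv_plane + (row / 2) * uv_stride,
                          0, width);
    }
}
```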