author | Simon Hosie <simon.hosie@arm.com> | 2014-02-01 01:35:11 -0800 |
---|---|---|
committer | Simon Hosie <simon.hosie@arm.com> | 2014-03-04 10:15:10 -0800 |
commit | ccd7a46d0c0052209bf3ab8657f40622065d1d1f (patch) | |
tree | d00b94dfa2ec17dc6bd1bcf7a6cef04918c939ac /cpu_ref/rsCpuIntrinsics_advsimd_YuvToRGB.S | |
parent | 5d06919bc8019322180ea34768a7a4137fa64d11 (diff) | |
download | rs-ccd7a46d0c0052209bf3ab8657f40622065d1d1f.tar.gz |
Optimise YuvToRGB using 16-bit arithmetic.
Reimplement the YuvToRGB intrinsic using 16-bit SIMD arithmetic to increase
throughput, with implementations for both AArch32 and AArch64 NEON.
Change-Id: Idd43e383f5147c33b0b546fa822c970de432c19d
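
For orientation, the per-pixel arithmetic that the AArch64 kernel in this diff performs can be written in scalar form. The coefficients appear to be the usual BT.601 video-range factors scaled by 128 (1.164 ≈ 149/128, ~1.6 ≈ 204.5/128, 0.391 ≈ 50/128, 0.813 ≈ 104/128, ~2.02 ≈ 258/128). A full-precision product such as 149 * 255 + 204 * 255 ≈ 90,000 does not fit in 16 bits, which is why the NEON code below halves and saturates intermediates before the final narrowing shift. The following is a minimal scalar sketch, not part of the commit; the function and helper names are invented, and the vector code may differ from it by a least-significant bit in edge cases because of its halved, saturating intermediates.

```c
#include <stdint.h>

/* Hypothetical scalar reference for one pixel of the conversion that the
 * yuvkern macro performs sixteen pixels at a time.  Constants mirror the
 * comments in the assembly below. */
static inline uint8_t clamp_u8(int32_t x)
{
    return (uint8_t)(x < 0 ? 0 : (x > 255 ? 255 : x));
}

static void yuv_to_rgba_ref(uint8_t out[4], uint8_t y, uint8_t u, uint8_t v)
{
    /* All terms are scaled by 128: Y contributes 149, V contributes 204.5 to
     * red (204 * v plus v >> 1), U contributes 258 to blue (254 * u plus
     * u << 2), and green subtracts 50 * u + 104 * v.  Each constant bias
     * folds in the -16 offset on Y and the -128 offsets on U and V. */
    int32_t r = 149 * y + 204 * v + (v >> 1) - (16 * 149 + (128 >> 1) + 128 * 204);
    int32_t g = 149 * y - 50 * u - 104 * v + (-16 * 149 + 128 * 50 + 128 * 104);
    int32_t b = 149 * y + 254 * u + (u << 2) - (16 * 149 + (128 << 2) + 128 * 254);

    out[0] = clamp_u8((r + 64) >> 7);   /* round, then drop the x128 scale */
    out[1] = clamp_u8((g + 64) >> 7);
    out[2] = clamp_u8((b + 64) >> 7);
    out[3] = 0xff;                      /* constant alpha, as in v3 below */
}
```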
Diffstat (limited to 'cpu_ref/rsCpuIntrinsics_advsimd_YuvToRGB.S')
-rw-r--r-- | cpu_ref/rsCpuIntrinsics_advsimd_YuvToRGB.S | 292 |
1 file changed, 292 insertions, 0 deletions
diff --git a/cpu_ref/rsCpuIntrinsics_advsimd_YuvToRGB.S b/cpu_ref/rsCpuIntrinsics_advsimd_YuvToRGB.S
new file mode 100644
index 00000000..9232a796
--- /dev/null
+++ b/cpu_ref/rsCpuIntrinsics_advsimd_YuvToRGB.S
@@ -0,0 +1,292 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
+#define END(f) .size f, .-f;
+
+/* Perform the actual YuvToRGB conversion in a macro, from register to
+ * register.  This macro will be called from within several different wrapper
+ * variants for different data layouts.  Y data starts with the even and odd
+ * bytes split into the low parts of v8 and v9 respectively.  U and V are in
+ * v16 and v17.  Working constants are pre-loaded into v13-v15, and v3 is
+ * pre-loaded with a constant 0xff alpha channel.
+ *
+ * The complicated arithmetic is the result of refactoring the original
+ * equations to avoid 16-bit overflow without losing any precision.
+ */
+.macro yuvkern
+        movi        v7.8b, #149
+
+        umull       v1.8h, v8.8b, v7.8b         // g0 = y0 * 149
+        umull       v5.8h, v9.8b, v7.8b         // g1 = y1 * 149
+
+        movi        v7.8b, #50
+        movi        v10.8b, #104
+        umull       v8.8h, v16.8b, v7.8b        // g2 = u * 50 + v * 104
+        umlal       v8.8h, v17.8b, v10.8b
+
+        ushr        v7.8b, v17.8b, #1
+        uaddw       v0.8h, v1.8h, v7.8b         // r0 = y0 * 149 + (v >> 1)
+        uaddw       v4.8h, v5.8h, v7.8b         // r1 = y1 * 149 + (v >> 1)
+
+        ushll       v7.8h, v16.8b, #2
+        add         v2.8h, v1.8h, v7.8h         // b0 = y0 * 149 + (u << 2)
+        add         v6.8h, v5.8h, v7.8h         // b1 = y1 * 149 + (u << 2)
+
+        movi        v7.16b, #204
+        movi        v10.8b, #254
+        umull       v11.8h, v17.8b, v7.8b       // r2 = v * 204
+        umull       v12.8h, v16.8b, v10.8b      // b2 = u * 254
+
+        uhadd       v0.8h, v0.8h, v11.8h        // r0 = (r0 + r2) >> 1
+        uhadd       v4.8h, v4.8h, v11.8h        // r1 = (r1 + r2) >> 1
+        uqadd       v1.8h, v1.8h, v14.8h        // g0 = satu16(g0 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
+        uqadd       v5.8h, v5.8h, v14.8h        // g1 = satu16(g1 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
+        uhadd       v2.8h, v2.8h, v12.8h        // b0 = (b0 + b2) >> 1
+        uhadd       v6.8h, v6.8h, v12.8h        // b1 = (b1 + b2) >> 1
+
+        uqsub       v0.8h, v0.8h, v13.8h        // r0 = satu16(r0 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
+        uqsub       v4.8h, v4.8h, v13.8h        // r1 = satu16(r1 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
+        uqsub       v1.8h, v1.8h, v8.8h         // g0 = satu16(g0 - g2)
+        uqsub       v5.8h, v5.8h, v8.8h         // g1 = satu16(g1 - g2)
+        uqsub       v2.8h, v2.8h, v15.8h        // b0 = satu16(b0 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
+        uqsub       v6.8h, v6.8h, v15.8h        // b1 = satu16(b1 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
+
+        uqrshrn     v0.8b, v0.8h, #6
+        uqrshrn     v4.8b, v4.8h, #6
+        uqrshrn     v1.8b, v1.8h, #7
+        uqrshrn     v5.8b, v5.8h, #7
+        uqrshrn     v2.8b, v2.8h, #6
+        uqrshrn     v6.8b, v6.8h, #6
+
+        zip1        v0.16b, v0.16b, v4.16b
+        zip1        v1.16b, v1.16b, v5.16b
+        zip1        v2.16b, v2.16b, v6.16b
+.endm
+
+/* Define the wrapper code which will load and store the data, iterate the
+ * correct number of times, and safely handle the remainder at the end of the
+ * loop.  Some sections of code are switched out depending on the data packing
+ * being handled.
+ */
+.macro wrap_line kernel, interleaved=0, swapuv=0
+
+        mov         w5, #((16 * 149 + (128 >> 1) + 128 * 204) >> 1)
+        dup         v13.8h, w5
+        mov         w5, #((-16 * 149 + 128 * 50 + 128 * 104) >> 0)
+        dup         v14.8h, w5
+        mov         w5, #((16 * 149 + (128 << 2) + 128 * 254) >> 1)
+        dup         v15.8h, w5
+
+        movi        v3.16b, #0xff
+
+        subs        x2, x2, #16
+        bhs         1f
+        b           2f
+
+        .align 4
+1:      ld2         {v8.8b,v9.8b}, [x1], #16
+//      prfm        PLDL1STRM, [x1, #256]
+  .if \interleaved
+    .if \swapuv
+        ld2         {v17.8b,v18.8b}, [x3], #16
+        mov         v16.8b, v18.8b
+    .else
+        ld2         {v16.8b,v17.8b}, [x3], #16
+    .endif
+//      prfm        PLDL1STRM, [x3, #256]
+  .else
+        ld1         {v16.8b}, [x3], #8
+        ld1         {v17.8b}, [x4], #8
+//      prfm        PLDL1STRM, [x3, #128]
+//      prfm        PLDL1STRM, [x4, #128]
+  .endif
+
+        \kernel
+
+        subs        x2, x2, #16
+
+        st4         {v0.16b,v1.16b,v2.16b,v3.16b}, [x0], #64
+
+        bhs         1b
+
+2:      adds        x2, x2, #16
+        beq         2f
+
+        /* To handle the tail portion of the data (something less than 16
+         * bytes) load small power-of-two chunks into working registers.  It
+         * doesn't matter where they end up in the register; the same process
+         * will store them back out using the same positions and the
+         * interaction between neighbouring pixels is constrained to odd
+         * boundaries where the load operations don't interfere.
+         */
+        movi        v8.8b, #0
+        movi        v9.8b, #0
+        movi        v16.8b, #0
+        movi        v17.8b, #0
+
+        tbz         x2, #3, 1f
+        ld1         {v9.8b}, [x1], #8
+  .if \interleaved
+        ld1         {v17.8b}, [x3], #8
+  .else
+        ld1         {v16.s}[1], [x3], #4
+        ld1         {v17.s}[1], [x4], #4
+  .endif
+1:      tbz         x2, #2, 1f
+        ld1         {v8.s}[1], [x1], #4
+  .if \interleaved
+        ld1         {v16.s}[1], [x3], #4
+  .else
+        ld1         {v16.h}[1], [x3], #2
+        ld1         {v17.h}[1], [x4], #2
+  .endif
+1:      tbz         x2, #1, 1f
+        ld1         {v8.h}[1], [x1], #2
+  .if \interleaved
+        ld1         {v16.h}[1], [x3], #2
+  .else
+        ld1         {v16.b}[1], [x3], #1
+        ld1         {v17.b}[1], [x4], #1
+  .endif
+1:      tbz         x2, #0, 1f
+        ld1         {v8.b}[1], [x1], #1
+  .if \interleaved
+        ld1         {v16.b}[1], [x3], #1
+  .else
+        ld1         {v16.b}[0], [x3], #1
+        ld1         {v17.b}[0], [x4], #1
+  .endif
+
+        /* One small impediment in the process above is that some of the load
+         * operations can't perform byte-wise structure deinterleaving at the
+         * same time as loading only part of a register.  So the data is loaded
+         * linearly and unpacked manually at this point if necessary.
+         */
+1:      uzp1        v8.16b, v8.16b, v9.16b
+  .if \interleaved
+    .if \swapuv
+        uzp1        v16.16b, v17.16b, v16.16b
+    .else
+        uzp1        v16.16b, v16.16b, v17.16b
+    .endif
+  .endif
+
+        \kernel
+
+        /* As above but with the output; structured stores for partial vectors
+         * aren't available, so the data is re-packed first and stored linearly.
+         */
+        zip1        v4.16b, v0.16b, v2.16b
+        zip2        v6.16b, v0.16b, v2.16b
+        zip1        v5.16b, v1.16b, v3.16b
+        zip2        v7.16b, v1.16b, v3.16b
+        zip1        v0.16b, v4.16b, v5.16b
+        zip2        v1.16b, v4.16b, v5.16b
+        zip1        v2.16b, v6.16b, v7.16b
+        zip2        v3.16b, v6.16b, v7.16b
+
+1:      tbz         x2, #3, 1f
+        st1         {v2.16b,v3.16b}, [x0], #32
+1:      tbz         x2, #2, 1f
+        st1         {v1.16b}, [x0], #16
+1:      tbz         x2, #1, 1f
+        st1         {v0.d}[1], [x0], #8
+1:      tbz         x2, #0, 2f
+        st1         {v0.s}[1], [x0], #4
+2:
+.endm
+
+
+/* void rsdIntrinsicYuv2_K(
+ *          void *out,          // x0
+ *          void const *yin,    // x1
+ *          void const *uin,    // x2
+ *          void const *vin,    // x3
+ *          size_t xstart,      // x4
+ *          size_t xend);       // x5
+ */
+ENTRY(rsdIntrinsicYuv2_K)
+        lsr         x6, x4, #1
+        add         x0, x0, x4, LSL #2
+        add         x1, x1, x4
+        add         x4, x3, x6
+        add         x3, x2, x6
+        sub         x2, x5, x6, LSL #2
+
+        sub         x6, sp, #32
+        sub         sp, sp, #64
+        st1         {v8.1d - v11.1d}, [sp]
+        st1         {v12.1d - v15.1d}, [x6]
+
+        wrap_line   yuvkern, 0
+
+        ld1         {v8.1d - v11.1d}, [sp], #32
+        ld1         {v12.1d - v15.1d}, [sp], #32
+        ret
+END(rsdIntrinsicYuv2_K)
+
+/* void rsdIntrinsicYuv_K(
+ *          void *out,          // x0
+ *          void const *yin,    // x1
+ *          void const *uvin,   // x2
+ *          size_t xstart,      // x3
+ *          size_t xend);       // x4
+ */
+ENTRY(rsdIntrinsicYuv_K)
+        bic         x5, x4, #1
+        add         x0, x0, x5, LSL #2
+        add         x1, x1, x5
+        add         x3, x2, x5
+        sub         x2, x4, x5
+
+        sub         x5, sp, #32
+        sub         sp, sp, #64
+        st1         {v8.1d - v11.1d}, [sp]
+        st1         {v12.1d - v15.1d}, [x5]
+
+        wrap_line   yuvkern, 1, 1
+
+        ld1         {v8.1d - v11.1d}, [sp], #32
+        ld1         {v12.1d - v15.1d}, [sp], #32
+        ret
+END(rsdIntrinsicYuv_K)
+
+/* void rsdIntrinsicYuvR_K(
+ *          void *out,          // x0
+ *          void const *yin,    // x1
+ *          void const *uvin,   // x2
+ *          size_t xstart,      // x3
+ *          size_t xend);       // x4
+ */
+ENTRY(rsdIntrinsicYuvR_K)
+        bic         x5, x4, #1
+        add         x0, x0, x5, LSL #2
+        add         x1, x1, x5
+        add         x3, x2, x5
+        sub         x2, x4, x5
+
+        sub         x5, sp, #32
+        sub         sp, sp, #64
+        st1         {v8.1d - v11.1d}, [sp]
+        st1         {v12.1d - v15.1d}, [x5]
+
+        wrap_line   yuvkern, 1
+
+        ld1         {v8.1d - v11.1d}, [sp], #32
+        ld1         {v12.1d - v15.1d}, [sp], #32
+        ret
+END(rsdIntrinsicYuvR_K)
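
The tail handling in the wrap_line macro above (the tbz ladders) is the familiar trick of consuming a remainder smaller than the vector width in power-of-two chunks selected by the bits of the remaining count, so no load or store runs past the end of a buffer. A minimal sketch of the same idea in C, with invented names and plain byte copies standing in for the partial vector loads and stores:

```c
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Hypothetical illustration only: copy a tail of n < 16 bytes in chunks of
 * 8, 4, 2 and 1, chosen by the bits of n, mirroring the tbz tests on x2. */
static void copy_tail(uint8_t *dst, const uint8_t *src, size_t n /* < 16 */)
{
    if (n & 8) { memcpy(dst, src, 8); dst += 8; src += 8; }
    if (n & 4) { memcpy(dst, src, 4); dst += 4; src += 4; }
    if (n & 2) { memcpy(dst, src, 2); dst += 2; src += 2; }
    if (n & 1) { *dst = *src; }
}
```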
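
The entry points advance out, yin and the chroma pointers by xstart themselves (the add instructions after each ENTRY), so a caller appears to pass whole-plane or whole-row base pointers plus an [xstart, xend) span. The following hypothetical caller for the semi-planar variant assumes that interpretation and invented stride parameters; the real RenderScript CPU driver wires these kernels up from its own launch parameters.

```c
#include <stddef.h>
#include <stdint.h>

/* Prototype as documented in the comment block above ENTRY(rsdIntrinsicYuv_K). */
extern void rsdIntrinsicYuv_K(void *out, void const *yin, void const *uvin,
                              size_t xstart, size_t xend);

/* Hypothetical frame loop: semi-planar YUV (interleaved U/V bytes, one pair
 * per 2x2 block) converted to 32-bit RGBA, one row per call. */
static void convert_frame(uint32_t *rgba, const uint8_t *y_plane,
                          const uint8_t *uv_plane, size_t width, size_t height,
                          size_t y_stride, size_t uv_stride)
{
    for (size_t row = 0; row < height; row++) {
        /* The chroma plane is vertically subsampled: one UV row per two Y rows. */
        rsdIntrinsicYuv_K(rgba + row * width,
                          y_plane + row * y_stride,
                          uv_plane + (row / 2) * uv_stride,
                          0, width);
    }
}
```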