author     Simon Hosie <simon.hosie@arm.com>  2014-02-01 01:35:11 -0800
committer  Simon Hosie <simon.hosie@arm.com>  2014-03-04 10:15:10 -0800
commit     ccd7a46d0c0052209bf3ab8657f40622065d1d1f (patch)
tree       d00b94dfa2ec17dc6bd1bcf7a6cef04918c939ac /cpu_ref/rsCpuIntrinsics_advsimd_YuvToRGB.S
parent     5d06919bc8019322180ea34768a7a4137fa64d11 (diff)
Optimise YuvToRGB using 16-bit arithmetic.

Reimplement YuvToRGB intrinsic using 16-bit SIMD arithmetic to increase
throughput. Implementations in AArch32 and AArch64 NEON.

Change-Id: Idd43e383f5147c33b0b546fa822c970de432c19d
Diffstat (limited to 'cpu_ref/rsCpuIntrinsics_advsimd_YuvToRGB.S')
-rw-r--r--  cpu_ref/rsCpuIntrinsics_advsimd_YuvToRGB.S  292
1 file changed, 292 insertions(+), 0 deletions(-)
diff --git a/cpu_ref/rsCpuIntrinsics_advsimd_YuvToRGB.S b/cpu_ref/rsCpuIntrinsics_advsimd_YuvToRGB.S
new file mode 100644
index 00000000..9232a796
--- /dev/null
+++ b/cpu_ref/rsCpuIntrinsics_advsimd_YuvToRGB.S
@@ -0,0 +1,292 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
+#define END(f) .size f, .-f;
+
+/* Perform the actual YuvToRGB conversion in a macro, from register to
+ * register. This macro will be called from within several different wrapper
+ * variants for different data layouts. Y data starts with the even and odd
+ * bytes split into the low parts of v8 and v9 respectively. U and V are in
+ * v16 and v17. Working constants are pre-loaded into v13-v15, and v3 is
+ * pre-loaded with a constant 0xff alpha channel.
+ *
+ * The complicated arithmetic is the result of refactoring the original
+ * equations to avoid 16-bit overflow without losing any precision.
+ */
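+
+/* For reference, the constants below are a refactoring of the conventional
+ * 8.8 fixed-point conversion.  Writing y' = y - 16, u' = u - 128 and
+ * v' = v - 128 (illustrative symbols only), the three channels work out as:
+ *
+ *   R = (298 * y' + 409 * v' + 128) >> 8
+ *     = (((149 * y + (v >> 1) + 204 * v) >> 1) - bias_r + 32) >> 6
+ *   G = (298 * y' - 100 * u' - 208 * v' + 128) >> 8
+ *     =  ((149 * y + bias_g) - (50 * u + 104 * v) + 64) >> 7
+ *   B = (298 * y' + 516 * u' + 128) >> 8
+ *     = (((149 * y + (u << 2) + 254 * u) >> 1) - bias_b + 32) >> 6
+ *
+ * where bias_r, bias_g and bias_b name the constants loaded into v13, v14 and
+ * v15 by wrap_line.  Halving the coefficients keeps every multiplier within
+ * eight bits (149, 204, 50, 104, 254); the (v >> 1) term recovers the odd
+ * half of 409, the (u << 2) term covers the part of 258 (= 516 / 2) that will
+ * not fit an 8-bit multiplicand, and the halving adds (uhadd) plus the final
+ * rounding shifts of 6 or 7 supply the remaining factor of two, the +128
+ * rounding term and, together with the saturating ops, the clamp to 0..255.
+ */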
+.macro yuvkern
+ movi v7.8b, #149
+
+ umull v1.8h, v8.8b, v7.8b // g0 = y0 * 149
+ umull v5.8h, v9.8b, v7.8b // g1 = y1 * 149
+
+ movi v7.8b, #50
+ movi v10.8b, #104
+ umull v8.8h, v16.8b, v7.8b // g2 = u * 50 + v * 104
+ umlal v8.8h, v17.8b, v10.8b
+
+ ushr v7.8b, v17.8b, #1
+ uaddw v0.8h, v1.8h, v7.8b // r0 = y0 * 149 + (v >> 1)
+ uaddw v4.8h, v5.8h, v7.8b // r1 = y1 * 149 + (v >> 1)
+
+ ushll v7.8h, v16.8b, #2
+ add v2.8h, v1.8h, v7.8h // b0 = y0 * 149 + (u << 2)
+ add v6.8h, v5.8h, v7.8h // b1 = y1 * 149 + (u << 2)
+
+ movi v7.16b, #204
+ movi v10.8b, #254
+ umull v11.8h, v17.8b, v7.8b // r2 = v * 204
+ umull v12.8h, v16.8b, v10.8b // b2 = u * 254
+
+ uhadd v0.8h, v0.8h, v11.8h // r0 = (r0 + r2) >> 1
+ uhadd v4.8h, v4.8h, v11.8h // r1 = (r1 + r2) >> 1
+ uqadd v1.8h, v1.8h, v14.8h // g0 = satu16(g0 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
+ uqadd v5.8h, v5.8h, v14.8h // g1 = satu16(g1 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
+ uhadd v2.8h, v2.8h, v12.8h // b0 = (b0 + b2) >> 1
+ uhadd v6.8h, v6.8h, v12.8h // b1 = (b1 + b2) >> 1
+
+ uqsub v0.8h, v0.8h, v13.8h // r0 = satu16(r0 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
+ uqsub v4.8h, v4.8h, v13.8h // r1 = satu16(r1 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
+ uqsub v1.8h, v1.8h, v8.8h // g0 = satu16(g0 - g2)
+ uqsub v5.8h, v5.8h, v8.8h // g1 = satu16(g1 - g2)
+ uqsub v2.8h, v2.8h, v15.8h // b0 = satu16(b0 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
+ uqsub v6.8h, v6.8h, v15.8h // b1 = satu16(b1 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
+
+ uqrshrn v0.8b, v0.8h, #6
+ uqrshrn v4.8b, v4.8h, #6
+ uqrshrn v1.8b, v1.8h, #7
+ uqrshrn v5.8b, v5.8h, #7
+ uqrshrn v2.8b, v2.8h, #6
+ uqrshrn v6.8b, v6.8h, #6
+
+ zip1 v0.16b, v0.16b, v4.16b
+ zip1 v1.16b, v1.16b, v5.16b
+ zip1 v2.16b, v2.16b, v6.16b
+.endm
+
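+/* Note on register usage: yuvkern reads v9, v16 and v17 unmodified, overwrites
+ * its v8 input, and clobbers v4-v7 and v10-v12 as scratch.  Its results are
+ * left in v0 (red), v1 (green) and v2 (blue), one byte per pixel in order;
+ * v3 and the constants in v13-v15 pass through untouched.
+ */
+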
+/* Define the wrapper code which will load and store the data, iterate the
+ * correct number of times, and safely handle the remainder at the end of the
+ * loop. Some sections of code are switched out depending on the data packing
+ * being handled.
+ */
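+
+/* On entry to the wrapped loop: x0 is the RGBA output pointer, x1 the Y
+ * pointer, x3 the U pointer (or the interleaved chroma pointer), x4 the V
+ * pointer in the planar case, and x2 the number of pixels to convert.  w5 is
+ * used as scratch while the bias constants are materialised.
+ */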
+.macro wrap_line kernel, interleaved=0, swapuv=0
+
+ mov w5, #((16 * 149 + (128 >> 1) + 128 * 204) >> 1)
+ dup v13.8h, w5
+ mov w5, #((-16 * 149 + 128 * 50 + 128 * 104) >> 0)
+ dup v14.8h, w5
+ mov w5, #((16 * 149 + (128 << 2) + 128 * 254) >> 1)
+ dup v15.8h, w5
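+ // v13 = 14280, v14 = 17328 and v15 = 17704: each bias fits a single MOV
+ // immediate and stays within the unsigned 16-bit range that the saturating
+ // 16-bit arithmetic in the kernel expects.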
+
+ movi v3.16b, #0xff
+
+ subs x2, x2, #16
+ bhs 1f
+ b 2f
+
+ .align 4
+1: ld2 {v8.8b,v9.8b}, [x1], #16
+// prfm PLDL1STRM, [x1, #256]
+ .if \interleaved
+ .if \swapuv
+ ld2 {v17.8b,v18.8b}, [x3], #16
+ mov v16.8b, v18.8b
+ .else
+ ld2 {v16.8b,v17.8b}, [x3], #16
+ .endif
+// prfm PLDL1STRM, [x3, #256]
+ .else
+ ld1 {v16.8b}, [x3], #8
+ ld1 {v17.8b}, [x4], #8
+// prfm PLDL1STRM, [x3, #128]
+// prfm PLDL1STRM, [x4, #128]
+ .endif
+
+ \kernel
+
+ subs x2, x2, #16
+
+ st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [x0], #64
+
+ bhs 1b
+
+2: adds x2, x2, #16
+ beq 2f
+
+ /* To handle the tail portion of the data (something less than 16
+ * bytes) load small power-of-two chunks into working registers. It
+ * doesn't matter where they end up in the register; the same process
+ * will store them back out using the same positions and the
+ * interaction between neighbouring pixels is constrained to odd
+ * boundaries where the load operations don't interfere.
+ */
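+ /* For example, a 13-pixel tail (8 + 4 + 1) puts its first eight Y bytes in
+ * v9, the next four in bytes 4-7 of v8 and the last one in byte 1 of v8;
+ * the store sequence below writes those same lanes out in the same order,
+ * so the converted pixels land contiguously in the output row.
+ */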
+ movi v8.8b, #0
+ movi v9.8b, #0
+ movi v16.8b, #0
+ movi v17.8b, #0
+
+ tbz x2, #3, 1f
+ ld1 {v9.8b}, [x1], #8
+ .if \interleaved
+ ld1 {v17.8b}, [x3], #8
+ .else
+ ld1 {v16.s}[1], [x3], #4
+ ld1 {v17.s}[1], [x4], #4
+ .endif
+1: tbz x2, #2, 1f
+ ld1 {v8.s}[1], [x1], #4
+ .if \interleaved
+ ld1 {v16.s}[1], [x3], #4
+ .else
+ ld1 {v16.h}[1], [x3], #2
+ ld1 {v17.h}[1], [x4], #2
+ .endif
+1: tbz x2, #1, 1f
+ ld1 {v8.h}[1], [x1], #2
+ .if \interleaved
+ ld1 {v16.h}[1], [x3], #2
+ .else
+ ld1 {v16.b}[1], [x3], #1
+ ld1 {v17.b}[1], [x4], #1
+ .endif
+1: tbz x2, #0, 1f
+ ld1 {v8.b}[1], [x1], #1
+ .if \interleaved
+ ld1 {v16.b}[1], [x3], #1
+ .else
+ ld1 {v16.b}[0], [x3], #1
+ ld1 {v17.b}[0], [x4], #1
+ .endif
+
+ /* One small impediment in the process above is that some of the load
+ * operations can't perform byte-wise structure deinterleaving at the
+ * same time as loading only part of a register. So the data is loaded
+ * linearly and unpacked manually at this point if necessary.
+ */
+1: mov v8.d[1], v9.d[0] // gather the Y bytes into one register, still linear
+ uzp2 v9.16b, v8.16b, v8.16b // v9 = odd-indexed Y bytes
+ uzp1 v8.16b, v8.16b, v8.16b // v8 = even-indexed Y bytes
+ .if \interleaved
+ mov v16.d[1], v17.d[0] // gather the chroma bytes, still interleaved
+ .if \swapuv
+ uzp1 v17.16b, v16.16b, v16.16b // v17 = first byte of each pair (V)
+ uzp2 v16.16b, v16.16b, v16.16b // v16 = second byte of each pair (U)
+ .else
+ uzp2 v17.16b, v16.16b, v16.16b // v17 = second byte of each pair (V)
+ uzp1 v16.16b, v16.16b, v16.16b // v16 = first byte of each pair (U)
+ .endif
+ .endif
+
+ \kernel
+
+ /* As above but with the output; structured stores for partial vectors
+ * aren't available, so the data is re-packed first and stored linearly.
+ */
+ zip1 v4.16b, v0.16b, v2.16b
+ zip2 v6.16b, v0.16b, v2.16b
+ zip1 v5.16b, v1.16b, v3.16b
+ zip2 v7.16b, v1.16b, v3.16b
+ zip1 v0.16b, v4.16b, v5.16b
+ zip2 v1.16b, v4.16b, v5.16b
+ zip1 v2.16b, v6.16b, v7.16b
+ zip2 v3.16b, v6.16b, v7.16b
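+ // v0-v3 now hold interleaved RGBA for pixels 0-3, 4-7, 8-11 and 12-15
+ // respectively, mirroring the lane positions used by the partial loads,
+ // so the partial stores below can walk the same bit tests.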
+
+1: tbz x2, #3, 1f
+ st1 {v2.16b,v3.16b}, [x0], #32
+1: tbz x2, #2, 1f
+ st1 {v1.16b}, [x0], #16
+1: tbz x2, #1, 1f
+ st1 {v0.d}[1], [x0], #8
+1: tbz x2, #0, 2f
+ st1 {v0.s}[1], [x0], #4
+2:
+.endm
+
+
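+/* Each entry point below converts the span [xstart, xend) of one row.  They
+ * differ only in how the chroma is presented: rsdIntrinsicYuv2_K takes
+ * separate U and V planes, while rsdIntrinsicYuv_K and rsdIntrinsicYuvR_K
+ * take a single interleaved plane with the V or the U byte first in each
+ * pair, respectively (NV21- and NV12-style ordering).
+ */
+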
+/* void rsdIntrinsicYuv2_K(
+ * void *out, // x0
+ * void const *yin, // x1
+ * void const *uin, // x2
+ * void const *vin, // x3
+ * size_t xstart, // x4
+ * size_t xend); // x5
+ */
+ENTRY(rsdIntrinsicYuv2_K)
+ lsr x6, x4, #1 // x6 = xstart / 2 (chroma column)
+ add x0, x0, x4, LSL #2 // out += 4 * xstart
+ add x1, x1, x4 // yin += xstart
+ add x4, x3, x6 // x4 = vin + xstart / 2
+ add x3, x2, x6 // x3 = uin + xstart / 2
+ sub x2, x5, x6, LSL #1 // x2 = xend - (xstart & ~1) pixels to convert
+
+ sub x6, sp, #32
+ sub sp, sp, #64
+ st1 {v8.1d - v11.1d}, [sp]
+ st1 {v12.1d - v15.1d}, [x6]
+
+ wrap_line yuvkern, 0
+
+ ld1 {v8.1d - v11.1d}, [sp], #32
+ ld1 {v12.1d - v15.1d}, [sp], #32
+ ret
+END(rsdIntrinsicYuv2_K)
+
+/* void rsdIntrinsicYuv_K(
+ * void *out, // x0
+ * void const *yin, // x1
+ * void const *uvin, // x2
+ * size_t xstart, // x3
+ * size_t xend); // x4
+ */
+ENTRY(rsdIntrinsicYuv_K)
+ bic x5, x3, #1 // x5 = xstart rounded down to even
+ add x0, x0, x5, LSL #2 // out += 4 * xstart
+ add x1, x1, x5 // yin += xstart
+ add x3, x2, x5 // x3 = uvin + xstart (two chroma bytes per pixel pair)
+ sub x2, x4, x5 // x2 = xend - xstart pixels to convert
+
+ sub x5, sp, #32
+ sub sp, sp, #64
+ st1 {v8.1d - v11.1d}, [sp]
+ st1 {v12.1d - v15.1d}, [x5]
+
+ wrap_line yuvkern, 1, 1
+
+ ld1 {v8.1d - v11.1d}, [sp], #32
+ ld1 {v12.1d - v15.1d}, [sp], #32
+ ret
+END(rsdIntrinsicYuv_K)
+
+/* void rsdIntrinsicYuvR_K(
+ * void *out, // x0
+ * void const *yin, // x1
+ * void const *uvin, // x2
+ * size_t xstart, // x3
+ * size_t xend); // x4
+ */
+ENTRY(rsdIntrinsicYuvR_K)
+ bic x5, x3, #1 // x5 = xstart rounded down to even
+ add x0, x0, x5, LSL #2 // out += 4 * xstart
+ add x1, x1, x5 // yin += xstart
+ add x3, x2, x5 // x3 = uvin + xstart (two chroma bytes per pixel pair)
+ sub x2, x4, x5 // x2 = xend - xstart pixels to convert
+
+ sub x5, sp, #32
+ sub sp, sp, #64
+ st1 {v8.1d - v11.1d}, [sp]
+ st1 {v12.1d - v15.1d}, [x5]
+
+ wrap_line yuvkern, 1
+
+ ld1 {v8.1d - v11.1d}, [sp], #32
+ ld1 {v12.1d - v15.1d}, [sp], #32
+ ret
+END(rsdIntrinsicYuvR_K)