Diffstat (limited to 'renderscript-toolkit/src/main/cpp/Blend_advsimd.S')
-rw-r--r--  renderscript-toolkit/src/main/cpp/Blend_advsimd.S  622
1 file changed, 622 insertions, 0 deletions
diff --git a/renderscript-toolkit/src/main/cpp/Blend_advsimd.S b/renderscript-toolkit/src/main/cpp/Blend_advsimd.S
new file mode 100644
index 0000000..e5cb29b
--- /dev/null
+++ b/renderscript-toolkit/src/main/cpp/Blend_advsimd.S
@@ -0,0 +1,622 @@
+/*
+ * Copyright (C) 2013-2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
+#define END(f) .size f, .-f;
+
+#define BLEND_LIST(X) \
+ X(0, CLEAR) \
+ X(1, SRC) \
+ X(2, DST) \
+ X(3, SRC_OVER) \
+ X(4, DST_OVER) \
+ X(5, SRC_IN) \
+ X(6, DST_IN) \
+ X(7, SRC_OUT) \
+ X(8, DST_OUT) \
+ X(9, SRC_ATOP) \
+ X(10, DST_ATOP) \
+ X(11, XOR) \
+ X(12, MULTIPLY) \
+ X(13, ADD) \
+ X(14, SUBTRACT)
+
+/* This operation was not enabled in the original RenderScript. A
+ * blend_kernel_DIFFERENCE macro is defined below, so it could be enabled by
+ * adding the following entry to the list above:
+ *
+ * X(15, DIFFERENCE) \
+ */
+
+/* For every supported blend operation, define a macro containing just the
+ * arithmetic component. Loading, storing and loop control are handled by the
+ * common wrap_line wrapper defined further down.
+ *
+ * At entry v0-v3 contain the RGBA data from the destination buffer, and v8-v11
+ * contain the data from the source buffer. Both have already been split out
+ * into one colour component per register (if necessary). v3 and v11 contain
+ * the alpha components.
+ *
+ * At the same time as defining the assembly macro, define a corresponding
+ * preprocessor macro indicating any other requirements.
+ * zipped=0 -- The macro does not require the RGBA components to be
+ * separated.
+ * lddst=0 -- The macro does not require data from the destination buffer.
+ * ldsrc=0 -- The macro does not require data from the source buffer.
+ * nowrap=1 -- The macro requires no wrapper at all, and should simply be
+ * inserted without any surrounding load/store or loop code.
+ */
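+
+/* Throughout the kernels below, the 16-bit product of two bytes is scaled back
+ * to a byte using a rounding approximation of division by 255:
+ *     t      = x * a                    (umull/umull2)
+ *     t     += (t + 128) >> 8           (rshrn, uaddw)
+ *     result = (t + 128) >> 8           (rshrn)
+ * The ATOP kernels use the equivalent saturating sequence urshr/uqadd/uqrshrn
+ * on the sum of two such products.
+ */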
+
+#define params_CLEAR zipped=0, lddst=0, ldsrc=0
+.macro blend_kernel_CLEAR
+ movi v0.16b, #0
+ movi v1.16b, #0
+ movi v2.16b, #0
+ movi v3.16b, #0
+.endm
+
+#define params_SRC zipped=0, lddst=0
+.macro blend_kernel_SRC
+ mov v0.16b, v8.16b
+ mov v1.16b, v9.16b
+ mov v2.16b, v10.16b
+ mov v3.16b, v11.16b
+.endm
+
+#define params_DST nowrap=1
+.macro blend_kernel_DST
+ /* nop */
+.endm
+
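+/* SRC_OVER: result = src + dst * (1 - src.a)
+ * v7 holds 255 - src.a; each dst channel (alpha included) is scaled by it and
+ * the corresponding src channel is then added with unsigned saturation.
+ */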
+#define params_SRC_OVER zipped=1
+.macro blend_kernel_SRC_OVER
+ mvn v7.16b, v11.16b
+
+ umull2 v12.8h, v7.16b, v0.16b
+ umull v0.8h, v7.8b, v0.8b
+ umull2 v13.8h, v7.16b, v1.16b
+ umull v1.8h, v7.8b, v1.8b
+ umull2 v14.8h, v7.16b, v2.16b
+ umull v2.8h, v7.8b, v2.8b
+ umull2 v15.8h, v7.16b, v3.16b
+ umull v3.8h, v7.8b, v3.8b
+
+ rshrn v4.8b, v0.8h, #8
+ rshrn2 v4.16b, v12.8h, #8
+ rshrn v5.8b, v1.8h, #8
+ rshrn2 v5.16b, v13.8h, #8
+ rshrn v6.8b, v2.8h, #8
+ rshrn2 v6.16b, v14.8h, #8
+ rshrn v7.8b, v3.8h, #8
+ rshrn2 v7.16b, v15.8h, #8
+
+ uaddw v0.8h, v0.8h, v4.8b
+ uaddw2 v12.8h, v12.8h, v4.16b
+ uaddw v1.8h, v1.8h, v5.8b
+ uaddw2 v13.8h, v13.8h, v5.16b
+ uaddw v2.8h, v2.8h, v6.8b
+ uaddw2 v14.8h, v14.8h, v6.16b
+ uaddw v3.8h, v3.8h, v7.8b
+ uaddw2 v15.8h, v15.8h, v7.16b
+
+ rshrn v0.8b, v0.8h, #8
+ rshrn2 v0.16b, v12.8h, #8
+ rshrn v1.8b, v1.8h, #8
+ rshrn2 v1.16b, v13.8h, #8
+ rshrn v2.8b, v2.8h, #8
+ rshrn2 v2.16b, v14.8h, #8
+ rshrn v3.8b, v3.8h, #8
+ rshrn2 v3.16b, v15.8h, #8
+
+ uqadd v0.16b, v0.16b, v8.16b
+ uqadd v1.16b, v1.16b, v9.16b
+ uqadd v2.16b, v2.16b, v10.16b
+ uqadd v3.16b, v3.16b, v11.16b
+.endm
+
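+/* DST_OVER: result = dst + src * (1 - dst.a)
+ * The mirror image of SRC_OVER: v7 holds 255 - dst.a, the src channels are
+ * scaled by it and accumulated onto dst with unsigned saturation.
+ */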
+#define params_DST_OVER zipped=1
+.macro blend_kernel_DST_OVER
+ mvn v7.16b, v3.16b
+
+ umull2 v12.8h, v7.16b, v8.16b
+ umull v8.8h, v7.8b, v8.8b
+ umull2 v13.8h, v7.16b, v9.16b
+ umull v9.8h, v7.8b, v9.8b
+ umull2 v14.8h, v7.16b, v10.16b
+ umull v10.8h, v7.8b, v10.8b
+ umull2 v15.8h, v7.16b, v11.16b
+ umull v11.8h, v7.8b, v11.8b
+
+ rshrn v4.8b, v8.8h, #8
+ rshrn2 v4.16b, v12.8h, #8
+ rshrn v5.8b, v9.8h, #8
+ rshrn2 v5.16b, v13.8h, #8
+ rshrn v6.8b, v10.8h, #8
+ rshrn2 v6.16b, v14.8h, #8
+ rshrn v7.8b, v11.8h, #8
+ rshrn2 v7.16b, v15.8h, #8
+
+ uaddw v8.8h, v8.8h, v4.8b
+ uaddw2 v12.8h, v12.8h, v4.16b
+ uaddw v9.8h, v9.8h, v5.8b
+ uaddw2 v13.8h, v13.8h, v5.16b
+ uaddw v10.8h, v10.8h, v6.8b
+ uaddw2 v14.8h, v14.8h, v6.16b
+ uaddw v11.8h, v11.8h, v7.8b
+ uaddw2 v15.8h, v15.8h, v7.16b
+
+ rshrn v8.8b, v8.8h, #8
+ rshrn2 v8.16b, v12.8h, #8
+ rshrn v9.8b, v9.8h, #8
+ rshrn2 v9.16b, v13.8h, #8
+ rshrn v10.8b, v10.8h, #8
+ rshrn2 v10.16b, v14.8h, #8
+ rshrn v11.8b, v11.8h, #8
+ rshrn2 v11.16b, v15.8h, #8
+
+ uqadd v0.16b, v0.16b, v8.16b
+ uqadd v1.16b, v1.16b, v9.16b
+ uqadd v2.16b, v2.16b, v10.16b
+ uqadd v3.16b, v3.16b, v11.16b
+.endm
+
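+/* SRC_IN: result = src * dst.a
+ * Every src channel (v8-v11, alpha included) is multiplied by the destination
+ * alpha in v3 and rescaled with the rounding /255 sequence into v0-v3.
+ */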
+#define params_SRC_IN zipped=1
+.macro blend_kernel_SRC_IN
+ umull2 v12.8h, v3.16b, v8.16b
+ umull v0.8h, v3.8b, v8.8b
+ umull2 v13.8h, v3.16b, v9.16b
+ umull v1.8h, v3.8b, v9.8b
+ umull2 v14.8h, v3.16b, v10.16b
+ umull v2.8h, v3.8b, v10.8b
+ umull2 v15.8h, v3.16b, v11.16b
+ umull v3.8h, v3.8b, v11.8b
+
+ rshrn v4.8b, v0.8h, #8
+ rshrn2 v4.16b, v12.8h, #8
+ rshrn v5.8b, v1.8h, #8
+ rshrn2 v5.16b, v13.8h, #8
+ rshrn v6.8b, v2.8h, #8
+ rshrn2 v6.16b, v14.8h, #8
+ rshrn v7.8b, v3.8h, #8
+ rshrn2 v7.16b, v15.8h, #8
+
+ uaddw v0.8h, v0.8h, v4.8b
+ uaddw2 v12.8h, v12.8h, v4.16b
+ uaddw v1.8h, v1.8h, v5.8b
+ uaddw2 v13.8h, v13.8h, v5.16b
+ uaddw v2.8h, v2.8h, v6.8b
+ uaddw2 v14.8h, v14.8h, v6.16b
+ uaddw v3.8h, v3.8h, v7.8b
+ uaddw2 v15.8h, v15.8h, v7.16b
+
+ rshrn v0.8b, v0.8h, #8
+ rshrn2 v0.16b, v12.8h, #8
+ rshrn v1.8b, v1.8h, #8
+ rshrn2 v1.16b, v13.8h, #8
+ rshrn v2.8b, v2.8h, #8
+ rshrn2 v2.16b, v14.8h, #8
+ rshrn v3.8b, v3.8h, #8
+ rshrn2 v3.16b, v15.8h, #8
+.endm
+
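+/* DST_IN: result = dst * src.a
+ * Every dst channel (v0-v3, alpha included) is multiplied by the source alpha
+ * in v11.
+ */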
+#define params_DST_IN zipped=1
+.macro blend_kernel_DST_IN
+ umull2 v12.8h, v0.16b, v11.16b
+ umull v0.8h, v0.8b, v11.8b
+ umull2 v13.8h, v1.16b, v11.16b
+ umull v1.8h, v1.8b, v11.8b
+ umull2 v14.8h, v2.16b, v11.16b
+ umull v2.8h, v2.8b, v11.8b
+ umull2 v15.8h, v3.16b, v11.16b
+ umull v3.8h, v3.8b, v11.8b
+
+ rshrn v4.8b, v0.8h, #8
+ rshrn2 v4.16b, v12.8h, #8
+ rshrn v5.8b, v1.8h, #8
+ rshrn2 v5.16b, v13.8h, #8
+ rshrn v6.8b, v2.8h, #8
+ rshrn2 v6.16b, v14.8h, #8
+ rshrn v7.8b, v3.8h, #8
+ rshrn2 v7.16b, v15.8h, #8
+
+ uaddw v0.8h, v0.8h, v4.8b
+ uaddw2 v12.8h, v12.8h, v4.16b
+ uaddw v1.8h, v1.8h, v5.8b
+ uaddw2 v13.8h, v13.8h, v5.16b
+ uaddw v2.8h, v2.8h, v6.8b
+ uaddw2 v14.8h, v14.8h, v6.16b
+ uaddw v3.8h, v3.8h, v7.8b
+ uaddw2 v15.8h, v15.8h, v7.16b
+
+ rshrn v0.8b, v0.8h, #8
+ rshrn2 v0.16b, v12.8h, #8
+ rshrn v1.8b, v1.8h, #8
+ rshrn2 v1.16b, v13.8h, #8
+ rshrn v2.8b, v2.8h, #8
+ rshrn2 v2.16b, v14.8h, #8
+ rshrn v3.8b, v3.8h, #8
+ rshrn2 v3.16b, v15.8h, #8
+.endm
+
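+/* SRC_OUT: result = src * (1 - dst.a)
+ * Implemented by inverting the destination alpha (v3) and reusing SRC_IN.
+ */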
+#define params_SRC_OUT zipped=1
+.macro blend_kernel_SRC_OUT
+ mvn v3.16b, v3.16b
+ blend_kernel_SRC_IN
+.endm
+
+
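+/* DST_OUT: result = dst * (1 - src.a)
+ * Implemented by inverting the source alpha (v11) and reusing DST_IN.
+ */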
+#define params_DST_OUT zipped=1
+.macro blend_kernel_DST_OUT
+ mvn v11.16b, v11.16b
+ blend_kernel_DST_IN
+.endm
+
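+/* SRC_ATOP: result.rgb = src.rgb * dst.a + dst.rgb * (1 - src.a)
+ *           result.a   = dst.a
+ * The two 16-bit products are summed with saturation before the rounding /255
+ * step (urshr/uqadd/uqrshrn); the destination alpha in v3 is left untouched.
+ */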
+#define params_SRC_ATOP zipped=1
+.macro blend_kernel_SRC_ATOP
+ mvn v11.16b, v11.16b
+
+ umull2 v12.8h, v11.16b, v0.16b
+ umull v0.8h, v11.8b, v0.8b
+ umull2 v13.8h, v11.16b, v1.16b
+ umull v1.8h, v11.8b, v1.8b
+ umull2 v14.8h, v11.16b, v2.16b
+ umull v2.8h, v11.8b, v2.8b
+
+ umull2 v4.8h, v3.16b, v8.16b
+ umull v8.8h, v3.8b, v8.8b
+ umull2 v5.8h, v3.16b, v9.16b
+ umull v9.8h, v3.8b, v9.8b
+ umull2 v6.8h, v3.16b, v10.16b
+ umull v10.8h, v3.8b, v10.8b
+
+ uqadd v12.8h, v12.8h, v4.8h
+ uqadd v0.8h, v0.8h, v8.8h
+ uqadd v13.8h, v13.8h, v5.8h
+ uqadd v1.8h, v1.8h, v9.8h
+ uqadd v14.8h, v14.8h, v6.8h
+ uqadd v2.8h, v2.8h, v10.8h
+
+ urshr v8.8h, v0.8h, #8
+ urshr v4.8h, v12.8h, #8
+ urshr v9.8h, v1.8h, #8
+ urshr v5.8h, v13.8h, #8
+ urshr v10.8h, v2.8h, #8
+ urshr v6.8h, v14.8h, #8
+
+ uqadd v0.8h, v0.8h, v8.8h
+ uqadd v12.8h, v12.8h, v4.8h
+ uqadd v1.8h, v1.8h, v9.8h
+ uqadd v13.8h, v13.8h, v5.8h
+ uqadd v2.8h, v2.8h, v10.8h
+ uqadd v14.8h, v14.8h, v6.8h
+
+ uqrshrn v0.8b, v0.8h, #8
+ uqrshrn2 v0.16b, v12.8h, #8
+ uqrshrn v1.8b, v1.8h, #8
+ uqrshrn2 v1.16b, v13.8h, #8
+ uqrshrn v2.8b, v2.8h, #8
+ uqrshrn2 v2.16b, v14.8h, #8
+.endm
+
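+/* DST_ATOP: result.rgb = dst.rgb * src.a + src.rgb * (1 - dst.a)
+ *           result.a   = src.a
+ * Same structure as SRC_ATOP with the roles of the two alphas swapped; the
+ * final mov copies the source alpha into the result.
+ */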
+#define params_DST_ATOP zipped=1
+.macro blend_kernel_DST_ATOP
+ mvn v3.16b, v3.16b
+
+ umull2 v12.8h, v11.16b, v0.16b
+ umull v0.8h, v11.8b, v0.8b
+ umull2 v13.8h, v11.16b, v1.16b
+ umull v1.8h, v11.8b, v1.8b
+ umull2 v14.8h, v11.16b, v2.16b
+ umull v2.8h, v11.8b, v2.8b
+
+ umull2 v4.8h, v3.16b, v8.16b
+ umull v8.8h, v3.8b, v8.8b
+ umull2 v5.8h, v3.16b, v9.16b
+ umull v9.8h, v3.8b, v9.8b
+ umull2 v6.8h, v3.16b, v10.16b
+ umull v10.8h, v3.8b, v10.8b
+
+ uqadd v12.8h, v12.8h, v4.8h
+ uqadd v0.8h, v0.8h, v8.8h
+ uqadd v13.8h, v13.8h, v5.8h
+ uqadd v1.8h, v1.8h, v9.8h
+ uqadd v14.8h, v14.8h, v6.8h
+ uqadd v2.8h, v2.8h, v10.8h
+
+ urshr v8.8h, v0.8h, #8
+ urshr v4.8h, v12.8h, #8
+ urshr v9.8h, v1.8h, #8
+ urshr v5.8h, v13.8h, #8
+ urshr v10.8h, v2.8h, #8
+ urshr v6.8h, v14.8h, #8
+
+ uqadd v0.8h, v0.8h, v8.8h
+ uqadd v12.8h, v12.8h, v4.8h
+ uqadd v1.8h, v1.8h, v9.8h
+ uqadd v13.8h, v13.8h, v5.8h
+ uqadd v2.8h, v2.8h, v10.8h
+ uqadd v14.8h, v14.8h, v6.8h
+
+ uqrshrn v0.8b, v0.8h, #8
+ uqrshrn2 v0.16b, v12.8h, #8
+ uqrshrn v1.8b, v1.8h, #8
+ uqrshrn2 v1.16b, v13.8h, #8
+ uqrshrn v2.8b, v2.8h, #8
+ uqrshrn2 v2.16b, v14.8h, #8
+
+ mov v3.16b, v11.16b
+.endm
+
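+/* MULTIPLY: result = dst * src / 255, applied to every byte (alpha included).
+ * Because the operation is purely per-byte it runs on interleaved data
+ * (zipped=0).
+ */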
+#define params_MULTIPLY zipped=0
+.macro blend_kernel_MULTIPLY
+ umull2 v12.8h, v0.16b, v8.16b
+ umull v0.8h, v0.8b, v8.8b
+ umull2 v13.8h, v1.16b, v9.16b
+ umull v1.8h, v1.8b, v9.8b
+ umull2 v14.8h, v2.16b, v10.16b
+ umull v2.8h, v2.8b, v10.8b
+ umull2 v15.8h, v3.16b, v11.16b
+ umull v3.8h, v3.8b, v11.8b
+
+ rshrn v4.8b, v0.8h, #8
+ rshrn2 v4.16b, v12.8h, #8
+ rshrn v5.8b, v1.8h, #8
+ rshrn2 v5.16b, v13.8h, #8
+ rshrn v6.8b, v2.8h, #8
+ rshrn2 v6.16b, v14.8h, #8
+ rshrn v7.8b, v3.8h, #8
+ rshrn2 v7.16b, v15.8h, #8
+
+ uaddw v0.8h, v0.8h, v4.8b
+ uaddw2 v12.8h, v12.8h, v4.16b
+ uaddw v1.8h, v1.8h, v5.8b
+ uaddw2 v13.8h, v13.8h, v5.16b
+ uaddw v2.8h, v2.8h, v6.8b
+ uaddw2 v14.8h, v14.8h, v6.16b
+ uaddw v3.8h, v3.8h, v7.8b
+ uaddw2 v15.8h, v15.8h, v7.16b
+
+ rshrn v0.8b, v0.8h, #8
+ rshrn2 v0.16b, v12.8h, #8
+ rshrn v1.8b, v1.8h, #8
+ rshrn2 v1.16b, v13.8h, #8
+ rshrn v2.8b, v2.8h, #8
+ rshrn2 v2.16b, v14.8h, #8
+ rshrn v3.8b, v3.8h, #8
+ rshrn2 v3.16b, v15.8h, #8
+.endm
+
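+/* ADD: result = min(dst + src, 255) per byte (unsigned saturating add). */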
+#define params_ADD zipped=0
+.macro blend_kernel_ADD
+ uqadd v0.16b, v0.16b, v8.16b
+ uqadd v1.16b, v1.16b, v9.16b
+ uqadd v2.16b, v2.16b, v10.16b
+ uqadd v3.16b, v3.16b, v11.16b
+.endm
+
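+/* SUBTRACT: result = max(dst - src, 0) per byte (unsigned saturating subtract). */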
+#define params_SUBTRACT zipped=0
+.macro blend_kernel_SUBTRACT
+ uqsub v0.16b, v0.16b, v8.16b
+ uqsub v1.16b, v1.16b, v9.16b
+ uqsub v2.16b, v2.16b, v10.16b
+ uqsub v3.16b, v3.16b, v11.16b
+.endm
+
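+/* DIFFERENCE: result = |dst - src| per byte (not wired into BLEND_LIST above). */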
+#define params_DIFFERENCE zipped=0
+.macro blend_kernel_DIFFERENCE
+ uabd v0.16b, v0.16b, v8.16b
+ uabd v1.16b, v1.16b, v9.16b
+ uabd v2.16b, v2.16b, v10.16b
+ uabd v3.16b, v3.16b, v11.16b
+.endm
+
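+/* XOR: bitwise exclusive-or of the raw bytes, not the Porter-Duff XOR
+ * compositing operator.
+ */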
+#define params_XOR zipped=0
+.macro blend_kernel_XOR
+ eor v0.16b, v0.16b, v8.16b
+ eor v1.16b, v1.16b, v9.16b
+ eor v2.16b, v2.16b, v10.16b
+ eor v3.16b, v3.16b, v11.16b
+.endm
+
+
+/* Define the wrapper code which will load and store the data, iterate the
+ * correct number of times, and safely handle the remainder at the end of the
+ * loop. Various sections of assembly code are dropped or substituted for
+ * simpler operations if they're not needed.
+ */
+.macro wrap_line kernel, nowrap=0, zipped=1, lddst=1, ldsrc=1, pld=1
+.if \nowrap
+ \kernel
+.else
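+    /* The blend kernels may clobber v8-v15. Their low halves (d8-d15) are
+     * callee-saved under AAPCS64, so spill them to the stack here and restore
+     * them on the exit path.
+     */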
+ sub x3, sp, #32
+ sub sp, sp, #64
+ st1 {v8.1d - v11.1d}, [sp]
+ st1 {v12.1d - v15.1d}, [x3]
+ subs x2, x2, #64
+ b 2f
+.align 4
+1:
+ .if \lddst
+ .if \zipped
+ ld4 {v0.16b - v3.16b}, [x0]
+ .else
+ ld1 {v0.16b - v3.16b}, [x0]
+ .endif
+ .endif
+ .if \ldsrc
+ .if \zipped
+ ld4 {v8.16b - v11.16b}, [x1], #64
+ .else
+ ld1 {v8.16b - v11.16b}, [x1], #64
+ .endif
+ .endif
+ .if \pld
+#if 0 /* TODO: test this on real hardware */
+ .if \lddst ; prfm PLDL1STRM, [x0, #192] ; .endif
+ .if \ldsrc ; prfm PLDL1STRM, [x1, #192] ; .endif
+#endif
+ .endif
+
+ \kernel
+
+ subs x2, x2, #64
+ .if \zipped
+ st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [x0], #64
+ .else
+ st1 {v0.16b,v1.16b,v2.16b,v3.16b}, [x0], #64
+ .endif
+
+2: bge 1b
+ adds x2, x2, #64
+ beq 2f
+
+    /* To handle the tail portion of the data (anything less than 64
+     * bytes), load small power-of-two chunks into the working registers. It
+     * doesn't matter where they end up within a register: the same positions
+     * are used when storing them back out, and none of the operations require
+     * neighbouring bytes to interact.
+     */
+ movi v0.16b, #0
+ movi v1.16b, #0
+ movi v2.16b, #0
+ movi v3.16b, #0
+
+ movi v8.16b, #0
+ movi v9.16b, #0
+ movi v10.16b, #0
+ movi v11.16b, #0
+
+ tbz x2, #5, 1f
+ .if \lddst ; ld1 {v2.16b,v3.16b}, [x0], #32 ; .endif
+ .if \ldsrc ; ld1 {v10.16b,v11.16b}, [x1], #32 ; .endif
+1: tbz x2, #4, 1f
+ .if \lddst ; ld1 {v1.16b}, [x0], #16 ; .endif
+ .if \ldsrc ; ld1 {v9.16b}, [x1], #16 ; .endif
+1: tbz x2, #3, 1f
+ .if \lddst ; ld1 {v0.d}[1], [x0], #8 ; .endif
+ .if \ldsrc ; ld1 {v8.d}[1], [x1], #8 ; .endif
+1: tbz x2, #2, 1f
+ .if \lddst ; ld1 {v0.s}[1], [x0], #4 ; .endif
+ .if \ldsrc ; ld1 {v8.s}[1], [x1], #4 ; .endif
+1: tbz x2, #1, 1f
+ .if \lddst ; ld1 {v0.h}[1], [x0], #2 ; .endif
+ .if \ldsrc ; ld1 {v8.h}[1], [x1], #2 ; .endif
+1: tbz x2, #0, 1f
+ .if \lddst ; ld1 {v0.b}[1], [x0], #1 ; .endif
+ .if \ldsrc ; ld1 {v8.b}[1], [x1], #1 ; .endif
+1:
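+    /* Rewind the destination pointer by the tail length so that the partial
+     * stores below write back over the bytes that were just loaded.
+     */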
+ .if \lddst ; sub x0, x0, x2 ; .endif
+
+.if \zipped
+ /* One small impediment in the process above is that some of the load
+ * operations can't perform byte-wise structure deinterleaving at the
+ * same time as loading only part of a register. So the data is loaded
+ * linearly and unpacked manually at this point.
+ */
+ uzp1 v4.16b, v0.16b, v1.16b
+ uzp2 v5.16b, v0.16b, v1.16b
+ uzp1 v6.16b, v2.16b, v3.16b
+ uzp2 v7.16b, v2.16b, v3.16b
+ uzp1 v0.16b, v4.16b, v6.16b
+ uzp2 v2.16b, v4.16b, v6.16b
+ uzp1 v1.16b, v5.16b, v7.16b
+ uzp2 v3.16b, v5.16b, v7.16b
+
+ uzp1 v4.16b, v8.16b, v9.16b
+ uzp2 v5.16b, v8.16b, v9.16b
+ uzp1 v6.16b, v10.16b, v11.16b
+ uzp2 v7.16b, v10.16b, v11.16b
+ uzp1 v8.16b, v4.16b, v6.16b
+ uzp2 v10.16b, v4.16b, v6.16b
+ uzp1 v9.16b, v5.16b, v7.16b
+ uzp2 v11.16b, v5.16b, v7.16b
+
+ \kernel
+
+ zip1 v4.16b, v0.16b, v2.16b
+ zip2 v6.16b, v0.16b, v2.16b
+ zip1 v5.16b, v1.16b, v3.16b
+ zip2 v7.16b, v1.16b, v3.16b
+ zip1 v0.16b, v4.16b, v5.16b
+ zip2 v1.16b, v4.16b, v5.16b
+ zip1 v2.16b, v6.16b, v7.16b
+ zip2 v3.16b, v6.16b, v7.16b
+ .else
+ \kernel
+ .endif
+
+ tbz x2, #5, 1f
+ st1 {v2.16b,v3.16b}, [x0], #32
+1: tbz x2, #4, 1f
+ st1 {v1.16b}, [x0], #16
+1: tbz x2, #3, 1f
+ st1 {v0.d}[1], [x0], #8
+1: tbz x2, #2, 1f
+ st1 {v0.s}[1], [x0], #4
+1: tbz x2, #1, 1f
+ st1 {v0.h}[1], [x0], #2
+1: tbz x2, #0, 2f
+ st1 {v0.b}[1], [x0], #1
+2: ld1 {v8.1d - v11.1d}, [sp], #32
+ ld1 {v12.1d - v15.1d}, [sp], #32
+.endif
+ mov x0, #0
+ ret
+.endm
+
+
+/* Produce the list of blend_line_XX() functions; each function uses the
+ * wrap_line macro, passing it the name of the kernel macro it wants along
+ * with optional parameters to remove unnecessary operations.
+ */
+#define BLEND_X(d, n) ENTRY(blend_line_##n) ; wrap_line blend_kernel_##n, params_##n ; END(blend_line_##n) ;
+ BLEND_LIST(BLEND_X)
+#undef BLEND_X
+
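+/* Derive tablesize: after expanding the list, it is one greater than the
+ * highest enumerated operation index.
+ */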
+#define BLEND_X(d, n) .set tablesize, d+1 ;
+ BLEND_LIST(BLEND_X)
+#undef BLEND_X
+
+/* int rsdIntrinsicBlend_K(
+ * uchar4 *out, // x0
+ * uchar4 const *in, // x1
+ * int slot, // x2
+ * size_t xstart, // x3
+ * size_t xend); // x4
+ */
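+/* Look the requested slot up in blendtable (a table of halfword offsets
+ * relative to the br instruction at label 2 below), advance the buffer
+ * pointers to xstart, convert the x range into a byte count, and jump to the
+ * matching blend_line_* routine. Out-of-range or unimplemented slots return
+ * -1; the line routines return 0 on completion.
+ */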
+ENTRY(rsdIntrinsicBlend_K)
+ adrp x5, blendtable
+ add x5, x5, :lo12:blendtable
+ cmp w2, tablesize
+ bhs 1f
+ ldrsh x6, [x5, w2, uxtw #1]
+ add x0, x0, w3, uxtw #2
+ add x1, x1, w3, uxtw #2
+ sub w2, w4, w3
+ ubfiz x2, x2, #2, #32 /* TODO: fix */
+ cbz x6, 1f
+ adr x5, 2f
+ add x6, x5, x6
+2: br x6
+1: mov x0, #-1
+ ret
+
+END(rsdIntrinsicBlend_K)
+
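+/* blendtable: one signed halfword per operation, holding the offset from the
+ * br at label 2 in rsdIntrinsicBlend_K to the corresponding blend_line_*
+ * entry point. Gaps in the enumeration are zero-filled and treated as
+ * unimplemented by the dispatcher.
+ */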
+.rodata
+.set off,0
+blendtable:
+#define BLEND_X(d, n) .rept d-off ; .hword 0 ; .endr ; .hword blend_line_##n - 2b ; .set off, d+1 ;
+ BLEND_LIST(BLEND_X)
+#undef BLEND_X