Diffstat (limited to 'renderscript-toolkit/src/main/cpp/Blend_advsimd.S')
-rw-r--r--  renderscript-toolkit/src/main/cpp/Blend_advsimd.S  622
1 file changed, 622 insertions, 0 deletions
diff --git a/renderscript-toolkit/src/main/cpp/Blend_advsimd.S b/renderscript-toolkit/src/main/cpp/Blend_advsimd.S
new file mode 100644
index 0000000..e5cb29b
--- /dev/null
+++ b/renderscript-toolkit/src/main/cpp/Blend_advsimd.S
@@ -0,0 +1,622 @@
+/*
+ * Copyright (C) 2013-2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
+#define END(f) .size f, .-f;
+
+#define BLEND_LIST(X) \
+ X(0, CLEAR) \
+ X(1, SRC) \
+ X(2, DST) \
+ X(3, SRC_OVER) \
+ X(4, DST_OVER) \
+ X(5, SRC_IN) \
+ X(6, DST_IN) \
+ X(7, SRC_OUT) \
+ X(8, DST_OUT) \
+ X(9, SRC_ATOP) \
+ X(10, DST_ATOP) \
+ X(11, XOR) \
+ X(12, MULTIPLY) \
+ X(13, ADD) \
+ X(14, SUBTRACT)
+
+/* This operation was not enabled in the original RenderScript. A
+ * blend_kernel_DIFFERENCE macro is defined below, so it could be enabled by
+ * adding the following entry to the list above:
+ *
+ * X(15, DIFFERENCE) \
+ */
+
+/* For every supported blend operation, define a macro containing just the
+ * arithmetic component. Loading, storing and loop control are handled by the
+ * common wrap_line wrapper defined further down.
+ *
+ * At entry v0-v3 contain the RGBA data from the destination buffer, and v8-v11
+ * contain the data from the source buffer. Both have already been split out
+ * into one colour component per register (if necessary). v3 and v11 contain
+ * the alpha components.
+ *
+ * At the same time as defining the assembly macro, define a corresponding
+ * preprocessor macro indicating any other requirements.
+ * zipped=0 -- The macro does not require the RGBA components to be
+ * separated.
+ * lddst=0 -- The macro does not require data from the destination buffer.
+ * ldsrc=0 -- The macro does not require data from the source buffer.
+ * nowrap=1 -- The macro requires no wrapper at all, and should simply be
+ * inserted without any surrounding load/store or loop code.
+ */
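+
+/* Throughout the kernels below, the 16-bit product of two bytes is scaled back
+ * to a byte using a rounding approximation of division by 255:
+ *     t      = x * a                    (umull/umull2)
+ *     t     += (t + 128) >> 8           (rshrn, uaddw)
+ *     result = (t + 128) >> 8           (rshrn)
+ * The ATOP kernels use the equivalent saturating sequence urshr/uqadd/uqrshrn
+ * on the sum of two such products.
+ */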
+
+#define params_CLEAR zipped=0, lddst=0, ldsrc=0
+.macro blend_kernel_CLEAR
+ movi v0.16b, #0
+ movi v1.16b, #0
+ movi v2.16b, #0
+ movi v3.16b, #0
+.endm
+
+#define params_SRC zipped=0, lddst=0
+.macro blend_kernel_SRC
+ mov v0.16b, v8.16b
+ mov v1.16b, v9.16b
+ mov v2.16b, v10.16b
+ mov v3.16b, v11.16b
+.endm
+
+#define params_DST nowrap=1
+.macro blend_kernel_DST
+ /* nop */
+.endm
+
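+/* SRC_OVER: result = src + dst * (1 - src.a)
+ * v7 holds 255 - src.a; each dst channel (alpha included) is scaled by it and
+ * the corresponding src channel is then added with unsigned saturation.
+ */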
+#define params_SRC_OVER zipped=1
+.macro blend_kernel_SRC_OVER
+ mvn v7.16b, v11.16b
+
+ umull2 v12.8h, v7.16b, v0.16b
+ umull v0.8h, v7.8b, v0.8b
+ umull2 v13.8h, v7.16b, v1.16b
+ umull v1.8h, v7.8b, v1.8b
+ umull2 v14.8h, v7.16b, v2.16b
+ umull v2.8h, v7.8b, v2.8b
+ umull2 v15.8h, v7.16b, v3.16b
+ umull v3.8h, v7.8b, v3.8b
+
+ rshrn v4.8b, v0.8h, #8
+ rshrn2 v4.16b, v12.8h, #8
+ rshrn v5.8b, v1.8h, #8
+ rshrn2 v5.16b, v13.8h, #8
+ rshrn v6.8b, v2.8h, #8
+ rshrn2 v6.16b, v14.8h, #8
+ rshrn v7.8b, v3.8h, #8
+ rshrn2 v7.16b, v15.8h, #8
+
+ uaddw v0.8h, v0.8h, v4.8b
+ uaddw2 v12.8h, v12.8h, v4.16b
+ uaddw v1.8h, v1.8h, v5.8b
+ uaddw2 v13.8h, v13.8h, v5.16b
+ uaddw v2.8h, v2.8h, v6.8b
+ uaddw2 v14.8h, v14.8h, v6.16b
+ uaddw v3.8h, v3.8h, v7.8b
+ uaddw2 v15.8h, v15.8h, v7.16b
+
+ rshrn v0.8b, v0.8h, #8
+ rshrn2 v0.16b, v12.8h, #8
+ rshrn v1.8b, v1.8h, #8
+ rshrn2 v1.16b, v13.8h, #8
+ rshrn v2.8b, v2.8h, #8
+ rshrn2 v2.16b, v14.8h, #8
+ rshrn v3.8b, v3.8h, #8
+ rshrn2 v3.16b, v15.8h, #8
+
+ uqadd v0.16b, v0.16b, v8.16b
+ uqadd v1.16b, v1.16b, v9.16b
+ uqadd v2.16b, v2.16b, v10.16b
+ uqadd v3.16b, v3.16b, v11.16b
+.endm
+
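+/* DST_OVER: result = dst + src * (1 - dst.a)
+ * The mirror image of SRC_OVER: v7 holds 255 - dst.a, the src channels are
+ * scaled by it and accumulated onto dst with unsigned saturation.
+ */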
+#define params_DST_OVER zipped=1
+.macro blend_kernel_DST_OVER
+ mvn v7.16b, v3.16b
+
+ umull2 v12.8h, v7.16b, v8.16b
+ umull v8.8h, v7.8b, v8.8b
+ umull2 v13.8h, v7.16b, v9.16b
+ umull v9.8h, v7.8b, v9.8b
+ umull2 v14.8h, v7.16b, v10.16b
+ umull v10.8h, v7.8b, v10.8b
+ umull2 v15.8h, v7.16b, v11.16b
+ umull v11.8h, v7.8b, v11.8b
+
+ rshrn v4.8b, v8.8h, #8
+ rshrn2 v4.16b, v12.8h, #8
+ rshrn v5.8b, v9.8h, #8
+ rshrn2 v5.16b, v13.8h, #8
+ rshrn v6.8b, v10.8h, #8
+ rshrn2 v6.16b, v14.8h, #8
+ rshrn v7.8b, v11.8h, #8
+ rshrn2 v7.16b, v15.8h, #8
+
+ uaddw v8.8h, v8.8h, v4.8b
+ uaddw2 v12.8h, v12.8h, v4.16b
+ uaddw v9.8h, v9.8h, v5.8b
+ uaddw2 v13.8h, v13.8h, v5.16b
+ uaddw v10.8h, v10.8h, v6.8b
+ uaddw2 v14.8h, v14.8h, v6.16b
+ uaddw v11.8h, v11.8h, v7.8b
+ uaddw2 v15.8h, v15.8h, v7.16b
+
+ rshrn v8.8b, v8.8h, #8
+ rshrn2 v8.16b, v12.8h, #8
+ rshrn v9.8b, v9.8h, #8
+ rshrn2 v9.16b, v13.8h, #8
+ rshrn v10.8b, v10.8h, #8
+ rshrn2 v10.16b, v14.8h, #8
+ rshrn v11.8b, v11.8h, #8
+ rshrn2 v11.16b, v15.8h, #8
+
+ uqadd v0.16b, v0.16b, v8.16b
+ uqadd v1.16b, v1.16b, v9.16b
+ uqadd v2.16b, v2.16b, v10.16b
+ uqadd v3.16b, v3.16b, v11.16b
+.endm
+
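+/* SRC_IN: result = src * dst.a
+ * Every src channel (v8-v11, alpha included) is multiplied by the destination
+ * alpha in v3 and rescaled with the rounding /255 sequence into v0-v3.
+ */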
+#define params_SRC_IN zipped=1
+.macro blend_kernel_SRC_IN
+ umull2 v12.8h, v3.16b, v8.16b
+ umull v0.8h, v3.8b, v8.8b
+ umull2 v13.8h, v3.16b, v9.16b
+ umull v1.8h, v3.8b, v9.8b
+ umull2 v14.8h, v3.16b, v10.16b
+ umull v2.8h, v3.8b, v10.8b
+ umull2 v15.8h, v3.16b, v11.16b
+ umull v3.8h, v3.8b, v11.8b
+
+ rshrn v4.8b, v0.8h, #8
+ rshrn2 v4.16b, v12.8h, #8
+ rshrn v5.8b, v1.8h, #8
+ rshrn2 v5.16b, v13.8h, #8
+ rshrn v6.8b, v2.8h, #8
+ rshrn2 v6.16b, v14.8h, #8
+ rshrn v7.8b, v3.8h, #8
+ rshrn2 v7.16b, v15.8h, #8
+
+ uaddw v0.8h, v0.8h, v4.8b
+ uaddw2 v12.8h, v12.8h, v4.16b
+ uaddw v1.8h, v1.8h, v5.8b
+ uaddw2 v13.8h, v13.8h, v5.16b
+ uaddw v2.8h, v2.8h, v6.8b
+ uaddw2 v14.8h, v14.8h, v6.16b
+ uaddw v3.8h, v3.8h, v7.8b
+ uaddw2 v15.8h, v15.8h, v7.16b
+
+ rshrn v0.8b, v0.8h, #8
+ rshrn2 v0.16b, v12.8h, #8
+ rshrn v1.8b, v1.8h, #8
+ rshrn2 v1.16b, v13.8h, #8
+ rshrn v2.8b, v2.8h, #8
+ rshrn2 v2.16b, v14.8h, #8
+ rshrn v3.8b, v3.8h, #8
+ rshrn2 v3.16b, v15.8h, #8
+.endm
+
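+/* DST_IN: result = dst * src.a
+ * Every dst channel (v0-v3, alpha included) is multiplied by the source alpha
+ * in v11.
+ */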
+#define params_DST_IN zipped=1
+.macro blend_kernel_DST_IN
+ umull2 v12.8h, v0.16b, v11.16b
+ umull v0.8h, v0.8b, v11.8b
+ umull2 v13.8h, v1.16b, v11.16b
+ umull v1.8h, v1.8b, v11.8b
+ umull2 v14.8h, v2.16b, v11.16b
+ umull v2.8h, v2.8b, v11.8b
+ umull2 v15.8h, v3.16b, v11.16b
+ umull v3.8h, v3.8b, v11.8b
+
+ rshrn v4.8b, v0.8h, #8
+ rshrn2 v4.16b, v12.8h, #8
+ rshrn v5.8b, v1.8h, #8
+ rshrn2 v5.16b, v13.8h, #8
+ rshrn v6.8b, v2.8h, #8
+ rshrn2 v6.16b, v14.8h, #8
+ rshrn v7.8b, v3.8h, #8
+ rshrn2 v7.16b, v15.8h, #8
+
+ uaddw v0.8h, v0.8h, v4.8b
+ uaddw2 v12.8h, v12.8h, v4.16b
+ uaddw v1.8h, v1.8h, v5.8b
+ uaddw2 v13.8h, v13.8h, v5.16b
+ uaddw v2.8h, v2.8h, v6.8b
+ uaddw2 v14.8h, v14.8h, v6.16b
+ uaddw v3.8h, v3.8h, v7.8b
+ uaddw2 v15.8h, v15.8h, v7.16b
+
+ rshrn v0.8b, v0.8h, #8
+ rshrn2 v0.16b, v12.8h, #8
+ rshrn v1.8b, v1.8h, #8
+ rshrn2 v1.16b, v13.8h, #8
+ rshrn v2.8b, v2.8h, #8
+ rshrn2 v2.16b, v14.8h, #8
+ rshrn v3.8b, v3.8h, #8
+ rshrn2 v3.16b, v15.8h, #8
+.endm
+
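+/* SRC_OUT: result = src * (1 - dst.a)
+ * Implemented by inverting the destination alpha (v3) and reusing SRC_IN.
+ */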
+#define params_SRC_OUT zipped=1
+.macro blend_kernel_SRC_OUT
+ mvn v3.16b, v3.16b
+ blend_kernel_SRC_IN
+.endm
+
+
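+/* DST_OUT: result = dst * (1 - src.a)
+ * Implemented by inverting the source alpha (v11) and reusing DST_IN.
+ */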
+#define params_DST_OUT zipped=1
+.macro blend_kernel_DST_OUT
+ mvn v11.16b, v11.16b
+ blend_kernel_DST_IN
+.endm
+
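+/* SRC_ATOP: result.rgb = src.rgb * dst.a + dst.rgb * (1 - src.a)
+ *           result.a   = dst.a
+ * The two 16-bit products are summed with saturation before the rounding /255
+ * step (urshr/uqadd/uqrshrn); the destination alpha in v3 is left untouched.
+ */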
+#define params_SRC_ATOP zipped=1
+.macro blend_kernel_SRC_ATOP
+ mvn v11.16b, v11.16b
+
+ umull2 v12.8h, v11.16b, v0.16b
+ umull v0.8h, v11.8b, v0.8b
+ umull2 v13.8h, v11.16b, v1.16b
+ umull v1.8h, v11.8b, v1.8b
+ umull2 v14.8h, v11.16b, v2.16b
+ umull v2.8h, v11.8b, v2.8b
+
+ umull2 v4.8h, v3.16b, v8.16b
+ umull v8.8h, v3.8b, v8.8b
+ umull2 v5.8h, v3.16b, v9.16b
+ umull v9.8h, v3.8b, v9.8b
+ umull2 v6.8h, v3.16b, v10.16b
+ umull v10.8h, v3.8b, v10.8b
+
+ uqadd v12.8h, v12.8h, v4.8h
+ uqadd v0.8h, v0.8h, v8.8h
+ uqadd v13.8h, v13.8h, v5.8h
+ uqadd v1.8h, v1.8h, v9.8h
+ uqadd v14.8h, v14.8h, v6.8h
+ uqadd v2.8h, v2.8h, v10.8h
+
+ urshr v8.8h, v0.8h, #8
+ urshr v4.8h, v12.8h, #8
+ urshr v9.8h, v1.8h, #8
+ urshr v5.8h, v13.8h, #8
+ urshr v10.8h, v2.8h, #8
+ urshr v6.8h, v14.8h, #8
+
+ uqadd v0.8h, v0.8h, v8.8h
+ uqadd v12.8h, v12.8h, v4.8h
+ uqadd v1.8h, v1.8h, v9.8h
+ uqadd v13.8h, v13.8h, v5.8h
+ uqadd v2.8h, v2.8h, v10.8h
+ uqadd v14.8h, v14.8h, v6.8h
+
+ uqrshrn v0.8b, v0.8h, #8
+ uqrshrn2 v0.16b, v12.8h, #8
+ uqrshrn v1.8b, v1.8h, #8
+ uqrshrn2 v1.16b, v13.8h, #8
+ uqrshrn v2.8b, v2.8h, #8
+ uqrshrn2 v2.16b, v14.8h, #8
+.endm
+
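+/* DST_ATOP: result.rgb = dst.rgb * src.a + src.rgb * (1 - dst.a)
+ *           result.a   = src.a
+ * Same structure as SRC_ATOP with the roles of the two alphas swapped; the
+ * final mov copies the source alpha into the result.
+ */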
+#define params_DST_ATOP zipped=1
+.macro blend_kernel_DST_ATOP
+ mvn v3.16b, v3.16b
+
+ umull2 v12.8h, v11.16b, v0.16b
+ umull v0.8h, v11.8b, v0.8b
+ umull2 v13.8h, v11.16b, v1.16b
+ umull v1.8h, v11.8b, v1.8b
+ umull2 v14.8h, v11.16b, v2.16b
+ umull v2.8h, v11.8b, v2.8b
+
+ umull2 v4.8h, v3.16b, v8.16b
+ umull v8.8h, v3.8b, v8.8b
+ umull2 v5.8h, v3.16b, v9.16b
+ umull v9.8h, v3.8b, v9.8b
+ umull2 v6.8h, v3.16b, v10.16b
+ umull v10.8h, v3.8b, v10.8b
+
+ uqadd v12.8h, v12.8h, v4.8h
+ uqadd v0.8h, v0.8h, v8.8h
+ uqadd v13.8h, v13.8h, v5.8h
+ uqadd v1.8h, v1.8h, v9.8h
+ uqadd v14.8h, v14.8h, v6.8h
+ uqadd v2.8h, v2.8h, v10.8h
+
+ urshr v8.8h, v0.8h, #8
+ urshr v4.8h, v12.8h, #8
+ urshr v9.8h, v1.8h, #8
+ urshr v5.8h, v13.8h, #8
+ urshr v10.8h, v2.8h, #8
+ urshr v6.8h, v14.8h, #8
+
+ uqadd v0.8h, v0.8h, v8.8h
+ uqadd v12.8h, v12.8h, v4.8h
+ uqadd v1.8h, v1.8h, v9.8h
+ uqadd v13.8h, v13.8h, v5.8h
+ uqadd v2.8h, v2.8h, v10.8h
+ uqadd v14.8h, v14.8h, v6.8h
+
+ uqrshrn v0.8b, v0.8h, #8
+ uqrshrn2 v0.16b, v12.8h, #8
+ uqrshrn v1.8b, v1.8h, #8
+ uqrshrn2 v1.16b, v13.8h, #8
+ uqrshrn v2.8b, v2.8h, #8
+ uqrshrn2 v2.16b, v14.8h, #8
+
+ mov v3.16b, v11.16b
+.endm
+
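+/* MULTIPLY: result = dst * src / 255, applied to every byte (alpha included).
+ * Because the operation is purely per-byte it runs on interleaved data
+ * (zipped=0).
+ */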
+#define params_MULTIPLY zipped=0
+.macro blend_kernel_MULTIPLY
+ umull2 v12.8h, v0.16b, v8.16b
+ umull v0.8h, v0.8b, v8.8b
+ umull2 v13.8h, v1.16b, v9.16b
+ umull v1.8h, v1.8b, v9.8b
+ umull2 v14.8h, v2.16b, v10.16b
+ umull v2.8h, v2.8b, v10.8b
+ umull2 v15.8h, v3.16b, v11.16b
+ umull v3.8h, v3.8b, v11.8b
+
+ rshrn v4.8b, v0.8h, #8
+ rshrn2 v4.16b, v12.8h, #8
+ rshrn v5.8b, v1.8h, #8
+ rshrn2 v5.16b, v13.8h, #8
+ rshrn v6.8b, v2.8h, #8
+ rshrn2 v6.16b, v14.8h, #8
+ rshrn v7.8b, v3.8h, #8
+ rshrn2 v7.16b, v15.8h, #8
+
+ uaddw v0.8h, v0.8h, v4.8b
+ uaddw2 v12.8h, v12.8h, v4.16b
+ uaddw v1.8h, v1.8h, v5.8b
+ uaddw2 v13.8h, v13.8h, v5.16b
+ uaddw v2.8h, v2.8h, v6.8b
+ uaddw2 v14.8h, v14.8h, v6.16b
+ uaddw v3.8h, v3.8h, v7.8b
+ uaddw2 v15.8h, v15.8h, v7.16b
+
+ rshrn v0.8b, v0.8h, #8
+ rshrn2 v0.16b, v12.8h, #8
+ rshrn v1.8b, v1.8h, #8
+ rshrn2 v1.16b, v13.8h, #8
+ rshrn v2.8b, v2.8h, #8
+ rshrn2 v2.16b, v14.8h, #8
+ rshrn v3.8b, v3.8h, #8
+ rshrn2 v3.16b, v15.8h, #8
+.endm
+
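+/* ADD: result = min(dst + src, 255) per byte (unsigned saturating add). */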
+#define params_ADD zipped=0
+.macro blend_kernel_ADD
+ uqadd v0.16b, v0.16b, v8.16b
+ uqadd v1.16b, v1.16b, v9.16b
+ uqadd v2.16b, v2.16b, v10.16b
+ uqadd v3.16b, v3.16b, v11.16b
+.endm
+
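+/* SUBTRACT: result = max(dst - src, 0) per byte (unsigned saturating subtract). */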
+#define params_SUBTRACT zipped=0
+.macro blend_kernel_SUBTRACT
+ uqsub v0.16b, v0.16b, v8.16b
+ uqsub v1.16b, v1.16b, v9.16b
+ uqsub v2.16b, v2.16b, v10.16b
+ uqsub v3.16b, v3.16b, v11.16b
+.endm
+
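+/* DIFFERENCE: result = |dst - src| per byte (not wired into BLEND_LIST above). */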
+#define params_DIFFERENCE zipped=0
+.macro blend_kernel_DIFFERENCE
+ uabd v0.16b, v0.16b, v8.16b
+ uabd v1.16b, v1.16b, v9.16b
+ uabd v2.16b, v2.16b, v10.16b
+ uabd v3.16b, v3.16b, v11.16b
+.endm
+
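+/* XOR: bitwise exclusive-or of the raw bytes, not the Porter-Duff XOR
+ * compositing operator.
+ */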
+#define params_XOR zipped=0
+.macro blend_kernel_XOR
+ eor v0.16b, v0.16b, v8.16b
+ eor v1.16b, v1.16b, v9.16b
+ eor v2.16b, v2.16b, v10.16b
+ eor v3.16b, v3.16b, v11.16b
+.endm
+
+
+/* Define the wrapper code which will load and store the data, iterate the
+ * correct number of times, and safely handle the remainder at the end of the
+ * loop. Various sections of assembly code are dropped or substituted for
+ * simpler operations if they're not needed.
+ */
+.macro wrap_line kernel, nowrap=0, zipped=1, lddst=1, ldsrc=1, pld=1
+.if \nowrap
+ \kernel
+.else
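+    /* The blend kernels may clobber v8-v15. Their low halves (d8-d15) are
+     * callee-saved under AAPCS64, so spill them to the stack here and restore
+     * them on the exit path.
+     */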
+ sub x3, sp, #32
+ sub sp, sp, #64
+ st1 {v8.1d - v11.1d}, [sp]
+ st1 {v12.1d - v15.1d}, [x3]
+ subs x2, x2, #64
+ b 2f
+.align 4
+1:
+ .if \lddst
+ .if \zipped
+ ld4 {v0.16b - v3.16b}, [x0]
+ .else
+ ld1 {v0.16b - v3.16b}, [x0]
+ .endif
+ .endif
+ .if \ldsrc
+ .if \zipped
+ ld4 {v8.16b - v11.16b}, [x1], #64
+ .else
+ ld1 {v8.16b - v11.16b}, [x1], #64
+ .endif
+ .endif
+ .if \pld
+#if 0 /* TODO: test this on real hardware */
+ .if \lddst ; prfm PLDL1STRM, [x0, #192] ; .endif
+ .if \ldsrc ; prfm PLDL1STRM, [x1, #192] ; .endif
+#endif
+ .endif
+
+ \kernel
+
+ subs x2, x2, #64
+ .if \zipped
+ st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [x0], #64
+ .else
+ st1 {v0.16b,v1.16b,v2.16b,v3.16b}, [x0], #64
+ .endif
+
+2: bge 1b
+ adds x2, x2, #64
+ beq 2f
+
+    /* To handle the tail portion of the data (anything less than 64
+     * bytes), load small power-of-two chunks into the working registers. It
+     * doesn't matter where they end up within a register: the same positions
+     * are used when storing them back out, and none of the operations require
+     * neighbouring bytes to interact.
+     */
+ movi v0.16b, #0
+ movi v1.16b, #0
+ movi v2.16b, #0
+ movi v3.16b, #0
+
+ movi v8.16b, #0
+ movi v9.16b, #0
+ movi v10.16b, #0
+ movi v11.16b, #0
+
+ tbz x2, #5, 1f
+ .if \lddst ; ld1 {v2.16b,v3.16b}, [x0], #32 ; .endif
+ .if \ldsrc ; ld1 {v10.16b,v11.16b}, [x1], #32 ; .endif
+1: tbz x2, #4, 1f
+ .if \lddst ; ld1 {v1.16b}, [x0], #16 ; .endif
+ .if \ldsrc ; ld1 {v9.16b}, [x1], #16 ; .endif
+1: tbz x2, #3, 1f
+ .if \lddst ; ld1 {v0.d}[1], [x0], #8 ; .endif
+ .if \ldsrc ; ld1 {v8.d}[1], [x1], #8 ; .endif
+1: tbz x2, #2, 1f
+ .if \lddst ; ld1 {v0.s}[1], [x0], #4 ; .endif
+ .if \ldsrc ; ld1 {v8.s}[1], [x1], #4 ; .endif
+1: tbz x2, #1, 1f
+ .if \lddst ; ld1 {v0.h}[1], [x0], #2 ; .endif
+ .if \ldsrc ; ld1 {v8.h}[1], [x1], #2 ; .endif
+1: tbz x2, #0, 1f
+ .if \lddst ; ld1 {v0.b}[1], [x0], #1 ; .endif
+ .if \ldsrc ; ld1 {v8.b}[1], [x1], #1 ; .endif
+1:
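+    /* Rewind the destination pointer by the tail length so that the partial
+     * stores below write back over the bytes that were just loaded.
+     */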
+ .if \lddst ; sub x0, x0, x2 ; .endif
+
+.if \zipped
+ /* One small impediment in the process above is that some of the load
+ * operations can't perform byte-wise structure deinterleaving at the
+ * same time as loading only part of a register. So the data is loaded
+ * linearly and unpacked manually at this point.
+ */
+ uzp1 v4.16b, v0.16b, v1.16b
+ uzp2 v5.16b, v0.16b, v1.16b
+ uzp1 v6.16b, v2.16b, v3.16b
+ uzp2 v7.16b, v2.16b, v3.16b
+ uzp1 v0.16b, v4.16b, v6.16b
+ uzp2 v2.16b, v4.16b, v6.16b
+ uzp1 v1.16b, v5.16b, v7.16b
+ uzp2 v3.16b, v5.16b, v7.16b
+
+ uzp1 v4.16b, v8.16b, v9.16b
+ uzp2 v5.16b, v8.16b, v9.16b
+ uzp1 v6.16b, v10.16b, v11.16b
+ uzp2 v7.16b, v10.16b, v11.16b
+ uzp1 v8.16b, v4.16b, v6.16b
+ uzp2 v10.16b, v4.16b, v6.16b
+ uzp1 v9.16b, v5.16b, v7.16b
+ uzp2 v11.16b, v5.16b, v7.16b
+
+ \kernel
+
+ zip1 v4.16b, v0.16b, v2.16b
+ zip2 v6.16b, v0.16b, v2.16b
+ zip1 v5.16b, v1.16b, v3.16b
+ zip2 v7.16b, v1.16b, v3.16b
+ zip1 v0.16b, v4.16b, v5.16b
+ zip2 v1.16b, v4.16b, v5.16b
+ zip1 v2.16b, v6.16b, v7.16b
+ zip2 v3.16b, v6.16b, v7.16b
+ .else
+ \kernel
+ .endif
+
+ tbz x2, #5, 1f
+ st1 {v2.16b,v3.16b}, [x0], #32
+1: tbz x2, #4, 1f
+ st1 {v1.16b}, [x0], #16
+1: tbz x2, #3, 1f
+ st1 {v0.d}[1], [x0], #8
+1: tbz x2, #2, 1f
+ st1 {v0.s}[1], [x0], #4
+1: tbz x2, #1, 1f
+ st1 {v0.h}[1], [x0], #2
+1: tbz x2, #0, 2f
+ st1 {v0.b}[1], [x0], #1
+2: ld1 {v8.1d - v11.1d}, [sp], #32
+ ld1 {v12.1d - v15.1d}, [sp], #32
+.endif
+ mov x0, #0
+ ret
+.endm
+
+
+/* Produce the list of blend_line_XX() functions; each function uses the
+ * wrap_line macro, passing it the name of the kernel macro it wants along
+ * with optional parameters to remove unnecessary operations.
+ */
+#define BLEND_X(d, n) ENTRY(blend_line_##n) ; wrap_line blend_kernel_##n, params_##n ; END(blend_line_##n) ;
+ BLEND_LIST(BLEND_X)
+#undef BLEND_X
+
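+/* Derive tablesize: after expanding the list, it is one greater than the
+ * highest enumerated operation index.
+ */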
+#define BLEND_X(d, n) .set tablesize, d+1 ;
+ BLEND_LIST(BLEND_X)
+#undef BLEND_X
+
+/* int rsdIntrinsicBlend_K(
+ * uchar4 *out, // x0
+ * uchar4 const *in, // x1
+ * int slot, // x2
+ * size_t xstart, // x3
+ * size_t xend); // x4
+ */
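+/* Look the requested slot up in blendtable (a table of halfword offsets
+ * relative to the br instruction at label 2 below), advance the buffer
+ * pointers to xstart, convert the x range into a byte count, and jump to the
+ * matching blend_line_* routine. Out-of-range or unimplemented slots return
+ * -1; the line routines return 0 on completion.
+ */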
+ENTRY(rsdIntrinsicBlend_K)
+ adrp x5, blendtable
+ add x5, x5, :lo12:blendtable
+ cmp w2, tablesize
+ bhs 1f
+ ldrsh x6, [x5, w2, uxtw #1]
+ add x0, x0, w3, uxtw #2
+ add x1, x1, w3, uxtw #2
+ sub w2, w4, w3
+ ubfiz x2, x2, #2, #32 /* TODO: fix */
+ cbz x6, 1f
+ adr x5, 2f
+ add x6, x5, x6
+2: br x6
+1: mov x0, #-1
+ ret
+
+END(rsdIntrinsicBlend_K)
+
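+/* blendtable: one signed halfword per operation, holding the offset from the
+ * br at label 2 in rsdIntrinsicBlend_K to the corresponding blend_line_*
+ * entry point. Gaps in the enumeration are zero-filled and treated as
+ * unimplemented by the dispatcher.
+ */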
+.rodata
+.set off,0
+blendtable:
+#define BLEND_X(d, n) .rept d-off ; .hword 0 ; .endr ; .hword blend_line_##n - 2b ; .set off, d+1 ;
+ BLEND_LIST(BLEND_X)
+#undef BLEND_X