Diffstat (limited to 'renderscript-toolkit/src/main/cpp')
33 files changed, 17208 insertions, 0 deletions
diff --git a/renderscript-toolkit/src/main/cpp/Blend.cpp b/renderscript-toolkit/src/main/cpp/Blend.cpp new file mode 100644 index 0000000..6689756 --- /dev/null +++ b/renderscript-toolkit/src/main/cpp/Blend.cpp @@ -0,0 +1,367 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <cassert> +#include <cstdint> + +#include "RenderScriptToolkit.h" +#include "TaskProcessor.h" +#include "Utils.h" + +namespace renderscript { + +#define LOG_TAG "renderscript.toolkit.Blend" + +/** + * Blends a source into a destination, based on the mode. + */ +class BlendTask : public Task { + // The type of blending to do. + RenderScriptToolkit::BlendingMode mMode; + // The input we're blending. + const uchar4* mIn; + // The destination, used both for input and output. + uchar4* mOut; + + void blend(RenderScriptToolkit::BlendingMode mode, const uchar4* in, uchar4* out, + uint32_t length); + // Process a 2D tile of the overall work. threadIndex identifies which thread does the work. + void processData(int threadIndex, size_t startX, size_t startY, size_t endX, + size_t endY) override; + + public: + BlendTask(RenderScriptToolkit::BlendingMode mode, const uint8_t* in, uint8_t* out, size_t sizeX, + size_t sizeY, const Restriction* restriction) + : Task{sizeX, sizeY, 4, true, restriction}, + mMode{mode}, + mIn{reinterpret_cast<const uchar4*>(in)}, + mOut{reinterpret_cast<uchar4*>(out)} {} +}; + +#if defined(ARCH_ARM_USE_INTRINSICS) +extern "C" int rsdIntrinsicBlend_K(uchar4 *out, uchar4 const *in, int slot, + uint32_t xstart, uint32_t xend); +#endif + +#if defined(ARCH_X86_HAVE_SSSE3) +extern void rsdIntrinsicBlendSrcOver_K(void *dst, const void *src, uint32_t count8); +extern void rsdIntrinsicBlendDstOver_K(void *dst, const void *src, uint32_t count8); +extern void rsdIntrinsicBlendSrcIn_K(void *dst, const void *src, uint32_t count8); +extern void rsdIntrinsicBlendDstIn_K(void *dst, const void *src, uint32_t count8); +extern void rsdIntrinsicBlendSrcOut_K(void *dst, const void *src, uint32_t count8); +extern void rsdIntrinsicBlendDstOut_K(void *dst, const void *src, uint32_t count8); +extern void rsdIntrinsicBlendSrcAtop_K(void *dst, const void *src, uint32_t count8); +extern void rsdIntrinsicBlendDstAtop_K(void *dst, const void *src, uint32_t count8); +extern void rsdIntrinsicBlendXor_K(void *dst, const void *src, uint32_t count8); +extern void rsdIntrinsicBlendMultiply_K(void *dst, const void *src, uint32_t count8); +extern void rsdIntrinsicBlendAdd_K(void *dst, const void *src, uint32_t count8); +extern void rsdIntrinsicBlendSub_K(void *dst, const void *src, uint32_t count8); +#endif + +// Convert vector to uchar4, clipping each value to 255. +template <typename TI> +static inline uchar4 convertClipped(TI amount) { + return uchar4 { static_cast<uchar>(amount.x > 255 ? 255 : amount.x), + static_cast<uchar>(amount.y > 255 ? 255 : amount.y), + static_cast<uchar>(amount.z > 255 ? 255 : amount.z), + static_cast<uchar>(amount.w > 255 ? 
255 : amount.w)}; +} + +void BlendTask::blend(RenderScriptToolkit::BlendingMode mode, const uchar4* in, uchar4* out, + uint32_t length) { + uint32_t x1 = 0; + uint32_t x2 = length; + +#if defined(ARCH_ARM_USE_INTRINSICS) + if (mUsesSimd) { + if (rsdIntrinsicBlend_K(out, in, (int) mode, x1, x2) >= 0) { + return; + } else { + ALOGW("Intrinsic Blend failed to use SIMD for %d", mode); + } + } +#endif + switch (mode) { + case RenderScriptToolkit::BlendingMode::CLEAR: + for (;x1 < x2; x1++, out++) { + *out = 0; + } + break; + case RenderScriptToolkit::BlendingMode::SRC: + for (;x1 < x2; x1++, out++, in++) { + *out = *in; + } + break; + //RenderScriptToolkit::BlendingMode::DST is a NOP + case RenderScriptToolkit::BlendingMode::DST: + break; + case RenderScriptToolkit::BlendingMode::SRC_OVER: + #if defined(ARCH_X86_HAVE_SSSE3) + if (mUsesSimd) { + if ((x1 + 8) < x2) { + uint32_t len = (x2 - x1) >> 3; + rsdIntrinsicBlendSrcOver_K(out, in, len); + x1 += len << 3; + out += len << 3; + in += len << 3; + } + } + #endif + for (;x1 < x2; x1++, out++, in++) { + ushort4 in_s = convert<ushort4>(*in); + ushort4 out_s = convert<ushort4>(*out); + in_s = in_s + ((out_s * (ushort4)(255 - in_s.w)) >> (ushort4)8); + *out = convertClipped(in_s); + } + break; + case RenderScriptToolkit::BlendingMode::DST_OVER: + #if defined(ARCH_X86_HAVE_SSSE3) + if (mUsesSimd) { + if ((x1 + 8) < x2) { + uint32_t len = (x2 - x1) >> 3; + rsdIntrinsicBlendDstOver_K(out, in, len); + x1 += len << 3; + out += len << 3; + in += len << 3; + } + } + #endif + for (;x1 < x2; x1++, out++, in++) { + ushort4 in_s = convert<ushort4>(*in); + ushort4 out_s = convert<ushort4>(*out); + in_s = out_s + ((in_s * (ushort4)(255 - out_s.w)) >> (ushort4)8); + *out = convertClipped(in_s); + } + break; + case RenderScriptToolkit::BlendingMode::SRC_IN: + #if defined(ARCH_X86_HAVE_SSSE3) + if (mUsesSimd) { + if ((x1 + 8) < x2) { + uint32_t len = (x2 - x1) >> 3; + rsdIntrinsicBlendSrcIn_K(out, in, len); + x1 += len << 3; + out += len << 3; + in += len << 3; + } + } +#endif + for (;x1 < x2; x1++, out++, in++) { + ushort4 in_s = convert<ushort4>(*in); + in_s = (in_s * out->w) >> (ushort4)8; + *out = convert<uchar4>(in_s); + } + break; + case RenderScriptToolkit::BlendingMode::DST_IN: + #if defined(ARCH_X86_HAVE_SSSE3) + if (mUsesSimd) { + if ((x1 + 8) < x2) { + uint32_t len = (x2 - x1) >> 3; + rsdIntrinsicBlendDstIn_K(out, in, len); + x1 += len << 3; + out += len << 3; + in += len << 3; + } + } + #endif + for (;x1 < x2; x1++, out++, in++) { + ushort4 out_s = convert<ushort4>(*out); + out_s = (out_s * in->w) >> (ushort4)8; + *out = convert<uchar4>(out_s); + } + break; + case RenderScriptToolkit::BlendingMode::SRC_OUT: + #if defined(ARCH_X86_HAVE_SSSE3) + if (mUsesSimd) { + if ((x1 + 8) < x2) { + uint32_t len = (x2 - x1) >> 3; + rsdIntrinsicBlendSrcOut_K(out, in, len); + x1 += len << 3; + out += len << 3; + in += len << 3; + } + } + #endif + for (;x1 < x2; x1++, out++, in++) { + ushort4 in_s = convert<ushort4>(*in); + in_s = (in_s * (ushort4)(255 - out->w)) >> (ushort4)8; + *out = convert<uchar4>(in_s); + } + break; + case RenderScriptToolkit::BlendingMode::DST_OUT: + #if defined(ARCH_X86_HAVE_SSSE3) + if (mUsesSimd) { + if ((x1 + 8) < x2) { + uint32_t len = (x2 - x1) >> 3; + rsdIntrinsicBlendDstOut_K(out, in, len); + x1 += len << 3; + out += len << 3; + in += len << 3; + } + } + #endif + for (;x1 < x2; x1++, out++, in++) { + ushort4 out_s = convert<ushort4>(*out); + out_s = (out_s * (ushort4)(255 - in->w)) >> (ushort4)8; + *out = convert<uchar4>(out_s); + } + 
break; + case RenderScriptToolkit::BlendingMode::SRC_ATOP: + #if defined(ARCH_X86_HAVE_SSSE3) + if (mUsesSimd) { + if ((x1 + 8) < x2) { + uint32_t len = (x2 - x1) >> 3; + rsdIntrinsicBlendSrcAtop_K(out, in, len); + x1 += len << 3; + out += len << 3; + in += len << 3; + } + } + #endif + for (;x1 < x2; x1++, out++, in++) { + // The max value the operation could produce before the shift + // is 255 * 255 + 255 * (255 - 0) = 130050, or 0x1FC02. + // That value does not fit in a ushort, so we use uint. + uint4 in_s = convert<uint4>(*in); + uint4 out_s = convert<uint4>(*out); + out_s.xyz = ((in_s.xyz * out_s.w) + + (out_s.xyz * ((uint3)255 - (uint3)in_s.w))) >> (uint3)8; + *out = convertClipped(out_s); + } + break; + case RenderScriptToolkit::BlendingMode::DST_ATOP: + #if defined(ARCH_X86_HAVE_SSSE3) + if (mUsesSimd) { + if ((x1 + 8) < x2) { + uint32_t len = (x2 - x1) >> 3; + rsdIntrinsicBlendDstAtop_K(out, in, len); + x1 += len << 3; + out += len << 3; + in += len << 3; + } + } + #endif + for (;x1 < x2; x1++, out++, in++) { + uint4 in_s = convert<uint4>(*in); + uint4 out_s = convert<uint4>(*out); + out_s.xyz = ((out_s.xyz * in_s.w) + + (in_s.xyz * ((uint3)255 - (uint3)out_s.w))) >> (uint3)8; + out_s.w = in_s.w; + *out = convertClipped(out_s); + } + break; + case RenderScriptToolkit::BlendingMode::XOR: + #if defined(ARCH_X86_HAVE_SSSE3) + if (mUsesSimd) { + if ((x1 + 8) < x2) { + uint32_t len = (x2 - x1) >> 3; + rsdIntrinsicBlendXor_K(out, in, len); + x1 += len << 3; + out += len << 3; + in += len << 3; + } + } + #endif + for (;x1 < x2; x1++, out++, in++) { + *out = *in ^ *out; + } + break; + case RenderScriptToolkit::BlendingMode::MULTIPLY: + #if defined(ARCH_X86_HAVE_SSSE3) + if (mUsesSimd) { + if ((x1 + 8) < x2) { + uint32_t len = (x2 - x1) >> 3; + rsdIntrinsicBlendMultiply_K(out, in, len); + x1 += len << 3; + out += len << 3; + in += len << 3; + } + } + #endif + for (;x1 < x2; x1++, out++, in++) { + *out = convert<uchar4>((convert<ushort4>(*in) * convert<ushort4>(*out)) + >> (ushort4)8); + } + break; + case RenderScriptToolkit::BlendingMode::ADD: + #if defined(ARCH_X86_HAVE_SSSE3) + if (mUsesSimd) { + if((x1 + 8) < x2) { + uint32_t len = (x2 - x1) >> 3; + rsdIntrinsicBlendAdd_K(out, in, len); + x1 += len << 3; + out += len << 3; + in += len << 3; + } + } + #endif + for (;x1 < x2; x1++, out++, in++) { + uint32_t iR = in->x, iG = in->y, iB = in->z, iA = in->w, + oR = out->x, oG = out->y, oB = out->z, oA = out->w; + out->x = (oR + iR) > 255 ? 255 : oR + iR; + out->y = (oG + iG) > 255 ? 255 : oG + iG; + out->z = (oB + iB) > 255 ? 255 : oB + iB; + out->w = (oA + iA) > 255 ? 255 : oA + iA; + } + break; + case RenderScriptToolkit::BlendingMode::SUBTRACT: + #if defined(ARCH_X86_HAVE_SSSE3) + if (mUsesSimd) { + if((x1 + 8) < x2) { + uint32_t len = (x2 - x1) >> 3; + rsdIntrinsicBlendSub_K(out, in, len); + x1 += len << 3; + out += len << 3; + in += len << 3; + } + } + #endif + for (;x1 < x2; x1++, out++, in++) { + int32_t iR = in->x, iG = in->y, iB = in->z, iA = in->w, + oR = out->x, oG = out->y, oB = out->z, oA = out->w; + out->x = (oR - iR) < 0 ? 0 : oR - iR; + out->y = (oG - iG) < 0 ? 0 : oG - iG; + out->z = (oB - iB) < 0 ? 0 : oB - iB; + out->w = (oA - iA) < 0 ? 
0 : oA - iA; + } + break; + + default: + ALOGE("Called unimplemented value %d", mode); + assert(false); + } +} + +void BlendTask::processData(int /* threadIndex */, size_t startX, size_t startY, size_t endX, + size_t endY) { + for (size_t y = startY; y < endY; y++) { + size_t offset = y * mSizeX + startX; + blend(mMode, mIn + offset, mOut + offset, endX - startX); + } +} + +void RenderScriptToolkit::blend(BlendingMode mode, const uint8_t* in, uint8_t* out, size_t sizeX, + size_t sizeY, const Restriction* restriction) { +#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE + if (!validRestriction(LOG_TAG, sizeX, sizeY, restriction)) { + return; + } +#endif + + BlendTask task(mode, in, out, sizeX, sizeY, restriction); + processor->doTask(&task); +} + +} // namespace google::android::renderscript diff --git a/renderscript-toolkit/src/main/cpp/Blend_advsimd.S b/renderscript-toolkit/src/main/cpp/Blend_advsimd.S new file mode 100644 index 0000000..e5cb29b --- /dev/null +++ b/renderscript-toolkit/src/main/cpp/Blend_advsimd.S @@ -0,0 +1,622 @@ +/* + * Copyright (C) 2013-2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f: +#define END(f) .size f, .-f; + +#define BLEND_LIST(X) \ + X(0, CLEAR) \ + X(1, SRC) \ + X(2, DST) \ + X(3, SRC_OVER) \ + X(4, DST_OVER) \ + X(5, SRC_IN) \ + X(6, DST_IN) \ + X(7, SRC_OUT) \ + X(8, DST_OUT) \ + X(9, SRC_ATOP) \ + X(10, DST_ATOP) \ + X(11, XOR) \ + X(12, MULTIPLY) \ + X(13, ADD) \ + X(14, SUBTRACT) + +/* This operation was not enabled in the original RenderScript. We could + * enable it. + * + * X(15, DIFFERENCE) \ + */ + +/* For every blend operation supported, define a macro with just the arithmetic + * component. The rest can be handled later on. + * + * At entry q0-q3 contain the RGBA data from the destination buffer, and q8-q11 + * contain the data from the source buffer. Both have already been split out + * into one colour component per register (if necessary). q3 and q11 contain + * the alpha components. + * + * At the same time as defining the assembly macro, define a corresponding + * preprocessor macro indicating any other requirements. + * zipped=0 -- The macro does not require the RGBA components to be + * separated. + * lddst=0 -- The macro does not require data from the destination buffer. + * ldsrc=0 -- The macro does not require data from the source buffer. + * nowrap=1 -- The macro requires no wrapper at all, and should simply be + * inserted without any surrounding load/store or loop code. 
+ */ + +#define params_CLEAR zipped=0, lddst=0, ldsrc=0 +.macro blend_kernel_CLEAR + movi v0.16b, #0 + movi v1.16b, #0 + movi v2.16b, #0 + movi v3.16b, #0 +.endm + +#define params_SRC zipped=0, lddst=0 +.macro blend_kernel_SRC + mov v0.16b, v8.16b + mov v1.16b, v9.16b + mov v2.16b, v10.16b + mov v3.16b, v11.16b +.endm + +#define params_DST nowrap=1 +.macro blend_kernel_DST + /* nop */ +.endm + +#define params_SRC_OVER zipped=1 +.macro blend_kernel_SRC_OVER + mvn v7.16b, v11.16b + + umull2 v12.8h, v7.16b, v0.16b + umull v0.8h, v7.8b, v0.8b + umull2 v13.8h, v7.16b, v1.16b + umull v1.8h, v7.8b, v1.8b + umull2 v14.8h, v7.16b, v2.16b + umull v2.8h, v7.8b, v2.8b + umull2 v15.8h, v7.16b, v3.16b + umull v3.8h, v7.8b, v3.8b + + rshrn v4.8b, v0.8h, #8 + rshrn2 v4.16b, v12.8h, #8 + rshrn v5.8b, v1.8h, #8 + rshrn2 v5.16b, v13.8h, #8 + rshrn v6.8b, v2.8h, #8 + rshrn2 v6.16b, v14.8h, #8 + rshrn v7.8b, v3.8h, #8 + rshrn2 v7.16b, v15.8h, #8 + + uaddw v0.8h, v0.8h, v4.8b + uaddw2 v12.8h, v12.8h, v4.16b + uaddw v1.8h, v1.8h, v5.8b + uaddw2 v13.8h, v13.8h, v5.16b + uaddw v2.8h, v2.8h, v6.8b + uaddw2 v14.8h, v14.8h, v6.16b + uaddw v3.8h, v3.8h, v7.8b + uaddw2 v15.8h, v15.8h, v7.16b + + rshrn v0.8b, v0.8h, #8 + rshrn2 v0.16b, v12.8h, #8 + rshrn v1.8b, v1.8h, #8 + rshrn2 v1.16b, v13.8h, #8 + rshrn v2.8b, v2.8h, #8 + rshrn2 v2.16b, v14.8h, #8 + rshrn v3.8b, v3.8h, #8 + rshrn2 v3.16b, v15.8h, #8 + + uqadd v0.16b, v0.16b, v8.16b + uqadd v1.16b, v1.16b, v9.16b + uqadd v2.16b, v2.16b, v10.16b + uqadd v3.16b, v3.16b, v11.16b +.endm + +#define params_DST_OVER zipped=1 +.macro blend_kernel_DST_OVER + mvn v7.16b, v3.16b + + umull2 v12.8h, v7.16b, v8.16b + umull v8.8h, v7.8b, v8.8b + umull2 v13.8h, v7.16b, v9.16b + umull v9.8h, v7.8b, v9.8b + umull2 v14.8h, v7.16b, v10.16b + umull v10.8h, v7.8b, v10.8b + umull2 v15.8h, v7.16b, v11.16b + umull v11.8h, v7.8b, v11.8b + + rshrn v4.8b, v8.8h, #8 + rshrn2 v4.16b, v12.8h, #8 + rshrn v5.8b, v9.8h, #8 + rshrn2 v5.16b, v13.8h, #8 + rshrn v6.8b, v10.8h, #8 + rshrn2 v6.16b, v14.8h, #8 + rshrn v7.8b, v11.8h, #8 + rshrn2 v7.16b, v15.8h, #8 + + uaddw v8.8h, v8.8h, v4.8b + uaddw2 v12.8h, v12.8h, v4.16b + uaddw v9.8h, v9.8h, v5.8b + uaddw2 v13.8h, v13.8h, v5.16b + uaddw v10.8h, v10.8h, v6.8b + uaddw2 v14.8h, v14.8h, v6.16b + uaddw v11.8h, v11.8h, v7.8b + uaddw2 v15.8h, v15.8h, v7.16b + + rshrn v8.8b, v8.8h, #8 + rshrn2 v8.16b, v12.8h, #8 + rshrn v9.8b, v9.8h, #8 + rshrn2 v9.16b, v13.8h, #8 + rshrn v10.8b, v10.8h, #8 + rshrn2 v10.16b, v14.8h, #8 + rshrn v11.8b, v11.8h, #8 + rshrn2 v11.16b, v15.8h, #8 + + uqadd v0.16b, v0.16b, v8.16b + uqadd v1.16b, v1.16b, v9.16b + uqadd v2.16b, v2.16b, v10.16b + uqadd v3.16b, v3.16b, v11.16b +.endm + +#define params_SRC_IN zipped=1 +.macro blend_kernel_SRC_IN + umull2 v12.8h, v3.16b, v8.16b + umull v0.8h, v3.8b, v8.8b + umull2 v13.8h, v3.16b, v9.16b + umull v1.8h, v3.8b, v9.8b + umull2 v14.8h, v3.16b, v10.16b + umull v2.8h, v3.8b, v10.8b + umull2 v15.8h, v3.16b, v11.16b + umull v3.8h, v3.8b, v11.8b + + rshrn v4.8b, v0.8h, #8 + rshrn2 v4.16b, v12.8h, #8 + rshrn v5.8b, v1.8h, #8 + rshrn2 v5.16b, v13.8h, #8 + rshrn v6.8b, v2.8h, #8 + rshrn2 v6.16b, v14.8h, #8 + rshrn v7.8b, v3.8h, #8 + rshrn2 v7.16b, v15.8h, #8 + + uaddw v0.8h, v0.8h, v4.8b + uaddw2 v12.8h, v12.8h, v4.16b + uaddw v1.8h, v1.8h, v5.8b + uaddw2 v13.8h, v13.8h, v5.16b + uaddw v2.8h, v2.8h, v6.8b + uaddw2 v14.8h, v14.8h, v6.16b + uaddw v3.8h, v3.8h, v7.8b + uaddw2 v15.8h, v15.8h, v7.16b + + rshrn v0.8b, v0.8h, #8 + rshrn2 v0.16b, v12.8h, #8 + rshrn v1.8b, v1.8h, #8 + rshrn2 v1.16b, v13.8h, #8 + 
rshrn v2.8b, v2.8h, #8 + rshrn2 v2.16b, v14.8h, #8 + rshrn v3.8b, v3.8h, #8 + rshrn2 v3.16b, v15.8h, #8 +.endm + +#define params_DST_IN zipped=1 +.macro blend_kernel_DST_IN + umull2 v12.8h, v0.16b, v11.16b + umull v0.8h, v0.8b, v11.8b + umull2 v13.8h, v1.16b, v11.16b + umull v1.8h, v1.8b, v11.8b + umull2 v14.8h, v2.16b, v11.16b + umull v2.8h, v2.8b, v11.8b + umull2 v15.8h, v3.16b, v11.16b + umull v3.8h, v3.8b, v11.8b + + rshrn v4.8b, v0.8h, #8 + rshrn2 v4.16b, v12.8h, #8 + rshrn v5.8b, v1.8h, #8 + rshrn2 v5.16b, v13.8h, #8 + rshrn v6.8b, v2.8h, #8 + rshrn2 v6.16b, v14.8h, #8 + rshrn v7.8b, v3.8h, #8 + rshrn2 v7.16b, v15.8h, #8 + + uaddw v0.8h, v0.8h, v4.8b + uaddw2 v12.8h, v12.8h, v4.16b + uaddw v1.8h, v1.8h, v5.8b + uaddw2 v13.8h, v13.8h, v5.16b + uaddw v2.8h, v2.8h, v6.8b + uaddw2 v14.8h, v14.8h, v6.16b + uaddw v3.8h, v3.8h, v7.8b + uaddw2 v15.8h, v15.8h, v7.16b + + rshrn v0.8b, v0.8h, #8 + rshrn2 v0.16b, v12.8h, #8 + rshrn v1.8b, v1.8h, #8 + rshrn2 v1.16b, v13.8h, #8 + rshrn v2.8b, v2.8h, #8 + rshrn2 v2.16b, v14.8h, #8 + rshrn v3.8b, v3.8h, #8 + rshrn2 v3.16b, v15.8h, #8 +.endm + +#define params_SRC_OUT zipped=1 +.macro blend_kernel_SRC_OUT + mvn v3.16b, v3.16b + blend_kernel_SRC_IN +.endm + + +#define params_DST_OUT zipped=1 +.macro blend_kernel_DST_OUT + mvn v11.16b, v11.16b + blend_kernel_DST_IN +.endm + +#define params_SRC_ATOP zipped=1 +.macro blend_kernel_SRC_ATOP + mvn v11.16b, v11.16b + + umull2 v12.8h, v11.16b, v0.16b + umull v0.8h, v11.8b, v0.8b + umull2 v13.8h, v11.16b, v1.16b + umull v1.8h, v11.8b, v1.8b + umull2 v14.8h, v11.16b, v2.16b + umull v2.8h, v11.8b, v2.8b + + umull2 v4.8h, v3.16b, v8.16b + umull v8.8h, v3.8b, v8.8b + umull2 v5.8h, v3.16b, v9.16b + umull v9.8h, v3.8b, v9.8b + umull2 v6.8h, v3.16b, v10.16b + umull v10.8h, v3.8b, v10.8b + + uqadd v12.8h, v12.8h, v4.8h + uqadd v0.8h, v0.8h, v8.8h + uqadd v13.8h, v13.8h, v5.8h + uqadd v1.8h, v1.8h, v9.8h + uqadd v14.8h, v14.8h, v6.8h + uqadd v2.8h, v2.8h, v10.8h + + urshr v8.8h, v0.8h, #8 + urshr v4.8h, v12.8h, #8 + urshr v9.8h, v1.8h, #8 + urshr v5.8h, v13.8h, #8 + urshr v10.8h, v2.8h, #8 + urshr v6.8h, v14.8h, #8 + + uqadd v0.8h, v0.8h, v8.8h + uqadd v12.8h, v12.8h, v4.8h + uqadd v1.8h, v1.8h, v9.8h + uqadd v13.8h, v13.8h, v5.8h + uqadd v2.8h, v2.8h, v10.8h + uqadd v14.8h, v14.8h, v6.8h + + uqrshrn v0.8b, v0.8h, #8 + uqrshrn2 v0.16b, v12.8h, #8 + uqrshrn v1.8b, v1.8h, #8 + uqrshrn2 v1.16b, v13.8h, #8 + uqrshrn v2.8b, v2.8h, #8 + uqrshrn2 v2.16b, v14.8h, #8 +.endm + +#define params_DST_ATOP zipped=1 +.macro blend_kernel_DST_ATOP + mvn v3.16b, v3.16b + + umull2 v12.8h, v11.16b, v0.16b + umull v0.8h, v11.8b, v0.8b + umull2 v13.8h, v11.16b, v1.16b + umull v1.8h, v11.8b, v1.8b + umull2 v14.8h, v11.16b, v2.16b + umull v2.8h, v11.8b, v2.8b + + umull2 v4.8h, v3.16b, v8.16b + umull v8.8h, v3.8b, v8.8b + umull2 v5.8h, v3.16b, v9.16b + umull v9.8h, v3.8b, v9.8b + umull2 v6.8h, v3.16b, v10.16b + umull v10.8h, v3.8b, v10.8b + + uqadd v12.8h, v12.8h, v4.8h + uqadd v0.8h, v0.8h, v8.8h + uqadd v13.8h, v13.8h, v5.8h + uqadd v1.8h, v1.8h, v9.8h + uqadd v14.8h, v14.8h, v6.8h + uqadd v2.8h, v2.8h, v10.8h + + urshr v8.8h, v0.8h, #8 + urshr v4.8h, v12.8h, #8 + urshr v9.8h, v1.8h, #8 + urshr v5.8h, v13.8h, #8 + urshr v10.8h, v2.8h, #8 + urshr v6.8h, v14.8h, #8 + + uqadd v0.8h, v0.8h, v8.8h + uqadd v12.8h, v12.8h, v4.8h + uqadd v1.8h, v1.8h, v9.8h + uqadd v13.8h, v13.8h, v5.8h + uqadd v2.8h, v2.8h, v10.8h + uqadd v14.8h, v14.8h, v6.8h + + uqrshrn v0.8b, v0.8h, #8 + uqrshrn2 v0.16b, v12.8h, #8 + uqrshrn v1.8b, v1.8h, #8 + uqrshrn2 v1.16b, 
v13.8h, #8 + uqrshrn v2.8b, v2.8h, #8 + uqrshrn2 v2.16b, v14.8h, #8 + + mov v3.16b, v11.16b +.endm + +#define params_MULTIPLY zipped=0 +.macro blend_kernel_MULTIPLY + umull2 v12.8h, v0.16b, v8.16b + umull v0.8h, v0.8b, v8.8b + umull2 v13.8h, v1.16b, v9.16b + umull v1.8h, v1.8b, v9.8b + umull2 v14.8h, v2.16b, v10.16b + umull v2.8h, v2.8b, v10.8b + umull2 v15.8h, v3.16b, v11.16b + umull v3.8h, v3.8b, v11.8b + + rshrn v4.8b, v0.8h, #8 + rshrn2 v4.16b, v12.8h, #8 + rshrn v5.8b, v1.8h, #8 + rshrn2 v5.16b, v13.8h, #8 + rshrn v6.8b, v2.8h, #8 + rshrn2 v6.16b, v14.8h, #8 + rshrn v7.8b, v3.8h, #8 + rshrn2 v7.16b, v15.8h, #8 + + uaddw v0.8h, v0.8h, v4.8b + uaddw2 v12.8h, v12.8h, v4.16b + uaddw v1.8h, v1.8h, v5.8b + uaddw2 v13.8h, v13.8h, v5.16b + uaddw v2.8h, v2.8h, v6.8b + uaddw2 v14.8h, v14.8h, v6.16b + uaddw v3.8h, v3.8h, v7.8b + uaddw2 v15.8h, v15.8h, v7.16b + + rshrn v0.8b, v0.8h, #8 + rshrn2 v0.16b, v12.8h, #8 + rshrn v1.8b, v1.8h, #8 + rshrn2 v1.16b, v13.8h, #8 + rshrn v2.8b, v2.8h, #8 + rshrn2 v2.16b, v14.8h, #8 + rshrn v3.8b, v3.8h, #8 + rshrn2 v3.16b, v15.8h, #8 +.endm + +#define params_ADD zipped=0 +.macro blend_kernel_ADD + uqadd v0.16b, v0.16b, v8.16b + uqadd v1.16b, v1.16b, v9.16b + uqadd v2.16b, v2.16b, v10.16b + uqadd v3.16b, v3.16b, v11.16b +.endm + +#define params_SUBTRACT zipped=0 +.macro blend_kernel_SUBTRACT + uqsub v0.16b, v0.16b, v8.16b + uqsub v1.16b, v1.16b, v9.16b + uqsub v2.16b, v2.16b, v10.16b + uqsub v3.16b, v3.16b, v11.16b +.endm + +#define params_DIFFERENCE zipped=0 +.macro blend_kernel_DIFFERENCE + uabd v0.16b, v0.16b, v8.16b + uabd v1.16b, v1.16b, v9.16b + uabd v2.16b, v2.16b, v10.16b + uabd v3.16b, v3.16b, v11.16b +.endm + +#define params_XOR zipped=0 +.macro blend_kernel_XOR + eor v0.16b, v0.16b, v8.16b + eor v1.16b, v1.16b, v9.16b + eor v2.16b, v2.16b, v10.16b + eor v3.16b, v3.16b, v11.16b +.endm + + +/* Define the wrapper code which will load and store the data, iterate the + * correct number of times, and safely handle the remainder at the end of the + * loop. Various sections of assembly code are dropped or substituted for + * simpler operations if they're not needed. + */ +.macro wrap_line kernel, nowrap=0, zipped=1, lddst=1, ldsrc=1, pld=1 +.if \nowrap + \kernel +.else + sub x3, sp, #32 + sub sp, sp, #64 + st1 {v8.1d - v11.1d}, [sp] + st1 {v12.1d - v15.1d}, [x3] + subs x2, x2, #64 + b 2f +.align 4 +1: + .if \lddst + .if \zipped + ld4 {v0.16b - v3.16b}, [x0] + .else + ld1 {v0.16b - v3.16b}, [x0] + .endif + .endif + .if \ldsrc + .if \zipped + ld4 {v8.16b - v11.16b}, [x1], #64 + .else + ld1 {v8.16b - v11.16b}, [x1], #64 + .endif + .endif + .if \pld +#if 0 /* TODO: test this on real hardware */ + .if \lddst ; prfm PLDL1STRM, [x0, #192] ; .endif + .if \ldsrc ; prfm PLDL1STRM, [x1, #192] ; .endif +#endif + .endif + + \kernel + + subs x2, x2, #64 + .if \zipped + st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [x0], #64 + .else + st1 {v0.16b,v1.16b,v2.16b,v3.16b}, [x0], #64 + .endif + +2: bge 1b + adds x2, x2, #64 + beq 2f + + /* To handle the tail portion of the data (something less than 64 + * bytes) load small power-of-two chunks into working registers. It + * doesn't matter where they end up in the register; the same process + * will store them back out using the same positions and the operations + * don't require data to interact with its neighbours. 
+ */ + movi v0.16b, #0 + movi v1.16b, #0 + movi v2.16b, #0 + movi v3.16b, #0 + + movi v8.16b, #0 + movi v9.16b, #0 + movi v10.16b, #0 + movi v11.16b, #0 + + tbz x2, #5, 1f + .if \lddst ; ld1 {v2.16b,v3.16b}, [x0], #32 ; .endif + .if \ldsrc ; ld1 {v10.16b,v11.16b}, [x1], #32 ; .endif +1: tbz x2, #4, 1f + .if \lddst ; ld1 {v1.16b}, [x0], #16 ; .endif + .if \ldsrc ; ld1 {v9.16b}, [x1], #16 ; .endif +1: tbz x2, #3, 1f + .if \lddst ; ld1 {v0.d}[1], [x0], #8 ; .endif + .if \ldsrc ; ld1 {v8.d}[1], [x1], #8 ; .endif +1: tbz x2, #2, 1f + .if \lddst ; ld1 {v0.s}[1], [x0], #4 ; .endif + .if \ldsrc ; ld1 {v8.s}[1], [x1], #4 ; .endif +1: tbz x2, #1, 1f + .if \lddst ; ld1 {v0.h}[1], [x0], #2 ; .endif + .if \ldsrc ; ld1 {v8.h}[1], [x1], #2 ; .endif +1: tbz x2, #0, 1f + .if \lddst ; ld1 {v0.b}[1], [x0], #1 ; .endif + .if \ldsrc ; ld1 {v8.b}[1], [x1], #1 ; .endif +1: + .if \lddst ; sub x0, x0, x2 ; .endif + +.if \zipped + /* One small impediment in the process above is that some of the load + * operations can't perform byte-wise structure deinterleaving at the + * same time as loading only part of a register. So the data is loaded + * linearly and unpacked manually at this point. + */ + uzp1 v4.16b, v0.16b, v1.16b + uzp2 v5.16b, v0.16b, v1.16b + uzp1 v6.16b, v2.16b, v3.16b + uzp2 v7.16b, v2.16b, v3.16b + uzp1 v0.16b, v4.16b, v6.16b + uzp2 v2.16b, v4.16b, v6.16b + uzp1 v1.16b, v5.16b, v7.16b + uzp2 v3.16b, v5.16b, v7.16b + + uzp1 v4.16b, v8.16b, v9.16b + uzp2 v5.16b, v8.16b, v9.16b + uzp1 v6.16b, v10.16b, v11.16b + uzp2 v7.16b, v10.16b, v11.16b + uzp1 v8.16b, v4.16b, v6.16b + uzp2 v10.16b, v4.16b, v6.16b + uzp1 v9.16b, v5.16b, v7.16b + uzp2 v11.16b, v5.16b, v7.16b + + \kernel + + zip1 v4.16b, v0.16b, v2.16b + zip2 v6.16b, v0.16b, v2.16b + zip1 v5.16b, v1.16b, v3.16b + zip2 v7.16b, v1.16b, v3.16b + zip1 v0.16b, v4.16b, v5.16b + zip2 v1.16b, v4.16b, v5.16b + zip1 v2.16b, v6.16b, v7.16b + zip2 v3.16b, v6.16b, v7.16b + .else + \kernel + .endif + + tbz x2, #5, 1f + st1 {v2.16b,v3.16b}, [x0], #32 +1: tbz x2, #4, 1f + st1 {v1.16b}, [x0], #16 +1: tbz x2, #3, 1f + st1 {v0.d}[1], [x0], #8 +1: tbz x2, #2, 1f + st1 {v0.s}[1], [x0], #4 +1: tbz x2, #1, 1f + st1 {v0.h}[1], [x0], #2 +1: tbz x2, #0, 2f + st1 {v0.b}[1], [x0], #1 +2: ld1 {v8.1d - v11.1d}, [sp], #32 + ld1 {v12.1d - v15.1d}, [sp], #32 +.endif + mov x0, #0 + ret +.endm + + +/* produce list of blend_line_XX() functions; each function uses the wrap_line + * macro, passing it the name of the operation macro it wants along with + * optional parameters to remove unnecessary operations. 
+ */ +#define BLEND_X(d, n) ENTRY(blend_line_##n) ; wrap_line blend_kernel_##n, params_##n ; END(blend_line_##n) ; + BLEND_LIST(BLEND_X) +#undef BLEND_X + +#define BLEND_X(d, n) .set tablesize, d+1 ; + BLEND_LIST(BLEND_X) +#undef BLEND_X + +/* int rsdIntrinsicBlend_K( + * uchar4 *out, // x0 + * uchar4 const *in, // x1 + * int slot, // x2 + * size_t xstart, // x3 + * size_t xend); // x4 + */ +ENTRY(rsdIntrinsicBlend_K) + adrp x5, blendtable + add x5, x5, :lo12:blendtable + cmp w2, tablesize + bhs 1f + ldrsh x6, [x5, w2, uxtw #1] + add x0, x0, w3, uxtw #2 + add x1, x1, w3, uxtw #2 + sub w2, w4, w3 + ubfiz x2, x2, #2, #32 /* TODO: fix */ + cbz x6, 1f + adr x5, 2f + add x6, x5, x6 +2: br x6 +1: mov x0, #-1 + ret + +END(rsdIntrinsicBlend_K) + +.rodata +.set off,0 +blendtable: +#define BLEND_X(d, n) .rept d-off ; .hword 0 ; .endr ; .hword blend_line_##n - 2b ; .set off, d+1 ; + BLEND_LIST(BLEND_X) +#undef BLEND_X diff --git a/renderscript-toolkit/src/main/cpp/Blend_neon.S b/renderscript-toolkit/src/main/cpp/Blend_neon.S new file mode 100644 index 0000000..a1fa1b5 --- /dev/null +++ b/renderscript-toolkit/src/main/cpp/Blend_neon.S @@ -0,0 +1,617 @@ +/* + * Copyright (C) 2013-2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f: .fnstart +#define END(f) .fnend; .size f, .-f; + +#define BLEND_LIST(X) \ + X(0, CLEAR) \ + X(1, SRC) \ + X(2, DST) \ + X(3, SRC_OVER) \ + X(4, DST_OVER) \ + X(5, SRC_IN) \ + X(6, DST_IN) \ + X(7, SRC_OUT) \ + X(8, DST_OUT) \ + X(9, SRC_ATOP) \ + X(10, DST_ATOP) \ + X(11, XOR) \ + X(14, MULTIPLY) \ + X(21, DIFFERENCE) \ + X(34, ADD) \ + X(35, SUBTRACT) + +.eabi_attribute 25,1 @Tag_ABI_align8_preserved +.arm + +/* For every blend operation supported, define a macro with just the arithmetic + * component. The rest can be handled later on. + * + * At entry q0-q3 contain the RGBA data from the destination buffer, and q8-q11 + * contain the data from the source buffer. Both have already been split out + * into one colour component per register (if necessary). q3 and q11 contain + * the alpha components. + * + * At the same time as defining the assembly macro, define a corresponding + * preprocessor macro indicating any other requirements. + * zipped=0 -- The macro does not require the RGBA components to be + * separated. + * lddst=0 -- The macro does not require data from the destination buffer. + * ldsrc=0 -- The macro does not require data from the source buffer. + * nowrap=1 -- The macro requires no wrapper at all, and should simply be + * inserted without any surrounding load/store or loop code. 
+ */ + +#define params_CLEAR zipped=0, lddst=0, ldsrc=0 +.macro blend_kernel_CLEAR + vmov.i8 q0, #0 + vmov.i8 q1, #0 + vmov.i8 q2, #0 + vmov.i8 q3, #0 +.endm + +#define params_SRC zipped=0, lddst=0 +.macro blend_kernel_SRC + vmov q0, q8 + vmov q1, q9 + vmov q2, q10 + vmov q3, q11 +.endm + +#define params_DST nowrap=1 +.macro blend_kernel_DST + /* nop */ +.endm + +#define params_SRC_OVER zipped=1 +.macro blend_kernel_SRC_OVER + vmvn q7, q11 + + vmull.u8 q12, d15, d1 + vmull.u8 q0, d14, d0 + vmull.u8 q13, d15, d3 + vmull.u8 q1, d14, d2 + vmull.u8 q14, d15, d5 + vmull.u8 q2, d14, d4 + vmull.u8 q15, d15, d7 + vmull.u8 q3, d14, d6 + + vrshrn.u16 d8, q0, #8 + vrshrn.u16 d9, q12, #8 + vrshrn.u16 d10, q1, #8 + vrshrn.u16 d11, q13, #8 + vrshrn.u16 d12, q2, #8 + vrshrn.u16 d13, q14, #8 + vrshrn.u16 d14, q3, #8 + vrshrn.u16 d15, q15, #8 + + vaddw.u8 q0, d8 + vaddw.u8 q12, d9 + vaddw.u8 q1, d10 + vaddw.u8 q13, d11 + vaddw.u8 q2, d12 + vaddw.u8 q14, d13 + vaddw.u8 q3, d14 + vaddw.u8 q15, d15 + + vrshrn.u16 d0, q0, #8 + vrshrn.u16 d1, q12, #8 + vrshrn.u16 d2, q1, #8 + vrshrn.u16 d3, q13, #8 + vrshrn.u16 d4, q2, #8 + vrshrn.u16 d5, q14, #8 + vrshrn.u16 d6, q3, #8 + vrshrn.u16 d7, q15, #8 + + vqadd.u8 q0, q8 + vqadd.u8 q1, q9 + vqadd.u8 q2, q10 + vqadd.u8 q3, q11 +.endm + +#define params_DST_OVER zipped=1 +.macro blend_kernel_DST_OVER + vmvn q7, q3 + + vmull.u8 q12, d15, d17 + vmull.u8 q8, d14, d16 + vmull.u8 q13, d15, d19 + vmull.u8 q9, d14, d18 + vmull.u8 q14, d15, d21 + vmull.u8 q10, d14, d20 + vmull.u8 q15, d15, d23 + vmull.u8 q11, d14, d22 + + vrshrn.u16 d8, q0, #8 + vrshrn.u16 d9, q12, #8 + vrshrn.u16 d10, q1, #8 + vrshrn.u16 d11, q13, #8 + vrshrn.u16 d12, q2, #8 + vrshrn.u16 d13, q14, #8 + vrshrn.u16 d14, q3, #8 + vrshrn.u16 d15, q15, #8 + + vaddw.u8 q8, d8 + vaddw.u8 q12, d9 + vaddw.u8 q9, d10 + vaddw.u8 q13, d11 + vaddw.u8 q10, d12 + vaddw.u8 q14, d13 + vaddw.u8 q11, d14 + vaddw.u8 q15, d15 + + vrshrn.u16 d16, q8, #8 + vrshrn.u16 d17, q12, #8 + vrshrn.u16 d18, q9, #8 + vrshrn.u16 d19, q13, #8 + vrshrn.u16 d20, q10, #8 + vrshrn.u16 d21, q14, #8 + vrshrn.u16 d22, q11, #8 + vrshrn.u16 d23, q15, #8 + + vqadd.u8 q0, q8 + vqadd.u8 q1, q9 + vqadd.u8 q2, q10 + vqadd.u8 q3, q11 +.endm + +#define params_SRC_IN zipped=1 +.macro blend_kernel_SRC_IN + vmull.u8 q12, d7, d17 + vmull.u8 q0, d6, d16 + vmull.u8 q13, d7, d19 + vmull.u8 q1, d6, d18 + vmull.u8 q14, d7, d21 + vmull.u8 q2, d6, d20 + vmull.u8 q15, d7, d23 + vmull.u8 q3, d6, d22 + + vrshrn.u16 d8, q0, #8 + vrshrn.u16 d9, q12, #8 + vrshrn.u16 d10, q1, #8 + vrshrn.u16 d11, q13, #8 + vrshrn.u16 d12, q2, #8 + vrshrn.u16 d13, q14, #8 + vrshrn.u16 d14, q3, #8 + vrshrn.u16 d15, q15, #8 + + vaddw.u8 q0, d8 + vaddw.u8 q12, d9 + vaddw.u8 q1, d10 + vaddw.u8 q13, d11 + vaddw.u8 q2, d12 + vaddw.u8 q14, d13 + vaddw.u8 q3, d14 + vaddw.u8 q15, d15 + + vrshrn.u16 d0, q0, #8 + vrshrn.u16 d1, q12, #8 + vrshrn.u16 d2, q1, #8 + vrshrn.u16 d3, q13, #8 + vrshrn.u16 d4, q2, #8 + vrshrn.u16 d5, q14, #8 + vrshrn.u16 d6, q3, #8 + vrshrn.u16 d7, q15, #8 +.endm + +#define params_DST_IN zipped=1 +.macro blend_kernel_DST_IN + vmull.u8 q12, d1, d23 + vmull.u8 q0, d0, d22 + vmull.u8 q13, d3, d23 + vmull.u8 q1, d2, d22 + vmull.u8 q14, d5, d23 + vmull.u8 q2, d4, d22 + vmull.u8 q15, d7, d23 + vmull.u8 q3, d6, d22 + + vrshrn.u16 d8, q0, #8 + vrshrn.u16 d9, q12, #8 + vrshrn.u16 d10, q1, #8 + vrshrn.u16 d11, q13, #8 + vrshrn.u16 d12, q2, #8 + vrshrn.u16 d13, q14, #8 + vrshrn.u16 d14, q3, #8 + vrshrn.u16 d15, q15, #8 + + vaddw.u8 q0, d8 + vaddw.u8 q12, d9 + vaddw.u8 q1, d10 + vaddw.u8 q13, d11 
+ vaddw.u8 q2, d12 + vaddw.u8 q14, d13 + vaddw.u8 q3, d14 + vaddw.u8 q15, d15 + + vrshrn.u16 d0, q0, #8 + vrshrn.u16 d1, q12, #8 + vrshrn.u16 d2, q1, #8 + vrshrn.u16 d3, q13, #8 + vrshrn.u16 d4, q2, #8 + vrshrn.u16 d5, q14, #8 + vrshrn.u16 d6, q3, #8 + vrshrn.u16 d7, q15, #8 +.endm + +#define params_SRC_OUT zipped=1 +.macro blend_kernel_SRC_OUT + vmvn q3, q3 + blend_kernel_SRC_IN +.endm + + +#define params_DST_OUT zipped=1 +.macro blend_kernel_DST_OUT + vmvn q11, q11 + blend_kernel_DST_IN +.endm + +#define params_SRC_ATOP zipped=1 +.macro blend_kernel_SRC_ATOP + vmvn q11, q11 + + vmull.u8 q12, d23, d1 + vmull.u8 q0, d22, d0 + vmull.u8 q13, d23, d3 + vmull.u8 q1, d22, d2 + vmull.u8 q14, d23, d5 + vmull.u8 q2, d22, d4 + + vmull.u8 q4, d7, d17 + vmull.u8 q8, d6, d16 + vmull.u8 q5, d7, d19 + vmull.u8 q9, d6, d18 + vmull.u8 q6, d7, d21 + vmull.u8 q10, d6, d20 + + vqadd.u16 q12, q4 + vqadd.u16 q0, q8 + vqadd.u16 q13, q5 + vqadd.u16 q1, q9 + vqadd.u16 q14, q6 + vqadd.u16 q2, q10 + + vrshr.u16 q8, q0, #8 + vrshr.u16 q4, q12, #8 + vrshr.u16 q9, q1, #8 + vrshr.u16 q5, q13, #8 + vrshr.u16 q10, q2, #8 + vrshr.u16 q6, q14, #8 + + vqadd.u16 q0, q8 + vqadd.u16 q12, q4 + vqadd.u16 q1, q9 + vqadd.u16 q13, q5 + vqadd.u16 q2, q10 + vqadd.u16 q14, q6 + + vqrshrn.u16 d0, q0, #8 + vqrshrn.u16 d1, q12, #8 + vqrshrn.u16 d2, q1, #8 + vqrshrn.u16 d3, q13, #8 + vqrshrn.u16 d4, q2, #8 + vqrshrn.u16 d5, q14, #8 +.endm + +#define params_DST_ATOP zipped=1 +.macro blend_kernel_DST_ATOP + vmvn q3, q3 + + vmull.u8 q12, d23, d1 + vmull.u8 q0, d22, d0 + vmull.u8 q13, d23, d3 + vmull.u8 q1, d22, d2 + vmull.u8 q14, d23, d5 + vmull.u8 q2, d22, d4 + + vmull.u8 q4, d7, d17 + vmull.u8 q8, d6, d16 + vmull.u8 q5, d7, d19 + vmull.u8 q9, d6, d18 + vmull.u8 q6, d7, d21 + vmull.u8 q10, d6, d20 + + vqadd.u16 q12, q4 + vqadd.u16 q0, q8 + vqadd.u16 q13, q5 + vqadd.u16 q1, q9 + vqadd.u16 q14, q6 + vqadd.u16 q2, q10 + + vrshr.u16 q8, q0, #8 + vrshr.u16 q4, q12, #8 + vrshr.u16 q9, q1, #8 + vrshr.u16 q5, q13, #8 + vrshr.u16 q10, q2, #8 + vrshr.u16 q6, q14, #8 + + vqadd.u16 q0, q8 + vqadd.u16 q12, q4 + vqadd.u16 q1, q9 + vqadd.u16 q13, q5 + vqadd.u16 q2, q10 + vqadd.u16 q14, q6 + + vqrshrn.u16 d0, q0, #8 + vqrshrn.u16 d1, q12, #8 + vqrshrn.u16 d2, q1, #8 + vqrshrn.u16 d3, q13, #8 + vqrshrn.u16 d4, q2, #8 + vqrshrn.u16 d5, q14, #8 + + vmov q3, q11 +.endm + +#define params_MULTIPLY zipped=0 +.macro blend_kernel_MULTIPLY + vmull.u8 q12, d1, d17 + vmull.u8 q0, d0, d16 + vmull.u8 q13, d3, d19 + vmull.u8 q1, d2, d18 + vmull.u8 q14, d5, d21 + vmull.u8 q2, d4, d20 + vmull.u8 q15, d7, d23 + vmull.u8 q3, d6, d22 + + vrshrn.u16 d8, q0, #8 + vrshrn.u16 d9, q12, #8 + vrshrn.u16 d10, q1, #8 + vrshrn.u16 d11, q13, #8 + vrshrn.u16 d12, q2, #8 + vrshrn.u16 d13, q14, #8 + vrshrn.u16 d14, q3, #8 + vrshrn.u16 d15, q15, #8 + + vaddw.u8 q0, d8 + vaddw.u8 q12, d9 + vaddw.u8 q1, d10 + vaddw.u8 q13, d11 + vaddw.u8 q2, d12 + vaddw.u8 q14, d13 + vaddw.u8 q3, d14 + vaddw.u8 q15, d15 + + vrshrn.u16 d0, q0, #8 + vrshrn.u16 d1, q12, #8 + vrshrn.u16 d2, q1, #8 + vrshrn.u16 d3, q13, #8 + vrshrn.u16 d4, q2, #8 + vrshrn.u16 d5, q14, #8 + vrshrn.u16 d6, q3, #8 + vrshrn.u16 d7, q15, #8 +.endm + +#define params_ADD zipped=0 +.macro blend_kernel_ADD + vqadd.u8 q0, q0, q8 + vqadd.u8 q1, q1, q9 + vqadd.u8 q2, q2, q10 + vqadd.u8 q3, q3, q11 +.endm + +#define params_SUBTRACT zipped=0 +.macro blend_kernel_SUBTRACT + vqsub.u8 q0, q0, q8 + vqsub.u8 q1, q1, q9 + vqsub.u8 q2, q2, q10 + vqsub.u8 q3, q3, q11 +.endm + +#define params_DIFFERENCE zipped=0 +.macro blend_kernel_DIFFERENCE + vabd.u8 
q0, q0, q8 + vabd.u8 q1, q1, q9 + vabd.u8 q2, q2, q10 + vabd.u8 q3, q3, q11 +.endm + +#define params_XOR zipped=0 +.macro blend_kernel_XOR + veor q0, q0, q8 + veor q1, q1, q9 + veor q2, q2, q10 + veor q3, q3, q11 +.endm + + +/* Define the wrapper code which will load and store the data, iterate the + * correct number of times, and safely handle the remainder at the end of the + * loop. Various sections of assembly code are dropped or substituted for + * simpler operations if they're not needed. + */ +.macro wrap_line kernel, nowrap=0, zipped=1, lddst=1, ldsrc=1, pld=1 +.if \nowrap + \kernel +.else + vpush {d8-d15} + subs r2, #64 + b 2f + .align 4 +1: + .if \lddst + .if \zipped + vld4.8 {d0,d2,d4,d6}, [r0]! + vld4.8 {d1,d3,d5,d7}, [r0]! + .else + vld1.8 {d0-d3}, [r0]! + vld1.8 {d4-d7}, [r0]! + .endif + sub r0, #64 + .endif + .if \ldsrc + .if \zipped + vld4.8 {d16,d18,d20,d22}, [r1]! + vld4.8 {d17,d19,d21,d23}, [r1]! + .else + vld1.8 {d16-d19}, [r1]! + vld1.8 {d20-d23}, [r1]! + .endif + .endif + .if \pld + .if \lddst ; pld [r0, #192] ; .endif + .if \ldsrc ; pld [r1, #192] ; .endif + .endif + + \kernel + + subs r2, #64 + .if \zipped + vst4.8 {d0,d2,d4,d6}, [r0]! + vst4.8 {d1,d3,d5,d7}, [r0]! + .else + vst1.8 {d0-d3}, [r0]! + vst1.8 {d4-d7}, [r0]! + .endif + +2: bge 1b + adds r2, #64 + beq 2f + + /* To handle the tail portion of the data (something less than 64 + * bytes) load small power-of-two chunks into working registers. It + * doesn't matter where they end up in the register; the same process + * will store them back out using the same positions and the operations + * don't require data to interact with its neighbours. + */ + vmov.i8 q0, #0 + vmov.i8 q1, #0 + vmov.i8 q2, #0 + vmov.i8 q3, #0 + + vmov.i8 q8, #0 + vmov.i8 q9, #0 + vmov.i8 q10, #0 + vmov.i8 q11, #0 + + tst r2, #32 + beq 1f + .if \lddst ; vld1.64 {d4-d7}, [r0]! ; .endif + .if \ldsrc ; vld1.64 {d20-d23}, [r1]! ; .endif +1: tst r2, #16 + beq 1f + .if \lddst ; vld1.64 {d2-d3}, [r0]! ; .endif + .if \ldsrc ; vld1.64 {d18-d19}, [r1]! ; .endif +1: tst r2, #8 + beq 1f + .if \lddst ; vld1.64 {d1}, [r0]! ; .endif + .if \ldsrc ; vld1.64 {d17}, [r1]! ; .endif +1: tst r2, #4 + beq 1f + .if \lddst ; vld1.32 {d0[1]}, [r0]! ; .endif + .if \ldsrc ; vld1.32 {d16[1]}, [r1]! ; .endif +1: tst r2, #2 + beq 1f + .if \lddst ; vld1.16 {d0[1]}, [r0]! ; .endif + .if \ldsrc ; vld1.16 {d16[1]}, [r1]! ; .endif +1: tst r2, #1 + beq 1f + .if \lddst ; vld1.8 {d0[1]}, [r0]! ; .endif + .if \ldsrc ; vld1.8 {d16[1]}, [r1]! ; .endif +1: + .if \lddst ; sub r0, r2 ; .endif + + .if \zipped + /* One small impediment in the process above is that some of the load + * operations can't perform byte-wise structure deinterleaving at the + * same time as loading only part of a register. So the data is loaded + * linearly and unpacked manually at this point. + */ + vuzp.8 q0, q1 + vuzp.8 q2, q3 + vuzp.8 q0, q2 + vuzp.8 q1, q3 + + vuzp.8 q8, q9 + vuzp.8 q10, q11 + vuzp.8 q8, q10 + vuzp.8 q9, q11 + + \kernel + + vzip.8 q0, q2 + vzip.8 q1, q3 + vzip.8 q0, q1 + vzip.8 q2, q3 + .else + \kernel + .endif + + tst r2, #32 + beq 1f + vst1.64 {d4-d7}, [r0]! +1: tst r2, #16 + beq 1f + vst1.64 {d2-d3}, [r0]! +1: tst r2, #8 + beq 1f + vst1.64 {d1}, [r0]! +1: tst r2, #4 + beq 1f + vst1.32 {d0[1]}, [r0]! +1: tst r2, #2 + beq 1f + vst1.16 {d0[1]}, [r0]! +1: tst r2, #1 + beq 2f + vst1.8 {d0[1]}, [r0]! 
+2: vpop {d8-d15} +.endif + mov r0, #0 + bx lr +.endm + + +/* produce list of blend_line_XX() functions; each function uses the wrap_line + * macro, passing it the name of the operation macro it wants along with + * optional parameters to remove unnecessary operations. + */ +#define BLEND_X(d, n) ENTRY(blend_line_##n) ; wrap_line blend_kernel_##n, params_##n ; END(blend_line_##n) ; + BLEND_LIST(BLEND_X) +#undef BLEND_X + + +/* int rsdIntrinsicBlend_K( + * uchar4 *out, // r0 + * uchar4 const *in, // r1 + * int slot, // r2 + * size_t xstart, // r3 + * size_t xend); // [sp] + */ +ENTRY(rsdIntrinsicBlend_K) + adr ip, blend_functions + cmp r2, #(blend_functions_end - blend_functions) >> 2 + ldrlo ip, [ip, r2, LSL #2] + movhs ip, #0 + ldr r2, [sp] + add r0, r3, LSL #2 + add r1, r3, LSL #2 + sub r2, r3 + mov r2, r2, LSL #2 + cmp ip, #0 + addne ip, ip, pc + bxne ip +1: mov r0, #-1 + bx lr + +blend_functions: +.set off,0 +#define BLEND_X(d, n) .rept d-off ; .word 0 ; .endr ; .word blend_line_##n-1b ; .set off, d+1 ; + BLEND_LIST(BLEND_X) +#undef BLEND_X +blend_functions_end: + +END(rsdIntrinsicBlend_K) diff --git a/renderscript-toolkit/src/main/cpp/Blur.cpp b/renderscript-toolkit/src/main/cpp/Blur.cpp new file mode 100644 index 0000000..3b6fd01 --- /dev/null +++ b/renderscript-toolkit/src/main/cpp/Blur.cpp @@ -0,0 +1,542 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <cmath> +#include <cstdint> + +#include "RenderScriptToolkit.h" +#include "TaskProcessor.h" +#include "Utils.h" + +namespace renderscript { + +#define LOG_TAG "renderscript.toolkit.Blur" + +/** + * Blurs an image or a section of an image. + * + * Our algorithm does two passes: a vertical blur followed by an horizontal blur. + */ +class BlurTask : public Task { + // The image we're blurring. + const uchar* mIn; + // Where we store the blurred image. + uchar* outArray; + // The size of the kernel radius is limited to 25 in ScriptIntrinsicBlur.java. + // So, the max kernel size is 51 (= 2 * 25 + 1). + // Considering SSSE3 case, which requires the size is multiple of 4, + // at least 52 words are necessary. Values outside of the kernel should be 0. + float mFp[104]; + uint16_t mIp[104]; + + // Working area to store the result of the vertical blur, to be used by the horizontal pass. + // There's one area per thread. Since the needed working area may be too large to put on the + // stack, we are allocating it from the heap. To avoid paying the allocation cost for each + // tile, we cache the scratch area here. + std::vector<void*> mScratch; // Pointers to the scratch areas, one per thread. + std::vector<size_t> mScratchSize; // The size in bytes of the scratch areas, one per thread. + + // The radius of the blur, in floating point and integer format. 
+ float mRadius; + int mIradius; + + void kernelU4(void* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY, + uint32_t threadIndex); + void kernelU1(void* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY); + void ComputeGaussianWeights(); + + // Process a 2D tile of the overall work. threadIndex identifies which thread does the work. + void processData(int threadIndex, size_t startX, size_t startY, size_t endX, + size_t endY) override; + + public: + BlurTask(const uint8_t* in, uint8_t* out, size_t sizeX, size_t sizeY, size_t vectorSize, + uint32_t threadCount, float radius, const Restriction* restriction) + : Task{sizeX, sizeY, vectorSize, false, restriction}, + mIn{in}, + outArray{out}, + mScratch{threadCount}, + mScratchSize{threadCount}, + mRadius{std::min(25.0f, radius)} { + ComputeGaussianWeights(); + } + + ~BlurTask() { + for (size_t i = 0; i < mScratch.size(); i++) { + if (mScratch[i]) { + free(mScratch[i]); + } + } + } +}; + +void BlurTask::ComputeGaussianWeights() { + memset(mFp, 0, sizeof(mFp)); + memset(mIp, 0, sizeof(mIp)); + + // Compute gaussian weights for the blur + // e is the euler's number + float e = 2.718281828459045f; + float pi = 3.1415926535897932f; + // g(x) = (1 / (sqrt(2 * pi) * sigma)) * e ^ (-x^2 / (2 * sigma^2)) + // x is of the form [-radius .. 0 .. radius] + // and sigma varies with the radius. + // Based on some experimental radius values and sigmas, + // we approximately fit sigma = f(radius) as + // sigma = radius * 0.4 + 0.6 + // The larger the radius gets, the more our gaussian blur + // will resemble a box blur since with large sigma + // the gaussian curve begins to lose its shape + float sigma = 0.4f * mRadius + 0.6f; + + // Now compute the coefficients. We will store some redundant values to save + // some math during the blur calculations precompute some values + float coeff1 = 1.0f / (sqrtf(2.0f * pi) * sigma); + float coeff2 = - 1.0f / (2.0f * sigma * sigma); + + float normalizeFactor = 0.0f; + float floatR = 0.0f; + int r; + mIradius = (float)ceil(mRadius) + 0.5f; + for (r = -mIradius; r <= mIradius; r ++) { + floatR = (float)r; + mFp[r + mIradius] = coeff1 * powf(e, floatR * floatR * coeff2); + normalizeFactor += mFp[r + mIradius]; + } + + // Now we need to normalize the weights because all our coefficients need to add up to one + normalizeFactor = 1.0f / normalizeFactor; + for (r = -mIradius; r <= mIradius; r ++) { + mFp[r + mIradius] *= normalizeFactor; + mIp[r + mIradius] = (uint16_t)(mFp[r + mIradius] * 65536.0f + 0.5f); + } +} + +/** + * Vertical blur of a uchar4 line. + * + * @param sizeY Number of cells of the input array in the vertical direction. + * @param out Where to place the computed value. + * @param x Coordinate of the point we're blurring. + * @param y Coordinate of the point we're blurring. + * @param ptrIn Start of the input array. + * @param iStride The size in byte of a row of the input array. + * @param gPtr The gaussian coefficients. + * @param iradius The radius of the blur. 
+ */ +static void OneVU4(uint32_t sizeY, float4* out, int32_t x, int32_t y, const uchar* ptrIn, + int iStride, const float* gPtr, int iradius) { + const uchar *pi = ptrIn + x*4; + + float4 blurredPixel = 0; + for (int r = -iradius; r <= iradius; r ++) { + int validY = std::max((y + r), 0); + validY = std::min(validY, (int)(sizeY - 1)); + const uchar4 *pvy = (const uchar4 *)&pi[validY * iStride]; + float4 pf = convert<float4>(pvy[0]); + blurredPixel += pf * gPtr[0]; + gPtr++; + } + + out[0] = blurredPixel; +} + +/** + * Vertical blur of a uchar1 line. + * + * @param sizeY Number of cells of the input array in the vertical direction. + * @param out Where to place the computed value. + * @param x Coordinate of the point we're blurring. + * @param y Coordinate of the point we're blurring. + * @param ptrIn Start of the input array. + * @param iStride The size in byte of a row of the input array. + * @param gPtr The gaussian coefficients. + * @param iradius The radius of the blur. + */ +static void OneVU1(uint32_t sizeY, float *out, int32_t x, int32_t y, + const uchar *ptrIn, int iStride, const float* gPtr, int iradius) { + + const uchar *pi = ptrIn + x; + + float blurredPixel = 0; + for (int r = -iradius; r <= iradius; r ++) { + int validY = std::max((y + r), 0); + validY = std::min(validY, (int)(sizeY - 1)); + float pf = (float)pi[validY * iStride]; + blurredPixel += pf * gPtr[0]; + gPtr++; + } + + out[0] = blurredPixel; +} + + +extern "C" void rsdIntrinsicBlurU1_K(uchar *out, uchar const *in, size_t w, size_t h, + size_t p, size_t x, size_t y, size_t count, size_t r, uint16_t const *tab); +extern "C" void rsdIntrinsicBlurU4_K(uchar4 *out, uchar4 const *in, size_t w, size_t h, + size_t p, size_t x, size_t y, size_t count, size_t r, uint16_t const *tab); + +#if defined(ARCH_X86_HAVE_SSSE3) +extern void rsdIntrinsicBlurVFU4_K(void *dst, const void *pin, int stride, const void *gptr, + int rct, int x1, int ct); +extern void rsdIntrinsicBlurHFU4_K(void *dst, const void *pin, const void *gptr, int rct, int x1, + int ct); +extern void rsdIntrinsicBlurHFU1_K(void *dst, const void *pin, const void *gptr, int rct, int x1, + int ct); +#endif + +/** + * Vertical blur of a line of RGBA, knowing that there's enough rows above and below us to avoid + * dealing with boundary conditions. + * + * @param out Where to store the results. This is the input to the horizontal blur. + * @param ptrIn The input data for this line. + * @param iStride The width of the input. + * @param gPtr The gaussian coefficients. + * @param ct The diameter of the blur. + * @param len How many cells to blur. + * @param usesSimd Whether this processor supports SIMD. + */ +static void OneVFU4(float4 *out, const uchar *ptrIn, int iStride, const float* gPtr, int ct, + int x2, bool usesSimd) { + int x1 = 0; +#if defined(ARCH_X86_HAVE_SSSE3) + if (usesSimd) { + int t = (x2 - x1); + t &= ~1; + if (t) { + rsdIntrinsicBlurVFU4_K(out, ptrIn, iStride, gPtr, ct, x1, x1 + t); + } + x1 += t; + out += t; + ptrIn += t << 2; + } +#else + (void) usesSimd; // Avoid unused parameter warning. +#endif + while(x2 > x1) { + const uchar *pi = ptrIn; + float4 blurredPixel = 0; + const float* gp = gPtr; + + for (int r = 0; r < ct; r++) { + float4 pf = convert<float4>(((const uchar4 *)pi)[0]); + blurredPixel += pf * gp[0]; + pi += iStride; + gp++; + } + out->xyzw = blurredPixel; + x1++; + out++; + ptrIn+=4; + } +} + +/** + * Vertical blur of a line of U_8, knowing that there's enough rows above and below us to avoid + * dealing with boundary conditions. 
+ * + * @param out Where to store the results. This is the input to the horizontal blur. + * @param ptrIn The input data for this line. + * @param iStride The width of the input. + * @param gPtr The gaussian coefficients. + * @param ct The diameter of the blur. + * @param len How many cells to blur. + * @param usesSimd Whether this processor supports SIMD. + */ +static void OneVFU1(float* out, const uchar* ptrIn, int iStride, const float* gPtr, int ct, int len, + bool usesSimd) { + int x1 = 0; + + while((len > x1) && (((uintptr_t)ptrIn) & 0x3)) { + const uchar *pi = ptrIn; + float blurredPixel = 0; + const float* gp = gPtr; + + for (int r = 0; r < ct; r++) { + float pf = (float)pi[0]; + blurredPixel += pf * gp[0]; + pi += iStride; + gp++; + } + out[0] = blurredPixel; + x1++; + out++; + ptrIn++; + len--; + } +#if defined(ARCH_X86_HAVE_SSSE3) + if (usesSimd && (len > x1)) { + int t = (len - x1) >> 2; + t &= ~1; + if (t) { + rsdIntrinsicBlurVFU4_K(out, ptrIn, iStride, gPtr, ct, 0, t ); + len -= t << 2; + ptrIn += t << 2; + out += t << 2; + } + } +#else + (void) usesSimd; // Avoid unused parameter warning. +#endif + while(len > 0) { + const uchar *pi = ptrIn; + float blurredPixel = 0; + const float* gp = gPtr; + + for (int r = 0; r < ct; r++) { + float pf = (float)pi[0]; + blurredPixel += pf * gp[0]; + pi += iStride; + gp++; + } + out[0] = blurredPixel; + len--; + out++; + ptrIn++; + } +} + +/** + * Horizontal blur of a uchar4 line. + * + * @param sizeX Number of cells of the input array in the horizontal direction. + * @param out Where to place the computed value. + * @param x Coordinate of the point we're blurring. + * @param ptrIn The start of the input row from which we're indexing x. + * @param gPtr The gaussian coefficients. + * @param iradius The radius of the blur. + */ +static void OneHU4(uint32_t sizeX, uchar4* out, int32_t x, const float4* ptrIn, const float* gPtr, + int iradius) { + float4 blurredPixel = 0; + for (int r = -iradius; r <= iradius; r ++) { + int validX = std::max((x + r), 0); + validX = std::min(validX, (int)(sizeX - 1)); + float4 pf = ptrIn[validX]; + blurredPixel += pf * gPtr[0]; + gPtr++; + } + + out->xyzw = convert<uchar4>(blurredPixel); +} + +/** + * Horizontal blur of a uchar line. + * + * @param sizeX Number of cells of the input array in the horizontal direction. + * @param out Where to place the computed value. + * @param x Coordinate of the point we're blurring. + * @param ptrIn The start of the input row from which we're indexing x. + * @param gPtr The gaussian coefficients. + * @param iradius The radius of the blur. + */ +static void OneHU1(uint32_t sizeX, uchar* out, int32_t x, const float* ptrIn, const float* gPtr, + int iradius) { + float blurredPixel = 0; + for (int r = -iradius; r <= iradius; r ++) { + int validX = std::max((x + r), 0); + validX = std::min(validX, (int)(sizeX - 1)); + float pf = ptrIn[validX]; + blurredPixel += pf * gPtr[0]; + gPtr++; + } + + out[0] = (uchar)blurredPixel; +} + +/** + * Full blur of a line of RGBA data. + * + * @param outPtr Where to store the results + * @param xstart The index of the section we're starting to blur. + * @param xend The end index of the section. + * @param currentY The index of the line we're blurring. + * @param usesSimd Whether this processor supports SIMD. 
+ */ +void BlurTask::kernelU4(void *outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY, + uint32_t threadIndex) { + float4 stackbuf[2048]; + float4 *buf = &stackbuf[0]; + const uint32_t stride = mSizeX * mVectorSize; + + uchar4 *out = (uchar4 *)outPtr; + uint32_t x1 = xstart; + uint32_t x2 = xend; + +#if defined(ARCH_ARM_USE_INTRINSICS) + if (mUsesSimd && mSizeX >= 4) { + rsdIntrinsicBlurU4_K(out, (uchar4 const *)(mIn + stride * currentY), + mSizeX, mSizeY, + stride, x1, currentY, x2 - x1, mIradius, mIp + mIradius); + return; + } +#endif + + if (mSizeX > 2048) { + if ((mSizeX > mScratchSize[threadIndex]) || !mScratch[threadIndex]) { + // Pad the side of the allocation by one unit to allow alignment later + mScratch[threadIndex] = realloc(mScratch[threadIndex], (mSizeX + 1) * 16); + mScratchSize[threadIndex] = mSizeX; + } + // realloc only aligns to 8 bytes so we manually align to 16. + buf = (float4 *) ((((intptr_t)mScratch[threadIndex]) + 15) & ~0xf); + } + float4 *fout = (float4 *)buf; + int y = currentY; + if ((y > mIradius) && (y < ((int)mSizeY - mIradius))) { + const uchar *pi = mIn + (y - mIradius) * stride; + OneVFU4(fout, pi, stride, mFp, mIradius * 2 + 1, mSizeX, mUsesSimd); + } else { + x1 = 0; + while(mSizeX > x1) { + OneVU4(mSizeY, fout, x1, y, mIn, stride, mFp, mIradius); + fout++; + x1++; + } + } + + x1 = xstart; + while ((x1 < (uint32_t)mIradius) && (x1 < x2)) { + OneHU4(mSizeX, out, x1, buf, mFp, mIradius); + out++; + x1++; + } +#if defined(ARCH_X86_HAVE_SSSE3) + if (mUsesSimd) { + if ((x1 + mIradius) < x2) { + rsdIntrinsicBlurHFU4_K(out, buf - mIradius, mFp, + mIradius * 2 + 1, x1, x2 - mIradius); + out += (x2 - mIradius) - x1; + x1 = x2 - mIradius; + } + } +#endif + while(x2 > x1) { + OneHU4(mSizeX, out, x1, buf, mFp, mIradius); + out++; + x1++; + } +} + +/** + * Full blur of a line of U_8 data. + * + * @param outPtr Where to store the results + * @param xstart The index of the section we're starting to blur. + * @param xend The end index of the section. + * @param currentY The index of the line we're blurring. + */ +void BlurTask::kernelU1(void *outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) { + float buf[4 * 2048]; + const uint32_t stride = mSizeX * mVectorSize; + + uchar *out = (uchar *)outPtr; + uint32_t x1 = xstart; + uint32_t x2 = xend; + +#if defined(ARCH_ARM_USE_INTRINSICS) + if (mUsesSimd && mSizeX >= 16) { + // The specialisation for r<=8 has an awkward prefill case, which is + // fiddly to resolve, where starting close to the right edge can cause + // a read beyond the end of input. So avoid that case here. 
+ if (mIradius > 8 || (mSizeX - std::max(0, (int32_t)x1 - 8)) >= 16) { + rsdIntrinsicBlurU1_K(out, mIn + stride * currentY, mSizeX, mSizeY, + stride, x1, currentY, x2 - x1, mIradius, mIp + mIradius); + return; + } + } +#endif + + float *fout = (float *)buf; + int y = currentY; + if ((y > mIradius) && (y < ((int)mSizeY - mIradius -1))) { + const uchar *pi = mIn + (y - mIradius) * stride; + OneVFU1(fout, pi, stride, mFp, mIradius * 2 + 1, mSizeX, mUsesSimd); + } else { + x1 = 0; + while(mSizeX > x1) { + OneVU1(mSizeY, fout, x1, y, mIn, stride, mFp, mIradius); + fout++; + x1++; + } + } + + x1 = xstart; + while ((x1 < x2) && + ((x1 < (uint32_t)mIradius) || (((uintptr_t)out) & 0x3))) { + OneHU1(mSizeX, out, x1, buf, mFp, mIradius); + out++; + x1++; + } +#if defined(ARCH_X86_HAVE_SSSE3) + if (mUsesSimd) { + if ((x1 + mIradius) < x2) { + uint32_t len = x2 - (x1 + mIradius); + len &= ~3; + + // rsdIntrinsicBlurHFU1_K() processes each four float values in |buf| at once, so it + // nees to ensure four more values can be accessed in order to avoid accessing + // uninitialized buffer. + if (len > 4) { + len -= 4; + rsdIntrinsicBlurHFU1_K(out, ((float *)buf) - mIradius, mFp, + mIradius * 2 + 1, x1, x1 + len); + out += len; + x1 += len; + } + } + } +#endif + while(x2 > x1) { + OneHU1(mSizeX, out, x1, buf, mFp, mIradius); + out++; + x1++; + } +} + +void BlurTask::processData(int threadIndex, size_t startX, size_t startY, size_t endX, + size_t endY) { + for (size_t y = startY; y < endY; y++) { + void* outPtr = outArray + (mSizeX * y + startX) * mVectorSize; + if (mVectorSize == 4) { + kernelU4(outPtr, startX, endX, y, threadIndex); + } else { + kernelU1(outPtr, startX, endX, y); + } + } +} + +void RenderScriptToolkit::blur(const uint8_t* in, uint8_t* out, size_t sizeX, size_t sizeY, + size_t vectorSize, int radius, const Restriction* restriction) { +#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE + if (!validRestriction(LOG_TAG, sizeX, sizeY, restriction)) { + return; + } + if (radius <= 0 || radius > 25) { + ALOGE("The radius should be between 1 and 25. %d provided.", radius); + } + if (vectorSize != 1 && vectorSize != 4) { + ALOGE("The vectorSize should be 1 or 4. %zu provided.", vectorSize); + } +#endif + + BlurTask task(in, out, sizeX, sizeY, vectorSize, processor->getNumberOfThreads(), radius, + restriction); + processor->doTask(&task); +} + +} // namespace renderscript diff --git a/renderscript-toolkit/src/main/cpp/Blur_advsimd.S b/renderscript-toolkit/src/main/cpp/Blur_advsimd.S new file mode 100644 index 0000000..6d3cb8d --- /dev/null +++ b/renderscript-toolkit/src/main/cpp/Blur_advsimd.S @@ -0,0 +1,1868 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f: +#define PRIVATE(f) .text; .align 4; .type f,#function; f: +#define END(f) .size f, .-f; + +//#define ARCH_ARM64_USE_BLUR_PRELOAD + +/* Number of fractional bits to preserve in intermediate results. 
The + * intermediate storage is 16-bit, and we started with 8 bit data (the integer + * part), so this should be between 0 and 8. + */ +.set FRACTION_BITS, 7 +.set MAX_R, 25 + + +/* A quick way of making a line of code conditional on some other condition. + * Use `.set cc, 1` or `.set cc, 0` to enable or disable lines prefixed with + * `ifcc`: + */ +.macro ifcc zzz:vararg +.if cc + \zzz +.endif +.endm + +/* It's not always clear that prefetching is beneficial and this needs further + * testing on different cores, so it's made switchable here. + */ +#if defined(ARCH_ARM64_USE_BLUR_PRELOAD) +#define VERTPLD(...) prfm PLDL1KEEP, [__VA_ARGS__] +#else +#define VERTPLD(...) nop +#endif + +/* Fetch 16 columns of bytes (regardless of image format), convolve these + * vertically, and leave them in the register file. If working near the top or + * bottom of an image then clamp the addressing while loading the data in. + * + * The convolution is fully unrolled for windows up to max_r, with the + * outermost edges calculated first. This way it's possible to branch directly + * into the relevant part of the code for an arbitrary convolution radius. Two + * variants of the loop are produced; one eliminates the clamping code for a + * slight speed advantage. + * + * Where the macro is called with reg=x, the specified register is taken to + * contain a pre-calculated pointer into one of the two loops. + * + * Input: + * x1 -- src + * x2 -- pitch + * x5 -- r + * x6 -- rup (r, unless clipped to top of source image) + * x7 -- rdn (r, unless clipped to bottom of source image) + * x12 -- switch index + * v0-v3 -- coefficient table + * x13 = -pitch + * x15 = top-row in + * x19 = bottom-row in + * Output: + * x1 += 16 + * v10,v11 -- 16 convolved columns + * Modifies: + * x10 = upper row pointer + * x11 = lower row pointer + * v12-v15 = temporary sums + */ +.macro fetch, max_r=MAX_R, labelc=1, labelnc=2, reg=x12 /*{{{*/ + .ifc \reg,x12 ; .set cc, 1 ; .else ; .set cc, 0 ; .endif + + ld1 {v15.16b}, [x1], #16 + mov x10, x15 + + uxtl v14.8h, v15.8b + VERTPLD(x1, #16) + uxtl2 v15.8h, v15.16b + .if \max_r < 16 // approximate + ifcc adr \reg, 1f + .else + ifcc adrp \reg, 1f + ifcc add \reg, \reg, #:lo12:1f + .endif + + umull v12.4s, v14.4h, v0.h[0] + ifcc sub \reg, \reg, x5, LSL #6 + umull2 v13.4s, v14.8h, v0.h[0] + mov x11, x19 + umull v14.4s, v15.4h, v0.h[0] + ifcc add \reg, \reg, x5, LSL #3 + umull2 v15.4s, v15.8h, v0.h[0] + br \reg + + /* This version of the vertical fetch loop body is used away from the edges + * of the source image. The pointers start at the top and bottom source rows + * and work their way towards the centre on each iteration. This way the + * number of taps used can be controlled by jumping directly into the middle + * of the loop and running to completion. + * If the loop body changes size then the code which calculates the address of + * the initial iteration must be updated to accordingly. 
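Before the per-tap loop bodies below, a concrete note on the fixed-point convention set by FRACTION_BITS above: samples enter as 8-bit integers, intermediate sums live in 16-bit lanes, and 7 extra fractional bits are carried between the vertical and horizontal passes (255 << 7 = 32640, which still fits in 16 bits). The sketch below shows a scalar version of one tap under the assumption that the coefficient table is roughly Q0.16; that layout is an assumption about the table builder, not something stated in this file:

    #include <cstdint>

    constexpr int kFractionBits = 7;  // mirrors .set FRACTION_BITS, 7

    // One tap in the same fixed-point style: an 8-bit sample times a Q0.16
    // coefficient, rounded down to 16 bits while keeping kFractionBits of
    // extra precision (the role of uqrshrn #16 - FRACTION_BITS).
    inline uint16_t weighSample(uint8_t sample, uint16_t coeffQ16) {
        uint32_t acc = uint32_t(sample) * coeffQ16;
        uint32_t shift = 16u - kFractionBits;                   // 9
        return uint16_t((acc + (1u << (shift - 1))) >> shift);  // round to nearest
    }

    // Final narrowing back to 8 bits (the role of uqrshrn #FRACTION_BITS),
    // with a plain clamp standing in for the instruction's saturation.
    inline uint8_t toByte(uint16_t q7) {
        uint32_t v = (uint32_t(q7) + (1u << (kFractionBits - 1))) >> kFractionBits;
        return v > 255 ? 255 : uint8_t(v);
    }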
+ */ + .macro vertfetch_noclamp i, dreg + .if 0 < \i && \i <= \max_r + ld1 {v10.16b}, [x10], x2 + ld1 {v11.16b}, [x11], x13 + uaddl v16.8h, v10.8b, v11.8b + uaddl2 v11.8h, v10.16b, v11.16b + umlal v12.4s, v16.4h, \dreg + umlal2 v13.4s, v16.8h, \dreg + VERTPLD(x10, #32) + umlal v14.4s, v11.4h, \dreg + VERTPLD(x11, #32) + umlal2 v15.4s, v11.8h, \dreg + .endif + .endm + + /* This version of the vertical fetch loop body is used near the edges of the + * source image, where one or both of the accesses may start with a clamped + * value, and the row addresses only begin to change after some number of + * iterations before the end. + * If the loop body changes size then the code which calculates the address of + * the initial iteration must be updated to accordingly. + */ + .macro vertfetch_clamped i, dreg + .if 0 < \i && \i <= \max_r + ld1 {v10.16b}, [x10], x2 + cmp x6, #\i + ld1 {v11.16b}, [x11], x13 + csel x10, x15, x10, lo + uaddl v16.8h, v10.8b, v11.8b + cmp x7, #\i + uaddl2 v11.8h, v10.16b, v11.16b + csel x11, x19, x11, lo + umlal v12.4s, v16.4h, \dreg + umlal2 v13.4s, v16.8h, \dreg + VERTPLD(x10, #32) + umlal v14.4s, v11.4h, \dreg + VERTPLD(x11, #32) + umlal2 v15.4s, v11.8h, \dreg + .endif + .endm + + /* Entry into this unrolled loop is computed as a negative index from + * \labelc at the end of the block. + */ + .align 4 + vertfetch_clamped 27, v3.h[3] + vertfetch_clamped 26, v3.h[2] + vertfetch_clamped 25, v3.h[1] + vertfetch_clamped 24, v3.h[0] + vertfetch_clamped 23, v2.h[7] + vertfetch_clamped 22, v2.h[6] + vertfetch_clamped 21, v2.h[5] + vertfetch_clamped 20, v2.h[4] + vertfetch_clamped 19, v2.h[3] + vertfetch_clamped 18, v2.h[2] + vertfetch_clamped 17, v2.h[1] + vertfetch_clamped 16, v2.h[0] + vertfetch_clamped 15, v1.h[7] + vertfetch_clamped 14, v1.h[6] + vertfetch_clamped 13, v1.h[5] + vertfetch_clamped 12, v1.h[4] + vertfetch_clamped 11, v1.h[3] + vertfetch_clamped 10, v1.h[2] + vertfetch_clamped 9, v1.h[1] + vertfetch_clamped 8, v1.h[0] + vertfetch_clamped 7, v0.h[7] + vertfetch_clamped 6, v0.h[6] + vertfetch_clamped 5, v0.h[5] + vertfetch_clamped 4, v0.h[4] + vertfetch_clamped 3, v0.h[3] + vertfetch_clamped 2, v0.h[2] + vertfetch_clamped 1, v0.h[1] + vertfetch_clamped 0, v0.h[0] + 1: + \labelc : b 2f /* done with clamped loop, skip over non-clamped loop */ + + /* Entry into this unrolled loop is computed as a negative index from + * \labelnc at the end of the block. 
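To restate the dispatch described in the comment above in scalar terms: the centre tap is always computed, the outer taps are laid out from the widest radius inwards, and a radius-r blur simply starts executing r entries before the end of the list. In C++ the same idea of entering an unrolled loop part-way through can be sketched with switch fall-through (illustrative only, limited to r <= 4 here; the real code dispatches with the computed br \reg):

    #include <cstdint>

    // columnSums[0] is the centre row; columnSums[i] for i > 0 is the sum of
    // the two rows i taps above and below it (already added together, as the
    // uaddl above does), and coeff[i] is the matching Gaussian weight.
    // Entering the switch at r and falling through mirrors branching into the
    // unrolled tap list at tap r.
    inline uint32_t accumulateTaps(const uint16_t* columnSums,
                                   const uint16_t* coeff, int r) {
        uint32_t acc = uint32_t(columnSums[0]) * coeff[0];  // centre tap first
        switch (r) {
            case 4: acc += uint32_t(columnSums[4]) * coeff[4]; [[fallthrough]];
            case 3: acc += uint32_t(columnSums[3]) * coeff[3]; [[fallthrough]];
            case 2: acc += uint32_t(columnSums[2]) * coeff[2]; [[fallthrough]];
            case 1: acc += uint32_t(columnSums[1]) * coeff[1]; break;
            default: break;
        }
        return acc;
    }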
+ */ + .align 4 + vertfetch_noclamp 27, v3.h[3] + vertfetch_noclamp 26, v3.h[2] + vertfetch_noclamp 25, v3.h[1] + vertfetch_noclamp 24, v3.h[0] + vertfetch_noclamp 23, v2.h[7] + vertfetch_noclamp 22, v2.h[6] + vertfetch_noclamp 21, v2.h[5] + vertfetch_noclamp 20, v2.h[4] + vertfetch_noclamp 19, v2.h[3] + vertfetch_noclamp 18, v2.h[2] + vertfetch_noclamp 17, v2.h[1] + vertfetch_noclamp 16, v2.h[0] + vertfetch_noclamp 15, v1.h[7] + vertfetch_noclamp 14, v1.h[6] + vertfetch_noclamp 13, v1.h[5] + vertfetch_noclamp 12, v1.h[4] + vertfetch_noclamp 11, v1.h[3] + vertfetch_noclamp 10, v1.h[2] + vertfetch_noclamp 9, v1.h[1] + vertfetch_noclamp 8, v1.h[0] + vertfetch_noclamp 7, v0.h[7] + vertfetch_noclamp 6, v0.h[6] + vertfetch_noclamp 5, v0.h[5] + vertfetch_noclamp 4, v0.h[4] + vertfetch_noclamp 3, v0.h[3] + vertfetch_noclamp 2, v0.h[2] + vertfetch_noclamp 1, v0.h[1] + vertfetch_noclamp 0, v0.h[0] + \labelnc : + + .purgem vertfetch_clamped + .purgem vertfetch_noclamp + + 2: uqrshrn v10.4h, v12.4s, #16 - FRACTION_BITS + add x15, x15, #16 + uqrshrn2 v10.8h, v13.4s, #16 - FRACTION_BITS + add x19, x19, #16 + uqrshrn v11.4h, v14.4s, #16 - FRACTION_BITS + uqrshrn2 v11.8h, v15.4s, #16 - FRACTION_BITS +.endm /*}}}*/ + +/* Some portion of the convolution window (as much as will fit, and all of it + * for the uchar1 cases) is kept in the register file to avoid unnecessary + * memory accesses. This forces the horizontal loops to be unrolled because + * there's no indexed addressing into the register file. + * + * As in the fetch macro, the operations are ordered from outside to inside, so + * that jumping into the middle of the block bypasses the unwanted window taps. + * + * There are several variants of the macro because of the fixed offets of the + * taps -- the wider the maximum radius the further the centre tap is from the + * most recently fetched data. This means that pre-filling the window requires + * more data that won't be used and it means that rotating the window involves + * more mov operations. + * + * When the buffer gets too big the buffer at [x9] is used. + * + * Input: + * v16-v31,v4-v11 -- convoltion window + * x9 -- pointer to additional convolution window data + * Output: + * x9 -- updated buffer pointer (if used) + * d31 -- result to be stored + * Modifies: + * x12 -- temp buffer pointer + * v12-v13 -- temporaries for load and vext operations. 
+ * v14-v15 -- intermediate sums + */ +#define TUNED_LIST1 8, 16 +.macro hconv1_8/*{{{*/ + +.rodata + 200: .hword -4 + .hword 101f-100f + .hword 102f-100f + .hword 103f-100f + .hword 104f-100f + .hword 105f-100f + .hword 106f-100f + .hword 107f-100f + .hword 108f-100f + .align 4 +.text + umull v14.4s, v9.4h, v0.h[0] + umull2 v15.4s, v9.8h, v0.h[0] + + adrp x16, 200b + add x16, x16, :lo12:200b + ldrsh x12, [x16, x5, LSL #1] + adr x16, 100f + add x12, x12, x16 + 100: br x12 + 108: umlal v14.4s, v8.4h, v1.h[0] + umlal2 v15.4s, v8.8h, v1.h[0] + umlal v14.4s, v10.4h, v1.h[0] + umlal2 v15.4s, v10.8h, v1.h[0] + 107: ext v12.16b, v8.16b, v9.16b, #1*2 + ext v13.16b, v9.16b, v10.16b, #7*2 + umlal v14.4s, v12.4h, v0.h[7] + umlal2 v15.4s, v12.8h, v0.h[7] + umlal v14.4s, v13.4h, v0.h[7] + umlal2 v15.4s, v13.8h, v0.h[7] + 106: ext v12.16b, v8.16b, v9.16b, #2*2 + ext v13.16b, v9.16b, v10.16b, #6*2 + umlal v14.4s, v12.4h, v0.h[6] + umlal2 v15.4s, v12.8h, v0.h[6] + umlal v14.4s, v13.4h, v0.h[6] + umlal2 v15.4s, v13.8h, v0.h[6] + 105: ext v12.16b, v8.16b, v9.16b, #3*2 + ext v13.16b, v9.16b, v10.16b, #5*2 + umlal v14.4s, v12.4h, v0.h[5] + umlal2 v15.4s, v12.8h, v0.h[5] + umlal v14.4s, v13.4h, v0.h[5] + umlal2 v15.4s, v13.8h, v0.h[5] + 104: //ext v12.16b, v8.16b, v9.16b, #4*2 + //ext v13.16b, v9.16b, v10.16b, #4*2 + umlal2 v14.4s, v8.8h, v0.h[4] + umlal v15.4s, v9.4h, v0.h[4] + umlal2 v14.4s, v9.8h, v0.h[4] + umlal v15.4s, v10.4h, v0.h[4] + 103: ext v12.16b, v8.16b, v9.16b, #5*2 + ext v13.16b, v9.16b, v10.16b, #3*2 + umlal v14.4s, v12.4h, v0.h[3] + umlal2 v15.4s, v12.8h, v0.h[3] + umlal v14.4s, v13.4h, v0.h[3] + umlal2 v15.4s, v13.8h, v0.h[3] + 102: ext v12.16b, v8.16b, v9.16b, #6*2 + ext v13.16b, v9.16b, v10.16b, #2*2 + umlal v14.4s, v12.4h, v0.h[2] + umlal2 v15.4s, v12.8h, v0.h[2] + umlal v14.4s, v13.4h, v0.h[2] + umlal2 v15.4s, v13.8h, v0.h[2] + 101: ext v12.16b, v8.16b, v9.16b, #7*2 + ext v13.16b, v9.16b, v10.16b, #1*2 + umlal v14.4s, v12.4h, v0.h[1] + umlal2 v15.4s, v12.8h, v0.h[1] + umlal v14.4s, v13.4h, v0.h[1] + umlal2 v15.4s, v13.8h, v0.h[1] + + uqrshrn v14.4h, v14.4s, #16 + uqrshrn2 v14.8h, v15.4s, #16 + uqrshrn v15.8b, v14.8h, #FRACTION_BITS + + mov v8.16b, v9.16b + mov v9.16b, v10.16b + mov v10.16b, v11.16b +.endm/*}}}*/ + +.macro hconv1_16/*{{{*/ +.rodata + 200: .hword -4 + .hword 101f-100f + .hword 102f-100f + .hword 103f-100f + .hword 104f-100f + .hword 105f-100f + .hword 106f-100f + .hword 107f-100f + .hword 108f-100f + .hword 109f-100f + .hword 110f-100f + .hword 111f-100f + .hword 112f-100f + .hword 113f-100f + .hword 114f-100f + .hword 115f-100f + .hword 116f-100f + .align 4 + +.text + umull v14.4s, v8.4h, v0.h[0] + umull2 v15.4s, v8.8h, v0.h[0] + + adrp x16, 200b + add x16, x16, :lo12:200b + ldrsh x12, [x16, x5, LSL #1] + adr x16, 100f + add x12, x12, x16 + 100: br x12 + 116: //ext v12.16b, v6.16b, v7.16b, #0*2 + //ext v13.16b, v10.16b, v11.16b, #0*2 + umlal v14.4s, v6.4h, v2.h[0] + umlal2 v15.4s, v6.8h, v2.h[0] + umlal v14.4s, v10.4h, v2.h[0] + umlal2 v15.4s, v10.8h, v2.h[0] + 115: ext v12.16b, v6.16b, v7.16b, #1*2 + ext v13.16b, v9.16b, v10.16b, #7*2 + umlal v14.4s, v12.4h, v1.h[7] + umlal2 v15.4s, v12.8h, v1.h[7] + umlal v14.4s, v13.4h, v1.h[7] + umlal2 v15.4s, v13.8h, v1.h[7] + 114: ext v12.16b, v6.16b, v7.16b, #2*2 + ext v13.16b, v9.16b, v10.16b, #6*2 + umlal v14.4s, v12.4h, v1.h[6] + umlal2 v15.4s, v12.8h, v1.h[6] + umlal v14.4s, v13.4h, v1.h[6] + umlal2 v15.4s, v13.8h, v1.h[6] + 113: ext v12.16b, v6.16b, v7.16b, #3*2 + ext v13.16b, v9.16b, v10.16b, #5*2 + umlal v14.4s, v12.4h, 
v1.h[5] + umlal2 v15.4s, v12.8h, v1.h[5] + umlal v14.4s, v13.4h, v1.h[5] + umlal2 v15.4s, v13.8h, v1.h[5] + 112: //ext v12.16b, v6.16b, v7.16b, #4*2 + //ext v13.16b, v9.16b, v10.16b, #4*2 + umlal2 v14.4s, v6.8h, v1.h[4] + umlal v15.4s, v7.4h, v1.h[4] + umlal2 v14.4s, v9.8h, v1.h[4] + umlal v15.4s, v10.4h, v1.h[4] + 111: ext v12.16b, v6.16b, v7.16b, #5*2 + ext v13.16b, v9.16b, v10.16b, #3*2 + umlal v14.4s, v12.4h, v1.h[3] + umlal2 v15.4s, v12.8h, v1.h[3] + umlal v14.4s, v13.4h, v1.h[3] + umlal2 v15.4s, v13.8h, v1.h[3] + 110: ext v12.16b, v6.16b, v7.16b, #6*2 + ext v13.16b, v9.16b, v10.16b, #2*2 + umlal v14.4s, v12.4h, v1.h[2] + umlal2 v15.4s, v12.8h, v1.h[2] + umlal v14.4s, v13.4h, v1.h[2] + umlal2 v15.4s, v13.8h, v1.h[2] + 109: ext v12.16b, v6.16b, v7.16b, #7*2 + ext v13.16b, v9.16b, v10.16b, #1*2 + umlal v14.4s, v12.4h, v1.h[1] + umlal2 v15.4s, v12.8h, v1.h[1] + umlal v14.4s, v13.4h, v1.h[1] + umlal2 v15.4s, v13.8h, v1.h[1] + 108: //ext v12.16b, v7.16b, v8.16b, #0*2 + //ext v13.16b, v9.16b, v10.16b, #0*2 + umlal v14.4s, v7.4h, v1.h[0] + umlal2 v15.4s, v7.8h, v1.h[0] + umlal v14.4s, v9.4h, v1.h[0] + umlal2 v15.4s, v9.8h, v1.h[0] + 107: ext v12.16b, v7.16b, v8.16b, #1*2 + ext v13.16b, v8.16b, v9.16b, #7*2 + umlal v14.4s, v12.4h, v0.h[7] + umlal2 v15.4s, v12.8h, v0.h[7] + umlal v14.4s, v13.4h, v0.h[7] + umlal2 v15.4s, v13.8h, v0.h[7] + 106: ext v12.16b, v7.16b, v8.16b, #2*2 + ext v13.16b, v8.16b, v9.16b, #6*2 + umlal v14.4s, v12.4h, v0.h[6] + umlal2 v15.4s, v12.8h, v0.h[6] + umlal v14.4s, v13.4h, v0.h[6] + umlal2 v15.4s, v13.8h, v0.h[6] + 105: ext v12.16b, v7.16b, v8.16b, #3*2 + ext v13.16b, v8.16b, v9.16b, #5*2 + umlal v14.4s, v12.4h, v0.h[5] + umlal2 v15.4s, v12.8h, v0.h[5] + umlal v14.4s, v13.4h, v0.h[5] + umlal2 v15.4s, v13.8h, v0.h[5] + 104: //ext v12.16b, v7.16b, v8.16b, #4*2 + //ext v13.16b, v8.16b, v9.16b, #4*2 + umlal2 v14.4s, v7.8h, v0.h[4] + umlal v15.4s, v8.4h, v0.h[4] + umlal2 v14.4s, v8.8h, v0.h[4] + umlal v15.4s, v9.4h, v0.h[4] + 103: ext v12.16b, v7.16b, v8.16b, #5*2 + ext v13.16b, v8.16b, v9.16b, #3*2 + umlal v14.4s, v12.4h, v0.h[3] + umlal2 v15.4s, v12.8h, v0.h[3] + umlal v14.4s, v13.4h, v0.h[3] + umlal2 v15.4s, v13.8h, v0.h[3] + 102: ext v12.16b, v7.16b, v8.16b, #6*2 + ext v13.16b, v8.16b, v9.16b, #2*2 + umlal v14.4s, v12.4h, v0.h[2] + umlal2 v15.4s, v12.8h, v0.h[2] + umlal v14.4s, v13.4h, v0.h[2] + umlal2 v15.4s, v13.8h, v0.h[2] + 101: ext v12.16b, v7.16b, v8.16b, #7*2 + ext v13.16b, v8.16b, v9.16b, #1*2 + umlal v14.4s, v12.4h, v0.h[1] + umlal2 v15.4s, v12.8h, v0.h[1] + umlal v14.4s, v13.4h, v0.h[1] + umlal2 v15.4s, v13.8h, v0.h[1] + + uqrshrn v14.4h, v14.4s, #16 + uqrshrn2 v14.8h, v15.4s, #16 + uqrshrn v15.8b, v14.8h, #FRACTION_BITS + + mov v6.16b, v7.16b + mov v7.16b, v8.16b + mov v8.16b, v9.16b + mov v9.16b, v10.16b + mov v10.16b, v11.16b +.endm/*}}}*/ + +.macro hconv1_25/*{{{*/ +.rodata + 200: .hword -4 + .hword 101f-100f + .hword 102f-100f + .hword 103f-100f + .hword 104f-100f + .hword 105f-100f + .hword 106f-100f + .hword 107f-100f + .hword 108f-100f + .hword 109f-100f + .hword 110f-100f + .hword 111f-100f + .hword 112f-100f + .hword 113f-100f + .hword 114f-100f + .hword 115f-100f + .hword 116f-100f + .hword 117f-100f + .hword 118f-100f + .hword 119f-100f + .hword 120f-100f + .hword 121f-100f + .hword 122f-100f + .hword 123f-100f + .hword 124f-100f + .hword 125f-100f + .align 4 +.text + ext v12.16b, v6.16b, v7.16b, #7*2 + umull v14.4s, v12.4h, v0.h[0] + umull2 v15.4s, v12.8h, v0.h[0] + + adrp x16, 200b + add x16, x16, :lo12:200b + ldrsh x12, [x16, x5, LSL #1] + adr 
x16, 100f + add x12, x12, x16 + 100: br x12 + 125: ext v12.16b, v31.16b, v4.16b, #6*2 + ext v13.16b, v10.16b, v11.16b, #0*2 + umlal v14.4s, v12.4h, v3.h[1] + umlal2 v15.4s, v12.8h, v3.h[1] + umlal v14.4s, v13.4h, v3.h[1] + umlal2 v15.4s, v13.8h, v3.h[1] + 124: ext v12.16b, v31.16b, v4.16b, #7*2 + ext v13.16b, v9.16b, v10.16b, #7*2 + umlal v14.4s, v12.4h, v3.h[0] + umlal2 v15.4s, v12.8h, v3.h[0] + umlal v14.4s, v13.4h, v3.h[0] + umlal2 v15.4s, v13.8h, v3.h[0] + 123: ext v12.16b, v4.16b, v5.16b, #0*2 + ext v13.16b, v9.16b, v10.16b, #6*2 + umlal v14.4s, v12.4h, v2.h[7] + umlal2 v15.4s, v12.8h, v2.h[7] + umlal v14.4s, v13.4h, v2.h[7] + umlal2 v15.4s, v13.8h, v2.h[7] + 122: ext v12.16b, v4.16b, v5.16b, #1*2 + ext v13.16b, v9.16b, v10.16b, #5*2 + umlal v14.4s, v12.4h, v2.h[6] + umlal2 v15.4s, v12.8h, v2.h[6] + umlal v14.4s, v13.4h, v2.h[6] + umlal2 v15.4s, v13.8h, v2.h[6] + 121: ext v12.16b, v4.16b, v5.16b, #2*2 + ext v13.16b, v9.16b, v10.16b, #4*2 + umlal v14.4s, v12.4h, v2.h[5] + umlal2 v15.4s, v12.8h, v2.h[5] + umlal v14.4s, v13.4h, v2.h[5] + umlal2 v15.4s, v13.8h, v2.h[5] + 120: ext v12.16b, v4.16b, v5.16b, #3*2 + ext v13.16b, v9.16b, v10.16b, #3*2 + umlal v14.4s, v12.4h, v2.h[4] + umlal2 v15.4s, v12.8h, v2.h[4] + umlal v14.4s, v13.4h, v2.h[4] + umlal2 v15.4s, v13.8h, v2.h[4] + 119: ext v12.16b, v4.16b, v5.16b, #4*2 + ext v13.16b, v9.16b, v10.16b, #2*2 + umlal v14.4s, v12.4h, v2.h[3] + umlal2 v15.4s, v12.8h, v2.h[3] + umlal v14.4s, v13.4h, v2.h[3] + umlal2 v15.4s, v13.8h, v2.h[3] + 118: ext v12.16b, v4.16b, v5.16b, #5*2 + ext v13.16b, v9.16b, v10.16b, #1*2 + umlal v14.4s, v12.4h, v2.h[2] + umlal2 v15.4s, v12.8h, v2.h[2] + umlal v14.4s, v13.4h, v2.h[2] + umlal2 v15.4s, v13.8h, v2.h[2] + 117: ext v12.16b, v4.16b, v5.16b, #6*2 + ext v13.16b, v9.16b, v10.16b, #0*2 + umlal v14.4s, v12.4h, v2.h[1] + umlal2 v15.4s, v12.8h, v2.h[1] + umlal v14.4s, v13.4h, v2.h[1] + umlal2 v15.4s, v13.8h, v2.h[1] + 116: ext v12.16b, v4.16b, v5.16b, #7*2 + ext v13.16b, v8.16b, v9.16b, #7*2 + umlal v14.4s, v12.4h, v2.h[0] + umlal2 v15.4s, v12.8h, v2.h[0] + umlal v14.4s, v13.4h, v2.h[0] + umlal2 v15.4s, v13.8h, v2.h[0] + 115: ext v12.16b, v5.16b, v6.16b, #0*2 + ext v13.16b, v8.16b, v9.16b, #6*2 + umlal v14.4s, v12.4h, v1.h[7] + umlal2 v15.4s, v12.8h, v1.h[7] + umlal v14.4s, v13.4h, v1.h[7] + umlal2 v15.4s, v13.8h, v1.h[7] + 114: ext v12.16b, v5.16b, v6.16b, #1*2 + ext v13.16b, v8.16b, v9.16b, #5*2 + umlal v14.4s, v12.4h, v1.h[6] + umlal2 v15.4s, v12.8h, v1.h[6] + umlal v14.4s, v13.4h, v1.h[6] + umlal2 v15.4s, v13.8h, v1.h[6] + 113: ext v12.16b, v5.16b, v6.16b, #2*2 + ext v13.16b, v8.16b, v9.16b, #4*2 + umlal v14.4s, v12.4h, v1.h[5] + umlal2 v15.4s, v12.8h, v1.h[5] + umlal v14.4s, v13.4h, v1.h[5] + umlal2 v15.4s, v13.8h, v1.h[5] + 112: ext v12.16b, v5.16b, v6.16b, #3*2 + ext v13.16b, v8.16b, v9.16b, #3*2 + umlal v14.4s, v12.4h, v1.h[4] + umlal2 v15.4s, v12.8h, v1.h[4] + umlal v14.4s, v13.4h, v1.h[4] + umlal2 v15.4s, v13.8h, v1.h[4] + 111: ext v12.16b, v5.16b, v6.16b, #4*2 + ext v13.16b, v8.16b, v9.16b, #2*2 + umlal v14.4s, v12.4h, v1.h[3] + umlal2 v15.4s, v12.8h, v1.h[3] + umlal v14.4s, v13.4h, v1.h[3] + umlal2 v15.4s, v13.8h, v1.h[3] + 110: ext v12.16b, v5.16b, v6.16b, #5*2 + ext v13.16b, v8.16b, v9.16b, #1*2 + umlal v14.4s, v12.4h, v1.h[2] + umlal2 v15.4s, v12.8h, v1.h[2] + umlal v14.4s, v13.4h, v1.h[2] + umlal2 v15.4s, v13.8h, v1.h[2] + 109: ext v12.16b, v5.16b, v6.16b, #6*2 + ext v13.16b, v8.16b, v9.16b, #0*2 + umlal v14.4s, v12.4h, v1.h[1] + umlal2 v15.4s, v12.8h, v1.h[1] + umlal v14.4s, v13.4h, v1.h[1] + umlal2 
v15.4s, v13.8h, v1.h[1] + 108: ext v12.16b, v5.16b, v6.16b, #7*2 + ext v13.16b, v7.16b, v8.16b, #7*2 + umlal v14.4s, v12.4h, v1.h[0] + umlal2 v15.4s, v12.8h, v1.h[0] + umlal v14.4s, v13.4h, v1.h[0] + umlal2 v15.4s, v13.8h, v1.h[0] + 107: ext v12.16b, v6.16b, v7.16b, #0*2 + ext v13.16b, v7.16b, v8.16b, #6*2 + umlal v14.4s, v12.4h, v0.h[7] + umlal2 v15.4s, v12.8h, v0.h[7] + umlal v14.4s, v13.4h, v0.h[7] + umlal2 v15.4s, v13.8h, v0.h[7] + 106: ext v12.16b, v6.16b, v7.16b, #1*2 + ext v13.16b, v7.16b, v8.16b, #5*2 + umlal v14.4s, v12.4h, v0.h[6] + umlal2 v15.4s, v12.8h, v0.h[6] + umlal v14.4s, v13.4h, v0.h[6] + umlal2 v15.4s, v13.8h, v0.h[6] + 105: ext v12.16b, v6.16b, v7.16b, #2*2 + ext v13.16b, v7.16b, v8.16b, #4*2 + umlal v14.4s, v12.4h, v0.h[5] + umlal2 v15.4s, v12.8h, v0.h[5] + umlal v14.4s, v13.4h, v0.h[5] + umlal2 v15.4s, v13.8h, v0.h[5] + 104: ext v12.16b, v6.16b, v7.16b, #3*2 + ext v13.16b, v7.16b, v8.16b, #3*2 + umlal v14.4s, v12.4h, v0.h[4] + umlal2 v15.4s, v12.8h, v0.h[4] + umlal v14.4s, v13.4h, v0.h[4] + umlal2 v15.4s, v13.8h, v0.h[4] + 103: ext v12.16b, v6.16b, v7.16b, #4*2 + ext v13.16b, v7.16b, v8.16b, #2*2 + umlal v14.4s, v12.4h, v0.h[3] + umlal2 v15.4s, v12.8h, v0.h[3] + umlal v14.4s, v13.4h, v0.h[3] + umlal2 v15.4s, v13.8h, v0.h[3] + 102: ext v12.16b, v6.16b, v7.16b, #5*2 + ext v13.16b, v7.16b, v8.16b, #1*2 + umlal v14.4s, v12.4h, v0.h[2] + umlal2 v15.4s, v12.8h, v0.h[2] + umlal v14.4s, v13.4h, v0.h[2] + umlal2 v15.4s, v13.8h, v0.h[2] + 101: ext v12.16b, v6.16b, v7.16b, #6*2 + ext v13.16b, v7.16b, v8.16b, #0*2 + umlal v14.4s, v12.4h, v0.h[1] + umlal2 v15.4s, v12.8h, v0.h[1] + umlal v14.4s, v13.4h, v0.h[1] + umlal2 v15.4s, v13.8h, v0.h[1] + + uqrshrn v14.4h, v14.4s, #16 + uqrshrn2 v14.8h, v15.4s, #16 + uqrshrn v15.8b, v14.8h, #FRACTION_BITS + + mov v31.16b, v4.16b + mov v4.16b, v5.16b + mov v5.16b, v6.16b + mov v6.16b, v7.16b + mov v7.16b, v8.16b + mov v8.16b, v9.16b + mov v9.16b, v10.16b + mov v10.16b, v11.16b +.endm/*}}}*/ + +#define TUNED_LIST4 6, 12, 20 +.macro hconv4_6/*{{{*/ +.rodata + 200: .hword -4 + .hword 101f-100f + .hword 102f-100f + .hword 103f-100f + .hword 104f-100f + .hword 105f-100f + .hword 106f-100f + .align 4 +.text + umull v14.4s, v7.4h, v0.h[0] + umull2 v15.4s, v7.8h, v0.h[0] + + adrp x16, 200b + add x16, x16, :lo12:200b + ldrsh x12, [x16, x5, LSL #1] + adr x16, 100f + add x12, x12, x16 + 100: br x12 + 106: umlal v14.4s, v4.4h, v0.h[6] + umlal2 v15.4s, v4.8h, v0.h[6] + umlal v14.4s, v10.4h, v0.h[6] + umlal2 v15.4s, v10.8h, v0.h[6] + 105: umlal2 v14.4s, v4.8h, v0.h[5] + umlal v15.4s, v5.4h, v0.h[5] + umlal2 v14.4s, v9.8h, v0.h[5] + umlal v15.4s, v10.4h, v0.h[5] + 104: umlal v14.4s, v5.4h, v0.h[4] + umlal2 v15.4s, v5.8h, v0.h[4] + umlal v14.4s, v9.4h, v0.h[4] + umlal2 v15.4s, v9.8h, v0.h[4] + 103: umlal2 v14.4s, v5.8h, v0.h[3] + umlal v15.4s, v6.4h, v0.h[3] + umlal2 v14.4s, v8.8h, v0.h[3] + umlal v15.4s, v9.4h, v0.h[3] + 102: umlal v14.4s, v6.4h, v0.h[2] + umlal2 v15.4s, v6.8h, v0.h[2] + umlal v14.4s, v8.4h, v0.h[2] + umlal2 v15.4s, v8.8h, v0.h[2] + 101: umlal2 v14.4s, v6.8h, v0.h[1] + umlal v15.4s, v7.4h, v0.h[1] + umlal2 v14.4s, v7.8h, v0.h[1] + umlal v15.4s, v8.4h, v0.h[1] + + uqrshrn v14.4h, v14.4s, #16 + uqrshrn2 v14.8h, v15.4s, #16 + uqrshrn v15.8b, v14.8h, #FRACTION_BITS + + mov v4.16b, v5.16b + mov v5.16b, v6.16b + mov v6.16b, v7.16b + mov v7.16b, v8.16b + mov v8.16b, v9.16b + mov v9.16b, v10.16b + mov v10.16b, v11.16b +.endm/*}}}*/ + +.macro hconv4_12/*{{{*/ +.rodata + 200: .hword -4 //Might need to remove these... 
+ .hword 101f-100f + .hword 102f-100f + .hword 103f-100f + .hword 104f-100f + .hword 105f-100f + .hword 106f-100f + .hword 107f-100f + .hword 108f-100f + .hword 109f-100f + .hword 110f-100f + .hword 111f-100f + .hword 112f-100f + .align 4 +.text + umull v14.4s, v4.4h, v0.h[0] + umull2 v15.4s, v4.8h, v0.h[0] + + adrp x16, 200b + add x16, x16, :lo12:200b + ldrsh x12, [x16, x5, LSL #1] + adr x16, 100f + add x12, x12, x16 + 100: br x12 + 112: umlal v14.4s, v26.4h, v1.h[4] + umlal2 v15.4s, v26.8h, v1.h[4] + umlal v14.4s, v10.4h, v1.h[4] + umlal2 v15.4s, v10.8h, v1.h[4] + 111: umlal2 v14.4s, v26.8h, v1.h[3] + umlal v15.4s, v27.4h, v1.h[3] + umlal2 v14.4s, v9.8h, v1.h[3] + umlal v15.4s, v10.4h, v1.h[3] + 110: umlal v14.4s, v27.4h, v1.h[2] + umlal2 v15.4s, v27.8h, v1.h[2] + umlal v14.4s, v9.4h, v1.h[2] + umlal2 v15.4s, v9.8h, v1.h[2] + 109: umlal2 v14.4s, v27.8h, v1.h[1] + umlal v15.4s, v28.4h, v1.h[1] + umlal2 v14.4s, v8.8h, v1.h[1] + umlal v15.4s, v9.4h, v1.h[1] + 108: umlal v14.4s, v28.4h, v1.h[0] + umlal2 v15.4s, v28.8h, v1.h[0] + umlal v14.4s, v8.4h, v1.h[0] + umlal2 v15.4s, v8.8h, v1.h[0] + 107: umlal2 v14.4s, v28.8h, v0.h[7] + umlal v15.4s, v29.4h, v0.h[7] + umlal2 v14.4s, v7.8h, v0.h[7] + umlal v15.4s, v8.4h, v0.h[7] + 106: umlal v14.4s, v29.4h, v0.h[6] + umlal2 v15.4s, v29.8h, v0.h[6] + umlal v14.4s, v7.4h, v0.h[6] + umlal2 v15.4s, v7.8h, v0.h[6] + 105: umlal2 v14.4s, v29.8h, v0.h[5] + umlal v15.4s, v30.4h, v0.h[5] + umlal2 v14.4s, v6.8h, v0.h[5] + umlal v15.4s, v7.4h, v0.h[5] + 104: umlal v14.4s, v30.4h, v0.h[4] + umlal2 v15.4s, v30.8h, v0.h[4] + umlal v14.4s, v6.4h, v0.h[4] + umlal2 v15.4s, v6.8h, v0.h[4] + 103: umlal2 v14.4s, v30.8h, v0.h[3] + umlal v15.4s, v31.4h, v0.h[3] + umlal2 v14.4s, v5.8h, v0.h[3] + umlal v15.4s, v6.4h, v0.h[3] + 102: umlal v14.4s, v31.4h, v0.h[2] + umlal2 v15.4s, v31.8h, v0.h[2] + umlal v14.4s, v5.4h, v0.h[2] + umlal2 v15.4s, v5.8h, v0.h[2] + 101: umlal2 v14.4s, v31.8h, v0.h[1] + umlal v15.4s, v4.4h, v0.h[1] + umlal2 v14.4s, v4.8h, v0.h[1] + umlal v15.4s, v5.4h, v0.h[1] + + uqrshrn v14.4h, v14.4s, #16 + uqrshrn2 v14.8h, v15.4s, #16 + uqrshrn v15.8b, v14.8h, #FRACTION_BITS + + mov v26.16b, v27.16b + mov v27.16b, v28.16b + mov v28.16b, v29.16b + mov v29.16b, v30.16b + mov v30.16b, v31.16b + mov v31.16b, v4.16b + mov v4.16b, v5.16b + mov v5.16b, v6.16b + mov v6.16b, v7.16b + mov v7.16b, v8.16b + mov v8.16b, v9.16b + mov v9.16b, v10.16b + mov v10.16b, v11.16b +.endm/*}}}*/ + +.macro hconv4_20/*{{{*/ +.rodata + 200: .hword -4 + .hword 101f-100f + .hword 102f-100f + .hword 103f-100f + .hword 104f-100f + .hword 105f-100f + .hword 106f-100f + .hword 107f-100f + .hword 108f-100f + .hword 109f-100f + .hword 110f-100f + .hword 111f-100f + .hword 112f-100f + .hword 113f-100f + .hword 114f-100f + .hword 115f-100f + .hword 116f-100f + .hword 117f-100f + .hword 118f-100f + .hword 119f-100f + .hword 120f-100f + .align 4 +.text + umull v14.4s, v28.4h, v0.h[0] + umull2 v15.4s, v28.8h, v0.h[0] + + adrp x16, 200b + add x16, x16, :lo12:200b + ldrsh x12, [x16, x5, LSL #1] + adr x16, 100f + add x12, x12, x16 + 100: br x12 + 120: umlal v14.4s, v18.4h, v2.h[4] + umlal2 v15.4s, v18.8h, v2.h[4] + umlal v14.4s, v10.4h, v2.h[4] + umlal2 v15.4s, v10.8h, v2.h[4] + 119: umlal2 v14.4s, v18.8h, v2.h[3] + umlal v15.4s, v19.4h, v2.h[3] + umlal2 v14.4s, v9.8h, v2.h[3] + umlal v15.4s, v10.4h, v2.h[3] + 118: umlal v14.4s, v19.4h, v2.h[2] + umlal2 v15.4s, v19.8h, v2.h[2] + umlal v14.4s, v9.4h, v2.h[2] + umlal2 v15.4s, v9.8h, v2.h[2] + 117: umlal2 v14.4s, v19.8h, v2.h[1] + umlal v15.4s, v20.4h, 
v2.h[1] + umlal2 v14.4s, v8.8h, v2.h[1] + umlal v15.4s, v9.4h, v2.h[1] + 116: umlal v14.4s, v20.4h, v2.h[0] + umlal2 v15.4s, v20.8h, v2.h[0] + umlal v14.4s, v8.4h, v2.h[0] + umlal2 v15.4s, v8.8h, v2.h[0] + 115: umlal2 v14.4s, v20.8h, v1.h[7] + umlal v15.4s, v21.4h, v1.h[7] + umlal2 v14.4s, v7.8h, v1.h[7] + umlal v15.4s, v8.4h, v1.h[7] + 114: umlal v14.4s, v21.4h, v1.h[6] + umlal2 v15.4s, v21.8h, v1.h[6] + umlal v14.4s, v7.4h, v1.h[6] + umlal2 v15.4s, v7.8h, v1.h[6] + 113: umlal2 v14.4s, v21.8h, v1.h[5] + umlal v15.4s, v22.4h, v1.h[5] + umlal2 v14.4s, v6.8h, v1.h[5] + umlal v15.4s, v7.4h, v1.h[5] + 112: umlal v14.4s, v22.4h, v1.h[4] + umlal2 v15.4s, v22.8h, v1.h[4] + umlal v14.4s, v6.4h, v1.h[4] + umlal2 v15.4s, v6.8h, v1.h[4] + 111: umlal2 v14.4s, v22.8h, v1.h[3] + umlal v15.4s, v23.4h, v1.h[3] + umlal2 v14.4s, v5.8h, v1.h[3] + umlal v15.4s, v6.4h, v1.h[3] + 110: umlal v14.4s, v23.4h, v1.h[2] + umlal2 v15.4s, v23.8h, v1.h[2] + umlal v14.4s, v5.4h, v1.h[2] + umlal2 v15.4s, v5.8h, v1.h[2] + 109: umlal2 v14.4s, v23.8h, v1.h[1] + umlal v15.4s, v24.4h, v1.h[1] + umlal2 v14.4s, v4.8h, v1.h[1] + umlal v15.4s, v5.4h, v1.h[1] + 108: umlal v14.4s, v24.4h, v1.h[0] + umlal2 v15.4s, v24.8h, v1.h[0] + umlal v14.4s, v4.4h, v1.h[0] + umlal2 v15.4s, v4.8h, v1.h[0] + 107: umlal2 v14.4s, v24.8h, v0.h[7] + umlal v15.4s, v25.4h, v0.h[7] + umlal2 v14.4s, v31.8h, v0.h[7] + umlal v15.4s, v4.4h, v0.h[7] + 106: umlal v14.4s, v25.4h, v0.h[6] + umlal2 v15.4s, v25.8h, v0.h[6] + umlal v14.4s, v31.4h, v0.h[6] + umlal2 v15.4s, v31.8h, v0.h[6] + 105: umlal2 v14.4s, v25.8h, v0.h[5] + umlal v15.4s, v26.4h, v0.h[5] + umlal2 v14.4s, v30.8h, v0.h[5] + umlal v15.4s, v31.4h, v0.h[5] + 104: umlal v14.4s, v26.4h, v0.h[4] + umlal2 v15.4s, v26.8h, v0.h[4] + umlal v14.4s, v30.4h, v0.h[4] + umlal2 v15.4s, v30.8h, v0.h[4] + 103: umlal2 v14.4s, v26.8h, v0.h[3] + umlal v15.4s, v27.4h, v0.h[3] + umlal2 v14.4s, v29.8h, v0.h[3] + umlal v15.4s, v30.4h, v0.h[3] + 102: umlal v14.4s, v27.4h, v0.h[2] + umlal2 v15.4s, v27.8h, v0.h[2] + umlal v14.4s, v29.4h, v0.h[2] + umlal2 v15.4s, v29.8h, v0.h[2] + 101: umlal2 v14.4s, v27.8h, v0.h[1] + umlal v15.4s, v28.4h, v0.h[1] + umlal2 v14.4s, v28.8h, v0.h[1] + umlal v15.4s, v29.4h, v0.h[1] + + uqrshrn v14.4h, v14.4s, #16 + uqrshrn2 v14.8h, v15.4s, #16 + uqrshrn v15.8b, v14.8h, #FRACTION_BITS + + mov v18.16b, v19.16b + mov v19.16b, v20.16b + mov v20.16b, v21.16b + mov v21.16b, v22.16b + mov v22.16b, v23.16b + mov v23.16b, v24.16b + mov v24.16b, v25.16b + mov v25.16b, v26.16b + mov v26.16b, v27.16b + mov v27.16b, v28.16b + mov v28.16b, v29.16b + mov v29.16b, v30.16b + mov v30.16b, v31.16b + mov v31.16b, v4.16b + mov v4.16b, v5.16b + mov v5.16b, v6.16b + mov v6.16b, v7.16b + mov v7.16b, v8.16b + mov v8.16b, v9.16b + mov v9.16b, v10.16b + mov v10.16b, v11.16b +.endm/*}}}*/ + +.macro hconv4_25/*{{{*/ +.rodata + 200: .hword -4 + .hword 101f-100f + .hword 102f-100f + .hword 103f-100f + .hword 104f-100f + .hword 105f-100f + .hword 106f-100f + .hword 107f-100f + .hword 108f-100f + .hword 109f-100f + .hword 110f-100f + .hword 111f-100f + .hword 112f-100f + .hword 113f-100f + .hword 114f-100f + .hword 115f-100f + .hword 116f-100f + .hword 117f-100f + .hword 118f-100f + .hword 119f-100f + .hword 120f-100f + .hword 121f-100f + .hword 122f-100f + .hword 123f-100f + .hword 124f-100f + .hword 125f-100f + .align 4 +.text + umull2 v14.4s, v25.8h, v0.h[0] + umull v15.4s, v26.4h, v0.h[0] + + adrp x16, 200b + add x16, x16, :lo12:200b + ldrsh x12, [x16, x5, LSL #1] + adr x16, 100f + add x12, x12, x16 + 100: br x12 + 125: ld1 
{v12.8h}, [x9] + umlal v14.4s, v12.4h, v3.h[1] + umlal2 v15.4s, v12.8h, v3.h[1] + umlal v14.4s, v10.4h, v3.h[1] + umlal2 v15.4s, v10.8h, v3.h[1] + 124: add x12, x9, #0x08 + bic x12, x12, #0x40 + ld1 {v12.4h}, [x12], #8 + bic x12, x12, #0x40 + ld1 {v13.4h}, [x12] + umlal v14.4s, v12.4h, v3.h[0] + umlal v15.4s, v13.4h, v3.h[0] + umlal2 v14.4s, v9.8h, v3.h[0] + umlal v15.4s, v10.4h, v3.h[0] + 123: add x12, x9, #0x10 + bic x12, x12, #0x40 + ld1 {v12.8h}, [x12] + umlal v14.4s, v12.4h, v2.h[7] + umlal2 v15.4s, v12.8h, v2.h[7] + umlal v14.4s, v9.4h, v2.h[7] + umlal2 v15.4s, v9.8h, v2.h[7] + 122: add x12, x9, #0x18 + bic x12, x12, #0x40 + ld1 {v12.4h}, [x12], #8 + bic x12, x12, #0x40 + ld1 {v13.4h}, [x12] + umlal v14.4s, v12.4h, v2.h[6] + umlal v15.4s, v13.4h, v2.h[6] + umlal2 v14.4s, v8.8h, v2.h[6] + umlal v15.4s, v9.4h, v2.h[6] + 121: add x12, x9, #0x20 + bic x12, x12, #0x40 + ld1 {v12.8h}, [x12] + umlal v14.4s, v12.4h, v2.h[5] + umlal2 v15.4s, v12.8h, v2.h[5] + umlal v14.4s, v8.4h, v2.h[5] + umlal2 v15.4s, v8.8h, v2.h[5] + 120: add x12, x9, #0x28 + bic x12, x12, #0x40 + ld1 {v12.4h}, [x12], #8 + bic x12, x12, #0x40 + ld1 {v13.4h}, [x12] + umlal v14.4s, v12.4h, v2.h[4] + umlal v15.4s, v13.4h, v2.h[4] + umlal2 v14.4s, v7.8h, v2.h[4] + umlal v15.4s, v8.4h, v2.h[4] + 119: add x12, x9, #0x30 + bic x12, x12, #0x40 + ld1 {v12.8h}, [x12] + umlal v14.4s, v12.4h, v2.h[3] + umlal2 v15.4s, v12.8h, v2.h[3] + umlal v14.4s, v7.4h, v2.h[3] + umlal2 v15.4s, v7.8h, v2.h[3] + 118: add x12, x9, #0x38 + bic x12, x12, #0x40 + ld1 {v12.4h}, [x12] + umlal v14.4s, v12.4h, v2.h[2] + umlal v15.4s, v17.4h, v2.h[2] + umlal2 v14.4s, v6.8h, v2.h[2] + umlal v15.4s, v7.4h, v2.h[2] + 117: umlal v14.4s, v17.4h, v2.h[1] + umlal2 v15.4s, v17.8h, v2.h[1] + umlal v14.4s, v6.4h, v2.h[1] + umlal2 v15.4s, v6.8h, v2.h[1] + 116: umlal2 v14.4s, v17.8h, v2.h[0] + umlal v15.4s, v18.4h, v2.h[0] + umlal2 v14.4s, v5.8h, v2.h[0] + umlal v15.4s, v6.4h, v2.h[0] + 115: umlal v14.4s, v18.4h, v1.h[7] + umlal2 v15.4s, v18.8h, v1.h[7] + umlal v14.4s, v5.4h, v1.h[7] + umlal2 v15.4s, v5.8h, v1.h[7] + 114: umlal2 v14.4s, v18.8h, v1.h[6] + umlal v15.4s, v19.4h, v1.h[6] + umlal2 v14.4s, v4.8h, v1.h[6] + umlal v15.4s, v5.4h, v1.h[6] + 113: umlal v14.4s, v19.4h, v1.h[5] + umlal2 v15.4s, v19.8h, v1.h[5] + umlal v14.4s, v4.4h, v1.h[5] + umlal2 v15.4s, v4.8h, v1.h[5] + 112: umlal2 v14.4s, v19.8h, v1.h[4] + umlal v15.4s, v20.4h, v1.h[4] + umlal2 v14.4s, v31.8h, v1.h[4] + umlal v15.4s, v4.4h, v1.h[4] + 111: umlal v14.4s, v20.4h, v1.h[3] + umlal2 v15.4s, v20.8h, v1.h[3] + umlal v14.4s, v31.4h, v1.h[3] + umlal2 v15.4s, v31.8h, v1.h[3] + 110: umlal2 v14.4s, v20.8h, v1.h[2] + umlal v15.4s, v21.4h, v1.h[2] + umlal2 v14.4s, v30.8h, v1.h[2] + umlal v15.4s, v31.4h, v1.h[2] + 109: umlal v14.4s, v21.4h, v1.h[1] + umlal2 v15.4s, v21.8h, v1.h[1] + umlal v14.4s, v30.4h, v1.h[1] + umlal2 v15.4s, v30.8h, v1.h[1] + 108: umlal2 v14.4s, v21.8h, v1.h[0] + umlal v15.4s, v22.4h, v1.h[0] + umlal2 v14.4s, v29.8h, v1.h[0] + umlal v15.4s, v30.4h, v1.h[0] + 107: umlal v14.4s, v22.4h, v0.h[7] + umlal2 v15.4s, v22.8h, v0.h[7] + umlal v14.4s, v29.4h, v0.h[7] + umlal2 v15.4s, v29.8h, v0.h[7] + 106: umlal2 v14.4s, v22.8h, v0.h[6] + umlal v15.4s, v23.4h, v0.h[6] + umlal2 v14.4s, v28.8h, v0.h[6] + umlal v15.4s, v29.4h, v0.h[6] + 105: umlal v14.4s, v23.4h, v0.h[5] + umlal2 v15.4s, v23.8h, v0.h[5] + umlal v14.4s, v28.4h, v0.h[5] + umlal2 v15.4s, v28.8h, v0.h[5] + 104: umlal2 v14.4s, v23.8h, v0.h[4] + umlal v15.4s, v24.4h, v0.h[4] + umlal2 v14.4s, v27.8h, v0.h[4] + umlal v15.4s, v28.4h, v0.h[4] + 
103: umlal v14.4s, v24.4h, v0.h[3] + umlal2 v15.4s, v24.8h, v0.h[3] + umlal v14.4s, v27.4h, v0.h[3] + umlal2 v15.4s, v27.8h, v0.h[3] + 102: umlal2 v14.4s, v24.8h, v0.h[2] + umlal v15.4s, v25.4h, v0.h[2] + umlal2 v14.4s, v26.8h, v0.h[2] + umlal v15.4s, v27.4h, v0.h[2] + 101: umlal v14.4s, v25.4h, v0.h[1] + umlal2 v15.4s, v25.8h, v0.h[1] + umlal v14.4s, v26.4h, v0.h[1] + umlal2 v15.4s, v26.8h, v0.h[1] + + uqrshrn v14.4h, v14.4s, #16 + uqrshrn2 v14.8h, v15.4s, #16 + uqrshrn v15.8b, v14.8h, #FRACTION_BITS + + st1 {v17.16b}, [x9], #16 + bic x9, x9, #0x40 + mov v17.16b, v18.16b + mov v18.16b, v19.16b + mov v19.16b, v20.16b + mov v20.16b, v21.16b + mov v21.16b, v22.16b + mov v22.16b, v23.16b + mov v23.16b, v24.16b + mov v24.16b, v25.16b + mov v25.16b, v26.16b + mov v26.16b, v27.16b + mov v27.16b, v28.16b + mov v28.16b, v29.16b + mov v29.16b, v30.16b + mov v30.16b, v31.16b + mov v31.16b, v4.16b + mov v4.16b, v5.16b + mov v5.16b, v6.16b + mov v6.16b, v7.16b + mov v7.16b, v8.16b + mov v8.16b, v9.16b + mov v9.16b, v10.16b + mov v10.16b, v11.16b +.endm/*}}}*/ + +/* Dedicated function wrapper for the fetch macro, for the cases where + * performance isn't that important, to keep code size down. + */ +PRIVATE(fetch_generic_asm) + stp x10, x11, [sp, #-16]! + fetch + ldp x10, x11, [sp], #16 + ret +END(fetch_generic_asm) + + +/* Fetch the next (16 - (x10 & 15)) columns of data, avoiding reading memory + * beyond that limit, and filling the rest of the vector with the last legal + * pixel. + * Result is in v10 and v11. v8 and v9 are filled with the first legal pixel. + * Note: This function can read beyond the right edge of input if the image is + * narrower than 16 bytes. + */ +PRIVATE(fetch_clampleft1) + stp x29, x30, [sp, #-16]! + bl fetch_generic_asm + dup v8.8h, v10.h[0] + dup v9.8h, v10.h[0] + ands x12, x10, #15 + beq 1f + sub x1, x1, x12 + sub x15, x15, x12 + sub x19, x19, x12 + sub x10, x10, x12 + sub x12, sp, x12, LSL #1 + sub sp, sp, #64 + sub x12, x12, #32 + st1 {v8.8h, v9.8h, v10.8h,v11.8h}, [sp] + ld1 {v10.8h,v11.8h}, [x12] + add sp, sp, #64 +1: ldp x29, x30, [sp], #16 + ret +END(fetch_clampleft1) + +PRIVATE(fetch_clampleft4) + stp x29, x30, [sp, #-16]! + bl fetch_generic_asm + dup v8.2d, v10.d[0] + dup v9.2d, v10.d[0] + ands x12, x10, #15 + beq 1f + sub x1, x1, x12 + sub x15, x15, x12 + sub x19, x19, x12 + sub x10, x10, x12 + sub x12, sp, x12, LSL #1 + sub sp, sp, #64 + sub x12, x12, #32 + st1 {v8.8h, v9.8h, v10.8h,v11.8h}, [sp] + ld1 {v10.8h,v11.8h}, [x12] + add sp, sp, #64 +1: ldp x29, x30, [sp], #16 + ret +END(fetch_clampleft4) + +/* Fetch only the next (x11 & 15) (where 0 means 16) columns of data, avoiding + * reading memory beyond that limit, and filling the rest of the vector with + * the last legal pixel. + * Result is in v10 and v11. v12 and v13 are filled with the last legal pixel. + * Note: This function can read beyond the left edge of input if the image is + * narrower than 16 bytes. + */ +PRIVATE(fetch_clampright1) + stp x29, x30, [sp, #-16]! 
+ sub x12, xzr, x11 + ands x12, x12, #15 + beq 1f + sub x1, x1, x12 + sub x15, x15, x12 + sub x19, x19, x12 + bl fetch_generic_asm + dup v12.8h, v11.h[7] + dup v13.8h, v11.h[7] + sub x12, xzr, x11 + and x12, x12, #15 + sub sp, sp, #64 + add x12, sp, x12, LSL #1 + st1 {v10.8h,v11.8h,v12.8h,v13.8h}, [sp] + ld1 {v10.8h,v11.8h}, [x12] + add sp, sp, #64 + ldp x29, x30, [sp], #16 + ret +1: bl fetch_generic_asm + dup v12.8h, v11.h[7] + dup v13.8h, v11.h[7] + ldp x29, x30, [sp], #16 + ret +END(fetch_clampright1) + +PRIVATE(fetch_clampright4) + stp x29, x30, [sp, #-16]! + sub x12, xzr, x11 + ands x12, x12, #15 + beq 1f + sub x1, x1, x12 + sub x15, x15, x12 + sub x19, x19, x12 + bl fetch_generic_asm + dup v12.2d, v11.d[1] + dup v13.2d, v11.d[1] + sub x12, xzr, x11 + and x12, x12, #15 + sub sp, sp, #64 + add x12, sp, x12, LSL #1 + st1 {v10.8h,v11.8h,v12.8h,v13.8h}, [sp] + ld1 {v10.8h,v11.8h}, [x12] + add sp, sp, #64 + ldp x29, x30, [sp], #16 + ret +1: bl fetch_generic_asm + dup v12.2d, v11.d[1] + dup v13.2d, v11.d[1] + ldp x29, x30, [sp], #16 + ret +END(fetch_clampright4) + +/* Given values in v10 and v11, and an index in x11, sweep the (x11 & 15)th + * value across to fill the rest of the register pair. Used for filling the + * right hand edge of the window when reading too close to the right hand edge + * of the image. + * Also returns a dup-ed copy of the last element in v12 for the tail-fill + * case (this happens incidentally in common path, but must be done + * deliberately in the fast-out path). + */ +PRIVATE(prefill_sweepright1) + ands x12, x11, #15 + beq 1f + sub x12, x12, #1 + sub sp, sp, #64 + st1 {v10.8h,v11.8h}, [sp] + add x12, sp, x12, LSL #1 + ld1r {v12.8h}, [x12] + ld1r {v13.8h}, [x12] + st1 {v12.8h,v13.8h}, [x12] + ld1 {v10.8h,v11.8h}, [sp] + add sp, sp, #64 + ret +1: dup v12.8h, v11.h[7] + dup v13.8h, v11.h[7] + ret +END(prefill_sweepright1) + +PRIVATE(prefill_sweepright4) + ands x12, x11, #15 + beq 1f + sub x12, x12, #4 + sub sp, sp, #64 + st1 {v10.8h,v11.8h}, [sp] + add x12, sp, x12, LSL #1 + ld1r {v12.2d}, [x12] + st1 {v13.8h}, [x12] + ld1 {v10.8h,v11.8h}, [sp] + add sp, sp, #64 + ret +1: dup v12.2d, v11.d[1] + dup v13.2d, v11.d[1] + ret +END(prefill_sweepright4) + +/* The main loop keeps a sliding window of data that has already been convolved + * in the vertical axis for the current line. This usually stays in the + * register file, but spills to memory for large windows. The first thing that + * needs to be done at start-up is to fill this window with image data, taking + * into account the padding needed if the left or right edges of the image fall + * within this window. + */ + +/* Because the window is in the register file writes to it cannot be indexed + * by another register. Consequently the fill loops are unrolled to address + * the registers directly. This macro distinguishes between writes to the + * register file and writes to the spill buffer (indicated by a destination + * register named xx). + */ +.macro prefill_out ra, rb, sra, srb + .ifc \ra,xx + .ifc \rb,xx + st1 {\sra,\srb}, [x9], #32 + .else + bic x9, x9, #0x40 + st1 {\sra}, [x9], #16 + mov \rb, \srb + .endif + .else + .ifnc \ra,\sra + mov \ra, \sra + .endif + .ifnc \rb,\srb + mov \rb, \srb + .endif + .endif +.endm + +/* This macro provides the list of registers representing the window, and the + * cases where the register file is too small and a spill buffer is used + * instead. 
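When the window no longer fits in the register file, prefill_out above spills 16-byte chunks to the buffer at [x9] and then wraps the pointer with bic x9, x9, #0x40. Because the buffer's base address has the relevant low bits clear (the convolve4 wrappers later in this file allocate it 128-byte aligned), clearing bit 6 is a cheap modulo-64 wrap. The same trick expressed in C++ as a power-of-two ring (sizes and names are illustrative):

    #include <cstddef>
    #include <cstdint>

    // A 64-byte ring used as a spill area. Because kSize is a power of two,
    // "advance, then mask the offset" plays the role of the post-incremented
    // store followed by bic #0x40 in the assembly.
    struct SpillRing {
        static constexpr size_t kSize = 64;
        alignas(16) uint8_t data[kSize] = {};
        size_t offset = 0;

        uint8_t* next16() {                        // hand out the next 16-byte slot
            uint8_t* p = data + offset;
            offset = (offset + 16) & (kSize - 1);  // wrap at 64 bytes
            return p;
        }
    };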
+ * Since several specialisations of each function are generated, this also + * culls superfluous iterations, and sets the variable `i` for subsequent + * macros indicating the current index into the window. + */ +.macro prefill_list, macro, nextmacro, max_r, step, label + .macro ifneeded macro, nextmacro, line, nextline, ra, rb, step, label + .if windowsize >= (\line * 16) + .set i, windowsize - (\line * 16) +\label\macro\line: + prefill_\macro \label\nextmacro\line, \label\nextmacro\nextline, \ra, \rb, \step + .endif + .endm + ifneeded \macro \nextmacro, 13, 12, xx, xx, \step, \label + ifneeded \macro \nextmacro, 12, 11, xx, xx, \step, \label + ifneeded \macro \nextmacro, 11, 10, xx, v17.16b, \step, \label + ifneeded \macro \nextmacro, 10, 9, v18.16b, v19.16b, \step, \label + ifneeded \macro \nextmacro, 9, 8, v20.16b, v21.16b, \step, \label + ifneeded \macro \nextmacro, 8, 7, v22.16b, v23.16b, \step, \label + ifneeded \macro \nextmacro, 7, 6, v24.16b, v25.16b, \step, \label + ifneeded \macro \nextmacro, 6, 5, v26.16b, v27.16b, \step, \label + ifneeded \macro \nextmacro, 5, 4, v28.16b, v29.16b, \step, \label + ifneeded \macro \nextmacro, 4, 3, v30.16b, v31.16b, \step, \label + ifneeded \macro \nextmacro, 3, 2, v4.16b, v5.16b, \step, \label + ifneeded \macro \nextmacro, 2, 1, v6.16b, v7.16b, \step, \label + ifneeded \macro \nextmacro, 1, 0, v8.16b, v9.16b, \step, \label +\label\macro\()0: + b \label\()_end + .purgem ifneeded +.endm + +/* These macros represent the possible stages of filling the window. + * Each macro is unrolled enough times that it can fill the entire window + * itself, but normally it will have to hand control to subsequent macros + * part-way through and this is done using labels named \next and \after, where + * \next is the next macro starting at the same window position and \after is + * the next macro starting after the current window position. + */ + +/* leftfill: v8 and v9 contain the left padding value. While the window + * extends outside of the image on the left-hand side, and at least 16 more + * padding values are needed in the window, store v8 and v9 into the window. + * Otherwise skip forward to storing image data. + */ +.macro prefill_leftfill, next, after, ra, rb, step + cmp x10, #i+16 + blo \next + prefill_out \ra, \rb, v8.16b, v9.16b +.endm + +/* leftedge: The very first non-fill or partial-fill chunk from the image is + * already loaded (as it was used to calculate the left padding value), so + * store it here, and then drop into the regular load/store cycle in the next + * macro. + */ +.macro prefill_leftedge, next, after, ra, rb, step +1: prefill_out \ra, \rb, v10.16b, v11.16b + b \after +.endm + +/* dofetch: Copy chunks of the image into the window without any complications + * from edge conditions. + */ +.macro prefill_dofetch, next, after, ra, rb, step + cmp x11, #i+16 + bls \next + bl fetch_generic_asm + prefill_out \ra, \rb, v10.16b, v11.16b +.endm + +/* rightedge: The last fetch (currently in v10 and v11) may have gone beyond + * the right-hand edge of the image. In that case sweep the last valid pixel + * across the rest of the chunk, and in either case prepare padding data in v12 + * and v13 for the next macro. This is done in fetch_clampright. + * This only happens once before going on to the next macro. + * Sometimes leftedge also covers the rightedge case, in which case this has + * to be skipped altogether. 
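Taken together, these stages implement clamp-to-edge padding for the context columns: window positions that fall left of the image repeat the first pixel, positions past the right edge repeat the last one, and everything in between is real image data. A scalar C++ rendering of that policy (purely illustrative; the real code works in 16-byte chunks and keeps the window in registers or the spill ring):

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // Fill a window of 2*r + 1 samples centred on x, clamping reads to the
    // valid range [0, width); the scalar equivalent of the leftfill /
    // dofetch / rightfill staging.
    std::vector<uint8_t> fillWindow(const uint8_t* row, int width, int x, int r) {
        std::vector<uint8_t> window(2 * r + 1);
        for (int i = -r; i <= r; ++i) {
            int sx = std::clamp(x + i, 0, width - 1);
            window[i + r] = row[sx];
        }
        return window;
    }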
+ */ +.macro prefill_rightedge, next, after, ra, rb, step + cmp x11, #i + bls \next + bl fetch_clampright\step + prefill_out \ra, \rb, v10.16b, v11.16b + b \after +.endm + +/* rightfill: The rest of the window is simply filled with right padding from + * v12 and v13. + */ +.macro prefill_rightfill, next, after, ra, rb, step + prefill_out \ra, \rb, v12.16b, v13.16b +.endm + +/* Here all of the macros above are unrolled and laid out in the proper order. + */ +.macro prefill_body, max_r, step, label + prefill_list leftfill, leftedge, \max_r, \step, \label + prefill_list leftedge, dofetch, \max_r, \step, \label + prefill_list dofetch, rightedge, \max_r, \step, \label + prefill_list rightedge, rightfill, \max_r, \step, \label + prefill_list rightfill, oops, \max_r, \step, \label +\label\()_end: +.endm + + +/* Fill the convolution window with context data. The aim here is to load + * exactly 2*r columns, and in the main loop to read as many columns as will be + * written. This is complicated by the window being divided into chunks at + * register boundaries, and the need to handle cases when the input starts very + * close to the left or right (or both) edges of the image and the need to fill + * the spaces that leaves with left and right edge padding values. + * + * Input: + * x1 -- src + * x2 -- pitch + * x3 -- count + * x4 -- available image data right of src pointer + * x5 -- r + * x6 -- rup + * x7 -- rdn + * x8 -- available image data left of src pointer + * x9 -- buffer (if needed) + * x13 = -pitch + * x15 = top-row in + * x19 = bottom-row in + * Output: + * x4 -= min(inlen, count + windowsize - centertap) + * x1 += min(inlen, count + windowsize - centertap) + * x15 += min(inlen, count + windowsize - centertap) + * x19 += min(inlen, count + windowsize - centertap) + * Modifies: + * x10 -- fill start index in the window + * x11 -- fill stop index in the window + * x12 -- scratch + */ +.macro prefill step=1, max_r=25, label=xx +.set windowsize, (((\max_r + \max_r) * \step + 15) & ~15) +.set centertap, (windowsize - \max_r * \step) + mov x10, #centertap + subs x10, x10, x8 + csel x10, xzr, x10, lo + + subs x11, x4, #windowsize - centertap + csel x11, xzr, x11, hs + add x11, x11, #windowsize + + /* x10 indicates where in the window legal image data begins. + * x11 indicates where in the window legal image date ends. + * When starting near the centre of a large image these would be + * zero and windowsize respectively, but when starting near the + * edges this can change. + * When starting on the leftmost pixel, x10 will be centertap. + * When starting on the rightmost pixel, x11 will be centertap+1. + */ + + /* x4 indicates how much data there is between the current pointers + * and the right edge of the image. The pointers currently point + * to the data needed at centertap. The subsequent code will + * consume (windowsize - x10) data, but only the data from + * centertap to windowsize comes out of x4's budget. + */ +1: subs x4, x4, #windowsize - centertap + csel x4, xzr, x4, lo + + /* And the pointers need to rewind to the start of the window. + */ + sub x1, x1, #centertap + sub x15, x15, #centertap + sub x19, x19, #centertap + + /* Unless x8 indicated that there wasn't that much data available. + */ + add x1, x1, x10 + add x15, x15, x10 + add x19, x19, x10 + + /* Get the first chunk, and add padding to align it to the window + * if necessary. + */ + bl fetch_clampleft\step + + /* Sometimes the start and the end of the window are in the same + * chunk. 
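For orientation, the window geometry computed by the .set lines above works out as follows; this is just a compile-time restatement in C++ with the radius-25 cases evaluated (names are illustrative):

    #include <cstddef>

    // step is the pixel size in bytes (1 or 4), r the blur radius.
    constexpr size_t windowSize(size_t r, size_t step) {
        return (2 * r * step + 15) & ~size_t{15};  // 2*r columns, rounded up to 16 bytes
    }
    constexpr size_t centerTap(size_t r, size_t step) {
        return windowSize(r, step) - r * step;
    }

    // r = 25, step = 1: a 64-byte window with the centre tap at offset 39.
    // r = 25, step = 4: a 208-byte window with the centre tap at offset 108.
    static_assert(windowSize(25, 1) == 64 && centerTap(25, 1) == 39, "");
    static_assert(windowSize(25, 4) == 208 && centerTap(25, 4) == 108, "");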
In that case both ends need filler at the outset. + */ + sub x12, x11, #1 + eor x12, x10, x12 + cmp x12, #16 + bhs 1f + bl prefill_sweepright\step + + /* Iterate through all the points in the window and fill them in + * with padding or image data as needed. + */ +1: prefill_body \max_r, \step, \label +.endm + +/* The main body of the convolve functions. Having already pre-filled the + * convolution window with 2*r input values, the logic settles into a regular + * pattern of reading and writing at a 1:1 rate until either input or output + * expires. The input leads the output by r values, so when processing all the + * way to the right-hand edge, or within r pixels of that edge, the input will + * run out first. In the case of very narrow images, or sub-windows starting + * near the right edge, the input may already have run out while the + * convolution window was being filled and this loop will start with a + * zero-length input. + * + * Once the input runs out, the rest of the output must be processed by padding + * the remainder of the window with pad value from the last valid pixel from + * the source. + * + * Input: + * x0 = dst + * x1 = src + * x2 = pitch + * x3 = count + * x4 = inlen + * x5 = r + * x6 = rup + * x7 = rdn + * x9 = buffer + * x13 = -pitch + * x15 = top-row in + * x19 = bottom-row in + * Modifies + * x8 = fetch code pointer + */ +.macro conv_body core, step=1, max_r=25, labelc="", labelnc="" + + /* If x4 >= x3 then there's no need for clipping. The main loop + * needs to exit when either x3 or x4 runs out, so clamp x4 to be + * no greater than x3 and use x4 for the loop. + * However, if x4 comes out of the loop with less than 16 bytes + * left, a partial read would be necessary to avoid reading beyond + * the end of the image. To avoid this, clamp x4 to the next + * multiple of 16, which is still sufficient to force it out of the + * loop but doesn't imply a rewind. + */ + add x12, x3, #15 + bic x12, x12, #15 + cmp x4, x12 + csel x4, x12, x4, hi + + /* First calculate the entry-point into the internal fetch logic. + * This is done so the same function can service several kernel + * sizes. + */ + adrp x8, \labelnc + add x8, x8, #:lo12:\labelnc + sub x8, x8, x5, LSL #5 + sub x8, x8, x5, LSL #3 + cmp x5, x6 + ccmp x5, x7, #0, eq + beq 5f + + /* if (r != rup || r != rdn) then the address-clamping table should + * be used rather than the short-cut version. + */ + adrp x8, \labelc + add x8, x8, #:lo12:\labelc + sub x8, x8, x5, LSL #6 + add x8, x8, x5, LSL #3 + b 5f + + /* Main loop: ... */ + .align 4 +3: /* first perform a vertical convolution from memory to get the next + * 16 taps of the horizontal window into the register file... + */ + fetch max_r=\max_r, labelc=\labelc, labelnc=\labelnc, reg=x8 + + /* ...then perform a horizontal convolution on that window to + * produce eight output bytes, and slide the window along. + * This has to be done twice to match the 16-way vertical pass. + * It would be preferable to have twice the work done in \core, but + * that would demand yet another variant on those macros and would + * perturb the register allocation severely. + */ + \core + st1 {v15.8b}, [x0], #8 + \core + st1 {v15.8b}, [x0], #8 + + sub x3, x3, #16 +5: subs x4, x4, #16 + bhi 3b + /* Here there's 16 or fewer bytes available before the edge of the + * source image. x4 holds that count minus 16 (because it was + * decremented before the first iteration ran). The last read may + * not be a whole chunk, and beyond that a fill value must be used. 
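Summarising the control flow being assembled here as a heavily simplified C++ sketch: the commented-out calls stand in for the fetch macro and for the two invocations of the \core horizontal pass, and the two counters play the roles of x3 (output remaining) and x4 (input remaining). This is a shape-only illustration, not a faithful translation:

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>

    void convolveRowSketch(uint8_t* dst, size_t count, size_t inlen) {
        // Clamp the input budget to the next multiple of 16 above the output
        // count, so the loop exits on whichever runs out first without ever
        // implying a partial, rewinding read.
        size_t budget = std::min(inlen, (count + 15) & ~size_t{15});
        while (budget > 16 && count >= 16) {
            // fetchSixteenColumns();      // vertical pass (the fetch macro)
            // dst = horizontalCore(dst);  // first 8 output bytes  (\core)
            // dst = horizontalCore(dst);  // second 8 output bytes (\core)
            budget -= 16;
            count -= 16;
        }
        // The tail pads the window with the last valid pixel and emits the
        // remaining bytes 8 (or fewer) at a time, as described above.
        (void)dst;
    }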
+ * + * Of course, none of that matters if there's no more output to + * produce... + */ + cbz x3, 5f + + /* Oh well. */ + adds x4, x4, #16 + bne 1f + .if \step==1 + dup v10.8h, v9.h[7] + dup v11.8h, v9.h[7] + .else + dup v10.2d, v9.d[1] + dup v11.2d, v9.d[1] + .endif + b 3f + + /* To avoid reading past end of input, rewind pointers by (16-x4) + * to ensure that they're exactly 16 bytes from the edge. + */ +1: mov x11, x4 + bl fetch_clampright\step + /* Now to put this padding to use, perform any remaining + * iterations. This is done at half the rate of the main loop, + * because there's no longer pressure from a 16-lane window filler. + */ +3: \core + .if \step==1 + dup v11.8h, v11.h[7] + .else + dup v11.2d, v11.d[1] + .endif + subs x3, x3, #8 + blo 4f + st1 {v15.8b}, [x0], #8 + bne 3b + b 5f + + /* If the final iteration contained 0 < l < 8 values, then perform + * a piecewise store of the final vector. + */ +4: tbz x3, #2, 1f + st1 {v15.s}[0], [x0], #4 + ext v15.8b, v15.8b, v15.8b, #4 +1: tbz x3, #1, 1f + st1 {v15.h}[0], [x0], #2 + ext v15.8b, v15.8b, v15.8b, #2 +1: tbz x3, #0, 5f + st1 {v15.b}[0], [x0], #1 + ext v15.8b, v15.8b, v15.8b, #1 +5: mov x0, #0 +.endm + + +.irp r, TUNED_LIST1, 25 +PRIVATE(convolve1_\r) + stp x29,x30, [sp, #-16]! + + prefill step=1, max_r=\r, label=.Lcnv1_\r + + conv_body core=hconv1_\r, step=1, max_r=\r, labelc=.Lcnv1_\r, labelnc=.Lcnvnc1_\r + + ldp x29,x30, [sp], #16 + ret +END(convolve1_\r) +.endr + +.irp r, TUNED_LIST4, 25 +PRIVATE(convolve4_\r) + sub x9, sp, #0x40 + stp x29,x30, [sp, #-(16 + 0x40 + 0x80)]! + bic x9, x9, #0x7f + + /* x9 now points to a 0x40 byte buffer on the stack whose address + * has the low 7 bits clear. This allows easy address calculation + * in the wrap-around cases. + */ + + prefill step=4, max_r=\r, label=.Lcnv4_\r + + conv_body core=hconv4_\r, step=4, max_r=\r, labelc=.Lcnv4_\r, labelnc=.Lcnvnc4_\r + + ldp x29,x30, [sp], #(16 + 0x40 + 0x80) + ret +END(convolve4_\r) +.endr + +/* void rsdIntrinsicBlurU1_K( + * void *out, // x0 + * void *in, // x1 + * size_t w, // x2 + * size_t h, // x3 + * size_t p, // x4 + * size_t x, // x5 + * size_t y, // x6 + * size_t count, // x7 + * size_t r, // [sp] + * uint16_t *tab); // [sp,#8] + */ +ENTRY(rsdIntrinsicBlurU1_K) + stp x19,x30, [sp, #-16]! + sub x8, sp, #32 + sub sp, sp, #64 + st1 {v8.1d - v11.1d}, [sp] + st1 {v12.1d - v15.1d}, [x8] + mov x8, x5 // x + ldr w5, [sp,#80] // r + sub x9, x2, x8 // w - x + sub x10, x3, x6 // h - y + mov x2, x4 // pitch + mov x3, x7 // count + sub x7, x10, #1 // h - y - 1 + mov x4, x9 // inlen = (w - x) + + ldr x12, [sp, #88] // tab + + add x1, x1, x8 // src += x + + cmp x6, x5 + csel x6, x5, x6, hs // rup = min(r, y) + cmp x7, x5 + csel x7, x5, x7, hs // rdn = min(r, h - y - 1) + + sub x13, xzr, x2 // -pitch + msub x15, x2, x6, x1 + madd x19, x2, x7, x1 + + ld1 {v0.8h,v1.8h}, [x12], #32 + ld1 {v2.8h,v3.8h}, [x12], #32 + + adr x30, 1f + .irp r, TUNED_LIST1 + cmp x5, #\r + bls convolve1_\r + .endr + b convolve1_25 + +1: ld1 {v8.1d - v11.1d}, [sp], #32 + ld1 {v12.1d - v15.1d}, [sp], #32 + ldp x19,x30, [sp], #16 + ret +END(rsdIntrinsicBlurU1_K) + +/* void rsdIntrinsicBlurU4_K( + * void *out, // x0 + * void *in, // x1 + * size_t w, // x2 + * size_t h, // x3 + * size_t p, // x4 + * size_t x, // x5 + * size_t y, // x6 + * size_t count, // x7 + * size_t r, // [sp] + * uint16_t *tab); // [sp,#8] + */ +ENTRY(rsdIntrinsicBlurU4_K) + stp x19,x30, [sp, #-16]! 
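The U4 entry point continuing below repeats the marshalling just performed in rsdIntrinsicBlurU1_K, with x, count and the row offsets scaled by four bytes per pixel. Restated in C++ with descriptive, hypothetical names (the register comments above are the authority; this is only a sketch):

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>

    struct BlurRowSetup {
        const uint8_t* src;     // first pixel to convolve         (x1 after add)
        const uint8_t* topRow;  // src - rup * pitch               (x15)
        const uint8_t* botRow;  // src + rdn * pitch               (x19)
        size_t inlen;           // image data to the right of src  (x4)
        size_t rup, rdn;        // radius clipped to the image edges
    };

    inline BlurRowSetup setupRow(const uint8_t* in, size_t pitch, size_t w,
                                 size_t h, size_t x, size_t y, size_t r) {
        BlurRowSetup s;
        s.src = in + x;
        s.inlen = w - x;                 // inlen = (w - x)
        s.rup = std::min(r, y);          // rup = min(r, y)
        s.rdn = std::min(r, h - y - 1);  // rdn = min(r, h - y - 1)
        s.topRow = s.src - s.rup * pitch;
        s.botRow = s.src + s.rdn * pitch;
        return s;
    }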
+ sub x8, sp, #32 + sub sp, sp, #64 + st1 {v8.1d - v11.1d}, [sp] + st1 {v12.1d - v15.1d}, [x8] + lsl x8, x5, #2 // x + lsl x2, x2, #2 + ldr w5, [sp,#80] // r + sub x9, x2, x8 // w - x + sub x10, x3, x6 // h - y + mov x2, x4 // pitch + lsl x3, x7, #2 // count + sub x7, x10, #1 // h - y - 1 + mov x4, x9 // inlen = (w - x) + + ldr x12, [sp, #88] + + add x1, x1, x8 // in += x + + cmp x6, x5 + csel x6, x5, x6, hs // rup = min(r, y) + cmp x7, x5 + csel x7, x5, x7, hs // rdn = min(r, h - y - 1) + + + sub x13, xzr, x2 + msub x15, x2, x6, x1 + madd x19, x2, x7, x1 + + ld1 {v0.8h,v1.8h}, [x12], #32 + ld1 {v2.8h,v3.8h}, [x12], #32 + + adr x30, 1f + .irp r, TUNED_LIST4 + cmp x5, #\r + bls convolve4_\r + .endr + b convolve4_25 + +1: ld1 {v8.1d - v11.1d}, [sp], #32 + ld1 {v12.1d - v15.1d}, [sp], #32 + ldp x19,x30, [sp], #16 + ret +END(rsdIntrinsicBlurU4_K) diff --git a/renderscript-toolkit/src/main/cpp/Blur_neon.S b/renderscript-toolkit/src/main/cpp/Blur_neon.S new file mode 100644 index 0000000..241af5f --- /dev/null +++ b/renderscript-toolkit/src/main/cpp/Blur_neon.S @@ -0,0 +1,1824 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f: .fnstart +#define PRIVATE(f) .text; .align 4; .type f,#function; f: .fnstart +#define END(f) .fnend; .size f, .-f; + +#define ARCH_ARM_USE_BLUR_PRELOAD + +.eabi_attribute 25,1 @Tag_ABI_align8_preserved +.arm + +/* Number of fractional bits to preserve in intermediate results. The + * intermediate storage is 16-bit, and we started with 8 bit data (the integer + * part), so this should be between 0 and 8. + */ +.set FRACTION_BITS, 7 + +.set MAX_R, 25 + + +/* A quick way of making a line of code conditional on some other condition. + * Use `.set cc, 1` or `.set cc, 0` to enable or disable lines prefixed with + * `ifcc`: + */ +.macro ifcc zzz:vararg +.if cc + \zzz +.endif +.endm + +/* It's not always clear that prefetching is beneficial and this needs further + * testing on different cores, so it's made switchable here. + */ +#if defined(ARCH_ARM_USE_BLUR_PRELOAD) +#define VERTPLD(...) pld [__VA_ARGS__] +#else +#define VERTPLD(...) nop +#endif + +/* Fetch 16 columns of bytes (regardless of image format), convolve these + * vertically, and leave them in the register file. If working near the top or + * bottom of an image then clamp the addressing while loading the data in. + * + * The convolution is fully unrolled for windows up to max_r, with the + * outermost edges calculated first. This way it's possible to branch directly + * into the relevant part of the code for an arbitrary convolution radius. Two + * variants of the loop are produced; one eliminates the clamping code for a + * slight speed advantage. + * + * Where the macro is called with reg=x, the specified register is taken to + * contain a pre-calculated pointer into one of the two loops. 
+ * + * Input: + * r1 -- src + * r2 -- pitch + * r5 -- r + * r6 -- rup (r, unless clipped to top of source image) + * r7 -- rdn (r, unless clipped to bottom of source image) + * r12 -- switch index + * q0-q3 -- coefficient table + * Output: + * r1 += 16 + * q10,q11 -- 16 convolved columns + * Modifies: + * r10 = upper row pointer + * r11 = lower row pointer + * q12-q15 = temporary sums + */ +.macro fetch, max_r=MAX_R, labelc=1, labelnc=2, reg=r12 /*{{{*/ + .ifc \reg,r12 ; .set cc, 1 ; .else ; .set cc, 0 ; .endif + + vld1.8 {d30,d31}, [r1] + mls r10, r2, r6, r1 + + vmovl.u8 q14, d30 + VERTPLD(r1, #32) + vmovl.u8 q15, d31 + .if \max_r < 16 // approximate + ifcc adr \reg, 1f + .else + ifcc ldr \reg, 2f +1: ifcc add \reg, \reg, pc + .endif + + vmull.u16 q12, d28, d0[0] + ifcc sub \reg, r5, LSL #6 + vmull.u16 q13, d29, d0[0] + mla r11, r2, r7, r1 + vmull.u16 q14, d30, d0[0] + add r1, r1, #16 + vmull.u16 q15, d31, d0[0] + bx \reg + + ifcc .align 2 + 2: ifcc .word 1f-1b-8 + + /* This version of the vertical fetch loop body is used away from the edges + * of the source image. The pointers start at the top and bottom source rows + * and work their way towards the centre on each iteration. This way the + * number of taps used can be controlled by jumping directly into the middle + * of the loop and running to completion. + * If the loop body changes size then the code which calculates the address of + * the initial iteration must be updated to accordingly. + */ + .macro vertfetch_noclamp i, dreg + .if 0 < \i && \i <= \max_r + vld1.8 {d20,d21}, [r10], r2 + vld1.8 {d22,d23}, [r11] + sub r11, r11, r2 + vswp d21, d22 + VERTPLD(r10, #32) + vaddl.u8 q10, d20, d21 + vaddl.u8 q11, d22, d23 + vmlal.u16 q12, d20, \dreg + VERTPLD(r11, #32) + vmlal.u16 q13, d21, \dreg + vmlal.u16 q14, d22, \dreg + vmlal.u16 q15, d23, \dreg + .endif + .endm + + /* This version of the vertical fetch loop body is used near the edges of the + * source image, where one or both of the accesses may start with a clamped + * value, and the row addresses only begin to change after some number of + * iterations before the end. + * If the loop body changes size then the code which calculates the address of + * the initial iteration must be updated to accordingly. + */ + .macro vertfetch_clamped i, dreg + .if 0 < \i && \i <= \max_r + vld1.8 {d20,d21}, [r10] + vld1.8 {d22,d23}, [r11] + cmp r6, #\i + vswp d21, d22 + VERTPLD(r10, #32) + vaddl.u8 q10, d20, d21 + addhs r10, r10, r2 + vaddl.u8 q11, d22, d23 + cmp r7, #\i + vmlal.u16 q12, d20, \dreg + VERTPLD(r11, #32) + vmlal.u16 q13, d21, \dreg + subhs r11, r11, r2 + vmlal.u16 q14, d22, \dreg + nop + vmlal.u16 q15, d23, \dreg + .endif + .endm + + /* Entry into this unrolled loop is computed as a negative index from + * \labelc at the end of the block. 
+ */ + .align 4 + vertfetch_clamped 27, d6[3] + vertfetch_clamped 26, d6[2] + vertfetch_clamped 25, d6[1] + vertfetch_clamped 24, d6[0] + vertfetch_clamped 23, d5[3] + vertfetch_clamped 22, d5[2] + vertfetch_clamped 21, d5[1] + vertfetch_clamped 20, d5[0] + vertfetch_clamped 19, d4[3] + vertfetch_clamped 18, d4[2] + vertfetch_clamped 17, d4[1] + vertfetch_clamped 16, d4[0] + vertfetch_clamped 15, d3[3] + vertfetch_clamped 14, d3[2] + vertfetch_clamped 13, d3[1] + vertfetch_clamped 12, d3[0] + vertfetch_clamped 11, d2[3] + vertfetch_clamped 10, d2[2] + vertfetch_clamped 9, d2[1] + vertfetch_clamped 8, d2[0] + vertfetch_clamped 7, d1[3] + vertfetch_clamped 6, d1[2] + vertfetch_clamped 5, d1[1] + vertfetch_clamped 4, d1[0] + vertfetch_clamped 3, d0[3] + vertfetch_clamped 2, d0[2] + vertfetch_clamped 1, d0[1] + vertfetch_clamped 0, d0[0] + 1: + \labelc : b 2f /* done with clamped loop, skip over non-clamped loop */ + + /* Entry into this unrolled loop is computed as a negative index from + * \labelnc at the end of the block. + */ + .align 4 + vertfetch_noclamp 27, d6[3] + vertfetch_noclamp 26, d6[2] + vertfetch_noclamp 25, d6[1] + vertfetch_noclamp 24, d6[0] + vertfetch_noclamp 23, d5[3] + vertfetch_noclamp 22, d5[2] + vertfetch_noclamp 21, d5[1] + vertfetch_noclamp 20, d5[0] + vertfetch_noclamp 19, d4[3] + vertfetch_noclamp 18, d4[2] + vertfetch_noclamp 17, d4[1] + vertfetch_noclamp 16, d4[0] + vertfetch_noclamp 15, d3[3] + vertfetch_noclamp 14, d3[2] + vertfetch_noclamp 13, d3[1] + vertfetch_noclamp 12, d3[0] + vertfetch_noclamp 11, d2[3] + vertfetch_noclamp 10, d2[2] + vertfetch_noclamp 9, d2[1] + vertfetch_noclamp 8, d2[0] + vertfetch_noclamp 7, d1[3] + vertfetch_noclamp 6, d1[2] + vertfetch_noclamp 5, d1[1] + vertfetch_noclamp 4, d1[0] + vertfetch_noclamp 3, d0[3] + vertfetch_noclamp 2, d0[2] + vertfetch_noclamp 1, d0[1] + vertfetch_noclamp 0, d0[0] + \labelnc : + + .purgem vertfetch_clamped + .purgem vertfetch_noclamp + + 2: vqrshrn.u32 d20, q12, #16 - FRACTION_BITS + vqrshrn.u32 d21, q13, #16 - FRACTION_BITS + vqrshrn.u32 d22, q14, #16 - FRACTION_BITS + vqrshrn.u32 d23, q15, #16 - FRACTION_BITS +.endm /*}}}*/ + +/* Some portion of the convolution window (as much as will fit, and all of it + * for the uchar1 cases) is kept in the register file to avoid unnecessary + * memory accesses. This forces the horizontal loops to be unrolled because + * there's no indexed addressing into the register file. + * + * As in the fetch macro, the operations are ordered from outside to inside, so + * that jumping into the middle of the block bypasses the unwanted window taps. + * + * There are several variants of the macro because of the fixed offsets of the + * taps -- the wider the maximum radius the further the centre tap is from the + * most recently fetched data. This means that pre-filling the window requires + * more data that won't be used and it means that rotating the window involves + * more mov operations. + * + * When the window gets too big the buffer at [r9] is used. + * + * Input: + * q4-q11 -- convolution window + * r9 -- pointer to additional convolution window data + * Output: + * r9 -- updated buffer pointer (if used) + * d31 -- result to be stored + * Modifies: + * r12 -- temp buffer pointer + * q12-q13 -- temporaries for load and vext operations.
+ * q14-q15 -- intermediate sums + */ +#define TUNED_LIST1 8, 16 +.macro hconv1_8/*{{{*/ + vmull.u16 q14, d18, d0[0] + vmull.u16 q15, d19, d0[0] + + ldr r12, [pc, r5, LSL #2] + add pc, pc, r12 + bkpt + 100: .word 101f-100b + .word 102f-100b + .word 103f-100b + .word 104f-100b + .word 105f-100b + .word 106f-100b + .word 107f-100b + .word 108f-100b + 108: vmlal.u16 q14, d16, d2[0] + vmlal.u16 q15, d17, d2[0] + vmlal.u16 q14, d20, d2[0] + vmlal.u16 q15, d21, d2[0] + 107: vext.u16 q12, q8, q9, #1 + vext.u16 q13, q9, q10, #7 + vmlal.u16 q14, d24, d1[3] + vmlal.u16 q15, d25, d1[3] + vmlal.u16 q14, d26, d1[3] + vmlal.u16 q15, d27, d1[3] + 106: vext.u16 q12, q8, q9, #2 + vext.u16 q13, q9, q10, #6 + vmlal.u16 q14, d24, d1[2] + vmlal.u16 q15, d25, d1[2] + vmlal.u16 q14, d26, d1[2] + vmlal.u16 q15, d27, d1[2] + 105: vext.u16 q12, q8, q9, #3 + vext.u16 q13, q9, q10, #5 + vmlal.u16 q14, d24, d1[1] + vmlal.u16 q15, d25, d1[1] + vmlal.u16 q14, d26, d1[1] + vmlal.u16 q15, d27, d1[1] + 104: //vext.u16 q12, q8, q9, #4 + //vext.u16 q13, q9, q10, #4 + vmlal.u16 q14, d17, d1[0] + vmlal.u16 q15, d18, d1[0] + vmlal.u16 q14, d19, d1[0] + vmlal.u16 q15, d20, d1[0] + 103: vext.u16 q12, q8, q9, #5 + vext.u16 q13, q9, q10, #3 + vmlal.u16 q14, d24, d0[3] + vmlal.u16 q15, d25, d0[3] + vmlal.u16 q14, d26, d0[3] + vmlal.u16 q15, d27, d0[3] + 102: vext.u16 q12, q8, q9, #6 + vext.u16 q13, q9, q10, #2 + vmlal.u16 q14, d24, d0[2] + vmlal.u16 q15, d25, d0[2] + vmlal.u16 q14, d26, d0[2] + vmlal.u16 q15, d27, d0[2] + 101: vext.u16 q12, q8, q9, #7 + vext.u16 q13, q9, q10, #1 + vmlal.u16 q14, d24, d0[1] + vmlal.u16 q15, d25, d0[1] + vmlal.u16 q14, d26, d0[1] + vmlal.u16 q15, d27, d0[1] + + vqrshrn.u32 d28, q14, #16 + vqrshrn.u32 d29, q15, #16 + vqrshrn.u16 d31, q14, #FRACTION_BITS + + vmov q8, q9 + vmov q9, q10 + vmov q10, q11 +.endm/*}}}*/ + +.macro hconv1_16/*{{{*/ + vmull.u16 q14, d16, d0[0] + vmull.u16 q15, d17, d0[0] + + ldr r12, [pc, r5, LSL #2] + add pc, pc, r12 + bkpt + 100: .word 101f-100b + .word 102f-100b + .word 103f-100b + .word 104f-100b + .word 105f-100b + .word 106f-100b + .word 107f-100b + .word 108f-100b + .word 109f-100b + .word 110f-100b + .word 111f-100b + .word 112f-100b + .word 113f-100b + .word 114f-100b + .word 115f-100b + .word 116f-100b + 116: //vext.u16 q12, q6, q7, #0 + //vext.u16 q13, q10, q11, #0 + vmlal.u16 q14, d12, d4[0] + vmlal.u16 q15, d13, d4[0] + vmlal.u16 q14, d20, d4[0] + vmlal.u16 q15, d21, d4[0] + 115: vext.u16 q12, q6, q7, #1 + vext.u16 q13, q9, q10, #7 + vmlal.u16 q14, d24, d3[3] + vmlal.u16 q15, d25, d3[3] + vmlal.u16 q14, d26, d3[3] + vmlal.u16 q15, d27, d3[3] + 114: vext.u16 q12, q6, q7, #2 + vext.u16 q13, q9, q10, #6 + vmlal.u16 q14, d24, d3[2] + vmlal.u16 q15, d25, d3[2] + vmlal.u16 q14, d26, d3[2] + vmlal.u16 q15, d27, d3[2] + 113: vext.u16 q12, q6, q7, #3 + vext.u16 q13, q9, q10, #5 + vmlal.u16 q14, d24, d3[1] + vmlal.u16 q15, d25, d3[1] + vmlal.u16 q14, d26, d3[1] + vmlal.u16 q15, d27, d3[1] + 112: //vext.u16 q12, q6, q7, #4 + //vext.u16 q13, q9, q10, #4 + vmlal.u16 q14, d13, d3[0] + vmlal.u16 q15, d14, d3[0] + vmlal.u16 q14, d19, d3[0] + vmlal.u16 q15, d20, d3[0] + 111: vext.u16 q12, q6, q7, #5 + vext.u16 q13, q9, q10, #3 + vmlal.u16 q14, d24, d2[3] + vmlal.u16 q15, d25, d2[3] + vmlal.u16 q14, d26, d2[3] + vmlal.u16 q15, d27, d2[3] + 110: vext.u16 q12, q6, q7, #6 + vext.u16 q13, q9, q10, #2 + vmlal.u16 q14, d24, d2[2] + vmlal.u16 q15, d25, d2[2] + vmlal.u16 q14, d26, d2[2] + vmlal.u16 q15, d27, d2[2] + 109: vext.u16 q12, q6, q7, #7 + vext.u16 q13, q9, q10, #1 + vmlal.u16 q14, 
d24, d2[1] + vmlal.u16 q15, d25, d2[1] + vmlal.u16 q14, d26, d2[1] + vmlal.u16 q15, d27, d2[1] + 108: //vext.u16 q12, q7, q8, #0 + //vext.u16 q13, q9, q10, #0 + vmlal.u16 q14, d14, d2[0] + vmlal.u16 q15, d15, d2[0] + vmlal.u16 q14, d18, d2[0] + vmlal.u16 q15, d19, d2[0] + 107: vext.u16 q12, q7, q8, #1 + vext.u16 q13, q8, q9, #7 + vmlal.u16 q14, d24, d1[3] + vmlal.u16 q15, d25, d1[3] + vmlal.u16 q14, d26, d1[3] + vmlal.u16 q15, d27, d1[3] + 106: vext.u16 q12, q7, q8, #2 + vext.u16 q13, q8, q9, #6 + vmlal.u16 q14, d24, d1[2] + vmlal.u16 q15, d25, d1[2] + vmlal.u16 q14, d26, d1[2] + vmlal.u16 q15, d27, d1[2] + 105: vext.u16 q12, q7, q8, #3 + vext.u16 q13, q8, q9, #5 + vmlal.u16 q14, d24, d1[1] + vmlal.u16 q15, d25, d1[1] + vmlal.u16 q14, d26, d1[1] + vmlal.u16 q15, d27, d1[1] + 104: //vext.u16 q12, q7, q8, #4 + //vext.u16 q13, q8, q9, #4 + vmlal.u16 q14, d15, d1[0] + vmlal.u16 q15, d16, d1[0] + vmlal.u16 q14, d17, d1[0] + vmlal.u16 q15, d18, d1[0] + 103: vext.u16 q12, q7, q8, #5 + vext.u16 q13, q8, q9, #3 + vmlal.u16 q14, d24, d0[3] + vmlal.u16 q15, d25, d0[3] + vmlal.u16 q14, d26, d0[3] + vmlal.u16 q15, d27, d0[3] + 102: vext.u16 q12, q7, q8, #6 + vext.u16 q13, q8, q9, #2 + vmlal.u16 q14, d24, d0[2] + vmlal.u16 q15, d25, d0[2] + vmlal.u16 q14, d26, d0[2] + vmlal.u16 q15, d27, d0[2] + 101: vext.u16 q12, q7, q8, #7 + vext.u16 q13, q8, q9, #1 + vmlal.u16 q14, d24, d0[1] + vmlal.u16 q15, d25, d0[1] + vmlal.u16 q14, d26, d0[1] + vmlal.u16 q15, d27, d0[1] + + vqrshrn.u32 d28, q14, #16 + vqrshrn.u32 d29, q15, #16 + vqrshrn.u16 d31, q14, #FRACTION_BITS + + vmov q6, q7 + vmov q7, q8 + vmov q8, q9 + vmov q9, q10 + vmov q10, q11 +.endm/*}}}*/ + +.macro hconv1_25/*{{{*/ + vext.u16 q12, q6, q7, #7 + vmull.u16 q14, d24, d0[0] + vmull.u16 q15, d25, d0[0] + + ldr r12, [pc, r5, LSL #2] + add pc, pc, r12 + bkpt + 100: .word 101f-100b + .word 102f-100b + .word 103f-100b + .word 104f-100b + .word 105f-100b + .word 106f-100b + .word 107f-100b + .word 108f-100b + .word 109f-100b + .word 110f-100b + .word 111f-100b + .word 112f-100b + .word 113f-100b + .word 114f-100b + .word 115f-100b + .word 116f-100b + .word 117f-100b + .word 118f-100b + .word 119f-100b + .word 120f-100b + .word 121f-100b + .word 122f-100b + .word 123f-100b + .word 124f-100b + .word 125f-100b + 125: vext.u16 q12, q3, q4, #6 + vext.u16 q13, q10, q11, #0 + vmlal.u16 q14, d24, d6[1] + vmlal.u16 q15, d25, d6[1] + vmlal.u16 q14, d26, d6[1] + vmlal.u16 q15, d27, d6[1] + 124: vext.u16 q12, q3, q4, #7 + vext.u16 q13, q9, q10, #7 + vmlal.u16 q14, d24, d6[0] + vmlal.u16 q15, d25, d6[0] + vmlal.u16 q14, d26, d6[0] + vmlal.u16 q15, d27, d6[0] + 123: vext.u16 q12, q4, q5, #0 + vext.u16 q13, q9, q10, #6 + vmlal.u16 q14, d24, d5[3] + vmlal.u16 q15, d25, d5[3] + vmlal.u16 q14, d26, d5[3] + vmlal.u16 q15, d27, d5[3] + 122: vext.u16 q12, q4, q5, #1 + vext.u16 q13, q9, q10, #5 + vmlal.u16 q14, d24, d5[2] + vmlal.u16 q15, d25, d5[2] + vmlal.u16 q14, d26, d5[2] + vmlal.u16 q15, d27, d5[2] + 121: vext.u16 q12, q4, q5, #2 + vext.u16 q13, q9, q10, #4 + vmlal.u16 q14, d24, d5[1] + vmlal.u16 q15, d25, d5[1] + vmlal.u16 q14, d26, d5[1] + vmlal.u16 q15, d27, d5[1] + 120: vext.u16 q12, q4, q5, #3 + vext.u16 q13, q9, q10, #3 + vmlal.u16 q14, d24, d5[0] + vmlal.u16 q15, d25, d5[0] + vmlal.u16 q14, d26, d5[0] + vmlal.u16 q15, d27, d5[0] + 119: vext.u16 q12, q4, q5, #4 + vext.u16 q13, q9, q10, #2 + vmlal.u16 q14, d24, d4[3] + vmlal.u16 q15, d25, d4[3] + vmlal.u16 q14, d26, d4[3] + vmlal.u16 q15, d27, d4[3] + 118: vext.u16 q12, q4, q5, #5 + vext.u16 q13, q9, q10, #1 + vmlal.u16 
q14, d24, d4[2] + vmlal.u16 q15, d25, d4[2] + vmlal.u16 q14, d26, d4[2] + vmlal.u16 q15, d27, d4[2] + 117: vext.u16 q12, q4, q5, #6 + vext.u16 q13, q9, q10, #0 + vmlal.u16 q14, d24, d4[1] + vmlal.u16 q15, d25, d4[1] + vmlal.u16 q14, d26, d4[1] + vmlal.u16 q15, d27, d4[1] + 116: vext.u16 q12, q4, q5, #7 + vext.u16 q13, q8, q9, #7 + vmlal.u16 q14, d24, d4[0] + vmlal.u16 q15, d25, d4[0] + vmlal.u16 q14, d26, d4[0] + vmlal.u16 q15, d27, d4[0] + 115: vext.u16 q12, q5, q6, #0 + vext.u16 q13, q8, q9, #6 + vmlal.u16 q14, d24, d3[3] + vmlal.u16 q15, d25, d3[3] + vmlal.u16 q14, d26, d3[3] + vmlal.u16 q15, d27, d3[3] + 114: vext.u16 q12, q5, q6, #1 + vext.u16 q13, q8, q9, #5 + vmlal.u16 q14, d24, d3[2] + vmlal.u16 q15, d25, d3[2] + vmlal.u16 q14, d26, d3[2] + vmlal.u16 q15, d27, d3[2] + 113: vext.u16 q12, q5, q6, #2 + vext.u16 q13, q8, q9, #4 + vmlal.u16 q14, d24, d3[1] + vmlal.u16 q15, d25, d3[1] + vmlal.u16 q14, d26, d3[1] + vmlal.u16 q15, d27, d3[1] + 112: vext.u16 q12, q5, q6, #3 + vext.u16 q13, q8, q9, #3 + vmlal.u16 q14, d24, d3[0] + vmlal.u16 q15, d25, d3[0] + vmlal.u16 q14, d26, d3[0] + vmlal.u16 q15, d27, d3[0] + 111: vext.u16 q12, q5, q6, #4 + vext.u16 q13, q8, q9, #2 + vmlal.u16 q14, d24, d2[3] + vmlal.u16 q15, d25, d2[3] + vmlal.u16 q14, d26, d2[3] + vmlal.u16 q15, d27, d2[3] + 110: vext.u16 q12, q5, q6, #5 + vext.u16 q13, q8, q9, #1 + vmlal.u16 q14, d24, d2[2] + vmlal.u16 q15, d25, d2[2] + vmlal.u16 q14, d26, d2[2] + vmlal.u16 q15, d27, d2[2] + 109: vext.u16 q12, q5, q6, #6 + vext.u16 q13, q8, q9, #0 + vmlal.u16 q14, d24, d2[1] + vmlal.u16 q15, d25, d2[1] + vmlal.u16 q14, d26, d2[1] + vmlal.u16 q15, d27, d2[1] + 108: vext.u16 q12, q5, q6, #7 + vext.u16 q13, q7, q8, #7 + vmlal.u16 q14, d24, d2[0] + vmlal.u16 q15, d25, d2[0] + vmlal.u16 q14, d26, d2[0] + vmlal.u16 q15, d27, d2[0] + 107: vext.u16 q12, q6, q7, #0 + vext.u16 q13, q7, q8, #6 + vmlal.u16 q14, d24, d1[3] + vmlal.u16 q15, d25, d1[3] + vmlal.u16 q14, d26, d1[3] + vmlal.u16 q15, d27, d1[3] + 106: vext.u16 q12, q6, q7, #1 + vext.u16 q13, q7, q8, #5 + vmlal.u16 q14, d24, d1[2] + vmlal.u16 q15, d25, d1[2] + vmlal.u16 q14, d26, d1[2] + vmlal.u16 q15, d27, d1[2] + 105: vext.u16 q12, q6, q7, #2 + vext.u16 q13, q7, q8, #4 + vmlal.u16 q14, d24, d1[1] + vmlal.u16 q15, d25, d1[1] + vmlal.u16 q14, d26, d1[1] + vmlal.u16 q15, d27, d1[1] + 104: vext.u16 q12, q6, q7, #3 + vext.u16 q13, q7, q8, #3 + vmlal.u16 q14, d24, d1[0] + vmlal.u16 q15, d25, d1[0] + vmlal.u16 q14, d26, d1[0] + vmlal.u16 q15, d27, d1[0] + 103: vext.u16 q12, q6, q7, #4 + vext.u16 q13, q7, q8, #2 + vmlal.u16 q14, d24, d0[3] + vmlal.u16 q15, d25, d0[3] + vmlal.u16 q14, d26, d0[3] + vmlal.u16 q15, d27, d0[3] + 102: vext.u16 q12, q6, q7, #5 + vext.u16 q13, q7, q8, #1 + vmlal.u16 q14, d24, d0[2] + vmlal.u16 q15, d25, d0[2] + vmlal.u16 q14, d26, d0[2] + vmlal.u16 q15, d27, d0[2] + 101: vext.u16 q12, q6, q7, #6 + vext.u16 q13, q7, q8, #0 + vmlal.u16 q14, d24, d0[1] + vmlal.u16 q15, d25, d0[1] + vmlal.u16 q14, d26, d0[1] + vmlal.u16 q15, d27, d0[1] + + vqrshrn.u32 d28, q14, #16 + vqrshrn.u32 d29, q15, #16 + vqrshrn.u16 d31, q14, #FRACTION_BITS + + vmov d7, d9 + vmov q4, q5 + vmov q5, q6 + vmov q6, q7 + vmov q7, q8 + vmov q8, q9 + vmov q9, q10 + vmov q10, q11 +.endm/*}}}*/ + +#define TUNED_LIST4 6, 12 +.macro hconv4_6/*{{{*/ + vmull.u16 q14, d14, d0[0] + vmull.u16 q15, d15, d0[0] + + ldr r12, [pc, r5, LSL #2] + add pc, pc, r12 + bkpt + 100: .word 101f-100b + .word 102f-100b + .word 103f-100b + .word 104f-100b + .word 105f-100b + .word 106f-100b + 106: vmlal.u16 q14, d8, d1[2] + 
vmlal.u16 q15, d9, d1[2] + vmlal.u16 q14, d20, d1[2] + vmlal.u16 q15, d21, d1[2] + 105: vmlal.u16 q14, d9, d1[1] + vmlal.u16 q15, d10, d1[1] + vmlal.u16 q14, d19, d1[1] + vmlal.u16 q15, d20, d1[1] + 104: vmlal.u16 q14, d10, d1[0] + vmlal.u16 q15, d11, d1[0] + vmlal.u16 q14, d18, d1[0] + vmlal.u16 q15, d19, d1[0] + 103: vmlal.u16 q14, d11, d0[3] + vmlal.u16 q15, d12, d0[3] + vmlal.u16 q14, d17, d0[3] + vmlal.u16 q15, d18, d0[3] + 102: vmlal.u16 q14, d12, d0[2] + vmlal.u16 q15, d13, d0[2] + vmlal.u16 q14, d16, d0[2] + vmlal.u16 q15, d17, d0[2] + 101: vmlal.u16 q14, d13, d0[1] + vmlal.u16 q15, d14, d0[1] + vmlal.u16 q14, d15, d0[1] + vmlal.u16 q15, d16, d0[1] + + vqrshrn.u32 d28, q14, #16 + vqrshrn.u32 d29, q15, #16 + vqrshrn.u16 d31, q14, #FRACTION_BITS + + vmov q4, q5 + vmov q5, q6 + vmov q6, q7 + vmov q7, q8 + vmov q8, q9 + vmov q9, q10 + vmov q10, q11 +.endm/*}}}*/ + +.macro hconv4_12/*{{{*/ + vmull.u16 q14, d8, d0[0] + vmull.u16 q15, d9, d0[0] + + ldr r12, [pc, r5, LSL #2] + add pc, pc, r12 + bkpt + 100: .word 101f-100b + .word 102f-100b + .word 103f-100b + .word 104f-100b + .word 105f-100b + .word 106f-100b + .word 107f-100b + .word 108f-100b + .word 109f-100b + .word 110f-100b + .word 111f-100b + .word 112f-100b + 112: add r12, r9, #0x1a0 + bic r12, r12, #0x200 + vld1.u16 {d24,d25}, [r12:128] + vmlal.u16 q14, d24, d3[0] + vmlal.u16 q15, d25, d3[0] + vmlal.u16 q14, d20, d3[0] + vmlal.u16 q15, d21, d3[0] + 111: add r12, r9, #0x1a8 + bic r12, r12, #0x200 + vld1.u16 {d24}, [r12:64]! + bic r12, r12, #0x200 + vld1.u16 {d25}, [r12:64] + vmlal.u16 q14, d24, d2[3] + vmlal.u16 q15, d25, d2[3] + vmlal.u16 q14, d19, d2[3] + vmlal.u16 q15, d20, d2[3] + 110: add r12, r9, #0x1b0 + bic r12, r12, #0x200 + vld1.u16 {d24,d25}, [r12:128] + vmlal.u16 q14, d24, d2[2] + vmlal.u16 q15, d25, d2[2] + vmlal.u16 q14, d18, d2[2] + vmlal.u16 q15, d19, d2[2] + 109: add r12, r9, #0x1b8 + bic r12, r12, #0x200 + vld1.u16 {d24}, [r12:64]! + bic r12, r12, #0x200 + vld1.u16 {d25}, [r12:64] + vmlal.u16 q14, d24, d2[1] + vmlal.u16 q15, d25, d2[1] + vmlal.u16 q14, d17, d2[1] + vmlal.u16 q15, d18, d2[1] + 108: add r12, r9, #0x1c0 + bic r12, r12, #0x200 + vld1.u16 {d24,d25}, [r12:128] + vmlal.u16 q14, d24, d2[0] + vmlal.u16 q15, d25, d2[0] + vmlal.u16 q14, d16, d2[0] + vmlal.u16 q15, d17, d2[0] + 107: add r12, r9, #0x1c8 + bic r12, r12, #0x200 + vld1.u16 {d24}, [r12:64]! + bic r12, r12, #0x200 + vld1.u16 {d25}, [r12:64] + vmlal.u16 q14, d24, d1[3] + vmlal.u16 q15, d25, d1[3] + vmlal.u16 q14, d15, d1[3] + vmlal.u16 q15, d16, d1[3] + 106: add r12, r9, #0x1d0 + bic r12, r12, #0x200 + vld1.u16 {d24,d25}, [r12:128] + vmlal.u16 q14, d24, d1[2] + vmlal.u16 q15, d25, d1[2] + vmlal.u16 q14, d14, d1[2] + vmlal.u16 q15, d15, d1[2] + 105: add r12, r9, #0x1d8 + bic r12, r12, #0x200 + vld1.u16 {d24}, [r12:64]! + bic r12, r12, #0x200 + vld1.u16 {d25}, [r12:64] + vmlal.u16 q14, d24, d1[1] + vmlal.u16 q15, d25, d1[1] + vmlal.u16 q14, d13, d1[1] + vmlal.u16 q15, d14, d1[1] + 104: add r12, r9, #0x1e0 + bic r12, r12, #0x200 + vld1.u16 {d24,d25}, [r12:128] + vmlal.u16 q14, d24, d1[0] + vmlal.u16 q15, d25, d1[0] + vmlal.u16 q14, d12, d1[0] + vmlal.u16 q15, d13, d1[0] + 103: add r12, r9, #0x1e8 + bic r12, r12, #0x200 + vld1.u16 {d24}, [r12:64]! 
+ bic r12, r12, #0x200 + vld1.u16 {d25}, [r12:64] + vmlal.u16 q14, d24, d0[3] + vmlal.u16 q15, d25, d0[3] + vmlal.u16 q14, d11, d0[3] + vmlal.u16 q15, d12, d0[3] + 102: add r12, r9, #0x1f0 + bic r12, r12, #0x200 + vld1.u16 {d24,d25}, [r12:128] + vmlal.u16 q14, d24, d0[2] + vmlal.u16 q15, d25, d0[2] + vmlal.u16 q14, d10, d0[2] + vmlal.u16 q15, d11, d0[2] + 101: add r12, r9, #0x1f8 + bic r12, r12, #0x200 + vld1.u16 {d24}, [r12:64] + vmlal.u16 q14, d24, d0[1] + vmlal.u16 q15, d8, d0[1] + vmlal.u16 q14, d9, d0[1] + vmlal.u16 q15, d10, d0[1] + + vqrshrn.u32 d28, q14, #16 + vqrshrn.u32 d29, q15, #16 + vqrshrn.u16 d31, q14, #FRACTION_BITS + + vst1.u8 {q4}, [r9:128]! + bic r9, r9, #0x200 + vmov q4, q5 + vmov q5, q6 + vmov q6, q7 + vmov q7, q8 + vmov q8, q9 + vmov q9, q10 + vmov q10, q11 +.endm/*}}}*/ + +.macro hconv4_25/*{{{*/ + add r12, r9, #0x198 + bic r12, r12, #0x200 + vld1.u16 {d24}, [r12:64]! + bic r12, r12, #0x200 + vld1.u16 {d25}, [r12:64] + vmull.u16 q14, d24, d0[0] + vmull.u16 q15, d25, d0[0] + + ldr r12, [pc, r5, LSL #2] + add pc, pc, r12 + bkpt + 100: .word 101f-100b + .word 102f-100b + .word 103f-100b + .word 104f-100b + .word 105f-100b + .word 106f-100b + .word 107f-100b + .word 108f-100b + .word 109f-100b + .word 110f-100b + .word 111f-100b + .word 112f-100b + .word 113f-100b + .word 114f-100b + .word 115f-100b + .word 116f-100b + .word 117f-100b + .word 118f-100b + .word 119f-100b + .word 120f-100b + .word 121f-100b + .word 122f-100b + .word 123f-100b + .word 124f-100b + .word 125f-100b + 125: add r12, r9, #0x0d0 + bic r12, r12, #0x200 + vld1.u16 {d24,d25}, [r12:128] + vmlal.u16 q14, d24, d6[1] + vmlal.u16 q15, d25, d6[1] + vmlal.u16 q14, d20, d6[1] + vmlal.u16 q15, d21, d6[1] + 124: add r12, r9, #0x0d8 + bic r12, r12, #0x200 + vld1.u16 {d24}, [r12:64]! + bic r12, r12, #0x200 + vld1.u16 {d25}, [r12] + vmlal.u16 q14, d24, d6[0] + vmlal.u16 q15, d25, d6[0] + vmlal.u16 q14, d19, d6[0] + vmlal.u16 q15, d20, d6[0] + 123: add r12, r9, #0x0e0 + bic r12, r12, #0x200 + vld1.u16 {d24,d25}, [r12:128] + vmlal.u16 q14, d24, d5[3] + vmlal.u16 q15, d25, d5[3] + vmlal.u16 q14, d18, d5[3] + vmlal.u16 q15, d19, d5[3] + 122: add r12, r9, #0x0e8 + bic r12, r12, #0x200 + vld1.u16 {d24}, [r12:64]! + bic r12, r12, #0x200 + vld1.u16 {d25}, [r12] + vmlal.u16 q14, d24, d5[2] + vmlal.u16 q15, d25, d5[2] + vmlal.u16 q14, d17, d5[2] + vmlal.u16 q15, d18, d5[2] + 121: add r12, r9, #0x0f0 + bic r12, r12, #0x200 + vld1.u16 {d24,d25}, [r12:128] + vmlal.u16 q14, d24, d5[1] + vmlal.u16 q15, d25, d5[1] + vmlal.u16 q14, d16, d5[1] + vmlal.u16 q15, d17, d5[1] + 120: add r12, r9, #0x0f8 + bic r12, r12, #0x200 + vld1.u16 {d24}, [r12:64]! + bic r12, r12, #0x200 + vld1.u16 {d25}, [r12] + vmlal.u16 q14, d24, d5[0] + vmlal.u16 q15, d25, d5[0] + vmlal.u16 q14, d15, d5[0] + vmlal.u16 q15, d16, d5[0] + 119: add r12, r9, #0x100 + bic r12, r12, #0x200 + vld1.u16 {d24,d25}, [r12:128] + vmlal.u16 q14, d24, d4[3] + vmlal.u16 q15, d25, d4[3] + vmlal.u16 q14, d14, d4[3] + vmlal.u16 q15, d15, d4[3] + 118: add r12, r9, #0x108 + bic r12, r12, #0x200 + vld1.u16 {d24}, [r12:64]! + bic r12, r12, #0x200 + vld1.u16 {d25}, [r12] + vmlal.u16 q14, d24, d4[2] + vmlal.u16 q15, d25, d4[2] + vmlal.u16 q14, d13, d4[2] + vmlal.u16 q15, d14, d4[2] + 117: add r12, r9, #0x110 + bic r12, r12, #0x200 + vld1.u16 {d24,d25}, [r12:128] + vmlal.u16 q14, d24, d4[1] + vmlal.u16 q15, d25, d4[1] + vmlal.u16 q14, d12, d4[1] + vmlal.u16 q15, d13, d4[1] + 116: add r12, r9, #0x118 + bic r12, r12, #0x200 + vld1.u16 {d24}, [r12:64]! 
+ bic r12, r12, #0x200 + vld1.u16 {d25}, [r12] + vmlal.u16 q14, d24, d4[0] + vmlal.u16 q15, d25, d4[0] + vmlal.u16 q14, d11, d4[0] + vmlal.u16 q15, d12, d4[0] + 115: add r12, r9, #0x120 + bic r12, r12, #0x200 + vld1.u16 {d24,d25}, [r12:128] + vmlal.u16 q14, d24, d3[3] + vmlal.u16 q15, d25, d3[3] + vmlal.u16 q14, d10, d3[3] + vmlal.u16 q15, d11, d3[3] + 114: add r12, r9, #0x128 + bic r12, r12, #0x200 + vld1.u16 {d24}, [r12:64]! + bic r12, r12, #0x200 + vld1.u16 {d25}, [r12] + vmlal.u16 q14, d24, d3[2] + vmlal.u16 q15, d25, d3[2] + vmlal.u16 q14, d9, d3[2] + vmlal.u16 q15, d10, d3[2] + 113: add r12, r9, #0x130 + bic r12, r12, #0x200 + vld1.u16 {d24,d25}, [r12:128] + vmlal.u16 q14, d24, d3[1] + vmlal.u16 q15, d25, d3[1] + vmlal.u16 q14, d8, d3[1] + vmlal.u16 q15, d9, d3[1] + 112: add r12, r9, #0x138 + bic r12, r12, #0x200 + vld1.u16 {d24}, [r12:64]! + bic r12, r12, #0x200 + vld1.u16 {d25}, [r12] + add r12, r9, #0x1f8 + bic r12, r12, #0x200 + vld1.u16 {d26}, [r12:64] + vmlal.u16 q14, d24, d3[0] + vmlal.u16 q15, d25, d3[0] + vmlal.u16 q14, d26, d3[0] @ Could be d7, without the load, right? + vmlal.u16 q15, d8, d3[0] + 111: add r12, r9, #0x140 + bic r12, r12, #0x200 + vld1.u16 {d24,d25}, [r12:128] + add r12, r9, #0x1f0 + bic r12, r12, #0x200 + vld1.u16 {d26,d27}, [r12:128] + vmlal.u16 q14, d24, d2[3] + vmlal.u16 q15, d25, d2[3] + vmlal.u16 q14, d26, d2[3] + vmlal.u16 q15, d27, d2[3] + 110: add r12, r9, #0x148 + bic r12, r12, #0x200 + vld1.u16 {d24}, [r12:64]! + bic r12, r12, #0x200 + vld1.u16 {d25}, [r12] + add r12, r9, #0x1e8 + bic r12, r12, #0x200 + vld1.u16 {d26}, [r12:64]! + bic r12, r12, #0x200 + vld1.u16 {d27}, [r12:64] + vmlal.u16 q14, d24, d2[2] + vmlal.u16 q15, d25, d2[2] + vmlal.u16 q14, d26, d2[2] + vmlal.u16 q15, d27, d2[2] + 109: add r12, r9, #0x150 + bic r12, r12, #0x200 + vld1.u16 {d24,d25}, [r12:128] + add r12, r9, #0x1e0 + bic r12, r12, #0x200 + vld1.u16 {d26,d27}, [r12:128] + vmlal.u16 q14, d24, d2[1] + vmlal.u16 q15, d25, d2[1] + vmlal.u16 q14, d26, d2[1] + vmlal.u16 q15, d27, d2[1] + 108: add r12, r9, #0x158 + bic r12, r12, #0x200 + vld1.u16 {d24}, [r12:64]! + bic r12, r12, #0x200 + vld1.u16 {d25}, [r12] + add r12, r9, #0x1d8 + bic r12, r12, #0x200 + vld1.u16 {d26}, [r12:64]! + bic r12, r12, #0x200 + vld1.u16 {d27}, [r12:64] + vmlal.u16 q14, d24, d2[0] + vmlal.u16 q15, d25, d2[0] + vmlal.u16 q14, d26, d2[0] + vmlal.u16 q15, d27, d2[0] + 107: add r12, r9, #0x160 + bic r12, r12, #0x200 + vld1.u16 {d24,d25}, [r12:128] + add r12, r9, #0x1d0 + bic r12, r12, #0x200 + vld1.u16 {d26,d27}, [r12:128] + vmlal.u16 q14, d24, d1[3] + vmlal.u16 q15, d25, d1[3] + vmlal.u16 q14, d26, d1[3] + vmlal.u16 q15, d27, d1[3] + 106: add r12, r9, #0x168 + bic r12, r12, #0x200 + vld1.u16 {d24}, [r12:64]! + bic r12, r12, #0x200 + vld1.u16 {d25}, [r12] + add r12, r9, #0x1c8 + bic r12, r12, #0x200 + vld1.u16 {d26}, [r12:64]! + bic r12, r12, #0x200 + vld1.u16 {d27}, [r12:64] + vmlal.u16 q14, d24, d1[2] + vmlal.u16 q15, d25, d1[2] + vmlal.u16 q14, d26, d1[2] + vmlal.u16 q15, d27, d1[2] + 105: add r12, r9, #0x170 + bic r12, r12, #0x200 + vld1.u16 {d24,d25}, [r12:128] + add r12, r9, #0x1c0 + bic r12, r12, #0x200 + vld1.u16 {d26,d27}, [r12:128] + vmlal.u16 q14, d24, d1[1] + vmlal.u16 q15, d25, d1[1] + vmlal.u16 q14, d26, d1[1] + vmlal.u16 q15, d27, d1[1] + 104: add r12, r9, #0x178 + bic r12, r12, #0x200 + vld1.u16 {d24}, [r12:64]! + bic r12, r12, #0x200 + vld1.u16 {d25}, [r12] + add r12, r9, #0x1b8 + bic r12, r12, #0x200 + vld1.u16 {d26}, [r12:64]! 
+ bic r12, r12, #0x200 + vld1.u16 {d27}, [r12:64] + vmlal.u16 q14, d24, d1[0] + vmlal.u16 q15, d25, d1[0] + vmlal.u16 q14, d26, d1[0] + vmlal.u16 q15, d27, d1[0] + 103: add r12, r9, #0x180 + bic r12, r12, #0x200 + vld1.u16 {d24,d25}, [r12:128] + add r12, r9, #0x1b0 + bic r12, r12, #0x200 + vld1.u16 {d26,d27}, [r12:128] + vmlal.u16 q14, d24, d0[3] + vmlal.u16 q15, d25, d0[3] + vmlal.u16 q14, d26, d0[3] + vmlal.u16 q15, d27, d0[3] + 102: add r12, r9, #0x188 + bic r12, r12, #0x200 + vld1.u16 {d24}, [r12:64]! + bic r12, r12, #0x200 + vld1.u16 {d25}, [r12] + add r12, r9, #0x1a8 + bic r12, r12, #0x200 + vld1.u16 {d26}, [r12:64]! + bic r12, r12, #0x200 + vld1.u16 {d27}, [r12:64] + vmlal.u16 q14, d24, d0[2] + vmlal.u16 q15, d25, d0[2] + vmlal.u16 q14, d26, d0[2] + vmlal.u16 q15, d27, d0[2] + 101: add r12, r9, #0x190 + bic r12, r12, #0x200 + vld1.u16 {d24,d25}, [r12:128]! + bic r12, r12, #0x200 + vld1.u16 {d26,d27}, [r12:128] + vmlal.u16 q14, d24, d0[1] + vmlal.u16 q15, d25, d0[1] + vmlal.u16 q14, d26, d0[1] + vmlal.u16 q15, d27, d0[1] + + vqrshrn.u32 d28, q14, #16 + vqrshrn.u32 d29, q15, #16 + vqrshrn.u16 d31, q14, #FRACTION_BITS + + vst1.u8 {q4}, [r9:128]! + bic r9, r9, #0x200 + vmov q4, q5 + vmov q5, q6 + vmov q6, q7 + vmov q7, q8 + vmov q8, q9 + vmov q9, q10 + vmov q10, q11 +.endm/*}}}*/ + +/* Dedicated function wrapper for the fetch macro, for the cases where + * performance isn't that important, to keep code size down. + */ +PRIVATE(fetch_generic_asm) + push {r10,r11} + fetch + pop {r10,r11} + bx lr +END(fetch_generic_asm) + + +/* Fetch the next (16 - (r10 & 15)) columns of data, avoiding reading memory + * beyond that limit, and filling the rest of the vector with the last legal + * pixel. + * Result is in q10 and q11. q8 and q9 are filled with the first legal pixel. + * Note: This function can read beyond the right edge of input if the image is + * narrower than 16 bytes. + */ +PRIVATE(fetch_clampleft1) + push {r12,lr} + bl fetch_generic_asm + vdup.u16 q8, d20[0] + vdup.u16 q9, d20[0] + ands r12, r10, #15 + beq 1f + sub r1, r1, r12 + sub r10, r10, r12 + sub sp, sp, #32 + vst1.u16 {q10,q11}, [sp] + sub r12, sp, r12, LSL #1 + sub sp, sp, #32 + vst1.u16 {q8,q9}, [sp] + vld1.u16 {q10,q11}, [r12] + add sp, sp, #64 +1: pop {r12,pc} +END(fetch_clampleft1) + +PRIVATE(fetch_clampleft4) + push {r12,lr} + bl fetch_generic_asm + vmov.u16 d16, d20 + vmov.u16 d17, d20 + vmov.u16 d18, d20 + vmov.u16 d19, d20 + ands r12, r10, #15 + beq 1f + sub r1, r1, r12 + sub r10, r10, r12 + sub sp, sp, #32 + vst1.u16 {q10-q11}, [sp] + sub r12, sp, r12, LSL #1 + sub sp, sp, #32 + vst1.u16 {q8,q9}, [sp] + vld1.u16 {q10,q11}, [r12] + add sp, sp, #64 +1: pop {r12,pc} +END(fetch_clampleft4) + +/* Fetch only the next (r11 & 15) (where 0 means 16) columns of data, avoiding + * reading memory beyond that limit, and filling the rest of the vector with + * the last legal pixel. + * Result is in q10 and q11. q12 and q13 are filled with the last legal pixel. + * Note: This function can read beyond the left edge of input if the image is + * narrower than 16 bytes. 
+ */ +PRIVATE(fetch_clampright1) + push {r12, lr} + rsb r12, r11, #0 + ands r12, r12, #15 + beq 1f + sub r1, r1, r12 + bl fetch_generic_asm + vdup.u16 q12, d23[3] + vdup.u16 q13, d23[3] + rsb r12, r11, #0 + and r12, r12, #15 + sub sp, sp, #32 + vst1.u16 {q12,q13}, [sp] + sub sp, sp, #32 + add r12, sp, r12, LSL #1 + vst1.u16 {q10,q11}, [sp] + vld1.u16 {q10,q11}, [r12] + add sp, sp, #64 + pop {r12,pc} +1: bl fetch_generic_asm + vdup.u16 q12, d23[3] + vdup.u16 q13, d23[3] + pop {r12,pc} +END(fetch_clampright1) + +PRIVATE(fetch_clampright4) + push {r12, lr} + rsb r12, r11, #0 + ands r12, r12, #15 + beq 1f + sub r1, r1, r12 + bl fetch_generic_asm + vmov.u16 d24, d23 + vmov.u16 d25, d23 + vmov.u16 d26, d23 + vmov.u16 d27, d23 + rsb r12, r11, #0 + and r12, r12, #15 + sub sp, sp, #32 + vst1.u16 {q12-q13}, [sp] + sub sp, sp, #32 + add r12, sp, r12, LSL #1 + vst1.u16 {q10,q11}, [sp] + vld1.u16 {q10,q11}, [r12] + add sp, sp, #64 + pop {r12,pc} +1: bl fetch_generic_asm + vmov.u16 d24, d23 + vmov.u16 d25, d23 + vmov.u16 d26, d23 + vmov.u16 d27, d23 + pop {r12,pc} +END(fetch_clampright4) + +/* Given values in q10 and q11, and an index in r11, sweep the (r11 & 15)th + * value across to fill the rest of the register pair. Used for filling the + * right hand edge of the window when reading too close to the right hand edge + * of the image. + * Also returns a dup-ed copy of the last element in q12 for the tail-fill + * case (this happens incidentally in common path, but must be done + * deliberately in the fast-out path). + */ +PRIVATE(prefill_sweepright1) + ands r12, r11, #15 + beq 1f + sub r12, r12, #1 + sub sp, sp, #64 + vst1.u16 {q10,q11}, [sp] + add r12, sp, r12, LSL #1 + vld1.u16 {d24[],d25[]}, [r12] + vld1.u16 {d26[],d27[]}, [r12] + vst1.u16 {q12,q13}, [r12] + vld1.u16 {q10,q11}, [sp] + add sp, sp, #64 + bx lr +1: vdup.u16 q12, d23[3] + vdup.u16 q13, d23[3] + bx lr +END(prefill_sweepright1) + +PRIVATE(prefill_sweepright4) + ands r12, r11, #15 + beq 1f + sub r12, r12, #4 + sub sp, sp, #64 + vst1.u16 {q10,q11}, [sp] + add r12, sp, r12, LSL #1 + vld1.u64 {d24}, [r12] + vld1.u64 {d25}, [r12] + vld1.u64 {d26}, [r12] + vld1.u64 {d27}, [r12] + vst1.u16 {q12,q13}, [r12] + vld1.u16 {q10,q11}, [sp] + add sp, sp, #64 + bx lr +1: vmov.u16 d24, d23 + vmov.u16 d25, d23 + vmov.u16 d26, d23 + vmov.u16 d27, d23 + bx lr +END(prefill_sweepright4) + +/* The main loop keeps a sliding window of data that has already been convolved + * in the vertical axis for the current line. This usually stays in the + * register file, but spills to memory for large windows. The first thing that + * needs to be done at start-up is to fill this window with image data, taking + * into account the padding needed if the left or right edges of the image fall + * within this window. + */ + +/* Because the window is in the register file writes to it cannot be indexed + * by another register. Consequently the fill loops are unrolled to address + * the registers directly. This macro distinguishes between writes to the + * register file and writes to the spill buffer (indicated by a destination + * register named xx). + */ +.macro prefill_out ra, rb, sra, srb, srb_hi + .ifc \ra,xx + .ifc \rb,xx + vst1.u16 {\sra,\srb}, [r9:128]! 
+ .else + /* this case is used only for the last tap of uchar1 r=25 */ + /* discard \sra */ + vmov.u16 \rb, \srb_hi + .endif + .else + .ifnc \ra,\sra + vmov.u16 \ra, \sra + .endif + .ifnc \rb,\srb + vmov.u16 \rb, \srb + .endif + .endif +.endm + +/* This macro provides the list of registers representing the window, and the + * cases where the register file is too small and a spill buffer is used + * instead. + * Since several specialisations of each function are generated, this also + * culls superfluous iterations, and sets the variable `i` for subsequent + * macros indicating the current index into the window. + */ +.macro prefill_list, macro, nextmacro, max_r, step, label + .macro ifneeded macro, nextmacro, line, nextline, ra, rb, step, label + .if windowsize >= (\line * 16) + .set i, windowsize - (\line * 16) +\label\macro\line: + prefill_\macro \label\nextmacro\line, \label\nextmacro\nextline, \ra, \rb, \step + .endif + .endm + .if \step > 1 + ifneeded \macro \nextmacro, 13, 12, xx, xx, \step, \label + ifneeded \macro \nextmacro, 12, 11, xx, xx, \step, \label + ifneeded \macro \nextmacro, 11, 10, xx, xx, \step, \label + ifneeded \macro \nextmacro, 10, 9, xx, xx, \step, \label + ifneeded \macro \nextmacro, 9, 8, xx, xx, \step, \label + ifneeded \macro \nextmacro, 8, 7, xx, xx, \step, \label + ifneeded \macro \nextmacro, 7, 6, xx, xx, \step, \label + ifneeded \macro \nextmacro, 6, 5, xx, xx, \step, \label + ifneeded \macro \nextmacro, 5, 4, xx, xx, \step, \label + ifneeded \macro \nextmacro, 4, 3, xx, xx, \step, \label + .else + /* q3 normally contains the coefficient table, but it's not fully + * used. In the uchar1, r=25 case the other half of q3 is used for + * the last two window taps to avoid falling out to memory. + */ + ifneeded \macro \nextmacro, 4, 3, xx, d7, \step, \label + .endif + ifneeded \macro \nextmacro, 3, 2, q4, q5, \step, \label + ifneeded \macro \nextmacro, 2, 1, q6, q7, \step, \label + ifneeded \macro \nextmacro, 1, 0, q8, q9, \step, \label + +\label\macro\()0: + b \label\()_end + .purgem ifneeded +.endm + +/* These macros represent the possible stages of filling the window. + * Each macro is unrolled enough times that it can fill the entire window + * itself, but normally it will have to hand control to subsequent macros + * part-way through and this is done using labels named \next and \after, where + * \next is the next macro starting at the same window position and \after is + * the next macro starting after the current window position. + */ + +/* leftfill: v8 and v9 contain the left padding value. While the window + * extends outside of the image on the left-hand side, and at least 16 more + * padding values are needed in the window, store v8 and v9 into the window. + * Otherwise skip forward to storing image data. + */ +.macro prefill_leftfill, next, after, ra, rb, step + cmp r10, #i+16 + blo \next + prefill_out \ra, \rb, q8, q9, d19 +.endm + +/* leftedge: The very first non-fill or partial-fill chunk from the image is + * already loaded (as it was used to calculate the left padding value), so + * store it here, and then drop into the regular load/store cycle in the next + * macro. + */ +.macro prefill_leftedge, next, after, ra, rb, step +1: prefill_out \ra, \rb, q10, q11, d23 + b \after +.endm + +/* dofetch: Copy chunks of the image into the window without any complications + * from edge conditions. 
+ */ +.macro prefill_dofetch, next, after, ra, rb, step + cmp r11, #i+16 + bls \next + bl fetch_generic_asm + prefill_out \ra, \rb, q10, q11, d23 +.endm + +/* rightedge: The last fetch (currently in v10 and v11) may have gone beyond + * the right-hand edge of the image. In that case sweep the last valid pixel + * across the rest of the chunk, and in either case prepare padding data in v12 + * and v13 for the next macro. This is done in fetch_clampright. + * This only happens once before going on to the next macro. + * Sometimes leftedge also covers the rightedge case, in which case this has + * to be skipped altogether. + */ +.macro prefill_rightedge, next, after, ra, rb, step + cmp r11, #i + bls \next + bl fetch_clampright\step + prefill_out \ra, \rb, q10, q11, d23 + b \after +.endm + +/* rightfill: The rest of the window is simply filled with right padding from + * v12 and v13. + */ +.macro prefill_rightfill, next, after, ra, rb, step + prefill_out \ra, \rb, q12, q13, d25 +.endm + +/* Here all of the macros above are unrolled and laid out in the proper order. + */ +.macro prefill_body, max_r, step, label + prefill_list leftfill, leftedge, \max_r, \step, \label + prefill_list leftedge, dofetch, \max_r, \step, \label + prefill_list dofetch, rightedge, \max_r, \step, \label + prefill_list rightedge, rightfill, \max_r, \step, \label + prefill_list rightfill, oops, \max_r, \step, \label +\label\()_end: +.endm + +/* Fill the convolution window with context data. The aim here is to load + * exactly 2*r columns, and in the main loop to read as many columns as will be + * written. This is complicated by the window being divided into chunks at + * register boundaries, and the need to handle cases when the input starts very + * close to the left or right (or both) edges of the image and the need to fill + * the spaces that leaves with left and right edge padding values. + * + * Input: + * r1 -- src + * r2 -- pitch + * r3 -- count + * r4 -- available image data right of src pointer + * r5 -- r + * r6 -- rup + * r7 -- rdn + * r8 -- available image data left of src pointer + * r9 -- buffer (if needed) + * Output: + * r4 -= min(inlen, count + windowsize - centertap) + * r1 += min(inlen, count + windowsize - centertap) + * Modifies: + * r10 -- fill start index in the window + * r11 -- fill stop index in the window + * r12 -- scratch + */ +.macro prefill step=1, max_r=25, label=xx +.set windowsize, (((\max_r + \max_r) * \step + 15) & ~15) +.set centertap, (windowsize - \max_r * \step) + mov r10, #centertap + subs r10, r10, r8 + movlo r10, #0 + + subs r11, r4, #windowsize - centertap + movhs r11, #0 + add r11, r11, #windowsize + + /* r10 indicates where in the window legal image data begins. + * r11 indicates where in the window legal image data ends. + * When starting near the centre of a large image these would be + * zero and windowsize respectively, but when starting near the + * edges this can change. + * When starting on the leftmost pixel, r10 will be centertap. + * When starting on the rightmost pixel, r11 will be centertap+1. + */ + + /* r4 indicates how much data there is between the current pointers + * and the right edge of the image. The pointers currently point + * to the data needed at centertap. The subsequent code will + * consume (windowsize - r10) data, but only the data from + * centertap to windowsize comes out of r4's budget. + */ +1: subs r4, r4, #windowsize - centertap + movlo r4, #0 + + /* And the pointers need to rewind to the start of the window.
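+ * (Illustrative arithmetic only: in the uchar1, r=25 case windowsize is
+ * ((25 + 25) * 1 + 15) & ~15 = 64 and centertap is 64 - 25 = 39, so the
+ * source pointer steps back 39 columns here, then forward again by r10
+ * below if the left edge cut the start of the window short.)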
+ */ + sub r1, r1, #centertap + + /* Unless r8 indicated that there wasn't that much data available. + */ + add r1, r1, r10 + + + /* Get the first chunk, and add padding to align it to the window + * if necessary. + */ + bl fetch_clampleft\step + + /* Sometimes the start and the end of the window are in the same + * chunk. In that case both ends need filler at the outset. + */ + sub r12, r11, #1 + eor r12, r10, r12 + cmp r12, #16 + bllo prefill_sweepright\step + + /* Iterate through all the points in the window and fill them in + * with padding or image data as needed. + */ + prefill_body \max_r, \step, \label +.endm + +/* The main body of the convolve functions. Having already pre-filled the + * convolution window with 2*r input values, the logic settles into a regular + * pattern of reading and writing at a 1:1 rate until either input or output + * expires. The input leads the output by r values, so when processing all the + * way to the right-hand edge, or within r pixels of that edge, the input will + * run out first. In the case of very narrow images, or sub-windows starting + * near the right edge, the input may already have run out while the + * convolution window was being filled and this loop will start with a + * zero-length input. + * + * Once the input runs out, the rest of the output must be processed by padding + * the remainder of the window with the pad value from the last valid pixel + * of the source. + * + * Input: + * r0 = dst + * r1 = src + * r2 = pitch + * r3 = count + * r4 = inlen + * r5 = r + * r6 = rup + * r7 = rdn + * r9 = buffer + * Modifies: + * r8 = fetch code pointer + */ +.macro conv_body core, step=1, max_r=25, labelc="", labelnc="" + + /* If r4 >= r3 then there's no need for clipping. The main loop + * needs to exit when either r3 or r4 runs out, so clamp r4 to be + * no greater than r3 and use r4 for the loop. + * However, if r4 comes out of the loop with less than 16 bytes + * left, a partial read would be necessary to avoid reading beyond + * the end of the image. To avoid this, clamp r4 to the next + * multiple of 16, which is still sufficient to force it out of the + * loop but doesn't imply a rewind. + */ + add r12, r3, #15 + bic r12, r12, #15 + cmp r4, r12 + movhi r4, r12 + + /* First calculate the entry-point into the internal fetch logic. + * This is done so the same function can service several kernel + * sizes. + */ + ldr r8, 3f +1: add r8, r8, pc + sub r8, r5, LSL #5 + sub r8, r5, LSL #4 + cmp r5, r6 + cmpeq r5, r7 + beq 5f + + /* if (r != rup || r != rdn) then the address-clamping table should + * be used rather than the short-cut version. + */ + ldr r8, 3f+4 +2: add r8, r8, pc + sub r8, r5, LSL #6 + b 5f + .align 3 +3: .word \labelnc-1b-8 + .word \labelc-2b-8 + + /* Main loop: ... */ + .align 4 +3: /* first perform a vertical convolution from memory to get the next + * 16 taps of the horizontal window into the register file... + */ + fetch max_r=\max_r, labelc=\labelc, labelnc=\labelnc, reg=r8 + + /* ...then perform a horizontal convolution on that window to + * produce eight output bytes, and slide the window along. + * This has to be done twice to match the 16-way vertical pass. + * It would be preferable to have twice the work done in \core, but + * that would demand yet another variant on those macros and would + * perturb the register allocation severely. + */ + \core + vst1.u8 {d31}, [r0]! + \core + vst1.u8 {d31}, [r0]!
+ + sub r3, r3, #16 +5: subs r4, r4, #16 + bhi 3b + /* Here there's 16 or fewer bytes available before the edge of the + * source image. r4 holds that count minus 16 (because it was + * decremented before the first iteration ran). The last read may + * not be a whole chunk, and beyond that a fill value must be used. + * + * Of course, none of that matters if there's no more output to + * produce... + */ + cmp r3, #0 + beq 5f + + /* Oh well. */ + adds r4, r4, #16 + bne 1f + .if \step==1 + vdup.u16 q10, d19[3] + vdup.u16 q11, d19[3] + .else + vmov.u64 d20, d19 + vmov.u64 d21, d19 + vmov.u64 d22, d19 + vmov.u64 d23, d19 + .endif + b 3f + + /* To avoid reading past the end of the input, rewind pointers by (16-r4) + * to ensure that they're exactly 16 bytes from the edge. + */ +1: mov r11, r4 + bl fetch_clampright\step + /* Now to put this padding to use, perform any remaining + * iterations. This is done at half the rate of the main loop, + * because there's no longer pressure from a 16-lane window filler. + */ +3: \core + .if \step==1 + vdup.u16 q11, d23[3] + .else + vmov.u64 d22, d23 + .endif + subs r3, r3, #8 + blo 4f + vst1.u8 {d31}, [r0]! + bne 3b + b 5f + + /* If the final iteration contained 0 < l < 8 values, then perform + * a piecewise store of the final vector. + */ +4: tst r3, #4 + beq 1f + vst1.u32 {d31[0]}, [r0]! + vext.u8 d31, d31, d31, #4 +1: tst r3, #2 + beq 1f + vst1.u16 {d31[0]}, [r0]! + vext.u8 d31, d31, d31, #2 +1: tst r3, #1 + beq 5f + vst1.u8 {d31[0]}, [r0]! + vext.u8 d31, d31, d31, #1 +5: mov r0, #0 +.endm + +.irp r, TUNED_LIST1, 25 +PRIVATE(convolve1_\r) + push {r12,lr} + + prefill step=1, max_r=\r, label=.Lcnv1_\r + + conv_body core=hconv1_\r, step=1, max_r=\r, labelc=.Lcnv1_\r, labelnc=.Lcnvnc1_\r + + pop {r12,pc} +END(convolve1_\r) +.endr + +.irp r, TUNED_LIST4, 25 +PRIVATE(convolve4_\r) + push {r12,lr} + sub r9, sp, #0x200 + sub sp, sp, #0x200 + 0x400 + bic r9, r9, #0x3fc + + /* r9 now points to a 0x200 byte buffer on the stack whose address + * has the low 10 bits clear. This allows easy address calculation + * in the wrap-around cases. + */ + + prefill step=4, max_r=\r, label=.Lcnv4_\r + + conv_body core=hconv4_\r, step=4, max_r=\r, labelc=.Lcnv4_\r, labelnc=.Lcnvnc4_\r + + add sp, sp, #0x200 + 0x400 + pop {r12,pc} +END(convolve4_\r) +.endr + +/* void rsdIntrinsicBlurU1_K( + * void *out, // r0 + * void *in, // r1 + * size_t w, // r2 + * size_t h, // r3 + * size_t p, // [sp] + * size_t x, // [sp,#4] + * size_t y, // [sp,#8] + * size_t count, // [sp,#12] + * size_t r, // [sp,#16] + * uint16_t *tab); // [sp,#20] + */ +ENTRY(rsdIntrinsicBlurU1_K) + push {r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} + vpush {d8-d15} + ldr r6, [sp,#112] // y + ldr r8, [sp,#108] // x + ldr r5, [sp,#120] // r + sub r4, r2, r8 // inlen = w - x + sub r7, r3, r6 // h - y + ldr r2, [sp,#104] // pitch + ldr r3, [sp,#116] // count + sub r7, r7, #1 // h - y - 1 + + ldr r12, [sp,#124] + + add r1, r1, r8 // src += x + + cmp r6, r5 + movhi r6, r5 // rup = min(r, y) + cmp r7, r5 + movhi r7, r5 // rdn = min(r, h - y - 1) + + vld1.u16 {d0,d1,d2,d3}, [r12]! + vld1.u16 {d4,d5,d6}, [r12]!
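+
+ /* Radius dispatch below, roughly equivalent to this C-style sketch (an
+  * illustration of the .irp that follows, not a literal translation):
+  *
+  *     if (r <= 8)       convolve1_8();
+  *     else if (r <= 16) convolve1_16();
+  *     else              convolve1_25();
+  *
+  * with lr pre-set so each variant returns to the common epilogue at 1:.
+  */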
+ + adr lr, 1f + .irp r, TUNED_LIST1 + cmp r5, #\r + bls convolve1_\r + .endr + b convolve1_25 + +1: vpop {d8-d15} + pop {r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} +END(rsdIntrinsicBlurU1_K) + +/* void rsdIntrinsicBlurU4_K( + * void *out, // r0 + * void *in, // r1 + * size_t w, // r2 + * size_t h, // r3 + * size_t p, // [sp] + * size_t x, // [sp,#4] + * size_t y, // [sp,#8] + * size_t count, // [sp,#12] + * size_t r, // [sp,#16] + * uint16_t *tab); // [sp,#20] + */ +ENTRY(rsdIntrinsicBlurU4_K) + push {r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} + vpush {d8-d15} + ldr r6, [sp,#112] // y + ldr r8, [sp,#108] // x + ldr r5, [sp,#120] // r + lsl r8, r8, #2 + rsb r4, r8, r2, LSL #2 // inlen = (w - x) + sub r7, r3, r6 // h - y + ldr r2, [sp,#104] // pitch + ldr r3, [sp,#116] // count + sub r7, r7, #1 // h - y - 1 + lsl r3, r3, #2 // count + + ldr r12, [sp,#124] + + add r1, r1, r8 // in += x + + cmp r6, r5 + movhi r6, r5 // rup = min(r, y) + cmp r7, r5 + movhi r7, r5 // rdn = min(r, h - y - 1) + + vld1.u16 {d0,d1,d2,d3}, [r12]! + vld1.u16 {d4,d5,d6}, [r12]! + + adr lr, 1f + .irp r, TUNED_LIST4 + cmp r5, #\r + bls convolve4_\r + .endr + b convolve4_25 + +1: vpop {d8-d15} + pop {r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} +END(rsdIntrinsicBlurU4_K) diff --git a/renderscript-toolkit/src/main/cpp/CMakeLists.txt b/renderscript-toolkit/src/main/cpp/CMakeLists.txt new file mode 100644 index 0000000..88aafd8 --- /dev/null +++ b/renderscript-toolkit/src/main/cpp/CMakeLists.txt @@ -0,0 +1,118 @@ +# Copyright (C) 2021 The Android Open Source Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# For more information about using CMake with Android Studio, read the +# documentation: https://d.android.com/studio/projects/add-native-code.html + +# Sets the minimum version of CMake required to build the native library. + +cmake_minimum_required(VERSION 3.10.2) + +# Declares and names the project. + +project("RenderScript Toolkit") + +set(can_use_assembler TRUE) +enable_language(ASM) +add_definitions(-v -DANDROID -DOC_ARM_ASM) + +set(CMAKE_CXX_FLAGS "-Wall -Wextra ${CMAKE_CXX_FLAGS}") + +#message( STATUS "Architecture: ${CMAKE_SYSTEM_PROCESSOR}" ) +#message( STATUS "CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}") +#message( STATUS "CMAKE_CXX_FLAGS_DEBUG: ${CMAKE_CXX_FLAGS_DEBUG}") +#message( STATUS "CMAKE_CXX_FLAGS_RELEASE: ${CMAKE_CXX_FLAGS_RELEASE}") +#set(CMAKE_VERBOSE_MAKEFILE on) +#set(CMAKE_CXX_FLAGS_DEBUG "-O0 -fno-limit-debug-info -g") +#set(CMAKE_CXX_FLAGS_RELEASE "-O2 -Os -DNDEBUG") + +#TODO check that the definitions are all needed. Do they have impact outside of our code? 
+if(CMAKE_SYSTEM_PROCESSOR STREQUAL armv7-a) + add_definitions(-DARCH_ARM_USE_INTRINSICS -DARCH_ARM_HAVE_VFP) + set(ASM_SOURCES + Blend_neon.S + Blur_neon.S + ColorMatrix_neon.S + Convolve_neon.S + Lut3d_neon.S + Resize_neon.S + YuvToRgb_neon.S) +endif() + +if(CMAKE_SYSTEM_PROCESSOR STREQUAL aarch64) + add_definitions(-DARCH_ARM_USE_INTRINSICS -DARCH_ARM64_USE_INTRINSICS -DARCH_ARM64_HAVE_NEON) + set(ASM_SOURCES + Blend_advsimd.S + Blur_advsimd.S + ColorMatrix_advsimd.S + Convolve_advsimd.S + Lut3d_advsimd.S + Resize_advsimd.S + YuvToRgb_advsimd.S) +endif() +# TODO add also for x86 + +# Creates and names a library, sets it as either STATIC +# or SHARED, and provides the relative paths to its source code. +# You can define multiple libraries, and CMake builds them for you. +# Gradle automatically packages shared libraries with your APK. + +add_library(# Sets the name of the library. + renderscript-toolkit + # Sets the library as a shared library. + SHARED + # Provides a relative path to your source file(s). + Blend.cpp + Blur.cpp + ColorMatrix.cpp + Convolve3x3.cpp + Convolve5x5.cpp + Histogram.cpp + JniEntryPoints.cpp + Lut.cpp + Lut3d.cpp + RenderScriptToolkit.cpp + Resize.cpp + TaskProcessor.cpp + Utils.cpp + YuvToRgb.cpp + ${ASM_SOURCES}) + +# Searches for a specified prebuilt library and stores the path as a +# variable. Because CMake includes system libraries in the search path by +# default, you only need to specify the name of the public NDK library +# you want to add. CMake verifies that the library exists before +# completing its build. + +find_library(# Sets the name of the path variable. + log-lib + # Specifies the name of the NDK library that + # you want CMake to locate. + log ) + +# Specifies libraries CMake should link to your target library. You +# can link multiple libraries, such as libraries you define in this +# build script, prebuilt third-party libraries, or system libraries. + +target_link_libraries(# Specifies the target library. + renderscript-toolkit + + cpufeatures + jnigraphics + # Links the target library to the log library + # included in the NDK. + ${log-lib} ) + +include(AndroidNdkModules) +android_ndk_import_module_cpufeatures() diff --git a/renderscript-toolkit/src/main/cpp/ColorMatrix.cpp b/renderscript-toolkit/src/main/cpp/ColorMatrix.cpp new file mode 100644 index 0000000..4aa8ed3 --- /dev/null +++ b/renderscript-toolkit/src/main/cpp/ColorMatrix.cpp @@ -0,0 +1,1064 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "RenderScriptToolkit.h" +#include "TaskProcessor.h" +#include "Utils.h" +#include <cassert> +#include <cstdint> +#include <sys/mman.h> + +namespace renderscript { + +#define LOG_TAG "renderscript.toolkit.ColorMatrix" + +/* uint kernel + * Q0 D0: Load slot for R + * D1: Load slot for G + * Q1 D2: Load slot for B + * D3: Load slot for A + * Q2 D4: Matrix + * D5: = + * Q3 D6: = + * D7: = + * Q4 D8: Add R + * D9: + * Q5 D10: Add G + * D11: + * Q6 D12: Add B + * D13: + * Q7 D14: Add A + * D15: + * Q8 D16: I32: R Sum + * D17: + * Q9 D18: I32: G Sum + * D19: + * Q10 D20: I32: B Sum + * D21: + * Q11 D22: I32: A Sum + * D23: + * Q12 D24: U16: expanded R + * D25: + * Q13 D26: U16: expanded G + * D27: + * Q14 D28: U16: expanded B + * D29: + * Q15 D30: U16: expanded A + * D31: + * + */ + +/* float kernel + * Q0 D0: Load slot for R + * D1: = + * Q1 D2: Load slot for G + * D3: = + * Q2 D4: Load slot for B + * D5: = + * Q3 D6: Load slot for A + * D7: = + * Q4 D8: Matrix + * D9: = + * Q5 D10: = + * D11: = + * Q6 D12: = + * D13: = + * Q7 D14: = + * D15: = + * Q8 D16: Add R + * D17: = + * Q9 D18: Add G + * D19: = + * Q10 D20: Add B + * D21: = + * Q11 D22: Add A + * D23: = + * Q12 D24: Sum R + * D25: = + * Q13 D26: Sum G + * D27: = + * Q14 D28: Sum B + * D29: = + * Q15 D30: Sum A + * D31: = + * + */ + +typedef union { + uint64_t key; + struct { + uint32_t inVecSize :2; // [0 - 1] + uint32_t outVecSize :2; // [2 - 3] + uint32_t inType :4; // [4 - 7] + uint32_t outType :4; // [8 - 11] + uint32_t dot :1; // [12] + uint32_t _unused1 :1; // [13] + uint32_t copyAlpha :1; // [14] + uint32_t _unused2 :1; // [15] + uint32_t coeffMask :16; // [16-31] + uint32_t addMask :4; // [32-35] + } u; +} Key_t; + +/* The two data types and their value, as specified in the RenderScript documentation. + * Only RS_TYPE_UNSIGNED_8 is currently supported. + * + * TODO: The actual values of these constants are likely not important. We may be + * able to simplify the key related code. + */ +const int RS_TYPE_UNSIGNED_8 = 8; +const int RS_TYPE_FLOAT_32 = 2; + +//Re-enable when intrinsic is fixed +#if defined(ARCH_ARM64_USE_INTRINSICS) +typedef struct { + void (*column[4])(); + void (*store)(); + void (*load)(); + void (*store_end)(); + void (*load_end)(); +} FunctionTab_t; + +extern "C" void rsdIntrinsicColorMatrix_int_K( + void *out, void const *in, size_t count, + FunctionTab_t const *fns, + int16_t const *mult, int32_t const *add); + +extern "C" void rsdIntrinsicColorMatrix_float_K( + void *out, void const *in, size_t count, + FunctionTab_t const *fns, + float const *mult, float const *add); + +/* The setup functions fill in function tables to be used by above functions; + * this code also eliminates jump-to-another-jump cases by short-circuiting + * empty functions. While it's not performance critical, it works out easier + * to write the set-up code in assembly than to try to expose the same symbols + * and write the code in C. + */ +extern "C" void rsdIntrinsicColorMatrixSetup_int_K( + FunctionTab_t *fns, + uint32_t mask, int dt, int st); + +extern "C" void rsdIntrinsicColorMatrixSetup_float_K( + FunctionTab_t *fns, + uint32_t mask, int dt, int st); +#endif // ARCH_ARM64_USE_INTRINSICS + +class ColorMatrixTask : public Task { + const void* mIn; + void* mOut; + size_t mInputVectorSize; + uint32_t mOutstep; + uint32_t mInstep; + + float mFp[16]; + float mFpa[4]; + + // The following four fields are read as constants + // by the SIMD assembly code. 
+ int16_t mIp[16]; + int mIpa[4]; + float mTmpFp[16]; + float mTmpFpa[4]; +#if defined(ARCH_ARM64_USE_INTRINSICS) + FunctionTab_t mFnTab; +#endif + + void kernel(uchar* out, uchar* in, uint32_t xstart, uint32_t xend); + void updateCoeffCache(float fpMul, float addMul); + + Key_t mLastKey; + unsigned char* mBuf; + size_t mBufSize; + + bool build(Key_t key); + void (*mOptKernel)(void* dst, const void* src, const int16_t* coef, uint32_t count); + +#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT + Key_t computeKey(size_t inVectorSize, int inType, size_t outVectorSize, int outType); + void preLaunch(size_t inVectorSize, int inType, size_t outVectorSize, int outType); +#else + Key_t computeKey(size_t inVectorSize, size_t outVectorSize); + void preLaunch(size_t inVectorSize, size_t outVectorSize); +#endif // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT + + // Process a 2D tile of the overall work. threadIndex identifies which thread does the work. + void processData(int threadIndex, size_t startX, size_t startY, size_t endX, + size_t endY) override; + + public: + ColorMatrixTask(const void* in, void* out, size_t inputVectorSize, size_t outputVectorSize, + size_t sizeX, size_t sizeY, const float* matrix, const float* addVector, + const Restriction* restriction) + : Task{sizeX, sizeY, outputVectorSize, true, restriction}, + mIn{in}, + mOut{out}, + mInputVectorSize{inputVectorSize} { + mLastKey.key = 0; + mBuf = nullptr; + mBufSize = 0; + mOptKernel = nullptr; + + mOutstep = paddedSize(outputVectorSize); + mInstep = paddedSize(inputVectorSize); + + memcpy(mFp, matrix, sizeof(mFp)); + memcpy(mFpa, addVector, sizeof(mFpa)); +#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT + // For float support, we'll have to pass the type in the constructor too. + preLaunch(inputVectorSize, RS_TYPE_UNSIGNED_8, outputVectorSize, RS_TYPE_UNSIGNED_8); +#else + preLaunch(inputVectorSize, outputVectorSize); +#endif // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT + } + ~ColorMatrixTask() { + if (mBuf) munmap(mBuf, mBufSize); + mBuf = nullptr; + mOptKernel = nullptr; + } +}; + +#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT +Key_t ColorMatrixTask::computeKey(size_t inVectorSize, int inType, size_t outVectorSize, + int outType) { + Key_t key; + key.key = 0; + + // Compute a unique code key for this operation + + // Add to the key the input and output types + bool hasFloat = false; + if (inType == RS_TYPE_FLOAT_32) { + hasFloat = true; + key.u.inType = RS_TYPE_FLOAT_32; + } + if (outType == RS_TYPE_FLOAT_32) { + hasFloat = true; + key.u.outType = RS_TYPE_FLOAT_32; + } + + // Mask in the bits indicating which coefficients in the + // color matrix are needed. 
+ if (hasFloat) { + for (uint32_t i=0; i < 16; i++) { + if (fabs(mFp[i]) != 0.f) { + key.u.coeffMask |= 1 << i; + } + } + if (fabs(mFpa[0]) != 0.f) key.u.addMask |= 0x1; + if (fabs(mFpa[1]) != 0.f) key.u.addMask |= 0x2; + if (fabs(mFpa[2]) != 0.f) key.u.addMask |= 0x4; + if (fabs(mFpa[3]) != 0.f) key.u.addMask |= 0x8; + + } else { +#else +Key_t ColorMatrixTask::computeKey(size_t inVectorSize, size_t outVectorSize) { + Key_t key; + key.key = 0; + + // Compute a unique code key for this operation + { +#endif // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT + + for (uint32_t i=0; i < 16; i++) { + if (mIp[i] != 0) { + key.u.coeffMask |= 1 << i; + } + } + if (mIpa[0] != 0) key.u.addMask |= 0x1; + if (mIpa[1] != 0) key.u.addMask |= 0x2; + if (mIpa[2] != 0) key.u.addMask |= 0x4; + if (mIpa[3] != 0) key.u.addMask |= 0x8; + } + + // Look for a dot product where the r,g,b columns are the same + if ((mIp[0] == mIp[1]) && (mIp[0] == mIp[2]) && + (mIp[4] == mIp[5]) && (mIp[4] == mIp[6]) && + (mIp[8] == mIp[9]) && (mIp[8] == mIp[10]) && + (mIp[12] == mIp[13]) && (mIp[12] == mIp[14])) { + + if (!key.u.addMask) key.u.dot = 1; + } + + // Is alpha a simple copy? + if (!(key.u.coeffMask & 0x0888) && (mIp[15] == 256) && !(key.u.addMask & 0x8)) { + key.u.copyAlpha = !(key.u.inType || key.u.outType); + } + + //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key); + + switch (inVectorSize) { + case 4: + key.u.inVecSize = 3; + break; + case 3: + key.u.inVecSize = 2; + key.u.coeffMask &= ~0xF000; + break; + case 2: + key.u.inVecSize = 1; + key.u.coeffMask &= ~0xFF00; + break; + default: + key.u.coeffMask &= ~0xFFF0; + break; + } + + switch (outVectorSize) { + case 4: + key.u.outVecSize = 3; + break; + case 3: + key.u.outVecSize = 2; + key.u.coeffMask &= ~0x8888; + key.u.addMask &= 7; + break; + case 2: + key.u.outVecSize = 1; + key.u.coeffMask &= ~0xCCCC; + key.u.addMask &= 3; + break; + default: + key.u.coeffMask &= ~0xEEEE; + key.u.addMask &= 1; + break; + } + + if (key.u.inType && !key.u.outType) { + key.u.addMask |= 1; + if (key.u.outVecSize > 0) key.u.addMask |= 2; + if (key.u.outVecSize > 1) key.u.addMask |= 4; + if (key.u.outVecSize > 2) key.u.addMask |= 8; + } + + //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key); + return key; +} + +#if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS) + +#define DEF_SYM(x) \ + extern "C" uint32_t _N_ColorMatrix_##x; \ + extern "C" uint32_t _N_ColorMatrix_##x##_end; \ + extern "C" uint32_t _N_ColorMatrix_##x##_len; + +DEF_SYM(prefix_i) +DEF_SYM(prefix_f) +DEF_SYM(postfix1) +DEF_SYM(postfix2) + +DEF_SYM(load_u8_4) +DEF_SYM(load_u8_3) +DEF_SYM(load_u8_2) +DEF_SYM(load_u8_1) +DEF_SYM(load_u8f_4) +DEF_SYM(load_u8f_3) +DEF_SYM(load_u8f_2) +DEF_SYM(load_u8f_1) + +DEF_SYM(load_f32_4) +DEF_SYM(load_f32_3) +DEF_SYM(load_f32_2) +DEF_SYM(load_f32_1) + +DEF_SYM(store_u8_4) +DEF_SYM(store_u8_2) +DEF_SYM(store_u8_1) + +DEF_SYM(store_f32_4) +DEF_SYM(store_f32_3) +DEF_SYM(store_f32_2) +DEF_SYM(store_f32_1) +DEF_SYM(store_f32u_4) +DEF_SYM(store_f32u_2) +DEF_SYM(store_f32u_1) + +DEF_SYM(unpack_u8_4) +DEF_SYM(unpack_u8_3) +DEF_SYM(unpack_u8_2) +DEF_SYM(unpack_u8_1) +DEF_SYM(pack_u8_4) +DEF_SYM(pack_u8_3) +DEF_SYM(pack_u8_2) +DEF_SYM(pack_u8_1) +DEF_SYM(dot) +DEF_SYM(add_0_u8) +DEF_SYM(add_1_u8) +DEF_SYM(add_2_u8) +DEF_SYM(add_3_u8) + +#define ADD_CHUNK(x) \ + memcpy(buf, &_N_ColorMatrix_##x, _N_ColorMatrix_##x##_len); \ + buf += _N_ColorMatrix_##x##_len + + +static uint8_t * addBranch(uint8_t *buf, const uint8_t *target,
uint32_t condition) { + size_t off = (target - buf - 8) >> 2; + assert(((off & 0xff000000) == 0) || + ((off & 0xff000000) == 0xff000000)); + + uint32_t op = (condition << 28); + op |= 0xa << 24; // branch + op |= 0xffffff & off; + ((uint32_t *)buf)[0] = op; + return buf + 4; +} + +static uint32_t encodeSIMDRegs(uint32_t vd, uint32_t vn, uint32_t vm) { + assert(vd < 32); + assert(vm < 32); + assert(vn < 32); + + uint32_t op = ((vd & 0xf) << 12) | (((vd & 0x10) >> 4) << 22); + op |= (vm & 0xf) | (((vm & 0x10) >> 4) << 5); + op |= ((vn & 0xf) << 16) | (((vn & 0x10) >> 4) << 7); + return op; +} + +static uint8_t * addVMLAL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, + uint32_t src_d2_s) { + //vmlal.s16 Q#1, D#1, D#2[#] + uint32_t op = 0xf2900240 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3)); + ((uint32_t *)buf)[0] = op; + return buf + 4; +} + +static uint8_t * addVMULL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, + uint32_t src_d2_s) { + //vmull.s16 Q#1, D#1, D#2[#] + uint32_t op = 0xf2900A40 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3)); + ((uint32_t *)buf)[0] = op; + return buf + 4; +} + +static uint8_t * addVQADD_S32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) { + //vqadd.s32 Q#1, Q#1, Q#2 + uint32_t op = 0xf2200050 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1); + ((uint32_t *)buf)[0] = op; + return buf + 4; +} + +static uint8_t * addVMLAL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, + uint32_t src_d2_s) { + //vmlal.f32 Q#1, D#1, D#2[#] + uint32_t op = 0xf3a00140 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4)); + ((uint32_t *)buf)[0] = op; + return buf + 4; +} + +static uint8_t * addVMULL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, + uint32_t src_d2_s) { + //vmull.f32 Q#1, D#1, D#2[#] + uint32_t op = 0xf3a00940 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4)); + ((uint32_t *)buf)[0] = op; + return buf + 4; +} + +static uint8_t * addVORR_32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) { + //vadd.f32 Q#1, D#1, D#2 + uint32_t op = 0xf2200150 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1); + ((uint32_t *)buf)[0] = op; + return buf + 4; +} + +static uint8_t * addVMOV_32(uint8_t *buf, uint32_t dest_q, uint32_t imm) { + //vmov.32 Q#1, #imm + assert(imm == 0); + (void) imm; // Avoid unused parameter warnings for non-debug builds + uint32_t op = 0xf2800050 | encodeSIMDRegs(dest_q << 1, 0, 0); + ((uint32_t *)buf)[0] = op; + return buf + 4; +} + +static uint8_t * addVADD_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) { + //vadd.f32 Q#1, D#1, D#2 + uint32_t op = 0xf2000d40 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1); + ((uint32_t *)buf)[0] = op; + return buf + 4; +} +#endif + +#if defined(ARCH_X86_HAVE_SSSE3) +extern void rsdIntrinsicColorMatrixDot_K(void *dst, const void *src, + const int16_t *coef, uint32_t count); +extern void rsdIntrinsicColorMatrix3x3_K(void *dst, const void *src, + const int16_t *coef, uint32_t count); +extern void rsdIntrinsicColorMatrix4x4_K(void *dst, const void *src, + const int16_t *coef, uint32_t count); + +void * selectKernel(Key_t key) +{ + void * kernel = nullptr; + + // inType, outType float if nonzero + if (!(key.u.inType || key.u.outType)) { + if (key.u.dot) + kernel = (void *)rsdIntrinsicColorMatrixDot_K; + else if (key.u.copyAlpha) + kernel = (void *)rsdIntrinsicColorMatrix3x3_K; + else + kernel = (void 
*)rsdIntrinsicColorMatrix4x4_K; + } + + return kernel; +} +#endif + +bool ColorMatrixTask::build(Key_t key) { +#if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS) + mBufSize = 4096; + //StopWatch build_time("rs cm: build time"); + mBuf = (uint8_t *)mmap(0, mBufSize, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON, -1, 0); + if (mBuf == MAP_FAILED) { + mBuf = NULL; + return false; + } + + uint8_t *buf = mBuf; + uint8_t *buf2 = nullptr; + + int ops[5][4]; // 0=unused, 1 = set, 2 = accumulate, 3 = final + int opInit[4] = {0, 0, 0, 0}; + + memset(ops, 0, sizeof(ops)); + for (int i=0; i < 4; i++) { + if (key.u.coeffMask & (1 << (i*4))) { + ops[i][0] = 0x2 | opInit[0]; + opInit[0] = 1; + } + if (!key.u.dot) { + if (key.u.coeffMask & (1 << (1 + i*4))) { + ops[i][1] = 0x2 | opInit[1]; + opInit[1] = 1; + } + if (key.u.coeffMask & (1 << (2 + i*4))) { + ops[i][2] = 0x2 | opInit[2]; + opInit[2] = 1; + } + } + if (!key.u.copyAlpha) { + if (key.u.coeffMask & (1 << (3 + i*4))) { + ops[i][3] = 0x2 | opInit[3]; + opInit[3] = 1; + } + } + } + + if (key.u.inType || key.u.outType) { + key.u.copyAlpha = 0; + ADD_CHUNK(prefix_f); + buf2 = buf; + + // Load the incoming r,g,b,a as needed + if (key.u.inType) { + switch(key.u.inVecSize) { + case 3: + ADD_CHUNK(load_f32_4); + break; + case 2: + ADD_CHUNK(load_f32_3); + break; + case 1: + ADD_CHUNK(load_f32_2); + break; + case 0: + ADD_CHUNK(load_f32_1); + break; + } + } else { + switch(key.u.inVecSize) { + case 3: + ADD_CHUNK(load_u8f_4); + break; + case 2: + ADD_CHUNK(load_u8f_3); + break; + case 1: + ADD_CHUNK(load_u8f_2); + break; + case 0: + ADD_CHUNK(load_u8f_1); + break; + } + } + + for (int i=0; i < 4; i++) { + for (int j=0; j < 4; j++) { + switch(ops[i][j]) { + case 0: + break; + case 2: + buf = addVMULL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1); + break; + case 3: + buf = addVMLAL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1); + break; + } + } + } + for (int j=0; j < 4; j++) { + if (opInit[j]) { + if (key.u.addMask & (1 << j)) { + buf = addVADD_F32(buf, j, 12+j, 8+j); + } else { + buf = addVORR_32(buf, j, 12+j, 12+j); + } + } else { + if (key.u.addMask & (1 << j)) { + buf = addVORR_32(buf, j, 8+j, 8+j); + } else { + buf = addVMOV_32(buf, j, 0); + } + } + } + + if (key.u.outType) { + switch(key.u.outVecSize) { + case 3: + ADD_CHUNK(store_f32_4); + break; + case 2: + ADD_CHUNK(store_f32_3); + break; + case 1: + ADD_CHUNK(store_f32_2); + break; + case 0: + ADD_CHUNK(store_f32_1); + break; + } + } else { + switch(key.u.outVecSize) { + case 3: + case 2: + ADD_CHUNK(store_f32u_4); + break; + case 1: + ADD_CHUNK(store_f32u_2); + break; + case 0: + ADD_CHUNK(store_f32u_1); + break; + } + } + + + } else { + // Add the function prefix + // Store the address for the loop return + ADD_CHUNK(prefix_i); + buf2 = buf; + + // Load the incoming r,g,b,a as needed + switch(key.u.inVecSize) { + case 3: + ADD_CHUNK(load_u8_4); + if (key.u.copyAlpha) { + ADD_CHUNK(unpack_u8_3); + } else { + ADD_CHUNK(unpack_u8_4); + } + break; + case 2: + ADD_CHUNK(load_u8_3); + ADD_CHUNK(unpack_u8_3); + break; + case 1: + ADD_CHUNK(load_u8_2); + ADD_CHUNK(unpack_u8_2); + break; + case 0: + ADD_CHUNK(load_u8_1); + ADD_CHUNK(unpack_u8_1); + break; + } + + // Add multiply and accumulate + // use MULL to init the output register, + // use MLAL from there + for (int i=0; i < 4; i++) { + for (int j=0; j < 4; j++) { + switch(ops[i][j]) { + case 0: + break; + case 2: + buf = addVMULL_S16(buf, 8+j, 24+i*2, 4+i, j); + break; + case 3: + buf = addVMLAL_S16(buf, 8+j, 24+i*2, 4+i, 
j); + break; + } + } + } + for (int j=0; j < 4; j++) { + if (opInit[j]) { + if (key.u.addMask & (1 << j)) { + buf = addVQADD_S32(buf, 8+j, 8+j, 4+j); + } + } else { + if (key.u.addMask & (1 << j)) { + buf = addVORR_32(buf, 8+j, 4+j, 4+j); + } + } + } + + // If we have a dot product, perform the special pack. + if (key.u.dot) { + ADD_CHUNK(pack_u8_1); + ADD_CHUNK(dot); + } else { + switch(key.u.outVecSize) { + case 3: + if (key.u.copyAlpha) { + ADD_CHUNK(pack_u8_3); + } else { + ADD_CHUNK(pack_u8_4); + } + break; + case 2: + ADD_CHUNK(pack_u8_3); + break; + case 1: + ADD_CHUNK(pack_u8_2); + break; + case 0: + ADD_CHUNK(pack_u8_1); + break; + } + } + + // Write out result + switch(key.u.outVecSize) { + case 3: + case 2: + ADD_CHUNK(store_u8_4); + break; + case 1: + ADD_CHUNK(store_u8_2); + break; + case 0: + ADD_CHUNK(store_u8_1); + break; + } + } + + if (key.u.inType != key.u.outType) { + key.u.copyAlpha = 0; + key.u.dot = 0; + } + + // Loop, branch, and cleanup + ADD_CHUNK(postfix1); + buf = addBranch(buf, buf2, 0x01); + ADD_CHUNK(postfix2); + + int ret = mprotect(mBuf, mBufSize, PROT_READ | PROT_EXEC); + if (ret == -1) { + ALOGE("mprotect error %i", ret); + return false; + } + + __builtin___clear_cache((char *) mBuf, (char*) mBuf + mBufSize); + return true; +#else + (void) key; // Avoid unused parameter warning. + return false; +#endif +} + +void ColorMatrixTask::updateCoeffCache(float fpMul, float addMul) { + for(int ct=0; ct < 16; ct++) { + mIp[ct] = (int16_t)(mFp[ct] * 256.f + 0.5f); + mTmpFp[ct] = mFp[ct] * fpMul; + //ALOGE("mat %i %f %f", ct, mFp[ct], tmpFp[ct]); + } + + float add = 0.f; + if (fpMul > 254.f) add = 0.5f; + for(int ct=0; ct < 4; ct++) { + mTmpFpa[ct] = mFpa[ct] * addMul + add; + //ALOGE("mFpa %i %f %f", ct, mFpa[ct], tmpFpa[ct * 4 + 0]); + } + + for(int ct=0; ct < 4; ct++) { + mIpa[ct] = (int)(mFpa[ct] * 65536.f + 0.5f); + } +} + + + +static void One(void *out, + const void *py, const float* coeff, const float *add, + uint32_t vsin, uint32_t vsout, bool fin, bool fout) { + + float4 f = 0.f; + if (fin) { + switch(vsin) { + case 3: + f = ((const float4 *)py)[0]; + break; + case 2: + f = ((const float4 *)py)[0]; + f.w = 0.f; + break; + case 1: + f.xy = ((const float2 *)py)[0]; + break; + case 0: + f.x = ((const float *)py)[0]; + break; + } + } else { + switch(vsin) { + case 3: + f = convert<float4>(((const uchar4 *)py)[0]); + break; + case 2: + f = convert<float4>(((const uchar4 *)py)[0]); + f.w = 0.f; + break; + case 1: + f.xy = convert<float2>(((const uchar2 *)py)[0]); + break; + case 0: + f.x = (float)(((const uchar *)py)[0]); + break; + } + } + //ALOGE("f1 %f %f %f %f", f.x, f.y, f.z, f.w); + + float4 sum; + sum.x = f.x * coeff[0] + + f.y * coeff[4] + + f.z * coeff[8] + + f.w * coeff[12]; + sum.y = f.x * coeff[1] + + f.y * coeff[5] + + f.z * coeff[9] + + f.w * coeff[13]; + sum.z = f.x * coeff[2] + + f.y * coeff[6] + + f.z * coeff[10] + + f.w * coeff[14]; + sum.w = f.x * coeff[3] + + f.y * coeff[7] + + f.z * coeff[11] + + f.w * coeff[15]; + //ALOGE("f2 %f %f %f %f", sum.x, sum.y, sum.z, sum.w); + + sum.x += add[0]; + sum.y += add[1]; + sum.z += add[2]; + sum.w += add[3]; + + + //ALOGE("fout %i vs %i, sum %f %f %f %f", fout, vsout, sum.x, sum.y, sum.z, sum.w); + if (fout) { + switch(vsout) { + case 3: + case 2: + ((float4 *)out)[0] = sum; + break; + case 1: + ((float2 *)out)[0] = sum.xy; + break; + case 0: + ((float *)out)[0] = sum.x; + break; + } + } else { + sum.x = sum.x < 0 ? 0 : (sum.x > 255.5 ? 255.5 : sum.x); + sum.y = sum.y < 0 ? 0 : (sum.y > 255.5 ? 
255.5 : sum.y); + sum.z = sum.z < 0 ? 0 : (sum.z > 255.5 ? 255.5 : sum.z); + sum.w = sum.w < 0 ? 0 : (sum.w > 255.5 ? 255.5 : sum.w); + + switch(vsout) { + case 3: + case 2: + ((uchar4 *)out)[0] = convert<uchar4>(sum); + break; + case 1: + ((uchar2 *)out)[0] = convert<uchar2>(sum.xy); + break; + case 0: + ((uchar *)out)[0] = sum.x; + break; + } + } + //ALOGE("out %p %f %f %f %f", out, ((float *)out)[0], ((float *)out)[1], ((float *)out)[2], + // ((float *)out)[3]); +} + +void ColorMatrixTask::kernel(uchar *out, uchar *in, uint32_t xstart, uint32_t xend) { + uint32_t x1 = xstart; + uint32_t x2 = xend; + + uint32_t vsin = mLastKey.u.inVecSize; + uint32_t vsout = mLastKey.u.outVecSize; + bool floatIn = !!mLastKey.u.inType; + bool floatOut = !!mLastKey.u.outType; + + //if (!info->current.y) ALOGE("steps %i %i %i %i", instep, outstep, vsin, vsout); + + if(x2 > x1) { + int32_t len = x2 - x1; + if (mUsesSimd) { + if((mOptKernel != nullptr) && (len >= 4)) { + // The optimized kernel processes 4 pixels at once + // and requires a minimum of 1 chunk of 4 + mOptKernel(out, in, mIp, len >> 2); + // Update the len and pointers so the generic code can + // finish any leftover pixels + len &= ~3; + x1 += len; + out += mOutstep * len; + in += mInstep * len; + } +#if defined(ARCH_ARM64_USE_INTRINSICS) + else { + if (mLastKey.u.inType == RS_TYPE_FLOAT_32 || + mLastKey.u.outType == RS_TYPE_FLOAT_32) { + // Currently this generates off by one errors. + // rsdIntrinsicColorMatrix_float_K(out, in, len, &mFnTab, tmpFp, tmpFpa); + // x1 += len; + // out += outstep * len; + // in += instep * len; + } else { + rsdIntrinsicColorMatrix_int_K(out, in, len, &mFnTab, mIp, mIpa); + x1 += len; + out += mOutstep * len; + in += mInstep * len; + } + } +#endif + } + + while(x1 != x2) { + One(out, in, mTmpFp, mTmpFpa, vsin, vsout, floatIn, floatOut); + out += mOutstep; + in += mInstep; + x1++; + } + } +} + +#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT +void ColorMatrixTask::preLaunch(size_t inVectorSize, int inType, size_t outVectorSize, + int outType) { + if (inType == outType) { + if (outType == RS_TYPE_UNSIGNED_8) { + updateCoeffCache(1.f, 255.f); + } else { + updateCoeffCache(1.f, 1.f); + } + } else { + if (outType == RS_TYPE_UNSIGNED_8) { + updateCoeffCache(255.f, 255.f); + } else { + updateCoeffCache(1.f / 255.f, 1.f); + } + } + + Key_t key = computeKey(inVectorSize, inType, outVectorSize, outType); +#else +void ColorMatrixTask::preLaunch(size_t inVectorSize, size_t outVectorSize) { + updateCoeffCache(1.f, 255.f); + + Key_t key = computeKey(inVectorSize, outVectorSize); +#endif // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT + +#if defined(ARCH_X86_HAVE_SSSE3) + if ((mOptKernel == nullptr) || (mLastKey.key != key.key)) { + // FIXME: Disable mOptKernel to pass RS color matrix CTS cases + // mOptKernel = + // (void (*)(void *, const void *, const int16_t *, uint32_t)) selectKernel(key); + mLastKey = key; + } + +#else //if !defined(ARCH_X86_HAVE_SSSE3) + if ((mOptKernel == nullptr) || (mLastKey.key != key.key)) { + if (mBuf) munmap(mBuf, mBufSize); + mBuf = nullptr; + mOptKernel = nullptr; + if (build(key)) { + mOptKernel = (void (*)(void *, const void *, const int16_t *, uint32_t)) mBuf; + } +#if defined(ARCH_ARM64_USE_INTRINSICS) + else { + int dt = key.u.outVecSize + (key.u.outType == RS_TYPE_FLOAT_32 ? 4 : 0); + int st = key.u.inVecSize + (key.u.inType == RS_TYPE_FLOAT_32 ? 
4 : 0); + uint32_t mm = 0; + int i; + for (i = 0; i < 4; i++) + { + uint32_t m = (key.u.coeffMask >> i) & 0x1111; + m = ((m * 0x249) >> 9) & 15; + m |= ((key.u.addMask >> i) & 1) << 4; + mm |= m << (i * 5); + } + + if (key.u.inType == RS_TYPE_FLOAT_32 || key.u.outType == RS_TYPE_FLOAT_32) { + rsdIntrinsicColorMatrixSetup_float_K(&mFnTab, mm, dt, st); + } else { + rsdIntrinsicColorMatrixSetup_int_K(&mFnTab, mm, dt, st); + } + } +#endif + mLastKey = key; + } +#endif //if !defined(ARCH_X86_HAVE_SSSE3) +} + +void ColorMatrixTask::processData(int /* threadIndex */, size_t startX, size_t startY, size_t endX, + size_t endY) { + for (size_t y = startY; y < endY; y++) { + size_t offset = mSizeX * y + startX; + uchar* in = ((uchar*)mIn) + offset * paddedSize(mInputVectorSize); + uchar* out = ((uchar*)mOut) + offset * paddedSize(mVectorSize); + kernel(out, in, startX, endX); + } +} + +static const float fourZeroes[]{0.0f, 0.0f, 0.0f, 0.0f}; + +void RenderScriptToolkit::colorMatrix(const void* in, void* out, size_t inputVectorSize, + size_t outputVectorSize, size_t sizeX, size_t sizeY, + const float* matrix, const float* addVector, + const Restriction* restriction) { +#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE + if (!validRestriction(LOG_TAG, sizeX, sizeY, restriction)) { + return; + } + if (inputVectorSize < 1 || inputVectorSize > 4) { + ALOGE("The inputVectorSize should be between 1 and 4. %zu provided.", inputVectorSize); + return; + } + if (outputVectorSize < 1 || outputVectorSize > 4) { + ALOGE("The outputVectorSize should be between 1 and 4. %zu provided.", outputVectorSize); + return; + } +#endif + + if (addVector == nullptr) { + addVector = fourZeroes; + } + ColorMatrixTask task(in, out, inputVectorSize, outputVectorSize, sizeX, sizeY, matrix, + addVector, restriction); + processor->doTask(&task); +} + +} // namespace renderscript diff --git a/renderscript-toolkit/src/main/cpp/ColorMatrix_advsimd.S b/renderscript-toolkit/src/main/cpp/ColorMatrix_advsimd.S new file mode 100644 index 0000000..9064553 --- /dev/null +++ b/renderscript-toolkit/src/main/cpp/ColorMatrix_advsimd.S @@ -0,0 +1,1277 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f: +#define END(f) .size f, .-f; + + +.macro vmxx_f32 i, mask, opd, opa, opb + .if (\i) & \mask + .if (\i) & (\mask - 1) + fmla \opd, \opa, \opb + .else + fmul \opd, \opa, \opb + .endif + .endif +.endm + +.macro vadd_f32 i, mask, opd, opa, opb, querkysyntax1, querkysyntax2 + .if (\i) & \mask + .if (\i) & (\mask - 1) + fadd \opd, \opa, \opb + .else + mov \querkysyntax1, \querkysyntax2 + .endif + .endif +.endm + +.macro vmxx_s16 i, mask, opd, opa, opb + .if (\i) & \mask + .if (\i) & (\mask - 1 + 16) + smlal \opd, \opa, \opb + .else + smull \opd, \opa, \opb + .endif + .endif +.endm + +.macro vmxx2_s16 i, mask, opd, opa, opb + .if (\i) & \mask + .if (\i) & (\mask - 1 + 16) + smlal2 \opd, \opa, \opb + .else + smull2 \opd, \opa, \opb + .endif + .endif +.endm + +/* x0 = dst + * x1 = src + * x2 = count + * x3 = params + * x4 = column0_fn + * x5 = column1_fn + * x6 = column2_fn + * x7 = column3_fn + * x8 = store_fn + * x9 = load_fn + */ +.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + +.align 6 +colormatrix_int_col0_\i: + .if \i & 16 + dup v6.4s, v4.s[0] + dup v7.4s, v4.s[0] + .endif + vmxx_s16 \i, 1, v6.4s, v12.4h, v0.h[0] + vmxx_s16 \i, 2, v6.4s, v13.4h, v0.h[4] + vmxx_s16 \i, 4, v6.4s, v14.4h, v1.h[0] + vmxx_s16 \i, 8, v6.4s, v15.4h, v1.h[4] + vmxx2_s16 \i, 1, v7.4s, v12.8h, v0.h[0] + vmxx2_s16 \i, 2, v7.4s, v13.8h, v0.h[4] + vmxx2_s16 \i, 4, v7.4s, v14.8h, v1.h[0] + vmxx2_s16 \i, 8, v7.4s, v15.8h, v1.h[4] + sqshrun v8.4h, v6.4s, #8 + sqshrun2 v8.8h, v7.4s, #8 + br x5 + +colormatrix_int_col0_n\i: + .if (\i^31) & 16 + dup v6.4s, v4.s[0] + dup v7.4s, v4.s[0] + .endif + vmxx_s16 \i^31, 1, v6.4s, v12.4h, v0.h[0] + vmxx_s16 \i^31, 2, v6.4s, v13.4h, v0.h[4] + vmxx_s16 \i^31, 4, v6.4s, v14.4h, v1.h[0] + vmxx_s16 \i^31, 8, v6.4s, v15.4h, v1.h[4] + vmxx2_s16 \i^31, 1, v7.4s, v12.8h, v0.h[0] + vmxx2_s16 \i^31, 2, v7.4s, v13.8h, v0.h[4] + vmxx2_s16 \i^31, 4, v7.4s, v14.8h, v1.h[0] + vmxx2_s16 \i^31, 8, v7.4s, v15.8h, v1.h[4] + sqshrun v8.4h, v6.4s, #8 + sqshrun2 v8.8h, v7.4s, #8 + br x5 + +.align 6 +colormatrix_int_col1_\i: + .if \i & 16 + dup v6.4s, v4.s[1] + dup v7.4s, v4.s[1] + .endif + vmxx_s16 \i, 1, v6.4s, v12.4h, v0.h[1] + vmxx_s16 \i, 2, v6.4s, v13.4h, v0.h[5] + vmxx_s16 \i, 4, v6.4s, v14.4h, v1.h[1] + vmxx_s16 \i, 8, v6.4s, v15.4h, v1.h[5] + vmxx2_s16 \i, 1, v7.4s, v12.8h, v0.h[1] + vmxx2_s16 \i, 2, v7.4s, v13.8h, v0.h[5] + vmxx2_s16 \i, 4, v7.4s, v14.8h, v1.h[1] + vmxx2_s16 \i, 8, v7.4s, v15.8h, v1.h[5] + sqshrun v9.4h, v6.4s, #8 + sqshrun2 v9.8h, v7.4s, #8 + br x6 + +colormatrix_int_col1_n\i: + .if (\i^31) & 16 + dup v6.4s, v4.s[1] + dup v7.4s, v4.s[1] + .endif + vmxx_s16 \i^31, 1, v6.4s, v12.4h, v0.h[1] + vmxx_s16 \i^31, 2, v6.4s, v13.4h, v0.h[5] + vmxx_s16 \i^31, 4, v6.4s, v14.4h, v1.h[1] + vmxx_s16 \i^31, 8, v6.4s, v15.4h, v1.h[5] + vmxx2_s16 \i^31, 1, v7.4s, v12.8h, v0.h[1] + vmxx2_s16 \i^31, 2, v7.4s, v13.8h, v0.h[5] + vmxx2_s16 \i^31, 4, v7.4s, v14.8h, v1.h[1] + vmxx2_s16 \i^31, 8, v7.4s, v15.8h, v1.h[5] + sqshrun v9.4h, v6.4s, #8 + sqshrun2 v9.8h, v7.4s, #8 + br x6 + +.align 6 +colormatrix_int_col2_\i: + .if \i & 16 + dup v6.4s, v4.s[2] + dup v7.4s, v4.s[2] + .endif + vmxx_s16 \i, 1, v6.4s, v12.4h, v0.h[2] + vmxx_s16 \i, 2, v6.4s, v13.4h, v0.h[6] + vmxx_s16 \i, 4, v6.4s, v14.4h, v1.h[2] + vmxx_s16 \i, 8, v6.4s, v15.4h, v1.h[6] + vmxx2_s16 \i, 1, v7.4s, v12.8h, v0.h[2] + vmxx2_s16 \i, 2, v7.4s, v13.8h, v0.h[6] + vmxx2_s16 \i, 4, v7.4s, v14.8h, v1.h[2] + vmxx2_s16 \i, 8, v7.4s, v15.8h, v1.h[6] + sqshrun 
v10.4h, v6.4s, #8 + sqshrun2 v10.8h, v7.4s, #8 + br x7 + +colormatrix_int_col2_n\i: + .if (\i^31) & 16 + dup v6.4s, v4.s[2] + dup v7.4s, v4.s[2] + .endif + vmxx_s16 \i^31, 1, v6.4s, v12.4h, v0.h[2] + vmxx_s16 \i^31, 2, v6.4s, v13.4h, v0.h[6] + vmxx_s16 \i^31, 4, v6.4s, v14.4h, v1.h[2] + vmxx_s16 \i^31, 8, v6.4s, v15.4h, v1.h[6] + vmxx2_s16 \i^31, 1, v7.4s, v12.8h, v0.h[2] + vmxx2_s16 \i^31, 2, v7.4s, v13.8h, v0.h[6] + vmxx2_s16 \i^31, 4, v7.4s, v14.8h, v1.h[2] + vmxx2_s16 \i^31, 8, v7.4s, v15.8h, v1.h[6] + sqshrun v10.4h, v6.4s, #8 + sqshrun2 v10.8h, v7.4s, #8 + br x7 + +.align 6 +colormatrix_int_col3_\i: + .if \i & 16 + dup v6.4s, v4.s[3] + dup v7.4s, v4.s[3] + .endif + vmxx_s16 \i, 1, v6.4s, v12.4h, v0.h[3] + vmxx_s16 \i, 2, v6.4s, v13.4h, v0.h[7] + vmxx_s16 \i, 4, v6.4s, v14.4h, v1.h[3] + vmxx_s16 \i, 8, v6.4s, v15.4h, v1.h[7] + vmxx2_s16 \i, 1, v7.4s, v12.8h, v0.h[3] + vmxx2_s16 \i, 2, v7.4s, v13.8h, v0.h[7] + vmxx2_s16 \i, 4, v7.4s, v14.8h, v1.h[3] + vmxx2_s16 \i, 8, v7.4s, v15.8h, v1.h[7] + sqshrun v11.4h, v6.4s, #8 + sqshrun2 v11.8h, v7.4s, #8 + br x8 + +colormatrix_int_col3_n\i: + .if (\i^31) & 16 + dup v6.4s, v4.s[3] + dup v7.4s, v4.s[3] + .endif + vmxx_s16 \i^31, 1, v6.4s, v12.4h, v0.h[3] + vmxx_s16 \i^31, 2, v6.4s, v13.4h, v0.h[7] + vmxx_s16 \i^31, 4, v6.4s, v14.4h, v1.h[3] + vmxx_s16 \i^31, 8, v6.4s, v15.4h, v1.h[7] + vmxx2_s16 \i^31, 1, v7.4s, v12.8h, v0.h[3] + vmxx2_s16 \i^31, 2, v7.4s, v13.8h, v0.h[7] + vmxx2_s16 \i^31, 4, v7.4s, v14.8h, v1.h[3] + vmxx2_s16 \i^31, 8, v7.4s, v15.8h, v1.h[7] + sqshrun v11.4h, v6.4s, #8 + sqshrun2 v11.8h, v7.4s, #8 + br x8 + +.align 5 +colormatrix_float_col0_\i: + vmxx_f32 \i, 1, v8.4s, v12.4s, v0.s[0] + vmxx_f32 \i, 2, v8.4s, v13.4s, v1.s[0] + vmxx_f32 \i, 4, v8.4s, v14.4s, v2.s[0] + vmxx_f32 \i, 8, v8.4s, v15.4s, v3.s[0] + vadd_f32 \i, 16, v8.4s, v8.4s, v4.4s, v8.16b, v4.16b + vmxx_f32 \i, 1, v16.4s, v20.4s, v0.s[0] + vmxx_f32 \i, 2, v16.4s, v21.4s, v1.s[0] + vmxx_f32 \i, 4, v16.4s, v22.4s, v2.s[0] + vmxx_f32 \i, 8, v16.4s, v23.4s, v3.s[0] + vadd_f32 \i, 16, v16.4s, v16.4s, v4.4s, v16.16b, v4.16b + br x5 + +.align 4 +colormatrix_float_col0_n\i: + vmxx_f32 \i^31, 1, v8.4s, v12.4s, v0.s[0] + vmxx_f32 \i^31, 2, v8.4s, v13.4s, v1.s[0] + vmxx_f32 \i^31, 4, v8.4s, v14.4s, v2.s[0] + vmxx_f32 \i^31, 8, v8.4s, v15.4s, v3.s[0] + vadd_f32 \i^31, 16, v8.4s, v8.4s, v4.4s, v8.16b, v4.16b + vmxx_f32 \i^31, 1, v16.4s, v20.4s, v0.s[0] + vmxx_f32 \i^31, 2, v16.4s, v21.4s, v1.s[0] + vmxx_f32 \i^31, 4, v16.4s, v22.4s, v2.s[0] + vmxx_f32 \i^31, 8, v16.4s, v23.4s, v3.s[0] + vadd_f32 \i^31, 16, v16.4s, v16.4s, v4.4s, v16.16b, v4.16b + br x5 + +.align 5 +colormatrix_float_col1_\i: + vmxx_f32 \i, 1, v9.4s, v12.4s, v0.s[1] + vmxx_f32 \i, 2, v9.4s, v13.4s, v1.s[1] + vmxx_f32 \i, 4, v9.4s, v14.4s, v2.s[1] + vmxx_f32 \i, 8, v9.4s, v15.4s, v3.s[1] + vadd_f32 \i, 16, v9.4s, v9.4s, v5.4s, v9.16b, v5.16b + vmxx_f32 \i, 1, v17.4s, v20.4s, v0.s[1] + vmxx_f32 \i, 2, v17.4s, v21.4s, v1.s[1] + vmxx_f32 \i, 4, v17.4s, v22.4s, v2.s[1] + vmxx_f32 \i, 8, v17.4s, v23.4s, v3.s[1] + vadd_f32 \i, 16, v17.4s, v17.4s, v5.4s, v17.16b, v5.16b + br x6 + +.align 4 +colormatrix_float_col1_n\i: + vmxx_f32 \i^31, 1, v9.4s, v12.4s, v0.s[1] + vmxx_f32 \i^31, 2, v9.4s, v13.4s, v1.s[1] + vmxx_f32 \i^31, 4, v9.4s, v14.4s, v2.s[1] + vmxx_f32 \i^31, 8, v9.4s, v15.4s, v3.s[1] + vadd_f32 \i^31, 16, v9.4s, v9.4s, v5.4s, v9.16b, v5.16b + vmxx_f32 \i^31, 1, v17.4s, v20.4s, v0.s[1] + vmxx_f32 \i^31, 2, v17.4s, v21.4s, v1.s[1] + vmxx_f32 \i^31, 4, v17.4s, v22.4s, v2.s[1] + vmxx_f32 \i^31, 8, v17.4s, v23.4s, 
v3.s[1] + vadd_f32 \i^31, 16, v17.4s, v17.4s, v5.4s, v17.16b, v5.16b + br x6 + +.align 5 +colormatrix_float_col2_\i: + vmxx_f32 \i, 1, v10.4s, v12.4s, v0.s[2] + vmxx_f32 \i, 2, v10.4s, v13.4s, v1.s[2] + vmxx_f32 \i, 4, v10.4s, v14.4s, v2.s[2] + vmxx_f32 \i, 8, v10.4s, v15.4s, v3.s[2] + vadd_f32 \i, 16, v10.4s, v10.4s, v6.4s, v10.16b, v6.16b + vmxx_f32 \i, 1, v18.4s, v20.4s, v0.s[2] + vmxx_f32 \i, 2, v18.4s, v21.4s, v1.s[2] + vmxx_f32 \i, 4, v18.4s, v22.4s, v2.s[2] + vmxx_f32 \i, 8, v18.4s, v23.4s, v3.s[2] + vadd_f32 \i, 16, v18.4s, v18.4s, v6.4s, v18.16b, v6.16b + br x7 + +.align 4 +colormatrix_float_col2_n\i: + vmxx_f32 \i^31, 1, v10.4s, v12.4s, v0.s[2] + vmxx_f32 \i^31, 2, v10.4s, v13.4s, v1.s[2] + vmxx_f32 \i^31, 4, v10.4s, v14.4s, v2.s[2] + vmxx_f32 \i^31, 8, v10.4s, v15.4s, v3.s[2] + vadd_f32 \i^31, 16, v10.4s, v10.4s, v6.4s, v10.16b, v6.16b + vmxx_f32 \i^31, 1, v18.4s, v20.4s, v0.s[2] + vmxx_f32 \i^31, 2, v18.4s, v21.4s, v1.s[2] + vmxx_f32 \i^31, 4, v18.4s, v22.4s, v2.s[2] + vmxx_f32 \i^31, 8, v18.4s, v23.4s, v3.s[2] + vadd_f32 \i^31, 16, v18.4s, v18.4s, v6.4s, v18.16b, v6.16b + br x7 + +.align 5 +colormatrix_float_col3_\i: + vmxx_f32 \i, 1, v11.4s, v12.4s, v0.s[3] + vmxx_f32 \i, 2, v11.4s, v13.4s, v1.s[3] + vmxx_f32 \i, 4, v11.4s, v14.4s, v2.s[3] + vmxx_f32 \i, 8, v11.4s, v15.4s, v3.s[3] + vadd_f32 \i, 16, v11.4s, v11.4s, v7.4s, v11.16b, v7.16b + vmxx_f32 \i, 1, v19.4s, v20.4s, v0.s[3] + vmxx_f32 \i, 2, v19.4s, v21.4s, v1.s[3] + vmxx_f32 \i, 4, v19.4s, v22.4s, v2.s[3] + vmxx_f32 \i, 8, v19.4s, v23.4s, v3.s[3] + vadd_f32 \i, 16, v19.4s, v19.4s, v7.4s, v19.16b, v7.16b + br x8 + +.align 4 +colormatrix_float_col3_n\i: + vmxx_f32 \i^31, 1, v11.4s, v12.4s, v0.s[3] + vmxx_f32 \i^31, 2, v11.4s, v13.4s, v1.s[3] + vmxx_f32 \i^31, 4, v11.4s, v14.4s, v2.s[3] + vmxx_f32 \i^31, 8, v11.4s, v15.4s, v3.s[3] + vadd_f32 \i^31, 16, v11.4s, v11.4s, v7.4s, v11.16b, v7.16b + vmxx_f32 \i^31, 1, v19.4s, v20.4s, v0.s[3] + vmxx_f32 \i^31, 2, v19.4s, v21.4s, v1.s[3] + vmxx_f32 \i^31, 4, v19.4s, v22.4s, v2.s[3] + vmxx_f32 \i^31, 8, v19.4s, v23.4s, v3.s[3] + vadd_f32 \i^31, 16, v19.4s, v19.4s, v7.4s, v19.16b, v7.16b + br x8 + +.endr + +.align 6 +colormatrix_float_ldu4: + ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [x1], #32 + uxtl v20.8h, v20.8b + uxtl v21.8h, v21.8b + uxtl v22.8h, v22.8b + uxtl v23.8h, v23.8b + uxtl v12.4s, v20.4h + uxtl v13.4s, v21.4h + uxtl v14.4s, v22.4h + uxtl v15.4s, v23.4h + uxtl2 v20.4s, v20.8h + uxtl2 v21.4s, v21.8h + uxtl2 v22.4s, v22.8h + uxtl2 v23.4s, v23.8h + ucvtf v12.4s, v12.4s + ucvtf v13.4s, v13.4s + ucvtf v14.4s, v14.4s + ucvtf v15.4s, v15.4s + ucvtf v20.4s, v20.4s + ucvtf v21.4s, v21.4s + ucvtf v22.4s, v22.4s + ucvtf v23.4s, v23.4s + br x4 + +.align 5 +colormatrix_int_ldu4: + ld4 {v12.8b,v13.8b,v14.8b,v15.8b}, [x1], #32 + uxtl v12.8h, v12.8b + uxtl v13.8h, v13.8b + uxtl v14.8h, v14.8b + uxtl v15.8h, v15.8b + br x4 + +.align 6 +colormatrix_float_ldu3: + ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [x1], #32 + uxtl v20.8h, v20.8b + uxtl v21.8h, v21.8b + uxtl v22.8h, v22.8b + uxtl v12.4s, v20.4h + uxtl v13.4s, v21.4h + uxtl v14.4s, v22.4h + uxtl2 v20.4s, v20.8h + uxtl2 v21.4s, v21.8h + uxtl2 v22.4s, v22.8h + ucvtf v12.4s, v12.4s + ucvtf v13.4s, v13.4s + ucvtf v14.4s, v14.4s + ucvtf v20.4s, v20.4s + ucvtf v21.4s, v21.4s + ucvtf v22.4s, v22.4s + br x4 + +colormatrix_int_ldu3: + ld4 {v12.8b,v13.8b,v14.8b,v15.8b}, [x1], #32 + uxtl v12.8h, v12.8b + uxtl v13.8h, v13.8b + uxtl v14.8h, v14.8b + br x4 + +.align 5 +colormatrix_float_ldu1: + ld1 {v20.8b}, [x1], #8 + uxtl v20.8h, v20.8b + uxtl v12.4s, v20.4h + 
uxtl2 v20.4s, v20.8h + ucvtf v12.4s, v12.4s + ucvtf v20.4s, v20.4s + br x4 + +.align 6 +colormatrix_float_ldu2: + ld2 {v20.8b,v21.8b}, [x1], #16 + uxtl v20.8h, v20.8b + uxtl v21.8h, v21.8b + uxtl v12.4s, v20.4h + uxtl v13.4s, v21.4h + uxtl2 v20.4s, v20.8h + uxtl2 v21.4s, v21.8h + ucvtf v12.4s, v12.4s + ucvtf v13.4s, v13.4s + ucvtf v20.4s, v20.4s + ucvtf v21.4s, v21.4s + br x4 + +.align 4 +colormatrix_int_ldu2: + ld2 {v12.8b,v13.8b}, [x1], #16 + uxtl v12.8h, v12.8b + uxtl v13.8h, v13.8b + br x4 + +.align 6 +colormatrix_float_stu4: + fcvtzs v24.4s, v8.4s, #1 + fcvtzs v25.4s, v9.4s, #1 + fcvtzs v26.4s, v10.4s, #1 + fcvtzs v27.4s, v11.4s, #1 + fcvtzs v28.4s, v16.4s, #1 + fcvtzs v29.4s, v17.4s, #1 + fcvtzs v30.4s, v18.4s, #1 + fcvtzs v31.4s, v19.4s, #1 + sqrshrun v24.4h, v24.4s, #1 + sqrshrun v25.4h, v25.4s, #1 + sqrshrun v26.4h, v26.4s, #1 + sqrshrun v27.4h, v27.4s, #1 + sqrshrun2 v24.8h, v28.4s, #1 + sqrshrun2 v25.8h, v29.4s, #1 + sqrshrun2 v26.8h, v30.4s, #1 + sqrshrun2 v27.8h, v31.4s, #1 + uqxtn v24.8b, v24.8h + uqxtn v25.8b, v25.8h + uqxtn v26.8b, v26.8h + uqxtn v27.8b, v27.8h + subs x2, x2, #8 + st4 {v24.8b,v25.8b,v26.8b,v27.8b}, [x0], #32 + blo colormatrix_float_end + br x9 + +.align 5 +colormatrix_int_stu4: + uqxtn v12.8b, v8.8h + uqxtn v13.8b, v9.8h + uqxtn v14.8b, v10.8h + uqxtn v15.8b, v11.8h + subs x2, x2, #8 + st4 {v12.8b,v13.8b,v14.8b,v15.8b}, [x0], #32 + blo colormatrix_int_end + br x9 + +.align 6 +colormatrix_float_stu3: + fcvtzs v24.4s, v8.4s, #1 + fcvtzs v25.4s, v9.4s, #1 + fcvtzs v26.4s, v10.4s, #1 + fcvtzs v28.4s, v16.4s, #1 + fcvtzs v29.4s, v17.4s, #1 + fcvtzs v30.4s, v18.4s, #1 + sqrshrun v24.4h, v24.4s, #1 + sqrshrun v25.4h, v25.4s, #1 + sqrshrun v26.4h, v26.4s, #1 + sqrshrun2 v24.8h, v28.4s, #1 + sqrshrun2 v25.8h, v29.4s, #1 + sqrshrun2 v26.8h, v30.4s, #1 + uqxtn v24.8b, v24.8h + uqxtn v25.8b, v25.8h + uqxtn v26.8b, v26.8h + movi v27.8b, #0 + subs x2, x2, #8 + st4 {v24.8b,v25.8b,v26.8b,v27.8b}, [x0], #32 + blo colormatrix_float_end + br x9 + +.align 4 +colormatrix_int_ldu1: + ld1 {v12.8b}, [x1], #8 + uxtl v12.8h, v12.8b + br x4 + +.align 5 +colormatrix_int_stu3: + uqxtn v12.8b, v8.8h + uqxtn v13.8b, v9.8h + uqxtn v14.8b, v10.8h + movi v15.8b, #0 + subs x2, x2, #8 + st4 {v12.8b,v13.8b,v14.8b,v15.8b}, [x0], #32 + blo colormatrix_int_end + br x9 + +.align 6 +colormatrix_float_stu2: + fcvtzs v24.4s, v8.4s, #1 + fcvtzs v25.4s, v9.4s, #1 + fcvtzs v28.4s, v16.4s, #1 + fcvtzs v29.4s, v17.4s, #1 + sqrshrun v24.4h, v24.4s, #1 + sqrshrun v25.4h, v25.4s, #1 + sqrshrun2 v24.8h, v28.4s, #1 + sqrshrun2 v25.8h, v29.4s, #1 + uqxtn v24.8b, v24.8h + uqxtn v25.8b, v25.8h + subs x2, x2, #8 + st2 {v24.8b,v25.8b}, [x0], #16 + blo colormatrix_float_end + br x9 + +.align 5 +colormatrix_int_stu2: + uqxtn v12.8b, v8.8h + uqxtn v13.8b, v9.8h + subs x2, x2, #8 + st2 {v12.8b,v13.8b}, [x0], #16 + blo colormatrix_int_end + br x9 + +.align 5 +colormatrix_int_stu1: + uqxtn v12.8b, v8.8h + subs x2, x2, #8 + st1 {v12.8b}, [x0], #8 + blo colormatrix_int_end + br x9 + +colormatrix_float_ldf3: + ld4 {v12.4s,v13.4s,v14.4s,v15.4s}, [x1], #64 + ld4 {v20.4s,v21.4s,v22.4s,v23.4s}, [x1], #64 + br x4 + +.align 6 +colormatrix_float_stu1: + fcvtzs v24.4s, v8.4s, #1 + fcvtzs v28.4s, v16.4s, #1 + sqrshrun v24.4h, v24.4s, #1 + sqrshrun2 v24.8h, v28.4s, #1 + uqxtn v24.8b, v24.8h + subs x2, x2, #8 + st1 {v24.8b}, [x0], #8 + blo colormatrix_float_end + br x9 + +colormatrix_float_stf3: + movi v11.16b, #0 + st4 {v8.4s,v9.4s,v10.4s,v11.4s}, [x0], #64 + movi v19.16b, #0 + subs x2, x2, #8 + st4 {v16.4s,v17.4s,v18.4s,v19.4s}, 
[x0], #64 + blo colormatrix_float_end + br x9 + +.align 5 +colormatrix_float_stf4: + st4 {v8.4s,v9.4s,v10.4s,v11.4s}, [x0], #64 + subs x2, x2, #8 + st4 {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64 + blo colormatrix_float_end + br x9 + +colormatrix_float_ldf4: + ld4 {v12.4s,v13.4s,v14.4s,v15.4s}, [x1], #64 + ld4 {v20.4s,v21.4s,v22.4s,v23.4s}, [x1], #64 + br x4 + +.align 5 +colormatrix_float_stf2: + st2 {v8.4s, v9.4s}, [x0], #32 + subs x2, x2, #8 + st2 {v16.4s, v17.4s}, [x0], #32 + blo colormatrix_float_end + br x9 + +colormatrix_float_ldf2: + ld2 {v12.4s,v13.4s}, [x1], #32 + ld2 {v20.4s,v21.4s}, [x1], #32 + br x4 + +.align 5 +colormatrix_float_stf1: + st1 {v8.4s}, [x0], #16 + subs x2, x2, #8 + st1 {v16.4s}, [x0], #16 + blo colormatrix_float_end + br x9 + +colormatrix_float_ldf1: + ld1 {v12.4s}, [x1], #16 + ld1 {v20.4s}, [x1], #16 + br x4 + +colormatrix_int_stu1_end: + uqxtn v12.8b, v8.8h + tbz x2, #2, 1f + st1 {v12.s}[1], [x0], #4 +1: tbz x2, #1, 1f + st1 {v12.h}[1], [x0], #2 +1: tbz x2, #0, 1f + st1 {v12.b}[1], [x0], #1 +1: b colormatrix_int_realend + +colormatrix_int_stu2_end: + uqxtn v12.8b, v8.8h + uqxtn v13.8b, v9.8h + zip1 v12.16b, v12.16b, v13.16b + tbz x2, #2, 1f + st1 {v12.d}[1], [x0], #8 +1: tbz x2, #1, 1f + st1 {v12.s}[1], [x0], #4 +1: tbz x2, #0, 1f + st1 {v12.h}[1], [x0], #2 +1: b colormatrix_int_realend + +colormatrix_int_stu3_end: + uqxtn v12.8b, v8.8h + uqxtn v13.8b, v9.8h + uqxtn v14.8b, v10.8h + movi v15.8b, #0 + tbz x2, #2, 1f + st4 {v12.b,v13.b,v14.b,v15.b}[4], [x0], #4 + st4 {v12.b,v13.b,v14.b,v15.b}[5], [x0], #4 + st4 {v12.b,v13.b,v14.b,v15.b}[6], [x0], #4 + st4 {v12.b,v13.b,v14.b,v15.b}[7], [x0], #4 +1: tbz x2, #1, 1f + st4 {v12.b,v13.b,v14.b,v15.b}[2], [x0], #4 + st4 {v12.b,v13.b,v14.b,v15.b}[3], [x0], #4 +1: tbz x2, #0, 1f + st4 {v12.b,v13.b,v14.b,v15.b}[1], [x0], #4 +1: b colormatrix_int_realend + +colormatrix_int_stu4_end: + uqxtn v12.8b, v8.8h + uqxtn v13.8b, v9.8h + uqxtn v14.8b, v10.8h + uqxtn v15.8b, v11.8h + tbz x2, #2, 1f + st4 {v12.b,v13.b,v14.b,v15.b}[4], [x0], #4 + st4 {v12.b,v13.b,v14.b,v15.b}[5], [x0], #4 + st4 {v12.b,v13.b,v14.b,v15.b}[6], [x0], #4 + st4 {v12.b,v13.b,v14.b,v15.b}[7], [x0], #4 +1: tbz x2, #1, 1f + st4 {v12.b,v13.b,v14.b,v15.b}[2], [x0], #4 + st4 {v12.b,v13.b,v14.b,v15.b}[3], [x0], #4 +1: tbz x2, #0, 1f + st4 {v12.b,v13.b,v14.b,v15.b}[1], [x0], #4 +1: b colormatrix_int_realend + + +colormatrix_int_ldu1_end: + tbz x2, #2, 1f + ld1 {v15.s}[3], [x1], #4 +1: tbz x2, #1, 1f + ld1 {v15.h}[5], [x1], #2 +1: tbz x2, #0, 1f + ld1 {v15.b}[9], [x1], #1 +1: uxtl2 v12.8h, v15.16b + br x4 + +colormatrix_int_ldu2_end: + tbz x2, #2, 1f + ld1 {v15.d}[1], [x1], #8 +1: tbz x2, #1, 1f + ld1 {v15.s}[1], [x1], #4 +1: tbz x2, #0, 1f + ld1 {v15.h}[1], [x1], #2 +1: uzp1 v14.16b, v15.16b, v15.16b + uzp2 v15.16b, v15.16b, v15.16b + uxtl v12.8h, v14.8b + uxtl v13.8h, v15.8b + br x4 + +colormatrix_int_ldu3_end: + tbz x2, #2, 1f + ld4 {v12.b,v13.b,v14.b,v15.b}[4], [x1], #4 + ld4 {v12.b,v13.b,v14.b,v15.b}[5], [x1], #4 + ld4 {v12.b,v13.b,v14.b,v15.b}[6], [x1], #4 + ld4 {v12.b,v13.b,v14.b,v15.b}[7], [x1], #4 +1: tbz x2, #1, 1f + ld4 {v12.b,v13.b,v14.b,v15.b}[2], [x1], #4 + ld4 {v12.b,v13.b,v14.b,v15.b}[3], [x1], #4 +1: tbz x2, #0, 1f + ld4 {v12.b,v13.b,v14.b,v15.b}[1], [x1], #4 +1: uxtl v12.8h, v12.8b + uxtl v13.8h, v13.8b + uxtl v14.8h, v14.8b + br x4 + +colormatrix_int_ldu4_end: + tbz x2, #2, 1f + ld4 {v12.b,v13.b,v14.b,v15.b}[4], [x1], #4 + ld4 {v12.b,v13.b,v14.b,v15.b}[5], [x1], #4 + ld4 {v12.b,v13.b,v14.b,v15.b}[6], [x1], #4 + ld4 {v12.b,v13.b,v14.b,v15.b}[7], [x1], #4 
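+/* On this tail path x2 holds the 1..7 leftover pixels: bit 2 selected the
+ * four-pixel block above, while bits 1 and 0 below pick up the remaining
+ * two and one. */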
+1: tbz x2, #1, 1f + ld4 {v12.b,v13.b,v14.b,v15.b}[2], [x1], #4 + ld4 {v12.b,v13.b,v14.b,v15.b}[3], [x1], #4 +1: tbz x2, #0, 1f + ld4 {v12.b,v13.b,v14.b,v15.b}[1], [x1], #4 +1: uxtl v12.8h, v12.8b + uxtl v13.8h, v13.8b + uxtl v14.8h, v14.8b + uxtl v15.8h, v15.8b + br x4 + +colormatrix_float_stu1_end: + fcvtzs v12.4s, v8.4s, #1 + fcvtzs v13.4s, v16.4s, #1 + sqrshrun v12.4h, v12.4s, #1 + sqrshrun2 v12.8h, v13.4s, #1 + uqxtn v12.8b, v12.8h + tbz x2, #2, 1f + st1 {v12.s}[1], [x0], #4 +1: tbz x2, #1, 1f + st1 {v12.h}[1], [x0], #2 +1: tbz x2, #0, 1f + st1 {v12.b}[1], [x0], #1 +1: b colormatrix_float_realend + +colormatrix_float_stu2_end: + fcvtzs v12.4s, v8.4s, #1 + fcvtzs v13.4s, v9.4s, #1 + fcvtzs v14.4s, v16.4s, #1 + fcvtzs v15.4s, v17.4s, #1 + sqrshrun v12.4h, v12.4s, #1 + sqrshrun v13.4h, v13.4s, #1 + sqrshrun v14.4h, v14.4s, #1 + sqrshrun v15.4h, v15.4s, #1 + zip1 v12.8h, v12.8h, v13.8h + zip1 v13.8h, v14.8h, v15.8h + uqxtn v12.8b, v12.8h + uqxtn2 v12.16b, v13.8h + tbz x2, #2, 1f + st1 {v12.d}[1], [x0], #8 +1: tbz x2, #1, 1f + st1 {v12.s}[1], [x0], #4 +1: tbz x2, #0, 1f + st1 {v12.h}[1], [x0], #2 +1: b colormatrix_float_realend + +colormatrix_float_stu3_end: + fcvtzs v24.4s, v8.4s, #1 + fcvtzs v25.4s, v9.4s, #1 + fcvtzs v26.4s, v10.4s, #1 + fcvtzs v28.4s, v16.4s, #1 + fcvtzs v29.4s, v17.4s, #1 + fcvtzs v30.4s, v18.4s, #1 + sqrshrun v24.4h, v24.4s, #1 + sqrshrun v25.4h, v25.4s, #1 + sqrshrun v26.4h, v26.4s, #1 + sqrshrun2 v24.8h, v28.4s, #1 + sqrshrun2 v25.8h, v29.4s, #1 + sqrshrun2 v26.8h, v30.4s, #1 + uqxtn v12.8b, v24.8h + uqxtn v13.8b, v25.8h + uqxtn v14.8b, v26.8h + movi v15.8b, #0 + tbz x2, #2, 1f + st4 {v12.b,v13.b,v14.b,v15.b}[4], [x0], #4 + st4 {v12.b,v13.b,v14.b,v15.b}[5], [x0], #4 + st4 {v12.b,v13.b,v14.b,v15.b}[6], [x0], #4 + st4 {v12.b,v13.b,v14.b,v15.b}[7], [x0], #4 +1: tbz x2, #1, 1f + st4 {v12.b,v13.b,v14.b,v15.b}[2], [x0], #4 + st4 {v12.b,v13.b,v14.b,v15.b}[3], [x0], #4 +1: tbz x2, #0, 1f + st4 {v12.b,v13.b,v14.b,v15.b}[1], [x0], #4 +1: b colormatrix_float_realend + +colormatrix_float_stu4_end: + fcvtzs v24.4s, v8.4s, #1 + fcvtzs v25.4s, v9.4s, #1 + fcvtzs v26.4s, v10.4s, #1 + fcvtzs v27.4s, v11.4s, #1 + fcvtzs v28.4s, v16.4s, #1 + fcvtzs v29.4s, v17.4s, #1 + fcvtzs v30.4s, v18.4s, #1 + fcvtzs v31.4s, v19.4s, #1 + sqrshrun v24.4h, v24.4s, #1 + sqrshrun v25.4h, v25.4s, #1 + sqrshrun v26.4h, v26.4s, #1 + sqrshrun v27.4h, v27.4s, #1 + sqrshrun2 v24.8h, v28.4s, #1 + sqrshrun2 v25.8h, v29.4s, #1 + sqrshrun2 v26.8h, v30.4s, #1 + sqrshrun2 v27.8h, v31.4s, #1 + uqxtn v12.8b, v24.8h + uqxtn v13.8b, v25.8h + uqxtn v14.8b, v26.8h + uqxtn v15.8b, v27.8h + tbz x2, #2, 1f + st4 {v12.b,v13.b,v14.b,v15.b}[4], [x0], #4 + st4 {v12.b,v13.b,v14.b,v15.b}[5], [x0], #4 + st4 {v12.b,v13.b,v14.b,v15.b}[6], [x0], #4 + st4 {v12.b,v13.b,v14.b,v15.b}[7], [x0], #4 +1: tbz x2, #1, 1f + st4 {v12.b,v13.b,v14.b,v15.b}[2], [x0], #4 + st4 {v12.b,v13.b,v14.b,v15.b}[3], [x0], #4 +1: tbz x2, #0, 1f + st4 {v12.b,v13.b,v14.b,v15.b}[1], [x0], #4 +1: b colormatrix_float_realend + +colormatrix_float_stf1_end: + tbz x2, #2, 1f + st1 {v16.4s}, [x0], #16 +1: tbz x2, #1, 1f + st1 {v8.d}[1], [x0], #8 +1: tbz x2, #0, 1f + st1 {v8.s}[1], [x0], #4 +1: b colormatrix_float_realend + +colormatrix_float_stf2_end: + tbz x2, #2, 1f + st2 {v16.4s, v17.4s}, [x0], #32 +1: tbz x2, #1, 1f + st2 {v8.s,v9.s}[2], [x0], #8 + st2 {v8.s,v9.s}[3], [x0], #8 +1: tbz x2, #0, 1f + st2 {v8.s,v9.s}[1], [x0], #8 +1: b colormatrix_float_realend + +colormatrix_float_stf3_end: + movi v11.16b, #0 + movi v19.16b, #0 +colormatrix_float_stf4_end: + tbz x2, 
#2, 1f + st4 {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64 +1: tbz x2, #1, 1f + st4 {v8.s,v9.s,v10.s,v11.s}[2], [x0], #16 + st4 {v8.s,v9.s,v10.s,v11.s}[3], [x0], #16 +1: tbz x2, #0, 1f + st4 {v8.s,v9.s,v10.s,v11.s}[1], [x0], #16 +1: b colormatrix_float_realend + +colormatrix_float_ldu1_end: + tbz x2, #2, 1f + ld1 {v15.s}[1], [x1], #4 +1: tbz x2, #1, 1f + ld1 {v15.h}[1], [x1], #2 +1: tbz x2, #0, 1f + ld1 {v15.b}[1], [x1], #1 +1: uxtl v15.8h, v15.8b + uxtl v12.4s, v15.4h + uxtl2 v20.4s, v15.8h + ucvtf v12.4s, v12.4s + ucvtf v20.4s, v20.4s + br x4 + +colormatrix_float_ldu2_end: + tbz x2, #2, 1f + ld1 {v15.d}[1], [x1], #8 +1: tbz x2, #1, 1f + ld1 {v15.s}[1], [x1], #4 +1: tbz x2, #0, 1f + ld1 {v15.h}[1], [x1], #2 +1: uxtl v14.8h, v15.8b + uxtl2 v15.8h, v15.16b + uzp1 v12.8h, v14.8h, v14.8h + uzp2 v13.8h, v14.8h, v14.8h + uzp1 v20.8h, v15.8h, v15.8h + uzp2 v21.8h, v15.8h, v15.8h + uxtl v12.4s, v12.4h + uxtl v13.4s, v13.4h + uxtl v20.4s, v20.4h + uxtl v21.4s, v21.4h + ucvtf v12.4s, v12.4s + ucvtf v13.4s, v13.4s + ucvtf v20.4s, v20.4s + ucvtf v21.4s, v21.4s + br x4 + +colormatrix_float_ldu3_end: + tbz x2, #2, 1f + ld4 {v20.b,v21.b,v22.b,v23.b}[4], [x1], #4 + ld4 {v20.b,v21.b,v22.b,v23.b}[5], [x1], #4 + ld4 {v20.b,v21.b,v22.b,v23.b}[6], [x1], #4 + ld4 {v20.b,v21.b,v22.b,v23.b}[7], [x1], #4 +1: tbz x2, #1, 1f + ld4 {v20.b,v21.b,v22.b,v23.b}[2], [x1], #4 + ld4 {v20.b,v21.b,v22.b,v23.b}[3], [x1], #4 +1: tbz x2, #0, 1f + ld4 {v20.b,v21.b,v22.b,v23.b}[1], [x1], #4 +1: uxtl v20.8h, v20.8b + uxtl v21.8h, v21.8b + uxtl v22.8h, v22.8b + uxtl v12.4s, v20.4h + uxtl v13.4s, v21.4h + uxtl v14.4s, v22.4h + uxtl2 v20.4s, v20.8h + uxtl2 v21.4s, v21.8h + uxtl2 v22.4s, v22.8h + ucvtf v12.4s, v12.4s + ucvtf v13.4s, v13.4s + ucvtf v14.4s, v14.4s + ucvtf v20.4s, v20.4s + ucvtf v21.4s, v21.4s + ucvtf v22.4s, v22.4s + br x4 + +colormatrix_float_ldu4_end: + tbz x2, #2, 1f + ld4 {v20.b,v21.b,v22.b,v23.b}[4], [x1], #4 + ld4 {v20.b,v21.b,v22.b,v23.b}[5], [x1], #4 + ld4 {v20.b,v21.b,v22.b,v23.b}[6], [x1], #4 + ld4 {v20.b,v21.b,v22.b,v23.b}[7], [x1], #4 +1: tbz x2, #1, 1f + ld4 {v20.b,v21.b,v22.b,v23.b}[2], [x1], #4 + ld4 {v20.b,v21.b,v22.b,v23.b}[3], [x1], #4 +1: tbz x2, #0, 1f + ld4 {v20.b,v21.b,v22.b,v23.b}[1], [x1], #4 +1: uxtl v20.8h, v20.8b + uxtl v21.8h, v21.8b + uxtl v22.8h, v22.8b + uxtl v23.8h, v23.8b + uxtl v12.4s, v20.4h + uxtl v13.4s, v21.4h + uxtl v14.4s, v22.4h + uxtl v15.4s, v23.4h + uxtl2 v20.4s, v20.8h + uxtl2 v21.4s, v21.8h + uxtl2 v22.4s, v22.8h + uxtl2 v23.4s, v23.8h + ucvtf v12.4s, v12.4s + ucvtf v13.4s, v13.4s + ucvtf v14.4s, v14.4s + ucvtf v15.4s, v15.4s + ucvtf v20.4s, v20.4s + ucvtf v21.4s, v21.4s + ucvtf v22.4s, v22.4s + ucvtf v23.4s, v23.4s + br x4 + +colormatrix_float_ldf1_end: + tbz x2, #2, 1f + ld1 {v20.4s}, [x1], #16 +1: tbz x2, #1, 1f + ld1 {v12.d}[1], [x1], #8 +1: tbz x2, #0, 1f + ld1 {v12.s}[1], [x1], #4 +1: br x4 + +colormatrix_float_ldf2_end: + tbz x2, #2, 1f + ld2 {v20.4s,v21.4s}, [x1], #32 +1: tbz x2, #1, 1f + ld2 {v12.s,v13.s}[2], [x1], #8 + ld2 {v12.s,v13.s}[3], [x1], #8 +1: tbz x2, #0, 1f + ld2 {v12.s,v13.s}[1], [x1], #8 +1: br x4 + +colormatrix_float_ldf3_end: +colormatrix_float_ldf4_end: + tbz x2, #2, 1f + ld4 {v20.4s,v21.4s,v22.4s,v23.4s}, [x1], #64 +1: tbz x2, #1, 1f + ld4 {v12.s,v13.s,v14.s,v15.s}[2], [x1], #16 + ld4 {v12.s,v13.s,v14.s,v15.s}[3], [x1], #16 +1: tbz x2, #0, 1f + ld4 {v12.s,v13.s,v14.s,v15.s}[1], [x1], #16 +1: br x4 + +/* void rsdIntrinsicColorMatrix_int_K( + * void *out, // x0 + * void const *in, // x1 + * size_t count, // x2 + * fntab_t const *fns, // x3 + * int16_t 
const *mult, // x4 + * int32_t const *add); // x5 + */ +ENTRY(rsdIntrinsicColorMatrix_int_K) + sub x7, sp, #32 + sub sp, sp, #64 + st1 {v8.1d-v11.1d}, [sp] + st1 {v12.1d-v15.1d}, [x7] + + ld1 {v0.8h,v1.8h}, [x4], #32 + ld1 {v4.4s}, [x5], #16 + + ldp x4,x5, [x3],#16 + ldp x6,x7, [x3],#16 + ldp x8,x9, [x3],#16 + + dup v12.4s, v4.s[0] + dup v13.4s, v4.s[1] + dup v14.4s, v4.s[2] + dup v15.4s, v4.s[3] + sqshrun v8.4h, v12.4s, #8 + sqshrun2 v8.8h, v12.4s, #8 + sqshrun v9.4h, v13.4s, #8 + sqshrun2 v9.8h, v13.4s, #8 + sqshrun v10.4h, v14.4s, #8 + sqshrun2 v10.8h, v14.4s, #8 + sqshrun v11.4h, v15.4s, #8 + sqshrun2 v11.8h, v15.4s, #8 + + subs x2, x2, #8 + blo colormatrix_int_end + br x9 + +colormatrix_int_end: + adds x2, x2, #8 + bls colormatrix_int_realend + mov x16, x8 + ldp x8, x9, [x3], #16 + cmp x4, x16 + csel x4, x8, x4, eq + cmp x5, x16 + csel x5, x8, x5, eq + cmp x6, x16 + csel x6, x8, x6, eq + cmp x7, x16 + csel x7, x8, x7, eq + br x9 + +colormatrix_int_realend: + ld1 {v8.1d-v11.1d}, [sp], #32 + ld1 {v12.1d-v15.1d}, [sp], #32 + ret +END(rsdIntrinsicColorMatrix_int_K) + +/* void rsdIntrinsicColorMatrixSetup_int_K( + * fntab_t const *fns, // x0 + * uint32_t mask, // x1 + * int dt, // x2 + * int st); // x3 + */ +ENTRY(rsdIntrinsicColorMatrixSetup_int_K) + adrp x7, 2f + add x7, x7, :lo12:2f + add x4, x7, x2, LSL #3 + ldrsw x2, [x4], #4 + ldrsw x4, [x4] + add x2, x2, x7 + add x4, x4, x7 + adrp x7, 3f + add x7, x7, :lo12:3f + add x5, x7, x3, LSL #3 + ldrsw x3, [x5], #4 + ldrsw x5, [x5] + add x3, x3, x7 + add x5, x5, x7 + stp x2, x3, [x0, #32] + stp x4, x5, [x0, #48] + +/* For each column function, if the matrix is all zeroes then write NULL, + * otherwise look up the appropriate function and store that. */ + + mov x3, #4 + adrp x7, 4f + add x7, x7, :lo12:4f +1: ands x2, x1, #15 + beq 9f + and x2, x1, #31 + lsl x2, x2, #4 + ldrsw x2, [x7, x2] + add x2, x2, x7 +9: str x2, [x0], #8 + lsr x1, x1, #5 + add x7, x7, #4 + subs x3, x3, #1 + bne 1b + +/* For every NULL entry, copy the non-NULL entry that follows it, or the store + * function. */ + + ldr x2, [x0] + mov x3, #4 +1: ldr x1, [x0, #-8]! 
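+/* e.g. if only columns 0 and 2 have non-zero coefficients, the table ends up
+ * as {col0, col2, col2, store}, so column 0 chains straight to column 2 and
+ * column 2 straight to the store routine. */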
+ cmp x1, #0 + csel x2, x1, x2, ne + str x2, [x0] + subs x3, x3, #1 + bne 1b + ret + +END(rsdIntrinsicColorMatrixSetup_int_K) +.rodata + .align 4 +2: .word colormatrix_int_stu1-2b + .word colormatrix_int_stu1_end-2b + .word colormatrix_int_stu2-2b + .word colormatrix_int_stu2_end-2b + .word colormatrix_int_stu3-2b + .word colormatrix_int_stu3_end-2b + .word colormatrix_int_stu4-2b + .word colormatrix_int_stu4_end-2b +3: .word colormatrix_int_ldu1-3b + .word colormatrix_int_ldu1_end-3b + .word colormatrix_int_ldu2-3b + .word colormatrix_int_ldu2_end-3b + .word colormatrix_int_ldu3-3b + .word colormatrix_int_ldu3_end-3b + .word colormatrix_int_ldu4-3b + .word colormatrix_int_ldu4_end-3b +4: +.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + .word colormatrix_int_col0_\i-4b + .word colormatrix_int_col1_\i-4b-4 + .word colormatrix_int_col2_\i-4b-8 + .word colormatrix_int_col3_\i-4b-12 +.endr +.irp i, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 + .word colormatrix_int_col0_n\i-4b + .word colormatrix_int_col1_n\i-4b-4 + .word colormatrix_int_col2_n\i-4b-8 + .word colormatrix_int_col3_n\i-4b-12 +.endr + + +/* void rsdIntrinsicColorMatrix_float_K( + * void *out, // x0 + * void const *in, // x1 + * size_t count, // x2 + * fntab_t const *fns, // x3 + * float const *mult, // x4 + * float const *add); // x5 + */ +ENTRY(rsdIntrinsicColorMatrix_float_K) + sub x7, sp, #32 + sub sp, sp, #64 + st1 {v8.1d-v11.1d}, [sp] + st1 {v12.1d-v15.1d}, [x7] + + ld1 {v0.4s,v1.4s,v2.4s,v3.4s}, [x4], #64 + ld1r {v4.4s}, [x5], #4 + ld1r {v5.4s}, [x5], #4 + ld1r {v6.4s}, [x5], #4 + ld1r {v7.4s}, [x5], #4 + + ldp x4,x5, [x3], #16 + ldp x6,x7, [x3], #16 + ldp x8,x9, [x3], #16 + + mov v8.16b, v4.16b + mov v9.16b, v5.16b + mov v10.16b, v6.16b + mov v11.16b, v7.16b + + mov v16.16b, v4.16b + mov v17.16b, v5.16b + mov v18.16b, v6.16b + mov v19.16b, v7.16b + + subs x2, x2, #8 + blo colormatrix_float_end + br x9 + +colormatrix_float_end: + adds x2, x2, #8 + bls colormatrix_int_realend + mov x16, x8 + ldp x8,x9, [x3], #16 + cmp x4, x16 + csel x4, x8, x4, eq + cmp x5, x16 + csel x5, x8, x5, eq + cmp x6, x16 + csel x6, x8, x6, eq + cmp x7, x16 + csel x7, x8, x7, eq + br x9 + +colormatrix_float_realend: + ld1 {v8.1d-v11.1d}, [sp], #32 + ld1 {v12.1d-v15.1d}, [sp], #32 + ret +END(rsdIntrinsicColorMatrix_float_K) + +/* void rsdIntrinsicColorMatrixSetup_float_K( + * fntab_t const *fns, // x0 + * uint32_t mask, // x1 + * int dt, // x2 + * int st); // x3 + */ +ENTRY(rsdIntrinsicColorMatrixSetup_float_K) + adrp x7, 2f + add x7, x7, :lo12:2f + add x4, x7, x2, LSL #3 + ldrsw x2, [x4], #4 + ldrsw x4, [x4] + add x2, x2, x7 + add x4, x4, x7 + adrp x7, 3f + add x7, x7, :lo12:3f + add x5, x7, x3, LSL #3 + ldrsw x3, [x5], #4 + ldrsw x5, [x5] + add x3, x3, x7 + add x5, x5, x7 + stp x2, x3, [x0, #32] + stp x4, x5, [x0, #48] + +/* For each column function, if the matrix is all zeroes then write NULL, + * otherwise look up the appropriate function and store that. */ + + mov x3, #4 + adrp x7, 4f + add x7, x7, :lo12:4f +1: ands x2, x1, #15 + beq 9f + and x2, x1, #31 + lsl x2, x2, #4 + ldrsw x2, [x7, x2] + add x2, x2, x7 +9: str x2, [x0], #8 + lsr x1, x1, #5 + add x7, x7, #4 + subs x3, x3, #1 + bne 1b + +/* For every NULL entry, copy the non-NULL entry that follows it, or the store + * function. */ + + ldr x2, [x0] + mov x3, #4 +1: ldr x1, [x0, #-8]! 
+ cmp x1, #0 + csel x2, x1, x2, ne + str x2, [x0] + subs x3, x3, #1 + bne 1b + ret + +END(rsdIntrinsicColorMatrixSetup_float_K) +.rodata + .align 4 +2: .word colormatrix_float_stu1-2b + .word colormatrix_float_stu1_end-2b + .word colormatrix_float_stu2-2b + .word colormatrix_float_stu2_end-2b + .word colormatrix_float_stu3-2b + .word colormatrix_float_stu3_end-2b + .word colormatrix_float_stu4-2b + .word colormatrix_float_stu4_end-2b + .word colormatrix_float_stf1-2b + .word colormatrix_float_stf1_end-2b + .word colormatrix_float_stf2-2b + .word colormatrix_float_stf2_end-2b + .word colormatrix_float_stf3-2b + .word colormatrix_float_stf3_end-2b + .word colormatrix_float_stf4-2b + .word colormatrix_float_stf4_end-2b +3: .word colormatrix_float_ldu1-3b + .word colormatrix_float_ldu1_end-3b + .word colormatrix_float_ldu2-3b + .word colormatrix_float_ldu2_end-3b + .word colormatrix_float_ldu3-3b + .word colormatrix_float_ldu3_end-3b + .word colormatrix_float_ldu4-3b + .word colormatrix_float_ldu4_end-3b + .word colormatrix_float_ldf1-3b + .word colormatrix_float_ldf1_end-3b + .word colormatrix_float_ldf2-3b + .word colormatrix_float_ldf2_end-3b + .word colormatrix_float_ldf3-3b + .word colormatrix_float_ldf3_end-3b + .word colormatrix_float_ldf4-3b + .word colormatrix_float_ldf4_end-3b +4: +.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + .word colormatrix_float_col0_\i-4b + .word colormatrix_float_col1_\i-4b-4 + .word colormatrix_float_col2_\i-4b-8 + .word colormatrix_float_col3_\i-4b-12 +.endr +.irp i, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 + .word colormatrix_float_col0_n\i-4b + .word colormatrix_float_col1_n\i-4b-4 + .word colormatrix_float_col2_n\i-4b-8 + .word colormatrix_float_col3_n\i-4b-12 +.endr diff --git a/renderscript-toolkit/src/main/cpp/ColorMatrix_neon.S b/renderscript-toolkit/src/main/cpp/ColorMatrix_neon.S new file mode 100644 index 0000000..ecb8c13 --- /dev/null +++ b/renderscript-toolkit/src/main/cpp/ColorMatrix_neon.S @@ -0,0 +1,361 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#define SNIP_START(x) \ + .globl x; x: + +#define SNIP_END(x) \ + .globl x##_end; x##_end: \ + .globl x##_len; x##_len: \ + .word x##_end-x + +SNIP_START(_N_ColorMatrix_prefix_i) + stmfd sp!, {r4, lr} + vpush {q4-q7} + vld1.16 {q2}, [r2]! + vld1.16 {q3}, [r2]! + vld1.32 {d8[],d9[]}, [r2]! + vld1.32 {d10[],d11[]}, [r2]! + vld1.32 {d12[],d13[]}, [r2]! + vld1.32 {d14[],d15[]}, [r2]! + veor q0, q0 + veor q1, q1 + veor q9, q9 + veor q10, q10 + veor q11, q11 +SNIP_END(_N_ColorMatrix_prefix_i) + +SNIP_START(_N_ColorMatrix_prefix_f) + stmfd sp!, {r4, lr} + vpush {q4-q7} + add r2, #48 + vld1.32 {q4}, [r2]! + vld1.32 {q5}, [r2]! + vld1.32 {q6}, [r2]! + vld1.32 {q7}, [r2]! + vld1.32 {d16[],d17[]}, [r2]! + vld1.32 {d18[],d19[]}, [r2]! + vld1.32 {d20[],d21[]}, [r2]! + vld1.32 {d22[],d23[]}, [r2]! 
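+    /* q4-q7 now hold the sixteen float matrix coefficients (one q register per
+       input channel) and q8-q11 the broadcast add values. */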
+ veor q1, q1 + veor q2, q2 + veor q3, q3 +SNIP_END(_N_ColorMatrix_prefix_f) + +SNIP_START(_N_ColorMatrix_postfix1) + subs r3, r3, #1 + #bne 1b +SNIP_END(_N_ColorMatrix_postfix1) + +SNIP_START(_N_ColorMatrix_postfix2) + + #mov r0, #0 + #ldr r0, [r0] + + #vqadd.s32 q0,q0,q0 + #vadd.f32 q0,q0,q0 + #vmul.f32 q0,q0,d0[0] + #vmla.f32 q0,q0,d0[0] + #vmov q0, q0 + + + vpop {q4-q7} + ldmfd sp!, {r4, lr} + bx lr +SNIP_END(_N_ColorMatrix_postfix2) + +SNIP_START(_N_ColorMatrix_load_u8_4) + vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]! + vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]! + vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]! + vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]! +SNIP_END(_N_ColorMatrix_load_u8_4) + +SNIP_START(_N_ColorMatrix_load_u8_3) + vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]! + vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]! + vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]! + vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]! + veor d3, d3 +SNIP_END(_N_ColorMatrix_load_u8_3) + +SNIP_START(_N_ColorMatrix_load_u8_2) + vld2.8 {d0[0],d1[0]}, [r1]! + vld2.8 {d0[1],d1[1]}, [r1]! + vld2.8 {d0[2],d1[2]}, [r1]! + vld2.8 {d0[3],d1[3]}, [r1]! + veor d2, d2 + veor d3, d3 +SNIP_END(_N_ColorMatrix_load_u8_2) + +SNIP_START(_N_ColorMatrix_load_u8_1) + vld1.32 {d0[0]}, [r1]! + veor d1, d1 + veor d2, d2 + veor d3, d3 +SNIP_END(_N_ColorMatrix_load_u8_1) + +SNIP_START(_N_ColorMatrix_load_u8f_4) + vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]! + vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]! + vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]! + vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]! + vmovl.u8 q3, d3 + vmovl.u8 q2, d2 + vmovl.u8 q1, d1 + vmovl.u8 q0, d0 + vmovl.u16 q3, d6 + vmovl.u16 q2, d4 + vmovl.u16 q1, d2 + vmovl.u16 q0, d0 + vcvt.f32.s32 q3, q3 + vcvt.f32.s32 q2, q2 + vcvt.f32.s32 q1, q1 + vcvt.f32.s32 q0, q0 +SNIP_END(_N_ColorMatrix_load_u8f_4) + +SNIP_START(_N_ColorMatrix_load_u8f_3) + vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [r1]! + vld4.8 {d0[1],d1[1],d2[1],d3[1]}, [r1]! + vld4.8 {d0[2],d1[2],d2[2],d3[2]}, [r1]! + vld4.8 {d0[3],d1[3],d2[3],d3[3]}, [r1]! + vmovl.u8 q2, d2 + vmovl.u8 q1, d1 + vmovl.u8 q0, d0 + vmovl.u16 q2, d4 + vmovl.u16 q1, d2 + vmovl.u16 q0, d0 + vcvt.f32.s32 q2, q2 + vcvt.f32.s32 q1, q1 + vcvt.f32.s32 q0, q0 + veor q3, q3 +SNIP_END(_N_ColorMatrix_load_u8f_3) + +SNIP_START(_N_ColorMatrix_load_u8f_2) + vld2.8 {d0[0],d1[0]}, [r1]! + vld2.8 {d0[1],d1[1]}, [r1]! + vld2.8 {d0[2],d1[2]}, [r1]! + vld2.8 {d0[3],d1[3]}, [r1]! + vmovl.u8 q1, d1 + vmovl.u8 q0, d0 + vmovl.u16 q1, d2 + vmovl.u16 q0, d0 + vcvt.f32.s32 q1, q1 + vcvt.f32.s32 q0, q0 + veor q2, q2 + veor q3, q3 +SNIP_END(_N_ColorMatrix_load_u8f_2) + +SNIP_START(_N_ColorMatrix_load_u8f_1) + vld1.32 {d0[0]}, [r1]! + vmovl.u8 q0, d0 + vmovl.u16 q0, d0 + vcvt.f32.s32 q0, q0 + veor q1, q1 + veor q2, q2 + veor q3, q3 +SNIP_END(_N_ColorMatrix_load_u8f_1) + +SNIP_START(_N_ColorMatrix_load_f32_4) + vld4.32 {d0[0],d2[0],d4[0],d6[0]}, [r1]! + vld4.32 {d0[1],d2[1],d4[1],d6[1]}, [r1]! + vld4.32 {d1[0],d3[0],d5[0],d7[0]}, [r1]! + vld4.32 {d1[1],d3[1],d5[1],d7[1]}, [r1]! +SNIP_END(_N_ColorMatrix_load_f32_4) + +SNIP_START(_N_ColorMatrix_load_f32_3) + vld3.32 {d0[0],d2[0],d4[0]}, [r1]! + add r1, r1, #4 + vld3.32 {d0[1],d2[1],d4[1]}, [r1]! + add r1, r1, #4 + vld3.32 {d1[0],d3[0],d5[0]}, [r1]! + add r1, r1, #4 + vld3.32 {d1[1],d3[1],d5[1]}, [r1]! + add r1, r1, #4 + veor q3, q3 +SNIP_END(_N_ColorMatrix_load_f32_3) + +SNIP_START(_N_ColorMatrix_load_f32_2) + vld2.32 {d0[0],d2[0]}, [r1]! + vld2.32 {d0[1],d2[1]}, [r1]! + vld2.32 {d1[0],d3[0]}, [r1]! + vld2.32 {d1[1],d3[1]}, [r1]! 
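+    /* a float2 source has no z or w, so those channels are fed to the matrix
+       as zeroes */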
+ veor q2, q2 + veor q3, q3 +SNIP_END(_N_ColorMatrix_load_f32_2) + +SNIP_START(_N_ColorMatrix_load_f32_1) + vld1.32 {q0}, [r1]! + veor q1, q1 + veor q2, q2 + veor q3, q3 +SNIP_END(_N_ColorMatrix_load_f32_1) + + +SNIP_START(_N_ColorMatrix_store_u8_4) +#mov r0, #0 + vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]! + vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]! + vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]! + vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]! +SNIP_END(_N_ColorMatrix_store_u8_4) + +SNIP_START(_N_ColorMatrix_store_u8_2) + vst2.8 {d0[0],d1[0]}, [r0]! + vst2.8 {d0[1],d1[1]}, [r0]! + vst2.8 {d0[2],d1[2]}, [r0]! + vst2.8 {d0[3],d1[3]}, [r0]! +SNIP_END(_N_ColorMatrix_store_u8_2) + +SNIP_START(_N_ColorMatrix_store_u8_1) + vst1.32 {d0[0]}, [r0]! +SNIP_END(_N_ColorMatrix_store_u8_1) + + +SNIP_START(_N_ColorMatrix_store_f32u_4) + vcvt.s32.f32 q0, q0 + vcvt.s32.f32 q1, q1 + vcvt.s32.f32 q2, q2 + vcvt.s32.f32 q3, q3 + vqmovn.s32 d0, q0 + vqmovn.s32 d2, q1 + vqmovn.s32 d4, q2 + vqmovn.s32 d6, q3 + vqmovun.s16 d0, q0 + vqmovun.s16 d1, q1 + vqmovun.s16 d2, q2 + vqmovun.s16 d3, q3 + vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [r0]! + vst4.8 {d0[1],d1[1],d2[1],d3[1]}, [r0]! + vst4.8 {d0[2],d1[2],d2[2],d3[2]}, [r0]! + vst4.8 {d0[3],d1[3],d2[3],d3[3]}, [r0]! + + #mov r0, #0 + #ldr r0, [r0] + +SNIP_END(_N_ColorMatrix_store_f32u_4) + +SNIP_START(_N_ColorMatrix_store_f32u_2) + vcvt.s32.f32 q0, q0 + vcvt.s32.f32 q1, q1 + vqmovn.s32 d0, q0 + vqmovn.s32 d2, q1 + vqmovun.s16 d0, q0 + vqmovun.s16 d1, q1 + vst2.8 {d0[0],d1[0]}, [r0]! + vst2.8 {d0[1],d1[1]}, [r0]! + vst2.8 {d0[2],d1[2]}, [r0]! + vst2.8 {d0[3],d1[3]}, [r0]! +SNIP_END(_N_ColorMatrix_store_f32u_2) + +SNIP_START(_N_ColorMatrix_store_f32u_1) + vcvt.s32.f32 q0, q0 + vqmovn.s32 d0, q0 + vqmovun.s16 d0, q0 + vst1.32 {d0[0]}, [r0]! +SNIP_END(_N_ColorMatrix_store_f32u_1) + +SNIP_START(_N_ColorMatrix_store_f32_4) + vst4.32 {d0[0],d2[0],d4[0],d6[0]}, [r0]! + vst4.32 {d0[1],d2[1],d4[1],d6[1]}, [r0]! + vst4.32 {d1[0],d3[0],d5[0],d7[0]}, [r0]! + vst4.32 {d1[1],d3[1],d5[1],d7[1]}, [r0]! +SNIP_END(_N_ColorMatrix_store_f32_4) + +SNIP_START(_N_ColorMatrix_store_f32_3) + vst4.32 {d0[0],d2[0],d4[0],d6[0]}, [r0]! + vst4.32 {d0[1],d2[1],d4[1],d6[1]}, [r0]! + vst4.32 {d1[0],d3[0],d5[0],d7[0]}, [r0]! + vst4.32 {d1[1],d3[1],d5[1],d7[1]}, [r0]! +SNIP_END(_N_ColorMatrix_store_f32_3) + +SNIP_START(_N_ColorMatrix_store_f32_2) + vst2.32 {d0[0],d2[0]}, [r0]! + vst2.32 {d0[1],d2[1]}, [r0]! + vst2.32 {d1[0],d3[0]}, [r0]! + vst2.32 {d1[1],d3[1]}, [r0]! +SNIP_END(_N_ColorMatrix_store_f32_2) + +SNIP_START(_N_ColorMatrix_store_f32_1) + vst1.32 {q0}, [r0]! 
+SNIP_END(_N_ColorMatrix_store_f32_1) + + +SNIP_START(_N_ColorMatrix_unpack_u8_4) + vmovl.u8 q12, d0 /* R */ + vmovl.u8 q13, d1 /* G */ + vmovl.u8 q14, d2 /* B */ + vmovl.u8 q15, d3 /* A */ +SNIP_END(_N_ColorMatrix_unpack_u8_4) + +SNIP_START(_N_ColorMatrix_unpack_u8_3) + vmovl.u8 q12, d0 /* R */ + vmovl.u8 q13, d1 /* G */ + vmovl.u8 q14, d2 /* B */ + veor q15, q15 +SNIP_END(_N_ColorMatrix_unpack_u8_3) + +SNIP_START(_N_ColorMatrix_unpack_u8_2) + vmovl.u8 q12, d0 /* R */ + vmovl.u8 q13, d1 /* G */ + veor q14, q14 + veor q15, q15 +SNIP_END(_N_ColorMatrix_unpack_u8_2) + +SNIP_START(_N_ColorMatrix_unpack_u8_1) + vmovl.u8 q12, d0 /* R */ + veor q13, q13 + veor q14, q14 + veor q15, q15 +SNIP_END(_N_ColorMatrix_unpack_u8_1) + +SNIP_START(_N_ColorMatrix_pack_u8_4) + vqrshrn.s32 d24, q8, #8 + vqrshrn.s32 d26, q9, #8 + vqrshrn.s32 d28, q10, #8 + vqrshrn.s32 d30, q11, #8 + vqmovun.s16 d0, q12 + vqmovun.s16 d1, q13 + vqmovun.s16 d2, q14 + vqmovun.s16 d3, q15 +SNIP_END(_N_ColorMatrix_pack_u8_4) + +SNIP_START(_N_ColorMatrix_pack_u8_3) + vqrshrn.s32 d24, q8, #8 + vqrshrn.s32 d26, q9, #8 + vqrshrn.s32 d28, q10, #8 + vqmovun.s16 d0, q12 + vqmovun.s16 d1, q13 + vqmovun.s16 d2, q14 +SNIP_END(_N_ColorMatrix_pack_u8_3) + +SNIP_START(_N_ColorMatrix_pack_u8_2) + vqrshrn.s32 d24, q8, #8 + vqrshrn.s32 d26, q9, #8 + vqmovun.s16 d0, q12 + vqmovun.s16 d1, q13 +SNIP_END(_N_ColorMatrix_pack_u8_2) + +SNIP_START(_N_ColorMatrix_pack_u8_1) + vqrshrn.s32 d24, q8, #8 + vqmovun.s16 d0, q12 +SNIP_END(_N_ColorMatrix_pack_u8_1) + +SNIP_START(_N_ColorMatrix_dot) + vmov.u8 d1, d0 + vmov.u8 d2, d0 +SNIP_END(_N_ColorMatrix_dot) + diff --git a/renderscript-toolkit/src/main/cpp/Convolve3x3.cpp b/renderscript-toolkit/src/main/cpp/Convolve3x3.cpp new file mode 100644 index 0000000..8dd9935 --- /dev/null +++ b/renderscript-toolkit/src/main/cpp/Convolve3x3.cpp @@ -0,0 +1,262 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <cstdint> + +#include "RenderScriptToolkit.h" +#include "TaskProcessor.h" +#include "Utils.h" + +#define LOG_TAG "renderscript.toolkit.Convolve3x3" + +namespace renderscript { + +extern "C" void rsdIntrinsicConvolve3x3_K(void* dst, const void* y0, const void* y1, const void* y2, + const int16_t* coef, uint32_t count); + +class Convolve3x3Task : public Task { + const void* mIn; + void* mOut; + // Even though we have exactly 9 coefficients, store them in an array of size 16 so that + // the SIMD instructions can load them in chunks multiple of 8. + float mFp[16]; + int16_t mIp[16]; + + void kernelU4(uchar* out, uint32_t xstart, uint32_t xend, const uchar* py0, const uchar* py1, + const uchar* py2); + void convolveU4(const uchar* pin, uchar* pout, size_t vectorSize, size_t sizeX, size_t sizeY, + size_t startX, size_t startY, size_t endX, size_t endY); + + // Process a 2D tile of the overall work. threadIndex identifies which thread does the work. 
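+    // Tiles write disjoint regions of the output and only read the input, so
+    // separate threads can process them without synchronization.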
+ void processData(int threadIndex, size_t startX, size_t startY, size_t endX, + size_t endY) override; + + public: + Convolve3x3Task(const void* in, void* out, size_t vectorSize, size_t sizeX, size_t sizeY, + const float* coefficients, const Restriction* restriction) + : Task{sizeX, sizeY, vectorSize, false, restriction}, mIn{in}, mOut{out} { + for (int ct = 0; ct < 9; ct++) { + mFp[ct] = coefficients[ct]; + if (mFp[ct] >= 0) { + mIp[ct] = (int16_t)(mFp[ct] * 256.f + 0.5f); + } else { + mIp[ct] = (int16_t)(mFp[ct] * 256.f - 0.5f); + } + } + } +}; + +/** + * Computes one convolution and stores the result in the output. This is used for uchar, uchar2, + * uchar3, and uchar4 vectors. + * + * @tparam InputOutputType Type of the input and output arrays. A vector type, e.g. uchar4. + * @tparam ComputationType Type we use for the intermediate computations. + * @param x The index in the row of the value we'll convolve. + * @param out The location in the output array where we store the value. + * @param py0 The start of the top row. + * @param py1 The start of the middle row. + * @param py2 The start of the bottom row. + * @param coeff Pointer to the float coefficients, in row major format. + * @param sizeX The number of cells of one row. + */ +template <typename InputOutputType, typename ComputationType> +static void convolveOneU(uint32_t x, InputOutputType* out, const InputOutputType* py0, + const InputOutputType* py1, const InputOutputType* py2, const float* coeff, + int32_t sizeX) { + uint32_t x1 = std::max((int32_t)x - 1, 0); + uint32_t x2 = std::min((int32_t)x + 1, sizeX - 1); + + ComputationType px = convert<ComputationType>(py0[x1]) * coeff[0] + + convert<ComputationType>(py0[x]) * coeff[1] + + convert<ComputationType>(py0[x2]) * coeff[2] + + convert<ComputationType>(py1[x1]) * coeff[3] + + convert<ComputationType>(py1[x]) * coeff[4] + + convert<ComputationType>(py1[x2]) * coeff[5] + + convert<ComputationType>(py2[x1]) * coeff[6] + + convert<ComputationType>(py2[x]) * coeff[7] + + convert<ComputationType>(py2[x2]) * coeff[8]; + + px = clamp(px + 0.5f, 0.f, 255.f); + *out = convert<InputOutputType>(px); +} + +#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT +/** + * Computes one convolution and stores the result in the output. This is used for float, float2, + * float3, and float4 vectors. + * + * @tparam InputOutputType Type of the input and output arrays. A vector type, e.g. float4. + * @param x The index in the row of the value we'll convolve. + * @param out The location in the output array where we store the value. + * @param py0 The start of the top row. + * @param py1 The start of the middle row. + * @param py2 The start of the bottom row. + * @param coeff Pointer to the float coefficients, in row major format. + * @param sizeX The number of cells of one row. + */ +template <typename InputOutputType> +static void ConvolveOneF(uint32_t x, InputOutputType* out, const InputOutputType* py0, + const InputOutputType* py1, const InputOutputType* py2, const float* coeff, + int32_t sizeX) { + uint32_t x1 = std::max((int32_t)x - 1, 0); + uint32_t x2 = std::min((int32_t)x + 1, sizeX - 1); + *out = (py0[x1] * coeff[0]) + (py0[x] * coeff[1]) + (py0[x2] * coeff[2]) + + (py1[x1] * coeff[3]) + (py1[x] * coeff[4]) + (py1[x2] * coeff[5]) + + (py2[x1] * coeff[6]) + (py2[x] * coeff[7]) + (py2[x2] * coeff[8]); +} +#endif // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT + +/** + * This function convolves one line. + * + * @param pout Where to place the next output. 
+ * @param xstart Index in the X direction of where to start. + * @param xend End index + * @param ppy0 Points to the start of the previous line. + * @param ppy1 Points to the start of the current line. + * @param ppy2 Points to the start of the next line. + */ +void Convolve3x3Task::kernelU4(uchar* pout, uint32_t xstart, uint32_t xend, const uchar* ppy0, + const uchar* ppy1, const uchar* ppy2) { + uchar4* out = (uchar4*)pout; + const uchar4* py0 = (const uchar4*)ppy0; + const uchar4* py1 = (const uchar4*)ppy1; + const uchar4* py2 = (const uchar4*)ppy2; + + uint32_t x1 = xstart; + uint32_t x2 = xend; + if (x1 == 0) { + convolveOneU<uchar4, float4>(0, out, py0, py1, py2, mFp, mSizeX); + x1++; + out++; + } + + if (x2 > x1) { +#if defined(ARCH_ARM_USE_INTRINSICS) || defined(ARCH_X86_HAVE_SSSE3) + if (mUsesSimd) { + int32_t len = (x2 - x1 - 1) >> 1; + if (len > 0) { + rsdIntrinsicConvolve3x3_K(out, &py0[x1 - 1], &py1[x1 - 1], &py2[x1 - 1], mIp, len); + x1 += len << 1; + out += len << 1; + } + } +#endif + + while (x1 != x2) { + convolveOneU<uchar4, float4>(x1, out, py0, py1, py2, mFp, mSizeX); + out++; + x1++; + } + } +} + +#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT +template <typename T> +void RsdCpuScriptIntrinsicConvolve3x3_kernelF(void* in, T* out, uint32_t xstart, uint32_t xend, + uint32_t currentY, size_t sizeX, size_t sizeY, + size_t vectorSize, float* fp) { + const uchar* pin = (const uchar*)in; + const size_t stride = sizeX * vectorSize * 4; // float takes 4 bytes + + uint32_t y1 = std::min((int32_t)currentY + 1, (int32_t)(sizeY - 1)); + uint32_t y2 = std::max((int32_t)currentY - 1, 0); + const T* py0 = (const T*)(pin + stride * y2); + const T* py1 = (const T*)(pin + stride * currentY); + const T* py2 = (const T*)(pin + stride * y1); + + for (uint32_t x = xstart; x < xend; x++, out++) { + ConvolveOneF<T>(x, out, py0, py1, py2, fp, sizeX); + } +} +#endif // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT + +template <typename InputOutputType, typename ComputationType> +static void convolveU(const uchar* pin, uchar* pout, size_t vectorSize, size_t sizeX, size_t sizeY, + size_t startX, size_t startY, size_t endX, size_t endY, float* fp) { + const size_t stride = vectorSize * sizeX; + for (size_t y = startY; y < endY; y++) { + uint32_t y1 = std::min((int32_t)y + 1, (int32_t)(sizeY - 1)); + uint32_t y2 = std::max((int32_t)y - 1, 0); + + size_t offset = (y * sizeX + startX) * vectorSize; + InputOutputType* px = (InputOutputType*)(pout + offset); + InputOutputType* py0 = (InputOutputType*)(pin + stride * y2); + InputOutputType* py1 = (InputOutputType*)(pin + stride * y); + InputOutputType* py2 = (InputOutputType*)(pin + stride * y1); + for (uint32_t x = startX; x < endX; x++, px++) { + convolveOneU<InputOutputType, ComputationType>(x, px, py0, py1, py2, fp, sizeX); + } + } +} + +void Convolve3x3Task::convolveU4(const uchar* pin, uchar* pout, size_t vectorSize, size_t sizeX, + size_t sizeY, size_t startX, size_t startY, size_t endX, + size_t endY) { + const size_t stride = paddedSize(vectorSize) * sizeX; + for (size_t y = startY; y < endY; y++) { + uint32_t y1 = std::min((int32_t)y + 1, (int32_t)(sizeY - 1)); + uint32_t y2 = std::max((int32_t)y - 1, 0); + + size_t offset = (y * sizeX + startX) * paddedSize(vectorSize); + uchar* px = pout + offset; + const uchar* py0 = pin + stride * y2; + const uchar* py1 = pin + stride * y; + const uchar* py2 = pin + stride * y1; + kernelU4(px, startX, endX, py0, py1, py2); + } +} + +void Convolve3x3Task::processData(int /* threadIndex */, size_t startX, 
size_t startY, size_t endX, + size_t endY) { + // ALOGI("Thread %d start tile from (%zd, %zd) to (%zd, %zd)", threadIndex, startX, startY, + // endX, endY); + switch (mVectorSize) { + case 1: + convolveU<uchar, float>((const uchar*)mIn, (uchar*)mOut, mVectorSize, mSizeX, mSizeY, + startX, startY, endX, endY, mFp); + break; + case 2: + convolveU<uchar2, float2>((const uchar*)mIn, (uchar*)mOut, mVectorSize, mSizeX, mSizeY, + startX, startY, endX, endY, mFp); + break; + case 3: + case 4: + convolveU4((const uchar*)mIn, (uchar*)mOut, mVectorSize, mSizeX, mSizeY, startX, startY, + endX, endY); + break; + } +} + +void RenderScriptToolkit::convolve3x3(const void* in, void* out, size_t vectorSize, size_t sizeX, + size_t sizeY, const float* coefficients, + const Restriction* restriction) { +#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE + if (!validRestriction(LOG_TAG, sizeX, sizeY, restriction)) { + return; + } + if (vectorSize < 1 || vectorSize > 4) { + ALOGE("The vectorSize should be between 1 and 4. %zu provided.", vectorSize); + return; + } +#endif + + Convolve3x3Task task(in, out, vectorSize, sizeX, sizeY, coefficients, restriction); + processor->doTask(&task); +} + +} // namespace renderscript diff --git a/renderscript-toolkit/src/main/cpp/Convolve5x5.cpp b/renderscript-toolkit/src/main/cpp/Convolve5x5.cpp new file mode 100644 index 0000000..6731bf4 --- /dev/null +++ b/renderscript-toolkit/src/main/cpp/Convolve5x5.cpp @@ -0,0 +1,348 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <cstdint> + +#include "RenderScriptToolkit.h" +#include "TaskProcessor.h" +#include "Utils.h" + +namespace renderscript { + +#define LOG_TAG "renderscript.toolkit.Convolve5x5" + +extern "C" void rsdIntrinsicConvolve5x5_K(void* dst, const void* y0, const void* y1, const void* y2, + const void* y3, const void* y4, const int16_t* coef, + uint32_t count); + +class Convolve5x5Task : public Task { + const void* mIn; + void* mOut; + // Even though we have exactly 25 coefficients, store them in an array of size 28 so that + // the SIMD instructions can load them in three chunks of 8 and 1 of chunk of 4. + float mFp[28]; + int16_t mIp[28]; + + void kernelU4(uchar* out, uint32_t xstart, uint32_t xend, const uchar* py0, const uchar* py1, + const uchar* py2, const uchar* py3, const uchar* py4); + void convolveU4(const uchar* pin, uchar* pout, size_t vectorSize, size_t sizeX, size_t sizeY, + size_t startX, size_t startY, size_t endX, size_t endY); + + // Process a 2D tile of the overall work. threadIndex identifies which thread does the work. 
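+    // Same tiling as the 3x3 task, except that the 5x5 window reads up to two rows above and two below the tile, again clamped to the image bounds.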
+ void processData(int threadIndex, size_t startX, size_t startY, size_t endX, + size_t endY) override; + + public: + Convolve5x5Task(const void* in, void* out, size_t vectorSize, size_t sizeX, size_t sizeY, + const float* coefficients, const Restriction* restriction) + : Task{sizeX, sizeY, vectorSize, false, restriction}, mIn{in}, mOut{out} { + for (int ct = 0; ct < 25; ct++) { + mFp[ct] = coefficients[ct]; + if (mFp[ct] >= 0) { + mIp[ct] = (int16_t)(mFp[ct] * 256.f + 0.5f); + } else { + mIp[ct] = (int16_t)(mFp[ct] * 256.f - 0.5f); + } + } + } +}; + +template <typename InputOutputType, typename ComputationType> +static void ConvolveOneU(uint32_t x, InputOutputType* out, const InputOutputType* py0, + const InputOutputType* py1, const InputOutputType* py2, + const InputOutputType* py3, const InputOutputType* py4, const float* coeff, + int32_t width) { + uint32_t x0 = std::max((int32_t)x - 2, 0); + uint32_t x1 = std::max((int32_t)x - 1, 0); + uint32_t x2 = x; + uint32_t x3 = std::min((int32_t)x + 1, width - 1); + uint32_t x4 = std::min((int32_t)x + 2, width - 1); + + ComputationType px = convert<ComputationType>(py0[x0]) * coeff[0] + + convert<ComputationType>(py0[x1]) * coeff[1] + + convert<ComputationType>(py0[x2]) * coeff[2] + + convert<ComputationType>(py0[x3]) * coeff[3] + + convert<ComputationType>(py0[x4]) * coeff[4] + + + convert<ComputationType>(py1[x0]) * coeff[5] + + convert<ComputationType>(py1[x1]) * coeff[6] + + convert<ComputationType>(py1[x2]) * coeff[7] + + convert<ComputationType>(py1[x3]) * coeff[8] + + convert<ComputationType>(py1[x4]) * coeff[9] + + + convert<ComputationType>(py2[x0]) * coeff[10] + + convert<ComputationType>(py2[x1]) * coeff[11] + + convert<ComputationType>(py2[x2]) * coeff[12] + + convert<ComputationType>(py2[x3]) * coeff[13] + + convert<ComputationType>(py2[x4]) * coeff[14] + + + convert<ComputationType>(py3[x0]) * coeff[15] + + convert<ComputationType>(py3[x1]) * coeff[16] + + convert<ComputationType>(py3[x2]) * coeff[17] + + convert<ComputationType>(py3[x3]) * coeff[18] + + convert<ComputationType>(py3[x4]) * coeff[19] + + + convert<ComputationType>(py4[x0]) * coeff[20] + + convert<ComputationType>(py4[x1]) * coeff[21] + + convert<ComputationType>(py4[x2]) * coeff[22] + + convert<ComputationType>(py4[x3]) * coeff[23] + + convert<ComputationType>(py4[x4]) * coeff[24]; + px = clamp(px + 0.5f, 0.f, 255.f); + *out = convert<InputOutputType>(px); +} + +#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT +template <typename InputOutputType> +static void ConvolveOneF(uint32_t x, InputOutputType* out, const InputOutputType* py0, + const InputOutputType* py1, const InputOutputType* py2, + const InputOutputType* py3, const InputOutputType* py4, const float* coeff, + int32_t width) { + uint32_t x0 = std::max((int32_t)x - 2, 0); + uint32_t x1 = std::max((int32_t)x - 1, 0); + uint32_t x2 = x; + uint32_t x3 = std::min((int32_t)x + 1, width - 1); + uint32_t x4 = std::min((int32_t)x + 2, width - 1); + + InputOutputType px = py0[x0] * coeff[0] + py0[x1] * coeff[1] + py0[x2] * coeff[2] + + py0[x3] * coeff[3] + py0[x4] * coeff[4] + + + py1[x0] * coeff[5] + py1[x1] * coeff[6] + py1[x2] * coeff[7] + + py1[x3] * coeff[8] + py1[x4] * coeff[9] + + + py2[x0] * coeff[10] + py2[x1] * coeff[11] + py2[x2] * coeff[12] + + py2[x3] * coeff[13] + py2[x4] * coeff[14] + + + py3[x0] * coeff[15] + py3[x1] * coeff[16] + py3[x2] * coeff[17] + + py3[x3] * coeff[18] + py3[x4] * coeff[19] + + + py4[x0] * coeff[20] + py4[x1] * coeff[21] + py4[x2] * coeff[22] + + py4[x3] * coeff[23] + py4[x4] * 
coeff[24]; + *out = px; +} +#endif // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT + +/** + * This function convolves one line. + * + * @param pout Where to place the next output. + * @param xstart Index in the X direction of where to start. + * @param xend End index + * @param ppy0 Points to the start of the line two above. + * @param ppy1 Points to the start of the line one above. + * @param ppy2 Points to the start of the current line. + * @param ppy3 Points to the start of the line one below. + * @param ppy4 Points to the start of the line two below. + */ +void Convolve5x5Task::kernelU4(uchar* pout, uint32_t x1, uint32_t x2, const uchar* ppy0, + const uchar* ppy1, const uchar* ppy2, const uchar* ppy3, + const uchar* ppy4) { + uchar4* out = (uchar4*)pout; + const uchar4* py0 = (const uchar4*)ppy0; + const uchar4* py1 = (const uchar4*)ppy1; + const uchar4* py2 = (const uchar4*)ppy2; + const uchar4* py3 = (const uchar4*)ppy3; + const uchar4* py4 = (const uchar4*)ppy4; + + while ((x1 < x2) && (x1 < 2)) { + ConvolveOneU<uchar4, float4>(x1, out, py0, py1, py2, py3, py4, mFp, mSizeX); + out++; + x1++; + } +#if defined(ARCH_X86_HAVE_SSSE3) + // for x86 SIMD, require minimum of 7 elements (4 for SIMD, + // 3 for end boundary where x may hit the end boundary) + if (mUsesSimd && ((x1 + 6) < x2)) { + // subtract 3 for end boundary + uint32_t len = (x2 - x1 - 3) >> 2; + rsdIntrinsicConvolve5x5_K(out, py0 + x1 - 2, py1 + x1 - 2, py2 + x1 - 2, py3 + x1 - 2, + py4 + x1 - 2, mIp, len); + out += len << 2; + x1 += len << 2; + } +#endif + +#if defined(ARCH_ARM_USE_INTRINSICS) + if (mUsesSimd && ((x1 + 3) < x2)) { + uint32_t len = (x2 - x1 - 3) >> 1; + rsdIntrinsicConvolve5x5_K(out, py0 + x1 - 2, py1 + x1 - 2, py2 + x1 - 2, py3 + x1 - 2, + py4 + x1 - 2, mIp, len); + out += len << 1; + x1 += len << 1; + } +#endif + + while (x1 < x2) { + ConvolveOneU<uchar4, float4>(x1, out, py0, py1, py2, py3, py4, mFp, mSizeX); + out++; + x1++; + } +} + +#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT +// This will need more cleanup before it can be used. 
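+// (kernelF4 is not declared in Convolve5x5Task, and the bodies below use a ConvolveInfo type and bare sizeX/sizeY that the class does not provide, so this float path does not compile as written.)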
+void Convolve5x5Task::kernelF4(const ConvolveInfo* info, float4* out, + uint32_t xstart, uint32_t xend, uint32_t currentY) { + const uchar* pin = (const uchar*)info->in; + const size_t stride = info->stride; + + uint32_t y0 = std::max((int32_t)currentY - 2, 0); + uint32_t y1 = std::max((int32_t)currentY - 1, 0); + uint32_t y2 = currentY; + uint32_t y3 = std::min((int32_t)currentY + 1, sizeY); + uint32_t y4 = std::min((int32_t)currentY + 2, sizeY); + + const float4* py0 = (const float4*)(pin + stride * y0); + const float4* py1 = (const float4*)(pin + stride * y1); + const float4* py2 = (const float4*)(pin + stride * y2); + const float4* py3 = (const float4*)(pin + stride * y3); + const float4* py4 = (const float4*)(pin + stride * y4); + + for (uint32_t x = xstart; x < xend; x++, out++) { + ConvolveOneF<float4>(x, out, py0, py1, py2, py3, py4, mFp, sizeX); + } +} + +void RsdCpuScriptIntrinsicConvolve5x5_kernelF2(const ConvolveInfo* info, float2* out, + uint32_t xstart, uint32_t xend, uint32_t currentY) { + const uchar* pin = (const uchar*)info->in; + const size_t stride = info->stride; + + uint32_t y0 = std::max((int32_t)currentY - 2, 0); + uint32_t y1 = std::max((int32_t)currentY - 1, 0); + uint32_t y2 = currentY; + uint32_t y3 = std::min((int32_t)currentY + 1, sizeY); + uint32_t y4 = std::min((int32_t)currentY + 2, sizeY); + + const float2* py0 = (const float2*)(pin + stride * y0); + const float2* py1 = (const float2*)(pin + stride * y1); + const float2* py2 = (const float2*)(pin + stride * y2); + const float2* py3 = (const float2*)(pin + stride * y3); + const float2* py4 = (const float2*)(pin + stride * y4); + + for (uint32_t x = xstart; x < xend; x++, out++) { + ConvolveOneF<float2>(x, out, py0, py1, py2, py3, py4, mFp, sizeX); + } +} + +void RsdCpuScriptIntrinsicConvolve5x5_kernelF1(const ConvolveInfo* info, float* out, + uint32_t xstart, uint32_t xend, uint32_t currentY) { + const uchar* pin = (const uchar*)info->in; + const size_t stride = info->stride; + + uint32_t y0 = std::max((int32_t)currentY - 2, 0); + uint32_t y1 = std::max((int32_t)currentY - 1, 0); + uint32_t y2 = currentY; + uint32_t y3 = std::min((int32_t)currentY + 1, sizeY); + uint32_t y4 = std::min((int32_t)currentY + 2, sizeY); + + const float* py0 = (const float*)(pin + stride * y0); + const float* py1 = (const float*)(pin + stride * y1); + const float* py2 = (const float*)(pin + stride * y2); + const float* py3 = (const float*)(pin + stride * y3); + const float* py4 = (const float*)(pin + stride * y4); + + for (uint32_t x = xstart; x < xend; x++, out++) { + ConvolveOneF<float>(x, out, py0, py1, py2, py3, py4, mFp, sizeX); + } +} +#endif // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT + +template <typename InputOutputType, typename ComputationType> +static void convolveU(const uchar* pin, uchar* pout, size_t vectorSize, size_t sizeX, size_t sizeY, + size_t startX, size_t startY, size_t endX, size_t endY, float* mFp) { + const size_t stride = vectorSize * sizeX; + for (size_t y = startY; y < endY; y++) { + uint32_t y0 = std::max((int32_t)y - 2, 0); + uint32_t y1 = std::max((int32_t)y - 1, 0); + uint32_t y2 = y; + uint32_t y3 = std::min((int32_t)y + 1, (int32_t)(sizeY - 1)); + uint32_t y4 = std::min((int32_t)y + 2, (int32_t)(sizeY - 1)); + + size_t offset = (y * sizeX + startX) * vectorSize; + InputOutputType* px = (InputOutputType*)(pout + offset); + InputOutputType* py0 = (InputOutputType*)(pin + stride * y0); + InputOutputType* py1 = (InputOutputType*)(pin + stride * y1); + InputOutputType* py2 = 
(InputOutputType*)(pin + stride * y2); + InputOutputType* py3 = (InputOutputType*)(pin + stride * y3); + InputOutputType* py4 = (InputOutputType*)(pin + stride * y4); + for (uint32_t x = startX; x < endX; x++, px++) { + ConvolveOneU<InputOutputType, ComputationType>(x, px, py0, py1, py2, py3, py4, mFp, + sizeX); + } + } +} + +void Convolve5x5Task::convolveU4(const uchar* pin, uchar* pout, size_t vectorSize, size_t sizeX, + size_t sizeY, size_t startX, size_t startY, size_t endX, + size_t endY) { + const size_t stride = paddedSize(vectorSize) * sizeX; + for (size_t y = startY; y < endY; y++) { + uint32_t y0 = std::max((int32_t)y - 2, 0); + uint32_t y1 = std::max((int32_t)y - 1, 0); + uint32_t y2 = y; + uint32_t y3 = std::min((int32_t)y + 1, (int32_t)(sizeY - 1)); + uint32_t y4 = std::min((int32_t)y + 2, (int32_t)(sizeY - 1)); + + size_t offset = (y * sizeX + startX) * paddedSize(vectorSize); + uchar* px = pout + offset; + const uchar* py0 = pin + stride * y0; + const uchar* py1 = pin + stride * y1; + const uchar* py2 = pin + stride * y2; + const uchar* py3 = pin + stride * y3; + const uchar* py4 = pin + stride * y4; + kernelU4(px, startX, endX, py0, py1, py2, py3, py4); + } +} + +void Convolve5x5Task::processData(int /* threadIndex */, size_t startX, size_t startY, size_t endX, + size_t endY) { + // ALOGI("Thread %d start tile from (%zd, %zd) to (%zd, %zd)", threadIndex, startX, startY, + // endX, endY); + switch (mVectorSize) { + case 1: + convolveU<uchar, float>((const uchar*)mIn, (uchar*)mOut, mVectorSize, mSizeX, mSizeY, + startX, startY, endX, endY, mFp); + break; + case 2: + convolveU<uchar2, float2>((const uchar*)mIn, (uchar*)mOut, mVectorSize, mSizeX, mSizeY, + startX, startY, endX, endY, mFp); + break; + case 3: + case 4: + convolveU4((const uchar*)mIn, (uchar*)mOut, mVectorSize, mSizeX, mSizeY, startX, startY, + endX, endY); + break; + } +} + +void RenderScriptToolkit::convolve5x5(const void* in, void* out, size_t vectorSize, size_t sizeX, + size_t sizeY, const float* coefficients, + const Restriction* restriction) { +#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE + if (!validRestriction(LOG_TAG, sizeX, sizeY, restriction)) { + return; + } + if (vectorSize < 1 || vectorSize > 4) { + ALOGE("The vectorSize should be between 1 and 4. %zu provided.", vectorSize); + return; + } +#endif + + Convolve5x5Task task(in, out, vectorSize, sizeX, sizeY, coefficients, restriction); + processor->doTask(&task); +} + +} // namespace renderscript diff --git a/renderscript-toolkit/src/main/cpp/Convolve_advsimd.S b/renderscript-toolkit/src/main/cpp/Convolve_advsimd.S new file mode 100644 index 0000000..0daa0c5 --- /dev/null +++ b/renderscript-toolkit/src/main/cpp/Convolve_advsimd.S @@ -0,0 +1,265 @@ +/* + * Copyright (C) 2012,2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + x0 = dst + x1 = y0 base pointer + x2 = y1 base pointer + x3 = y2 base pointer + x4 = coeffs + x5 = length / 2 +*/ + +#define ENTRY(f) .text; .align 2; .globl f; .type f,#function; f: +#define END(f) .size f, .-f; + +ENTRY(rsdIntrinsicConvolve3x3_K) + sub x6, sp, #64 + sub sp, sp, #64 + st1 {v8.1d-v11.1d}, [x6], #32 + st1 {v12.1d-v15.1d}, [x6] + + /* Load the coefficients in the v0, v1 registers */ + ld1 {v0.8h, v1.8h}, [x4] + + /* Load the frequently used immediate in a register */ + mov x4, #8 + +1: + /* Load and post-increase the address by x4=#8 */ + ld1 {v13.16b}, [x1], x4 + ld1 {v14.16b}, [x2], x4 + ld1 {v15.16b}, [x3], x4 + + /* Signal memory for data that will be used in the loop after the next */ +// prfm PLDL1KEEP,[x1, x4] // TODO: test this +// prfm PLDL1KEEP,[x2, x4] // TODO: test this +// prfm PLDL1KEEP,[x3, x4] // TODO: test this + + uxtl v2.8h, v13.8b + uxtl2 v3.8h, v13.16b + uxtl v4.8h, v14.8b + uxtl2 v5.8h, v14.16b + uxtl v6.8h, v15.8b + uxtl2 v7.8h, v15.16b + +/* + The two pixel source array is + v2, v2hi, v3lo, v3hi + v4, v4hi, v5lo, v5hi + v6, v6hi, v7lo, v7hi +*/ + + smull v8.4s, v2.4h, v0.h[0] + smull2 v9.4s, v2.8h, v0.h[0] + smlal2 v8.4s, v2.8h, v0.h[1] + smlal v9.4s, v3.4h, v0.h[1] + smlal v8.4s, v3.4h, v0.h[2] + smlal2 v9.4s, v3.8h, v0.h[2] + smlal v8.4s, v4.4h, v0.h[3] + smlal2 v9.4s, v4.8h, v0.h[3] + smlal2 v8.4s, v4.8h, v0.h[4] + smlal v9.4s, v5.4h, v0.h[4] + smlal v8.4s, v5.4h, v0.h[5] + smlal2 v9.4s, v5.8h, v0.h[5] + smlal v8.4s, v6.4h, v0.h[6] + smlal2 v9.4s, v6.8h, v0.h[6] + smlal2 v8.4s, v6.8h, v0.h[7] + smlal v9.4s, v7.4h, v0.h[7] + smlal v8.4s, v7.4h, v1.h[0] + smlal2 v9.4s, v7.8h, v1.h[0] + + shrn v8.4h, v8.4s, #8 + shrn2 v8.8h, v9.4s, #8 + + sqxtun v8.8b, v8.8h + st1 {v8.8b}, [x0], #8 + + /* Are we done yet? */ + subs x5, x5, #1 + bne 1b + + /* We're done, bye! 
*/ + ld1 {v8.1d-v11.1d}, [sp], #32 + ld1 {v12.1d-v15.1d}, [sp], #32 + ret +END(rsdIntrinsicConvolve3x3_K) + + +/* Convolve 5x5 */ + +/* + x0 = dst + x1 = y0 base pointer + x2 = y1 base pointer + x3 = y2 base pointer + x4 = y3 base pointer + x5 = y4 base pointer + x6 = coeffs + x7 = length +*/ +ENTRY(rsdIntrinsicConvolve5x5_K) + sub x8, sp, #64 + sub sp, sp, #64 + st1 {v8.1d-v11.1d}, [x8], #32 + st1 {v12.1d-v15.1d}, [x8] + + /* Create the coefficients vector */ + ld1 {v0.8h-v2.8h}, [x6], #48 + ld1 {v3.4h}, [x6], #8 + + movi v15.4s, #0x7f + + /* Load the frequently used immediate in a register */ + mov x6, #8 + +1: + /* Load the y base pointers in Qregs and post-increase the address by x6=#8 */ + ld1 {v9.8b-v11.8b}, [x1], x6 // y0 ( y - 2 ) + ld1 {v12.8b-v14.8b}, [x2], x6 // y0 ( y - 1 ) + + /* Signal memory for data that will be used in the loop after the next */ +// prfm PLDL1KEEP,[x1, x6] // TODO: test this +// prfm PLDL1KEEP,[x2, x6] // TODO: test this + + /* Promoting the 8bit channels to 16bit */ + uxtl v9.8h, v9.8b + uxtl v10.8h, v10.8b + uxtl v11.8h, v11.8b + uxtl v12.8h, v12.8b + uxtl v13.8h, v13.8b + uxtl v14.8h, v14.8b + +/* + v9, v9hi, v10lo, v10hi, v11lo, v11hi, + v12, v12hi +*/ + smull v4.4s, v9.4h, v0.h[0] + smull2 v5.4s, v9.8h, v0.h[0] + smlal2 v4.4s, v9.8h, v0.h[1] + smlal v5.4s, v10.4h, v0.h[1] + smlal v4.4s, v10.4h, v0.h[2] + smlal2 v5.4s, v10.8h, v0.h[2] + smlal2 v4.4s, v10.8h, v0.h[3] + smlal v5.4s, v11.4h, v0.h[3] + smlal v4.4s, v11.4h, v0.h[4] + smlal2 v5.4s, v11.8h, v0.h[4] + + smlal v4.4s, v12.4h, v0.h[5] + smlal2 v5.4s, v12.8h, v0.h[5] + smlal2 v4.4s, v12.8h, v0.h[6] + smlal v5.4s, v13.4h, v0.h[6] + smlal v4.4s, v13.4h, v0.h[7] + smlal2 v5.4s, v13.8h, v0.h[7] + smlal2 v4.4s, v13.8h, v1.h[0] + smlal v5.4s, v14.4h, v1.h[0] + smlal v4.4s, v14.4h, v1.h[1] + smlal2 v5.4s, v14.8h, v1.h[1] + + /* Next 2 rows */ + /* Load the y base pointers in Qregs and post-increase the address by x6=#8 */ + ld1 {v9.8b-v11.8b}, [x3], x6 // y0 ( y ) + ld1 {v12.8b-v14.8b}, [x4], x6 // y0 ( y + 1 ) + + /* Signal memory for data that will be used in the loop after the next */ +// prfm PLDL1KEEP,[x3, x6] // TODO: test this +// prfm PLDL1KEEP,[x4, x6] // TODO: test this + + /* Promoting the 8bit channels to 16bit */ + uxtl v9.8h, v9.8b + uxtl v10.8h, v10.8b + uxtl v11.8h, v11.8b + uxtl v12.8h, v12.8b + uxtl v13.8h, v13.8b + uxtl v14.8h, v14.8b + +/* + v9, v9hi, v10lo, v10hi, v11lo, v11hi, + v12, v12hi +*/ + smlal v4.4s, v9.4h, v1.h[2] + smlal2 v5.4s, v9.8h, v1.h[2] + smlal2 v4.4s, v9.8h, v1.h[3] + smlal v5.4s, v10.4h, v1.h[3] + smlal v4.4s, v10.4h, v1.h[4] + smlal2 v5.4s, v10.8h, v1.h[4] + smlal2 v4.4s, v10.8h, v1.h[5] + smlal v5.4s, v11.4h, v1.h[5] + smlal v4.4s, v11.4h, v1.h[6] + smlal2 v5.4s, v11.8h, v1.h[6] + + smlal v4.4s, v12.4h, v1.h[7] + smlal2 v5.4s, v12.8h, v1.h[7] + smlal2 v4.4s, v12.8h, v2.h[0] + smlal v5.4s, v13.4h, v2.h[0] + smlal v4.4s, v13.4h, v2.h[1] + smlal2 v5.4s, v13.8h, v2.h[1] + smlal2 v4.4s, v13.8h, v2.h[2] + smlal v5.4s, v14.4h, v2.h[2] + smlal v4.4s, v14.4h, v2.h[3] + smlal2 v5.4s, v14.8h, v2.h[3] + + /* Last row */ + /* Load the y base pointers in Qregs and post-increase the address by x6=#8 */ + ld1 {v9.8b- v11.8b}, [x5], x6 // y0 ( y + 2 ) + + /* Signal memory for data that will be used in the loop after the next */ +// prfm PLDL1KEEP,[x5, x6] // TODO: test this + + /* Promoting the 8bit channels to 16bit */ + uxtl v9.8h, v9.8b + uxtl v10.8h, v10.8b + uxtl v11.8h, v11.8b + +/* + v9, v9hi, v10lo, v10hi, v11lo, v11hi, + v12, v12hi +*/ + + smlal v4.4s, v9.4h, v2.h[4] + 
smlal2 v5.4s, v9.8h, v2.h[4] + smlal2 v4.4s, v9.8h, v2.h[5] + smlal v5.4s, v10.4h, v2.h[5] + smlal v4.4s, v10.4h, v2.h[6] + smlal2 v5.4s, v10.8h, v2.h[6] + smlal2 v4.4s, v10.8h, v2.h[7] + smlal v5.4s, v11.4h, v2.h[7] + smlal v4.4s, v11.4h, v3.h[0] + smlal2 v5.4s, v11.8h, v3.h[0] + + add v4.4s, v4.4s, v15.4s + add v5.4s, v5.4s, v15.4s + +/* Narrow it to a d-reg 32 -> 16 bit */ + rshrn v4.4h, v4.4s, #8 + rshrn2 v4.8h, v5.4s, #8 + + +/* Pack 16 -> 8 bit, saturate, put two pixels into D reg */ + sqxtun v4.8b, v4.8h + + st1 {v4.8b}, [x0], #8 // return the output and increase the address of x0 + + /* Are we done? */ + subs x7, x7, #1 + bne 1b + + /* Yup, bye */ + ld1 {v8.1d-v11.1d}, [sp], #32 + ld1 {v12.1d-v15.1d}, [sp], #32 + ret + +END(rsdIntrinsicConvolve5x5_K) diff --git a/renderscript-toolkit/src/main/cpp/Convolve_neon.S b/renderscript-toolkit/src/main/cpp/Convolve_neon.S new file mode 100644 index 0000000..ee10884 --- /dev/null +++ b/renderscript-toolkit/src/main/cpp/Convolve_neon.S @@ -0,0 +1,287 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + r0 = dst + r1 = y0 base pointer + r2 = y1 base pointer + r3 = y2 base pointer + sp = coeffs + sp = length / 2 +*/ + +#define ENTRY(f) .text; .align 0; .globl f; .type f,#function; f: .fnstart +#define END(f) .fnend; .size f, .-f; + +ENTRY(rsdIntrinsicConvolve3x3_K) + push {r4-r8, r10, r11, lr} + vpush {q4-q7} + + /* Get the coeffs pointer from the stack and load the + coefficients in the q0, q1 NEON registers */ + ldr r4, [sp, #32+64] + vld1.16 {q0, q1}, [r4] + + /* Get count from the stack */ + ldr r4, [sp, #36+64] + + /* Load the frequently used immediate in a register */ + mov r5, #8 + +1: + /* Load and post-increase the address by r5=#8 */ + vld1.8 {q13}, [r1], r5 + vld1.8 {q14}, [r2], r5 + vld1.8 {q15}, [r3], r5 + + /* Signal memory for data that will be used in the loop after the next */ + pld [r1, r5] + pld [r2, r5] + pld [r3, r5] + + vmovl.u8 q2, d26 + vmovl.u8 q3, d27 + vmovl.u8 q4, d28 + vmovl.u8 q5, d29 + vmovl.u8 q6, d30 + vmovl.u8 q7, d31 + +/* + The two pixel source array is + d4, d5, d6, d7 + d8, d9, d10, d11 + d12, d13, d14, d15 +*/ + + vmull.s16 q8, d4, d0[0] + vmlal.s16 q8, d5, d0[1] + vmlal.s16 q8, d6, d0[2] + vmlal.s16 q8, d8, d0[3] + vmlal.s16 q8, d9, d1[0] + vmlal.s16 q8, d10, d1[1] + vmlal.s16 q8, d12, d1[2] + vmlal.s16 q8, d13, d1[3] + vmlal.s16 q8, d14, d2[0] + + vmull.s16 q9, d5, d0[0] + vmlal.s16 q9, d6, d0[1] + vmlal.s16 q9, d7, d0[2] + vmlal.s16 q9, d9, d0[3] + vmlal.s16 q9, d10, d1[0] + vmlal.s16 q9, d11, d1[1] + vmlal.s16 q9, d13, d1[2] + vmlal.s16 q9, d14, d1[3] + vmlal.s16 q9, d15, d2[0] + + vshrn.i32 d16, q8, #8 + vshrn.i32 d17, q9, #8 + + vqmovun.s16 d16, q8 + vst1.8 d16, [r0]! + + /* Are we done yet? */ + subs r4, r4, #1 + bne 1b + + /* We're done, bye! 
*/ + vpop {q4-q7} + pop {r4-r8, r10, r11, lr} + bx lr +END(rsdIntrinsicConvolve3x3_K) + + +/* Convolve 5x5 */ + +/* + r0 = dst + r1 = y0 base pointer + r2 = y1 base pointer + r3 = y2 base pointer + r4 = y3 base pointer + r5 = y4 base pointer + r6 = coeffs + r7 = length +*/ +ENTRY(rsdIntrinsicConvolve5x5_K) + push {r4-r7, lr} + vpush {q4-q7} + + /* load y3 in r4 */ + ldr r4, [sp, #20 + 64] + + /* load y4 in r5 */ + ldr r5, [sp, #24 + 64] + + /* Load the coefficients pointer */ + ldr r6, [sp, #28 + 64] + + /* Create the coefficients vector */ + vld1.16 {d0, d1, d2, d3}, [r6]! + vld1.16 {d4, d5, d6}, [r6] + + vmov.u32 q15, #0x7f + + /* load the count */ + ldr r6, [sp, #32 + 64] + + /* Load the frequently used immediate in a register */ + mov r7, #8 + +1: + /* Load the y base pointers in Qregs and post-increase the address by r7=#8 */ + vld1.8 {d24, d25, d26}, [r1], r7 @ y0 ( y - 2 ) + vld1.8 {d27, d28, d29}, [r2], r7 @ y0 ( y - 1 ) + + /* Signal memory for data that will be used in the loop after the next */ + pld [r1, r7] + pld [r2, r7] + + /* Promoting the 8bit channels to 16bit */ + vmovl.u8 q9, d24 + vmovl.u8 q10, d25 + vmovl.u8 q11, d26 + vmovl.u8 q12, d27 + vmovl.u8 q13, d28 + vmovl.u8 q14, d29 + +/* + d18, d19, d20, d21, d22, d23, + d24, d25 +*/ + vmull.s16 q4, d18, d0[0] + vmlal.s16 q4, d19, d0[1] + vmlal.s16 q4, d20, d0[2] + vmlal.s16 q4, d21, d0[3] + vmlal.s16 q4, d22, d1[0] + + vmlal.s16 q4, d24, d1[1] + vmlal.s16 q4, d25, d1[2] + vmlal.s16 q4, d26, d1[3] + vmlal.s16 q4, d27, d2[0] + vmlal.s16 q4, d28, d2[1] + + vmull.s16 q5, d19, d0[0] + vmlal.s16 q5, d20, d0[1] + vmlal.s16 q5, d21, d0[2] + vmlal.s16 q5, d22, d0[3] + vmlal.s16 q5, d23, d1[0] + + vmlal.s16 q5, d25, d1[1] + vmlal.s16 q5, d26, d1[2] + vmlal.s16 q5, d27, d1[3] + vmlal.s16 q5, d28, d2[0] + vmlal.s16 q5, d29, d2[1] + + + /* Next 2 rows */ + /* Load the y base pointers in Qregs and post-increase the address by r7=#8 */ + vld1.8 {d24, d25, d26}, [r3], r7 @ y0 ( y ) + vld1.8 {d27, d28, d29}, [r4], r7 @ y0 ( y + 1 ) + + /* Signal memory for data that will be used in the loop after the next */ + pld [r3, r7] + pld [r4, r7] + + /* Promoting the 8bit channels to 16bit */ + vmovl.u8 q9, d24 + vmovl.u8 q10, d25 + vmovl.u8 q11, d26 + vmovl.u8 q12, d27 + vmovl.u8 q13, d28 + vmovl.u8 q14, d29 + +/* + d18, d19, d20, d21, d22, d23, + d24, d25 +*/ + vmlal.s16 q4, d18, d2[2] + vmlal.s16 q4, d19, d2[3] + vmlal.s16 q4, d20, d3[0] + vmlal.s16 q4, d21, d3[1] + vmlal.s16 q4, d22, d3[2] + + vmlal.s16 q4, d24, d3[3] + vmlal.s16 q4, d25, d4[0] + vmlal.s16 q4, d26, d4[1] + vmlal.s16 q4, d27, d4[2] + vmlal.s16 q4, d28, d4[3] + + vmlal.s16 q5, d19, d2[2] + vmlal.s16 q5, d20, d2[3] + vmlal.s16 q5, d21, d3[0] + vmlal.s16 q5, d22, d3[1] + vmlal.s16 q5, d23, d3[2] + + vmlal.s16 q5, d25, d3[3] + vmlal.s16 q5, d26, d4[0] + vmlal.s16 q5, d27, d4[1] + vmlal.s16 q5, d28, d4[2] + vmlal.s16 q5, d29, d4[3] + + /* Last row */ + /* Load the y base pointers in Qregs and post-increase the address by r7=#8 */ + vld1.8 {d24, d25, d26}, [r5], r7 @ y0 ( y + 2 ) + + /* Signal memory for data that will be used in the loop after the next */ + pld [r5, r7] + + /* Promoting the 8bit channels to 16bit */ + vmovl.u8 q9, d24 + vmovl.u8 q10, d25 + vmovl.u8 q11, d26 + +/* + d18, d19, d20, d21, d22, d23, + d24, d25 +*/ + + vmlal.s16 q4, d18, d5[0] + vmlal.s16 q4, d19, d5[1] + vmlal.s16 q4, d20, d5[2] + vmlal.s16 q4, d21, d5[3] + vmlal.s16 q4, d22, d6[0] + + vmlal.s16 q5, d19, d5[0] + vmlal.s16 q5, d20, d5[1] + vmlal.s16 q5, d21, d5[2] + vmlal.s16 q5, d22, d5[3] + vmlal.s16 q5, 
d23, d6[0] + + + + vadd.i32 q4, q4, q15 + vadd.i32 q5, q5, q15 + +/* Narrow it to a d-reg 32 -> 16 bit */ + vrshrn.i32 d8, q4, #8 + vrshrn.i32 d9, q5, #8 + + +/* Pack 16 -> 8 bit, saturate, put two pixels into D reg */ + vqmovun.s16 d8, q4 + + vst1.8 d8, [r0]! @ return the output and increase the address of r0 + + /* Are we done? */ + subs r6, r6, #1 + bne 1b + + /* Yup, bye */ + vpop {q4-q7} + pop {r4-r7, lr} + bx lr + +END(rsdIntrinsicConvolve5x5_K) diff --git a/renderscript-toolkit/src/main/cpp/Histogram.cpp b/renderscript-toolkit/src/main/cpp/Histogram.cpp new file mode 100644 index 0000000..9c7ea90 --- /dev/null +++ b/renderscript-toolkit/src/main/cpp/Histogram.cpp @@ -0,0 +1,297 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <array> +#include <cstdint> + +#include "RenderScriptToolkit.h" +#include "TaskProcessor.h" +#include "Utils.h" + +#define LOG_TAG "renderscript.toolkit.Histogram" + +namespace renderscript { + +class HistogramTask : public Task { + const uchar* mIn; + std::vector<int> mSums; + uint32_t mThreadCount; + + // Process a 2D tile of the overall work. threadIndex identifies which thread does the work. + void processData(int threadIndex, size_t startX, size_t startY, size_t endX, + size_t endY) override; + + void kernelP1U4(const uchar* in, int* sums, uint32_t xstart, uint32_t xend); + void kernelP1U3(const uchar* in, int* sums, uint32_t xstart, uint32_t xend); + void kernelP1U2(const uchar* in, int* sums, uint32_t xstart, uint32_t xend); + void kernelP1U1(const uchar* in, int* sums, uint32_t xstart, uint32_t xend); + + public: + HistogramTask(const uint8_t* in, size_t sizeX, size_t sizeY, size_t vectorSize, + uint32_t threadCount, const Restriction* restriction); + void collateSums(int* out); +}; + +class HistogramDotTask : public Task { + const uchar* mIn; + float mDot[4]; + int mDotI[4]; + std::vector<int> mSums; + uint32_t mThreadCount; + + void kernelP1L4(const uchar* in, int* sums, uint32_t xstart, uint32_t xend); + void kernelP1L3(const uchar* in, int* sums, uint32_t xstart, uint32_t xend); + void kernelP1L2(const uchar* in, int* sums, uint32_t xstart, uint32_t xend); + void kernelP1L1(const uchar* in, int* sums, uint32_t xstart, uint32_t xend); + + public: + HistogramDotTask(const uint8_t* in, size_t sizeX, size_t sizeY, size_t vectorSize, + uint32_t threadCount, const float* coefficients, + const Restriction* restriction); + void collateSums(int* out); + + void processData(int threadIndex, size_t startX, size_t startY, size_t endX, + size_t endY) override; +}; + +HistogramTask::HistogramTask(const uchar* in, size_t sizeX, size_t sizeY, size_t vectorSize, + uint32_t threadCount, const Restriction* restriction) + : Task{sizeX, sizeY, vectorSize, true, restriction}, + mIn{in}, + mSums(256 * paddedSize(vectorSize) * threadCount) { + mThreadCount = threadCount; +} + +void HistogramTask::processData(int threadIndex, size_t startX, size_t startY, size_t endX, + size_t endY) { + 
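    // Each thread accumulates into its own 256 * paddedSize(vectorSize) slice of mSums; collateSums() later adds the per-thread slices into the caller's output.
+    // For vectorSize 4 the bins are interleaved per channel, i.e. kernelP1U4 increments sums[(value << 2) + channel].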
typedef void (HistogramTask::*KernelFunction)(const uchar*, int*, uint32_t, uint32_t); + + KernelFunction kernel; + switch (mVectorSize) { + case 4: + kernel = &HistogramTask::kernelP1U4; + break; + case 3: + kernel = &HistogramTask::kernelP1U3; + break; + case 2: + kernel = &HistogramTask::kernelP1U2; + break; + case 1: + kernel = &HistogramTask::kernelP1U1; + break; + default: + ALOGE("Bad vector size %zd", mVectorSize); + return; + } + + int* sums = &mSums[256 * paddedSize(mVectorSize) * threadIndex]; + + for (size_t y = startY; y < endY; y++) { + const uchar* inPtr = mIn + (mSizeX * y + startX) * paddedSize(mVectorSize); + std::invoke(kernel, this, inPtr, sums, startX, endX); + } +} + +void HistogramTask::kernelP1U4(const uchar* in, int* sums, uint32_t xstart, uint32_t xend) { + for (uint32_t x = xstart; x < xend; x++) { + sums[(in[0] << 2)]++; + sums[(in[1] << 2) + 1]++; + sums[(in[2] << 2) + 2]++; + sums[(in[3] << 2) + 3]++; + in += 4; + } +} + +void HistogramTask::kernelP1U3(const uchar* in, int* sums, uint32_t xstart, uint32_t xend) { + for (uint32_t x = xstart; x < xend; x++) { + sums[(in[0] << 2)]++; + sums[(in[1] << 2) + 1]++; + sums[(in[2] << 2) + 2]++; + in += 4; + } +} + +void HistogramTask::kernelP1U2(const uchar* in, int* sums, uint32_t xstart, uint32_t xend) { + for (uint32_t x = xstart; x < xend; x++) { + sums[(in[0] << 1)]++; + sums[(in[1] << 1) + 1]++; + in += 2; + } +} + +void HistogramTask::kernelP1U1(const uchar* in, int* sums, uint32_t xstart, uint32_t xend) { + for (uint32_t x = xstart; x < xend; x++) { + sums[in[0]]++; + in++; + } +} + +void HistogramTask::collateSums(int* out) { + for (uint32_t ct = 0; ct < (256 * paddedSize(mVectorSize)); ct++) { + out[ct] = mSums[ct]; + for (uint32_t t = 1; t < mThreadCount; t++) { + out[ct] += mSums[ct + (256 * paddedSize(mVectorSize) * t)]; + } + } +} + +HistogramDotTask::HistogramDotTask(const uchar* in, size_t sizeX, size_t sizeY, size_t vectorSize, + uint32_t threadCount, const float* coefficients, + const Restriction* restriction) + : Task{sizeX, sizeY, vectorSize, true, restriction}, mIn{in}, mSums(256 * threadCount, 0) { + mThreadCount = threadCount; + + if (coefficients == nullptr) { + mDot[0] = 0.299f; + mDot[1] = 0.587f; + mDot[2] = 0.114f; + mDot[3] = 0; + } else { + memcpy(mDot, coefficients, 16); + } + mDotI[0] = (int)((mDot[0] * 256.f) + 0.5f); + mDotI[1] = (int)((mDot[1] * 256.f) + 0.5f); + mDotI[2] = (int)((mDot[2] * 256.f) + 0.5f); + mDotI[3] = (int)((mDot[3] * 256.f) + 0.5f); +} + +void HistogramDotTask::processData(int threadIndex, size_t startX, size_t startY, size_t endX, + size_t endY) { + typedef void (HistogramDotTask::*KernelFunction)(const uchar*, int*, uint32_t, uint32_t); + + KernelFunction kernel; + switch (mVectorSize) { + case 4: + kernel = &HistogramDotTask::kernelP1L4; + break; + case 3: + kernel = &HistogramDotTask::kernelP1L3; + break; + case 2: + kernel = &HistogramDotTask::kernelP1L2; + break; + case 1: + kernel = &HistogramDotTask::kernelP1L1; + break; + default: + ALOGI("Bad vector size %zd", mVectorSize); + return; + } + + int* sums = &mSums[256 * threadIndex]; + + for (size_t y = startY; y < endY; y++) { + const uchar* inPtr = mIn + (mSizeX * y + startX) * paddedSize(mVectorSize); + std::invoke(kernel, this, inPtr, sums, startX, endX); + } +} + +void HistogramDotTask::kernelP1L4(const uchar* in, int* sums, uint32_t xstart, uint32_t xend) { + for (uint32_t x = xstart; x < xend; x++) { + int t = (mDotI[0] * in[0]) + (mDotI[1] * in[1]) + (mDotI[2] * in[2]) + (mDotI[3] * in[3]); + sums[(t + 
0x7f) >> 8]++; + in += 4; + } +} + +void HistogramDotTask::kernelP1L3(const uchar* in, int* sums, uint32_t xstart, uint32_t xend) { + for (uint32_t x = xstart; x < xend; x++) { + int t = (mDotI[0] * in[0]) + (mDotI[1] * in[1]) + (mDotI[2] * in[2]); + sums[(t + 0x7f) >> 8]++; + in += 4; + } +} + +void HistogramDotTask::kernelP1L2(const uchar* in, int* sums, uint32_t xstart, uint32_t xend) { + for (uint32_t x = xstart; x < xend; x++) { + int t = (mDotI[0] * in[0]) + (mDotI[1] * in[1]); + sums[(t + 0x7f) >> 8]++; + in += 2; + } +} + +void HistogramDotTask::kernelP1L1(const uchar* in, int* sums, uint32_t xstart, uint32_t xend) { + for (uint32_t x = xstart; x < xend; x++) { + int t = (mDotI[0] * in[0]); + sums[(t + 0x7f) >> 8]++; + in++; + } +} + +void HistogramDotTask::collateSums(int* out) { + for (uint32_t ct = 0; ct < 256; ct++) { + out[ct] = mSums[ct]; + for (uint32_t t = 1; t < mThreadCount; t++) { + out[ct] += mSums[ct + (256 * t)]; + } + } +} + +//////////////////////////////////////////////////////////////////////////// + +void RenderScriptToolkit::histogram(const uint8_t* in, int32_t* out, size_t sizeX, size_t sizeY, + size_t vectorSize, const Restriction* restriction) { +#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE + if (!validRestriction(LOG_TAG, sizeX, sizeY, restriction)) { + return; + } + if (vectorSize < 1 || vectorSize > 4) { + ALOGE("The vectorSize should be between 1 and 4. %zu provided.", vectorSize); + return; + } +#endif + + HistogramTask task(in, sizeX, sizeY, vectorSize, processor->getNumberOfThreads(), restriction); + processor->doTask(&task); + task.collateSums(out); +} + +void RenderScriptToolkit::histogramDot(const uint8_t* in, int32_t* out, size_t sizeX, size_t sizeY, + size_t vectorSize, const float* coefficients, + const Restriction* restriction) { +#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE + if (!validRestriction(LOG_TAG, sizeX, sizeY, restriction)) { + return; + } + if (vectorSize < 1 || vectorSize > 4) { + ALOGE("The vectorSize should be between 1 and 4. %zu provided.", vectorSize); + return; + } + if (coefficients != nullptr) { + float sum = 0.0f; + for (size_t i = 0; i < vectorSize; i++) { + if (coefficients[i] < 0.0f) { + ALOGE("histogramDot coefficients should not be negative. Coefficient %zu was %f.", + i, coefficients[i]); + return; + } + sum += coefficients[i]; + } + if (sum > 1.0f) { + ALOGE("histogramDot coefficients should add to 1 or less. Their sum is %f.", sum); + return; + } + } +#endif + + HistogramDotTask task(in, sizeX, sizeY, vectorSize, processor->getNumberOfThreads(), + coefficients, restriction); + processor->doTask(&task); + task.collateSums(out); +} + +} // namespace renderscript diff --git a/renderscript-toolkit/src/main/cpp/JniEntryPoints.cpp b/renderscript-toolkit/src/main/cpp/JniEntryPoints.cpp new file mode 100644 index 0000000..185c752 --- /dev/null +++ b/renderscript-toolkit/src/main/cpp/JniEntryPoints.cpp @@ -0,0 +1,480 @@ +/* + * Copyright (C) 2021 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <android/bitmap.h> +#include <cassert> +#include <jni.h> + +#include "RenderScriptToolkit.h" +#include "Utils.h" + +#define LOG_TAG "renderscript.toolkit.JniEntryPoints" + +using namespace renderscript; + +/** + * I compared using env->GetPrimitiveArrayCritical vs. env->GetByteArrayElements to get access + * to the underlying data. On Pixel 4, it's actually faster to not use critical. The code is left + * here if you want to experiment. Note that USE_CRITICAL could block the garbage collector. + */ +// #define USE_CRITICAL + +class ByteArrayGuard { + private: + JNIEnv* env; + jbyteArray array; + jbyte* data; + + public: + ByteArrayGuard(JNIEnv* env, jbyteArray array) : env{env}, array{array} { +#ifdef USE_CRITICAL + data = reinterpret_cast<jbyte*>(env->GetPrimitiveArrayCritical(array, nullptr)); +#else + data = env->GetByteArrayElements(array, nullptr); +#endif + } + ~ByteArrayGuard() { +#ifdef USE_CRITICAL + env->ReleasePrimitiveArrayCritical(array, data, 0); +#else + env->ReleaseByteArrayElements(array, data, 0); +#endif + } + uint8_t* get() { return reinterpret_cast<uint8_t*>(data); } +}; + +class IntArrayGuard { + private: + JNIEnv* env; + jintArray array; + jint* data; + + public: + IntArrayGuard(JNIEnv* env, jintArray array) : env{env}, array{array} { +#ifdef USE_CRITICAL + data = reinterpret_cast<jint*>(env->GetPrimitiveArrayCritical(array, nullptr)); +#else + data = env->GetIntArrayElements(array, nullptr); +#endif + } + ~IntArrayGuard() { +#ifdef USE_CRITICAL + env->ReleasePrimitiveArrayCritical(array, data, 0); +#else + env->ReleaseIntArrayElements(array, data, 0); +#endif + } + int* get() { return reinterpret_cast<int*>(data); } +}; + +class FloatArrayGuard { + private: + JNIEnv* env; + jfloatArray array; + jfloat* data; + + public: + FloatArrayGuard(JNIEnv* env, jfloatArray array) : env{env}, array{array} { +#ifdef USE_CRITICAL + data = reinterpret_cast<jfloat*>(env->GetPrimitiveArrayCritical(array, nullptr)); +#else + data = env->GetFloatArrayElements(array, nullptr); +#endif + } + ~FloatArrayGuard() { +#ifdef USE_CRITICAL + env->ReleasePrimitiveArrayCritical(array, data, 0); +#else + env->ReleaseFloatArrayElements(array, data, 0); +#endif + } + float* get() { return reinterpret_cast<float*>(data); } +}; + +class BitmapGuard { + private: + JNIEnv* env; + jobject bitmap; + AndroidBitmapInfo info; + int bytesPerPixel; + void* bytes; + bool valid; + + public: + BitmapGuard(JNIEnv* env, jobject jBitmap) : env{env}, bitmap{jBitmap}, bytes{nullptr} { + valid = false; + if (AndroidBitmap_getInfo(env, bitmap, &info) != ANDROID_BITMAP_RESULT_SUCCESS) { + ALOGE("AndroidBitmap_getInfo failed"); + return; + } + if (info.format != ANDROID_BITMAP_FORMAT_RGBA_8888 && + info.format != ANDROID_BITMAP_FORMAT_A_8) { + ALOGE("AndroidBitmap in the wrong format"); + return; + } + bytesPerPixel = info.stride / info.width; + if (bytesPerPixel != 1 && bytesPerPixel != 4) { + ALOGE("Expected a vector size of 1 or 4. Got %d. 
Extra padding per line not currently " + "supported", + bytesPerPixel); + return; + } + if (AndroidBitmap_lockPixels(env, bitmap, &bytes) != ANDROID_BITMAP_RESULT_SUCCESS) { + ALOGE("AndroidBitmap_lockPixels failed"); + return; + } + valid = true; + } + ~BitmapGuard() { + if (valid) { + AndroidBitmap_unlockPixels(env, bitmap); + } + } + uint8_t* get() const { + assert(valid); + return reinterpret_cast<uint8_t*>(bytes); + } + int width() const { return info.width; } + int height() const { return info.height; } + int vectorSize() const { return bytesPerPixel; } +}; + +/** + * Copies the content of Kotlin Range2d object into the equivalent C++ struct. + */ +class RestrictionParameter { + private: + bool isNull; + Restriction restriction; + + public: + RestrictionParameter(JNIEnv* env, jobject jRestriction) : isNull{jRestriction == nullptr} { + if (isNull) { + return; + } + /* TODO Measure how long FindClass and related functions take. Consider passing the + * four values instead. This would also require setting the default when Range2D is null. + */ + jclass restrictionClass = env->FindClass("com/google/android/renderscript/Range2d"); + if (restrictionClass == nullptr) { + ALOGE("RenderScriptToolit. Internal error. Could not find the Kotlin Range2d class."); + isNull = true; + return; + } + jfieldID startXId = env->GetFieldID(restrictionClass, "startX", "I"); + jfieldID startYId = env->GetFieldID(restrictionClass, "startY", "I"); + jfieldID endXId = env->GetFieldID(restrictionClass, "endX", "I"); + jfieldID endYId = env->GetFieldID(restrictionClass, "endY", "I"); + restriction.startX = env->GetIntField(jRestriction, startXId); + restriction.startY = env->GetIntField(jRestriction, startYId); + restriction.endX = env->GetIntField(jRestriction, endXId); + restriction.endY = env->GetIntField(jRestriction, endYId); + } + Restriction* get() { return isNull ? 
nullptr : &restriction; } +}; + +extern "C" JNIEXPORT jlong JNICALL +Java_com_google_android_renderscript_Toolkit_createNative(JNIEnv* /*env*/, jobject /*thiz*/) { + return reinterpret_cast<jlong>(new RenderScriptToolkit()); +} + +extern "C" JNIEXPORT void JNICALL Java_com_google_android_renderscript_Toolkit_destroyNative( + JNIEnv* /*env*/, jobject /*thiz*/, jlong native_handle) { + RenderScriptToolkit* toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle); + delete toolkit; +} + +extern "C" JNIEXPORT void JNICALL Java_com_google_android_renderscript_Toolkit_nativeBlend( + JNIEnv* env, jobject /*thiz*/, jlong native_handle, jint jmode, jbyteArray source_array, + jbyteArray dest_array, jint size_x, jint size_y, jobject restriction) { + auto toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle); + auto mode = static_cast<RenderScriptToolkit::BlendingMode>(jmode); + RestrictionParameter restrict {env, restriction}; + ByteArrayGuard source{env, source_array}; + ByteArrayGuard dest{env, dest_array}; + + toolkit->blend(mode, source.get(), dest.get(), size_x, size_y, restrict.get()); +} + +extern "C" JNIEXPORT void JNICALL Java_com_google_android_renderscript_Toolkit_nativeBlendBitmap( + JNIEnv* env, jobject /*thiz*/, jlong native_handle, jint jmode, jobject source_bitmap, + jobject dest_bitmap, jobject restriction) { + auto toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle); + auto mode = static_cast<RenderScriptToolkit::BlendingMode>(jmode); + RestrictionParameter restrict {env, restriction}; + BitmapGuard source{env, source_bitmap}; + BitmapGuard dest{env, dest_bitmap}; + + toolkit->blend(mode, source.get(), dest.get(), source.width(), source.height(), restrict.get()); +} + +extern "C" JNIEXPORT void JNICALL Java_com_google_android_renderscript_Toolkit_nativeBlur( + JNIEnv* env, jobject /*thiz*/, jlong native_handle, jbyteArray input_array, jint vectorSize, + jint size_x, jint size_y, jint radius, jbyteArray output_array, jobject restriction) { + RenderScriptToolkit* toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle); + RestrictionParameter restrict {env, restriction}; + ByteArrayGuard input{env, input_array}; + ByteArrayGuard output{env, output_array}; + + toolkit->blur(input.get(), output.get(), size_x, size_y, vectorSize, radius, restrict.get()); +} + +extern "C" JNIEXPORT void JNICALL Java_com_google_android_renderscript_Toolkit_nativeBlurBitmap( + JNIEnv* env, jobject /*thiz*/, jlong native_handle, jobject input_bitmap, + jobject output_bitmap, jint radius, jobject restriction) { + RenderScriptToolkit* toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle); + RestrictionParameter restrict {env, restriction}; + BitmapGuard input{env, input_bitmap}; + BitmapGuard output{env, output_bitmap}; + + toolkit->blur(input.get(), output.get(), input.width(), input.height(), input.vectorSize(), + radius, restrict.get()); +} + +extern "C" JNIEXPORT void JNICALL Java_com_google_android_renderscript_Toolkit_nativeColorMatrix( + JNIEnv* env, jobject /*thiz*/, jlong native_handle, jbyteArray input_array, + jint input_vector_size, jint size_x, jint size_y, jbyteArray output_array, + jint output_vector_size, jfloatArray jmatrix, jfloatArray add_vector, jobject restriction) { + RenderScriptToolkit* toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle); + RestrictionParameter restrict {env, restriction}; + ByteArrayGuard input{env, input_array}; + ByteArrayGuard output{env, output_array}; + FloatArrayGuard matrix{env, jmatrix}; + 
FloatArrayGuard add{env, add_vector}; + + toolkit->colorMatrix(input.get(), output.get(), input_vector_size, output_vector_size, size_x, + size_y, matrix.get(), add.get(), restrict.get()); +} + +extern "C" JNIEXPORT void JNICALL Java_com_google_android_renderscript_Toolkit_nativeColorMatrixBitmap( + JNIEnv* env, jobject /*thiz*/, jlong native_handle, jobject input_bitmap, + jobject output_bitmap, jfloatArray jmatrix, jfloatArray add_vector, jobject restriction) { + RenderScriptToolkit* toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle); + RestrictionParameter restrict {env, restriction}; + BitmapGuard input{env, input_bitmap}; + BitmapGuard output{env, output_bitmap}; + FloatArrayGuard matrix{env, jmatrix}; + FloatArrayGuard add{env, add_vector}; + + toolkit->colorMatrix(input.get(), output.get(), input.vectorSize(), output.vectorSize(), + input.width(), input.height(), matrix.get(), add.get(), restrict.get()); +} + +extern "C" JNIEXPORT void JNICALL Java_com_google_android_renderscript_Toolkit_nativeConvolve( + JNIEnv* env, jobject /*thiz*/, jlong native_handle, jbyteArray input_array, jint vectorSize, + jint size_x, jint size_y, jbyteArray output_array, jfloatArray coefficients, + jobject restriction) { + RenderScriptToolkit* toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle); + RestrictionParameter restrict {env, restriction}; + ByteArrayGuard input{env, input_array}; + ByteArrayGuard output{env, output_array}; + FloatArrayGuard coeffs{env, coefficients}; + + switch (env->GetArrayLength(coefficients)) { + case 9: + toolkit->convolve3x3(input.get(), output.get(), vectorSize, size_x, size_y, + coeffs.get(), restrict.get()); + break; + case 25: + toolkit->convolve5x5(input.get(), output.get(), vectorSize, size_x, size_y, + coeffs.get(), restrict.get()); + break; + } +} + +extern "C" JNIEXPORT void JNICALL Java_com_google_android_renderscript_Toolkit_nativeConvolveBitmap( + JNIEnv* env, jobject /*thiz*/, jlong native_handle, jobject input_bitmap, + jobject output_bitmap, jfloatArray coefficients, jobject restriction) { + RenderScriptToolkit* toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle); + RestrictionParameter restrict {env, restriction}; + BitmapGuard input{env, input_bitmap}; + BitmapGuard output{env, output_bitmap}; + FloatArrayGuard coeffs{env, coefficients}; + + switch (env->GetArrayLength(coefficients)) { + case 9: + toolkit->convolve3x3(input.get(), output.get(), input.vectorSize(), input.width(), + input.height(), coeffs.get(), restrict.get()); + break; + case 25: + toolkit->convolve5x5(input.get(), output.get(), input.vectorSize(), input.width(), + input.height(), coeffs.get(), restrict.get()); + break; + } +} + +extern "C" JNIEXPORT void JNICALL Java_com_google_android_renderscript_Toolkit_nativeHistogram( + JNIEnv* env, jobject /*thiz*/, jlong native_handle, jbyteArray input_array, + jint vector_size, jint size_x, jint size_y, jintArray output_array, jobject restriction) { + RenderScriptToolkit* toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle); + RestrictionParameter restrict {env, restriction}; + ByteArrayGuard input{env, input_array}; + IntArrayGuard output{env, output_array}; + + toolkit->histogram(input.get(), output.get(), size_x, size_y, vector_size, restrict.get()); +} + +extern "C" JNIEXPORT void JNICALL Java_com_google_android_renderscript_Toolkit_nativeHistogramBitmap( + JNIEnv* env, jobject /*thiz*/, jlong native_handle, jobject input_bitmap, + jintArray output_array, jobject restriction) { + 
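    // The Bitmap variant takes the dimensions and vector size from the Bitmap itself instead of explicit size parameters.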
RenderScriptToolkit* toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle); + RestrictionParameter restrict {env, restriction}; + BitmapGuard input{env, input_bitmap}; + IntArrayGuard output{env, output_array}; + + toolkit->histogram(input.get(), output.get(), input.width(), input.height(), input.vectorSize(), + restrict.get()); +} + +extern "C" JNIEXPORT void JNICALL Java_com_google_android_renderscript_Toolkit_nativeHistogramDot( + JNIEnv* env, jobject /*thiz*/, jlong native_handle, jbyteArray input_array, + jint vector_size, jint size_x, jint size_y, jintArray output_array, + jfloatArray coefficients, jobject restriction) { + RenderScriptToolkit* toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle); + RestrictionParameter restrict {env, restriction}; + ByteArrayGuard input{env, input_array}; + IntArrayGuard output{env, output_array}; + FloatArrayGuard coeffs{env, coefficients}; + + toolkit->histogramDot(input.get(), output.get(), size_x, size_y, vector_size, coeffs.get(), + restrict.get()); +} + +extern "C" JNIEXPORT +void JNICALL Java_com_google_android_renderscript_Toolkit_nativeHistogramDotBitmap( + JNIEnv* env, jobject /*thiz*/, jlong native_handle, jobject input_bitmap, + jintArray output_array, jfloatArray coefficients, jobject restriction) { + RenderScriptToolkit* toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle); + RestrictionParameter restrict {env, restriction}; + BitmapGuard input{env, input_bitmap}; + IntArrayGuard output{env, output_array}; + FloatArrayGuard coeffs{env, coefficients}; + + toolkit->histogramDot(input.get(), output.get(), input.width(), input.height(), + input.vectorSize(), coeffs.get(), restrict.get()); +} + +extern "C" JNIEXPORT void JNICALL Java_com_google_android_renderscript_Toolkit_nativeLut( + JNIEnv* env, jobject /*thiz*/, jlong native_handle, jbyteArray input_array, + jbyteArray output_array, jint size_x, jint size_y, jbyteArray red_table, + jbyteArray green_table, jbyteArray blue_table, jbyteArray alpha_table, + jobject restriction) { + RenderScriptToolkit* toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle); + RestrictionParameter restrict {env, restriction}; + + ByteArrayGuard input{env, input_array}; + ByteArrayGuard output{env, output_array}; + ByteArrayGuard red{env, red_table}; + ByteArrayGuard green{env, green_table}; + ByteArrayGuard blue{env, blue_table}; + ByteArrayGuard alpha{env, alpha_table}; + + toolkit->lut(input.get(), output.get(), size_x, size_y, red.get(), green.get(), blue.get(), + alpha.get(), restrict.get()); +} + +extern "C" JNIEXPORT void JNICALL Java_com_google_android_renderscript_Toolkit_nativeLutBitmap( + JNIEnv* env, jobject /*thiz*/, jlong native_handle, jobject input_bitmap, + jobject output_bitmap, jbyteArray red_table, jbyteArray green_table, jbyteArray blue_table, + jbyteArray alpha_table, jobject restriction) { + RenderScriptToolkit* toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle); + RestrictionParameter restrict {env, restriction}; + + BitmapGuard input{env, input_bitmap}; + BitmapGuard output{env, output_bitmap}; + ByteArrayGuard red{env, red_table}; + ByteArrayGuard green{env, green_table}; + ByteArrayGuard blue{env, blue_table}; + ByteArrayGuard alpha{env, alpha_table}; + + toolkit->lut(input.get(), output.get(), input.width(), input.height(), red.get(), green.get(), + blue.get(), alpha.get(), restrict.get()); +} + +extern "C" JNIEXPORT void JNICALL Java_com_google_android_renderscript_Toolkit_nativeLut3d( + JNIEnv* env, jobject /*thiz*/, jlong 
native_handle, jbyteArray input_array, + jbyteArray output_array, jint size_x, jint size_y, jbyteArray cube_values, jint cubeSizeX, + jint cubeSizeY, jint cubeSizeZ, jobject restriction) { + RenderScriptToolkit* toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle); + RestrictionParameter restrict {env, restriction}; + ByteArrayGuard input{env, input_array}; + ByteArrayGuard output{env, output_array}; + ByteArrayGuard cube{env, cube_values}; + + toolkit->lut3d(input.get(), output.get(), size_x, size_y, cube.get(), cubeSizeX, cubeSizeY, + cubeSizeZ, restrict.get()); +} + +extern "C" JNIEXPORT void JNICALL Java_com_google_android_renderscript_Toolkit_nativeLut3dBitmap( + JNIEnv* env, jobject /*thiz*/, jlong native_handle, jobject input_bitmap, + jobject output_bitmap, jbyteArray cube_values, jint cubeSizeX, jint cubeSizeY, + jint cubeSizeZ, jobject restriction) { + RenderScriptToolkit* toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle); + RestrictionParameter restrict {env, restriction}; + BitmapGuard input{env, input_bitmap}; + BitmapGuard output{env, output_bitmap}; + ByteArrayGuard cube{env, cube_values}; + + toolkit->lut3d(input.get(), output.get(), input.width(), input.height(), cube.get(), cubeSizeX, + cubeSizeY, cubeSizeZ, restrict.get()); +} + +extern "C" JNIEXPORT void JNICALL Java_com_google_android_renderscript_Toolkit_nativeResize( + JNIEnv* env, jobject /*thiz*/, jlong native_handle, jbyteArray input_array, + jint vector_size, jint input_size_x, jint input_size_y, jbyteArray output_array, + jint output_size_x, jint output_size_y, jobject restriction) { + RenderScriptToolkit* toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle); + RestrictionParameter restrict {env, restriction}; + ByteArrayGuard input{env, input_array}; + ByteArrayGuard output{env, output_array}; + + toolkit->resize(input.get(), output.get(), input_size_x, input_size_y, vector_size, + output_size_x, output_size_y, restrict.get()); +} + +extern "C" JNIEXPORT void JNICALL Java_com_google_android_renderscript_Toolkit_nativeResizeBitmap( + JNIEnv* env, jobject /*thiz*/, jlong native_handle, jobject input_bitmap, + jobject output_bitmap, jobject restriction) { + RenderScriptToolkit* toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle); + RestrictionParameter restrict {env, restriction}; + BitmapGuard input{env, input_bitmap}; + BitmapGuard output{env, output_bitmap}; + + toolkit->resize(input.get(), output.get(), input.width(), input.height(), input.vectorSize(), + output.width(), output.height(), restrict.get()); +} + +extern "C" JNIEXPORT void JNICALL Java_com_google_android_renderscript_Toolkit_nativeYuvToRgb( + JNIEnv* env, jobject /*thiz*/, jlong native_handle, jbyteArray input_array, + jbyteArray output_array, jint size_x, jint size_y, jint format) { + RenderScriptToolkit* toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle); + ByteArrayGuard input{env, input_array}; + ByteArrayGuard output{env, output_array}; + + toolkit->yuvToRgb(input.get(), output.get(), size_x, size_y, + static_cast<RenderScriptToolkit::YuvFormat>(format)); +} + +extern "C" JNIEXPORT void JNICALL Java_com_google_android_renderscript_Toolkit_nativeYuvToRgbBitmap( + JNIEnv* env, jobject /*thiz*/, jlong native_handle, jbyteArray input_array, jint size_x, + jint size_y, jobject output_bitmap, jint format) { + RenderScriptToolkit* toolkit = reinterpret_cast<RenderScriptToolkit*>(native_handle); + BitmapGuard output{env, output_bitmap}; + ByteArrayGuard input{env, input_array}; + + 
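+    // The YUV source stays a plain byte array; the RGBA result is written into what is
+    // presumably the destination Bitmap's pixel buffer, with the dimensions taken from
+    // size_x and size_y rather than from the Bitmap.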
toolkit->yuvToRgb(input.get(), output.get(), size_x, size_y, + static_cast<RenderScriptToolkit::YuvFormat>(format)); +} diff --git a/renderscript-toolkit/src/main/cpp/Lut.cpp b/renderscript-toolkit/src/main/cpp/Lut.cpp new file mode 100644 index 0000000..f064d29 --- /dev/null +++ b/renderscript-toolkit/src/main/cpp/Lut.cpp @@ -0,0 +1,80 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <cstdint> + +#include "RenderScriptToolkit.h" +#include "TaskProcessor.h" +#include "Utils.h" + +#define LOG_TAG "renderscript.toolkit.Lut" + +namespace renderscript { + +class LutTask : public Task { + const uchar4* mIn; + uchar4* mOut; + const uchar* mRedTable; + const uchar* mGreenTable; + const uchar* mBlueTable; + const uchar* mAlphaTable; + + // Process a 2D tile of the overall work. threadIndex identifies which thread does the work. + void processData(int threadIndex, size_t startX, size_t startY, size_t endX, + size_t endY) override; + + public: + LutTask(const uint8_t* input, uint8_t* output, size_t sizeX, size_t sizeY, const uint8_t* red, + const uint8_t* green, const uint8_t* blue, const uint8_t* alpha, + const Restriction* restriction) + : Task{sizeX, sizeY, 4, true, restriction}, + mIn{reinterpret_cast<const uchar4*>(input)}, + mOut{reinterpret_cast<uchar4*>(output)}, + mRedTable{red}, + mGreenTable{green}, + mBlueTable{blue}, + mAlphaTable{alpha} {} +}; + +void LutTask::processData(int /* threadIndex */, size_t startX, size_t startY, size_t endX, + size_t endY) { + for (size_t y = startY; y < endY; y++) { + size_t offset = mSizeX * y + startX; + const uchar4* in = mIn + offset; + uchar4* out = mOut + offset; + for (size_t x = startX; x < endX; x++) { + auto v = *in; + *out = uchar4{mRedTable[v.x], mGreenTable[v.y], mBlueTable[v.z], mAlphaTable[v.w]}; + in++; + out++; + } + } +} + +void RenderScriptToolkit::lut(const uint8_t* input, uint8_t* output, size_t sizeX, size_t sizeY, + const uint8_t* red, const uint8_t* green, const uint8_t* blue, + const uint8_t* alpha, const Restriction* restriction) { +#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE + if (!validRestriction(LOG_TAG, sizeX, sizeY, restriction)) { + return; + } +#endif + + LutTask task(input, output, sizeX, sizeY, red, green, blue, alpha, restriction); + processor->doTask(&task); +} + +} // namespace renderscript diff --git a/renderscript-toolkit/src/main/cpp/Lut3d.cpp b/renderscript-toolkit/src/main/cpp/Lut3d.cpp new file mode 100644 index 0000000..8c950e0 --- /dev/null +++ b/renderscript-toolkit/src/main/cpp/Lut3d.cpp @@ -0,0 +1,179 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <cstdint> + +#include "RenderScriptToolkit.h" +#include "TaskProcessor.h" +#include "Utils.h" + +namespace renderscript { + +#define LOG_TAG "renderscript.toolkit.Lut3d" + +/** + * Converts a RGBA buffer using a 3D cube. + */ +class Lut3dTask : public Task { + // The input array we're transforming. + const uchar4* mIn; + // Where we'll store the transformed result. + uchar4* mOut; + // The size of each of the three cube dimensions. We don't make use of the last value. + int4 mCubeDimension; + // The translation cube, in row major format. + const uchar* mCubeTable; + + /** + * Converts a subset of a line of the 2D buffer. + * + * @param in The start of the data to transform. + * @param out Where to store the result. + * @param length The number of 4-byte vectors to transform. + */ + void kernel(const uchar4* in, uchar4* out, uint32_t length); + + // Process a 2D tile of the overall work. threadIndex identifies which thread does the work. + void processData(int threadIndex, size_t startX, size_t startY, size_t endX, + size_t endY) override; + + public: + Lut3dTask(const uint8_t* input, uint8_t* output, size_t sizeX, size_t sizeY, + const uint8_t* cube, int cubeSizeX, int cubeSizeY, int cubeSizeZ, + const Restriction* restriction) + : Task{sizeX, sizeY, 4, true, restriction}, + mIn{reinterpret_cast<const uchar4*>(input)}, + mOut{reinterpret_cast<uchar4*>(output)}, + mCubeDimension{cubeSizeX, cubeSizeY, cubeSizeZ, 0}, + mCubeTable{cube} {} +}; + +extern "C" void rsdIntrinsic3DLUT_K(void* dst, void const* in, size_t count, void const* lut, + int32_t pitchy, int32_t pitchz, int dimx, int dimy, int dimz); + +void Lut3dTask::kernel(const uchar4* in, uchar4* out, uint32_t length) { + uint32_t x1 = 0; + uint32_t x2 = length; + + const uchar* bp = mCubeTable; + + int4 dims = mCubeDimension - 1; + + const float4 m = (float4)(1.f / 255.f) * convert<float4>(dims); + const int4 coordMul = convert<int4>(m * (float4)0x8000); + const size_t stride_y = mCubeDimension.x * 4; + const size_t stride_z = stride_y * mCubeDimension.y; + + // ALOGE("strides %zu %zu", stride_y, stride_z); + +#if defined(ARCH_ARM_USE_INTRINSICS) + if (mUsesSimd) { + int32_t len = x2 - x1; + if (len > 0) { + rsdIntrinsic3DLUT_K(out, in, len, bp, stride_y, stride_z, dims.x, dims.y, dims.z); + x1 += len; + out += len; + in += len; + } + } +#endif + + while (x1 < x2) { + int4 baseCoord = convert<int4>(*in) * coordMul; + int4 coord1 = baseCoord >> (int4)15; + // int4 coord2 = min(coord1 + 1, gDims - 1); + + int4 weight2 = baseCoord & 0x7fff; + int4 weight1 = (int4)0x8000 - weight2; + + // ALOGE("coord1 %08x %08x %08x %08x", coord1.x, coord1.y, coord1.z, coord1.w); + const uchar* bp2 = bp + (coord1.x * 4) + (coord1.y * stride_y) + (coord1.z * stride_z); + const uchar4* pt_00 = (const uchar4*)&bp2[0]; + const uchar4* pt_10 = (const uchar4*)&bp2[stride_y]; + const uchar4* pt_01 = (const uchar4*)&bp2[stride_z]; + const uchar4* pt_11 = (const uchar4*)&bp2[stride_y + stride_z]; + + uint4 v000 = convert<uint4>(pt_00[0]); + uint4 v100 = convert<uint4>(pt_00[1]); + uint4 v010 = convert<uint4>(pt_10[0]); + uint4 
v110 = convert<uint4>(pt_10[1]); + uint4 v001 = convert<uint4>(pt_01[0]); + uint4 v101 = convert<uint4>(pt_01[1]); + uint4 v011 = convert<uint4>(pt_11[0]); + uint4 v111 = convert<uint4>(pt_11[1]); + + uint4 yz00 = ((v000 * weight1.x) + (v100 * weight2.x)) >> (int4)7; + uint4 yz10 = ((v010 * weight1.x) + (v110 * weight2.x)) >> (int4)7; + uint4 yz01 = ((v001 * weight1.x) + (v101 * weight2.x)) >> (int4)7; + uint4 yz11 = ((v011 * weight1.x) + (v111 * weight2.x)) >> (int4)7; + + uint4 z0 = ((yz00 * weight1.y) + (yz10 * weight2.y)) >> (int4)15; + uint4 z1 = ((yz01 * weight1.y) + (yz11 * weight2.y)) >> (int4)15; + + uint4 v = ((z0 * weight1.z) + (z1 * weight2.z)) >> (int4)15; + uint4 v2 = (v + 0x7f) >> (int4)8; + + uchar4 ret = convert<uchar4>(v2); + ret.w = in->w; + +#if 0 + if (!x1) { + ALOGE("in %08x %08x %08x %08x", in->r, in->g, in->b, in->a); + ALOGE("baseCoord %08x %08x %08x %08x", baseCoord.x, baseCoord.y, baseCoord.z, + baseCoord.w); + ALOGE("coord1 %08x %08x %08x %08x", coord1.x, coord1.y, coord1.z, coord1.w); + ALOGE("weight1 %08x %08x %08x %08x", weight1.x, weight1.y, weight1.z, weight1.w); + ALOGE("weight2 %08x %08x %08x %08x", weight2.x, weight2.y, weight2.z, weight2.w); + + ALOGE("v000 %08x %08x %08x %08x", v000.x, v000.y, v000.z, v000.w); + ALOGE("v100 %08x %08x %08x %08x", v100.x, v100.y, v100.z, v100.w); + ALOGE("yz00 %08x %08x %08x %08x", yz00.x, yz00.y, yz00.z, yz00.w); + ALOGE("z0 %08x %08x %08x %08x", z0.x, z0.y, z0.z, z0.w); + + ALOGE("v %08x %08x %08x %08x", v.x, v.y, v.z, v.w); + ALOGE("v2 %08x %08x %08x %08x", v2.x, v2.y, v2.z, v2.w); + } +#endif + *out = ret; + + in++; + out++; + x1++; + } +} + +void Lut3dTask::processData(int /* threadIndex */, size_t startX, size_t startY, size_t endX, + size_t endY) { + for (size_t y = startY; y < endY; y++) { + size_t offset = mSizeX * y + startX; + kernel(mIn + offset, mOut + offset, endX - startX); + } +} + +void RenderScriptToolkit::lut3d(const uint8_t* input, uint8_t* output, size_t sizeX, size_t sizeY, + const uint8_t* cube, size_t cubeSizeX, size_t cubeSizeY, + size_t cubeSizeZ, const Restriction* restriction) { +#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE + if (!validRestriction(LOG_TAG, sizeX, sizeY, restriction)) { + return; + } +#endif + + Lut3dTask task(input, output, sizeX, sizeY, cube, cubeSizeX, cubeSizeY, cubeSizeZ, restriction); + processor->doTask(&task); +} + +} // namespace renderscript diff --git a/renderscript-toolkit/src/main/cpp/Lut3d_advsimd.S b/renderscript-toolkit/src/main/cpp/Lut3d_advsimd.S new file mode 100644 index 0000000..edcb038 --- /dev/null +++ b/renderscript-toolkit/src/main/cpp/Lut3d_advsimd.S @@ -0,0 +1,250 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f: +#define END(f) .size f, .-f; + + +.macro lanepair dst, src0, src1, xr0, xr1, yr0, yr1, zr0, zr1 + + smov x6, \src0 + smov x7, \src1 + + add x6, x6, x3 + add x7, x7, x3 + + ld1 {v16.2s}, [x6], x4 + ld1 {v17.2s}, [x7], x4 + + ld1 {v18.2s}, [x6], x5 + ld1 {v19.2s}, [x7], x5 + + dup v8.8b, \yr0 + dup v9.8b, \yr1 + /* Y interpolate, front, lanes 0 and 1 -> v12 and v13 */ + zip1 v12.16b, v5.16b, v16.16b + zip1 v13.16b, v5.16b, v17.16b + umlsl v12.8h, v16.8b, v8.8b + umlsl v13.8h, v17.8b, v9.8b + umlal v12.8h, v18.8b, v8.8b + umlal v13.8h, v19.8b, v9.8b + + ld1 {v18.2s}, [x6] + ld1 {v19.2s}, [x7] + + sub x6, x6, x4 + sub x7, x7, x4 + + ld1 {v16.2s}, [x6] + ld1 {v17.2s}, [x7] + + /* Y interpolate, rear, lanes 0 and 1 -> v14 and v15 */ + zip1 v14.16b, v5.16b, v16.16b + zip1 v15.16b, v5.16b, v17.16b + umlsl v14.8h, v16.8b, v8.8b + umlsl v15.8h, v17.8b, v9.8b + umlal v14.8h, v18.8b, v8.8b + umlal v15.8h, v19.8b, v9.8b + + /* Z interpolate, lane 0 v12/v14 -> v10 */ + ushll v8.4s, v12.4h, #8 + ushll2 v9.4s, v12.8h, #8 + umlsl v8.4s, v12.4h, \zr0 + umlsl2 v9.4s, v12.8h, \zr0 + umlal v8.4s, v14.4h, \zr0 + umlal2 v9.4s, v14.8h, \zr0 + rshrn v10.4h, v8.4s, #8 + rshrn2 v10.8h, v9.4s, #8 + + /* Z interpolate, lane 1 v13/v15 -> v11 */ + ushll v8.4s, v13.4h, #8 + ushll2 v9.4s, v13.8h, #8 + umlsl v8.4s, v13.4h, \zr1 + umlsl2 v9.4s, v13.8h, \zr1 + umlal v8.4s, v15.4h, \zr1 + umlal2 v9.4s, v15.8h, \zr1 + rshrn v11.4h, v8.4s, #8 + rshrn2 v11.8h, v9.4s, #8 + + /* X interpolate, lanes 0 and 1 v10,v11 -> v14 */ + ushll v8.4s, v10.4h, #8 + ushll v9.4s, v11.4h, #8 + umlsl v8.4s, v10.4h, \xr0 + umlsl v9.4s, v11.4h, \xr1 + umlal2 v8.4s, v10.8h, \xr0 + umlal2 v9.4s, v11.8h, \xr1 + shrn v14.4h, v8.4s, #8 + shrn2 v14.8h, v9.4s, #8 + + /* pack lanes 0-1 -> v6 */ +.ifc \dst, v20.16b + uqrshrn2 \dst, v14.8h, #8 +.else ; .ifc \dst, v21.16b + uqrshrn2 \dst, v14.8h, #8 +.else + uqrshrn \dst, v14.8h, #8 +.endif ; .endif +.endm + +/* void rsdIntrinsic3DLUT_K( + * void *dst, // x0 + * void const *in, // x1 + * size_t count, // x2 + * void const *lut, // x3 + * int32_t pitchy, // w4 + * int32_t pitchz, // w5 + * int dimx, // w6 + * int dimy, // w7 + * int dimz); // [sp] + */ +ENTRY(rsdIntrinsic3DLUT_K) + ldr w8, [sp] + stp d8, d9, [sp, #-64]! + stp d10, d11, [sp, #16] + stp d12, d13, [sp, #32] + stp d14, d15, [sp, #48] + movi v4.8b, #1 + ins v4.h[0], w6 + ins v4.h[1], w7 + ins v4.h[2], w8 + ins v4.s[2], w4 + ins v4.s[3], w5 + movi v5.16b, #0 + + subs x2, x2, #8 + bge 2f + cmn x2, #8 // same as cmp x2, #-8 + ble 9f + b 4f + + .align 6 +1: st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [x0], #32 +/* x0 = dst + * x1 = src + * x2 = count + * x3 = lut + * x4 = pitchy + * x5 = pitchz + * x6 = offset0 + * x7 = offset1 + */ +2: ld4 {v0.8b-v3.8b}, [x1], #32 +/* v0,v1,v2,v3 source data + * v4 dimensions and pitches + */ +3: uxtl v0.8h, v0.8b + uxtl v1.8h, v1.8b + uxtl v2.8h, v2.8b + mul v0.8h, v0.8h, v4.h[0] + mul v1.8h, v1.8h, v4.h[1] + mul v2.8h, v2.8h, v4.h[2] + +/* ursra below would be more accurate, but this can result in a dim.0 case + * where we try to read from the limit of the array and the limit +1 to + * interpolate, even though the fractional component is zero. Strictly this is + * correct, except for the llegal access problem. 
+ */ + usra v0.8h, v0.8h, #8 + usra v1.8h, v1.8h, #8 + usra v2.8h, v2.8h, #8 + + ushr v12.8h, v0.8h, #8 + ushr v13.8h, v1.8h, #8 + ushr v14.8h, v2.8h, #8 + bic v0.8h, #0xff, LSL #8 + xtn v1.8b, v1.8h + bic v2.8h, #0xff, LSL #8 + +/* v0.8h,v1.8b,v2.hb fractional offset + * v12.8h,v13.8h,v14.8h integer offset + */ + + ushll v6.4s, v12.4h, #2 + ushll2 v7.4s, v12.8h, #2 + uxtl v8.4s, v13.4h + uxtl2 v9.4s, v13.8h + uxtl v10.4s, v14.4h + uxtl2 v11.4s, v14.8h + mla v6.4s, v8.4s, v4.s[2] + mla v7.4s, v9.4s, v4.s[2] + mla v6.4s, v10.4s, v4.s[3] + mla v7.4s, v11.4s, v4.s[3] + +/* v6,v7 list of table offsets */ + + /* lanes 0 and 1 */ + lanepair dst=v20.8b, src0=v6.s[0], src1=v6.s[1], xr0=v0.h[0], xr1=v0.h[1], yr0=v1.b[0], yr1=v1.b[1], zr0=v2.h[0], zr1=v2.h[1] + + /* lanes 2 and 3 */ + lanepair dst=v20.16b, src0=v6.s[2], src1=v6.s[3], xr0=v0.h[2], xr1=v0.h[3], yr0=v1.b[2], yr1=v1.b[3], zr0=v2.h[2], zr1=v2.h[3] + + /* lanes 4 and 5 */ + lanepair dst=v21.8b, src0=v7.s[0], src1=v7.s[1], xr0=v0.h[4], xr1=v0.h[5], yr0=v1.b[4], yr1=v1.b[5], zr0=v2.h[4], zr1=v2.h[5] + + /* lanes 6 and 7 */ + lanepair dst=v21.16b, src0=v7.s[2], src1=v7.s[3], xr0=v0.h[6], xr1=v0.h[7], yr0=v1.b[6], yr1=v1.b[7], zr0=v2.h[6], zr1=v2.h[7] + + uzp1 v6.16b, v20.16b, v21.16b + uzp2 v7.16b, v20.16b, v21.16b + uzp1 v20.16b, v6.16b, v7.16b + uzp2 v22.16b, v6.16b, v7.16b + mov v21.d[0], v20.d[1] + + subs x2, x2, #8 + mov v23.8b, v3.8b + + bge 1b + + cmn x2, #8 // same as cmp x2, #-8 + blt 1f + + st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [x0], #32 + beq 9f + + /* fill the vector with a safe value */ +4: ld4r {v0.8b-v3.8b}, [x1] + tbz x2, #2, 2f + ld4 {v0.b-v3.b}[0], [x1], #4 + ld4 {v0.b-v3.b}[1], [x1], #4 + ld4 {v0.b-v3.b}[2], [x1], #4 + ld4 {v0.b-v3.b}[3], [x1], #4 +2: tbz x2, #1, 2f + ld4 {v0.b-v3.b}[4], [x1], #4 + ld4 {v0.b-v3.b}[5], [x1], #4 +2: tbz x2, #0, 2f + ld4 {v0.b-v3.b}[6], [x1], #4 +2: b 3b + +1: tst x2, #4 + beq 2f + st4 {v20.b-v23.b}[0], [x0], #4 + st4 {v20.b-v23.b}[1], [x0], #4 + st4 {v20.b-v23.b}[2], [x0], #4 + st4 {v20.b-v23.b}[3], [x0], #4 +2: tst x2, #2 + beq 2f + st4 {v20.b-v23.b}[4], [x0], #4 + st4 {v20.b-v23.b}[5], [x0], #4 +2: tst x2, #1 + beq 9f + st4 {v20.b-v23.b}[6], [x0], #4 + +9: ldp d14, d15, [sp, #48] + ldp d12, d13, [sp, #32] + ldp d10, d11, [sp, #16] + ldp d8, d9, [sp], #64 + ret +END(rsdIntrinsic3DLUT_K) diff --git a/renderscript-toolkit/src/main/cpp/Lut3d_neon.S b/renderscript-toolkit/src/main/cpp/Lut3d_neon.S new file mode 100644 index 0000000..9590f9c --- /dev/null +++ b/renderscript-toolkit/src/main/cpp/Lut3d_neon.S @@ -0,0 +1,256 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f: .fnstart +#define END(f) .fnend; .size f, .-f; + +.eabi_attribute 25,1 @Tag_ABI_align8_preserved +.arm + +.macro lanepair dst, src, xr0, xr1, yr0, yr1, zr0, zr1 + + vmov r6, r7, \src + + add r6, r6, r3 + add r7, r7, r3 + + vld1.u8 d16, [r6], r4 + vld1.u8 d17, [r7], r4 + + vld1.u8 d18, [r6], r5 + vld1.u8 d19, [r7], r5 + + vdup.u8 d6, \yr0 + vdup.u8 d7, \yr1 + /* Y interpolate, front, lanes 0 and 1 -> q12 and q13 */ + vshll.u8 q12, d16, #8 + vshll.u8 q13, d17, #8 + vmlsl.u8 q12, d16, d6 + vmlsl.u8 q13, d17, d7 + vmlal.u8 q12, d18, d6 + vmlal.u8 q13, d19, d7 + + vld1.u8 d18, [r6] + vld1.u8 d19, [r7] + + sub r6, r6, r4 + sub r7, r7, r4 + + vld1.u8 d16, [r6] + vld1.u8 d17, [r7] + + /* Y interpolate, rear, lanes 0 and 1 -> q14 and q15 */ + vshll.u8 q14, d16, #8 + vshll.u8 q15, d17, #8 + vmlsl.u8 q14, d16, d6 + vmlsl.u8 q15, d17, d7 + vmlal.u8 q14, d18, d6 + vmlal.u8 q15, d19, d7 + + /* Z interpolate, lane 0 q12/q14 -> q10 */ + vshll.u16 q8, d24, #8 + vshll.u16 q9, d25, #8 + vmlsl.u16 q8, d24, \zr0 + vmlsl.u16 q9, d25, \zr0 + vmlal.u16 q8, d28, \zr0 + vmlal.u16 q9, d29, \zr0 + vrshrn.u32 d20, q8, #8 + vrshrn.u32 d21, q9, #8 + + /* Z interpolate, lane 1 q13/q15 -> q11 */ + vshll.u16 q8, d26, #8 + vshll.u16 q9, d27, #8 + vmlsl.u16 q8, d26, \zr1 + vmlsl.u16 q9, d27, \zr1 + vmlal.u16 q8, d30, \zr1 + vmlal.u16 q9, d31, \zr1 + vrshrn.u32 d22, q8, #8 + vrshrn.u32 d23, q9, #8 + + /* X interpolate, lanes 0 and 1 q10,q11 -> q14 */ + vshll.u16 q8, d20, #8 + vshll.u16 q9, d22, #8 + vmlsl.u16 q8, d20, \xr0 + vmlsl.u16 q9, d22, \xr1 + vmlal.u16 q8, d21, \xr0 + vmlal.u16 q9, d23, \xr1 + vshrn.u32 d28, q8, #8 + vshrn.u32 d29, q9, #8 + + /* pack lanes 0-1 -> d12 */ + vqrshrn.u16 \dst, q14, #8 +.endm + +/* void rsdIntrinsic3DLUT_K( + * void *dst, // r0 + * void const *in, // r1 + * size_t count, // r2 + * void const *lut, // r3 + * int32_t pitchy, // [sp] + * int32_t pitchz, // [sp+#4] + * int dimx, // [sp+#8] + * int dimy, // [sp+#12] + * int dimz); // [sp+#16] + */ +ENTRY(rsdIntrinsic3DLUT_K) + push {r4,r5,r6,r7} + ldr r4, [sp, #16] + ldr r5, [sp, #20] + ldr r6, [sp, #24] + ldr r7, [sp, #28] + ldr r12, [sp, #32] + vpush {d8-d15} + + vmov.u8 d8, #1 + vmov.u16 d8[0], r6 + vmov.u16 d8[1], r7 + vmov.u16 d8[2], r12 + vmov d9, r4, r5 + + subs r2, #8 + bge 2f + cmp r2, #-8 + ble 9f + b 4f + + .align 6 +1: vst4.u8 {d12,d13,d14,d15}, [r0]! +/* r0 = dst + * r1 = src + * r2 = count + * r3 = lut + * r4 = pitchy + * r5 = pitchz + * r6 = offset0 + * r7 = offset1 + */ +2: vld4.u8 {d0,d2,d4,d6}, [r1]! +3: vmov d10, d6 +/* q0,q1,q2,q5 source data + * q4 dimensions and pitches + * q3, scratch register for scalar access + */ + vmov q3, q4 + vmovl.u8 q0, d0 + vmovl.u8 q1, d2 + vmovl.u8 q2, d4 + vmul.u16 q0, q0, d6[0] + vmul.u16 q1, q1, d6[1] + vmul.u16 q2, q2, d6[2] + +/* vrsra.u16 below would be more accurate, but this can result in a dim.0 case + * where we try to read from the limit of the array and the limit +1 to + * interpolate, even though the fractional component is zero. Strictly this is + * correct, except for the llegal access problem. 
+ */ + vsra.u16 q0, q0, #8 + vsra.u16 q1, q1, #8 + vsra.u16 q2, q2, #8 + + vshr.u16 q12, q0, #8 + vshr.u16 q13, q1, #8 + vshr.u16 q14, q2, #8 + + vbic.u16 q0, #0xff00 + vmovn.u16 d2, q1 + vbic.u16 q2, #0xff00 + +/* q0,d2,q2 fractional offset + * q12,q13,q14 integer offset + */ + + vshll.u16 q6, d24, #2 + vshll.u16 q7, d25, #2 + vmovl.u16 q8, d26 + vmovl.u16 q9, d27 + vmovl.u16 q10, d28 + vmovl.u16 q11, d29 + vmla.s32 q6, q8, d9[0] + vmla.s32 q7, q9, d9[0] + vmla.s32 q6, q10, d9[1] + vmla.s32 q7, q11, d9[1] + +/* q6,q7 list of table offsets */ + + /* lanes 0 and 1 */ + lanepair dst=d12, src=d12, xr0=d0[0], xr1=d0[1], yr0=d2[0], yr1=d2[1], zr0=d4[0], zr1=d4[1] + + /* lanes 2 and 3 */ + lanepair dst=d13, src=d13, xr0=d0[2], xr1=d0[3], yr0=d2[2], yr1=d2[3], zr0=d4[2], zr1=d4[3] + + /* lanes 4 and 5 */ + lanepair dst=d14, src=d14, xr0=d1[0], xr1=d1[1], yr0=d2[4], yr1=d2[5], zr0=d5[0], zr1=d5[1] + + /* lanes 6 and 7 */ + lanepair dst=d15, src=d15, xr0=d1[2], xr1=d1[3], yr0=d2[6], yr1=d2[7], zr0=d5[2], zr1=d5[3] + + vuzp.u8 d12, d13 + vuzp.u8 d14, d15 + vuzp.u8 d12, d14 + vuzp.u8 d13, d15 + + subs r2, r2, #8 + vmov.u8 d15, d10 + + bge 1b + + cmp r2, #-8 + blt 1f + + vst4.u8 {d12,d13,d14,d15}, [r0]! + + beq 9f + + /* fill the vector with a safe value */ +4: vld1.u32 {d0[]}, [r1] + vmov d2, d0 + vmov d4, d0 + vmov d6, d0 + tst r2, #4 + beq 2f + vld1.u32 {d0}, [r1]! + vld1.u32 {d2}, [r1]! +2: tst r2, #2 + beq 2f + vld1.u32 {d4}, [r1]! +2: tst r2, #1 + beq 2f + vld1.u32 {d6[0]}, [r1]! +2: vuzp.8 d0, d2 + vuzp.8 d4, d6 + vuzp.8 d0, d4 + vuzp.8 d2, d6 + b 3b + +1: vzip.8 d12, d14 + vzip.8 d13, d15 + vzip.8 d12, d13 + vzip.8 d14, d15 + tst r2, #4 + beq 2f + vst1.u32 {d12,d13}, [r0]! +2: tst r2, #2 + beq 2f + vst1.u32 {d14}, [r0]! +2: tst r2, #1 + beq 9f + vst1.u32 {d15[0]}, [r0]! + +9: mov r0, #0 + vpop {d8-d15} + pop {r4,r5,r6,r7} + bx lr +END(rsdIntrinsic3DLUT_K) diff --git a/renderscript-toolkit/src/main/cpp/RenderScriptToolkit.cpp b/renderscript-toolkit/src/main/cpp/RenderScriptToolkit.cpp new file mode 100644 index 0000000..ae348d3 --- /dev/null +++ b/renderscript-toolkit/src/main/cpp/RenderScriptToolkit.cpp @@ -0,0 +1,36 @@ +/* + * Copyright (C) 2021 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "RenderScriptToolkit.h" + +#include "TaskProcessor.h" + +#define LOG_TAG "renderscript.toolkit.RenderScriptToolkit" + +namespace renderscript { + +// You will find the implementation of the various transformations in the correspondingly +// named source file. E.g. RenderScriptToolkit::blur() is found in Blur.cpp. + +RenderScriptToolkit::RenderScriptToolkit(int numberOfThreads) + : processor{new TaskProcessor(numberOfThreads)} {} + +RenderScriptToolkit::~RenderScriptToolkit() { + // By defining the destructor here, we don't need to include TaskProcessor.h + // in RenderScriptToolkit.h. 
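+    // (std::unique_ptr<TaskProcessor> needs the complete type to generate its deleter, so the
+    // destructor must be defined in a translation unit that sees the full TaskProcessor class.)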
+} + +} // namespace renderscript diff --git a/renderscript-toolkit/src/main/cpp/RenderScriptToolkit.h b/renderscript-toolkit/src/main/cpp/RenderScriptToolkit.h new file mode 100644 index 0000000..5315a93 --- /dev/null +++ b/renderscript-toolkit/src/main/cpp/RenderScriptToolkit.h @@ -0,0 +1,538 @@ +/* + * Copyright (C) 2021 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ANDROID_RENDERSCRIPT_TOOLKIT_TOOLKIT_H +#define ANDROID_RENDERSCRIPT_TOOLKIT_TOOLKIT_H + +#include <cstdint> +#include <memory> + +namespace renderscript { + +class TaskProcessor; + +/** + * Define a range of data to process. + * + * This class is used to restrict a Toolkit operation to a rectangular subset of the input + * buffer. + * + * @property startX The index of the first value to be included on the X axis. + * @property endX The index after the last value to be included on the X axis. + * @property startY The index of the first value to be included on the Y axis. + * @property endY The index after the last value to be included on the Y axis. + */ +struct Restriction { + size_t startX; + size_t endX; + size_t startY; + size_t endY; +}; + +/** + * A collection of high-performance graphic utility functions like blur and blend. + * + * This toolkit provides ten image manipulation functions: blend, blur, color matrix, convolve, + * histogram, histogramDot, lut, lut3d, resize, and YUV to RGB. These functions execute + * multithreaded on the CPU. + * + * These functions work over raw byte arrays. You'll need to specify the width and height of + * the data to be processed, as well as the number of bytes per pixel. For most use cases, + * this will be 4. + * + * You should instantiate the Toolkit once and reuse it throughout your application. + * On instantiation, the Toolkit creates a thread pool that's used for processing all the functions. + * You can limit the number of pool threads used by the Toolkit via the constructor. The pool + * threads are destroyed once the Toolkit is destroyed, after any pending work is done. + * + * This library is thread safe. You can call methods from different threads. The functions will + * execute sequentially. + * + * A Java/Kotlin Toolkit is available. It calls this library through JNI. + * + * This toolkit can be used as a replacement for most RenderScript Intrinsic functions. Compared + * to RenderScript, it's simpler to use and more than twice as fast on the CPU. However, RenderScript + * Intrinsics allow more flexibility for the type of allocation supported. In particular, this + * toolkit does not support allocations of floats. + */ +class RenderScriptToolkit { + /** Each Toolkit method call is converted to a Task. The processor owns the thread pool. It + * tiles the tasks and schedules them over the pool threads. + */ + std::unique_ptr<TaskProcessor> processor; + + public: + /** + * Creates the pool threads that are used for processing the method calls.
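+     *
+     * Illustrative use (buffer and size names are the caller's own; treating 0, the default,
+     * as "let the toolkit pick the pool size" is an assumption):
+     *   RenderScriptToolkit toolkit;                             // default-sized pool
+     *   toolkit.blur(inBytes, outBytes, width, height, 4, 10);   // RGBA blur, radius 10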
+ */ + RenderScriptToolkit(int numberOfThreads = 0); + /** + * Destroys the thread pool. This stops any in-progress work; the Toolkit methods called from + * other pool threads will return without having completed the work. Because of the undefined + * state of the output buffers, an application should avoid destroying the Toolkit if other pool + * threads are executing Toolkit methods. + */ + ~RenderScriptToolkit(); + + /** + * Determines how a source buffer is blended into a destination buffer. + * + * See {@link RenderScriptToolkit::blend}. + * + * blend only works on 4 byte RGBA data. In the descriptions below, ".a" represents + * the alpha channel. + */ + enum class BlendingMode { + /** + * dest = 0 + * + * The destination is cleared, i.e. each pixel is set to (0, 0, 0, 0) + */ + CLEAR = 0, + /** + * dest = src + * + * Sets each pixel of the destination to the corresponding one in the source. + */ + SRC = 1, + /** + * dest = dest + * + * Leaves the destination untouched. This is a no-op. + */ + DST = 2, + /** + * dest = src + dest * (1.0 - src.a) + */ + SRC_OVER = 3, + /** + * dest = dest + src * (1.0 - dest.a) + */ + DST_OVER = 4, + /** + * dest = src * dest.a + */ + SRC_IN = 5, + /** + * dest = dest * src.a + */ + DST_IN = 6, + /** + * dest = src * (1.0 - dest.a) + */ + SRC_OUT = 7, + /** + * dest = dest * (1.0 - src.a) + */ + DST_OUT = 8, + /** + * dest.rgb = src.rgb * dest.a + (1.0 - src.a) * dest.rgb, dest.a = dest.a + */ + SRC_ATOP = 9, + /** + * dest = dest.rgb * src.a + (1.0 - dest.a) * src.rgb, dest.a = src.a + */ + DST_ATOP = 10, + /** + * dest = {src.r ^ dest.r, src.g ^ dest.g, src.b ^ dest.b, src.a ^ dest.a} + * + * Note: this is NOT the Porter/Duff XOR mode; this is a bitwise xor. + */ + XOR = 11, + /** + * dest = src * dest + */ + MULTIPLY = 12, + /** + * dest = min(src + dest, 1.0) + */ + ADD = 13, + /** + * dest = max(dest - src, 0.0) + */ + SUBTRACT = 14 + }; + + /** + * Blend a source buffer with the destination buffer. + * + * Blends a source buffer and a destination buffer, placing the result in the destination + * buffer. The blending is done pairwise between two corresponding RGBA values found in + * each buffer. The mode parameter specifies one of fifteen blending operations. + * See {@link BlendingMode}. + * + * An optional range parameter can be set to restrict the operation to a rectangular subset + * of each buffer. If provided, the range must be wholly contained with the dimensions + * described by sizeX and sizeY. + * + * The source and destination buffers must have the same dimensions. Both buffers should be + * large enough for sizeX * sizeY * 4 bytes. The buffers have a row-major layout. + * + * @param mode The specific blending operation to do. + * @param source The RGBA input buffer. + * @param dest The destination buffer. Used for input and output. + * @param sizeX The width of both buffers, as a number of RGBA values. + * @param sizeY The height of both buffers, as a number of RGBA values. + * @param restriction When not null, restricts the operation to a 2D range of pixels. + */ + void blend(BlendingMode mode, const uint8_t* _Nonnull source, uint8_t* _Nonnull dst, + size_t sizeX, size_t sizeY, const Restriction* _Nullable restriction = nullptr); + + /** + * Blur an image. + * + * Performs a Gaussian blur of the input image and stores the result in the out buffer. + * + * The radius determines which pixels are used to compute each blurred pixels. This Toolkit + * accepts values between 1 and 25. 
Larger values create a more blurred effect but also + * take longer to compute. When the radius extends past the edge, the edge pixel will + * be used as replacement for the pixel that's out of bounds. + * + * Each input pixel can either be represented by four bytes (RGBA format) or one byte + * for the less common blurring of an alpha-channel-only image. + * + * An optional range parameter can be set to restrict the operation to a rectangular subset + * of each buffer. If provided, the range must be wholly contained with the dimensions + * described by sizeX and sizeY. + * + * The input and output buffers must have the same dimensions. Both buffers should be + * large enough for sizeX * sizeY * vectorSize bytes. The buffers have a row-major layout. + * + * @param in The buffer of the image to be blurred. + * @param out The buffer that receives the blurred image. + * @param sizeX The width of both buffers, as a number of 1 or 4 byte cells. + * @param sizeY The height of both buffers, as a number of 1 or 4 byte cells. + * @param vectorSize Either 1 or 4, the number of bytes in each cell, i.e. A vs. RGBA. + * @param radius The radius of the pixels used to blur. + * @param restriction When not null, restricts the operation to a 2D range of pixels. + */ + void blur(const uint8_t* _Nonnull in, uint8_t* _Nonnull out, size_t sizeX, size_t sizeY, + size_t vectorSize, int radius, const Restriction* _Nullable restriction = nullptr); + + /** + * Identity matrix that can be passed to the {@link RenderScriptToolkit::colorMatrix} method. + * + * Using this matrix will result in no change to the pixel through multiplication, although + * the pixel value can still be modified by the add vector, or transformed to a different + * format. + */ + static constexpr float kIdentityMatrix[] = { + 1.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 1.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 1.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 1.0f + }; + + /** + * Matrix to turn color pixels to greyscale. + * + * Use this matrix with the {@link RenderScriptToolkit::colorMatrix} method to convert an + * image from color to greyscale. + */ + static constexpr float kGreyScaleColorMatrix[] = { + 0.299f, 0.299f, 0.299f, 0.0f, + 0.587f, 0.587f, 0.587f, 0.0f, + 0.114f, 0.114f, 0.114f, 0.0f, + 0.0f, 0.0f, 0.0f, 1.0f + }; + + /** + * Matrix to convert RGB to YUV. + * + * Use this matrix with the {@link RenderScriptToolkit::colorMatrix} method to convert the + * first three bytes of each pixel from RGB to YUV. This leaves the last byte (the alpha + * channel) untouched. + * + * This is a simplistic conversion. Most YUV buffers have more complicated formats, not supported + * by this method. + */ + static constexpr float kRgbToYuvMatrix[] = { + 0.299f, -0.14713f, 0.615f, 0.0f, + 0.587f, -0.28886f, -0.51499f, 0.0f, + 0.114f, 0.436f, -0.10001f, 0.0f, + 0.0f, 0.0f, 0.0f, 1.0f + }; + + /** + * Matrix to convert YUV to RGB. + * + * Use this matrix with the {@link RenderScriptToolkit::colorMatrix} method to convert the + * first three bytes of each pixel from YUV to RGB. This leaves the last byte (the alpha + * channel) untouched. + * + * This is a simplistic conversion. Most YUV buffers have more complicated formats, not supported + * by this method. Use {@link RenderScriptToolkit::yuvToRgb} to convert these buffers. + */ + static constexpr float kYuvToRgbMatrix[] = { + 1.0f, 1.0f, 1.0f, 0.0f, + 0.0f, -0.39465f, 2.03211f, 0.0f, + 1.13983f, -0.5806f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 1.0f + }; + + /** + * Transform an image using a color matrix.
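+     *
+     * Illustrative call (caller-owned RGBA buffers) converting an image to greyscale with the
+     * matrix defined above:
+     *   toolkit.colorMatrix(inBytes, outBytes, 4, 4, width, height,
+     *                       RenderScriptToolkit::kGreyScaleColorMatrix);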
+ * + * Converts a 2D array of vectors of unsigned bytes, multiplying each vector by a 4x4 matrix + * and adding an optional vector. + * + * Each input vector is composed of 1-4 unsigned bytes. If less than 4 bytes, it's extended to + * 4, padding with zeroes. The unsigned bytes are converted from 0-255 to 0.0-1.0 floats + * before the multiplication is done. + * + * The resulting value is normalized from 0.0-1.0 to a 0-255 value and stored in the output. + * If the output vector size is less than four, the unused channels are discarded. + * + * If addVector is null, a vector of zeroes is added, i.e. a noop. + * + * Check kIdentityMatrix, kGreyScaleColorMatrix, kRgbToYuvMatrix, and kYuvToRgbMatrix for sample + * matrices. The YUV conversion may not work for all color spaces. + * + * @param in The buffer of the image to be converted. + * @param out The buffer that receives the converted image. + * @param inputVectorSize The number of bytes in each input cell, a value from 1 to 4. + * @param outputVectorSize The number of bytes in each output cell, a value from 1 to 4. + * @param sizeX The width of both buffers, as a number of 1 to 4 byte cells. + * @param sizeY The height of both buffers, as a number of 1 to 4 byte cells. + * @param matrix The 4x4 matrix to multiply, in row major format. + * @param addVector A vector of four floats that's added to the result of the multiplication. + * @param restriction When not null, restricts the operation to a 2D range of pixels. + */ + void colorMatrix(const void* _Nonnull in, void* _Nonnull out, size_t inputVectorSize, + size_t outputVectorSize, size_t sizeX, size_t sizeY, + const float* _Nonnull matrix, const float* _Nullable addVector = nullptr, + const Restriction* _Nullable restriction = nullptr); + + /** + * Convolve a ByteArray. + * + * Applies a 3x3 or 5x5 convolution to the input array using the provided coefficients. + * + * For 3x3 convolutions, 9 coefficients must be provided. For 5x5, 25 coefficients are needed. + * The coefficients should be provided in row-major format. + * + * When the square extends past the edge, the edge values will be used as replacement for the + * values that are out of bounds. + * + * Each input cell can be represented by one to four bytes. Each byte is multiplied + * and accumulated independently of the other bytes of the cell. + * + * An optional range parameter can be set to restrict the operation to a rectangular subset + * of each buffer. If provided, the range must be wholly contained with the dimensions + * described by sizeX and sizeY. + * + * The input and output buffers must have the same dimensions. Both buffers should be + * large enough for sizeX * sizeY * vectorSize bytes. The buffers have a row-major layout. + * + * @param in The buffer of the image to be convolved. + * @param out The buffer that receives the convolved image. + * @param vectorSize The number of bytes in each cell, a value from 1 to 4. + * @param sizeX The width of both buffers, as a number of 1 or 4 byte cells. + * @param sizeY The height of both buffers, as a number of 1 or 4 byte cells. + * @param coefficients 9 or 25 multipliers. + * @param restriction When not null, restricts the operation to a 2D range of pixels.
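+     *
+     * Illustrative call (caller-owned RGBA buffers) applying a common 3x3 sharpening kernel;
+     * the kernel values are an example, not something shipped with the toolkit:
+     *   const float sharpen[9] = { 0.f, -1.f,  0.f,
+     *                             -1.f,  5.f, -1.f,
+     *                              0.f, -1.f,  0.f };
+     *   toolkit.convolve3x3(inBytes, outBytes, 4, width, height, sharpen);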
+ */ + void convolve3x3(const void* _Nonnull in, void* _Nonnull out, size_t vectorSize, size_t sizeX, + size_t sizeY, const float* _Nonnull coefficients, + const Restriction* _Nullable restriction = nullptr); + + void convolve5x5(const void* _Nonnull in, void* _Nonnull out, size_t vectorSize, size_t sizeX, + size_t sizeY, const float* _Nonnull coefficients, + const Restriction* _Nullable restriction = nullptr); + + /** + * Compute the histogram of an image. + * + * Tallies how many times each of the 256 possible values of a byte is found in the input. + * + * An input cell can be represented by one to four bytes. The tally is done independently + * for each of the bytes of the cell. Correspondingly, the out array will have + * 256 * vectorSize entries. The counts for value 0 are consecutive, followed by those for + * value 1, etc. + * + * An optional range parameter can be set to restrict the operation to a rectangular subset + * of each buffer. If provided, the range must be wholly contained with the dimensions + * described by sizeX and sizeY. + * + * The source buffers should be large enough for sizeX * sizeY * vectorSize bytes. The buffers + * have a row-major layout. The out buffer should be large enough for 256 * vectorSize ints. + * + * @param in The buffer of the image to be analyzed. + * @param out The resulting vector of counts. + * @param sizeX The width of the input buffers, as a number of 1 or 4 byte cells. + * @param sizeY The height of the input buffers, as a number of 1 or 4 byte cells. + * @param vectorSize The number of bytes in each cell, a value from 1 to 4. + * @param restriction When not null, restricts the operation to a 2D range of pixels. + */ + void histogram(const uint8_t* _Nonnull in, int32_t* _Nonnull out, size_t sizeX, size_t sizeY, + size_t vectorSize, const Restriction* _Nullable restriction = nullptr); + + /** + * Compute the histogram of the dot product of an image. + * + * This method supports cells of 1 to 4 bytes in length. For each cell of the array, + * the dot product of its bytes with the provided coefficients is computed. The resulting + * floating point value is converted to an unsigned byte and tallied in the histogram. + * + * If coefficients is null, the coefficients used for RGBA luminosity calculation will be used, + * i.e. the values [0.299f, 0.587f, 0.114f, 0.f]. + * + * Each coefficients must be >= 0 and their sum must be 1.0 or less. There must be the same + * number of coefficients as vectorSize. + * + * An optional range parameter can be set to restrict the operation to a rectangular subset + * of each buffer. If provided, the range must be wholly contained with the dimensions + * described by sizeX and sizeY. + * + * The source buffers should be large enough for sizeX * sizeY * vectorSize bytes. The buffers + * have a row-major layout. The out array should be large enough for 256 ints. + * + * @param in The buffer of the image to be analyzed. + * @param out The resulting vector of counts. + * @param sizeX The width of the input buffers, as a number of 1 or 4 byte cells. + * @param sizeY The height of the input buffers, as a number of 1 or 4 byte cells. + * @param vectorSize The number of bytes in each cell, a value from 1 to 4. + * @param coefficients The values used for the dot product. Can be nullptr. + * @param restriction When not null, restricts the operation to a 2D range of pixels. 
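+     *
+     * Illustrative call (caller-owned buffers) building a luminosity histogram of an RGBA
+     * image; passing nullptr selects the default luminosity coefficients described above:
+     *   int32_t counts[256];
+     *   toolkit.histogramDot(inBytes, counts, width, height, 4, nullptr);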
+ */ + void histogramDot(const uint8_t* _Nonnull in, int32_t* _Nonnull out, size_t sizeX, size_t sizeY, + size_t vectorSize, const float* _Nullable coefficients, + const Restriction* _Nullable restriction = nullptr); + + /** + * Transform an image using a look up table + * + * Transforms an image by using a per-channel lookup table. Each channel of the input has an + * independent lookup table. The tables are 256 entries in size and can cover the full value + * range of a byte. + * + * The input array should be in RGBA format, where four consecutive bytes form an cell. + * + * An optional range parameter can be set to restrict the operation to a rectangular subset + * of each buffer. If provided, the range must be wholly contained with the dimensions + * described by sizeX and sizeY. + * + * The input and output buffers must have the same dimensions. Both buffers should be + * large enough for sizeX * sizeY * vectorSize bytes. The buffers have a row-major layout. + * + * @param in The buffer of the image to be transformed. + * @param out The buffer that receives the transformed image. + * @param sizeX The width of both buffers, as a number of 4 byte cells. + * @param sizeY The height of both buffers, as a number of 4 byte cells. + * @param red An array of 256 values that's used to convert the R channel. + * @param green An array of 256 values that's used to convert the G channel. + * @param blue An array of 256 values that's used to convert the B channel. + * @param alpha An array of 256 values that's used to convert the A channel. + * @param restriction When not null, restricts the operation to a 2D range of pixels. + */ + void lut(const uint8_t* _Nonnull in, uint8_t* _Nonnull out, size_t sizeX, size_t sizeY, + const uint8_t* _Nonnull red, const uint8_t* _Nonnull green, + const uint8_t* _Nonnull blue, const uint8_t* _Nonnull alpha, + const Restriction* _Nullable restriction = nullptr); + + /** + * Transform an image using a 3D look up table + * + * Transforms an image, converting RGB to RGBA by using a 3D lookup table. The incoming R, G, + * and B values are normalized to the dimensions of the provided 3D buffer. The eight nearest + * values in that 3D buffer are sampled and linearly interpolated. The resulting RGBA entry + * is stored in the output. + * + * The input array should be in RGBA format, where four consecutive bytes form an cell. + * The fourth byte of each input cell is ignored. + * + * An optional range parameter can be set to restrict the operation to a rectangular subset + * of each buffer. If provided, the range must be wholly contained with the dimensions + * described by sizeX and sizeY. + * + * The input and output buffers must have the same dimensions. Both buffers should be + * large enough for sizeX * sizeY * vectorSize bytes. The buffers have a row-major layout. + * + * @param in The buffer of the image to be transformed. + * @param out The buffer that receives the transformed image. + * @param sizeX The width of both buffers, as a number of 4 byte cells. + * @param sizeY The height of both buffers, as a number of 4 byte cells. + * @param cube The translation cube, in row major-format. + * @param cubeSizeX The number of RGBA entries in the cube in the X direction. + * @param cubeSizeY The number of RGBA entries in the cube in the Y direction. + * @param cubeSizeZ The number of RGBA entries in the cube in the Z direction. + * @param restriction When not null, restricts the operation to a 2D range of pixels. 
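+     *
+     * Illustrative call (caller-owned buffers) applying a 32x32x32 color cube, i.e. cubeBytes
+     * holds 32 * 32 * 32 RGBA entries in row-major order:
+     *   toolkit.lut3d(inBytes, outBytes, width, height, cubeBytes, 32, 32, 32);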
+ */ + void lut3d(const uint8_t* _Nonnull in, uint8_t* _Nonnull out, size_t sizeX, size_t sizeY, + const uint8_t* _Nonnull cube, size_t cubeSizeX, size_t cubeSizeY, size_t cubeSizeZ, + const Restriction* _Nullable restriction = nullptr); + + /** + * Resize an image. + * + * Resizes an image using bicubic interpolation. + * + * This method supports cells of 1 to 4 bytes in length. Each byte of the cell is + * interpolated independently from the others. + * + * An optional range parameter can be set to restrict the operation to a rectangular subset + * of the output buffer. The corresponding scaled range of the input will be used. If provided, + * the range must be wholly contained with the dimensions described by outputSizeX and + * outputSizeY. + * + * The input and output buffers have a row-major layout. Both buffers should be + * large enough for sizeX * sizeY * vectorSize bytes. + * + * @param in The buffer of the image to be resized. + * @param out The buffer that receives the resized image. + * @param inputSizeX The width of the input buffer, as a number of 1-4 byte cells. + * @param inputSizeY The height of the input buffer, as a number of 1-4 byte cells. + * @param vectorSize The number of bytes in each cell of both buffers. A value from 1 to 4. + * @param outputSizeX The width of the output buffer, as a number of 1-4 byte cells. + * @param outputSizeY The height of the output buffer, as a number of 1-4 byte cells. + * @param restriction When not null, restricts the operation to a 2D range of pixels. + */ + void resize(const uint8_t* _Nonnull in, uint8_t* _Nonnull out, size_t inputSizeX, + size_t inputSizeY, size_t vectorSize, size_t outputSizeX, size_t outputSizeY, + const Restriction* _Nullable restriction = nullptr); + + /** + * The YUV formats supported by yuvToRgb. + */ + enum class YuvFormat { + NV21 = 0x11, + YV12 = 0x32315659, + }; + + /** + * Convert an image from YUV to RGB. + * + * Converts an Android YUV buffer to RGB. The input allocation should be + * supplied in a supported YUV format as a YUV cell Allocation. + * The output is RGBA; the alpha channel will be set to 255. + * + * Note that for YV12 and a sizeX that's not a multiple of 32, the + * RenderScript Intrinsic may not have converted the image correctly. + * This Toolkit method should. + * + * @param in The buffer of the image to be converted. + * @param out The buffer that receives the converted image. + * @param sizeX The width in pixels of the image. Must be even. + * @param sizeY The height in pixels of the image. + * @param format Either YV12 or NV21. + */ + void yuvToRgb(const uint8_t* _Nonnull in, uint8_t* _Nonnull out, size_t sizeX, size_t sizeY, + YuvFormat format); +}; + +} // namespace renderscript + +#endif // ANDROID_RENDERSCRIPT_TOOLKIT_TOOLKIT_H diff --git a/renderscript-toolkit/src/main/cpp/Resize.cpp b/renderscript-toolkit/src/main/cpp/Resize.cpp new file mode 100644 index 0000000..8865e2a --- /dev/null +++ b/renderscript-toolkit/src/main/cpp/Resize.cpp @@ -0,0 +1,767 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <math.h> + +#include <cstdint> + +#include "RenderScriptToolkit.h" +#include "TaskProcessor.h" +#include "Utils.h" + +#if defined(ARCH_X86_HAVE_AVX2) +#include <stdint.h> +#include <x86intrin.h> +#include <xmmintrin.h> +#endif + +#define LOG_TAG "renderscript.toolkit.Resize" + +namespace renderscript { + +class ResizeTask : public Task { + const uchar* mIn; + uchar* mOut; + float mScaleX; + float mScaleY; + size_t mInputSizeX; + size_t mInputSizeY; + + void kernelU1(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY); + void kernelU2(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY); + void kernelU4(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY); +#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT + void kernelF1(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY); + void kernelF2(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY); + void kernelF4(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY); +#endif // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT + + // Process a 2D tile of the overall work. threadIndex identifies which thread does the work. + void processData(int threadIndex, size_t startX, size_t startY, size_t endX, + size_t endY) override; + + public: + ResizeTask(const uchar* input, uchar* output, size_t inputSizeX, size_t inputSizeY, + size_t vectorSize, size_t outputSizeX, size_t outputSizeY, + const Restriction* restriction) + : Task{outputSizeX, outputSizeY, vectorSize, false, restriction}, + mIn{input}, + mOut{output}, + mInputSizeX{inputSizeX}, + mInputSizeY{inputSizeY} { + mScaleX = static_cast<float>(inputSizeX) / outputSizeX; + mScaleY = static_cast<float>(inputSizeY) / outputSizeY; + } +}; + +void ResizeTask::processData(int /* threadIndex */, size_t startX, size_t startY, size_t endX, + size_t endY) { + typedef void (ResizeTask::*KernelFunction)(uchar*, uint32_t, uint32_t, uint32_t); + + KernelFunction kernel; + switch (mVectorSize) { + case 4: + kernel = &ResizeTask::kernelU4; + break; + case 3: + kernel = &ResizeTask::kernelU4; + break; + case 2: + kernel = &ResizeTask::kernelU2; + break; + case 1: + kernel = &ResizeTask::kernelU1; + break; + default: + ALOGE("Bad vector size %zd", mVectorSize); + } + + for (size_t y = startY; y < endY; y++) { + size_t offset = (mSizeX * y + startX) * paddedSize(mVectorSize); + uchar* out = mOut + offset; + std::invoke(kernel, this, out, startX, endX, y); + } +} + +static float4 cubicInterpolate(float4 p0, float4 p1, float4 p2, float4 p3, float x) { + return p1 + 0.5f * x * (p2 - p0 + x * (2.f * p0 - 5.f * p1 + 4.f * p2 - p3 + + x * (3.f * (p1 - p2) + p3 - p0))); +} + +static float2 cubicInterpolate(float2 p0,float2 p1,float2 p2,float2 p3, float x) { + return p1 + 0.5f * x * (p2 - p0 + x * (2.f * p0 - 5.f * p1 + 4.f * p2 - p3 + + x * (3.f * (p1 - p2) + p3 - p0))); +} + + +#if defined(ARCH_X86_HAVE_AVX2) +static float cubicInterpolate(float p0,float p1,float p2,float p3 , float x) { + return p1 + 0.5f * x * (p2 - p0 + x * (2.f * p0 - 5.f * p1 + + _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(4.f), _mm_set1_ps(p2),_mm_set1_ps(p3))) + + x * (_mm_cvtss_f32(_mm_fmadd_ss (_mm_set1_ps(3.f),_mm_set1_ps(p1 - p2), + _mm_set1_ps(p3 - p0)))))); + +} +#else +static float cubicInterpolate(float p0,float p1,float p2,float p3 , float x) { + //ALOGI("CP, %f, %f, %f, %f, %f", p0, p1, p2, p3, x); + return p1 + 0.5f * x * (p2 - p0 + x * (2.f * p0 
- 5.f * p1 + 4.f * p2 - p3 + + x * (3.f * (p1 - p2) + p3 - p0))); +} +#endif + +static uchar4 OneBiCubic(const uchar4 *yp0, const uchar4 *yp1, const uchar4 *yp2, const uchar4 *yp3, + float xf, float yf, int width) { + int startx = (int) floor(xf - 1); + xf = xf - floor(xf); + int maxx = width - 1; + int xs0 = std::max(0, startx + 0); + int xs1 = std::max(0, startx + 1); + int xs2 = std::min(maxx, startx + 2); + int xs3 = std::min(maxx, startx + 3); + + float4 p0 = cubicInterpolate(convert<float4>(yp0[xs0]), + convert<float4>(yp0[xs1]), + convert<float4>(yp0[xs2]), + convert<float4>(yp0[xs3]), xf); + + float4 p1 = cubicInterpolate(convert<float4>(yp1[xs0]), + convert<float4>(yp1[xs1]), + convert<float4>(yp1[xs2]), + convert<float4>(yp1[xs3]), xf); + + float4 p2 = cubicInterpolate(convert<float4>(yp2[xs0]), + convert<float4>(yp2[xs1]), + convert<float4>(yp2[xs2]), + convert<float4>(yp2[xs3]), xf); + + float4 p3 = cubicInterpolate(convert<float4>(yp3[xs0]), + convert<float4>(yp3[xs1]), + convert<float4>(yp3[xs2]), + convert<float4>(yp3[xs3]), xf); + + float4 p = cubicInterpolate(p0, p1, p2, p3, yf); + p = clamp(p + 0.5f, 0.f, 255.f); + return convert<uchar4>(p); +} + +static uchar2 OneBiCubic(const uchar2 *yp0, const uchar2 *yp1, const uchar2 *yp2, const uchar2 *yp3, + float xf, float yf, int width) { + int startx = (int) floor(xf - 1); + xf = xf - floor(xf); + int maxx = width - 1; + int xs0 = std::max(0, startx + 0); + int xs1 = std::max(0, startx + 1); + int xs2 = std::min(maxx, startx + 2); + int xs3 = std::min(maxx, startx + 3); + + float2 p0 = cubicInterpolate(convert<float2>(yp0[xs0]), + convert<float2>(yp0[xs1]), + convert<float2>(yp0[xs2]), + convert<float2>(yp0[xs3]), xf); + + float2 p1 = cubicInterpolate(convert<float2>(yp1[xs0]), + convert<float2>(yp1[xs1]), + convert<float2>(yp1[xs2]), + convert<float2>(yp1[xs3]), xf); + + float2 p2 = cubicInterpolate(convert<float2>(yp2[xs0]), + convert<float2>(yp2[xs1]), + convert<float2>(yp2[xs2]), + convert<float2>(yp2[xs3]), xf); + + float2 p3 = cubicInterpolate(convert<float2>(yp3[xs0]), + convert<float2>(yp3[xs1]), + convert<float2>(yp3[xs2]), + convert<float2>(yp3[xs3]), xf); + + float2 p = cubicInterpolate(p0, p1, p2, p3, yf); + p = clamp(p + 0.5f, 0.f, 255.f); + return convert<uchar2>(p); +} + +static uchar OneBiCubic(const uchar *yp0, const uchar *yp1, const uchar *yp2, const uchar *yp3, + float xf, float yf, int width) { + int startx = (int) floor(xf - 1); + xf = xf - floor(xf); + int maxx = width - 1; + int xs0 = std::max(0, startx + 0); + int xs1 = std::max(0, startx + 1); + int xs2 = std::min(maxx, startx + 2); + int xs3 = std::min(maxx, startx + 3); + + float p0 = cubicInterpolate((float)yp0[xs0], (float)yp0[xs1], + (float)yp0[xs2], (float)yp0[xs3], xf); + float p1 = cubicInterpolate((float)yp1[xs0], (float)yp1[xs1], + (float)yp1[xs2], (float)yp1[xs3], xf); + float p2 = cubicInterpolate((float)yp2[xs0], (float)yp2[xs1], + (float)yp2[xs2], (float)yp2[xs3], xf); + float p3 = cubicInterpolate((float)yp3[xs0], (float)yp3[xs1], + (float)yp3[xs2], (float)yp3[xs3], xf); + + float p = cubicInterpolate(p0, p1, p2, p3, yf); + p = clamp(p + 0.5f, 0.f, 255.f); + //ALOGI("CUC,%f,%u", p, (uchar)p); + return (uchar)p; +} + +extern "C" uint64_t rsdIntrinsicResize_oscctl_K(uint32_t xinc); + +extern "C" void rsdIntrinsicResizeB4_K( + uchar4 *dst, + size_t count, + uint32_t xf, + uint32_t xinc, + uchar4 const *srcn, + uchar4 const *src0, + uchar4 const *src1, + uchar4 const *src2, + size_t xclip, + size_t avail, + uint64_t osc_ctl, + int32_t const 
*yr); + +extern "C" void rsdIntrinsicResizeB2_K( + uchar2 *dst, + size_t count, + uint32_t xf, + uint32_t xinc, + uchar2 const *srcn, + uchar2 const *src0, + uchar2 const *src1, + uchar2 const *src2, + size_t xclip, + size_t avail, + uint64_t osc_ctl, + int32_t const *yr); + +extern "C" void rsdIntrinsicResizeB1_K( + uchar *dst, + size_t count, + uint32_t xf, + uint32_t xinc, + uchar const *srcn, + uchar const *src0, + uchar const *src1, + uchar const *src2, + size_t xclip, + size_t avail, + uint64_t osc_ctl, + int32_t const *yr); + +#if defined(ARCH_ARM_USE_INTRINSICS) +static void mkYCoeff(int32_t *yr, float yf) { + int32_t yf1 = rint(yf * 0x10000); + int32_t yf2 = rint(yf * yf * 0x10000); + int32_t yf3 = rint(yf * yf * yf * 0x10000); + + yr[0] = -(2 * yf2 - yf3 - yf1) >> 1; + yr[1] = (3 * yf3 - 5 * yf2 + 0x20000) >> 1; + yr[2] = (-3 * yf3 + 4 * yf2 + yf1) >> 1; + yr[3] = -(yf3 - yf2) >> 1; +} +#endif + +#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT +static float4 OneBiCubic(const float4 *yp0, const float4 *yp1, const float4 *yp2, const float4 *yp3, + float xf, float yf, int width) { + int startx = (int) floor(xf - 1); + xf = xf - floor(xf); + int maxx = width - 1; + int xs0 = std::max(0, startx + 0); + int xs1 = std::max(0, startx + 1); + int xs2 = std::min(maxx, startx + 2); + int xs3 = std::min(maxx, startx + 3); + + float4 p0 = cubicInterpolate(yp0[xs0], yp0[xs1], + yp0[xs2], yp0[xs3], xf); + float4 p1 = cubicInterpolate(yp1[xs0], yp1[xs1], + yp1[xs2], yp1[xs3], xf); + float4 p2 = cubicInterpolate(yp2[xs0], yp2[xs1], + yp2[xs2], yp2[xs3], xf); + float4 p3 = cubicInterpolate(yp3[xs0], yp3[xs1], + yp3[xs2], yp3[xs3], xf); + + float4 p = cubicInterpolate(p0, p1, p2, p3, yf); + return p; +} + +static float2 OneBiCubic(const float2 *yp0, const float2 *yp1, const float2 *yp2, const float2 *yp3, + float xf, float yf, int width) { + int startx = (int) floor(xf - 1); + xf = xf - floor(xf); + int maxx = width - 1; + int xs0 = std::max(0, startx + 0); + int xs1 = std::max(0, startx + 1); + int xs2 = std::min(maxx, startx + 2); + int xs3 = std::min(maxx, startx + 3); + + float2 p0 = cubicInterpolate(yp0[xs0], yp0[xs1], + yp0[xs2], yp0[xs3], xf); + float2 p1 = cubicInterpolate(yp1[xs0], yp1[xs1], + yp1[xs2], yp1[xs3], xf); + float2 p2 = cubicInterpolate(yp2[xs0], yp2[xs1], + yp2[xs2], yp2[xs3], xf); + float2 p3 = cubicInterpolate(yp3[xs0], yp3[xs1], + yp3[xs2], yp3[xs3], xf); + + float2 p = cubicInterpolate(p0, p1, p2, p3, yf); + return p; +} + +static float OneBiCubic(const float *yp0, const float *yp1, const float *yp2, const float *yp3, + float xf, float yf, int width) { + int startx = (int) floor(xf - 1); + xf = xf - floor(xf); + int maxx = width - 1; + int xs0 = std::max(0, startx + 0); + int xs1 = std::max(0, startx + 1); + int xs2 = std::min(maxx, startx + 2); + int xs3 = std::min(maxx, startx + 3); + + float p0 = cubicInterpolate(yp0[xs0], yp0[xs1], + yp0[xs2], yp0[xs3], xf); + float p1 = cubicInterpolate(yp1[xs0], yp1[xs1], + yp1[xs2], yp1[xs3], xf); + float p2 = cubicInterpolate(yp2[xs0], yp2[xs1], + yp2[xs2], yp2[xs3], xf); + float p3 = cubicInterpolate(yp3[xs0], yp3[xs1], + yp3[xs2], yp3[xs3], xf); + + float p = cubicInterpolate(p0, p1, p2, p3, yf); + return p; +} +#endif + +void ResizeTask::kernelU4(uchar *outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) { + const uchar *pin = mIn; + const int srcHeight = mInputSizeY; + const int srcWidth = mInputSizeX; + const size_t stride = mInputSizeX * paddedSize(mVectorSize); + + +#if defined(ARCH_X86_HAVE_AVX2) + float yf = 
_mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(currentY + 0.5f), + _mm_set1_ps(scaleY), _mm_set1_ps(0.5f))); +#else + float yf = (currentY + 0.5f) * mScaleY - 0.5f; +#endif + + + int starty = (int) floor(yf - 1); + yf = yf - floor(yf); + int maxy = srcHeight - 1; + int ys0 = std::max(0, starty + 0); + int ys1 = std::max(0, starty + 1); + int ys2 = std::min(maxy, starty + 2); + int ys3 = std::min(maxy, starty + 3); + + const uchar4 *yp0 = (const uchar4 *)(pin + stride * ys0); + const uchar4 *yp1 = (const uchar4 *)(pin + stride * ys1); + const uchar4 *yp2 = (const uchar4 *)(pin + stride * ys2); + const uchar4 *yp3 = (const uchar4 *)(pin + stride * ys3); + + uchar4 *out = ((uchar4 *)outPtr); + uint32_t x1 = xstart; + uint32_t x2 = xend; + +#if defined(ARCH_ARM_USE_INTRINSICS) + if (mUsesSimd && x2 > x1 && mScaleX < 4.0f) { + float xf = (x1 + 0.5f) * mScaleX - 0.5f; + long xf16 = rint(xf * 0x10000); + uint32_t xinc16 = rint(mScaleX * 0x10000); + + int xoff = (xf16 >> 16) - 1; + int xclip = std::max(0, xoff) - xoff; + int len = x2 - x1; + + int32_t yr[4]; + uint64_t osc_ctl = rsdIntrinsicResize_oscctl_K(xinc16); + mkYCoeff(yr, yf); + + xoff += xclip; + + rsdIntrinsicResizeB4_K( + out, len, + xf16 & 0xffff, xinc16, + yp0 + xoff, yp1 + xoff, yp2 + xoff, yp3 + xoff, + xclip, srcWidth - xoff + xclip, + osc_ctl, yr); + out += len; + x1 += len; + } +#endif + + while(x1 < x2) { +#if defined(ARCH_X86_HAVE_AVX2) + float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(scaleX) , + _mm_set1_ps(0.5f))); +#else + float xf = (x1 + 0.5f) * mScaleX - 0.5f; +#endif + *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth); + out++; + x1++; + } +} + +void ResizeTask::kernelU2(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) { + const uchar *pin = mIn; + const int srcHeight = mInputSizeY; + const int srcWidth = mInputSizeX; + const size_t stride = mInputSizeX * mVectorSize; + + +#if defined(ARCH_X86_HAVE_AVX2) + float yf = _mm_cvtss_f32( + _mm_fmsub_ss(_mm_set1_ps(currentY + 0.5f), _mm_set1_ps(scaleY), _mm_set1_ps(0.5f))); +#else + float yf = (currentY + 0.5f) * mScaleY - 0.5f; +#endif + + int starty = (int) floor(yf - 1); + yf = yf - floor(yf); + int maxy = srcHeight - 1; + int ys0 = std::max(0, starty + 0); + int ys1 = std::max(0, starty + 1); + int ys2 = std::min(maxy, starty + 2); + int ys3 = std::min(maxy, starty + 3); + + const uchar2 *yp0 = (const uchar2 *)(pin + stride * ys0); + const uchar2 *yp1 = (const uchar2 *)(pin + stride * ys1); + const uchar2 *yp2 = (const uchar2 *)(pin + stride * ys2); + const uchar2 *yp3 = (const uchar2 *)(pin + stride * ys3); + + uchar2 *out = ((uchar2 *)outPtr); + uint32_t x1 = xstart; + uint32_t x2 = xend; + +#if defined(ARCH_ARM_USE_INTRINSICS) + if (mUsesSimd && x2 > x1 && mScaleX < 4.0f) { + float xf = (x1 + 0.5f) * mScaleX - 0.5f; + long xf16 = rint(xf * 0x10000); + uint32_t xinc16 = rint(mScaleX * 0x10000); + + int xoff = (xf16 >> 16) - 1; + int xclip = std::max(0, xoff) - xoff; + int len = x2 - x1; + + int32_t yr[4]; + uint64_t osc_ctl = rsdIntrinsicResize_oscctl_K(xinc16); + mkYCoeff(yr, yf); + + xoff += xclip; + + rsdIntrinsicResizeB2_K( + out, len, + xf16 & 0xffff, xinc16, + yp0 + xoff, yp1 + xoff, yp2 + xoff, yp3 + xoff, + xclip, srcWidth - xoff + xclip, + osc_ctl, yr); + out += len; + x1 += len; + } +#endif + + while(x1 < x2) { + +#if defined(ARCH_X86_HAVE_AVX2) + float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(scaleX) , + _mm_set1_ps(0.5f))); +#else + float xf = (x1 + 0.5f) * mScaleX - 0.5f; +#endif + *out = 
OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth); + out++; + x1++; + } +} + +void ResizeTask::kernelU1(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) { + //ALOGI("TK kernelU1 xstart %u, xend %u, outstep %u", xstart, xend); + const uchar *pin = mIn; + const int srcHeight = mInputSizeY; + const int srcWidth = mInputSizeX; + const size_t stride = mInputSizeX * mVectorSize; + + // ALOGI("Toolkit ResizeU1 (%ux%u) by (%f,%f), xstart:%u to %u, stride %zu, out %p", srcWidth, + // srcHeight, scaleX, scaleY, xstart, xend, stride, outPtr); + +#if defined(ARCH_X86_HAVE_AVX2) + float yf = _mm_cvtss_f32( + _mm_fmsub_ss(_mm_set1_ps(currentY + 0.5f), _mm_set1_ps(scaleY), _mm_set1_ps(0.5f))); +#else + float yf = (currentY + 0.5f) * mScaleY - 0.5f; +#endif + + int starty = (int) floor(yf - 1); + yf = yf - floor(yf); + int maxy = srcHeight - 1; + int ys0 = std::max(0, starty + 0); + int ys1 = std::min(maxy, std::max(0, starty + 1)); + int ys2 = std::min(maxy, starty + 2); + int ys3 = std::min(maxy, starty + 3); + + const uchar *yp0 = pin + stride * ys0; + const uchar *yp1 = pin + stride * ys1; + const uchar *yp2 = pin + stride * ys2; + const uchar *yp3 = pin + stride * ys3; + + uchar *out = ((uchar *)outPtr); + uint32_t x1 = xstart; + uint32_t x2 = xend; + +#if defined(ARCH_ARM_USE_INTRINSICS) + if (mUsesSimd && x2 > x1 && mScaleX < 4.0f) { + float xf = (x1 + 0.5f) * mScaleX - 0.5f; + long xf16 = rint(xf * 0x10000); + uint32_t xinc16 = rint(mScaleX * 0x10000); + + int xoff = (xf16 >> 16) - 1; + int xclip = std::max(0, xoff) - xoff; + int len = x2 - x1; + + int32_t yr[4]; + uint64_t osc_ctl = rsdIntrinsicResize_oscctl_K(xinc16); + mkYCoeff(yr, yf); + + // ALOGI("ys0 %d, ys1 %d, ys2 %d, ys3 %d, x1 %u, x2 %u, xf %f, xf16 %ld, xinc16 %u, xoff %d, + // xclip %d, len %d, osc_ctl %lu)", + // ys0, ys1, ys2, ys3, x1, x2, xf, xf16, xinc16, xoff, xclip, len, (unsigned long) + // osc_ctl); + // ALOGI("TK scaleX %f, xf %f, xf16 %ld, xinc16 %d, xoff %d, xclip %d, len %d", scaleX, xf, + // xf16, xinc16, xoff, xclip, len); ALOGI("TK xf16 & 0xffff %ld, ys0 %u, ys1 %u, ys2 %u, ys3 + // %u, srcWidth - xoff + xclip %d", xf16 & 0xffff, ys0, ys1, ys2, ys3, srcWidth - xoff); + + xoff += xclip; + + rsdIntrinsicResizeB1_K( + out, len, + xf16 & 0xffff, xinc16, + yp0 + xoff, yp1 + xoff, yp2 + xoff, yp3 + xoff, + xclip, srcWidth - xoff + xclip, + osc_ctl, yr); + out += len; + x1 += len; + } +#endif + + while(x1 < x2) { + +#if defined(ARCH_X86_HAVE_AVX2) + float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(scaleX) , + _mm_set1_ps(0.5f))); +#else + float xf = (x1 + 0.5f) * mScaleX - 0.5f; +#endif + + *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth); + out++; + x1++; + } +} + +#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT +void ResizeTask::kernelF4(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) { + const uchar *pin = mIn; + const int srcHeight = inputSizeY; + const int srcWidth = inputSizeX; + const size_t stride = sizeX * vectorSize; + +#if defined(ARCH_X86_HAVE_AVX2) + float yf = _mm_cvtss_f32( + _mm_fmsub_ss(_mm_set1_ps(currentY + 0.5f), _mm_set1_ps(scaleY), _mm_set1_ps(0.5f))); +#else + float yf = (currentY + 0.5f) * scaleY - 0.5f; +#endif + + int starty = (int) floor(yf - 1); + yf = yf - floor(yf); + int maxy = srcHeight - 1; + int ys0 = std::max(0, starty + 0); + int ys1 = std::max(0, starty + 1); + int ys2 = std::min(maxy, starty + 2); + int ys3 = std::min(maxy, starty + 3); + + const float4 *yp0 = (const float4 *)(pin + stride * ys0); + const float4 *yp1 = 
(const float4 *)(pin + stride * ys1); + const float4 *yp2 = (const float4 *)(pin + stride * ys2); + const float4 *yp3 = (const float4 *)(pin + stride * ys3); + + float4 *out = ((float4 *)outPtr); + uint32_t x1 = xstart; + uint32_t x2 = xend; + + while(x1 < x2) { + +#if defined(ARCH_X86_HAVE_AVX2) + float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(scaleX) , + _mm_set1_ps(0.5f))); +#else + float xf = (x1 + 0.5f) * scaleX - 0.5f; +#endif + + *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth); + out++; + x1++; + } +} + +void ResizeTask::kernelF2(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) { + const uchar *pin = mIn; + const int srcHeight = inputSizeY; + const int srcWidth = inputSizeX; + const size_t stride = sizeX * vectorSize; + + +#if defined(ARCH_X86_HAVE_AVX2) + float yf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(currentY + 0.5f), + _mm_set1_ps(scaleY), _mm_set1_ps(0.5f))); +#else + float yf = (currentY + 0.5f) * scaleY - 0.5f; +#endif + + int starty = (int) floor(yf - 1); + yf = yf - floor(yf); + int maxy = srcHeight - 1; + int ys0 = std::max(0, starty + 0); + int ys1 = std::max(0, starty + 1); + int ys2 = std::min(maxy, starty + 2); + int ys3 = std::min(maxy, starty + 3); + + const float2 *yp0 = (const float2 *)(pin + stride * ys0); + const float2 *yp1 = (const float2 *)(pin + stride * ys1); + const float2 *yp2 = (const float2 *)(pin + stride * ys2); + const float2 *yp3 = (const float2 *)(pin + stride * ys3); + + float2 *out = ((float2 *)outPtr); + uint32_t x1 = xstart; + uint32_t x2 = xend; + + while(x1 < x2) { + +#if defined(ARCH_X86_HAVE_AVX2) + float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(scaleX) , + _mm_set1_ps(0.5f))); +#else + float xf = (x1 + 0.5f) * scaleX - 0.5f; +#endif + + *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth); + out++; + x1++; + } +} + +void ResizeTask::kernelF1(uchar* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) { + const uchar *pin = mIn; + const int srcHeight = inputSizeY; + const int srcWidth = inputSizeX; + const size_t stride = sizeX * vectorSize; + + +#if defined(ARCH_X86_HAVE_AVX2) + float yf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(currentY + 0.5f), + _mm_set1_ps(scaleY), _mm_set1_ps(0.5f))); +#else + float yf = (currentY + 0.5f) * scaleY - 0.5f; +#endif + + int starty = (int) floor(yf - 1); + yf = yf - floor(yf); + int maxy = srcHeight - 1; + int ys0 = std::max(0, starty + 0); + int ys1 = std::max(0, starty + 1); + int ys2 = std::min(maxy, starty + 2); + int ys3 = std::min(maxy, starty + 3); + + const float *yp0 = (const float *)(pin + stride * ys0); + const float *yp1 = (const float *)(pin + stride * ys1); + const float *yp2 = (const float *)(pin + stride * ys2); + const float *yp3 = (const float *)(pin + stride * ys3); + + float *out = ((float *)outPtr); + uint32_t x1 = xstart; + uint32_t x2 = xend; + + while(x1 < x2) { + +#if defined(ARCH_X86_HAVE_AVX2) + float xf = _mm_cvtss_f32(_mm_fmsub_ss(_mm_set1_ps(x1 + 0.5f) , _mm_set1_ps(scaleX) , + _mm_set1_ps(0.5f))); +#else + float xf = (x1 + 0.5f) * scaleX - 0.5f; +#endif + + *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth); + out++; + x1++; + } +} + +void ResizeTask::preLaunch(uint32_t slot, const RsScriptCall *sc) +{ + + //check the data type to determine F or U. 
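    // A minimal standalone sketch of the sampling scheme shared by the kernelU* and
    // kernelF* functions above (names here are illustrative, not toolkit members):
    // map the output pixel centre into source space, clamp the four taps to the
    // image edge, and feed the fractional offset to cubicInterpolate().
    //
    //   float xf     = (outX + 0.5f) * scaleX - 0.5f;    // centre-to-centre mapping
    //   int   startx = (int) floor(xf - 1);              // leftmost of the four taps
    //   float t      = xf - floor(xf);                   // fraction between taps 1 and 2
    //   int   maxx   = width - 1;
    //   int   x0 = std::max(0, startx + 0), x1 = std::max(0, startx + 1);
    //   int   x2 = std::min(maxx, startx + 2), x3 = std::min(maxx, startx + 3);
    //   float value  = cubicInterpolate(row[x0], row[x1], row[x2], row[x3], t);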
+ if (mAlloc->getType()->getElement()->getType() == RS_TYPE_UNSIGNED_8) { + switch(mAlloc->getType()->getElement()->getVectorSize()) { + case 1: + mRootPtr = &kernelU1; + break; + case 2: + mRootPtr = &kernelU2; + break; + case 3: + case 4: + mRootPtr = &kernelU4; + break; + } + } else { + switch(mAlloc->getType()->getElement()->getVectorSize()) { + case 1: + mRootPtr = &kernelF1; + break; + case 2: + mRootPtr = &kernelF2; + break; + case 3: + case 4: + mRootPtr = &kernelF4; + break; + } + } +} +#endif // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT + +void RenderScriptToolkit::resize(const uint8_t* input, uint8_t* output, size_t inputSizeX, + size_t inputSizeY, size_t vectorSize, size_t outputSizeX, + size_t outputSizeY, const Restriction* restriction) { +#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE + if (!validRestriction(LOG_TAG, outputSizeX, outputSizeY, restriction)) { + return; + } + if (vectorSize < 1 || vectorSize > 4) { + ALOGE("The vectorSize should be between 1 and 4. %zu provided.", vectorSize); + return; + } +#endif + + ResizeTask task((const uchar*)input, (uchar*)output, inputSizeX, inputSizeY, vectorSize, + outputSizeX, outputSizeY, restriction); + processor->doTask(&task); +} + +} // namespace renderscript diff --git a/renderscript-toolkit/src/main/cpp/Resize_advsimd.S b/renderscript-toolkit/src/main/cpp/Resize_advsimd.S new file mode 100644 index 0000000..59e735c --- /dev/null +++ b/renderscript-toolkit/src/main/cpp/Resize_advsimd.S @@ -0,0 +1,754 @@ +/* + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f: +#define END(f) .size f, .-f; + +/* Fixed-point precision after vertical pass -- 16 bit data minus 1 sign and 1 + * integer (bicubic has a little overshoot). It would also be possible to add + * a temporary DC bias to eliminate the sign bit for more precision, but that's + * extra arithmetic. + */ +.set VERTBITS, 14 + +/* The size of the scratch buffer in which we store our vertically convolved + * intermediates. + */ +.set CHUNKSHIFT, 7 /* 5 tests better for uchar4, but 7 is necessary for ridiculous (10:1) scale factors */ +.set CHUNKSIZE, (1 << CHUNKSHIFT) + +/* The number of components processed in a single iteration of the innermost + * loop. + */ +.set VECSHIFT, 3 +.set VECSIZE, (1<<VECSHIFT) + +/* Read four different lines (except at edges where addresses may be clamped, + * which is why we don't simply take base and stride registers), and multiply + * and accumulate them by the coefficients in v3[0..3], leaving the results in + * v12. This gives eight 16-bit results representing a horizontal line of 2-8 + * input pixels (depending on number of components per pixel) to be fed into + * the horizontal scaling pass. + * + * Input coefficients are 16-bit unsigned fixed-point (although [0] and [3] are + * known to represent negative values and VMLS is used to implement this). 
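 *
 * A hypothetical C model of one intermediate element produced here, assuming
 * p0..p3 are the four vertically adjacent source bytes and c0..c3 are the
 * 16-bit weights passed in via yr (c0 and c3 hold the magnitudes of negative
 * weights, which is what the multiply-subtracts restore):
 *
 *   int32_t acc = -p0 * c0 + p1 * c1 + p2 * c2 - p3 * c3;
 *   int16_t v   = saturate_s16(acc >> (8 + 16 - VERTBITS));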
+ * Output is VERTBITS signed fixed-point, which must leave room for a little + * v12. This gives eight 16-bit results. + */ +.macro vert8, dstlo=v12.4h, dsthi=v12.8h + ld1 {v8.8b}, [x4], #8 + ld1 {v9.8b}, [x5], #8 + ld1 {v10.8b}, [x6], #8 + ld1 {v11.8b}, [x7], #8 + uxtl v8.8h, v8.8b + uxtl v9.8h, v9.8b + uxtl v10.8h, v10.8b + uxtl v11.8h, v11.8b + umull v12.4s, v9.4h, v3.h[1] + umull2 v13.4s, v9.8h, v3.h[1] + umlsl v12.4s, v8.4h, v3.h[0] + umlsl2 v13.4s, v8.8h, v3.h[0] + umlal v12.4s, v10.4h, v3.h[2] + umlal2 v13.4s, v10.8h, v3.h[2] + umlsl v12.4s, v11.4h, v3.h[3] + umlsl2 v13.4s, v11.8h, v3.h[3] + + /* Shift by 8 (bits per pixel), plus 16 (the fixed-point multiplies), + * minus VERTBITS (the number of fraction bits we want to keep from + * here on). + */ + sqshrn \dstlo, v12.4s, #8 + (16 - VERTBITS) + sqshrn2 \dsthi, v13.4s, #8 + (16 - VERTBITS) +.endm + +/* As above, but only four 16-bit results into v12hi. + */ +.macro vert4, dst=v12.8h + ld1 {v8.s}[0], [x4], #4 + ld1 {v9.s}[0], [x5], #4 + ld1 {v10.s}[0], [x6], #4 + ld1 {v11.s}[0], [x7], #4 + uxtl v8.8h, v8.8b + uxtl v9.8h, v9.8b + uxtl v10.8h, v10.8b + uxtl v11.8h, v11.8b + umull v12.4s, v9.4h, v3.h[1] + umlsl v12.4s, v8.4h, v3.h[0] + umlal v12.4s, v10.4h, v3.h[2] + umlsl v12.4s, v11.4h, v3.h[3] +.ifc \dst,v12.8h + sqshrn2 \dst, v12.4s, #8 + (16 - VERTBITS) +.else + sqshrn \dst, v12.4s, #8 + (16 - VERTBITS) +.endif +.endm + + +/* During horizontal resize having CHUNKSIZE input available means being able + * to produce a varying amount of output, depending on the phase of the data. + * This function calculates the minimum number of VECSIZE chunks extracted from + * a CHUNKSIZE window (x1), and the threshold value for when the count will be + * one higher than that (x0). + * These work out, conveniently, to be the quotient and remainder from: + * (CHUNKSIZE + xinc * VECSIZE - 1) / (xinc * VECSIZE) + * + * The two values are packed together in a uint64_t for convenience; and + * they are, in fact, used this way as an arithmetic short-cut later on. + */ +/* uint64_t rsdIntrinsicResize_oscctl_K(uint32_t xinc) */ +ENTRY(rsdIntrinsicResize_oscctl_K) + lsl x2, x0, #VECSHIFT + mov x0, #(CHUNKSIZE << 16) - 1 + add x0, x0, x2 + udiv x1, x0, x2 + msub x0, x1, x2, x0 + add x0, x0, x1, LSL #32 + ret +END(rsdIntrinsicResize_oscctl_K) + +/* Iterate to generate the uchar1, uchar2, and uchar4 versions of the code. + * For the most part the vertical pass (the outer loop) is the same for all + * versions. Exceptions are handled in-line with conditional assembly. 
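 *
 * For reference, a hypothetical C model of rsdIntrinsicResize_oscctl_K above,
 * read off the instruction sequence rather than taken from the original source,
 * treating the .set constants as C constants; the remainder lands in the low
 * 32 bits and the quotient in the high 32 bits:
 *
 *   uint64_t oscctl_model(uint32_t xinc) {
 *       uint64_t step = (uint64_t) xinc << VECSHIFT;             // xinc * VECSIZE
 *       uint64_t num  = ((uint64_t) CHUNKSIZE << 16) - 1 + step;
 *       uint64_t quot = num / step;
 *       uint64_t rem  = num - quot * step;
 *       return rem + (quot << 32);
 *   }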
+ */ +.irp comp, 1, 2, 4 +.if \comp == 1 +.set COMPONENT_SHIFT, 0 +.elseif \comp == 2 +.set COMPONENT_SHIFT, 1 +.elseif \comp == 4 +.set COMPONENT_SHIFT, 2 +.else +.error "Unknown component count" +.endif +.set COMPONENT_COUNT, (1 << COMPONENT_SHIFT) +.set LOOP_OUTPUT_SIZE, (VECSIZE * COMPONENT_COUNT) + +.set BUFFER_SIZE, (CHUNKSIZE * 2 + 4) * COMPONENT_COUNT * 2 + +/* void rsdIntrinsicResizeB1_K( + * uint8_t * restrict dst, // x0 + * size_t count, // x1 + * uint32_t xf, // x2 + * uint32_t xinc, // x3 + * uint8_t const * restrict srcn, // x4 + * uint8_t const * restrict src0, // x5 + * uint8_t const * restrict src1, // x6 + * uint8_t const * restrict src2, // x7 + * size_t xclip, // [sp,#0] -> [sp,#80] -> x12 + * size_t avail, // [sp,#8] -> [sp,#88] -> x11 + * uint64_t osc_ctl, // [sp,#16] -> [sp,#96] -> x10 + * int32 const *yr, // [sp,#24] -> [sp,#104] -> v4 (copied to v3 for scalar access) + */ +ENTRY(rsdIntrinsicResizeB\comp\()_K) + sub x8, sp, #48 + sub sp, sp, #80 + st1 {v8.1d - v11.1d}, [sp] + st1 {v12.1d - v15.1d}, [x8] + str x19, [x8, #32] + + /* align the working buffer on the stack to make it easy to use bit + * twiddling for address calculations. + */ + sub x12, sp, #BUFFER_SIZE + bic x12, x12, #(1 << (CHUNKSHIFT + 1 + COMPONENT_SHIFT + 1)) - 1 + + ldr x8, [sp,#104] // yr + adrp x9, intrinsic_resize_consts + add x9, x9, :lo12:intrinsic_resize_consts + ld1 {v4.4s}, [x8] + ld1 {v5.8h}, [x9] + sqxtun v4.4h, v4.4s // yr + dup v6.8h, w2 + dup v7.8h, w3 + mla v6.8h, v5.8h, v7.8h // vxf + shl v7.8h, v7.8h, #VECSHIFT // vxinc + + /* Compute starting condition for oscillator used to compute ahead + * of time how many iterations are possible before needing to + * refill the working buffer. This is based on the fixed-point + * index of the last element in the vector of pixels processed in + * each iteration, counting up until it would overflow. + */ + sub x8, x2, x3 + lsl x9, x3, #VECSHIFT + add x8, x8, x9 + + ldr x10, [sp,#96] // osc_ctl + ldp x13,x11, [sp,#80] // xclip, avail + + mov x19, sp + mov sp, x12 + + /* x4-x7 contain pointers to the four lines of input to be + * convolved. These pointers have been clamped vertically and + * horizontally (which is why it's not a simple row/stride pair), + * and the xclip argument (now in x13) indicates how many pixels + * from true the x position of the pointer is. This value should + * be 0, 1, or 2 only. + * + * Start by placing four pixels worth of input at the far end of + * the buffer. As many as two of these may be clipped, so four + * pixels are fetched, and then the first pixel is duplicated and + * the data shifted according to xclip. The source pointers are + * then also adjusted according to xclip so that subsequent fetches + * match. + */ + mov v3.8b, v4.8b /* make y coeffs available for vert4 and vert8 macros */ + sub x14, x12, x13, LSL #(COMPONENT_SHIFT + 1) + add x15, x12, #(2 * CHUNKSIZE - 4) * COMPONENT_COUNT * 2 + add x14, x14, #4 * COMPONENT_COUNT * 2 +.if \comp == 1 + vert4 v12.4h + dup v11.4h, v12.h[0] + st1 {v11.4h,v12.4h}, [x12] + ld1 {v12.4h}, [x14] + st1 {v12.4h}, [x15] +.elseif \comp == 2 + vert8 + dup v11.4s, v12.s[0] + st1 {v11.8h,v12.8h}, [x12] + ld1 {v12.8h}, [x14] + st1 {v12.8h}, [x15] +.elseif \comp == 4 + vert8 v14.4h, v14.8h + vert8 v15.4h, v15.8h + dup v12.2d, v14.d[0] + dup v13.2d, v14.d[0] + st1 {v12.8h,v13.8h}, [x12], #32 + st1 {v14.8h,v15.8h}, [x12] + sub x12, x12, #32 + ld1 {v11.8h,v12.8h}, [x14] + st1 {v11.8h,v12.8h}, [x15] +.endif + /* Count off four pixels into the working buffer. 
+ */ + sub x11, x11, #4 + /* Incoming pointers were to the first _legal_ pixel. Four pixels + * were read unconditionally, but some may have been discarded by + * xclip, so we rewind the pointers to compensate. + */ + sub x4, x4, x13, LSL #(COMPONENT_SHIFT) + sub x5, x5, x13, LSL #(COMPONENT_SHIFT) + sub x6, x6, x13, LSL #(COMPONENT_SHIFT) + sub x7, x7, x13, LSL #(COMPONENT_SHIFT) + + /* First tap starts where we just pre-filled, at the end of the + * buffer. + */ + add x2, x2, #(CHUNKSIZE * 2 - 4) << 16 + + /* Use overflowing arithmetic to implement wraparound array + * indexing. + */ + lsl x2, x2, #(47 - CHUNKSHIFT) + lsl x3, x3, #(47 - CHUNKSHIFT) + + + /* Start of outermost loop. + * Fetch CHUNKSIZE pixels into scratch buffer, then calculate the + * number of iterations of the inner loop that can be performed and + * get into that. + * + * The fill is complicated by the possibility of running out of + * input before the scratch buffer is filled. If this isn't a risk + * then it's handled by the simple loop at 2:, otherwise the + * horrible loop at 3:. + */ +1: mov v3.8b, v4.8b /* put y scaling coefficients somewhere handy */ + subs x11, x11, #CHUNKSIZE + bge 2f /* if at least CHUNKSIZE are available... */ + add x11, x11, #CHUNKSIZE /* if they're not... */ + b 4f + /* basic fill loop, processing 8 bytes at a time until there are + * fewer than eight bytes available. + */ +3: vert8 + sub x11, x11, #8 / COMPONENT_COUNT + st1 {v12.8h}, [x12], #16 +4: cmp x11, #8 / COMPONENT_COUNT - 1 + bgt 3b +.if \comp == 4 + blt 3f + /* The last pixel (four bytes) if necessary */ + vert4 +.else + cmp x11, #1 + blt 3f + /* The last pixels if necessary */ + sub x4, x4, #8 + sub x5, x5, #8 + sub x6, x6, #8 + sub x7, x7, #8 + add x4, x4, x11, LSL #(COMPONENT_SHIFT) + add x5, x5, x11, LSL #(COMPONENT_SHIFT) + add x6, x6, x11, LSL #(COMPONENT_SHIFT) + add x7, x7, x11, LSL #(COMPONENT_SHIFT) + vert8 + sub x11, sp, x11, LSL #(COMPONENT_SHIFT + 1) + sub sp, sp, #32 + sub x11, x11, #16 +.if \comp == 1 + dup v13.8h, v12.h[7] +.elseif \comp == 2 + dup v13.4s, v12.s[3] +.endif + st1 {v12.8h,v13.8h}, [sp] + ld1 {v12.8h}, [x11] + add sp, sp, #32 + b 4f +.endif + /* Keep filling until we get to the end of this chunk of the buffer */ +3: +.if \comp == 1 + dup v12.8h, v12.h[7] +.elseif \comp == 2 + dup v12.4s, v12.s[3] +.elseif \comp == 4 + dup v12.2d, v12.d[1] +.endif +4: st1 {v12.8h}, [x12], #16 + tst x12, #(CHUNKSIZE - 1) * COMPONENT_COUNT * 2 + bne 3b + b 4f + +.align 4 +2: /* Quickly pull a chunk of data into the working buffer. + */ + vert8 + st1 {v12.8h}, [x12], #16 + vert8 + st1 {v12.8h}, [x12], #16 + tst x12, #(CHUNKSIZE - 1) * COMPONENT_COUNT * 2 + bne 2b + cmp x11, #0 + bne 3f +4: /* if we end with 0 pixels left we'll have nothing handy to spread + * across to the right, so we rewind a bit. 
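 *
 * A hypothetical C model of the "overflowing arithmetic" wraparound indexing
 * set up earlier in this function (names are illustrative; with CHUNKSHIFT = 7
 * the index wraps modulo 2 * CHUNKSIZE):
 *
 *   uint64_t osc  = (uint64_t) xf16   << (47 - CHUNKSHIFT);  // xf16/xinc16: 16.16 fixed point
 *   uint64_t step = (uint64_t) xinc16 << (47 - CHUNKSHIFT);
 *   size_t   idx  = osc >> (63 - CHUNKSHIFT);  // integer part, already mod 2*CHUNKSIZE
 *   osc += step;                               // 64-bit overflow is the wraparound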
+ */ + mov x11, #1 + sub x4, x4, #COMPONENT_COUNT + sub x5, x5, #COMPONENT_COUNT + sub x6, x6, #COMPONENT_COUNT + sub x7, x7, #COMPONENT_COUNT +3: /* copy four taps (width of cubic window) to far end for overflow + * address handling + */ + sub x13, x12, #CHUNKSIZE * COMPONENT_COUNT * 2 + eor x12, x13, #CHUNKSIZE * COMPONENT_COUNT * 2 +.if \comp == 1 + ld1 {v14.4h}, [x13] +.elseif \comp == 2 + ld1 {v14.8h}, [x13] +.elseif \comp == 4 + ld1 {v14.8h,v15.8h}, [x13] +.endif + add x13, x12, #CHUNKSIZE * COMPONENT_COUNT * 2 +.if \comp == 1 + st1 {v14.4h}, [x13] +.elseif \comp == 2 + st1 {v14.8h}, [x13] +.elseif \comp == 4 + st1 {v14.8h,v15.8h}, [x13] +.endif + /* The high 32-bits of x10 contains the maximum possible iteration + * count, but if x8 is greater than the low 32-bits of x10 then + * this indicates that the count must be reduced by one for this + * iteration to avoid reading past the end of the available data. + */ + sub x13, x10, x8 + lsr x13, x13, #32 + + madd x8, x13, x9, x8 + sub x8, x8, #(CHUNKSIZE << 16) + + /* prefer to count pixels, rather than vectors, to clarify the tail + * store case on exit. + */ + lsl x13, x13, #VECSHIFT + cmp x13, x1 + csel x13, x1, x13, gt + + sub x1, x1, x13 + + lsl x13, x13, #COMPONENT_SHIFT + + mov w14, #0x8000 + movi v30.8h, #3 + dup v31.8h, w14 + + cmp x13, #0 + bgt 3f + cmp x1, #0 + bgt 1b /* an extreme case where we shouldn't use code in this structure */ + b 9f + + .align 4 +2: /* Inner loop continues here, but starts at 3:, see end of loop + * below for explanation. */ +.if LOOP_OUTPUT_SIZE == 4 + st1 {v8.s}[0], [x0], #4 +.elseif LOOP_OUTPUT_SIZE == 8 + st1 {v8.8b}, [x0], #8 +.elseif LOOP_OUTPUT_SIZE == 16 + st1 {v8.16b}, [x0], #16 +.elseif LOOP_OUTPUT_SIZE == 32 + st1 {v8.16b,v9.16b}, [x0], #32 +.endif + /* Inner loop: here the four x coefficients for each tap are + * calculated in vector code, and the addresses are calculated in + * scalar code, and these calculations are interleaved. + */ +3: ushr v8.8h, v6.8h, #1 // sxf + lsr x14, x2, #(63 - CHUNKSHIFT) + sqrdmulh v9.8h, v8.8h, v8.8h // sxf**2 + add x2, x2, x3 + sqrdmulh v10.8h, v9.8h, v8.8h // sxf**3 + lsr x15, x2, #(63 - CHUNKSHIFT) + sshll v11.4s, v9.4h, #2 + sshll2 v12.4s, v9.8h, #2 + add x2, x2, x3 + smlsl v11.4s, v10.4h, v30.4h + smlsl2 v12.4s, v10.8h, v30.8h + lsr x16, x2, #(63 - CHUNKSHIFT) + + shadd v0.8h, v10.8h, v8.8h + add x2, x2, x3 + sub v0.8h, v9.8h, v0.8h + lsr x17, x2, #(63 - CHUNKSHIFT) + + saddw v1.4s, v11.4s, v9.4h + saddw2 v13.4s, v12.4s, v9.8h + add x2, x2, x3 + shrn v1.4h, v1.4s, #1 + shrn2 v1.8h, v13.4s, #1 + add x14, sp, x14, LSL #(COMPONENT_SHIFT + 1) + sub v1.8h, v1.8h, v31.8h + add x15, sp, x15, LSL #(COMPONENT_SHIFT + 1) + + saddw v2.4s, v11.4s, v8.4h + saddw2 v13.4s, v12.4s, v8.8h + add x16, sp, x16, LSL #(COMPONENT_SHIFT + 1) + shrn v2.4h, v2.4s, #1 + shrn2 v2.8h, v13.4s, #1 + add x17, sp, x17, LSL #(COMPONENT_SHIFT + 1) + neg v2.8h, v2.8h + + shsub v3.8h, v10.8h, v9.8h + + /* increment the x fractional parts (oveflow is ignored, as the + * scalar arithmetic shadows this addition with full precision). + */ + add v6.8h, v6.8h, v7.8h + + /* At this point we have four pointers in x8-x11, pointing to the + * four taps in the scratch buffer that must be convolved together + * to produce an output pixel (one output pixel per pointer). + * These pointers usually overlap, but their spacing is irregular + * so resolving the redundancy through L1 is a pragmatic solution. 
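 *
 * For reference, the four per-tap weights as functions of the x fraction t,
 * expanded from cubicInterpolate() in Resize.cpp; w0 and w3 are non-positive
 * on [0,1), which is why their signs are hard-coded with multiply-and-subtract
 * operations, as noted below:
 *
 *   float w0 = -0.5f*t +      t*t - 0.5f*t*t*t;
 *   float w1 =  1.0f   - 2.5f*t*t + 1.5f*t*t*t;
 *   float w2 =  0.5f*t + 2.0f*t*t - 1.5f*t*t*t;
 *   float w3 =         - 0.5f*t*t + 0.5f*t*t*t;   // w0 + w1 + w2 + w3 == 1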
+ * + * The scratch buffer is made of signed 16-bit data, holding over + * some extra precision, and overshoot, from the vertical pass. + * + * We also have the 16-bit unsigned fixed-point weights for each + * of the four taps in v0 - v3. That's eight pixels worth of + * coefficients when we have only four pointers, so calculations + * for four more pixels are interleaved with the fetch and permute + * code for each variant in the following code. + * + * The data arrangement is less than ideal for any pixel format, + * but permuting loads help to mitigate most of the problems. + * + * Note also that the two outside taps of a bicubic are negative, + * but these coefficients are unsigned. The sign is hard-coded by + * use of multiply-and-subtract operations. + */ +.if \comp == 1 + /* The uchar 1 case. + * Issue one lanewise ld4.h to load four consecutive pixels from + * one pointer (one pixel) into four different registers; then load + * four consecutive s16 values from the next pointer (pixel) into + * the next lane of those four registers, etc., so that we finish + * with v12 - v15 representing the four taps, and each lane + * representing a separate pixel. + * + * The first ld4 uses a splat to avoid any false dependency on + * the previous state of the register. + */ + ld4r {v12.8h,v13.8h,v14.8h,v15.8h}, [x14] + lsr x14, x2, #(63 - CHUNKSHIFT) + add x2, x2, x3 + ld4 {v12.h,v13.h,v14.h,v15.h}[1], [x15] + add x14, sp, x14, LSL #(COMPONENT_SHIFT + 1) + lsr x15, x2, #(63 - CHUNKSHIFT) + add x2, x2, x3 + ld4 {v12.h,v13.h,v14.h,v15.h}[2], [x16] + add x15, sp, x15, LSL #(COMPONENT_SHIFT + 1) + lsr x16, x2, #(63 - CHUNKSHIFT) + add x2, x2, x3 + ld4 {v12.h,v13.h,v14.h,v15.h}[3], [x17] + add x16, sp, x16, LSL #(COMPONENT_SHIFT + 1) + lsr x17, x2, #(63 - CHUNKSHIFT) + add x2, x2, x3 + ld4 {v12.h,v13.h,v14.h,v15.h}[4], [x14] + add x17, sp, x17, LSL #(COMPONENT_SHIFT + 1) + ld4 {v12.h,v13.h,v14.h,v15.h}[5], [x15] + ld4 {v12.h,v13.h,v14.h,v15.h}[6], [x16] + ld4 {v12.h,v13.h,v14.h,v15.h}[7], [x17] + + smull v8.4s, v12.4h, v0.4h + smull2 v9.4s, v12.8h, v0.8h + smlsl v8.4s, v13.4h, v1.4h + smlsl2 v9.4s, v13.8h, v1.8h + smlsl v8.4s, v14.4h, v2.4h + smlsl2 v9.4s, v14.8h, v2.8h + smlal v8.4s, v15.4h, v3.4h + smlal2 v9.4s, v15.8h, v3.8h + + subs x13, x13, #LOOP_OUTPUT_SIZE + + sqrshrn v8.4h, v8.4s, #15 + sqrshrn2 v8.8h, v9.4s, #15 + + sqrshrun v8.8b, v8.8h, #VERTBITS - 8 +.elseif \comp == 2 + /* The uchar2 case: + * This time load pairs of values into adjacent lanes in v12 - v15 + * by aliasing them as u32 data; leaving room for only four pixels, + * so the process has to be done twice. This also means that the + * coefficient registers fail to align with the coefficient data + * (eight separate pixels), so that has to be doubled-up to match. 
+ */ + ld4r {v12.4s,v13.4s,v14.4s,v15.4s}, [x14] + lsr x14, x2, #(63 - CHUNKSHIFT) + add x2, x2, x3 + ld4 {v12.s,v13.s,v14.s,v15.s}[1], [x15] + add x14, sp, x14, LSL #(COMPONENT_SHIFT + 1) + lsr x15, x2, #(63 - CHUNKSHIFT) + add x2, x2, x3 + ld4 {v12.s,v13.s,v14.s,v15.s}[2], [x16] + add x15, sp, x15, LSL #(COMPONENT_SHIFT + 1) + lsr x16, x2, #(63 - CHUNKSHIFT) + add x2, x2, x3 + ld4 {v12.s,v13.s,v14.s,v15.s}[3], [x17] + add x16, sp, x16, LSL #(COMPONENT_SHIFT + 1) + lsr x17, x2, #(63 - CHUNKSHIFT) + add x2, x2, x3 + + /* double-up coefficients to align with component pairs */ + zip1 v16.8h, v0.8h, v0.8h + add x17, sp, x17, LSL #(COMPONENT_SHIFT + 1) + zip1 v17.8h, v1.8h, v1.8h + zip1 v18.8h, v2.8h, v2.8h + zip1 v19.8h, v3.8h, v3.8h + + smull v8.4s, v12.4h, v16.4h + smull2 v9.4s, v12.8h, v16.8h + smlsl v8.4s, v13.4h, v17.4h + smlsl2 v9.4s, v13.8h, v17.8h + smlsl v8.4s, v14.4h, v18.4h + smlsl2 v9.4s, v14.8h, v18.8h + smlal v8.4s, v15.4h, v19.4h + smlal2 v9.4s, v15.8h, v19.8h + + sqrshrn v8.4h, v8.4s, #15 + sqrshrn2 v8.8h, v9.4s, #15 + + ld4r {v12.4s,v13.4s,v14.4s,v15.4s}, [x14] + ld4 {v12.s,v13.s,v14.s,v15.s}[1], [x15] + ld4 {v12.s,v13.s,v14.s,v15.s}[2], [x16] + ld4 {v12.s,v13.s,v14.s,v15.s}[3], [x17] + + /* double-up coefficients to align with component pairs */ + zip2 v16.8h, v0.8h, v0.8h + zip2 v17.8h, v1.8h, v1.8h + zip2 v18.8h, v2.8h, v2.8h + zip2 v19.8h, v3.8h, v3.8h + + smull v10.4s, v12.4h, v16.4h + smull2 v11.4s, v12.8h, v16.8h + smlsl v10.4s, v13.4h, v17.4h + smlsl2 v11.4s, v13.8h, v17.8h + smlsl v10.4s, v14.4h, v18.4h + smlsl2 v11.4s, v14.8h, v18.8h + smlal v10.4s, v15.4h, v19.4h + smlal2 v11.4s, v15.8h, v19.8h + + subs x13, x13, #LOOP_OUTPUT_SIZE + + sqrshrn v9.4h, v10.4s, #15 + sqrshrn2 v9.8h, v11.4s, #15 + + sqrshrun v8.8b, v8.8h, #VERTBITS - 8 + sqrshrun2 v8.16b, v9.8h, #VERTBITS - 8 +.elseif \comp == 4 + /* The uchar4 case. + * This case is comparatively painless because four s16s are the + * smallest addressable unit for a vmul-by-scalar. Rather than + * permute the data, simply arrange the multiplies to suit the way + * the data comes in. That's a lot of data, though, so things + * progress in pairs of pixels at a time. + */ + ld1 {v12.8h,v13.8h}, [x14] + lsr x14, x2, #(63 - CHUNKSHIFT) + add x2, x2, x3 + ld1 {v14.8h,v15.8h}, [x15] + add x14, sp, x14, LSL #(COMPONENT_SHIFT + 1) + lsr x15, x2, #(63 - CHUNKSHIFT) + add x2, x2, x3 + + smull v8.4s, v12.4h, v0.h[0] + smull v9.4s, v14.4h, v0.h[1] + smlsl2 v8.4s, v12.8h, v1.h[0] + smlsl2 v9.4s, v14.8h, v1.h[1] + smlsl v8.4s, v13.4h, v2.h[0] + smlsl v9.4s, v15.4h, v2.h[1] + smlal2 v8.4s, v13.8h, v3.h[0] + smlal2 v9.4s, v15.8h, v3.h[1] + + /* And two more... */ + ld1 {v12.8h,v13.8h}, [x16] + add x15, sp, x15, LSL #(COMPONENT_SHIFT + 1) + lsr x16, x2, #(63 - CHUNKSHIFT) + add x2, x2, x3 + ld1 {v14.8h,v15.8h}, [x17] + add x16, sp, x16, LSL #(COMPONENT_SHIFT + 1) + lsr x17, x2, #(63 - CHUNKSHIFT) + add x2, x2, x3 + + sqrshrn v8.4h, v8.4s, #15 + add x17, sp, x17, LSL #(COMPONENT_SHIFT + 1) + sqrshrn2 v8.8h, v9.4s, #15 + + smull v10.4s, v12.4h, v0.h[2] + smull v11.4s, v14.4h, v0.h[3] + smlsl2 v10.4s, v12.8h, v1.h[2] + smlsl2 v11.4s, v14.8h, v1.h[3] + smlsl v10.4s, v13.4h, v2.h[2] + smlsl v11.4s, v15.4h, v2.h[3] + smlal2 v10.4s, v13.8h, v3.h[2] + smlal2 v11.4s, v15.8h, v3.h[3] + + sqrshrn v9.4h, v10.4s, #15 + sqrshrn2 v9.8h, v11.4s, #15 + + sqrshrun v8.8b, v8.8h, #VERTBITS - 8 + sqrshrun2 v8.16b, v9.8h, #VERTBITS - 8 + + /* And two more... 
*/ + ld1 {v12.8h,v13.8h}, [x14] + ld1 {v14.8h,v15.8h}, [x15] + + smull v10.4s, v12.4h, v0.h[4] + smull v11.4s, v14.4h, v0.h[5] + smlsl2 v10.4s, v12.8h, v1.h[4] + smlsl2 v11.4s, v14.8h, v1.h[5] + smlsl v10.4s, v13.4h, v2.h[4] + smlsl v11.4s, v15.4h, v2.h[5] + smlal2 v10.4s, v13.8h, v3.h[4] + smlal2 v11.4s, v15.8h, v3.h[5] + + /* And two more... */ + ld1 {v12.8h,v13.8h}, [x16] + ld1 {v14.8h,v15.8h}, [x17] + + subs x13, x13, #LOOP_OUTPUT_SIZE + + sqrshrn v9.4h, v10.4s, #15 + sqrshrn2 v9.8h, v11.4s, #15 + + smull v10.4s, v12.4h, v0.h[6] + smull v11.4s, v14.4h, v0.h[7] + smlsl2 v10.4s, v12.8h, v1.h[6] + smlsl2 v11.4s, v14.8h, v1.h[7] + smlsl v10.4s, v13.4h, v2.h[6] + smlsl v11.4s, v15.4h, v2.h[7] + smlal2 v10.4s, v13.8h, v3.h[6] + smlal2 v11.4s, v15.8h, v3.h[7] + + sqrshrn v10.4h, v10.4s, #15 + sqrshrn2 v10.8h, v11.4s, #15 + + sqrshrun v9.8b, v9.8h, #VERTBITS - 8 + sqrshrun2 v9.16b, v10.8h, #VERTBITS - 8 +.endif + bgt 2b /* continue inner loop */ + /* The inner loop has already been limited to ensure that none of + * the earlier iterations could overfill the output, so the store + * appears within the loop but after the conditional branch (at the + * top). At the end, provided it won't overfill, perform the final + * store here. If it would, then break out to the tricky tail case + * instead. + */ + blt 1f + /* Store the amount of data appropriate to the configuration of the + * instance being assembled. + */ +.if LOOP_OUTPUT_SIZE == 4 + st1 {v8.s}[0], [x0], #4 +.elseif LOOP_OUTPUT_SIZE == 8 + st1 {v8.8b}, [x0], #8 +.elseif LOOP_OUTPUT_SIZE == 16 + st1 {v8.16b}, [x0], #16 +.elseif LOOP_OUTPUT_SIZE == 32 + st1 {v8.16b,v9.16b}, [x0], #32 +.endif + b 1b /* resume outer loop */ + /* Partial tail store case: + * Different versions of the code need different subsets of the + * following partial stores. Here the number of components and the + * size of the chunk of data produced by each inner loop iteration + * is tested to figure out whether or not each phrase is relevant. + */ +.if 16 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 16 +1: tst x13, #16 + beq 1f + st1 {v8.16b}, [x0], #16 + mov v8.16b, v9.16b +.endif +.if 8 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 8 +1: tst x13, #8 + beq 1f + st1 {v8.8b}, [x0], #8 + ext v8.16b, v8.16b, v8.16b, #8 +.endif +.if 4 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 4 +1: tst x13, #4 + beq 1f + st1 {v8.s}[0], [x0], #4 + ext v8.8b, v8.8b, v8.8b, #4 +.endif +.if 2 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 2 +1: tst x13, #2 + beq 1f + st1 {v8.h}[0], [x0], #2 + ext v8.8b, v8.8b, v8.8b, #2 +.endif +.if 1 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 1 +1: tst x13, #1 + beq 1f + st1 {v8.b}[0], [x0], #1 +.endif +1: +9: mov sp, x19 + ld1 {v8.1d - v11.1d}, [sp], #32 + ld1 {v12.1d - v15.1d}, [sp], #32 + ldr x19, [sp], #16 + ret +END(rsdIntrinsicResizeB\comp\()_K) +.endr + +.rodata +intrinsic_resize_consts: .hword 0, 1, 2, 3, 4, 5, 6, 7 diff --git a/renderscript-toolkit/src/main/cpp/Resize_neon.S b/renderscript-toolkit/src/main/cpp/Resize_neon.S new file mode 100644 index 0000000..eb7f694 --- /dev/null +++ b/renderscript-toolkit/src/main/cpp/Resize_neon.S @@ -0,0 +1,799 @@ +/* + * Copyright (C) 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f: .fnstart +#define END(f) .fnend; .size f, .-f; + +.eabi_attribute 25,1 @Tag_ABI_align8_preserved +.arm + +/* Fixed-point precision after vertical pass -- 16 bit data minus 1 sign and 1 + * integer (bicubic has a little overshoot). It would also be possible to add + * a temporary DC bias to eliminate the sign bit for more precision, but that's + * extra arithmetic. + */ +.set VERTBITS, 14 + +/* The size of the scratch buffer in which we store our vertically convolved + * intermediates. + */ +.set CHUNKSHIFT, 7 +.set CHUNKSIZE, (1 << CHUNKSHIFT) + +/* The number of components processed in a single iteration of the innermost + * loop. + */ +.set VECSHIFT, 3 +.set VECSIZE, (1<<VECSHIFT) + +/* Read four different lines (except at edges where addresses may be clamped, + * which is why we don't simply take base and stride registers), and multiply + * and accumulate them by the coefficients in d6[0..3], leaving the results in + * q12. This gives eight 16-bit results representing a horizontal line of 2-8 + * input pixels (depending on number of components per pixel) to be fed into + * the horizontal scaling pass. + * + * Input coefficients are 16-bit unsigned fixed-point (although [0] and [3] are + * known to represent negative values and VMLS is used to implement this). + * Output is VERTBITS signed fixed-point, which must leave room for a little + * bit of overshoot beyond [0,1.0). + */ +.macro vert8, dstlo=d24, dsthi=d25 + vld1.u8 d16, [r4]! + vld1.u8 d18, [r5]! + vld1.u8 d20, [r6]! + vld1.u8 d22, [r7]! + vmovl.u8 q8, d16 + vmovl.u8 q9, d18 + vmovl.u8 q10, d20 + vmovl.u8 q11, d22 + vmull.u16 q12, d18, d6[1] + vmull.u16 q13, d19, d6[1] + vmlsl.u16 q12, d16, d6[0] + vmlsl.u16 q13, d17, d6[0] + vmlal.u16 q12, d20, d6[2] + vmlal.u16 q13, d21, d6[2] + vmlsl.u16 q12, d22, d6[3] + vmlsl.u16 q13, d23, d6[3] + + /* Shift by 8 (bits per pixel), plus 16 (the fixed-point multiplies), + * minus VERTBITS (the number of fraction bits we want to keep from + * here on). + */ + vqshrn.s32 \dstlo, q12, #8 + 16 - VERTBITS + vqshrn.s32 \dsthi, q13, #8 + 16 - VERTBITS +.endm + +/* As above, but only four 16-bit results into d25. + */ +.macro vert4 + vld1.u32 d16[0], [r4]! + vld1.u32 d18[0], [r5]! + vld1.u32 d20[0], [r6]! + vld1.u32 d22[0], [r7]! + vmovl.u8 q8, d16 + vmovl.u8 q9, d18 + vmovl.u8 q10, d20 + vmovl.u8 q11, d22 + vmull.u16 q12, d18, d6[1] + vmlsl.u16 q12, d16, d6[0] + vmlal.u16 q12, d20, d6[2] + vmlsl.u16 q12, d22, d6[3] + vqshrn.s32 d25, q12, #8 + 16 - VERTBITS +.endm + + +/* During horizontal resize having CHUNKSIZE input available means being able + * to produce a varying amount of output, depending on the phase of the data. + * This function calculates the minimum number of VECSIZE chunks extracted from + * a CHUNKSIZE window (r1), and the threshold value for when the count will be + * one higher than that (r0). 
+ * These work out, conveniently, to be the quotient and remainder from: + * (CHUNKSIZE + xinc * VECSIZE - 1) / (xinc * VECSIZE) + * + * The two values can be packed together in a uint64_t for convenience; and + * they are, in fact, used this way as an arithmetic short-cut later on. + */ + +/* uint64_t rsdIntrinsicResize_oscctl_K(uint32_t xinc); */ +ENTRY(rsdIntrinsicResize_oscctl_K) + lsl r2, r0, #VECSHIFT + movw r0, #:lower16:(CHUNKSIZE << 16) - 1 + movt r0, #:upper16:(CHUNKSIZE << 16) - 1 + add r0, r0, r2 +#if defined(ARCH_ARM_USE_UDIV) + udiv r1, r0, r2 + mls r0, r1, r2, r0 +#else + clz r3, r2 + clz r1, r0 + subs r3, r3, r1 + movlt r3, #0 + mov r1, #1 + lsl r2, r2, r3 + lsl r3, r1, r3 + mov r1, #0 +1: cmp r2, r0 + addls r1, r3 + subls r0, r2 + lsrs r3, r3, #1 + lsr r2, r2, #1 + bne 1b +#endif + bx lr +END(rsdIntrinsicResize_oscctl_K) + +/* Iterate to generate the uchar1, uchar2, and uchar4 versions of the code. + * For the most part the vertical pass (the outer loop) is the same for all + * versions. Exceptions are handled in-line with conditional assembly. + */ +.irp comp, 1, 2, 4 +.if \comp == 1 +.set COMPONENT_SHIFT, 0 +.elseif \comp == 2 +.set COMPONENT_SHIFT, 1 +.elseif \comp == 4 +.set COMPONENT_SHIFT, 2 +.else +.error "Unknown component count" +.endif +.set COMPONENT_COUNT, (1 << COMPONENT_SHIFT) +.set LOOP_OUTPUT_SIZE, (VECSIZE * COMPONENT_COUNT) + +.set BUFFER_SIZE, (CHUNKSIZE * 2 + 4) * COMPONENT_COUNT * 2 +.set OSC_STORE, (BUFFER_SIZE + 0) +.set OSCSTEP_STORE, (BUFFER_SIZE + 4) +.set OSCCTL_STORE, (BUFFER_SIZE + 8) +.set AVAIL_STORE, (BUFFER_SIZE + 16) +.set SP_STORE, (BUFFER_SIZE + 24) /* should be +20, but rounded up to make a legal constant somewhere */ + +/* void rsdIntrinsicResizeB\comp\()_K( + * uint8_t * restrict dst, // r0 + * size_t count, // r1 + * uint32_t xf, // r2 + * uint32_t xinc, // r3 + * uint8_t const * restrict srcn, // [sp] -> [sp,#104] -> r4 + * uint8_t const * restrict src0, // [sp,#4] -> [sp,#108] -> r5 + * uint8_t const * restrict src1, // [sp,#8] -> [sp,#112] -> r6 + * uint8_t const * restrict src2, // [sp,#12] -> [sp,#116] -> r7 + * size_t xclip, // [sp,#16] -> [sp,#120] + * size_t avail, // [sp,#20] -> [sp,#124] -> lr + * uint64_t osc_ctl, // [sp,#24] -> [sp,#128] + * int32_t const *yr); // [sp,#32] -> [sp,#136] -> d8 (copied to d6 for scalar access) + */ +ENTRY(rsdIntrinsicResizeB\comp\()_K) + push {r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} + vpush {d8-d15} + + /* align the working buffer on the stack to make it easy to use bit + * twiddling for address calculations and bounds tests. + */ + sub r12, sp, #BUFFER_SIZE + 32 + mov lr, sp + bfc r12, #0, #CHUNKSHIFT + 1 + COMPONENT_SHIFT + 1 + mov sp, r12 + str lr, [sp,#SP_STORE] + + ldr r8, [lr,#136] // yr + adr r9, 8f + vld1.s32 {q4}, [r8] + vld1.s16 {q5}, [r9] + vqmovun.s32 d8, q4 // yr + vdup.s16 q6, r2 + vdup.s16 q7, r3 + vmla.s16 q6, q5, q7 // vxf + vshl.s16 q7, q7, #VECSHIFT // vxinc + + ldrd r4,r5, [lr,#104] // srcn, src0 + ldrd r6,r7, [lr,#112] // src1, src2 + + /* Compute starting condition for oscillator used to compute ahead + * of time how many iterations are possible before needing to + * refill the working buffer. This is based on the fixed-point + * index of the last element in the vector of pixels processed in + * each iteration, counting up until it would overflow. 
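 *
 * A hypothetical C model of the starting condition computed just below: the
 * 16.16 fixed-point source index of the last lane in the first vector,
 *
 *   uint32_t osc_start = xf + xinc * (VECSIZE - 1);
 *
 * which the outer loop then compares against the osc_ctl threshold to decide
 * whether this pass must do one fewer iteration.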
+ */ + sub r8, r2, r3 + mov r9, r3, LSL #VECSHIFT + add r8, r8, r9 + + ldrd r10,r11, [lr,#128] // osc_ctl + + str r8, [sp,#OSC_STORE] + str r9, [sp,#OSCSTEP_STORE] + str r10, [sp,#OSCCTL_STORE] + str r11, [sp,#OSCCTL_STORE+4] + ldrd r10,r11, [lr,#120] // xclip,avail + + + /* r4-r7 contain pointers to the four lines of input to be + * convolved. These pointers have been clamped vertically and + * horizontally (which is why it's not a simple row/stride pair), + * and the xclip argument (now in r10) indicates how many pixels + * from true the x position of the pointer is. This value should + * be 0, 1, or 2 only. + * + * Start by placing four pixels worth of input at the far end of + * the buffer. As many as two of these may be clipped, so four + * pixels are fetched, and then the first pixel is duplicated and + * the data shifted according to xclip. The source pointers are + * then also adjusted according to xclip so that subsequent fetches + * match. + */ + vmov d6, d8 /* make y coeffs available for vert4 and vert8 macros */ + + sub r8, r12, r10, LSL #COMPONENT_SHIFT + 1 + add r9, r12, #(2 * CHUNKSIZE - 4) * COMPONENT_COUNT * 2 + add r8, r8, #4 * COMPONENT_COUNT * 2 +.if \comp == 1 + vert4 + vdup.s16 d24, d25[0] + vst1.s16 {q12}, [r12] + vld1.s16 {d24}, [r8] + vst1.s16 {d24}, [r9] +.elseif \comp == 2 + vert8 + vdup.u32 q11, d24[0] + vst1.s16 {q11,q12}, [r12] + vld1.s16 {q12}, [r8] + vst1.s16 {q12}, [r9] +.elseif \comp == 4 + vert8 d28, d29 + vert8 d30, d31 + vmov.u64 d24, d28 + vmov.u64 d25, d28 + vmov.u64 d26, d28 + vmov.u64 d27, d28 + vst1.s16 {q12,q13}, [r12]! + vst1.s16 {q14,q15}, [r12] + sub r12, r12, #32 + vld1.s16 {q11,q12}, [r8] + vst1.s16 {q11,q12}, [r9] +.endif + /* Count off four pixels into the working buffer, and move count to + * its new home. + */ + sub lr, r11, #4 + /* Incoming pointers were to the first _legal_ pixel. Four pixels + * were read unconditionally, but some may have been discarded by + * xclip, so we rewind the pointers to compensate. + */ + sub r4, r4, r10, LSL #COMPONENT_SHIFT + sub r5, r5, r10, LSL #COMPONENT_SHIFT + sub r6, r6, r10, LSL #COMPONENT_SHIFT + sub r7, r7, r10, LSL #COMPONENT_SHIFT + + /* First tap starts where we just pre-filled, at the end of the + * buffer. + */ + add r2, r2, #(CHUNKSIZE * 2 - 4) << 16 + + /* Use overflowing arithmetic to implement wraparound array + * indexing. + */ + mov r2, r2, LSL #(15 - CHUNKSHIFT) + mov r3, r3, LSL #(15 - CHUNKSHIFT) + + str lr, [sp,#AVAIL_STORE] + + /* Start of outermost loop. + * Fetch CHUNKSIZE pixels into scratch buffer, then calculate the + * number of iterations of the inner loop that can be performed and + * get into that. + * + * The fill is complicated by the possibility of running out of + * input before the scratch buffer is filled. If this isn't a risk + * then it's handled by the simple loop at 2:, otherwise the + * horrible loop at 3:. + */ +1: ldr lr, [sp,#AVAIL_STORE] /* get number of pixels available */ + vmov d6, d8 /* put y scaling coefficients somewhere handy */ + subs lr, #CHUNKSIZE + bge 2f /* if at least CHUNKSIZE are available... */ + add lr, #CHUNKSIZE /* if they're not... */ + b 4f + /* ..just sneaking a literal in here after this unconditional branch.. */ +8: .hword 0, 1, 2, 3, 4, 5, 6, 7 + /* basic fill loop, processing 8 bytes at a time until there are + * fewer than eight bytes available. + */ +3: vert8 + sub lr, lr, #8 / COMPONENT_COUNT + vst1.s16 {q12}, [r12]! 
+4: cmp lr, #8 / COMPONENT_COUNT - 1 + bgt 3b +.if \comp == 4 + blt 3f + /* The last pixel (four bytes) if necessary */ + vert4 +.else + cmp lr, #1 + blt 3f + /* The last pixels if necessary */ + sub r4, r4, #8 + sub r5, r5, #8 + sub r6, r6, #8 + sub r7, r7, #8 + add r4, r4, lr, LSL #COMPONENT_SHIFT + add r5, r5, lr, LSL #COMPONENT_SHIFT + add r6, r6, lr, LSL #COMPONENT_SHIFT + add r7, r7, lr, LSL #COMPONENT_SHIFT + vert8 + sub lr, sp, lr, LSL #COMPONENT_SHIFT + 1 + sub sp, sp, #32 + sub lr, lr, #16 +.if \comp == 1 + vdup.s16 q13, d25[3] +.elseif \comp == 2 + vdup.u32 q13, d25[1] +.endif + vst1.s16 {q12,q13}, [sp] + vld1.s16 {q12}, [lr] + add sp, sp, #32 + b 4f +.endif + /* Keep filling until we get to the end of this chunk of the buffer */ +3: +.if \comp == 1 + vdup.s16 q12, d25[3] +.elseif \comp == 2 + vdup.u32 q12, d25[1] +.elseif \comp == 4 + vmov.u64 d24, d25 +.endif +4: vst1.s16 {q12}, [r12]! + tst r12, #(CHUNKSIZE - 1) * COMPONENT_COUNT * 2 + bne 3b + b 4f + +.align 4 +2: /* Quickly pull a chunk of data into the working buffer. + */ + vert8 + vst1.s16 {q12}, [r12]! + vert8 + vst1.s16 {q12}, [r12]! + tst r12, #(CHUNKSIZE - 1) * COMPONENT_COUNT * 2 + bne 2b + cmp lr, #0 + bne 3f +4: /* if we end with 0 pixels left we'll have nothing handy to spread + * across to the right, so we rewind a bit. + */ + mov lr, #1 + sub r4, r4, #COMPONENT_COUNT + sub r5, r5, #COMPONENT_COUNT + sub r6, r6, #COMPONENT_COUNT + sub r7, r7, #COMPONENT_COUNT +3: str lr, [sp,#AVAIL_STORE] /* done with available pixel count */ + add lr, sp, #OSC_STORE + ldrd r8,r9, [lr,#0] /* need osc, osc_step soon */ + ldrd r10,r11, [lr,#OSCCTL_STORE-OSC_STORE] /* need osc_ctl too */ + + /* copy four taps (width of cubic window) to far end for overflow + * address handling + */ + sub lr, r12, #CHUNKSIZE * COMPONENT_COUNT * 2 + eor r12, lr, #CHUNKSIZE * COMPONENT_COUNT * 2 +.if \comp == 1 + vld1.s16 {d28}, [lr] +.elseif \comp == 2 + vld1.s16 {q14}, [lr] +.elseif \comp == 4 + vld1.s16 {q14,q15}, [lr] +.endif + add lr, r12, #CHUNKSIZE * COMPONENT_COUNT * 2 +.if \comp == 1 + vst1.s16 {d28}, [lr] +.elseif \comp == 2 + vst1.s16 {q14}, [lr] +.elseif \comp == 4 + vst1.s16 {q14,q15}, [lr] +.endif + /* r11 contains the maximum possible iteration count, but if r8 is + * greater than r10 then this indicates that the count must be + * reduced by one for this iteration to avoid reading past the end + * of the available data. + */ + cmp r10, r8 + sbc lr, r11, #0 + + mla r8, lr, r9, r8 + sub r8, r8, #(CHUNKSIZE << 16) + + str r8, [sp,#OSC_STORE] /* done with osc */ + + /* prefer to count pixels, rather than vectors, to clarify the tail + * store case on exit. + */ + mov lr, lr, LSL #VECSHIFT + cmp lr, r1 + movgt lr, r1 + + sub r1, r1, lr + + mov lr, lr, LSL #COMPONENT_SHIFT + + vmov.i16 d10, #3 + vmov.i16 d11, #0x8000 + + cmp lr, #0 + bgt 3f + cmp r1, #0 + bgt 1b /* an extreme case where we shouldn't use code in this structure */ + b 9f + + .align 4 +2: /* Inner loop continues here, but starts at 3:, see end of loop + * below for explanation. */ +.if LOOP_OUTPUT_SIZE == 4 + vst1.u32 {d16[0]}, [r0]! +.elseif LOOP_OUTPUT_SIZE == 8 + vst1.u8 {d16}, [r0]! +.elseif LOOP_OUTPUT_SIZE == 16 + vst1.u8 {q8}, [r0]! +.elseif LOOP_OUTPUT_SIZE == 32 + vst1.u8 {q8,q9}, [r0]! +.endif + /* Inner loop: here the four x coefficients for each tap are + * calculated in vector code, and the addresses are calculated in + * scalar code, and these calculations are interleaved. 
+ */ +3: vshr.u16 q8, q6, #1 + mov r8, r2, LSR #(31 - CHUNKSHIFT) + vqrdmulh.s16 q9, q8, q8 + add r2, r2, r3 + vqrdmulh.s16 q10, q9, q8 + mov r9, r2, LSR #(31 - CHUNKSHIFT) + vshll.s16 q11, d18, #2 + vshll.s16 q12, d19, #2 + add r2, r2, r3 + vmlsl.s16 q11, d20, d10 + vmlsl.s16 q12, d21, d10 + mov r10, r2, LSR #(31 - CHUNKSHIFT) + + vhadd.s16 q0, q10, q8 + add r2, r2, r3 + vsub.s16 q0, q9, q0 + mov r11, r2, LSR #(31 - CHUNKSHIFT) + + vaddw.s16 q1, q11, d18 + vaddw.s16 q13, q12, d19 + add r2, r2, r3 + vshrn.s32 d2, q1, #1 + vshrn.s32 d3, q13, #1 + add r8, sp, r8, LSL #(COMPONENT_SHIFT + 1) + vsub.s16 d2, d2, d11 + vsub.s16 d3, d3, d11 // TODO: find a wider d11 and use q-reg operation + add r9, sp, r9, LSL #(COMPONENT_SHIFT + 1) + + vaddw.s16 q2, q11, d16 + vaddw.s16 q13, q12, d17 + add r10, sp, r10, LSL #(COMPONENT_SHIFT + 1) + vshrn.s32 d4, q2, #1 + vshrn.s32 d5, q13, #1 + add r11, sp, r11, LSL #(COMPONENT_SHIFT + 1) + vneg.s16 q2, q2 + + vhsub.s16 q3, q10, q9 + + /* increment the x fractional parts (oveflow is ignored, as the + * scalar arithmetic shadows this addition with full precision). + */ + vadd.s16 q6, q6, q7 + + /* At this point we have four pointers in r8-r11, pointing to the + * four taps in the scratch buffer that must be convolved together + * to produce an output pixel (one output pixel per pointer). + * These pointers usually overlap, but their spacing is irregular + * so resolving the redundancy through L1 is a pragmatic solution. + * + * The scratch buffer is made of signed 16-bit data, holding over + * some extra precision, and overshoot, from the vertical pass. + * + * We also have the 16-bit unsigned fixed-point weights for each + * of the four taps in q0 - q3. That's eight pixels worth of + * coefficients when we have only four pointers, so calculations + * for four more pixels are interleaved with the fetch and permute + * code for each variant in the following code. + * + * The data arrangement is less than ideal for any pixel format, + * but permuting loads help to mitigate most of the problems. + * + * Note also that the two outside taps of a bicubic are negative, + * but these coefficients are unsigned. The sign is hard-coded by + * use of multiply-and-subtract operations. + */ +.if \comp == 1 + /* The uchar 1 case. + * Issue one lanewise vld4.s16 to load four consecutive pixels from + * one pointer (one pixel) into four different registers; then load + * four consecutive s16 values from the next pointer (pixel) into + * the next lane of those four registers, etc., so that we finish + * with q12 - q15 representing the four taps, and each lane + * representing a separate pixel. + * + * The first vld4 uses a splat to avoid any false dependency on + * the previous state of the register. 
+ */ + vld4.s16 {d24[],d26[],d28[],d30[]}, [r8] + mov r8, r2, LSR #(31 - CHUNKSHIFT) + add r2, r2, r3 + vld4.s16 {d24[1],d26[1],d28[1],d30[1]}, [r9] + add r8, sp, r8, LSL #(COMPONENT_SHIFT + 1) + mov r9, r2, LSR #(31 - CHUNKSHIFT) + add r2, r2, r3 + vld4.s16 {d24[2],d26[2],d28[2],d30[2]}, [r10] + add r9, sp, r9, LSL #(COMPONENT_SHIFT + 1) + mov r10, r2, LSR #(31 - CHUNKSHIFT) + add r2, r2, r3 + vld4.s16 {d24[3],d26[3],d28[3],d30[3]}, [r11] + add r10, sp, r10, LSL #(COMPONENT_SHIFT + 1) + mov r11, r2, LSR #(31 - CHUNKSHIFT) + add r2, r2, r3 + vld4.s16 {d25[],d27[],d29[],d31[]}, [r8] + add r11, sp, r11, LSL #(COMPONENT_SHIFT + 1) + vld4.s16 {d25[1],d27[1],d29[1],d31[1]}, [r9] + vld4.s16 {d25[2],d27[2],d29[2],d31[2]}, [r10] + vld4.s16 {d25[3],d27[3],d29[3],d31[3]}, [r11] + + vmull.s16 q8, d24, d0 + vmull.s16 q9, d25, d1 + vmlsl.s16 q8, d26, d2 + vmlsl.s16 q9, d27, d3 + vmlsl.s16 q8, d28, d4 + vmlsl.s16 q9, d29, d5 + vmlal.s16 q8, d30, d6 + vmlal.s16 q9, d31, d7 + + subs lr, lr, #LOOP_OUTPUT_SIZE + + vqrshrn.s32 d16, q8, #15 + vqrshrn.s32 d17, q9, #15 + + vqrshrun.s16 d16, q8, #VERTBITS - 8 +.elseif \comp == 2 + /* The uchar2 case: + * This time load pairs of values into adjacent lanes in q12 - q15 + * by aliasing them as u32 data; leaving room for only four pixels, + * so the process has to be done twice. This also means that the + * coefficient registers fail to align with the coefficient data + * (eight separate pixels), so that has to be doubled-up to match. + */ + vld4.u32 {d24[],d26[],d28[],d30[]}, [r8] + mov r8, r2, LSR #(31 - CHUNKSHIFT) + add r2, r2, r3 + vld4.u32 {d24[1],d26[1],d28[1],d30[1]}, [r9] + add r8, sp, r8, LSL #(COMPONENT_SHIFT + 1) + mov r9, r2, LSR #(31 - CHUNKSHIFT) + add r2, r2, r3 + vld4.u32 {d25[],d27[],d29[],d31[]}, [r10] + add r9, sp, r9, LSL #(COMPONENT_SHIFT + 1) + mov r10, r2, LSR #(31 - CHUNKSHIFT) + add r2, r2, r3 + vld4.u32 {d25[1],d27[1],d29[1],d31[1]}, [r11] + add r10, sp, r10, LSL #(COMPONENT_SHIFT + 1) + mov r11, r2, LSR #(31 - CHUNKSHIFT) + add r2, r2, r3 + + /* double-up coefficients to align with component pairs */ + vmov d20, d0 + add r11, sp, r11, LSL #(COMPONENT_SHIFT + 1) + vmov d21, d2 + vmov d22, d4 + vmov d23, d6 + vzip.s16 d0, d20 + vzip.s16 d2, d21 + vzip.s16 d4, d22 + vzip.s16 d6, d23 + + vmull.s16 q8, d24, d0 + vmull.s16 q9, d25, d20 + vmlsl.s16 q8, d26, d2 + vmlsl.s16 q9, d27, d21 + vmlsl.s16 q8, d28, d4 + vmlsl.s16 q9, d29, d22 + vmlal.s16 q8, d30, d6 + vmlal.s16 q9, d31, d23 + + vqrshrn.s32 d16, q8, #15 + vqrshrn.s32 d17, q9, #15 + + vld4.u32 {d24[],d26[],d28[],d30[]}, [r8] + vld4.u32 {d24[1],d26[1],d28[1],d30[1]}, [r9] + vld4.u32 {d25[],d27[],d29[],d31[]}, [r10] + vld4.u32 {d25[1],d27[1],d29[1],d31[1]}, [r11] + + /* double-up coefficients to align with component pairs */ + vmov d0, d1 + vmov d2, d3 + vmov d4, d5 + vmov d6, d7 + vzip.s16 d0, d1 + vzip.s16 d2, d3 + vzip.s16 d4, d5 + vzip.s16 d6, d7 + + vmull.s16 q10, d24, d0 + vmull.s16 q11, d25, d1 + vmlsl.s16 q10, d26, d2 + vmlsl.s16 q11, d27, d3 + vmlsl.s16 q10, d28, d4 + vmlsl.s16 q11, d29, d5 + vmlal.s16 q10, d30, d6 + vmlal.s16 q11, d31, d7 + + subs lr, lr, #LOOP_OUTPUT_SIZE + + vqrshrn.s32 d18, q10, #15 + vqrshrn.s32 d19, q11, #15 + + vqrshrun.s16 d16, q8, #VERTBITS - 8 + vqrshrun.s16 d17, q9, #VERTBITS - 8 +.elseif \comp == 4 + /* The uchar4 case. + * This case is comparatively painless because four s16s are the + * smallest addressable unit for a vmul-by-scalar. Rather than + * permute the data, simply arrange the multiplies to suit the way + * the data comes in. 
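+ * (Added note, not part of the original source: the four s16 components of
+ * each tap fill one d register, so a single vmull/vmlsl/vmlal by a scalar
+ * lane such as d0[0] applies that tap's weight to all four colour channels
+ * at once, which is what makes a permute unnecessary here.)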
That's a lot of data, though, so things + * progress in pairs of pixels at a time. + */ + vld1.s16 {q12,q13}, [r8] + mov r8, r2, LSR #(31 - CHUNKSHIFT) + add r2, r2, r3 + vld1.s16 {q14,q15}, [r9] + add r8, sp, r8, LSL #(COMPONENT_SHIFT + 1) + mov r9, r2, LSR #(31 - CHUNKSHIFT) + add r2, r2, r3 + + vmull.s16 q8, d24, d0[0] + vmull.s16 q9, d28, d0[1] + vmlsl.s16 q8, d25, d2[0] + vmlsl.s16 q9, d29, d2[1] + vmlsl.s16 q8, d26, d4[0] + vmlsl.s16 q9, d30, d4[1] + vmlal.s16 q8, d27, d6[0] + vmlal.s16 q9, d31, d6[1] + + /* And two more... */ + vld1.s16 {q12,q13}, [r10] + add r9, sp, r9, LSL #(COMPONENT_SHIFT + 1) + mov r10, r2, LSR #(31 - CHUNKSHIFT) + add r2, r2, r3 + vld1.s16 {q14,q15}, [r11] + add r10, sp, r10, LSL #(COMPONENT_SHIFT + 1) + mov r11, r2, LSR #(31 - CHUNKSHIFT) + add r2, r2, r3 + + vqrshrn.s32 d16, q8, #15 + add r11, sp, r11, LSL #(COMPONENT_SHIFT + 1) + vqrshrn.s32 d17, q9, #15 + + vmull.s16 q10, d24, d0[2] + vmull.s16 q11, d28, d0[3] + vmlsl.s16 q10, d25, d2[2] + vmlsl.s16 q11, d29, d2[3] + vmlsl.s16 q10, d26, d4[2] + vmlsl.s16 q11, d30, d4[3] + vmlal.s16 q10, d27, d6[2] + vmlal.s16 q11, d31, d6[3] + + vqrshrn.s32 d18, q10, #15 + vqrshrn.s32 d19, q11, #15 + + vqrshrun.s16 d16, q8, #VERTBITS - 8 + vqrshrun.s16 d17, q9, #VERTBITS - 8 + + /* And two more... */ + vld1.s16 {q12,q13}, [r8] + vld1.s16 {q14,q15}, [r9] + + vmull.s16 q10, d24, d1[0] + vmull.s16 q11, d28, d1[1] + vmlsl.s16 q10, d25, d3[0] + vmlsl.s16 q11, d29, d3[1] + vmlsl.s16 q10, d26, d5[0] + vmlsl.s16 q11, d30, d5[1] + vmlal.s16 q10, d27, d7[0] + vmlal.s16 q11, d31, d7[1] + + /* And two more... */ + vld1.s16 {q12,q13}, [r10] + vld1.s16 {q14,q15}, [r11] + + subs lr, lr, #LOOP_OUTPUT_SIZE + + vqrshrn.s32 d18, q10, #15 + vqrshrn.s32 d19, q11, #15 + + vmull.s16 q10, d24, d1[2] + vmull.s16 q11, d28, d1[3] + vmlsl.s16 q10, d25, d3[2] + vmlsl.s16 q11, d29, d3[3] + vmlsl.s16 q10, d26, d5[2] + vmlsl.s16 q11, d30, d5[3] + vmlal.s16 q10, d27, d7[2] + vmlal.s16 q11, d31, d7[3] + + vqrshrn.s32 d20, q10, #15 + vqrshrn.s32 d21, q11, #15 + + vqrshrun.s16 d18, q9, #VERTBITS - 8 + vqrshrun.s16 d19, q10, #VERTBITS - 8 +.endif + bgt 2b /* continue inner loop */ + /* The inner loop has already been limited to ensure that none of + * the earlier iterations could overfill the output, so the store + * appears within the loop but after the conditional branch (at the + * top). At the end, provided it won't overfill, perform the final + * store here. If it would, then break out to the tricky tail case + * instead. + */ + blt 1f + /* Store the amount of data appropriate to the configuration of the + * instance being assembled. + */ +.if LOOP_OUTPUT_SIZE == 4 + vst1.u32 {d16[0]}, [r0]! +.elseif LOOP_OUTPUT_SIZE == 8 + vst1.u8 {d16}, [r0]! +.elseif LOOP_OUTPUT_SIZE == 16 + vst1.u8 {q8}, [r0]! +.elseif LOOP_OUTPUT_SIZE == 32 + vst1.u8 {q8,q9}, [r0]! +.endif + b 1b /* resume outer loop */ + /* Partial tail store case: + * Different versions of the code need different subsets of the + * following partial stores. Here the number of components and the + * size of the chunk of data produced by each inner loop iteration + * is tested to figure out whether or not each phrase is relevant. + */ +.if 16 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 16 +1: tst lr, #16 + beq 1f + vst1.u8 {q8}, [r0]! + vmov q8, q9 +.endif +.if 8 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 8 +1: tst lr, #8 + beq 1f + vst1.u8 {d16}, [r0]! + vmov.u8 d16, d17 +.endif +.if 4 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 4 +1: tst lr, #4 + beq 1f + vst1.u32 {d16[0]}, [r0]! 
+ vext.u32 d16, d16, d16, #1 +.endif +.if 2 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 2 +1: tst lr, #2 + beq 1f + vst1.u16 {d16[0]}, [r0]! + vext.u16 d16, d16, d16, #1 +.endif +.if 1 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 1 +1: tst lr, #1 + beq 1f + vst1.u8 {d16[0]}, [r0]! +.endif +1: +9: ldr sp, [sp,#SP_STORE] + vpop {d8-d15} + pop {r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} +END(rsdIntrinsicResizeB\comp\()_K) +.endr diff --git a/renderscript-toolkit/src/main/cpp/TaskProcessor.cpp b/renderscript-toolkit/src/main/cpp/TaskProcessor.cpp new file mode 100644 index 0000000..ed50909 --- /dev/null +++ b/renderscript-toolkit/src/main/cpp/TaskProcessor.cpp @@ -0,0 +1,219 @@ +/* + * Copyright (C) 2021 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "TaskProcessor.h" + +#include <cassert> +#include <sys/prctl.h> + +#include "RenderScriptToolkit.h" +#include "Utils.h" + +#define LOG_TAG "renderscript.toolkit.TaskProcessor" + +namespace renderscript { + +int Task::setTiling(unsigned int targetTileSizeInBytes) { + // Empirically, values smaller than 1000 are unlikely to give good performance. + targetTileSizeInBytes = std::max(1000u, targetTileSizeInBytes); + const size_t cellSizeInBytes = + mVectorSize; // If we add float support, vectorSize * 4 for that. + const size_t targetCellsPerTile = targetTileSizeInBytes / cellSizeInBytes; + assert(targetCellsPerTile > 0); + + size_t cellsToProcessY; + size_t cellsToProcessX; + if (mRestriction == nullptr) { + cellsToProcessX = mSizeX; + cellsToProcessY = mSizeY; + } else { + assert(mRestriction->endX > mRestriction->startX); + assert(mRestriction->endY > mRestriction->startY); + cellsToProcessX = mRestriction->endX - mRestriction->startX; + cellsToProcessY = mRestriction->endY - mRestriction->startY; + } + + // We want rows as large as possible, as the SIMD code we have is more efficient with + // large rows. + mTilesPerRow = divideRoundingUp(cellsToProcessX, targetCellsPerTile); + // Once we know the number of tiles per row, we divide that row evenly. We round up to make + // sure all cells are included in the last tile of the row. + mCellsPerTileX = divideRoundingUp(cellsToProcessX, mTilesPerRow); + + // We do the same thing for the Y direction. + size_t targetRowsPerTile = divideRoundingUp(targetCellsPerTile, mCellsPerTileX); + mTilesPerColumn = divideRoundingUp(cellsToProcessY, targetRowsPerTile); + mCellsPerTileY = divideRoundingUp(cellsToProcessY, mTilesPerColumn); + + return mTilesPerRow * mTilesPerColumn; +} + +void Task::processTile(unsigned int threadIndex, size_t tileIndex) { + // Figure out the overall boundaries. 
+ size_t startWorkX; + size_t startWorkY; + size_t endWorkX; + size_t endWorkY; + if (mRestriction == nullptr) { + startWorkX = 0; + startWorkY = 0; + endWorkX = mSizeX; + endWorkY = mSizeY; + } else { + startWorkX = mRestriction->startX; + startWorkY = mRestriction->startY; + endWorkX = mRestriction->endX; + endWorkY = mRestriction->endY; + } + // Figure out the rectangle for this tileIndex. All our tiles form a 2D grid. Identify + // first the X, Y coordinate of our tile in that grid. + size_t tileIndexY = tileIndex / mTilesPerRow; + size_t tileIndexX = tileIndex % mTilesPerRow; + // Calculate the starting and ending point of that tile. + size_t startCellX = startWorkX + tileIndexX * mCellsPerTileX; + size_t startCellY = startWorkY + tileIndexY * mCellsPerTileY; + size_t endCellX = std::min(startCellX + mCellsPerTileX, endWorkX); + size_t endCellY = std::min(startCellY + mCellsPerTileY, endWorkY); + + // Call the derived class to do the specific work. + if (mPrefersDataAsOneRow && startCellX == 0 && endCellX == mSizeX) { + // When the tile covers entire rows, we can take advantage that some ops are not 2D. + processData(threadIndex, 0, startCellY, mSizeX * (endCellY - startCellY), startCellY + 1); + } else { + processData(threadIndex, startCellX, startCellY, endCellX, endCellY); + } +} + +TaskProcessor::TaskProcessor(unsigned int numThreads) + : mUsesSimd{cpuSupportsSimd()}, + /* If the requested number of threads is 0, we'll decide based on the number of cores. + * Through empirical testing, we've found that using more than 6 threads does not help. + * There may be more optimal choices to make depending on the SoC but we'll stick to + * this simple heuristic for now. + * + * We'll re-use the thread that calls the processor doTask method, so we'll spawn one less + * worker pool thread than the total number of threads. + */ + mNumberOfPoolThreads{numThreads ? numThreads - 1 + : std::min(6u, std::thread::hardware_concurrency() - 1)} { + for (size_t i = 0; i < mNumberOfPoolThreads; i++) { + mPoolThreads.emplace_back( + std::bind(&TaskProcessor::processTilesOfWork, this, i + 1, false)); + } +} + +TaskProcessor::~TaskProcessor() { + { + std::lock_guard<std::mutex> lock(mQueueMutex); + mStopThreads = true; + mWorkAvailableOrStop.notify_all(); + } + + for (auto& thread : mPoolThreads) { + thread.join(); + } +} + +void TaskProcessor::processTilesOfWork(int threadIndex, bool returnWhenNoWork) { + if (threadIndex != 0) { + // Set the name of the thread, except for thread 0, which is not part of the pool. + // PR_SET_NAME takes a maximum of 16 characters, including the terminating null. + char name[16]{"RenderScToolkit"}; + prctl(PR_SET_NAME, name, 0, 0, 0); + // ALOGI("Starting thread%d", threadIndex); + } + + std::unique_lock<std::mutex> lock(mQueueMutex); + while (true) { + mWorkAvailableOrStop.wait(lock, [this, returnWhenNoWork]() /*REQUIRES(mQueueMutex)*/ { + return mStopThreads || (mTilesNotYetStarted > 0) || + (returnWhenNoWork && (mTilesNotYetStarted == 0)); + }); + // ALOGI("Woke thread%d", threadIndex); + + // This ScopedLockAssertion is to help the compiler when it checks thread annotations + // to realize that we have the lock. It's however not completely true; we don't + // hold the lock while processing the tile. + // TODO Figure out how to fix that. 
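+        // (Added summary, not part of the original source: each worker claims a
+        // tile by decrementing mTilesNotYetStarted while holding mQueueMutex,
+        // releases the lock for the duration of processTile(), then re-takes it;
+        // once both mTilesNotYetStarted and mTilesInProcess reach zero,
+        // mWorkIsFinished is signalled so doTask() can return.)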
+ // android::base::ScopedLockAssertion lockAssert(mQueueMutex); + if (mStopThreads || (returnWhenNoWork && mTilesNotYetStarted == 0)) { + break; + } + + while (mTilesNotYetStarted > 0 && !mStopThreads) { + // This picks the tiles in decreasing order but that does not matter. + int myTile = --mTilesNotYetStarted; + mTilesInProcess++; + lock.unlock(); + { + // We won't be executing this code unless the main thread is + // holding the mTaskMutex lock, which guards mCurrentTask. + // The compiler can't figure this out. + // android::base::ScopedLockAssertion lockAssert(mTaskMutex); + mCurrentTask->processTile(threadIndex, myTile); + } + lock.lock(); + mTilesInProcess--; + if (mTilesInProcess == 0 && mTilesNotYetStarted == 0) { + mWorkIsFinished.notify_one(); + } + } + } + // if (threadIndex != 0) { + // ALOGI("Ending thread%d", threadIndex); + // } +} + +void TaskProcessor::doTask(Task* task) { + std::lock_guard<std::mutex> lockGuard(mTaskMutex); + task->setUsesSimd(mUsesSimd); + mCurrentTask = task; + // Notify the thread pool of available work. + startWork(task); + // Start processing some of the tiles on the calling thread. + processTilesOfWork(0, true); + // Wait for all the pool workers to complete. + waitForPoolWorkersToComplete(); + mCurrentTask = nullptr; +} + +void TaskProcessor::startWork(Task* task) { + /** + * The size in bytes that we're hoping each tile will be. If this value is too small, + * we'll spend too much time in synchronization. If it's too large, some cores may be + * idle while others still have a lot of work to do. Ideally, it would depend on the + * device we're running. 16k is the same value used by RenderScript and seems reasonable + * from ad-hoc tests. + */ + const size_t targetTileSize = 16 * 1024; + + std::lock_guard<std::mutex> lock(mQueueMutex); + assert(mTilesInProcess == 0); + mTilesNotYetStarted = task->setTiling(targetTileSize); + mWorkAvailableOrStop.notify_all(); +} + +void TaskProcessor::waitForPoolWorkersToComplete() { + std::unique_lock<std::mutex> lock(mQueueMutex); + // The predicate, i.e. the lambda, will make sure that + // we terminate even if the main thread calls this after + // mWorkIsFinished is signaled. + mWorkIsFinished.wait(lock, [this]() /*REQUIRES(mQueueMutex)*/ { + return mTilesNotYetStarted == 0 && mTilesInProcess == 0; + }); +} + +} // namespace renderscript diff --git a/renderscript-toolkit/src/main/cpp/TaskProcessor.h b/renderscript-toolkit/src/main/cpp/TaskProcessor.h new file mode 100644 index 0000000..0c59e25 --- /dev/null +++ b/renderscript-toolkit/src/main/cpp/TaskProcessor.h @@ -0,0 +1,262 @@ +/* + * Copyright (C) 2021 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef ANDROID_RENDERSCRIPT_TOOLKIT_TASKPROCESSOR_H +#define ANDROID_RENDERSCRIPT_TOOLKIT_TASKPROCESSOR_H + +// #include <android-base/thread_annotations.h> + +#include <atomic> +#include <condition_variable> +#include <cstddef> +#include <mutex> +#include <thread> +#include <vector> + +namespace renderscript { + +/** + * Description of the data to be processed for one Toolkit method call, e.g. one blur or one + * blend operation. + * + * The data to be processed is a 2D array of cells. Each cell is a vector of 1 to 4 unsigned bytes. + * The most typical configuration is a 2D array of uchar4 used to represent RGBA images. + * + * This is a base class. There will be a subclass for each Toolkit op. + * + * Typical usage of a derived class would look like: + * BlurTask task(in, out, sizeX, sizeY, vectorSize, etc); + * processor->doTask(&task); + * + * The TaskProcessor should call setTiling() and setUsesSimd() once, before calling processTile(). + * Other classes should not call setTiling(), setUsesSimd(), and processTile(). + */ +class Task { + protected: + /** + * Number of cells in the X direction. + */ + const size_t mSizeX; + /** + * Number of cells in the Y direction. + */ + const size_t mSizeY; + /** + * Number of elements in a vector (cell). From 1-4. + */ + const size_t mVectorSize; + /** + * Whether the task prefers the processData call to represent the work to be done as + * one line rather than a rectangle. This would be the case for work that don't involve + * vertical neighbors, e.g. blend or histogram. A task would prefer this to minimize the + * number of SIMD calls to make, i.e. have one call that covers all the rows. + * + * This setting will be used only when a tile covers the entire width of the data to be + * processed. + */ + const bool mPrefersDataAsOneRow; + /** + * Whether the processor we're working on supports SIMD operations. + */ + bool mUsesSimd = false; + + private: + /** + * If not null, we'll process a subset of the whole 2D array. This specifies the restriction. + */ + const struct Restriction* mRestriction; + + /** + * We'll divide the work into rectangular tiles. See setTiling(). + */ + + /** + * Size of a tile in the X direction, as a number of cells. + */ + size_t mCellsPerTileX = 0; + /** + * Size of a tile in the Y direction, as a number of cells. + */ + size_t mCellsPerTileY = 0; + /** + * Number of tiles per row of the restricted area we're working on. + */ + size_t mTilesPerRow = 0; + /** + * Number of tiles per column of the restricted area we're working on. + */ + size_t mTilesPerColumn = 0; + + public: + /** + * Construct a task. + * + * sizeX and sizeY should be greater than 0. vectorSize should be between 1 and 4. + * The restriction should outlive this instance. The Toolkit validates the + * arguments so we won't do that again here. + */ + Task(size_t sizeX, size_t sizeY, size_t vectorSize, bool prefersDataAsOneRow, + const Restriction* restriction) + : mSizeX{sizeX}, + mSizeY{sizeY}, + mVectorSize{vectorSize}, + mPrefersDataAsOneRow{prefersDataAsOneRow}, + mRestriction{restriction} {} + virtual ~Task() {} + + void setUsesSimd(bool uses) { mUsesSimd = uses; } + + /** + * Divide the work into a number of tiles that can be distributed to the various threads. + * A tile will be a rectangular region. To be robust, we'll want to handle regular cases + * like 400x300 but also unusual ones like 1x120000, 120000x1, 1x1. 
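+ *
+ * (Worked example added here, not part of the original source, assuming a
+ * 400x300 uchar4 task and the default 16 KiB target: targetCellsPerTile =
+ * 16384 / 4 = 4096, so a whole row fits in one tile (mTilesPerRow = 1,
+ * mCellsPerTileX = 400); targetRowsPerTile = ceil(4096 / 400) = 11, giving
+ * mTilesPerColumn = ceil(300 / 11) = 28 and mCellsPerTileY = ceil(300 / 28)
+ * = 11, i.e. 28 tiles of roughly 400x11 cells, about 17 KB each.)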
+ * + * We have a target size for the tiles, which corresponds roughly to how much data a thread + * will want to process before checking for more work. If the target is set too low, we'll spend + * more time in synchronization. If it's too large, some cores may not be used as efficiently. + * + * This method returns the number of tiles. + * + * @param targetTileSizeInBytes Target size. Values less than 1000 will be treated as 1000. + */ + int setTiling(unsigned int targetTileSizeInBytes); + + /** + * This is called by the TaskProcessor to instruct the task to process a tile. + * + * @param threadIndex The index of the thread that's processing the tile. + * @param tileIndex The index of the tile to process. + */ + void processTile(unsigned int threadIndex, size_t tileIndex); + + private: + /** + * Call to the derived class to process the data bounded by the rectangle specified + * by (startX, startY) and (endX, endY). The end values are EXCLUDED. This rectangle + * will be contained with the restriction, if one is provided. + */ + virtual void processData(int threadIndex, size_t startX, size_t startY, size_t endX, + size_t endY) = 0; +}; + +/** + * There's one instance of the task processor for the Toolkit. This class owns the thread pool, + * and dispatches the tiles of work to the threads. + */ +class TaskProcessor { + /** + * Does this processor support SIMD-like instructions? + */ + const bool mUsesSimd; + /** + * The number of separate threads we'll spawn. It's one less than the number of threads that + * do the work as the client thread that starts the work will also be used. + */ + const unsigned int mNumberOfPoolThreads; + /** + * Ensures that only one task is done at a time. + */ + std::mutex mTaskMutex; + /** + * Ensures consistent access to the shared queue state. + */ + std::mutex mQueueMutex; + /** + * The thread pool workers. + */ + std::vector<std::thread> mPoolThreads; + /** + * The task being processed, if any. We only do one task at a time. We could create a queue + * of tasks but using a mTaskMutex is sufficient for now. + */ + Task* mCurrentTask /*GUARDED_BY(mTaskMutex)*/ = nullptr; + /** + * Signals that the mPoolThreads should terminate. + */ + bool mStopThreads /*GUARDED_BY(mQueueMutex)*/ = false; + /** + * Signaled when work is available or the mPoolThreads need to shut down. mStopThreads is used + * to distinguish between the two. + */ + std::condition_variable mWorkAvailableOrStop; + /** + * Signaled when the work for the task is finished. + */ + std::condition_variable mWorkIsFinished; + /** + * A user task, e.g. a blend or a blur, is split into a number of tiles. When a thread starts + * working on a new tile, it uses this count to identify which tile to work on. The tile + * number is sufficient to determine the boundaries of the data to process. + * + * The number of tiles left to process. + */ + int mTilesNotYetStarted /*GUARDED_BY(mQueueMutex)*/ = 0; + /** + * The number of tiles currently being processed. Must not be greater than + * mNumberOfPoolThreads + 1. + */ + int mTilesInProcess /*GUARDED_BY(mQueueMutex)*/ = 0; + + /** + * Determines how we'll tile the work and signals the thread pool of available work. + * + * @param task The task to be performed. + */ + void startWork(Task* task) /*REQUIRES(mTaskMutex)*/; + + /** + * Tells the thread to start processing work off the queue. 
+ * + * The flag is used for prevent the main thread from blocking forever if the work is + * so trivial that the worker threads complete the work before the main thread calls this + * method. + * + * @param threadIndex The index number (0..mNumberOfPoolThreads) this thread will referred by. + * @param returnWhenNoWork If there's no work, return immediately. + */ + void processTilesOfWork(int threadIndex, bool returnWhenNoWork); + + /** + * Wait for the pool workers to complete the work on the current task. + */ + void waitForPoolWorkersToComplete(); + + public: + /** + * Create the processor. + * + * @param numThreads The total number of threads to use. If 0, we'll decided based on system + * properties. + */ + explicit TaskProcessor(unsigned int numThreads = 0); + + ~TaskProcessor(); + + /** + * Do the specified task. Returns only after the task has been completed. + */ + void doTask(Task* task); + + /** + * Some Tasks need to allocate temporary storage for each worker thread. + * This provides the number of threads. + */ + unsigned int getNumberOfThreads() const { return mNumberOfPoolThreads + 1; } +}; + +} // namespace renderscript + +#endif // ANDROID_RENDERSCRIPT_TOOLKIT_TASKPROCESSOR_H diff --git a/renderscript-toolkit/src/main/cpp/Utils.cpp b/renderscript-toolkit/src/main/cpp/Utils.cpp new file mode 100644 index 0000000..f1b33ba --- /dev/null +++ b/renderscript-toolkit/src/main/cpp/Utils.cpp @@ -0,0 +1,79 @@ +/* + * Copyright (C) 2021 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "Utils.h" + +#include <cpu-features.h> + +#include "RenderScriptToolkit.h" + +namespace renderscript { + +#define LOG_TAG "renderscript.toolkit.Utils" + +bool cpuSupportsSimd() { + AndroidCpuFamily family = android_getCpuFamily(); + uint64_t features = android_getCpuFeatures(); + + if (family == ANDROID_CPU_FAMILY_ARM && (features & ANDROID_CPU_ARM_FEATURE_NEON)) { + // ALOGI("Arm with Neon"); + return true; + } else if (family == ANDROID_CPU_FAMILY_ARM64 && (features & ANDROID_CPU_ARM64_FEATURE_ASIMD)) { + // ALOGI("Arm64 with ASIMD"); + return true; + } else if ((family == ANDROID_CPU_FAMILY_X86 || family == ANDROID_CPU_FAMILY_X86_64) && + (features & ANDROID_CPU_X86_FEATURE_SSSE3)) { + // ALOGI("x86* with SSE3"); + return true; + } + // ALOGI("Not simd"); + return false; +} + +#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE +bool validRestriction(const char* tag, size_t sizeX, size_t sizeY, const Restriction* restriction) { + if (restriction == nullptr) { + return true; + } + if (restriction->startX >= sizeX || restriction->endX > sizeX) { + ALOGE("%s. sizeX should be greater than restriction->startX and greater or equal to " + "restriction->endX. %zu, %zu, and %zu were provided respectively.", + tag, sizeX, restriction->startX, restriction->endY); + return false; + } + if (restriction->startY >= sizeY && restriction->endY > sizeY) { + ALOGE("%s. sizeY should be greater than restriction->startY and greater or equal to " + "restriction->endY. 
%zu, %zu, and %zu were provided respectively.", + tag, sizeY, restriction->startY, restriction->endY); + return false; + } + if (restriction->startX >= restriction->endX) { + ALOGE("%s. Restriction startX should be less than endX. " + "%zu and %zu were provided respectively.", + tag, restriction->startX, restriction->endX); + return false; + } + if (restriction->startY >= restriction->endY) { + ALOGE("%s. Restriction startY should be less than endY. " + "%zu and %zu were provided respectively.", + tag, restriction->startY, restriction->endY); + return false; + } + return true; +} +#endif + +} // namespace renderscript diff --git a/renderscript-toolkit/src/main/cpp/Utils.h b/renderscript-toolkit/src/main/cpp/Utils.h new file mode 100644 index 0000000..01c3798 --- /dev/null +++ b/renderscript-toolkit/src/main/cpp/Utils.h @@ -0,0 +1,155 @@ +/* + * Copyright (C) 2021 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef ANDROID_RENDERSCRIPT_TOOLKIT_UTILS_H +#define ANDROID_RENDERSCRIPT_TOOLKIT_UTILS_H + +#include <android/log.h> +#include <stddef.h> + +namespace renderscript { + +/* The Toolkit does not support floating point buffers but the original RenderScript Intrinsics + * did for some operations. That code was preserved and protected by + * ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT. + */ +// TODO: On final packaging, decide whether this should be define in the build file, and for which +// config. +// #define ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT + +/* If we release the Toolkit as a C++ API, we'll want to enable validation at the C++ level + * by uncommenting this define. + * + * If we only have a Java/Kotlin API, the Kotlin layer does validation. We don't need to duplicate + * this effort. + */ +#define ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE + +#define ALOGI(...) __android_log_print(ANDROID_LOG_INFO, LOG_TAG, __VA_ARGS__) +#define ALOGW(...) __android_log_print(ANDROID_LOG_WARN, LOG_TAG, __VA_ARGS__) +#define ALOGE(...) 
__android_log_print(ANDROID_LOG_ERROR, LOG_TAG, __VA_ARGS__) + +using uchar = unsigned char; +using uint = unsigned int; +using ushort = unsigned short; + +using uint8_t = uchar; +using uint16_t = ushort; +using uint32_t = uint; + +typedef float float2 __attribute__((ext_vector_type(2))); +typedef float float3 __attribute__((ext_vector_type(3))); +typedef float float4 __attribute__((ext_vector_type(4))); +typedef uchar uchar2 __attribute__((ext_vector_type(2))); +typedef uchar uchar3 __attribute__((ext_vector_type(3))); +typedef uchar uchar4 __attribute__((ext_vector_type(4))); +typedef ushort ushort2 __attribute__((ext_vector_type(2))); +typedef ushort ushort3 __attribute__((ext_vector_type(3))); +typedef ushort ushort4 __attribute__((ext_vector_type(4))); +typedef uint uint2 __attribute__((ext_vector_type(2))); +typedef uint uint3 __attribute__((ext_vector_type(3))); +typedef uint uint4 __attribute__((ext_vector_type(4))); +typedef short short2 __attribute__((ext_vector_type(2))); +typedef short short3 __attribute__((ext_vector_type(3))); +typedef short short4 __attribute__((ext_vector_type(4))); +typedef int int2 __attribute__((ext_vector_type(2))); +typedef int int3 __attribute__((ext_vector_type(3))); +typedef int int4 __attribute__((ext_vector_type(4))); + +template <typename TO, typename TI> +inline TO convert(TI i) { + // assert(i.x >= 0 && i.y >= 0 && i.z >= 0 && i.w >= 0); + // assert(i.x <= 255 && i.y <= 255 && i.z <= 255 && i.w <= 255); + return __builtin_convertvector(i, TO); +} + +template <> +inline uchar convert(float i) { + // assert(i.x >= 0 && i.y >= 0 && i.z >= 0 && i.w >= 0); + // assert(i.x <= 255 && i.y <= 255 && i.z <= 255 && i.w <= 255); + return (uchar)i; +} + +template <> +inline float convert(uchar i) { + // assert(i.x >= 0 && i.y >= 0 && i.z >= 0 && i.w >= 0); + // assert(i.x <= 255 && i.y <= 255 && i.z <= 255 && i.w <= 255); + return (float)i; +} + +inline int4 clamp(int4 amount, int low, int high) { + int4 r; + r.x = amount.x < low ? low : (amount.x > high ? high : amount.x); + r.y = amount.y < low ? low : (amount.y > high ? high : amount.y); + r.z = amount.z < low ? low : (amount.z > high ? high : amount.z); + r.w = amount.w < low ? low : (amount.w > high ? high : amount.w); + return r; +} + +inline float4 clamp(float4 amount, float low, float high) { + float4 r; + r.x = amount.x < low ? low : (amount.x > high ? high : amount.x); + r.y = amount.y < low ? low : (amount.y > high ? high : amount.y); + r.z = amount.z < low ? low : (amount.z > high ? high : amount.z); + r.w = amount.w < low ? low : (amount.w > high ? high : amount.w); + return r; +} + +inline int2 clamp(int2 amount, int low, int high) { + int2 r; + r.x = amount.x < low ? low : (amount.x > high ? high : amount.x); + r.y = amount.y < low ? low : (amount.y > high ? high : amount.y); + return r; +} + +inline float2 clamp(float2 amount, float low, float high) { + float2 r; + r.x = amount.x < low ? low : (amount.x > high ? high : amount.x); + r.y = amount.y < low ? low : (amount.y > high ? high : amount.y); + return r; +} + +inline int clamp(int amount, int low, int high) { + return amount < low ? low : (amount > high ? high : amount); +} + +inline float clamp(float amount, float low, float high) { + return amount < low ? low : (amount > high ? 
high : amount); +} + +#ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE +struct Restriction; + +bool validRestriction(const char* tag, size_t sizeX, size_t sizeY, const Restriction* restriction); +#endif + +/** + * Returns true if the processor we're running on supports the SIMD instructions that are + * used in our assembly code. + */ +bool cpuSupportsSimd(); + +inline size_t divideRoundingUp(size_t a, size_t b) { + return a / b + (a % b == 0 ? 0 : 1); +} + +inline size_t paddedSize(size_t size) { + return size == 3 ? 4 : size; +} + +} // namespace renderscript + +#endif // ANDROID_RENDERSCRIPT_TOOLKIT_UTILS_H diff --git a/renderscript-toolkit/src/main/cpp/YuvToRgb.cpp b/renderscript-toolkit/src/main/cpp/YuvToRgb.cpp new file mode 100644 index 0000000..741bcc4 --- /dev/null +++ b/renderscript-toolkit/src/main/cpp/YuvToRgb.cpp @@ -0,0 +1,239 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <cstdint> + +#include "RenderScriptToolkit.h" +#include "TaskProcessor.h" +#include "Utils.h" + +#define LOG_TAG "renderscript.toolkit.YuvToRgb" + +namespace renderscript { + +inline size_t roundUpTo16(size_t val) { + return (val + 15u) & ~15u; +} + +class YuvToRgbTask : public Task { + uchar4* mOut; + size_t mCstep; + size_t mStrideY; + size_t mStrideU; + size_t mStrideV; + const uchar* mInY; + const uchar* mInU; + const uchar* mInV; + + void kernel(uchar4* out, uint32_t xstart, uint32_t xend, uint32_t currentY); + // Process a 2D tile of the overall work. threadIndex identifies which thread does the work. 
+ void processData(int threadIndex, size_t startX, size_t startY, size_t endX, + size_t endY) override; + + public: + YuvToRgbTask(const uint8_t* input, uint8_t* output, size_t sizeX, size_t sizeY, + RenderScriptToolkit::YuvFormat format) + : Task{sizeX, sizeY, 4, false, nullptr}, mOut{reinterpret_cast<uchar4*>(output)} { + switch (format) { + case RenderScriptToolkit::YuvFormat::NV21: + mCstep = 2; + mStrideY = sizeX; + mStrideU = mStrideY; + mStrideV = mStrideY; + mInY = reinterpret_cast<const uchar*>(input); + mInV = reinterpret_cast<const uchar*>(input + mStrideY * sizeY); + mInU = mInV + 1; + break; + case RenderScriptToolkit::YuvFormat::YV12: + mCstep = 1; + mStrideY = roundUpTo16(sizeX); + mStrideU = roundUpTo16(mStrideY >> 1u); + mStrideV = mStrideU; + mInY = reinterpret_cast<const uchar*>(input); + mInU = reinterpret_cast<const uchar*>(input + mStrideY * sizeY); + mInV = mInU + mStrideV * sizeY / 2; + break; + } + } +}; + +void YuvToRgbTask::processData(int /* threadIndex */, size_t startX, size_t startY, size_t endX, + size_t endY) { + for (size_t y = startY; y < endY; y++) { + size_t offset = mSizeX * y + startX; + uchar4* out = mOut + offset; + kernel(out, startX, endX, y); + } +} + +static uchar4 rsYuvToRGBA_uchar4(uchar y, uchar u, uchar v) { + int16_t Y = ((int16_t)y) - 16; + int16_t U = ((int16_t)u) - 128; + int16_t V = ((int16_t)v) - 128; + + short4 p; + p.x = (Y * 298 + V * 409 + 128) >> 8; + p.y = (Y * 298 - U * 100 - V * 208 + 128) >> 8; + p.z = (Y * 298 + U * 516 + 128) >> 8; + p.w = 255; + if(p.x < 0) { + p.x = 0; + } + if(p.x > 255) { + p.x = 255; + } + if(p.y < 0) { + p.y = 0; + } + if(p.y > 255) { + p.y = 255; + } + if(p.z < 0) { + p.z = 0; + } + if(p.z > 255) { + p.z = 255; + } + + return (uchar4){static_cast<uchar>(p.x), static_cast<uchar>(p.y), + static_cast<uchar>(p.z), static_cast<uchar>(p.w)}; +} + +extern "C" void rsdIntrinsicYuv_K(void *dst, const uchar *Y, const uchar *uv, uint32_t xstart, + size_t xend); +extern "C" void rsdIntrinsicYuvR_K(void *dst, const uchar *Y, const uchar *uv, uint32_t xstart, + size_t xend); +extern "C" void rsdIntrinsicYuv2_K(void *dst, const uchar *Y, const uchar *u, const uchar *v, + size_t xstart, size_t xend); + +void YuvToRgbTask::kernel(uchar4 *out, uint32_t xstart, uint32_t xend, uint32_t currentY) { + //ALOGI("kernel out %p, xstart=%u, xend=%u, currentY=%u", out, xstart, xend, currentY); + + const uchar *y = mInY + (currentY * mStrideY); + const uchar *v = mInV + ((currentY >> 1) * mStrideV); + const uchar *u = mInU + ((currentY >> 1) * mStrideU); + + //ALOGI("pinY %p, pinV %p, pinU %p", pinY, pinV, pinU); + + uint32_t x1 = xstart; + uint32_t x2 = xend; + + /* + ALOGE("pinY, %p, Y, %p, currentY, %d, strideY, %zu", pinY, y, currentY, mStrideY); + ALOGE("pinU, %p, U, %p, currentY, %d, strideU, %zu", pinU, u, currentY, mStrideU); + ALOGE("pinV, %p, V, %p, currentY, %d, strideV, %zu", pinV, v, currentY, mStrideV); + ALOGE("dimX, %d, dimY, %d", cp->alloc->mHal.drvState.lod[0].dimX, + cp->alloc->mHal.drvState.lod[0].dimY); + ALOGE("info->dim.x, %d, info->dim.y, %d", info->dim.x, info->dim.y); + uchar* pinY = (uchar*)mInY; + uchar* pinU = (uchar*)mInU; + uchar* pinV = (uchar*)mInV; + ALOGE("Y %p %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx " + "%02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx", + pinY, pinY[0], pinY[1], pinY[2], pinY[3], pinY[4], pinY[5], pinY[6], pinY[7], pinY[8], + pinY[9], pinY[10], pinY[11], pinY[12], pinY[13], pinY[14], pinY[15]); + ALOGE("Y %p %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx 
%02hhx %02hhx %02hhx " + "%02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx", + pinY, pinY[16], pinY[17], pinY[18], pinY[19], pinY[20], pinY[21], pinY[22], pinY[23], + pinY[24], pinY[25], pinY[26], pinY[27], pinY[28], pinY[29], pinY[30], pinY[31]); + ALOGE("Y %p %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx " + "%02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx", + pinY, pinY[32], pinY[33], pinY[34], pinY[35], pinY[36], pinY[37], pinY[38], pinY[39], + pinY[40], pinY[41], pinY[42], pinY[43], pinY[44], pinY[45], pinY[46], pinY[47]); + + ALOGE("U %p %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx " + "%02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx", + pinU, pinU[0], pinU[1], pinU[2], pinU[3], pinU[4], pinU[5], pinU[6], pinU[7], pinU[8], + pinU[9], pinU[10], pinU[11], pinU[12], pinU[13], pinU[14], pinU[15]); + ALOGE("U %p %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx " + "%02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx", + pinU, pinU[16], pinU[17], pinU[18], pinU[19], pinU[20], pinU[21], pinU[22], pinU[23], + pinU[24], pinU[25], pinU[26], pinU[27], pinU[28], pinU[29], pinU[30], pinU[31]); + ALOGE("U %p %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx " + "%02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx", + pinU, pinU[32], pinU[33], pinU[34], pinU[35], pinU[36], pinU[37], pinU[38], pinU[39], + pinU[40], pinU[41], pinU[42], pinU[43], pinU[44], pinU[45], pinU[46], pinU[47]); + + ALOGE("V %p %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx " + "%02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx", + pinV, pinV[0], pinV[1], pinV[2], pinV[3], pinV[4], pinV[5], pinV[6], pinV[7], pinV[8], + pinV[9], pinV[10], pinV[11], pinV[12], pinV[13], pinV[14], pinV[15]); + ALOGE("V %p %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx " + "%02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx", + pinV, pinV[16], pinV[17], pinV[18], pinV[19], pinV[20], pinV[21], pinV[22], pinV[23], + pinV[24], pinV[25], pinV[26], pinV[27], pinV[28], pinV[29], pinV[30], pinV[31]); + ALOGE("V %p %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx " + "%02hhx %02hhx %02hhx %02hhx %02hhx %02hhx %02hhx", + pinV, pinV[32], pinV[33], pinV[34], pinV[35], pinV[36], pinV[37], pinV[38], pinV[39], + pinV[40], pinV[41], pinV[42], pinV[43], pinV[44], pinV[45], pinV[46], pinV[47]); + */ + + /* If we start on an odd pixel then deal with it here and bump things along + * so that subsequent code can carry on with even-odd pairing assumptions. 
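+ * (Added note, not part of the original source: chroma is subsampled 2:1
+ * horizontally, so the u/v sample at index (x >> 1) * mCstep is shared by an
+ * even/odd pair of luma samples; emitting a stray leading odd pixel here
+ * restores that pairing for the vector and pairwise scalar loops below.)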
+ */ + if((x1 & 1) && (x2 > x1)) { + int cx = (x1 >> 1) * mCstep; + *out = rsYuvToRGBA_uchar4(y[x1], u[cx], v[cx]); + out++; + x1++; + } + +#if defined(ARCH_ARM_USE_INTRINSICS) + if((x2 > x1) && mUsesSimd) { + int32_t len = x2 - x1; + if (mCstep == 1) { + rsdIntrinsicYuv2_K(out, y, u, v, x1, x2); + x1 += len; + out += len; + } else if (mCstep == 2) { + // Check for proper interleave + intptr_t ipu = (intptr_t)u; + intptr_t ipv = (intptr_t)v; + + if (ipu == (ipv + 1)) { + rsdIntrinsicYuv_K(out, y, v, x1, x2); + x1 += len; + out += len; + } else if (ipu == (ipv - 1)) { + rsdIntrinsicYuvR_K(out, y, u, x1, x2); + x1 += len; + out += len; + } + } + } +#endif + + if(x2 > x1) { + // ALOGE("y %i %i %i", currentY, x1, x2); + while(x1 < x2) { + int cx = (x1 >> 1) * mCstep; + *out = rsYuvToRGBA_uchar4(y[x1], u[cx], v[cx]); + out++; + x1++; + *out = rsYuvToRGBA_uchar4(y[x1], u[cx], v[cx]); + out++; + x1++; + } + } +} + +void RenderScriptToolkit::yuvToRgb(const uint8_t* input, uint8_t* output, size_t sizeX, + size_t sizeY, YuvFormat format) { + YuvToRgbTask task(input, output, sizeX, sizeY, format); + processor->doTask(&task); +} + +} // namespace renderscript diff --git a/renderscript-toolkit/src/main/cpp/YuvToRgb_advsimd.S b/renderscript-toolkit/src/main/cpp/YuvToRgb_advsimd.S new file mode 100644 index 0000000..bb4b7ae --- /dev/null +++ b/renderscript-toolkit/src/main/cpp/YuvToRgb_advsimd.S @@ -0,0 +1,377 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f: +#define END(f) .size f, .-f; + +/* Perform the actual YuvToRGB conversion in a macro, from register to + * register. This macro will be called from within several different wrapper + * variants for different data layouts. Y data starts with the even and odd + * bytes split into the low parts of v8 and v9 respectively. U and V are in + * v10 and v11. Working constants are pre-loaded into v24-v31, and v3 and v7 + * are pre-loaded with a constant 0xff alpha channel. + * + * The complicated arithmetic is the result of refactoring the original + * equations to avoid 16-bit overflow without losing any precision. 
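+ *
+ * (Added interpretation, not part of the original source: the byte constants
+ * 149, 50, 104, 204 and 254 are the half-scale counterparts of the 298, 100,
+ * 208, 409 and 516 coefficients used by the scalar conversion, with the
+ * residual v/2 and u*4 supplied by the explicit (v >> 1) and (u << 2) terms;
+ * keeping the products at half scale is what holds the intermediates within
+ * 16 bits, and the final uqrshrn #6 (#7 for green) restores the overall /256
+ * scaling.)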
+ */ +.macro yuvkern, regu=v10, regv=v11 + /* v0 out R_lo / even R_lo accumulator + * v1 out G_lo / even G_lo accumulator + * v2 out B_lo / even B_lo accumulator + * v3 out A_lo / const 0xff*ff + * v4 out R_hi / even R_hi accumulator + * v5 out G_hi / even G_hi accumulator + * v6 out B_hi / even B_hi accumulator + * v7 out A_hi / const 0xff*ff + * v8 even Y / G_lo luma tmp + * v9 odd Y / G_lo luma tmp + * \regu in U + * \regv in V + * v12 R_lo luma tmp + * v13 B_lo luma tmp + * v14 R_hi luma tmp + * v15 B_hi luma tmp + * v16 odd R_lo accumulator + * v17 odd G_lo accumulator + * v18 odd B_lo accumulator + * v19 multiplier extra bits low + * v20 odd R_hi accumulator + * v21 odd G_hi accumulator + * v22 odd B_hi accumulator + * v23 multiplier extra bits high + * v24 constant 149 + * v25 constant 50 + * v26 constant 104 + * v27 constant 204 + * v28 constant 254 + * v29 constant ((16 * 149 + (128 >> 1) + 128 * 204) >> 1) + * v30 constant ((-16 * 149 + 128 * 50 + 128 * 104) >> 0) + * v31 constant ((16 * 149 + (128 << 2) + 128 * 254) >> 1) + */ + + umull v1.8h, v8.8b, v24.8b // g0 = y0 * 149 + umull v17.8h, v9.8b, v24.8b // g1 = y1 * 149 + umull2 v5.8h, v8.16b, v24.16b // g0_hi = y0_hi * 149 + umull2 v21.8h, v9.16b, v24.16b // g1_hi = y1_hi * 149 + + umull v8.8h, \regu\().8b, v25.8b // g2 = u * 50 + v * 104 + umlal v8.8h, \regv\().8b, v26.8b + umull2 v9.8h, \regu\().16b, v25.16b // g2_hi = u_hi * 50 + v_hi * 104 + umlal2 v9.8h, \regv\().16b, v26.16b + + ushr v19.16b, \regv\().16b, #1 + uaddw v0.8h, v1.8h, v19.8b // r0 = g0 + (v >> 1) + uaddw v16.8h, v17.8h, v19.8b // r1 = g1 + (v >> 1) + + uaddw2 v4.8h, v5.8h, v19.16b // r0_hi = g0_hi + (v_hi >> 1) + uaddw2 v20.8h, v21.8h, v19.16b // r1_hi = g1_hi + (v_hi >> 1) + + ushll v19.8h, \regu\().8b, #2 + ushll2 v23.8h, \regu\().16b, #2 + add v2.8h, v1.8h, v19.8h // b0 = g0 + (u << 2) + add v18.8h, v17.8h, v19.8h // b1 = g1 + (u << 2) + + add v6.8h, v5.8h, v23.8h // b0_hi = g0_hi + (u_hi << 2) + add v22.8h, v21.8h, v23.8h // b1_hi = g1_hi + (u_hi << 2) + + umull v12.8h, \regv\().8b, v27.8b // r2 = v * 204 + umull v13.8h, \regu\().8b, v28.8b // b2 = u * 254 + + umull2 v14.8h, \regv\().16b, v27.16b // r2_hi = v_hi * 204 + umull2 v15.8h, \regu\().16b, v28.16b // b2_hi = u_hi * 254 + + uhadd v0.8h, v0.8h, v12.8h // r0 = (r0 + r2) >> 1 + uhadd v16.8h, v16.8h, v12.8h // r1 = (r1 + r2) >> 1 + uqadd v1.8h, v1.8h, v30.8h // g0 = satu16(g0 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0) + uqadd v17.8h, v17.8h, v30.8h // g1 = satu16(g1 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0) + uhadd v2.8h, v2.8h, v13.8h // b0 = (b0 + b2) >> 1 + uhadd v18.8h, v18.8h, v13.8h // b1 = (b1 + b2) >> 1 + + uhadd v4.8h, v4.8h, v14.8h // r0_hi = (r0_hi + r2_hi) >> 1 + uhadd v20.8h, v20.8h, v14.8h // r1_hi = (r1_hi + r2_hi) >> 1 + uqadd v5.8h, v5.8h, v30.8h // g0_hi = satu16(g0_hi + (-16 * 149 + 128 * 50 + 128 * 104) >> 0) + uqadd v21.8h, v21.8h, v30.8h // g1_hi = satu16(g1_hi + (-16 * 149 + 128 * 50 + 128 * 104) >> 0) + uhadd v6.8h, v6.8h, v15.8h // b0_hi = (b0_hi + b2_hi) >> 1 + uhadd v22.8h, v22.8h, v15.8h // b1_hi = (b1_hi + b2_hi) >> 1 + + uqsub v0.8h, v0.8h, v29.8h // r0 = satu16(r0 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1) + uqsub v16.8h, v16.8h, v29.8h // r1 = satu16(r1 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1) + uqsub v1.8h, v1.8h, v8.8h // g0 = satu16(g0 - g2) + uqsub v17.8h, v17.8h, v8.8h // g1 = satu16(g1 - g2) + uqsub v2.8h, v2.8h, v31.8h // b0 = satu16(b0 - (16 * 149 + (128 << 2) + 128 * 254) >> 1) + uqsub v18.8h, v18.8h, v31.8h // b1 = satu16(b1 - (16 * 149 + (128 << 2) + 128 
* 254) >> 1) + + uqsub v4.8h, v4.8h, v29.8h // r0_hi = satu16(r0_hi - (16 * 149 + (128 >> 1) + 128 * 204) >> 1) + uqsub v20.8h, v20.8h, v29.8h // r1_hi = satu16(r1_hi - (16 * 149 + (128 >> 1) + 128 * 204) >> 1) + uqsub v5.8h, v5.8h, v9.8h // g0_hi = satu16(g0_hi - g2_hi) + uqsub v21.8h, v21.8h, v9.8h // g1_hi = satu16(g1_hi - g2_hi) + uqsub v6.8h, v6.8h, v31.8h // b0_hi = satu16(b0_hi - (16 * 149 + (128 << 2) + 128 * 254) >> 1) + uqsub v22.8h, v22.8h, v31.8h // b1_hi = satu16(b1_hi - (16 * 149 + (128 << 2) + 128 * 254) >> 1) + + uqrshrn v0.8b, v0.8h, #6 + uqrshrn v16.8b, v16.8h, #6 + uqrshrn v1.8b, v1.8h, #7 + uqrshrn v17.8b, v17.8h, #7 + uqrshrn v2.8b, v2.8h, #6 + uqrshrn v18.8b, v18.8h, #6 + + uqrshrn v4.8b, v4.8h, #6 + uqrshrn v20.8b, v20.8h, #6 + uqrshrn v5.8b, v5.8h, #7 + uqrshrn v21.8b, v21.8h, #7 + uqrshrn v6.8b, v6.8h, #6 + uqrshrn v22.8b, v22.8h, #6 + + zip1 v0.16b, v0.16b, v16.16b + zip1 v1.16b, v1.16b, v17.16b + zip1 v2.16b, v2.16b, v18.16b + + zip1 v4.16b, v4.16b, v20.16b + zip1 v5.16b, v5.16b, v21.16b + zip1 v6.16b, v6.16b, v22.16b +.endm + +/* Define the wrapper code which will load and store the data, iterate the + * correct number of times, and safely handle the remainder at the end of the + * loop. Some sections of code are switched out depending on the data packing + * being handled. + */ +.macro wrap_line kernel, interleaved=0, swapuv=0 + movi v24.16b, #149 + movi v25.16b, #50 + movi v26.16b, #104 + movi v27.16b, #204 + movi v28.16b, #254 + mov w5, #((16 * 149 + (128 >> 1) + 128 * 204) >> 1) + dup v29.8h, w5 + mov w5, #((-16 * 149 + 128 * 50 + 128 * 104) >> 0) + dup v30.8h, w5 + mov w5, #((16 * 149 + (128 << 2) + 128 * 254) >> 1) + dup v31.8h, w5 + + movi v3.16b, #0xff + movi v7.16b, #0xff + + subs x2, x2, #32 + bhs 1f + b 2f + + .align 4 +1: ld2 {v8.16b,v9.16b}, [x1], #32 + .if \interleaved + ld2 {v10.16b,v11.16b}, [x3], #32 + .else + ld1 {v10.16b}, [x3], #16 + ld1 {v11.16b}, [x4], #16 + .endif + + .if \swapuv + \kernel regu=v11, regv=v10 + .else + \kernel + .endif + + subs x2, x2, #32 + + st4 {v0.16b - v3.16b}, [x0], #64 + st4 {v4.16b - v7.16b}, [x0], #64 + + bhs 1b + +2: adds x2, x2, #32 + beq 2f + + /* To handle the tail portion of the data (something less than 32 + * bytes) load small power-of-two chunks into working registers. It + * doesn't matter where they end up in the register; the same process + * will store them back out using the same positions and the + * interaction between neighbouring pixels is constrained to odd + * boundaries where the load operations don't interfere. 
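+ *
+ * (Added example, not part of the original source: the tbz tests below each
+ * examine one bit of the remaining pixel count, so a 21-pixel tail is
+ * fetched as a 16-, a 4- and a 1-pixel chunk, each landing in a fixed
+ * sub-register slot.)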
+ */ + movi v8.8b, #0 + movi v9.8b, #0 + movi v10.8b, #0 + movi v11.8b, #0 + + tbz x2, #4, 1f + ld1 {v9.16b}, [x1], #16 + .if \interleaved + ld1 {v11.16b}, [x3], #16 + .else + ld1 {v10.d}[1], [x3], #8 + ld1 {v11.d}[1], [x4], #8 + .endif +1: tbz x2, #3, 1f + ld1 {v8.d}[1], [x1], #8 + .if \interleaved + ld1 {v10.d}[1], [x3], #8 + .else + ld1 {v10.s}[1], [x3], #4 + ld1 {v11.s}[1], [x4], #4 + .endif +1: tbz x2, #2, 1f + ld1 {v8.s}[1], [x1], #4 + .if \interleaved + ld1 {v10.s}[1], [x3], #4 + .else + ld1 {v10.h}[1], [x3], #2 + ld1 {v11.h}[1], [x4], #2 + .endif +1: tbz x2, #1, 1f + ld1 {v8.h}[1], [x1], #2 + .if \interleaved + ld1 {v10.h}[1], [x3], #2 + .else + ld1 {v10.b}[1], [x3], #1 + ld1 {v11.b}[1], [x4], #1 + .endif +1: tbz x2, #0, 1f + ld1 {v8.b}[1], [x1], #1 + .if \interleaved + ld1 {v10.h}[0], [x3], #2 + .else + ld1 {v10.b}[0], [x3], #1 + ld1 {v11.b}[0], [x4], #1 + .endif + + /* One small impediment in the process above is that some of the load + * operations can't perform byte-wise structure deinterleaving at the + * same time as loading only part of a register. So the data is loaded + * linearly and unpacked manually at this point if necessary. + */ +1: mov v12.16b, v8.16b + uzp1 v8.16b, v12.16b, v9.16b + uzp2 v9.16b, v12.16b, v9.16b + .if \interleaved + mov v12.16b, v10.16b + uzp1 v10.16b, v12.16b, v11.16b + uzp2 v11.16b, v12.16b, v11.16b + .endif + + .if \swapuv + \kernel regu=v11, regv=v10 + .else + \kernel + .endif + + /* As above but with the output; structured stores for partial vectors + * aren't available, so the data is re-packed first and stored linearly. + */ + zip1 v16.16b, v0.16b, v2.16b + zip2 v18.16b, v0.16b, v2.16b + zip1 v17.16b, v1.16b, v3.16b + zip2 v19.16b, v1.16b, v3.16b + zip1 v0.16b, v16.16b, v17.16b + zip2 v1.16b, v16.16b, v17.16b + zip1 v2.16b, v18.16b, v19.16b + zip2 v3.16b, v18.16b, v19.16b + + /* Luckily v4-v7 don't need to be unzipped because the complete set of + * four and can be stored using st4. 
*/ + + tbz x2, #4, 1f + st4 {v4.16b - v7.16b}, [x0], #64 +1: tbz x2, #3, 1f + st1 {v2.16b,v3.16b}, [x0], #32 +1: tbz x2, #2, 1f + st1 {v1.16b}, [x0], #16 +1: tbz x2, #1, 1f + st1 {v0.d}[1], [x0], #8 +1: tbz x2, #0, 2f + st1 {v0.s}[1], [x0], #4 +2: +.endm + + +/* void rsdIntrinsicYuv2_K( + * void *out, // x0 + * void const *yin, // x1 + * void const *uin, // x2 + * void const *vin, // x3 + * size_t xstart, // x4 + * size_t xend); // x5 + */ +ENTRY(rsdIntrinsicYuv2_K) + lsr x6, x4, #1 + add x0, x0, x4, LSL #2 + add x1, x1, x4 + add x4, x3, x6 + add x3, x2, x6 + sub x2, x5, x6, LSL #1 + + sub x6, sp, #32 + sub sp, sp, #64 + st1 {v8.1d - v11.1d}, [sp] + st1 {v12.1d - v15.1d}, [x6] + + wrap_line yuvkern, 0 + + ld1 {v8.1d - v11.1d}, [sp], #32 + ld1 {v12.1d - v15.1d}, [sp], #32 + ret +END(rsdIntrinsicYuv2_K) + +/* void rsdIntrinsicYuv_K( + * void *out, // x0 + * void const *yin, // x1 + * void const *uvin, // x2 + * size_t xstart, // x3 + * size_t xend); // x4 + */ +ENTRY(rsdIntrinsicYuv_K) + bic x5, x3, #1 + add x0, x0, x5, LSL #2 + add x1, x1, x5 + add x3, x2, x5 + sub x2, x4, x5 + + sub x5, sp, #32 + sub sp, sp, #64 + st1 {v8.1d - v11.1d}, [sp] + st1 {v12.1d - v15.1d}, [x5] + + wrap_line yuvkern, 1, 1 + + ld1 {v8.1d - v11.1d}, [sp], #32 + ld1 {v12.1d - v15.1d}, [sp], #32 + ret +END(rsdIntrinsicYuv_K) + +/* void rsdIntrinsicYuvR_K( + * void *out, // x0 + * void const *yin, // x1 + * void const *uvin, // x2 + * size_t xstart, // x3 + * size_t xend); // x4 + */ +ENTRY(rsdIntrinsicYuvR_K) + bic x5, x3, #1 + add x0, x0, x5, LSL #2 + add x1, x1, x5 + add x3, x2, x5 + sub x2, x4, x5 + + sub x5, sp, #32 + sub sp, sp, #64 + st1 {v8.1d - v11.1d}, [sp] + st1 {v12.1d - v15.1d}, [x5] + + wrap_line yuvkern, 1 + + ld1 {v8.1d - v11.1d}, [sp], #32 + ld1 {v12.1d - v15.1d}, [sp], #32 + ret +END(rsdIntrinsicYuvR_K) diff --git a/renderscript-toolkit/src/main/cpp/YuvToRgb_neon.S b/renderscript-toolkit/src/main/cpp/YuvToRgb_neon.S new file mode 100644 index 0000000..5c3bce4 --- /dev/null +++ b/renderscript-toolkit/src/main/cpp/YuvToRgb_neon.S @@ -0,0 +1,298 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f: .fnstart +#define END(f) .fnend; .size f, .-f; + +.eabi_attribute 25,1 @Tag_ABI_align8_preserved +.arm + +/* Perform the actual YuvToRGB conversion in a macro, from register to + * register. This macro will be called from within several different wrapper + * variants for different data layouts. Y data starts in q8, but with the even + * and odd bytes split into d16 and d17 respectively. U and V are in d20 + * and d21. Working constants are pre-loaded into q13-q15, and q3 is + * pre-loaded with a constant 0xff alpha channel. + * + * The complicated arithmetic is the result of refactoring the original + * equations to avoid 16-bit overflow without losing any precision. 
+ */ +.macro yuvkern + vmov.i8 d15, #149 + + vmull.u8 q1, d16, d15 // g0 = y0 * 149 + vmull.u8 q5, d17, d15 // g1 = y1 * 149 + + vmov.i8 d14, #50 + vmov.i8 d15, #104 + vmull.u8 q8, d20, d14 // g2 = u * 50 + v * 104 + vmlal.u8 q8, d21, d15 + + vshr.u8 d14, d21, #1 + vaddw.u8 q0, q1, d14 // r0 = y0 * 149 + (v >> 1) + vaddw.u8 q4, q5, d14 // r1 = y1 * 149 + (v >> 1) + + vshll.u8 q7, d20, #2 + vadd.u16 q2, q1, q7 // b0 = y0 * 149 + (u << 2) + vadd.u16 q6, q5, q7 // b1 = y1 * 149 + (u << 2) + + vmov.i8 d14, #204 + vmov.i8 d15, #254 + vmull.u8 q11, d21, d14 // r2 = v * 204 + vmull.u8 q12, d20, d15 // b2 = u * 254 + + vhadd.u16 q0, q11 // r0 = (r0 + r2) >> 1 + vhadd.u16 q4, q11 // r1 = (r1 + r2) >> 1 + vqadd.u16 q1, q14 // g0 = satu16(g0 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0) + vqadd.u16 q5, q14 // g1 = satu16(g1 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0) + vhadd.u16 q2, q12 // b0 = (b0 + b2) >> 1 + vhadd.u16 q6, q12 // b1 = (b1 + b2) >> 1 + + vqsub.u16 q0, q13 // r0 = satu16(r0 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1) + vqsub.u16 q4, q13 // r1 = satu16(r1 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1) + vqsub.u16 q1, q8 // g0 = satu16(g0 - g2) + vqsub.u16 q5, q8 // g1 = satu16(g1 - g2) + vqsub.u16 q2, q15 // b0 = satu16(b0 - (16 * 149 + (128 << 2) + 128 * 254) >> 1) + vqsub.u16 q6, q15 // b1 = satu16(b1 - (16 * 149 + (128 << 2) + 128 * 254) >> 1) + + vqrshrn.u16 d0, q0, #6 + vqrshrn.u16 d1, q1, #7 + vqrshrn.u16 d2, q4, #6 + vqrshrn.u16 d3, q5, #7 + vqrshrn.u16 d4, q2, #6 + vqrshrn.u16 d5, q6, #6 + + vzip.u8 q0, q1 + vzip.u8 d4, d5 +.endm + +/* Define the wrapper code which will load and store the data, iterate the + * correct number of times, and safely handle the remainder at the end of the + * loop. Some sections of code are switched out depending on the data packing + * being handled. + */ +.macro wrap_line kernel, interleaved=0, swapuv=0 + + movw r5, #((16 * 149 + (128 >> 1) + 128 * 204) >> 1) + vdup.i16 q13, r5 + movw r5, #((-16 * 149 + 128 * 50 + 128 * 104) >> 0) + vdup.i16 q14, r5 + movw r5, #((16 * 149 + (128 << 2) + 128 * 254) >> 1) + vdup.i16 q15, r5 + + vmov.i8 q3, #0xff + + subs r2, #16 + bhs 1f + b 2f + + .align 4 +1: vld2.u8 {d16,d17}, [r1]! + pld [r1, #256] + .if \interleaved + vld2.u8 {d20,d21}, [r3]! + .if \swapuv + vswp d20, d21 + .endif + pld [r3, #256] + .else + vld1.u8 d20, [r3]! + vld1.u8 d21, [r4]! + pld [r3, #128] + pld [r4, #128] + .endif + + \kernel + + subs r2, #16 + + vst4.u8 {d0,d2,d4,d6}, [r0]! + vst4.u8 {d1,d3,d5,d7}, [r0]! + + bhs 1b + +2: adds r2, #16 + beq 2f + + /* To handle the tail portion of the data (something less than 16 + * bytes) load small power-of-two chunks into working registers. It + * doesn't matter where they end up in the register; the same process + * will store them back out using the same positions and the + * interaction between neighbouring pixels is constrained to odd + * boundaries where the load operations don't interfere. + */ + vmov.i8 q8, #0 + vmov.i8 q10, #0 + + tst r2, #8 + beq 1f + vld1.u8 d17, [r1]! + .if \interleaved + vld1.u8 d21, [r3]! + .else + vld1.u32 d20[1], [r3]! + vld1.u32 d21[1], [r4]! + .endif + +1: tst r2, #4 + beq 1f + vld1.u32 d16[1], [r1]! + .if \interleaved + vld1.u32 d20[1], [r3]! + .else + vld1.u16 d20[1], [r3]! + vld1.u16 d21[1], [r4]! + .endif +1: tst r2, #2 + beq 1f + vld1.u16 d16[1], [r1]! + .if \interleaved + vld1.u16 d20[1], [r3]! + .else + vld1.u8 d20[1], [r3]! + vld1.u8 d21[1], [r4]! + .endif +1: tst r2, #1 + beq 1f + vld1.u8 d16[1], [r1]! + .if \interleaved + vld1.u16 d20[0], [r3]! 
+ .else + vld1.u8 d20[0], [r3]! + vld1.u8 d21[0], [r4]! + .endif + + /* One small impediment in the process above is that some of the load + * operations can't perform byte-wise structure deinterleaving at the + * same time as loading only part of a register. So the data is loaded + * linearly and unpacked manually at this point if necessary. + */ +1: vuzp.8 d16, d17 + .if \interleaved + vuzp.8 d20, d21 + .if \swapuv + vswp d20, d21 + .endif + .endif + + \kernel + + /* As above but with the output; structured stores for partial vectors + * aren't available, so the data is re-packed first and stored linearly. + */ + vzip.8 q0, q2 + vzip.8 q1, q3 + vzip.8 q0, q1 + vzip.8 q2, q3 + +1: tst r2, #8 + beq 1f + vst1.u8 {d4,d5,d6,d7}, [r0]! + +1: tst r2, #4 + beq 1f + vst1.u8 {d2,d3}, [r0]! +1: tst r2, #2 + beq 1f + vst1.u8 d1, [r0]! +1: tst r2, #1 + beq 2f + vst1.u32 d0[1], [r0]! +2: +.endm + + +/* void rsdIntrinsicYuv2_K( + * void *out, // r0 + * void const *yin, // r1 + * void const *uin, // r2 + * void const *vin, // r3 + * size_t xstart, // [sp] + * size_t xend); // [sp+#4] + */ +ENTRY(rsdIntrinsicYuv2_K) + push {r4,r5} + ldr r5, [sp, #8] + mov r4, r3 + mov r3, r2 + ldr r2, [sp, #12] + + add r0, r5, LSL #2 + add r1, r5 + add r3, r5, LSR #1 + add r4, r5, LSR #1 + sub r2, r5 + + vpush {d8-d15} + + wrap_line yuvkern, 0 + + vpop {d8-d15} + pop {r4,r5} + bx lr +END(rsdIntrinsicYuv2_K) + +/* void rsdIntrinsicYuv_K( + * void *out, // r0 + * void const *yin, // r1 + * void const *uvin, // r2 + * size_t xstart, // r3 + * size_t xend); // [sp] + */ +ENTRY(rsdIntrinsicYuv_K) + push {r4,r5} + bic r4, r3, #1 + add r3, r2, r4 + ldr r2, [sp, #8] + + add r0, r4, LSL #2 + add r1, r4 + sub r2, r4 + + vpush {d8-d15} + + wrap_line yuvkern, 1, 1 + + vpop {d8-d15} + pop {r4,r5} + bx lr +END(rsdIntrinsicYuv_K) + +/* void rsdIntrinsicYuvR_K( + * void *out, // r0 + * void const *yin, // r1 + * void const *uvin, // r2 + * size_t xstart, // r3 + * size_t xend); // [sp] + */ +ENTRY(rsdIntrinsicYuvR_K) + push {r4,r5} + bic r4, r3, #1 + add r3, r2, r4 + ldr r2, [sp, #8] + + add r0, r4, LSL #2 + add r1, r4 + sub r2, r4 + + vpush {d8-d15} + + wrap_line yuvkern, 1 + + vpop {d8-d15} + pop {r4,r5} + bx lr +END(rsdIntrinsicYuvR_K) diff --git a/renderscript-toolkit/src/main/cpp/x86.cpp b/renderscript-toolkit/src/main/cpp/x86.cpp new file mode 100644 index 0000000..ac3df27 --- /dev/null +++ b/renderscript-toolkit/src/main/cpp/x86.cpp @@ -0,0 +1,1321 @@ +/* + * Copyright (C) 2011 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include <stdint.h> +#include <x86intrin.h> + +namespace renderscript { + +/* Unsigned extend packed 8-bit integer (in LBS) into packed 32-bit integer */ +static inline __m128i cvtepu8_epi32(__m128i x) { +#if defined(__SSE4_1__) + return _mm_cvtepu8_epi32(x); +#elif defined(__SSSE3__) + const __m128i M8to32 = _mm_set_epi32(0xffffff03, 0xffffff02, 0xffffff01, 0xffffff00); + x = _mm_shuffle_epi8(x, M8to32); + return x; +#else +# error "Require at least SSSE3" +#endif +} + +static inline __m128i packus_epi32(__m128i lo, __m128i hi) { +#if defined(__SSE4_1__) + return _mm_packus_epi32(lo, hi); +#elif defined(__SSSE3__) + const __m128i C0 = _mm_set_epi32(0x0000, 0x0000, 0x0000, 0x0000); + const __m128i C1 = _mm_set_epi32(0xffff, 0xffff, 0xffff, 0xffff); + const __m128i M32to16L = _mm_set_epi32(0xffffffff, 0xffffffff, 0x0d0c0908, 0x05040100); + const __m128i M32to16H = _mm_set_epi32(0x0d0c0908, 0x05040100, 0xffffffff, 0xffffffff); + lo = _mm_and_si128(lo, _mm_cmpgt_epi32(lo, C0)); + lo = _mm_or_si128(lo, _mm_cmpgt_epi32(lo, C1)); + hi = _mm_and_si128(hi, _mm_cmpgt_epi32(hi, C0)); + hi = _mm_or_si128(hi, _mm_cmpgt_epi32(hi, C1)); + return _mm_or_si128(_mm_shuffle_epi8(lo, M32to16L), + _mm_shuffle_epi8(hi, M32to16H)); +#else +# error "Require at least SSSE3" +#endif +} + +static inline __m128i mullo_epi32(__m128i x, __m128i y) { +#if defined(__SSE4_1__) + return _mm_mullo_epi32(x, y); +#elif defined(__SSSE3__) + const __m128i Meven = _mm_set_epi32(0x00000000, 0xffffffff, 0x00000000, 0xffffffff); + __m128i even = _mm_mul_epu32(x, y); + __m128i odd = _mm_mul_epu32(_mm_srli_si128(x, 4), + _mm_srli_si128(y, 4)); + even = _mm_and_si128(even, Meven); + odd = _mm_and_si128(odd, Meven); + return _mm_or_si128(even, _mm_slli_si128(odd, 4)); +#else +# error "Require at least SSSE3" +#endif +} + +/* 'mask' must packed 8-bit of 0x00 or 0xff */ +static inline __m128i blendv_epi8(__m128i x, __m128i y, __m128i mask) { +#if defined(__SSE4_1__) + return _mm_blendv_epi8(x, y, mask); +#elif defined(__SSSE3__) + return _mm_or_si128(_mm_andnot_si128(mask, x), _mm_and_si128(y, mask)); +#else +# error "Require at least SSSE3" +#endif +} + +extern "C" void rsdIntrinsicConvolve3x3_K(void *dst, const void *y0, + const void *y1, const void *y2, + const short *coef, uint32_t count) { + __m128i x; + __m128i c0, c2, c4, c6, c8; + __m128i r0, r1, r2; + __m128i p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11; + __m128i o0, o1; + uint32_t i; + + x = _mm_loadl_epi64((const __m128i *)(coef+0)); + c0 = _mm_shuffle_epi32(x, 0x00); + c2 = _mm_shuffle_epi32(x, 0x55); + x = _mm_loadl_epi64((const __m128i *)(coef+4)); + c4 = _mm_shuffle_epi32(x, 0x00); + c6 = _mm_shuffle_epi32(x, 0x55); + x = _mm_loadl_epi64((const __m128i *)(coef+8)); + c8 = _mm_shuffle_epi32(x, 0x00); + + for (i = 0; i < count; ++i) { + + p0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0)), _mm_setzero_si128()); + p1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+1)), _mm_setzero_si128()); + p2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+2)), _mm_setzero_si128()); + p3 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+3)), _mm_setzero_si128()); + p4 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1)), _mm_setzero_si128()); + p5 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+1)), _mm_setzero_si128()); + p6 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+2)), _mm_setzero_si128()); + p7 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+3)), _mm_setzero_si128()); + p8 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t 
*)y2)), _mm_setzero_si128()); + p9 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+1)), _mm_setzero_si128()); + p10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+2)), _mm_setzero_si128()); + p11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+3)), _mm_setzero_si128()); + + o0 = _mm_madd_epi16(_mm_unpacklo_epi16(p0, p1), c0); + o1 = _mm_madd_epi16(_mm_unpacklo_epi16(p1, p2), c0); + + o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p2, p4), c2)); + o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p3, p5), c2)); + + o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p5, p6), c4)); + o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p6, p7), c4)); + + o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p8, p9), c6)); + o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p9, p10), c6)); + + o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p10, _mm_setzero_si128()), c8)); + o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p11, _mm_setzero_si128()), c8)); + + o0 = _mm_srai_epi32(o0, 8); + o1 = _mm_srai_epi32(o1, 8); + + o0 = packus_epi32(o0, o1); + o0 = _mm_packus_epi16(o0, o0); + _mm_storel_epi64((__m128i *)dst, o0); + + y0 = (const char *)y0 + 8; + y1 = (const char *)y1 + 8; + y2 = (const char *)y2 + 8; + dst = (char *)dst + 8; + } +} + +void rsdIntrinsicColorMatrix4x4_K(void *dst, const void *src, + const short *coef, uint32_t count) { + const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3, + 14, 10, 6, 2, + 13, 9, 5, 1, + 12, 8, 4, 0); + + const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00); + const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02); + __m128i c0, c1, c2, c3; + __m128i i4, o4; + __m128i xy, zw; + __m128i x2, y2, z2, w2; + uint32_t i; + + c0 = _mm_loadl_epi64((const __m128i *)(coef+0)); + c1 = _mm_loadl_epi64((const __m128i *)(coef+4)); + c0 = _mm_unpacklo_epi16(c0, c1); + + c2 = _mm_loadl_epi64((const __m128i *)(coef+8)); + c3 = _mm_loadl_epi64((const __m128i *)(coef+12)); + c2 = _mm_unpacklo_epi16(c2, c3); + + for (i = 0; i < count; ++i) { + i4 = _mm_load_si128((const __m128i *)src); + xy = _mm_shuffle_epi8(i4, Mxy); + zw = _mm_shuffle_epi8(i4, Mzw); + + x2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x00)); + y2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x55)); + z2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xaa)); + w2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xff)); + + x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x00))); + y2 = _mm_add_epi32(y2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x55))); + z2 = _mm_add_epi32(z2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xaa))); + w2 = _mm_add_epi32(w2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xff))); + + x2 = _mm_srai_epi32(x2, 8); + y2 = _mm_srai_epi32(y2, 8); + z2 = _mm_srai_epi32(z2, 8); + w2 = _mm_srai_epi32(w2, 8); + + x2 = packus_epi32(x2, y2); + z2 = packus_epi32(z2, w2); + o4 = _mm_packus_epi16(x2, z2); + + o4 = _mm_shuffle_epi8(o4, T4x4); + _mm_storeu_si128((__m128i *)dst, o4); + + src = (const char *)src + 16; + dst = (char *)dst + 16; + } +} + +void rsdIntrinsicColorMatrix3x3_K(void *dst, const void *src, + const short *coef, uint32_t count) { + const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3, + 14, 10, 6, 2, + 13, 9, 5, 1, + 12, 8, 4, 0); + + const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00); + const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02); + + __m128i c0, c1, c2, c3; + __m128i i4, o4; + __m128i xy, zw; + 
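+    /* Note: in this 3x3 variant the source alpha is passed through unchanged;
+     * w2 below is refilled from the input's alpha lane rather than from a matrix row. */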
__m128i x2, y2, z2, w2; + uint32_t i; + + c0 = _mm_loadl_epi64((const __m128i *)(coef+0)); + c1 = _mm_loadl_epi64((const __m128i *)(coef+4)); + c0 = _mm_unpacklo_epi16(c0, c1); + + c2 = _mm_loadl_epi64((const __m128i *)(coef+8)); + c3 = _mm_loadl_epi64((const __m128i *)(coef+12)); + c2 = _mm_unpacklo_epi16(c2, c3); + + for (i = 0; i < count; ++i) { + i4 = _mm_loadu_si128((const __m128i *)src); + xy = _mm_shuffle_epi8(i4, Mxy); + zw = _mm_shuffle_epi8(i4, Mzw); + + x2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x00)); + y2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x55)); + z2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xaa)); + + x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x00))); + y2 = _mm_add_epi32(y2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x55))); + z2 = _mm_add_epi32(z2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xaa))); + + x2 = _mm_srai_epi32(x2, 8); + y2 = _mm_srai_epi32(y2, 8); + z2 = _mm_srai_epi32(z2, 8); + w2 = _mm_srli_epi32(zw, 16); + + x2 = packus_epi32(x2, y2); + z2 = packus_epi32(z2, w2); + o4 = _mm_packus_epi16(x2, z2); + + o4 = _mm_shuffle_epi8(o4, T4x4); + _mm_storeu_si128((__m128i *)dst, o4); + + src = (const char *)src + 16; + dst = (char *)dst + 16; + } +} + +void rsdIntrinsicColorMatrixDot_K(void *dst, const void *src, + const short *coef, uint32_t count) { + const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3, + 14, 10, 6, 2, + 13, 9, 5, 1, + 12, 8, 4, 0); + const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00); + const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02); + __m128i c0, c1, c2, c3; + __m128i i4, o4; + __m128i xy, zw; + __m128i x2, y2, z2, w2; + uint32_t i; + + c0 = _mm_loadl_epi64((const __m128i *)(coef+0)); + c0 = _mm_shufflelo_epi16(c0, 0); + c1 = _mm_loadl_epi64((const __m128i *)(coef+4)); + c1 = _mm_shufflelo_epi16(c1, 0); + c0 = _mm_unpacklo_epi16(c0, c1); + + c2 = _mm_loadl_epi64((const __m128i *)(coef+8)); + c2 = _mm_shufflelo_epi16(c2, 0); + c3 = _mm_loadl_epi64((const __m128i *)(coef+12)); + c3 = _mm_shufflelo_epi16(c3, 0); + c2 = _mm_unpacklo_epi16(c2, c3); + + for (i = 0; i < count; ++i) { + i4 = _mm_loadu_si128((const __m128i *)src); + + xy = _mm_shuffle_epi8(i4, Mxy); + zw = _mm_shuffle_epi8(i4, Mzw); + + x2 = _mm_madd_epi16(xy, c0); + x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, c2)); + + x2 = _mm_srai_epi32(x2, 8); + y2 = x2; + z2 = x2; + w2 = _mm_srli_epi32(zw, 16); + + x2 = packus_epi32(x2, y2); + z2 = packus_epi32(z2, w2); + o4 = _mm_packus_epi16(x2, z2); + + o4 = _mm_shuffle_epi8(o4, T4x4); + _mm_storeu_si128((__m128i *)dst, o4); + + src = (const char *)src + 16; + dst = (char *)dst + 16; + } +} + +void rsdIntrinsicBlurVFU4_K(void *dst, + const void *pin, int stride, const void *gptr, + int rct, int x1, int x2) { + const char *pi; + __m128i pi0, pi1; + __m128 pf0, pf1; + __m128 bp0, bp1; + __m128 x; + int r; + + for (; x1 < x2; x1 += 2) { + pi = (const char *)pin + (x1 << 2); + bp0 = _mm_setzero_ps(); + bp1 = _mm_setzero_ps(); + + for (r = 0; r < rct; ++r) { + x = _mm_load_ss((const float *)gptr + r); + x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0)); + + pi0 = _mm_cvtsi32_si128(*(const int *)pi); + pi1 = _mm_cvtsi32_si128(*((const int *)pi + 1)); + + pf0 = _mm_cvtepi32_ps(cvtepu8_epi32(pi0)); + pf1 = _mm_cvtepi32_ps(cvtepu8_epi32(pi1)); + + bp0 = _mm_add_ps(bp0, _mm_mul_ps(pf0, x)); + bp1 = _mm_add_ps(bp1, _mm_mul_ps(pf1, x)); + + pi += stride; + } + + _mm_storeu_ps((float *)dst, bp0); + _mm_storeu_ps((float *)dst + 4, bp1); + dst = (char *)dst + 32; + } +} + 
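+/* Horizontal pass of the separable blur: 'pin' points at float4 intermediate
+ * data (such as that produced by rsdIntrinsicBlurVFU4_K above), 'gptr' at the
+ * Gaussian weights, and rct is the kernel width (2*r+1). Each iteration forms
+ * one weighted float4 sum and packs it back down to a single RGBA pixel at
+ * 'dst'. */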
+void rsdIntrinsicBlurHFU4_K(void *dst, + const void *pin, const void *gptr, + int rct, int x1, int x2) { + const __m128i Mu8 = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0x0c080400); + const float *pi; + __m128 pf, x, y; + __m128i o; + int r; + + for (; x1 < x2; ++x1) { + /* rct is define as 2*r+1 by the caller */ + x = _mm_load_ss((const float *)gptr); + x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0)); + + pi = (const float *)pin + (x1 << 2); + pf = _mm_mul_ps(x, _mm_load_ps(pi)); + + for (r = 1; r < rct; r += 2) { + x = _mm_load_ss((const float *)gptr + r); + y = _mm_load_ss((const float *)gptr + r + 1); + x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0)); + y = _mm_shuffle_ps(y, y, _MM_SHUFFLE(0, 0, 0, 0)); + + pf = _mm_add_ps(pf, _mm_mul_ps(x, _mm_load_ps(pi + (r << 2)))); + pf = _mm_add_ps(pf, _mm_mul_ps(y, _mm_load_ps(pi + (r << 2) + 4))); + } + + o = _mm_cvtps_epi32(pf); + *(int *)dst = _mm_cvtsi128_si32(_mm_shuffle_epi8(o, Mu8)); + dst = (char *)dst + 4; + } +} + +void rsdIntrinsicBlurHFU1_K(void *dst, + const void *pin, const void *gptr, + int rct, int x1, int x2) { + const __m128i Mu8 = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0x0c080400); + const float *pi; + __m128 pf, g0, g1, g2, g3, gx, p0, p1; + __m128i o; + int r; + + for (; x1 < x2; x1+=4) { + g0 = _mm_load_ss((const float *)gptr); + g0 = _mm_shuffle_ps(g0, g0, _MM_SHUFFLE(0, 0, 0, 0)); + + pi = (const float *)pin + x1; + pf = _mm_mul_ps(g0, _mm_loadu_ps(pi)); + + for (r = 1; r < rct; r += 4) { + gx = _mm_loadu_ps((const float *)gptr + r); + p0 = _mm_loadu_ps(pi + r); + p1 = _mm_loadu_ps(pi + r + 4); + + g0 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(0, 0, 0, 0)); + pf = _mm_add_ps(pf, _mm_mul_ps(g0, p0)); + g1 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(1, 1, 1, 1)); + pf = _mm_add_ps(pf, _mm_mul_ps(g1, _mm_alignr_epi8(p1, p0, 4))); + g2 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(2, 2, 2, 2)); + pf = _mm_add_ps(pf, _mm_mul_ps(g2, _mm_alignr_epi8(p1, p0, 8))); + g3 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(3, 3, 3, 3)); + pf = _mm_add_ps(pf, _mm_mul_ps(g3, _mm_alignr_epi8(p1, p0, 12))); + } + + o = _mm_cvtps_epi32(pf); + *(int *)dst = _mm_cvtsi128_si32(_mm_shuffle_epi8(o, Mu8)); + dst = (char *)dst + 4; + } +} + +void rsdIntrinsicYuv_K(void *dst, + const unsigned char *pY, const unsigned char *pUV, + uint32_t count, const short *param) { + __m128i biasY, biasUV; + __m128i c0, c1, c2, c3, c4; + + biasY = _mm_set1_epi32(param[8]); /* 16 */ + biasUV = _mm_set1_epi32(param[16]); /* 128 */ + + c0 = _mm_set1_epi32(param[0]); /* 298 */ + c1 = _mm_set1_epi32(param[1]); /* 409 */ + c2 = _mm_set1_epi32(param[2]); /* -100 */ + c3 = _mm_set1_epi32(param[3]); /* 516 */ + c4 = _mm_set1_epi32(param[4]); /* -208 */ + + __m128i Y, UV, U, V, R, G, B, A; + + A = _mm_set1_epi32(255); + uint32_t i; + + for (i = 0; i < (count << 1); ++i) { + Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY)); + UV = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pUV)); + + Y = _mm_sub_epi32(Y, biasY); + UV = _mm_sub_epi32(UV, biasUV); + + U = _mm_shuffle_epi32(UV, 0xf5); + V = _mm_shuffle_epi32(UV, 0xa0); + + Y = mullo_epi32(Y, c0); + + R = _mm_add_epi32(Y, mullo_epi32(V, c1)); + R = _mm_add_epi32(R, biasUV); + R = _mm_srai_epi32(R, 8); + + G = _mm_add_epi32(Y, mullo_epi32(U, c2)); + G = _mm_add_epi32(G, mullo_epi32(V, c4)); + G = _mm_add_epi32(G, biasUV); + G = _mm_srai_epi32(G, 8); + + B = _mm_add_epi32(Y, mullo_epi32(U, c3)); + B = _mm_add_epi32(B, biasUV); + B = _mm_srai_epi32(B, 8); + + __m128i y1, y2, y3, y4; + + y1 = packus_epi32(R, G); + y2 = packus_epi32(B, A); 
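+        /* Pack the 32-bit R, G, B, A lanes down to bytes, then transpose with
+         * T4x4 below so the store writes interleaved RGBA pixels. */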
+ y3 = _mm_packus_epi16(y1, y2); + const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3, + 14, 10, 6, 2, + 13, 9, 5, 1, + 12, 8, 4, 0); + y4 = _mm_shuffle_epi8(y3, T4x4); + _mm_storeu_si128((__m128i *)dst, y4); + pY += 4; + pUV += 4; + dst = (__m128i *)dst + 1; + } +} + +void rsdIntrinsicYuvR_K(void *dst, + const unsigned char *pY, const unsigned char *pUV, + uint32_t count, const short *param) { + __m128i biasY, biasUV; + __m128i c0, c1, c2, c3, c4; + + biasY = _mm_set1_epi32(param[8]); /* 16 */ + biasUV = _mm_set1_epi32(param[16]); /* 128 */ + + c0 = _mm_set1_epi32(param[0]); /* 298 */ + c1 = _mm_set1_epi32(param[1]); /* 409 */ + c2 = _mm_set1_epi32(param[2]); /* -100 */ + c3 = _mm_set1_epi32(param[3]); /* 516 */ + c4 = _mm_set1_epi32(param[4]); /* -208 */ + + __m128i Y, UV, U, V, R, G, B, A; + + A = _mm_set1_epi32(255); + uint32_t i; + + for (i = 0; i < (count << 1); ++i) { + Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY)); + UV = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pUV)); + + Y = _mm_sub_epi32(Y, biasY); + UV = _mm_sub_epi32(UV, biasUV); + + V = _mm_shuffle_epi32(UV, 0xf5); + U = _mm_shuffle_epi32(UV, 0xa0); + + Y = mullo_epi32(Y, c0); + + R = _mm_add_epi32(Y, mullo_epi32(V, c1)); + R = _mm_add_epi32(R, biasUV); + R = _mm_srai_epi32(R, 8); + + G = _mm_add_epi32(Y, mullo_epi32(U, c2)); + G = _mm_add_epi32(G, mullo_epi32(V, c4)); + G = _mm_add_epi32(G, biasUV); + G = _mm_srai_epi32(G, 8); + + B = _mm_add_epi32(Y, mullo_epi32(U, c3)); + B = _mm_add_epi32(B, biasUV); + B = _mm_srai_epi32(B, 8); + + __m128i y1, y2, y3, y4; + + y1 = packus_epi32(R, G); + y2 = packus_epi32(B, A); + y3 = _mm_packus_epi16(y1, y2); + const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3, + 14, 10, 6, 2, + 13, 9, 5, 1, + 12, 8, 4, 0); + y4 = _mm_shuffle_epi8(y3, T4x4); + _mm_storeu_si128((__m128i *)dst, y4); + pY += 4; + pUV += 4; + dst = (__m128i *)dst + 1; + } +} + +void rsdIntrinsicYuv2_K(void *dst, + const unsigned char *pY, const unsigned char *pU, + const unsigned char *pV, uint32_t count, const short *param) { + __m128i biasY, biasUV; + __m128i c0, c1, c2, c3, c4; + + biasY = _mm_set1_epi32(param[8]); /* 16 */ + biasUV = _mm_set1_epi32(param[16]); /* 128 */ + + c0 = _mm_set1_epi32(param[0]); /* 298 */ + c1 = _mm_set1_epi32(param[1]); /* 409 */ + c2 = _mm_set1_epi32(param[2]); /* -100 */ + c3 = _mm_set1_epi32(param[3]); /* 516 */ + c4 = _mm_set1_epi32(param[4]); /* -208 */ + + __m128i Y, U, V, R, G, B, A; + + A = _mm_set1_epi32(255); + uint32_t i; + + for (i = 0; i < (count << 1); ++i) { + Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY)); + U = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pU)); + V = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pV)); + + Y = _mm_sub_epi32(Y, biasY); + U = _mm_sub_epi32(U, biasUV); + V = _mm_sub_epi32(V, biasUV); + + Y = mullo_epi32(Y, c0); + + R = _mm_add_epi32(Y, mullo_epi32(V, c1)); + R = _mm_add_epi32(R, biasUV); + R = _mm_srai_epi32(R, 8); + + G = _mm_add_epi32(Y, mullo_epi32(U, c2)); + G = _mm_add_epi32(G, mullo_epi32(V, c4)); + G = _mm_add_epi32(G, biasUV); + G = _mm_srai_epi32(G, 8); + + B = _mm_add_epi32(Y, mullo_epi32(U, c3)); + B = _mm_add_epi32(B, biasUV); + B = _mm_srai_epi32(B, 8); + + __m128i y1, y2, y3, y4; + + y1 = packus_epi32(R, G); + y2 = packus_epi32(B, A); + y3 = _mm_packus_epi16(y1, y2); + const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3, + 14, 10, 6, 2, + 13, 9, 5, 1, + 12, 8, 4, 0); + y4 = _mm_shuffle_epi8(y3, T4x4); + _mm_storeu_si128((__m128i *)dst, y4); + pY += 4; + pU += 4; + pV += 4; + dst = (__m128i *)dst + 1; + } +} + +extern "C" void 
rsdIntrinsicConvolve5x5_K(void *dst, const void *y0, + const void *y1, const void *y2, + const void *y3, const void *y4, + const short *coef, uint32_t count) { + __m128i x; + __m128i c0, c2, c4, c6, c8, c10, c12; + __m128i c14, c16, c18, c20, c22, c24; + __m128i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9; + __m128i p0, p1, p2, p3, p4, p5, p6, p7; + __m128i p8, p9, p10, p11, p12, p13, p14, p15; + __m128i p16, p17, p18, p19, p20, p21, p22, p23; + __m128i p24, p25, p26, p27, p28, p29, p30, p31; + __m128i p32, p33, p34, p35, p36, p37, p38, p39; + __m128i o0, o1, o2, o3; + uint32_t i; + + x = _mm_loadl_epi64((const __m128i *)(coef+0)); + c0 = _mm_shuffle_epi32(x, 0x00); + c2 = _mm_shuffle_epi32(x, 0x55); + + x = _mm_loadl_epi64((const __m128i *)(coef+4)); + c4 = _mm_shuffle_epi32(x, 0x00); + c6 = _mm_shuffle_epi32(x, 0x55); + + x = _mm_loadl_epi64((const __m128i *)(coef+8)); + c8 = _mm_shuffle_epi32(x, 0x00); + c10 = _mm_shuffle_epi32(x, 0x55); + + x = _mm_loadl_epi64((const __m128i *)(coef+12)); + c12 = _mm_shuffle_epi32(x, 0x00); + c14 = _mm_shuffle_epi32(x, 0x55); + + x = _mm_loadl_epi64((const __m128i *)(coef+16)); + c16 = _mm_shuffle_epi32(x, 0x00); + c18 = _mm_shuffle_epi32(x, 0x55); + + x = _mm_loadl_epi64((const __m128i *)(coef+20)); + c20 = _mm_shuffle_epi32(x, 0x00); + c22 = _mm_shuffle_epi32(x, 0x55); + + x = _mm_loadl_epi64((const __m128i *)(coef+24)); + c24 = _mm_shuffle_epi32(x, 0x00); + + for (i = 0; i < count; ++i) { + + p0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int32_t *)y0), _mm_setzero_si128()); + p1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+1)), _mm_setzero_si128()); + p2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+2)), _mm_setzero_si128()); + p3 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+3)), _mm_setzero_si128()); + p4 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+4)), _mm_setzero_si128()); + p5 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+5)), _mm_setzero_si128()); + p6 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+6)), _mm_setzero_si128()); + p7 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+7)), _mm_setzero_si128()); + + p8 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1)), _mm_setzero_si128()); + p9 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+1)), _mm_setzero_si128()); + p10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+2)), _mm_setzero_si128()); + p11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+3)), _mm_setzero_si128()); + p12 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+4)), _mm_setzero_si128()); + p13 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+5)), _mm_setzero_si128()); + p14 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+6)), _mm_setzero_si128()); + p15 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+7)), _mm_setzero_si128()); + + p16 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2)), _mm_setzero_si128()); + p17 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+1)), _mm_setzero_si128()); + p18 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+2)), _mm_setzero_si128()); + p19 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+3)), _mm_setzero_si128()); + p20 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+4)), _mm_setzero_si128()); + p21 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+5)), _mm_setzero_si128()); + p22 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+6)), _mm_setzero_si128()); + p23 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+7)), _mm_setzero_si128()); + + p24 = 
_mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3)), _mm_setzero_si128()); + p25 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+1)), _mm_setzero_si128()); + p26 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+2)), _mm_setzero_si128()); + p27 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+3)), _mm_setzero_si128()); + p28 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+4)), _mm_setzero_si128()); + p29 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+5)), _mm_setzero_si128()); + p30 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+6)), _mm_setzero_si128()); + p31 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+7)), _mm_setzero_si128()); + + p32 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4)), _mm_setzero_si128()); + p33 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+1)), _mm_setzero_si128()); + p34 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+2)), _mm_setzero_si128()); + p35 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+3)), _mm_setzero_si128()); + p36 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+4)), _mm_setzero_si128()); + p37 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+5)), _mm_setzero_si128()); + p38 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+6)), _mm_setzero_si128()); + p39 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+7)), _mm_setzero_si128()); + + o0 = _mm_madd_epi16( _mm_unpacklo_epi16(p0, p1), c0); + o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p2, p3), c2)); + o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p4, p8), c4)); + o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p9,p10), c6)); + o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p11, p12), c8)); + o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p16, p17), c10)); + o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p18, p19), c12)); + o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p20, p24), c14)); + o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p25,p26), c16)); + o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p27, p28), c18)); + o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p32, p33), c20)); + o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p34, p35), c22)); + o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p36, _mm_setzero_si128()), c24)); + o0 = _mm_srai_epi32(o0, 8); + + o1 = _mm_madd_epi16( _mm_unpacklo_epi16(p1, p2), c0); + o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p3,p4), c2)); + o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p5, p9), c4)); + o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p10,p11), c6)); + o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p12,p13), c8)); + o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p17,p18), c10)); + o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p19,p20), c12)); + o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p21,p25), c14)); + o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p26, p27), c16)); + o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p28, p29), c18)); + o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p33, p34), c20)); + o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p35, p36), c22)); + o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p37, _mm_setzero_si128()), c24)); + o1 = _mm_srai_epi32(o1, 8); + + o2 = _mm_madd_epi16( _mm_unpacklo_epi16(p2,p3), c0); + o2 = _mm_add_epi32(o2, _mm_madd_epi16( 
_mm_unpacklo_epi16(p4, p5), c2)); + o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p6, p10), c4)); + o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p11, p12), c6)); + o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p13, p14), c8)); + o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p18, p19), c10)); + o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p20, p21), c12)); + o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p22, p26), c14)); + o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p27, p28), c16)); + o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p29, p30), c18)); + o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p34, p35), c20)); + o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p36, p37), c22)); + o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p38, _mm_setzero_si128()), c24)); + o2 = _mm_srai_epi32(o2, 8); + + o3 = _mm_madd_epi16( _mm_unpacklo_epi16(p3,p4), c0); + o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p5, p6), c2)); + o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p7, p11), c4)); + o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p12, p13), c6)); + o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p14, p15), c8)); + o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p19, p20), c10)); + o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p21, p22), c12)); + o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p23, p27), c14)); + o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p28, p29), c16)); + o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p30, p31), c18)); + o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p35, p36), c20)); + o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p37,p38), c22)); + o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p39, _mm_setzero_si128()), c24)); + o3 = _mm_srai_epi32(o3, 8); + + o0 = packus_epi32(o0, o1); + o2 = packus_epi32(o2, o3); + o0 = _mm_packus_epi16(o0, o2); + _mm_storeu_si128((__m128i *)dst, o0); + + y0 = (const char *)y0 + 16; + y1 = (const char *)y1 + 16; + y2 = (const char *)y2 + 16; + y3 = (const char *)y3 + 16; + y4 = (const char *)y4 + 16; + dst = (char *)dst + 16; + } +} + +void rsdIntrinsicBlendSrcOver_K(void *dst, const void *src, uint32_t count8) { + __m128i all1s, ina, ins; + __m128i in0, in1, out0, out1; + __m128i t0, t1, t2, t3; + uint32_t i; + + all1s = _mm_set1_epi16(255); + + for (i = 0; i < count8; ++i) { + in0 = _mm_loadu_si128((const __m128i *)src); + in1 = _mm_loadu_si128((const __m128i *)src + 1); + out0 = _mm_loadu_si128((const __m128i *)dst); + out1 = _mm_loadu_si128((const __m128i *)dst + 1); + + ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128()); + ina = _mm_shufflelo_epi16(ins, 0xFF); + ina = _mm_shufflehi_epi16(ina, 0xFF); + t0 = _mm_unpacklo_epi8(out0, _mm_setzero_si128()); + t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, ina)); + t0 = _mm_srli_epi16(t0, 8); + t0 = _mm_add_epi16(t0, ins); + + ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128()); + ina = _mm_shufflelo_epi16(ins, 0xFF); + ina = _mm_shufflehi_epi16(ina, 0xFF); + t1 = _mm_unpackhi_epi8(out0, _mm_setzero_si128()); + t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, ina)); + t1 = _mm_srli_epi16(t1, 8); + t1 = _mm_add_epi16(t1, ins); + + ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128()); + ina = _mm_shufflelo_epi16(ins, 0xFF); + ina = _mm_shufflehi_epi16(ina, 0xFF); + t2 = _mm_unpacklo_epi8(out1, _mm_setzero_si128()); + t2 = _mm_mullo_epi16(t2, 
_mm_sub_epi16(all1s, ina)); + t2 = _mm_srli_epi16(t2, 8); + t2 = _mm_add_epi16(t2, ins); + + ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128()); + ina = _mm_shufflelo_epi16(ins, 0xFF); + ina = _mm_shufflehi_epi16(ina, 0xFF); + t3 = _mm_unpackhi_epi8(out1, _mm_setzero_si128()); + t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, ina)); + t3 = _mm_srli_epi16(t3, 8); + t3 = _mm_add_epi16(t3, ins); + + t0 = _mm_packus_epi16(t0, t1); + t2 = _mm_packus_epi16(t2, t3); + _mm_storeu_si128((__m128i *)dst, t0); + _mm_storeu_si128((__m128i *)dst + 1, t2); + + src = (const __m128i *)src + 2; + dst = (__m128i *)dst + 2; + } +} + +void rsdIntrinsicBlendDstOver_K(void *dst, const void *src, uint32_t count8) { + __m128i all1s, outa, outs; + __m128i in0, in1, out0, out1; + __m128i t0, t1, t2, t3; + uint32_t i; + + all1s = _mm_set1_epi16(255); + + for (i = 0; i < count8; ++i) { + in0 = _mm_loadu_si128((const __m128i *)src); + in1 = _mm_loadu_si128((const __m128i *)src + 1); + out0 = _mm_loadu_si128((const __m128i *)dst); + out1 = _mm_loadu_si128((const __m128i *)dst + 1); + + + outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128()); + outa = _mm_shufflelo_epi16(outs, 0xFF); + outa = _mm_shufflehi_epi16(outa, 0xFF); + t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128()); + t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, outa)); + t0 = _mm_srli_epi16(t0, 8); + t0 = _mm_add_epi16(t0, outs); + + outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128()); + outa = _mm_shufflelo_epi16(outs, 0xFF); + outa = _mm_shufflehi_epi16(outa, 0xFF); + t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128()); + t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, outa)); + t1 = _mm_srli_epi16(t1, 8); + t1 = _mm_add_epi16(t1, outs); + + outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128()); + outa = _mm_shufflelo_epi16(outs, 0xFF); + outa = _mm_shufflehi_epi16(outa, 0xFF); + t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128()); + t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, outa)); + t2 = _mm_srli_epi16(t2, 8); + t2 = _mm_add_epi16(t2, outs); + + outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128()); + outa = _mm_shufflelo_epi16(outs, 0xFF); + outa = _mm_shufflehi_epi16(outa, 0xFF); + t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128()); + t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, outa)); + t3 = _mm_srli_epi16(t3, 8); + t3 = _mm_add_epi16(t3, outs); + + t0 = _mm_packus_epi16(t0, t1); + t2 = _mm_packus_epi16(t2, t3); + _mm_storeu_si128((__m128i *)dst, t0); + _mm_storeu_si128((__m128i *)dst + 1, t2); + + src = (const __m128i *)src + 2; + dst = (__m128i *)dst + 2; + } +} + +void rsdIntrinsicBlendSrcIn_K(void *dst, const void *src, uint32_t count8) { + __m128i outa; + __m128i in0, in1, out0, out1; + __m128i t0, t1, t2, t3; + uint32_t i; + + for (i = 0; i < count8; ++i) { + in0 = _mm_loadu_si128((const __m128i *)src); + in1 = _mm_loadu_si128((const __m128i *)src + 1); + out0 = _mm_loadu_si128((const __m128i *)dst); + out1 = _mm_loadu_si128((const __m128i *)dst + 1); + + outa = _mm_unpacklo_epi8(out0, _mm_setzero_si128()); + outa = _mm_shufflelo_epi16(outa, 0xFF); + outa = _mm_shufflehi_epi16(outa, 0xFF); + t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128()); + t0 = _mm_mullo_epi16(t0, outa); + t0 = _mm_srli_epi16(t0, 8); + + outa = _mm_unpackhi_epi8(out0, _mm_setzero_si128()); + outa = _mm_shufflelo_epi16(outa, 0xFF); + outa = _mm_shufflehi_epi16(outa, 0xFF); + t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128()); + t1 = _mm_mullo_epi16(t1, outa); + t1 = _mm_srli_epi16(t1, 8); + + outa = _mm_unpacklo_epi8(out1, _mm_setzero_si128()); + outa = _mm_shufflelo_epi16(outa, 0xFF); + outa 
= _mm_shufflehi_epi16(outa, 0xFF); + t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128()); + t2 = _mm_mullo_epi16(t2, outa); + t2 = _mm_srli_epi16(t2, 8); + + outa = _mm_unpackhi_epi8(out1, _mm_setzero_si128()); + outa = _mm_shufflelo_epi16(outa, 0xFF); + outa = _mm_shufflehi_epi16(outa, 0xFF); + t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128()); + t3 = _mm_mullo_epi16(t3, outa); + t3 = _mm_srli_epi16(t3, 8); + + t0 = _mm_packus_epi16(t0, t1); + t2 = _mm_packus_epi16(t2, t3); + _mm_storeu_si128((__m128i *)dst, t0); + _mm_storeu_si128((__m128i *)dst + 1, t2); + + src = (const __m128i *)src + 2; + dst = (__m128i *)dst + 2; + } +} + +void rsdIntrinsicBlendDstIn_K(void *dst, const void *src, uint32_t count8) { + __m128i ina; + __m128i in0, in1, out0, out1; + __m128i t0, t1, t2, t3; + uint32_t i; + + for (i = 0; i < count8; ++i) { + in0 = _mm_loadu_si128((const __m128i *)src); + in1 = _mm_loadu_si128((const __m128i *)src + 1); + out0 = _mm_loadu_si128((const __m128i *)dst); + out1 = _mm_loadu_si128((const __m128i *)dst + 1); + + ina = _mm_unpacklo_epi8(in0, _mm_setzero_si128()); + ina = _mm_shufflelo_epi16(ina, 0xFF); + ina = _mm_shufflehi_epi16(ina, 0xFF); + t0 = _mm_unpacklo_epi8(out0, _mm_setzero_si128()); + t0 = _mm_mullo_epi16(t0, ina); + t0 = _mm_srli_epi16(t0, 8); + + ina = _mm_unpackhi_epi8(in0, _mm_setzero_si128()); + ina = _mm_shufflelo_epi16(ina, 0xFF); + ina = _mm_shufflehi_epi16(ina, 0xFF); + t1 = _mm_unpackhi_epi8(out0, _mm_setzero_si128()); + t1 = _mm_mullo_epi16(t1, ina); + t1 = _mm_srli_epi16(t1, 8); + + ina = _mm_unpacklo_epi8(in1, _mm_setzero_si128()); + ina = _mm_shufflelo_epi16(ina, 0xFF); + ina = _mm_shufflehi_epi16(ina, 0xFF); + t2 = _mm_unpacklo_epi8(out1, _mm_setzero_si128()); + t2 = _mm_mullo_epi16(t2, ina); + t2 = _mm_srli_epi16(t2, 8); + + ina = _mm_unpackhi_epi8(in1, _mm_setzero_si128()); + ina = _mm_shufflelo_epi16(ina, 0xFF); + ina = _mm_shufflehi_epi16(ina, 0xFF); + t3 = _mm_unpackhi_epi8(out1, _mm_setzero_si128()); + t3 = _mm_mullo_epi16(t3, ina); + t3 = _mm_srli_epi16(t3, 8); + + t0 = _mm_packus_epi16(t0, t1); + t2 = _mm_packus_epi16(t2, t3); + _mm_storeu_si128((__m128i *)dst, t0); + _mm_storeu_si128((__m128i *)dst + 1, t2); + + src = (const __m128i *)src + 2; + dst = (__m128i *)dst + 2; + } +} + +void rsdIntrinsicBlendSrcOut_K(void *dst, const void *src, uint32_t count8) { + __m128i all1s, outa; + __m128i in0, in1, out0, out1; + __m128i t0, t1, t2, t3; + uint32_t i; + + all1s = _mm_set1_epi16(255); + + for (i = 0; i < count8; ++i) { + in0 = _mm_loadu_si128((const __m128i *)src); + in1 = _mm_loadu_si128((const __m128i *)src + 1); + out0 = _mm_loadu_si128((const __m128i *)dst); + out1 = _mm_loadu_si128((const __m128i *)dst + 1); + + outa = _mm_unpacklo_epi8(out0, _mm_setzero_si128()); + outa = _mm_shufflelo_epi16(outa, 0xFF); + outa = _mm_shufflehi_epi16(outa, 0xFF); + t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128()); + t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, outa)); + t0 = _mm_srli_epi16(t0, 8); + + outa = _mm_unpackhi_epi8(out0, _mm_setzero_si128()); + outa = _mm_shufflelo_epi16(outa, 0xFF); + outa = _mm_shufflehi_epi16(outa, 0xFF); + t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128()); + t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, outa)); + t1 = _mm_srli_epi16(t1, 8); + + outa = _mm_unpacklo_epi8(out1, _mm_setzero_si128()); + outa = _mm_shufflelo_epi16(outa, 0xFF); + outa = _mm_shufflehi_epi16(outa, 0xFF); + t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128()); + t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, outa)); + t2 = _mm_srli_epi16(t2, 8); + + outa = 
_mm_unpackhi_epi8(out1, _mm_setzero_si128()); + outa = _mm_shufflelo_epi16(outa, 0xFF); + outa = _mm_shufflehi_epi16(outa, 0xFF); + t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128()); + t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, outa)); + t3 = _mm_srli_epi16(t3, 8); + + t0 = _mm_packus_epi16(t0, t1); + t2 = _mm_packus_epi16(t2, t3); + _mm_storeu_si128((__m128i *)dst, t0); + _mm_storeu_si128((__m128i *)dst + 1, t2); + + src = (const __m128i *)src + 2; + dst = (__m128i *)dst + 2; + } +} + +void rsdIntrinsicBlendDstOut_K(void *dst, const void *src, uint32_t count8) { + __m128i all1s, ina; + __m128i in0, in1, out0, out1; + __m128i t0, t1, t2, t3; + uint32_t i; + + all1s = _mm_set1_epi16(255); + + for (i = 0; i < count8; ++i) { + in0 = _mm_loadu_si128((const __m128i *)src); + in1 = _mm_loadu_si128((const __m128i *)src + 1); + out0 = _mm_loadu_si128((const __m128i *)dst); + out1 = _mm_loadu_si128((const __m128i *)dst + 1); + + ina = _mm_unpacklo_epi8(in0, _mm_setzero_si128()); + ina = _mm_shufflelo_epi16(ina, 0xFF); + ina = _mm_shufflehi_epi16(ina, 0xFF); + t0 = _mm_unpacklo_epi8(out0, _mm_setzero_si128()); + t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, ina)); + t0 = _mm_srli_epi16(t0, 8); + + ina = _mm_unpackhi_epi8(in0, _mm_setzero_si128()); + ina = _mm_shufflelo_epi16(ina, 0xFF); + ina = _mm_shufflehi_epi16(ina, 0xFF); + t1 = _mm_unpackhi_epi8(out0, _mm_setzero_si128()); + t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, ina)); + t1 = _mm_srli_epi16(t1, 8); + + ina = _mm_unpacklo_epi8(in1, _mm_setzero_si128()); + ina = _mm_shufflelo_epi16(ina, 0xFF); + ina = _mm_shufflehi_epi16(ina, 0xFF); + t2 = _mm_unpacklo_epi8(out1, _mm_setzero_si128()); + t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, ina)); + t2 = _mm_srli_epi16(t2, 8); + + ina = _mm_unpackhi_epi8(in1, _mm_setzero_si128()); + ina = _mm_shufflelo_epi16(ina, 0xFF); + ina = _mm_shufflehi_epi16(ina, 0xFF); + t3 = _mm_unpackhi_epi8(out1, _mm_setzero_si128()); + t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, ina)); + t3 = _mm_srli_epi16(t3, 8); + + t0 = _mm_packus_epi16(t0, t1); + t2 = _mm_packus_epi16(t2, t3); + _mm_storeu_si128((__m128i *)dst, t0); + _mm_storeu_si128((__m128i *)dst + 1, t2); + + src = (const __m128i *)src + 2; + dst = (__m128i *)dst + 2; + } +} + +void rsdIntrinsicBlendSrcAtop_K(void *dst, const void *src, uint32_t count8) { + const __m128i M0001 = _mm_set_epi32(0xff000000, 0xff000000, 0xff000000, 0xff000000); + __m128i all1s, ina, outa, ins, outs; + __m128i in0, in1, out0, out1; + __m128i t0, t1, t2, t3; + uint32_t i; + + all1s = _mm_set1_epi16(255); + + for (i = 0; i < count8; ++i) { + in0 = _mm_loadu_si128((const __m128i *)src); + in1 = _mm_loadu_si128((const __m128i *)src + 1); + out0 = _mm_loadu_si128((const __m128i *)dst); + out1 = _mm_loadu_si128((const __m128i *)dst + 1); + + ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128()); + ina = _mm_shufflelo_epi16(ins, 0xFF); + ina = _mm_shufflehi_epi16(ina, 0xFF); + outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128()); + outa = _mm_shufflelo_epi16(outs, 0xFF); + outa = _mm_shufflehi_epi16(outa, 0xFF); + t0 = _mm_sub_epi16(all1s, ina); + t0 = _mm_mullo_epi16(t0, outs); + t0 = _mm_adds_epu16(t0, _mm_mullo_epi16(outa, ins)); + t0 = _mm_srli_epi16(t0, 8); + + ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128()); + ina = _mm_shufflelo_epi16(ins, 0xFF); + ina = _mm_shufflehi_epi16(ina, 0xFF); + outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128()); + outa = _mm_shufflelo_epi16(outs, 0xFF); + outa = _mm_shufflehi_epi16(outa, 0xFF); + t1 = _mm_sub_epi16(all1s, ina); + t1 = _mm_mullo_epi16(t1, 
outs); + t1 = _mm_adds_epu16(t1, _mm_mullo_epi16(outa, ins)); + t1 = _mm_srli_epi16(t1, 8); + + ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128()); + ina = _mm_shufflelo_epi16(ins, 0xFF); + ina = _mm_shufflehi_epi16(ina, 0xFF); + outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128()); + outa = _mm_shufflelo_epi16(outs, 0xFF); + outa = _mm_shufflehi_epi16(outa, 0xFF); + t2 = _mm_sub_epi16(all1s, ina); + t2 = _mm_mullo_epi16(t2, outs); + t2 = _mm_adds_epu16(t2, _mm_mullo_epi16(outa, ins)); + t2 = _mm_srli_epi16(t2, 8); + + ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128()); + ina = _mm_shufflelo_epi16(ins, 0xFF); + ina = _mm_shufflehi_epi16(ina, 0xFF); + outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128()); + outa = _mm_shufflelo_epi16(outs, 0xFF); + outa = _mm_shufflehi_epi16(outa, 0xFF); + t3 = _mm_sub_epi16(all1s, ina); + t3 = _mm_mullo_epi16(t3, outs); + t3 = _mm_adds_epu16(t3, _mm_mullo_epi16(outa, ins)); + t3 = _mm_srli_epi16(t3, 8); + + t0 = _mm_packus_epi16(t0, t1); + t0 = blendv_epi8(t0, out0, M0001); + t2 = _mm_packus_epi16(t2, t3); + t2 = blendv_epi8(t2, out1, M0001); + _mm_storeu_si128((__m128i *)dst, t0); + _mm_storeu_si128((__m128i *)dst + 1, t2); + + src = (const __m128i *)src + 2; + dst = (__m128i *)dst + 2; + } +} + +void rsdIntrinsicBlendDstAtop_K(void *dst, const void *src, uint32_t count8) { + const __m128i M0001 = _mm_set_epi32(0xff000000, 0xff000000, 0xff000000, 0xff000000); + __m128i all1s, ina, ins, outa, outs; + __m128i in0, in1, out0, out1; + __m128i t0, t1, t2, t3; + uint32_t i; + + all1s = _mm_set1_epi16(255); + + for (i = 0; i < count8; ++i) { + in0 = _mm_loadu_si128((const __m128i *)src); + in1 = _mm_loadu_si128((const __m128i *)src + 1); + out0 = _mm_loadu_si128((const __m128i *)dst); + out1 = _mm_loadu_si128((const __m128i *)dst + 1); + + ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128()); + ina = _mm_shufflelo_epi16(ins, 0xFF); + ina = _mm_shufflehi_epi16(ina, 0xFF); + outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128()); + outa = _mm_shufflelo_epi16(outs, 0xFF); + outa = _mm_shufflehi_epi16(outa, 0xFF); + t0 = _mm_sub_epi16(all1s, outa); + t0 = _mm_mullo_epi16(t0, ins); + t0 = _mm_adds_epu16(t0, _mm_mullo_epi16(ina, outs)); + t0 = _mm_srli_epi16(t0, 8); + + ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128()); + ina = _mm_shufflelo_epi16(ins, 0xFF); + ina = _mm_shufflehi_epi16(ina, 0xFF); + outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128()); + outa = _mm_shufflelo_epi16(outs, 0xFF); + outa = _mm_shufflehi_epi16(outa, 0xFF); + t1 = _mm_sub_epi16(all1s, outa); + t1 = _mm_mullo_epi16(t1, ins); + t1 = _mm_adds_epu16(t1, _mm_mullo_epi16(ina, outs)); + t1 = _mm_srli_epi16(t1, 8); + + ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128()); + ina = _mm_shufflelo_epi16(ins, 0xFF); + ina = _mm_shufflehi_epi16(ina, 0xFF); + outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128()); + outa = _mm_shufflelo_epi16(outs, 0xFF); + outa = _mm_shufflehi_epi16(outa, 0xFF); + t2 = _mm_sub_epi16(all1s, outa); + t2 = _mm_mullo_epi16(t2, ins); + t2 = _mm_adds_epu16(t2, _mm_mullo_epi16(ina, outs)); + t2 = _mm_srli_epi16(t2, 8); + + ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128()); + ina = _mm_shufflelo_epi16(ins, 0xFF); + ina = _mm_shufflehi_epi16(ina, 0xFF); + outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128()); + outa = _mm_shufflelo_epi16(outs, 0xFF); + outa = _mm_shufflehi_epi16(outa, 0xFF); + t3 = _mm_sub_epi16(all1s, outa); + t3 = _mm_mullo_epi16(t3, ins); + t3 = _mm_adds_epu16(t3, _mm_mullo_epi16(ina, outs)); + t3 = _mm_srli_epi16(t3, 8); + + t0 = _mm_packus_epi16(t0, t1); + t0 = blendv_epi8(t0, 
in0, M0001); + t2 = _mm_packus_epi16(t2, t3); + t2 = blendv_epi8(t2, in1, M0001); + _mm_storeu_si128((__m128i *)dst, t0); + _mm_storeu_si128((__m128i *)dst + 1, t2); + + src = (const __m128i *)src + 2; + dst = (__m128i *)dst + 2; + } +} + +void rsdIntrinsicBlendXor_K(void *dst, const void *src, uint32_t count8) { + __m128i in0, in1, out0, out1; + uint32_t i; + + for (i = 0; i < count8; ++i) { + in0 = _mm_loadu_si128((const __m128i *)src); + in1 = _mm_loadu_si128((const __m128i *)src + 1); + out0 = _mm_loadu_si128((const __m128i *)dst); + out1 = _mm_loadu_si128((const __m128i *)dst + 1); + + out0 = _mm_xor_si128(out0, in0); + out1 = _mm_xor_si128(out1, in1); + + _mm_storeu_si128((__m128i *)dst, out0); + _mm_storeu_si128((__m128i *)dst + 1, out1); + + src = (const __m128i *)src + 2; + dst = (__m128i *)dst + 2; + } +} + +void rsdIntrinsicBlendMultiply_K(void *dst, const void *src, uint32_t count8) { + __m128i in0, in1, out0, out1; + __m128i t0, t1, t2, t3; + uint32_t i; + + for (i = 0; i < count8; ++i) { + in0 = _mm_loadu_si128((const __m128i *)src); + in1 = _mm_loadu_si128((const __m128i *)src + 1); + out0 = _mm_loadu_si128((const __m128i *)dst); + out1 = _mm_loadu_si128((const __m128i *)dst + 1); + + t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128()); + t0 = _mm_mullo_epi16(t0, _mm_unpacklo_epi8(out0, _mm_setzero_si128())); + t0 = _mm_srli_epi16(t0, 8); + + t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128()); + t1 = _mm_mullo_epi16(t1, _mm_unpackhi_epi8(out0, _mm_setzero_si128())); + t1 = _mm_srli_epi16(t1, 8); + + t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128()); + t2 = _mm_mullo_epi16(t2, _mm_unpacklo_epi8(out1, _mm_setzero_si128())); + t2 = _mm_srli_epi16(t2, 8); + + t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128()); + t3 = _mm_mullo_epi16(t3, _mm_unpackhi_epi8(out1, _mm_setzero_si128())); + t3 = _mm_srli_epi16(t3, 8); + + t0 = _mm_packus_epi16(t0, t1); + t2 = _mm_packus_epi16(t2, t3); + _mm_storeu_si128((__m128i *)dst, t0); + _mm_storeu_si128((__m128i *)dst + 1, t2); + + src = (const __m128i *)src + 2; + dst = (__m128i *)dst + 2; + } +} + +void rsdIntrinsicBlendAdd_K(void *dst, const void *src, uint32_t count8) { + __m128i in0, in1, out0, out1; + uint32_t i; + + for (i = 0; i < count8; ++i) { + in0 = _mm_loadu_si128((const __m128i *)src); + in1 = _mm_loadu_si128((const __m128i *)src + 1); + out0 = _mm_loadu_si128((const __m128i *)dst); + out1 = _mm_loadu_si128((const __m128i *)dst + 1); + + out0 = _mm_adds_epu8(out0, in0); + out1 = _mm_adds_epu8(out1, in1); + + _mm_storeu_si128((__m128i *)dst, out0); + _mm_storeu_si128((__m128i *)dst + 1, out1); + + src = (const __m128i *)src + 2; + dst = (__m128i *)dst + 2; + } +} + +void rsdIntrinsicBlendSub_K(void *dst, const void *src, uint32_t count8) { + __m128i in0, in1, out0, out1; + uint32_t i; + + for (i = 0; i < count8; ++i) { + in0 = _mm_loadu_si128((const __m128i *)src); + in1 = _mm_loadu_si128((const __m128i *)src + 1); + out0 = _mm_loadu_si128((const __m128i *)dst); + out1 = _mm_loadu_si128((const __m128i *)dst + 1); + + out0 = _mm_subs_epu8(out0, in0); + out1 = _mm_subs_epu8(out1, in1); + + _mm_storeu_si128((__m128i *)dst, out0); + _mm_storeu_si128((__m128i *)dst + 1, out1); + + src = (const __m128i *)src + 2; + dst = (__m128i *)dst + 2; + } +} + +} // namespace renderscript |