Diffstat (limited to 'renderscript-toolkit/src/main/cpp/Resize_advsimd.S')
-rw-r--r--  renderscript-toolkit/src/main/cpp/Resize_advsimd.S  754
1 file changed, 754 insertions, 0 deletions
diff --git a/renderscript-toolkit/src/main/cpp/Resize_advsimd.S b/renderscript-toolkit/src/main/cpp/Resize_advsimd.S
new file mode 100644
index 0000000..59e735c
--- /dev/null
+++ b/renderscript-toolkit/src/main/cpp/Resize_advsimd.S
@@ -0,0 +1,754 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
+#define END(f) .size f, .-f;
+
+/* Fixed-point precision after vertical pass -- 16 bit data minus 1 sign and 1
+ * integer (bicubic has a little overshoot). It would also be possible to add
+ * a temporary DC bias to eliminate the sign bit for more precision, but that's
+ * extra arithmetic.
+ */
+.set VERTBITS, 14
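+
+/* Concretely: 1 sign bit + 1 integer bit + VERTBITS fraction bits fill the
+ * 16-bit intermediates, so 1.0 is held as 1 << VERTBITS == 0x4000 and the
+ * representable range of roughly (-2.0, +2.0) absorbs the bicubic overshoot.
+ */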
+
+/* The size of the scratch buffer in which we store our vertically convolved
+ * intermediates.
+ */
+.set CHUNKSHIFT, 7 /* 5 tests better for uchar4, but 7 is necessary for ridiculous (10:1) scale factors */
+.set CHUNKSIZE, (1 << CHUNKSHIFT)
+
+/* The number of components processed in a single iteration of the innermost
+ * loop.
+ */
+.set VECSHIFT, 3
+.set VECSIZE, (1<<VECSHIFT)
+
+/* Read four different lines (except at edges where addresses may be clamped,
+ * which is why we don't simply take base and stride registers), and multiply
+ * and accumulate them by the coefficients in v3[0..3], leaving the results in
+ * v12. This gives eight 16-bit results representing a horizontal line of 2-8
+ * input pixels (depending on number of components per pixel) to be fed into
+ * the horizontal scaling pass.
+ *
+ * Input coefficients are 16-bit unsigned fixed-point (although [0] and [3] are
+ * known to represent negative values and VMLS is used to implement this).
+ * Output is VERTBITS signed fixed-point, which must leave room for a little
+ * overshoot beyond [0,1.0).
+ */
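+
+/* A rough per-lane C sketch of what this macro computes (names here are
+ * illustrative, not the toolkit's own):
+ *
+ *     int32_t acc = (int32_t)yr1 * p1 + (int32_t)yr2 * p2
+ *                 - (int32_t)yr0 * p0 - (int32_t)yr3 * p3;
+ *     int16_t out = sat16(acc >> (8 + 16 - VERTBITS)); // sqshrn saturates
+ *
+ * where p0..p3 are four vertically adjacent 8-bit pixels and yr0..yr3 are the
+ * 16-bit fixed-point y coefficients in v3.h[0..3].
+ */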
+.macro vert8, dstlo=v12.4h, dsthi=v12.8h
+ ld1 {v8.8b}, [x4], #8
+ ld1 {v9.8b}, [x5], #8
+ ld1 {v10.8b}, [x6], #8
+ ld1 {v11.8b}, [x7], #8
+ uxtl v8.8h, v8.8b
+ uxtl v9.8h, v9.8b
+ uxtl v10.8h, v10.8b
+ uxtl v11.8h, v11.8b
+ umull v12.4s, v9.4h, v3.h[1]
+ umull2 v13.4s, v9.8h, v3.h[1]
+ umlsl v12.4s, v8.4h, v3.h[0]
+ umlsl2 v13.4s, v8.8h, v3.h[0]
+ umlal v12.4s, v10.4h, v3.h[2]
+ umlal2 v13.4s, v10.8h, v3.h[2]
+ umlsl v12.4s, v11.4h, v3.h[3]
+ umlsl2 v13.4s, v11.8h, v3.h[3]
+
+ /* Shift by 8 (bits per pixel), plus 16 (the fixed-point multiplies),
+ * minus VERTBITS (the number of fraction bits we want to keep from
+ * here on).
+ */
+ sqshrn \dstlo, v12.4s, #8 + (16 - VERTBITS)
+ sqshrn2 \dsthi, v13.4s, #8 + (16 - VERTBITS)
+.endm
+
+/* As above, but producing only four 16-bit results, by default packed into
+ * the high half of v12.
+ */
+.macro vert4, dst=v12.8h
+ ld1 {v8.s}[0], [x4], #4
+ ld1 {v9.s}[0], [x5], #4
+ ld1 {v10.s}[0], [x6], #4
+ ld1 {v11.s}[0], [x7], #4
+ uxtl v8.8h, v8.8b
+ uxtl v9.8h, v9.8b
+ uxtl v10.8h, v10.8b
+ uxtl v11.8h, v11.8b
+ umull v12.4s, v9.4h, v3.h[1]
+ umlsl v12.4s, v8.4h, v3.h[0]
+ umlal v12.4s, v10.4h, v3.h[2]
+ umlsl v12.4s, v11.4h, v3.h[3]
+.ifc \dst,v12.8h
+ sqshrn2 \dst, v12.4s, #8 + (16 - VERTBITS)
+.else
+ sqshrn \dst, v12.4s, #8 + (16 - VERTBITS)
+.endif
+.endm
+
+
+/* During horizontal resize having CHUNKSIZE input available means being able
+ * to produce a varying amount of output, depending on the phase of the data.
+ * This function calculates the minimum number of VECSIZE chunks extracted from
+ * a CHUNKSIZE window (x1), and the threshold value for when the count will be
+ * one higher than that (x0).
+ * These work out, conveniently, to be the quotient and remainder from:
+ * (CHUNKSIZE + xinc * VECSIZE - 1) / (xinc * VECSIZE)
+ *
+ * The two values are packed together in a uint64_t for convenience; and
+ * they are, in fact, used this way as an arithmetic short-cut later on.
+ */
+/* uint64_t rsdIntrinsicResize_oscctl_K(uint32_t xinc) */
+ENTRY(rsdIntrinsicResize_oscctl_K)
+ lsl x2, x0, #VECSHIFT
+ mov x0, #(CHUNKSIZE << 16) - 1
+ add x0, x0, x2
+ udiv x1, x0, x2
+ msub x0, x1, x2, x0
+ add x0, x0, x1, LSL #32
+ ret
+END(rsdIntrinsicResize_oscctl_K)
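+
+/* A C sketch of the same calculation (illustrative; xinc is the 16.16
+ * fixed-point x increment passed in x0):
+ *
+ *     uint64_t oscctl(uint32_t xinc) {
+ *         uint64_t step = (uint64_t)xinc << VECSHIFT;        // xinc * VECSIZE
+ *         uint64_t num = ((uint64_t)CHUNKSIZE << 16) - 1 + step;
+ *         return ((num / step) << 32) | (num % step);        // quot:rem
+ *     }
+ */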
+
+/* Iterate to generate the uchar1, uchar2, and uchar4 versions of the code.
+ * For the most part the vertical pass (the outer loop) is the same for all
+ * versions. Exceptions are handled in-line with conditional assembly.
+ */
+.irp comp, 1, 2, 4
+.if \comp == 1
+.set COMPONENT_SHIFT, 0
+.elseif \comp == 2
+.set COMPONENT_SHIFT, 1
+.elseif \comp == 4
+.set COMPONENT_SHIFT, 2
+.else
+.error "Unknown component count"
+.endif
+.set COMPONENT_COUNT, (1 << COMPONENT_SHIFT)
+.set LOOP_OUTPUT_SIZE, (VECSIZE * COMPONENT_COUNT)
+
+.set BUFFER_SIZE, (CHUNKSIZE * 2 + 4) * COMPONENT_COUNT * 2
+
+/* void rsdIntrinsicResizeB1_K(
+ * uint8_t * restrict dst, // x0
+ * size_t count, // x1
+ * uint32_t xf, // x2
+ * uint32_t xinc, // x3
+ * uint8_t const * restrict srcn, // x4
+ * uint8_t const * restrict src0, // x5
+ * uint8_t const * restrict src1, // x6
+ * uint8_t const * restrict src2, // x7
+ * size_t xclip, // [sp,#0] -> [sp,#80] -> x13
+ * size_t avail, // [sp,#8] -> [sp,#88] -> x11
+ * uint64_t osc_ctl, // [sp,#16] -> [sp,#96] -> x10
+ * int32 const *yr); // [sp,#24] -> [sp,#104] -> v4 (copied to v3 for scalar access)
+ */
+ENTRY(rsdIntrinsicResizeB\comp\()_K)
+ sub x8, sp, #48
+ sub sp, sp, #80
+ st1 {v8.1d - v11.1d}, [sp]
+ st1 {v12.1d - v15.1d}, [x8]
+ str x19, [x8, #32]
+
+ /* align the working buffer on the stack to make it easy to use bit
+ * twiddling for address calculations.
+ */
+ sub x12, sp, #BUFFER_SIZE
+ bic x12, x12, #(1 << (CHUNKSHIFT + 1 + COMPONENT_SHIFT + 1)) - 1
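+ /* Roughly: buf = (sp - BUFFER_SIZE) & ~(size_t)(2 * CHUNKSIZE *
+ * COMPONENT_COUNT * 2 - 1), so the low bits of a buffer pointer are its
+ * byte offset within the double-chunk ring and can be masked (tst) or
+ * flipped (eor) for wraparound, as done further below.
+ */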
+
+ ldr x8, [sp,#104] // yr
+ adrp x9, intrinsic_resize_consts
+ add x9, x9, :lo12:intrinsic_resize_consts
+ ld1 {v4.4s}, [x8]
+ ld1 {v5.8h}, [x9]
+ sqxtun v4.4h, v4.4s // yr
+ dup v6.8h, w2
+ dup v7.8h, w3
+ mla v6.8h, v5.8h, v7.8h // vxf
+ shl v7.8h, v7.8h, #VECSHIFT // vxinc
+
+ /* Compute starting condition for oscillator used to compute ahead
+ * of time how many iterations are possible before needing to
+ * refill the working buffer. This is based on the fixed-point
+ * index of the last element in the vector of pixels processed in
+ * each iteration, counting up until it would overflow.
+ */
+ sub x8, x2, x3
+ lsl x9, x3, #VECSHIFT
+ add x8, x8, x9
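+ /* i.e. x8 = xf + xinc * (VECSIZE - 1), the 16.16 x position of the last
+ * element in the first vector of output pixels.
+ */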
+
+ ldr x10, [sp,#96] // osc_ctl
+ ldp x13,x11, [sp,#80] // xclip, avail
+
+ mov x19, sp
+ mov sp, x12
+
+ /* x4-x7 contain pointers to the four lines of input to be
+ * convolved. These pointers have been clamped vertically and
+ * horizontally (which is why it's not a simple row/stride pair),
+ * and the xclip argument (now in x13) indicates how many pixels
+ * from true the x position of the pointer is. This value should
+ * be 0, 1, or 2 only.
+ *
+ * Start by placing four pixels worth of input at the far end of
+ * the buffer. As many as two of these may be clipped, so four
+ * pixels are fetched, and then the first pixel is duplicated and
+ * the data shifted according to xclip. The source pointers are
+ * then also adjusted according to xclip so that subsequent fetches
+ * match.
+ */
+ mov v3.8b, v4.8b /* make y coeffs available for vert4 and vert8 macros */
+ sub x14, x12, x13, LSL #(COMPONENT_SHIFT + 1)
+ add x15, x12, #(2 * CHUNKSIZE - 4) * COMPONENT_COUNT * 2
+ add x14, x14, #4 * COMPONENT_COUNT * 2
+.if \comp == 1
+ vert4 v12.4h
+ dup v11.4h, v12.h[0]
+ st1 {v11.4h,v12.4h}, [x12]
+ ld1 {v12.4h}, [x14]
+ st1 {v12.4h}, [x15]
+.elseif \comp == 2
+ vert8
+ dup v11.4s, v12.s[0]
+ st1 {v11.8h,v12.8h}, [x12]
+ ld1 {v12.8h}, [x14]
+ st1 {v12.8h}, [x15]
+.elseif \comp == 4
+ vert8 v14.4h, v14.8h
+ vert8 v15.4h, v15.8h
+ dup v12.2d, v14.d[0]
+ dup v13.2d, v14.d[0]
+ st1 {v12.8h,v13.8h}, [x12], #32
+ st1 {v14.8h,v15.8h}, [x12]
+ sub x12, x12, #32
+ ld1 {v11.8h,v12.8h}, [x14]
+ st1 {v11.8h,v12.8h}, [x15]
+.endif
+ /* Count off four pixels into the working buffer.
+ */
+ sub x11, x11, #4
+ /* Incoming pointers were to the first _legal_ pixel. Four pixels
+ * were read unconditionally, but some may have been discarded by
+ * xclip, so we rewind the pointers to compensate.
+ */
+ sub x4, x4, x13, LSL #(COMPONENT_SHIFT)
+ sub x5, x5, x13, LSL #(COMPONENT_SHIFT)
+ sub x6, x6, x13, LSL #(COMPONENT_SHIFT)
+ sub x7, x7, x13, LSL #(COMPONENT_SHIFT)
+
+ /* First tap starts where we just pre-filled, at the end of the
+ * buffer.
+ */
+ add x2, x2, #(CHUNKSIZE * 2 - 4) << 16
+
+ /* Use overflowing arithmetic to implement wraparound array
+ * indexing.
+ */
+ lsl x2, x2, #(47 - CHUNKSHIFT)
+ lsl x3, x3, #(47 - CHUNKSHIFT)
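+
+ /* From here on a tap's ring-buffer index is recovered as, roughly:
+ *
+ *     idx = x2 >> (63 - CHUNKSHIFT); // in [0, 2 * CHUNKSIZE)
+ *     x2 += x3; // 64-bit overflow wraps the index mod 2 * CHUNKSIZE
+ *
+ * which is the addressing pattern used throughout the inner loop.
+ */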
+
+
+ /* Start of outermost loop.
+ * Fetch CHUNKSIZE pixels into scratch buffer, then calculate the
+ * number of iterations of the inner loop that can be performed and
+ * get into that.
+ *
+ * The fill is complicated by the possibility of running out of
+ * input before the scratch buffer is filled. If this isn't a risk
+ * then it's handled by the simple loop at 2:, otherwise the
+ * horrible loop at 3:.
+ */
+1: mov v3.8b, v4.8b /* put y scaling coefficients somewhere handy */
+ subs x11, x11, #CHUNKSIZE
+ bge 2f /* if at least CHUNKSIZE are available... */
+ add x11, x11, #CHUNKSIZE /* if they're not... */
+ b 4f
+ /* basic fill loop, processing 8 bytes at a time until there are
+ * fewer than eight bytes available.
+ */
+3: vert8
+ sub x11, x11, #8 / COMPONENT_COUNT
+ st1 {v12.8h}, [x12], #16
+4: cmp x11, #8 / COMPONENT_COUNT - 1
+ bgt 3b
+.if \comp == 4
+ blt 3f
+ /* The last pixel (four bytes) if necessary */
+ vert4
+.else
+ cmp x11, #1
+ blt 3f
+ /* The last pixels if necessary */
+ sub x4, x4, #8
+ sub x5, x5, #8
+ sub x6, x6, #8
+ sub x7, x7, #8
+ add x4, x4, x11, LSL #(COMPONENT_SHIFT)
+ add x5, x5, x11, LSL #(COMPONENT_SHIFT)
+ add x6, x6, x11, LSL #(COMPONENT_SHIFT)
+ add x7, x7, x11, LSL #(COMPONENT_SHIFT)
+ vert8
+ sub x11, sp, x11, LSL #(COMPONENT_SHIFT + 1)
+ sub sp, sp, #32
+ sub x11, x11, #16
+.if \comp == 1
+ dup v13.8h, v12.h[7]
+.elseif \comp == 2
+ dup v13.4s, v12.s[3]
+.endif
+ st1 {v12.8h,v13.8h}, [sp]
+ ld1 {v12.8h}, [x11]
+ add sp, sp, #32
+ b 4f
+.endif
+ /* Keep filling until we get to the end of this chunk of the buffer */
+3:
+.if \comp == 1
+ dup v12.8h, v12.h[7]
+.elseif \comp == 2
+ dup v12.4s, v12.s[3]
+.elseif \comp == 4
+ dup v12.2d, v12.d[1]
+.endif
+4: st1 {v12.8h}, [x12], #16
+ tst x12, #(CHUNKSIZE - 1) * COMPONENT_COUNT * 2
+ bne 3b
+ b 4f
+
+.align 4
+2: /* Quickly pull a chunk of data into the working buffer.
+ */
+ vert8
+ st1 {v12.8h}, [x12], #16
+ vert8
+ st1 {v12.8h}, [x12], #16
+ tst x12, #(CHUNKSIZE - 1) * COMPONENT_COUNT * 2
+ bne 2b
+ cmp x11, #0
+ bne 3f
+4: /* if we end with 0 pixels left we'll have nothing handy to spread
+ * across to the right, so we rewind a bit.
+ */
+ mov x11, #1
+ sub x4, x4, #COMPONENT_COUNT
+ sub x5, x5, #COMPONENT_COUNT
+ sub x6, x6, #COMPONENT_COUNT
+ sub x7, x7, #COMPONENT_COUNT
+3: /* copy four taps (width of cubic window) to far end for overflow
+ * address handling
+ */
+ sub x13, x12, #CHUNKSIZE * COMPONENT_COUNT * 2
+ eor x12, x13, #CHUNKSIZE * COMPONENT_COUNT * 2
+.if \comp == 1
+ ld1 {v14.4h}, [x13]
+.elseif \comp == 2
+ ld1 {v14.8h}, [x13]
+.elseif \comp == 4
+ ld1 {v14.8h,v15.8h}, [x13]
+.endif
+ add x13, x12, #CHUNKSIZE * COMPONENT_COUNT * 2
+.if \comp == 1
+ st1 {v14.4h}, [x13]
+.elseif \comp == 2
+ st1 {v14.8h}, [x13]
+.elseif \comp == 4
+ st1 {v14.8h,v15.8h}, [x13]
+.endif
+ /* The high 32-bits of x10 contains the maximum possible iteration
+ * count, but if x8 is greater than the low 32-bits of x10 then
+ * this indicates that the count must be reduced by one for this
+ * iteration to avoid reading past the end of the available data.
+ */
+ sub x13, x10, x8
+ lsr x13, x13, #32
+
+ madd x8, x13, x9, x8
+ sub x8, x8, #(CHUNKSIZE << 16)
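+
+ /* In C terms, roughly:
+ *
+ *     iters = (osc_ctl - phase) >> 32; // quotient, minus 1 when the low
+ *                                      // half borrows (phase > remainder)
+ *     phase += iters * (xinc << VECSHIFT) - (CHUNKSIZE << 16);
+ *
+ * where phase is the running fixed-point position kept in x8.
+ */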
+
+ /* prefer to count pixels, rather than vectors, to clarify the tail
+ * store case on exit.
+ */
+ lsl x13, x13, #VECSHIFT
+ cmp x13, x1
+ csel x13, x1, x13, gt
+
+ sub x1, x1, x13
+
+ lsl x13, x13, #COMPONENT_SHIFT
+
+ mov w14, #0x8000
+ movi v30.8h, #3
+ dup v31.8h, w14
+
+ cmp x13, #0
+ bgt 3f
+ cmp x1, #0
+ bgt 1b /* an extreme case where we shouldn't use code in this structure */
+ b 9f
+
+ .align 4
+2: /* Inner loop continues here, but starts at 3:, see end of loop
+ * below for explanation. */
+.if LOOP_OUTPUT_SIZE == 4
+ st1 {v8.s}[0], [x0], #4
+.elseif LOOP_OUTPUT_SIZE == 8
+ st1 {v8.8b}, [x0], #8
+.elseif LOOP_OUTPUT_SIZE == 16
+ st1 {v8.16b}, [x0], #16
+.elseif LOOP_OUTPUT_SIZE == 32
+ st1 {v8.16b,v9.16b}, [x0], #32
+.endif
+ /* Inner loop: here the four x coefficients for each tap are
+ * calculated in vector code, and the addresses are calculated in
+ * scalar code, and these calculations are interleaved.
+ */
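+ /* For reference, the four tap weights assembled below correspond to the
+ * usual cubic kernel (a plain C sketch; t is the x fraction in [0,1)):
+ *
+ *     w0 = 0.5f * (2*t*t - t*t*t - t);
+ *     w1 = 0.5f * (2 - 5*t*t + 3*t*t*t);
+ *     w2 = 0.5f * (t + 4*t*t - 3*t*t*t);
+ *     w3 = 0.5f * (t*t*t - t*t);
+ *
+ * evaluated in Q15 fixed point, with signs folded into the later choice of
+ * multiply-accumulate versus multiply-subtract.
+ */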
+3: ushr v8.8h, v6.8h, #1 // sxf
+ lsr x14, x2, #(63 - CHUNKSHIFT)
+ sqrdmulh v9.8h, v8.8h, v8.8h // sxf**2
+ add x2, x2, x3
+ sqrdmulh v10.8h, v9.8h, v8.8h // sxf**3
+ lsr x15, x2, #(63 - CHUNKSHIFT)
+ sshll v11.4s, v9.4h, #2
+ sshll2 v12.4s, v9.8h, #2
+ add x2, x2, x3
+ smlsl v11.4s, v10.4h, v30.4h
+ smlsl2 v12.4s, v10.8h, v30.8h
+ lsr x16, x2, #(63 - CHUNKSHIFT)
+
+ shadd v0.8h, v10.8h, v8.8h
+ add x2, x2, x3
+ sub v0.8h, v9.8h, v0.8h
+ lsr x17, x2, #(63 - CHUNKSHIFT)
+
+ saddw v1.4s, v11.4s, v9.4h
+ saddw2 v13.4s, v12.4s, v9.8h
+ add x2, x2, x3
+ shrn v1.4h, v1.4s, #1
+ shrn2 v1.8h, v13.4s, #1
+ add x14, sp, x14, LSL #(COMPONENT_SHIFT + 1)
+ sub v1.8h, v1.8h, v31.8h
+ add x15, sp, x15, LSL #(COMPONENT_SHIFT + 1)
+
+ saddw v2.4s, v11.4s, v8.4h
+ saddw2 v13.4s, v12.4s, v8.8h
+ add x16, sp, x16, LSL #(COMPONENT_SHIFT + 1)
+ shrn v2.4h, v2.4s, #1
+ shrn2 v2.8h, v13.4s, #1
+ add x17, sp, x17, LSL #(COMPONENT_SHIFT + 1)
+ neg v2.8h, v2.8h
+
+ shsub v3.8h, v10.8h, v9.8h
+
+ /* increment the x fractional parts (overflow is ignored, as the
+ * scalar arithmetic shadows this addition with full precision).
+ */
+ add v6.8h, v6.8h, v7.8h
+
+ /* At this point we have four pointers in x8-x11, pointing to the
+ * four taps in the scratch buffer that must be convolved together
+ * to produce an output pixel (one output pixel per pointer).
+ * These pointers usually overlap, but their spacing is irregular
+ * so resolving the redundancy through L1 is a pragmatic solution.
+ *
+ * The scratch buffer is made of signed 16-bit data, holding over
+ * some extra precision, and overshoot, from the vertical pass.
+ *
+ * We also have the 16-bit unsigned fixed-point weights for each
+ * of the four taps in v0 - v3. That's eight pixels worth of
+ * coefficients when we have only four pointers, so calculations
+ * for four more pixels are interleaved with the fetch and permute
+ * code for each variant in the following code.
+ *
+ * The data arrangement is less than ideal for any pixel format,
+ * but permuting loads help to mitigate most of the problems.
+ *
+ * Note also that the two outside taps of a bicubic are negative,
+ * but these coefficients are unsigned. The sign is hard-coded by
+ * use of multiply-and-subtract operations.
+ */
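+ /* Per output component the arithmetic below boils down to, roughly:
+ *
+ *     int32_t acc = t0*w0 + t1*w1 + t2*w2 + t3*w3; // taps are VERTBITS data
+ *     uint8_t out = sat_u8(round(acc >> 15) >> (VERTBITS - 8));
+ *
+ * implemented by the sqrshrn/sqrshrun pairs (sat_u8 and round stand in for
+ * the saturating, rounding narrows; they are not real helpers here).
+ */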
+.if \comp == 1
+ /* The uchar1 case.
+ * Issue one lanewise ld4.h to load four consecutive pixels from
+ * one pointer (one pixel) into four different registers; then load
+ * four consecutive s16 values from the next pointer (pixel) into
+ * the next lane of those four registers, etc., so that we finish
+ * with v12 - v15 representing the four taps, and each lane
+ * representing a separate pixel.
+ *
+ * The first ld4 uses a splat to avoid any false dependency on
+ * the previous state of the register.
+ */
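+ /* In effect a 4x8 transposing gather, roughly:
+ *
+ *     for (lane = 0; lane < 8; lane++)
+ *         for (tap = 0; tap < 4; tap++)
+ *             vtap[tap][lane] = ptr[lane][tap]; // s16 elements
+ *
+ * (loop form is illustrative; the ld4 lane loads do this directly, with the
+ * pointers for lanes 4-7 computed while the first four loads issue).
+ */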
+ ld4r {v12.8h,v13.8h,v14.8h,v15.8h}, [x14]
+ lsr x14, x2, #(63 - CHUNKSHIFT)
+ add x2, x2, x3
+ ld4 {v12.h,v13.h,v14.h,v15.h}[1], [x15]
+ add x14, sp, x14, LSL #(COMPONENT_SHIFT + 1)
+ lsr x15, x2, #(63 - CHUNKSHIFT)
+ add x2, x2, x3
+ ld4 {v12.h,v13.h,v14.h,v15.h}[2], [x16]
+ add x15, sp, x15, LSL #(COMPONENT_SHIFT + 1)
+ lsr x16, x2, #(63 - CHUNKSHIFT)
+ add x2, x2, x3
+ ld4 {v12.h,v13.h,v14.h,v15.h}[3], [x17]
+ add x16, sp, x16, LSL #(COMPONENT_SHIFT + 1)
+ lsr x17, x2, #(63 - CHUNKSHIFT)
+ add x2, x2, x3
+ ld4 {v12.h,v13.h,v14.h,v15.h}[4], [x14]
+ add x17, sp, x17, LSL #(COMPONENT_SHIFT + 1)
+ ld4 {v12.h,v13.h,v14.h,v15.h}[5], [x15]
+ ld4 {v12.h,v13.h,v14.h,v15.h}[6], [x16]
+ ld4 {v12.h,v13.h,v14.h,v15.h}[7], [x17]
+
+ smull v8.4s, v12.4h, v0.4h
+ smull2 v9.4s, v12.8h, v0.8h
+ smlsl v8.4s, v13.4h, v1.4h
+ smlsl2 v9.4s, v13.8h, v1.8h
+ smlsl v8.4s, v14.4h, v2.4h
+ smlsl2 v9.4s, v14.8h, v2.8h
+ smlal v8.4s, v15.4h, v3.4h
+ smlal2 v9.4s, v15.8h, v3.8h
+
+ subs x13, x13, #LOOP_OUTPUT_SIZE
+
+ sqrshrn v8.4h, v8.4s, #15
+ sqrshrn2 v8.8h, v9.4s, #15
+
+ sqrshrun v8.8b, v8.8h, #VERTBITS - 8
+.elseif \comp == 2
+ /* The uchar2 case:
+ * This time load pairs of values into adjacent lanes in v12 - v15
+ * by aliasing them as u32 data; leaving room for only four pixels,
+ * so the process has to be done twice. This also means that the
+ * coefficient registers fail to align with the coefficient data
+ * (eight separate pixels), so that has to be doubled-up to match.
+ */
+ ld4r {v12.4s,v13.4s,v14.4s,v15.4s}, [x14]
+ lsr x14, x2, #(63 - CHUNKSHIFT)
+ add x2, x2, x3
+ ld4 {v12.s,v13.s,v14.s,v15.s}[1], [x15]
+ add x14, sp, x14, LSL #(COMPONENT_SHIFT + 1)
+ lsr x15, x2, #(63 - CHUNKSHIFT)
+ add x2, x2, x3
+ ld4 {v12.s,v13.s,v14.s,v15.s}[2], [x16]
+ add x15, sp, x15, LSL #(COMPONENT_SHIFT + 1)
+ lsr x16, x2, #(63 - CHUNKSHIFT)
+ add x2, x2, x3
+ ld4 {v12.s,v13.s,v14.s,v15.s}[3], [x17]
+ add x16, sp, x16, LSL #(COMPONENT_SHIFT + 1)
+ lsr x17, x2, #(63 - CHUNKSHIFT)
+ add x2, x2, x3
+
+ /* double-up coefficients to align with component pairs */
+ zip1 v16.8h, v0.8h, v0.8h
+ add x17, sp, x17, LSL #(COMPONENT_SHIFT + 1)
+ zip1 v17.8h, v1.8h, v1.8h
+ zip1 v18.8h, v2.8h, v2.8h
+ zip1 v19.8h, v3.8h, v3.8h
+
+ smull v8.4s, v12.4h, v16.4h
+ smull2 v9.4s, v12.8h, v16.8h
+ smlsl v8.4s, v13.4h, v17.4h
+ smlsl2 v9.4s, v13.8h, v17.8h
+ smlsl v8.4s, v14.4h, v18.4h
+ smlsl2 v9.4s, v14.8h, v18.8h
+ smlal v8.4s, v15.4h, v19.4h
+ smlal2 v9.4s, v15.8h, v19.8h
+
+ sqrshrn v8.4h, v8.4s, #15
+ sqrshrn2 v8.8h, v9.4s, #15
+
+ ld4r {v12.4s,v13.4s,v14.4s,v15.4s}, [x14]
+ ld4 {v12.s,v13.s,v14.s,v15.s}[1], [x15]
+ ld4 {v12.s,v13.s,v14.s,v15.s}[2], [x16]
+ ld4 {v12.s,v13.s,v14.s,v15.s}[3], [x17]
+
+ /* double-up coefficients to align with component pairs */
+ zip2 v16.8h, v0.8h, v0.8h
+ zip2 v17.8h, v1.8h, v1.8h
+ zip2 v18.8h, v2.8h, v2.8h
+ zip2 v19.8h, v3.8h, v3.8h
+
+ smull v10.4s, v12.4h, v16.4h
+ smull2 v11.4s, v12.8h, v16.8h
+ smlsl v10.4s, v13.4h, v17.4h
+ smlsl2 v11.4s, v13.8h, v17.8h
+ smlsl v10.4s, v14.4h, v18.4h
+ smlsl2 v11.4s, v14.8h, v18.8h
+ smlal v10.4s, v15.4h, v19.4h
+ smlal2 v11.4s, v15.8h, v19.8h
+
+ subs x13, x13, #LOOP_OUTPUT_SIZE
+
+ sqrshrn v9.4h, v10.4s, #15
+ sqrshrn2 v9.8h, v11.4s, #15
+
+ sqrshrun v8.8b, v8.8h, #VERTBITS - 8
+ sqrshrun2 v8.16b, v9.8h, #VERTBITS - 8
+.elseif \comp == 4
+ /* The uchar4 case.
+ * This case is comparatively painless because four s16s are the
+ * smallest addressable unit for a vmul-by-scalar. Rather than
+ * permute the data, simply arrange the multiplies to suit the way
+ * the data comes in. That's a lot of data, though, so things
+ * progress in pairs of pixels at a time.
+ */
+ ld1 {v12.8h,v13.8h}, [x14]
+ lsr x14, x2, #(63 - CHUNKSHIFT)
+ add x2, x2, x3
+ ld1 {v14.8h,v15.8h}, [x15]
+ add x14, sp, x14, LSL #(COMPONENT_SHIFT + 1)
+ lsr x15, x2, #(63 - CHUNKSHIFT)
+ add x2, x2, x3
+
+ smull v8.4s, v12.4h, v0.h[0]
+ smull v9.4s, v14.4h, v0.h[1]
+ smlsl2 v8.4s, v12.8h, v1.h[0]
+ smlsl2 v9.4s, v14.8h, v1.h[1]
+ smlsl v8.4s, v13.4h, v2.h[0]
+ smlsl v9.4s, v15.4h, v2.h[1]
+ smlal2 v8.4s, v13.8h, v3.h[0]
+ smlal2 v9.4s, v15.8h, v3.h[1]
+
+ /* And two more... */
+ ld1 {v12.8h,v13.8h}, [x16]
+ add x15, sp, x15, LSL #(COMPONENT_SHIFT + 1)
+ lsr x16, x2, #(63 - CHUNKSHIFT)
+ add x2, x2, x3
+ ld1 {v14.8h,v15.8h}, [x17]
+ add x16, sp, x16, LSL #(COMPONENT_SHIFT + 1)
+ lsr x17, x2, #(63 - CHUNKSHIFT)
+ add x2, x2, x3
+
+ sqrshrn v8.4h, v8.4s, #15
+ add x17, sp, x17, LSL #(COMPONENT_SHIFT + 1)
+ sqrshrn2 v8.8h, v9.4s, #15
+
+ smull v10.4s, v12.4h, v0.h[2]
+ smull v11.4s, v14.4h, v0.h[3]
+ smlsl2 v10.4s, v12.8h, v1.h[2]
+ smlsl2 v11.4s, v14.8h, v1.h[3]
+ smlsl v10.4s, v13.4h, v2.h[2]
+ smlsl v11.4s, v15.4h, v2.h[3]
+ smlal2 v10.4s, v13.8h, v3.h[2]
+ smlal2 v11.4s, v15.8h, v3.h[3]
+
+ sqrshrn v9.4h, v10.4s, #15
+ sqrshrn2 v9.8h, v11.4s, #15
+
+ sqrshrun v8.8b, v8.8h, #VERTBITS - 8
+ sqrshrun2 v8.16b, v9.8h, #VERTBITS - 8
+
+ /* And two more... */
+ ld1 {v12.8h,v13.8h}, [x14]
+ ld1 {v14.8h,v15.8h}, [x15]
+
+ smull v10.4s, v12.4h, v0.h[4]
+ smull v11.4s, v14.4h, v0.h[5]
+ smlsl2 v10.4s, v12.8h, v1.h[4]
+ smlsl2 v11.4s, v14.8h, v1.h[5]
+ smlsl v10.4s, v13.4h, v2.h[4]
+ smlsl v11.4s, v15.4h, v2.h[5]
+ smlal2 v10.4s, v13.8h, v3.h[4]
+ smlal2 v11.4s, v15.8h, v3.h[5]
+
+ /* And two more... */
+ ld1 {v12.8h,v13.8h}, [x16]
+ ld1 {v14.8h,v15.8h}, [x17]
+
+ subs x13, x13, #LOOP_OUTPUT_SIZE
+
+ sqrshrn v9.4h, v10.4s, #15
+ sqrshrn2 v9.8h, v11.4s, #15
+
+ smull v10.4s, v12.4h, v0.h[6]
+ smull v11.4s, v14.4h, v0.h[7]
+ smlsl2 v10.4s, v12.8h, v1.h[6]
+ smlsl2 v11.4s, v14.8h, v1.h[7]
+ smlsl v10.4s, v13.4h, v2.h[6]
+ smlsl v11.4s, v15.4h, v2.h[7]
+ smlal2 v10.4s, v13.8h, v3.h[6]
+ smlal2 v11.4s, v15.8h, v3.h[7]
+
+ sqrshrn v10.4h, v10.4s, #15
+ sqrshrn2 v10.8h, v11.4s, #15
+
+ sqrshrun v9.8b, v9.8h, #VERTBITS - 8
+ sqrshrun2 v9.16b, v10.8h, #VERTBITS - 8
+.endif
+ bgt 2b /* continue inner loop */
+ /* The inner loop has already been limited to ensure that none of
+ * the earlier iterations could overfill the output, so the store
+ * appears within the loop but after the conditional branch (at the
+ * top). At the end, provided it won't overfill, perform the final
+ * store here. If it would, then break out to the tricky tail case
+ * instead.
+ */
+ blt 1f
+ /* Store the amount of data appropriate to the configuration of the
+ * instance being assembled.
+ */
+.if LOOP_OUTPUT_SIZE == 4
+ st1 {v8.s}[0], [x0], #4
+.elseif LOOP_OUTPUT_SIZE == 8
+ st1 {v8.8b}, [x0], #8
+.elseif LOOP_OUTPUT_SIZE == 16
+ st1 {v8.16b}, [x0], #16
+.elseif LOOP_OUTPUT_SIZE == 32
+ st1 {v8.16b,v9.16b}, [x0], #32
+.endif
+ b 1b /* resume outer loop */
+ /* Partial tail store case:
+ * Different versions of the code need different subsets of the
+ * following partial stores. Here the number of components and the
+ * size of the chunk of data produced by each inner loop iteration
+ * is tested to figure out whether or not each phrase is relevant.
+ */
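+ /* A scalar sketch of the cascade (tail = bytes still to be stored):
+ *
+ *     for (b = 16; b >= 1; b >>= 1)
+ *         if (tail & b) { store the low b bytes of v8; shift v8 down by b; }
+ *
+ * with the shift done by ext/mov and only the sizes relevant to this variant
+ * assembled, per the .if conditions below.
+ */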
+.if 16 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 16
+1: tst x13, #16
+ beq 1f
+ st1 {v8.16b}, [x0], #16
+ mov v8.16b, v9.16b
+.endif
+.if 8 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 8
+1: tst x13, #8
+ beq 1f
+ st1 {v8.8b}, [x0], #8
+ ext v8.16b, v8.16b, v8.16b, #8
+.endif
+.if 4 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 4
+1: tst x13, #4
+ beq 1f
+ st1 {v8.s}[0], [x0], #4
+ ext v8.8b, v8.8b, v8.8b, #4
+.endif
+.if 2 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 2
+1: tst x13, #2
+ beq 1f
+ st1 {v8.h}[0], [x0], #2
+ ext v8.8b, v8.8b, v8.8b, #2
+.endif
+.if 1 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 1
+1: tst x13, #1
+ beq 1f
+ st1 {v8.b}[0], [x0], #1
+.endif
+1:
+9: mov sp, x19
+ ld1 {v8.1d - v11.1d}, [sp], #32
+ ld1 {v12.1d - v15.1d}, [sp], #32
+ ldr x19, [sp], #16
+ ret
+END(rsdIntrinsicResizeB\comp\()_K)
+.endr
+
+.rodata
+intrinsic_resize_consts: .hword 0, 1, 2, 3, 4, 5, 6, 7