Diffstat (limited to 'renderscript-toolkit/src/main/cpp/Resize_advsimd.S')
-rw-r--r--  renderscript-toolkit/src/main/cpp/Resize_advsimd.S  754
1 file changed, 754 insertions, 0 deletions
diff --git a/renderscript-toolkit/src/main/cpp/Resize_advsimd.S b/renderscript-toolkit/src/main/cpp/Resize_advsimd.S
new file mode 100644
index 0000000..59e735c
--- /dev/null
+++ b/renderscript-toolkit/src/main/cpp/Resize_advsimd.S
@@ -0,0 +1,754 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
+#define END(f) .size f, .-f;
+
+/* Fixed-point precision after vertical pass -- 16 bit data minus 1 sign and 1
+ * integer (bicubic has a little overshoot). It would also be possible to add
+ * a temporary DC bias to eliminate the sign bit for more precision, but that's
+ * extra arithmetic.
+ */
+.set VERTBITS, 14
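+
+/* Concretely: 1 sign bit + 1 integer bit + VERTBITS fraction bits fill the
+ * 16-bit intermediates, so 1.0 is held as 1 << VERTBITS == 0x4000 and the
+ * representable range of roughly (-2.0, +2.0) absorbs the bicubic overshoot.
+ */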
+
+/* The size of the scratch buffer in which we store our vertically convolved
+ * intermediates.
+ */
+.set CHUNKSHIFT, 7 /* 5 tests better for uchar4, but 7 is necessary for ridiculous (10:1) scale factors */
+.set CHUNKSIZE, (1 << CHUNKSHIFT)
+
+/* The number of components processed in a single iteration of the innermost
+ * loop.
+ */
+.set VECSHIFT, 3
+.set VECSIZE, (1<<VECSHIFT)
+
+/* Read four different lines (except at edges where addresses may be clamped,
+ * which is why we don't simply take base and stride registers), and multiply
+ * and accumulate them by the coefficients in v3[0..3], leaving the results in
+ * v12. This gives eight 16-bit results representing a horizontal line of 2-8
+ * input pixels (depending on number of components per pixel) to be fed into
+ * the horizontal scaling pass.
+ *
+ * Input coefficients are 16-bit unsigned fixed-point (although [0] and [3] are
+ * known to represent negative values and VMLS is used to implement this).
+ * Output is VERTBITS signed fixed-point, which must leave room for a little
+ * overshoot beyond [0,1.0).
+ */
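+
+/* A rough per-lane C sketch of what this macro computes (names here are
+ * illustrative, not the toolkit's own):
+ *
+ *     int32_t acc = (int32_t)yr1 * p1 + (int32_t)yr2 * p2
+ *                 - (int32_t)yr0 * p0 - (int32_t)yr3 * p3;
+ *     int16_t out = sat16(acc >> (8 + 16 - VERTBITS)); // sqshrn saturates
+ *
+ * where p0..p3 are four vertically adjacent 8-bit pixels and yr0..yr3 are the
+ * 16-bit fixed-point y coefficients in v3.h[0..3].
+ */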
+.macro vert8, dstlo=v12.4h, dsthi=v12.8h
+ ld1 {v8.8b}, [x4], #8
+ ld1 {v9.8b}, [x5], #8
+ ld1 {v10.8b}, [x6], #8
+ ld1 {v11.8b}, [x7], #8
+ uxtl v8.8h, v8.8b
+ uxtl v9.8h, v9.8b
+ uxtl v10.8h, v10.8b
+ uxtl v11.8h, v11.8b
+ umull v12.4s, v9.4h, v3.h[1]
+ umull2 v13.4s, v9.8h, v3.h[1]
+ umlsl v12.4s, v8.4h, v3.h[0]
+ umlsl2 v13.4s, v8.8h, v3.h[0]
+ umlal v12.4s, v10.4h, v3.h[2]
+ umlal2 v13.4s, v10.8h, v3.h[2]
+ umlsl v12.4s, v11.4h, v3.h[3]
+ umlsl2 v13.4s, v11.8h, v3.h[3]
+
+ /* Shift by 8 (bits per pixel), plus 16 (the fixed-point multiplies),
+ * minus VERTBITS (the number of fraction bits we want to keep from
+ * here on).
+ */
+ sqshrn \dstlo, v12.4s, #8 + (16 - VERTBITS)
+ sqshrn2 \dsthi, v13.4s, #8 + (16 - VERTBITS)
+.endm
+
+/* As above, but producing only four 16-bit results, by default packed into
+ * the high half of v12.
+ */
+.macro vert4, dst=v12.8h
+ ld1 {v8.s}[0], [x4], #4
+ ld1 {v9.s}[0], [x5], #4
+ ld1 {v10.s}[0], [x6], #4
+ ld1 {v11.s}[0], [x7], #4
+ uxtl v8.8h, v8.8b
+ uxtl v9.8h, v9.8b
+ uxtl v10.8h, v10.8b
+ uxtl v11.8h, v11.8b
+ umull v12.4s, v9.4h, v3.h[1]
+ umlsl v12.4s, v8.4h, v3.h[0]
+ umlal v12.4s, v10.4h, v3.h[2]
+ umlsl v12.4s, v11.4h, v3.h[3]
+.ifc \dst,v12.8h
+ sqshrn2 \dst, v12.4s, #8 + (16 - VERTBITS)
+.else
+ sqshrn \dst, v12.4s, #8 + (16 - VERTBITS)
+.endif
+.endm
+
+
+/* During horizontal resize having CHUNKSIZE input available means being able
+ * to produce a varying amount of output, depending on the phase of the data.
+ * This function calculates the minimum number of VECSIZE chunks extracted from
+ * a CHUNKSIZE window (x1), and the threshold value for when the count will be
+ * one higher than that (x0).
+ * These work out, conveniently, to be the quotient and remainder from:
+ * (CHUNKSIZE + xinc * VECSIZE - 1) / (xinc * VECSIZE)
+ *
+ * The two values are packed together in a uint64_t for convenience; and
+ * they are, in fact, used this way as an arithmetic short-cut later on.
+ */
+/* uint64_t rsdIntrinsicResize_oscctl_K(uint32_t xinc) */
+ENTRY(rsdIntrinsicResize_oscctl_K)
+ lsl x2, x0, #VECSHIFT
+ mov x0, #(CHUNKSIZE << 16) - 1
+ add x0, x0, x2
+ udiv x1, x0, x2
+ msub x0, x1, x2, x0
+ add x0, x0, x1, LSL #32
+ ret
+END(rsdIntrinsicResize_oscctl_K)
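+
+/* A C sketch of the same calculation (illustrative; xinc is the 16.16
+ * fixed-point x increment passed in x0):
+ *
+ *     uint64_t oscctl(uint32_t xinc) {
+ *         uint64_t step = (uint64_t)xinc << VECSHIFT;        // xinc * VECSIZE
+ *         uint64_t num = ((uint64_t)CHUNKSIZE << 16) - 1 + step;
+ *         return ((num / step) << 32) | (num % step);        // quot:rem
+ *     }
+ */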
+
+/* Iterate to generate the uchar1, uchar2, and uchar4 versions of the code.
+ * For the most part the vertical pass (the outer loop) is the same for all
+ * versions. Exceptions are handled in-line with conditional assembly.
+ */
+.irp comp, 1, 2, 4
+.if \comp == 1
+.set COMPONENT_SHIFT, 0
+.elseif \comp == 2
+.set COMPONENT_SHIFT, 1
+.elseif \comp == 4
+.set COMPONENT_SHIFT, 2
+.else
+.error "Unknown component count"
+.endif
+.set COMPONENT_COUNT, (1 << COMPONENT_SHIFT)
+.set LOOP_OUTPUT_SIZE, (VECSIZE * COMPONENT_COUNT)
+
+.set BUFFER_SIZE, (CHUNKSIZE * 2 + 4) * COMPONENT_COUNT * 2
+
+/* void rsdIntrinsicResizeB1_K(
+ * uint8_t * restrict dst, // x0
+ * size_t count, // x1
+ * uint32_t xf, // x2
+ * uint32_t xinc, // x3
+ * uint8_t const * restrict srcn, // x4
+ * uint8_t const * restrict src0, // x5
+ * uint8_t const * restrict src1, // x6
+ * uint8_t const * restrict src2, // x7
+ * size_t xclip, // [sp,#0] -> [sp,#80] -> x13
+ * size_t avail, // [sp,#8] -> [sp,#88] -> x11
+ * uint64_t osc_ctl, // [sp,#16] -> [sp,#96] -> x10
+ * int32 const *yr); // [sp,#24] -> [sp,#104] -> v4 (copied to v3 for scalar access)
+ */
+ENTRY(rsdIntrinsicResizeB\comp\()_K)
+ sub x8, sp, #48
+ sub sp, sp, #80
+ st1 {v8.1d - v11.1d}, [sp]
+ st1 {v12.1d - v15.1d}, [x8]
+ str x19, [x8, #32]
+
+ /* align the working buffer on the stack to make it easy to use bit
+ * twiddling for address calculations.
+ */
+ sub x12, sp, #BUFFER_SIZE
+ bic x12, x12, #(1 << (CHUNKSHIFT + 1 + COMPONENT_SHIFT + 1)) - 1
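+ /* Roughly: buf = (sp - BUFFER_SIZE) & ~(size_t)(2 * CHUNKSIZE *
+ * COMPONENT_COUNT * 2 - 1), so the low bits of a buffer pointer are its
+ * byte offset within the double-chunk ring and can be masked (tst) or
+ * flipped (eor) for wraparound, as done further below.
+ */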
+
+ ldr x8, [sp,#104] // yr
+ adrp x9, intrinsic_resize_consts
+ add x9, x9, :lo12:intrinsic_resize_consts
+ ld1 {v4.4s}, [x8]
+ ld1 {v5.8h}, [x9]
+ sqxtun v4.4h, v4.4s // yr
+ dup v6.8h, w2
+ dup v7.8h, w3
+ mla v6.8h, v5.8h, v7.8h // vxf
+ shl v7.8h, v7.8h, #VECSHIFT // vxinc
+
+ /* Compute starting condition for oscillator used to compute ahead
+ * of time how many iterations are possible before needing to
+ * refill the working buffer. This is based on the fixed-point
+ * index of the last element in the vector of pixels processed in
+ * each iteration, counting up until it would overflow.
+ */
+ sub x8, x2, x3
+ lsl x9, x3, #VECSHIFT
+ add x8, x8, x9
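+ /* i.e. x8 = xf + xinc * (VECSIZE - 1), the 16.16 x position of the last
+ * element in the first vector of output pixels.
+ */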
+
+ ldr x10, [sp,#96] // osc_ctl
+ ldp x13,x11, [sp,#80] // xclip, avail
+
+ mov x19, sp
+ mov sp, x12
+
+ /* x4-x7 contain pointers to the four lines of input to be
+ * convolved. These pointers have been clamped vertically and
+ * horizontally (which is why it's not a simple row/stride pair),
+ * and the xclip argument (now in x13) indicates how many pixels
+ * from true the x position of the pointer is. This value should
+ * be 0, 1, or 2 only.
+ *
+ * Start by placing four pixels worth of input at the far end of
+ * the buffer. As many as two of these may be clipped, so four
+ * pixels are fetched, and then the first pixel is duplicated and
+ * the data shifted according to xclip. The source pointers are
+ * then also adjusted according to xclip so that subsequent fetches
+ * match.
+ */
+ mov v3.8b, v4.8b /* make y coeffs available for vert4 and vert8 macros */
+ sub x14, x12, x13, LSL #(COMPONENT_SHIFT + 1)
+ add x15, x12, #(2 * CHUNKSIZE - 4) * COMPONENT_COUNT * 2
+ add x14, x14, #4 * COMPONENT_COUNT * 2
+.if \comp == 1
+ vert4 v12.4h
+ dup v11.4h, v12.h[0]
+ st1 {v11.4h,v12.4h}, [x12]
+ ld1 {v12.4h}, [x14]
+ st1 {v12.4h}, [x15]
+.elseif \comp == 2
+ vert8
+ dup v11.4s, v12.s[0]
+ st1 {v11.8h,v12.8h}, [x12]
+ ld1 {v12.8h}, [x14]
+ st1 {v12.8h}, [x15]
+.elseif \comp == 4
+ vert8 v14.4h, v14.8h
+ vert8 v15.4h, v15.8h
+ dup v12.2d, v14.d[0]
+ dup v13.2d, v14.d[0]
+ st1 {v12.8h,v13.8h}, [x12], #32
+ st1 {v14.8h,v15.8h}, [x12]
+ sub x12, x12, #32
+ ld1 {v11.8h,v12.8h}, [x14]
+ st1 {v11.8h,v12.8h}, [x15]
+.endif
+ /* Count off four pixels into the working buffer.
+ */
+ sub x11, x11, #4
+ /* Incoming pointers were to the first _legal_ pixel. Four pixels
+ * were read unconditionally, but some may have been discarded by
+ * xclip, so we rewind the pointers to compensate.
+ */
+ sub x4, x4, x13, LSL #(COMPONENT_SHIFT)
+ sub x5, x5, x13, LSL #(COMPONENT_SHIFT)
+ sub x6, x6, x13, LSL #(COMPONENT_SHIFT)
+ sub x7, x7, x13, LSL #(COMPONENT_SHIFT)
+
+ /* First tap starts where we just pre-filled, at the end of the
+ * buffer.
+ */
+ add x2, x2, #(CHUNKSIZE * 2 - 4) << 16
+
+ /* Use overflowing arithmetic to implement wraparound array
+ * indexing.
+ */
+ lsl x2, x2, #(47 - CHUNKSHIFT)
+ lsl x3, x3, #(47 - CHUNKSHIFT)
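+
+ /* From here on a tap's ring-buffer index is recovered as, roughly:
+ *
+ *     idx = x2 >> (63 - CHUNKSHIFT); // in [0, 2 * CHUNKSIZE)
+ *     x2 += x3; // 64-bit overflow wraps the index mod 2 * CHUNKSIZE
+ *
+ * which is the addressing pattern used throughout the inner loop.
+ */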
+
+
+ /* Start of outermost loop.
+ * Fetch CHUNKSIZE pixels into scratch buffer, then calculate the
+ * number of iterations of the inner loop that can be performed and
+ * get into that.
+ *
+ * The fill is complicated by the possibility of running out of
+ * input before the scratch buffer is filled. If this isn't a risk
+ * then it's handled by the simple loop at 2:, otherwise the
+ * horrible loop at 3:.
+ */
+1: mov v3.8b, v4.8b /* put y scaling coefficients somewhere handy */
+ subs x11, x11, #CHUNKSIZE
+ bge 2f /* if at least CHUNKSIZE are available... */
+ add x11, x11, #CHUNKSIZE /* if they're not... */
+ b 4f
+ /* basic fill loop, processing 8 bytes at a time until there are
+ * fewer than eight bytes available.
+ */
+3: vert8
+ sub x11, x11, #8 / COMPONENT_COUNT
+ st1 {v12.8h}, [x12], #16
+4: cmp x11, #8 / COMPONENT_COUNT - 1
+ bgt 3b
+.if \comp == 4
+ blt 3f
+ /* The last pixel (four bytes) if necessary */
+ vert4
+.else
+ cmp x11, #1
+ blt 3f
+ /* The last pixels if necessary */
+ sub x4, x4, #8
+ sub x5, x5, #8
+ sub x6, x6, #8
+ sub x7, x7, #8
+ add x4, x4, x11, LSL #(COMPONENT_SHIFT)
+ add x5, x5, x11, LSL #(COMPONENT_SHIFT)
+ add x6, x6, x11, LSL #(COMPONENT_SHIFT)
+ add x7, x7, x11, LSL #(COMPONENT_SHIFT)
+ vert8
+ sub x11, sp, x11, LSL #(COMPONENT_SHIFT + 1)
+ sub sp, sp, #32
+ sub x11, x11, #16
+.if \comp == 1
+ dup v13.8h, v12.h[7]
+.elseif \comp == 2
+ dup v13.4s, v12.s[3]
+.endif
+ st1 {v12.8h,v13.8h}, [sp]
+ ld1 {v12.8h}, [x11]
+ add sp, sp, #32
+ b 4f
+.endif
+ /* Keep filling until we get to the end of this chunk of the buffer */
+3:
+.if \comp == 1
+ dup v12.8h, v12.h[7]
+.elseif \comp == 2
+ dup v12.4s, v12.s[3]
+.elseif \comp == 4
+ dup v12.2d, v12.d[1]
+.endif
+4: st1 {v12.8h}, [x12], #16
+ tst x12, #(CHUNKSIZE - 1) * COMPONENT_COUNT * 2
+ bne 3b
+ b 4f
+
+.align 4
+2: /* Quickly pull a chunk of data into the working buffer.
+ */
+ vert8
+ st1 {v12.8h}, [x12], #16
+ vert8
+ st1 {v12.8h}, [x12], #16
+ tst x12, #(CHUNKSIZE - 1) * COMPONENT_COUNT * 2
+ bne 2b
+ cmp x11, #0
+ bne 3f
+4: /* if we end with 0 pixels left we'll have nothing handy to spread
+ * across to the right, so we rewind a bit.
+ */
+ mov x11, #1
+ sub x4, x4, #COMPONENT_COUNT
+ sub x5, x5, #COMPONENT_COUNT
+ sub x6, x6, #COMPONENT_COUNT
+ sub x7, x7, #COMPONENT_COUNT
+3: /* copy four taps (width of cubic window) to far end for overflow
+ * address handling
+ */
+ sub x13, x12, #CHUNKSIZE * COMPONENT_COUNT * 2
+ eor x12, x13, #CHUNKSIZE * COMPONENT_COUNT * 2
+.if \comp == 1
+ ld1 {v14.4h}, [x13]
+.elseif \comp == 2
+ ld1 {v14.8h}, [x13]
+.elseif \comp == 4
+ ld1 {v14.8h,v15.8h}, [x13]
+.endif
+ add x13, x12, #CHUNKSIZE * COMPONENT_COUNT * 2
+.if \comp == 1
+ st1 {v14.4h}, [x13]
+.elseif \comp == 2
+ st1 {v14.8h}, [x13]
+.elseif \comp == 4
+ st1 {v14.8h,v15.8h}, [x13]
+.endif
+ /* The high 32-bits of x10 contains the maximum possible iteration
+ * count, but if x8 is greater than the low 32-bits of x10 then
+ * this indicates that the count must be reduced by one for this
+ * iteration to avoid reading past the end of the available data.
+ */
+ sub x13, x10, x8
+ lsr x13, x13, #32
+
+ madd x8, x13, x9, x8
+ sub x8, x8, #(CHUNKSIZE << 16)
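+
+ /* In C terms, roughly:
+ *
+ *     iters = (osc_ctl - phase) >> 32; // quotient, minus 1 when the low
+ *                                      // half borrows (phase > remainder)
+ *     phase += iters * (xinc << VECSHIFT) - (CHUNKSIZE << 16);
+ *
+ * where phase is the running fixed-point position kept in x8.
+ */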
+
+ /* prefer to count pixels, rather than vectors, to clarify the tail
+ * store case on exit.
+ */
+ lsl x13, x13, #VECSHIFT
+ cmp x13, x1
+ csel x13, x1, x13, gt
+
+ sub x1, x1, x13
+
+ lsl x13, x13, #COMPONENT_SHIFT
+
+ mov w14, #0x8000
+ movi v30.8h, #3
+ dup v31.8h, w14
+
+ cmp x13, #0
+ bgt 3f
+ cmp x1, #0
+ bgt 1b /* an extreme case where we shouldn't use code in this structure */
+ b 9f
+
+ .align 4
+2: /* Inner loop continues here, but starts at 3:, see end of loop
+ * below for explanation. */
+.if LOOP_OUTPUT_SIZE == 4
+ st1 {v8.s}[0], [x0], #4
+.elseif LOOP_OUTPUT_SIZE == 8
+ st1 {v8.8b}, [x0], #8
+.elseif LOOP_OUTPUT_SIZE == 16
+ st1 {v8.16b}, [x0], #16
+.elseif LOOP_OUTPUT_SIZE == 32
+ st1 {v8.16b,v9.16b}, [x0], #32
+.endif
+ /* Inner loop: here the four x coefficients for each tap are
+ * calculated in vector code, and the addresses are calculated in
+ * scalar code, and these calculations are interleaved.
+ */
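+ /* For reference, the four tap weights assembled below correspond to the
+ * usual cubic kernel (a plain C sketch; t is the x fraction in [0,1)):
+ *
+ *     w0 = 0.5f * (2*t*t - t*t*t - t);
+ *     w1 = 0.5f * (2 - 5*t*t + 3*t*t*t);
+ *     w2 = 0.5f * (t + 4*t*t - 3*t*t*t);
+ *     w3 = 0.5f * (t*t*t - t*t);
+ *
+ * evaluated in Q15 fixed point, with signs folded into the later choice of
+ * multiply-accumulate versus multiply-subtract.
+ */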
+3: ushr v8.8h, v6.8h, #1 // sxf
+ lsr x14, x2, #(63 - CHUNKSHIFT)
+ sqrdmulh v9.8h, v8.8h, v8.8h // sxf**2
+ add x2, x2, x3
+ sqrdmulh v10.8h, v9.8h, v8.8h // sxf**3
+ lsr x15, x2, #(63 - CHUNKSHIFT)
+ sshll v11.4s, v9.4h, #2
+ sshll2 v12.4s, v9.8h, #2
+ add x2, x2, x3
+ smlsl v11.4s, v10.4h, v30.4h
+ smlsl2 v12.4s, v10.8h, v30.8h
+ lsr x16, x2, #(63 - CHUNKSHIFT)
+
+ shadd v0.8h, v10.8h, v8.8h
+ add x2, x2, x3
+ sub v0.8h, v9.8h, v0.8h
+ lsr x17, x2, #(63 - CHUNKSHIFT)
+
+ saddw v1.4s, v11.4s, v9.4h
+ saddw2 v13.4s, v12.4s, v9.8h
+ add x2, x2, x3
+ shrn v1.4h, v1.4s, #1
+ shrn2 v1.8h, v13.4s, #1
+ add x14, sp, x14, LSL #(COMPONENT_SHIFT + 1)
+ sub v1.8h, v1.8h, v31.8h
+ add x15, sp, x15, LSL #(COMPONENT_SHIFT + 1)
+
+ saddw v2.4s, v11.4s, v8.4h
+ saddw2 v13.4s, v12.4s, v8.8h
+ add x16, sp, x16, LSL #(COMPONENT_SHIFT + 1)
+ shrn v2.4h, v2.4s, #1
+ shrn2 v2.8h, v13.4s, #1
+ add x17, sp, x17, LSL #(COMPONENT_SHIFT + 1)
+ neg v2.8h, v2.8h
+
+ shsub v3.8h, v10.8h, v9.8h
+
+ /* increment the x fractional parts (overflow is ignored, as the
+ * scalar arithmetic shadows this addition with full precision).
+ */
+ add v6.8h, v6.8h, v7.8h
+
+ /* At this point we have four pointers in x8-x11, pointing to the
+ * four taps in the scratch buffer that must be convolved together
+ * to produce an output pixel (one output pixel per pointer).
+ * These pointers usually overlap, but their spacing is irregular
+ * so resolving the redundancy through L1 is a pragmatic solution.
+ *
+ * The scratch buffer is made of signed 16-bit data, holding over
+ * some extra precision, and overshoot, from the vertical pass.
+ *
+ * We also have the 16-bit unsigned fixed-point weights for each
+ * of the four taps in v0 - v3. That's eight pixels worth of
+ * coefficients when we have only four pointers, so calculations
+ * for four more pixels are interleaved with the fetch and permute
+ * code for each variant in the following code.
+ *
+ * The data arrangement is less than ideal for any pixel format,
+ * but permuting loads help to mitigate most of the problems.
+ *
+ * Note also that the two outside taps of a bicubic are negative,
+ * but these coefficients are unsigned. The sign is hard-coded by
+ * use of multiply-and-subtract operations.
+ */
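+ /* Per output component the arithmetic below boils down to, roughly:
+ *
+ *     int32_t acc = t0*w0 + t1*w1 + t2*w2 + t3*w3; // taps are VERTBITS data
+ *     uint8_t out = sat_u8(round(acc >> 15) >> (VERTBITS - 8));
+ *
+ * implemented by the sqrshrn/sqrshrun pairs (sat_u8 and round stand in for
+ * the saturating, rounding narrows; they are not real helpers here).
+ */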
+.if \comp == 1
+ /* The uchar1 case.
+ * Issue one lanewise ld4.h to load four consecutive pixels from
+ * one pointer (one pixel) into four different registers; then load
+ * four consecutive s16 values from the next pointer (pixel) into
+ * the next lane of those four registers, etc., so that we finish
+ * with v12 - v15 representing the four taps, and each lane
+ * representing a separate pixel.
+ *
+ * The first ld4 uses a splat to avoid any false dependency on
+ * the previous state of the register.
+ */
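+ /* In effect a 4x8 transposing gather, roughly:
+ *
+ *     for (lane = 0; lane < 8; lane++)
+ *         for (tap = 0; tap < 4; tap++)
+ *             vtap[tap][lane] = ptr[lane][tap]; // s16 elements
+ *
+ * (loop form is illustrative; the ld4 lane loads do this directly, with the
+ * pointers for lanes 4-7 computed while the first four loads issue).
+ */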
+ ld4r {v12.8h,v13.8h,v14.8h,v15.8h}, [x14]
+ lsr x14, x2, #(63 - CHUNKSHIFT)
+ add x2, x2, x3
+ ld4 {v12.h,v13.h,v14.h,v15.h}[1], [x15]
+ add x14, sp, x14, LSL #(COMPONENT_SHIFT + 1)
+ lsr x15, x2, #(63 - CHUNKSHIFT)
+ add x2, x2, x3
+ ld4 {v12.h,v13.h,v14.h,v15.h}[2], [x16]
+ add x15, sp, x15, LSL #(COMPONENT_SHIFT + 1)
+ lsr x16, x2, #(63 - CHUNKSHIFT)
+ add x2, x2, x3
+ ld4 {v12.h,v13.h,v14.h,v15.h}[3], [x17]
+ add x16, sp, x16, LSL #(COMPONENT_SHIFT + 1)
+ lsr x17, x2, #(63 - CHUNKSHIFT)
+ add x2, x2, x3
+ ld4 {v12.h,v13.h,v14.h,v15.h}[4], [x14]
+ add x17, sp, x17, LSL #(COMPONENT_SHIFT + 1)
+ ld4 {v12.h,v13.h,v14.h,v15.h}[5], [x15]
+ ld4 {v12.h,v13.h,v14.h,v15.h}[6], [x16]
+ ld4 {v12.h,v13.h,v14.h,v15.h}[7], [x17]
+
+ smull v8.4s, v12.4h, v0.4h
+ smull2 v9.4s, v12.8h, v0.8h
+ smlsl v8.4s, v13.4h, v1.4h
+ smlsl2 v9.4s, v13.8h, v1.8h
+ smlsl v8.4s, v14.4h, v2.4h
+ smlsl2 v9.4s, v14.8h, v2.8h
+ smlal v8.4s, v15.4h, v3.4h
+ smlal2 v9.4s, v15.8h, v3.8h
+
+ subs x13, x13, #LOOP_OUTPUT_SIZE
+
+ sqrshrn v8.4h, v8.4s, #15
+ sqrshrn2 v8.8h, v9.4s, #15
+
+ sqrshrun v8.8b, v8.8h, #VERTBITS - 8
+.elseif \comp == 2
+ /* The uchar2 case:
+ * This time load pairs of values into adjacent lanes in v12 - v15
+ * by aliasing them as u32 data; leaving room for only four pixels,
+ * so the process has to be done twice. This also means that the
+ * coefficient registers fail to align with the coefficient data
+ * (eight separate pixels), so that has to be doubled-up to match.
+ */
+ ld4r {v12.4s,v13.4s,v14.4s,v15.4s}, [x14]
+ lsr x14, x2, #(63 - CHUNKSHIFT)
+ add x2, x2, x3
+ ld4 {v12.s,v13.s,v14.s,v15.s}[1], [x15]
+ add x14, sp, x14, LSL #(COMPONENT_SHIFT + 1)
+ lsr x15, x2, #(63 - CHUNKSHIFT)
+ add x2, x2, x3
+ ld4 {v12.s,v13.s,v14.s,v15.s}[2], [x16]
+ add x15, sp, x15, LSL #(COMPONENT_SHIFT + 1)
+ lsr x16, x2, #(63 - CHUNKSHIFT)
+ add x2, x2, x3
+ ld4 {v12.s,v13.s,v14.s,v15.s}[3], [x17]
+ add x16, sp, x16, LSL #(COMPONENT_SHIFT + 1)
+ lsr x17, x2, #(63 - CHUNKSHIFT)
+ add x2, x2, x3
+
+ /* double-up coefficients to align with component pairs */
+ zip1 v16.8h, v0.8h, v0.8h
+ add x17, sp, x17, LSL #(COMPONENT_SHIFT + 1)
+ zip1 v17.8h, v1.8h, v1.8h
+ zip1 v18.8h, v2.8h, v2.8h
+ zip1 v19.8h, v3.8h, v3.8h
+
+ smull v8.4s, v12.4h, v16.4h
+ smull2 v9.4s, v12.8h, v16.8h
+ smlsl v8.4s, v13.4h, v17.4h
+ smlsl2 v9.4s, v13.8h, v17.8h
+ smlsl v8.4s, v14.4h, v18.4h
+ smlsl2 v9.4s, v14.8h, v18.8h
+ smlal v8.4s, v15.4h, v19.4h
+ smlal2 v9.4s, v15.8h, v19.8h
+
+ sqrshrn v8.4h, v8.4s, #15
+ sqrshrn2 v8.8h, v9.4s, #15
+
+ ld4r {v12.4s,v13.4s,v14.4s,v15.4s}, [x14]
+ ld4 {v12.s,v13.s,v14.s,v15.s}[1], [x15]
+ ld4 {v12.s,v13.s,v14.s,v15.s}[2], [x16]
+ ld4 {v12.s,v13.s,v14.s,v15.s}[3], [x17]
+
+ /* double-up coefficients to align with component pairs */
+ zip2 v16.8h, v0.8h, v0.8h
+ zip2 v17.8h, v1.8h, v1.8h
+ zip2 v18.8h, v2.8h, v2.8h
+ zip2 v19.8h, v3.8h, v3.8h
+
+ smull v10.4s, v12.4h, v16.4h
+ smull2 v11.4s, v12.8h, v16.8h
+ smlsl v10.4s, v13.4h, v17.4h
+ smlsl2 v11.4s, v13.8h, v17.8h
+ smlsl v10.4s, v14.4h, v18.4h
+ smlsl2 v11.4s, v14.8h, v18.8h
+ smlal v10.4s, v15.4h, v19.4h
+ smlal2 v11.4s, v15.8h, v19.8h
+
+ subs x13, x13, #LOOP_OUTPUT_SIZE
+
+ sqrshrn v9.4h, v10.4s, #15
+ sqrshrn2 v9.8h, v11.4s, #15
+
+ sqrshrun v8.8b, v8.8h, #VERTBITS - 8
+ sqrshrun2 v8.16b, v9.8h, #VERTBITS - 8
+.elseif \comp == 4
+ /* The uchar4 case.
+ * This case is comparatively painless because four s16s are the
+ * smallest addressable unit for a vmul-by-scalar. Rather than
+ * permute the data, simply arrange the multiplies to suit the way
+ * the data comes in. That's a lot of data, though, so things
+ * progress in pairs of pixels at a time.
+ */
+ ld1 {v12.8h,v13.8h}, [x14]
+ lsr x14, x2, #(63 - CHUNKSHIFT)
+ add x2, x2, x3
+ ld1 {v14.8h,v15.8h}, [x15]
+ add x14, sp, x14, LSL #(COMPONENT_SHIFT + 1)
+ lsr x15, x2, #(63 - CHUNKSHIFT)
+ add x2, x2, x3
+
+ smull v8.4s, v12.4h, v0.h[0]
+ smull v9.4s, v14.4h, v0.h[1]
+ smlsl2 v8.4s, v12.8h, v1.h[0]
+ smlsl2 v9.4s, v14.8h, v1.h[1]
+ smlsl v8.4s, v13.4h, v2.h[0]
+ smlsl v9.4s, v15.4h, v2.h[1]
+ smlal2 v8.4s, v13.8h, v3.h[0]
+ smlal2 v9.4s, v15.8h, v3.h[1]
+
+ /* And two more... */
+ ld1 {v12.8h,v13.8h}, [x16]
+ add x15, sp, x15, LSL #(COMPONENT_SHIFT + 1)
+ lsr x16, x2, #(63 - CHUNKSHIFT)
+ add x2, x2, x3
+ ld1 {v14.8h,v15.8h}, [x17]
+ add x16, sp, x16, LSL #(COMPONENT_SHIFT + 1)
+ lsr x17, x2, #(63 - CHUNKSHIFT)
+ add x2, x2, x3
+
+ sqrshrn v8.4h, v8.4s, #15
+ add x17, sp, x17, LSL #(COMPONENT_SHIFT + 1)
+ sqrshrn2 v8.8h, v9.4s, #15
+
+ smull v10.4s, v12.4h, v0.h[2]
+ smull v11.4s, v14.4h, v0.h[3]
+ smlsl2 v10.4s, v12.8h, v1.h[2]
+ smlsl2 v11.4s, v14.8h, v1.h[3]
+ smlsl v10.4s, v13.4h, v2.h[2]
+ smlsl v11.4s, v15.4h, v2.h[3]
+ smlal2 v10.4s, v13.8h, v3.h[2]
+ smlal2 v11.4s, v15.8h, v3.h[3]
+
+ sqrshrn v9.4h, v10.4s, #15
+ sqrshrn2 v9.8h, v11.4s, #15
+
+ sqrshrun v8.8b, v8.8h, #VERTBITS - 8
+ sqrshrun2 v8.16b, v9.8h, #VERTBITS - 8
+
+ /* And two more... */
+ ld1 {v12.8h,v13.8h}, [x14]
+ ld1 {v14.8h,v15.8h}, [x15]
+
+ smull v10.4s, v12.4h, v0.h[4]
+ smull v11.4s, v14.4h, v0.h[5]
+ smlsl2 v10.4s, v12.8h, v1.h[4]
+ smlsl2 v11.4s, v14.8h, v1.h[5]
+ smlsl v10.4s, v13.4h, v2.h[4]
+ smlsl v11.4s, v15.4h, v2.h[5]
+ smlal2 v10.4s, v13.8h, v3.h[4]
+ smlal2 v11.4s, v15.8h, v3.h[5]
+
+ /* And two more... */
+ ld1 {v12.8h,v13.8h}, [x16]
+ ld1 {v14.8h,v15.8h}, [x17]
+
+ subs x13, x13, #LOOP_OUTPUT_SIZE
+
+ sqrshrn v9.4h, v10.4s, #15
+ sqrshrn2 v9.8h, v11.4s, #15
+
+ smull v10.4s, v12.4h, v0.h[6]
+ smull v11.4s, v14.4h, v0.h[7]
+ smlsl2 v10.4s, v12.8h, v1.h[6]
+ smlsl2 v11.4s, v14.8h, v1.h[7]
+ smlsl v10.4s, v13.4h, v2.h[6]
+ smlsl v11.4s, v15.4h, v2.h[7]
+ smlal2 v10.4s, v13.8h, v3.h[6]
+ smlal2 v11.4s, v15.8h, v3.h[7]
+
+ sqrshrn v10.4h, v10.4s, #15
+ sqrshrn2 v10.8h, v11.4s, #15
+
+ sqrshrun v9.8b, v9.8h, #VERTBITS - 8
+ sqrshrun2 v9.16b, v10.8h, #VERTBITS - 8
+.endif
+ bgt 2b /* continue inner loop */
+ /* The inner loop has already been limited to ensure that none of
+ * the earlier iterations could overfill the output, so the store
+ * appears within the loop but after the conditional branch (at the
+ * top). At the end, provided it won't overfill, perform the final
+ * store here. If it would, then break out to the tricky tail case
+ * instead.
+ */
+ blt 1f
+ /* Store the amount of data appropriate to the configuration of the
+ * instance being assembled.
+ */
+.if LOOP_OUTPUT_SIZE == 4
+ st1 {v8.s}[0], [x0], #4
+.elseif LOOP_OUTPUT_SIZE == 8
+ st1 {v8.8b}, [x0], #8
+.elseif LOOP_OUTPUT_SIZE == 16
+ st1 {v8.16b}, [x0], #16
+.elseif LOOP_OUTPUT_SIZE == 32
+ st1 {v8.16b,v9.16b}, [x0], #32
+.endif
+ b 1b /* resume outer loop */
+ /* Partial tail store case:
+ * Different versions of the code need different subsets of the
+ * following partial stores. Here the number of components and the
+ * size of the chunk of data produced by each inner loop iteration
+ * is tested to figure out whether or not each phrase is relevant.
+ */
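+ /* A scalar sketch of the cascade (tail = bytes still to be stored):
+ *
+ *     for (b = 16; b >= 1; b >>= 1)
+ *         if (tail & b) { store the low b bytes of v8; shift v8 down by b; }
+ *
+ * with the shift done by ext/mov and only the sizes relevant to this variant
+ * assembled, per the .if conditions below.
+ */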
+.if 16 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 16
+1: tst x13, #16
+ beq 1f
+ st1 {v8.16b}, [x0], #16
+ mov v8.16b, v9.16b
+.endif
+.if 8 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 8
+1: tst x13, #8
+ beq 1f
+ st1 {v8.8b}, [x0], #8
+ ext v8.16b, v8.16b, v8.16b, #8
+.endif
+.if 4 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 4
+1: tst x13, #4
+ beq 1f
+ st1 {v8.s}[0], [x0], #4
+ ext v8.8b, v8.8b, v8.8b, #4
+.endif
+.if 2 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 2
+1: tst x13, #2
+ beq 1f
+ st1 {v8.h}[0], [x0], #2
+ ext v8.8b, v8.8b, v8.8b, #2
+.endif
+.if 1 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 1
+1: tst x13, #1
+ beq 1f
+ st1 {v8.b}[0], [x0], #1
+.endif
+1:
+9: mov sp, x19
+ ld1 {v8.1d - v11.1d}, [sp], #32
+ ld1 {v12.1d - v15.1d}, [sp], #32
+ ldr x19, [sp], #16
+ ret
+END(rsdIntrinsicResizeB\comp\()_K)
+.endr
+
+.rodata
+intrinsic_resize_consts: .hword 0, 1, 2, 3, 4, 5, 6, 7