path: root/renderscript-toolkit/src/main/cpp/Resize_neon.S
/*
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f: .fnstart
#define END(f) .fnend; .size f, .-f;

.eabi_attribute 25,1 @Tag_ABI_align8_preserved
.arm

/* Fixed-point precision after the vertical pass -- 16-bit data minus 1 sign
 * bit and 1 integer bit (bicubic has a little overshoot).  It would also be
 * possible to add a temporary DC bias to eliminate the sign bit for more
 * precision, but that's extra arithmetic.
 */
.set VERTBITS, 14
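/* (For reference: with VERTBITS = 14 a 16-bit intermediate splits as
 * 1 sign bit + 1 integer bit + 14 fraction bits, i.e. a representable range
 * of roughly (-2.0, +2.0), which leaves room for bicubic overshoot of data
 * nominally in [0, 1.0).)
 */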

/* The size of the scratch buffer in which we store our vertically convolved
 * intermediates.
 */
.set CHUNKSHIFT, 7
.set CHUNKSIZE, (1 << CHUNKSHIFT)

/* The number of components processed in a single iteration of the innermost
 * loop.
 */
.set VECSHIFT, 3
.set VECSIZE, (1<<VECSHIFT)
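/* (With the settings above, CHUNKSIZE = 128 pixels per scratch-buffer refill
 * and VECSIZE = 8 components per inner-loop iteration.)
 */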

/* Read four different lines (except at edges where addresses may be clamped,
 * which is why we don't simply take base and stride registers), and multiply
 * and accumulate them by the coefficients in d6[0..3], leaving the results in
 * q12.  This gives eight 16-bit results representing a horizontal line of 2-8
 * input pixels (depending on number of components per pixel) to be fed into
 * the horizontal scaling pass.
 *
 * Input coefficients are 16-bit unsigned fixed-point (although [0] and [3] are
 * known to represent negative values and VMLS is used to implement this).
 * Output is VERTBITS signed fixed-point, which must leave room for a little
 * bit of overshoot beyond [0,1.0).
 */
.macro vert8, dstlo=d24, dsthi=d25
        vld1.u8     d16, [r4]!
        vld1.u8     d18, [r5]!
        vld1.u8     d20, [r6]!
        vld1.u8     d22, [r7]!
        vmovl.u8    q8, d16
        vmovl.u8    q9, d18
        vmovl.u8    q10, d20
        vmovl.u8    q11, d22
        vmull.u16   q12, d18, d6[1]
        vmull.u16   q13, d19, d6[1]
        vmlsl.u16   q12, d16, d6[0]
        vmlsl.u16   q13, d17, d6[0]
        vmlal.u16   q12, d20, d6[2]
        vmlal.u16   q13, d21, d6[2]
        vmlsl.u16   q12, d22, d6[3]
        vmlsl.u16   q13, d23, d6[3]

        /* Shift by 8 (bits per pixel), plus 16 (the fixed-point multiplies),
         * minus VERTBITS (the number of fraction bits we want to keep from
         * here on).
         */
        vqshrn.s32  \dstlo, q12, #8 + 16 - VERTBITS
        vqshrn.s32  \dsthi, q13, #8 + 16 - VERTBITS
.endm
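
/* Illustrative C sketch of what vert8 computes (documentation only, not
 * assembled; yr0..yr3 stand for the coefficients in d6[0..3], srcn/src0/
 * src1/src2 for the four row pointers in r4-r7, and saturate_s16 for the
 * saturation performed by vqshrn):
 *
 *      for (int i = 0; i < 8; i++) {
 *          int32_t acc = (int32_t)yr1 * src0[i] - (int32_t)yr0 * srcn[i]
 *                      + (int32_t)yr2 * src1[i] - (int32_t)yr3 * src2[i];
 *          dst[i] = saturate_s16(acc >> (8 + 16 - VERTBITS));
 *      }
 */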

/* As above, but only four 16-bit results into d25.
 */
.macro vert4
        vld1.u32    d16[0], [r4]!
        vld1.u32    d18[0], [r5]!
        vld1.u32    d20[0], [r6]!
        vld1.u32    d22[0], [r7]!
        vmovl.u8    q8, d16
        vmovl.u8    q9, d18
        vmovl.u8    q10, d20
        vmovl.u8    q11, d22
        vmull.u16   q12, d18, d6[1]
        vmlsl.u16   q12, d16, d6[0]
        vmlal.u16   q12, d20, d6[2]
        vmlsl.u16   q12, d22, d6[3]
        vqshrn.s32  d25, q12, #8 + 16 - VERTBITS
.endm


/* During horizontal resize having CHUNKSIZE input available means being able
 * to produce a varying amount of output, depending on the phase of the data.
 * This function calculates the minimum number of VECSIZE chunks extracted from
 * a CHUNKSIZE window (r1), and the threshold value for when the count will be
 * one higher than that (r0).
 * These work out, conveniently, to be the quotient and remainder from:
 *      (CHUNKSIZE + xinc * VECSIZE - 1) / (xinc * VECSIZE)
 *
 * The two values can be packed together in a uint64_t for convenience; and
 * they are, in fact, used this way as an arithmetic short-cut later on.
 */

/* uint64_t rsdIntrinsicResize_oscctl_K(uint32_t xinc); */
ENTRY(rsdIntrinsicResize_oscctl_K)
        lsl         r2, r0, #VECSHIFT
        movw        r0, #:lower16:(CHUNKSIZE << 16) - 1
        movt        r0, #:upper16:(CHUNKSIZE << 16) - 1
        add         r0, r0, r2
#if defined(ARCH_ARM_USE_UDIV)
        udiv        r1, r0, r2
        mls         r0, r1, r2, r0
#else
        clz         r3, r2
        clz         r1, r0
        subs        r3, r3, r1
        movlt       r3, #0
        mov         r1, #1
        lsl         r2, r2, r3
        lsl         r3, r1, r3
        mov         r1, #0
1:      cmp         r2, r0
        addls       r1, r3
        subls       r0, r2
        lsrs        r3, r3, #1
        lsr         r2, r2, #1
        bne         1b
#endif
        bx          lr
END(rsdIntrinsicResize_oscctl_K)
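
/* Illustrative C equivalent of the routine above (documentation only, not
 * compiled; xinc is 16.16 fixed point, so CHUNKSIZE is scaled by 1 << 16 to
 * match, and under the AAPCS the low word of the uint64_t is returned in r0
 * and the high word in r1):
 *
 *      uint64_t rsdIntrinsicResize_oscctl_K(uint32_t xinc) {
 *          uint32_t step = xinc << VECSHIFT;           // xinc * VECSIZE
 *          uint32_t num  = ((CHUNKSIZE << 16) - 1) + step;
 *          uint32_t quot = num / step;                 // iteration count (r1)
 *          uint32_t rem  = num - quot * step;          // threshold (r0)
 *          return ((uint64_t)quot << 32) | rem;
 *      }
 */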

/* Iterate to generate the uchar1, uchar2, and uchar4 versions of the code.
 * For the most part the vertical pass (the outer loop) is the same for all
 * versions.  Exceptions are handled in-line with conditional assembly.
 */
.irp comp, 1, 2, 4
.if \comp == 1
.set COMPONENT_SHIFT, 0
.elseif \comp == 2
.set COMPONENT_SHIFT, 1
.elseif \comp == 4
.set COMPONENT_SHIFT, 2
.else
.error "Unknown component count"
.endif
.set COMPONENT_COUNT, (1 << COMPONENT_SHIFT)
.set LOOP_OUTPUT_SIZE, (VECSIZE * COMPONENT_COUNT)

.set BUFFER_SIZE, (CHUNKSIZE * 2 + 4) * COMPONENT_COUNT * 2
.set OSC_STORE, (BUFFER_SIZE + 0)
.set OSCSTEP_STORE, (BUFFER_SIZE + 4)
.set OSCCTL_STORE, (BUFFER_SIZE + 8)
.set AVAIL_STORE, (BUFFER_SIZE + 16)
.set SP_STORE, (BUFFER_SIZE + 24)   /* should be +20, but rounded up to make a legal constant somewhere */

/* void rsdIntrinsicResizeB\comp\()_K(
 *             uint8_t * restrict dst,          // r0
 *             size_t count,                    // r1
 *             uint32_t xf,                     // r2
 *             uint32_t xinc,                   // r3
 *             uint8_t const * restrict srcn,   // [sp]     -> [sp,#104] -> r4
 *             uint8_t const * restrict src0,   // [sp,#4]  -> [sp,#108] -> r5
 *             uint8_t const * restrict src1,   // [sp,#8]  -> [sp,#112] -> r6
 *             uint8_t const * restrict src2,   // [sp,#12] -> [sp,#116] -> r7
 *             size_t xclip,                    // [sp,#16] -> [sp,#120]
 *             size_t avail,                    // [sp,#20] -> [sp,#124] -> lr
 *             uint64_t osc_ctl,                // [sp,#24] -> [sp,#128]
 *             int32_t const *yr);              // [sp,#32] -> [sp,#136] -> d8 (copied to d6 for scalar access)
 */
ENTRY(rsdIntrinsicResizeB\comp\()_K)
            push        {r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
            vpush       {d8-d15}

            /* align the working buffer on the stack to make it easy to use bit
             * twiddling for address calculations and bounds tests.
             */
            sub         r12, sp, #BUFFER_SIZE + 32
            mov         lr, sp
            bfc         r12, #0, #CHUNKSHIFT + 1 + COMPONENT_SHIFT + 1
            mov         sp, r12
            str         lr, [sp,#SP_STORE]
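            /* (bfc clears the low CHUNKSHIFT + COMPONENT_SHIFT + 2 bits, so
             * the buffer is aligned to its own power-of-two size of
             * CHUNKSIZE * 2 * COMPONENT_COUNT * 2 bytes; the tst/eor-based
             * wraparound address tests later on depend on this.)
             */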

            ldr         r8, [lr,#136]           // yr
            adr         r9, 8f
            vld1.s32    {q4}, [r8]
            vld1.s16    {q5}, [r9]
            vqmovun.s32 d8, q4                  // yr
            vdup.s16    q6, r2
            vdup.s16    q7, r3
            vmla.s16    q6, q5, q7              // vxf
            vshl.s16    q7, q7, #VECSHIFT       // vxinc

            ldrd        r4,r5, [lr,#104]        // srcn, src0
            ldrd        r6,r7, [lr,#112]        // src1, src2

            /* Compute starting condition for oscillator used to compute ahead
             * of time how many iterations are possible before needing to
             * refill the working buffer.  This is based on the fixed-point
             * index of the last element in the vector of pixels processed in
             * each iteration, counting up until it would overflow.
             */
            sub         r8, r2, r3
            mov         r9, r3, LSL #VECSHIFT
            add         r8, r8, r9
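            /* i.e. r8 = xf + xinc * (VECSIZE - 1), the 16.16 x coordinate of
             * the last pixel in the first vector, and r9 = xinc * VECSIZE,
             * the amount by which it advances each iteration.
             */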

            ldrd        r10,r11, [lr,#128]      // osc_ctl

            str         r8, [sp,#OSC_STORE]
            str         r9, [sp,#OSCSTEP_STORE]
            str         r10, [sp,#OSCCTL_STORE]
            str         r11, [sp,#OSCCTL_STORE+4]
            ldrd        r10,r11, [lr,#120]      // xclip,avail


            /* r4-r7 contain pointers to the four lines of input to be
             * convolved.  These pointers have been clamped vertically and
             * horizontally (which is why it's not a simple row/stride pair),
             * and the xclip argument (now in r10) indicates by how many
             * pixels the x position of the pointer is offset from its true
             * value.  This value should be 0, 1, or 2 only.
             *
             * Start by placing four pixels worth of input at the far end of
             * the buffer.  As many as two of these may be clipped, so four
             * pixels are fetched, and then the first pixel is duplicated and
             * the data shifted according to xclip.  The source pointers are
             * then also adjusted according to xclip so that subsequent fetches
             * match.
             */
            vmov        d6, d8  /* make y coeffs available for vert4 and vert8 macros */

            sub         r8, r12, r10, LSL #COMPONENT_SHIFT + 1
            add         r9, r12, #(2 * CHUNKSIZE - 4) * COMPONENT_COUNT * 2
            add         r8, r8, #4 * COMPONENT_COUNT * 2
.if \comp == 1
            vert4
            vdup.s16    d24, d25[0]
            vst1.s16    {q12}, [r12]
            vld1.s16    {d24}, [r8]
            vst1.s16    {d24}, [r9]
.elseif \comp == 2
            vert8
            vdup.u32    q11, d24[0]
            vst1.s16    {q11,q12}, [r12]
            vld1.s16    {q12}, [r8]
            vst1.s16    {q12}, [r9]
.elseif \comp == 4
            vert8       d28, d29
            vert8       d30, d31
            vmov.u64    d24, d28
            vmov.u64    d25, d28
            vmov.u64    d26, d28
            vmov.u64    d27, d28
            vst1.s16    {q12,q13}, [r12]!
            vst1.s16    {q14,q15}, [r12]
            sub         r12, r12, #32
            vld1.s16    {q11,q12}, [r8]
            vst1.s16    {q11,q12}, [r9]
.endif
            /* Count off four pixels into the working buffer, and move count to
             * its new home.
             */
            sub         lr, r11, #4
            /* Incoming pointers were to the first _legal_ pixel.  Four pixels
             * were read unconditionally, but some may have been discarded by
             * xclip, so we rewind the pointers to compensate.
             */
            sub         r4, r4, r10, LSL #COMPONENT_SHIFT
            sub         r5, r5, r10, LSL #COMPONENT_SHIFT
            sub         r6, r6, r10, LSL #COMPONENT_SHIFT
            sub         r7, r7, r10, LSL #COMPONENT_SHIFT

            /* First tap starts where we just pre-filled, at the end of the
             * buffer.
             */
            add         r2, r2, #(CHUNKSIZE * 2 - 4) << 16

            /* Use overflowing arithmetic to implement wraparound array
             * indexing.
             */
            mov         r2, r2, LSL #(15 - CHUNKSHIFT)
            mov         r3, r3, LSL #(15 - CHUNKSHIFT)
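            /* After this shift the integer part of the x coordinate occupies
             * the top CHUNKSHIFT + 1 bits of r2, so LSR #(31 - CHUNKSHIFT)
             * recovers the buffer index modulo 2 * CHUNKSIZE, and additions
             * to r2 wrap around the buffer automatically as they overflow.
             */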

            str         lr, [sp,#AVAIL_STORE]

            /* Start of outermost loop.
             * Fetch CHUNKSIZE pixels into scratch buffer, then calculate the
             * number of iterations of the inner loop that can be performed and
             * get into that.
             *
             * The fill is complicated by the possibility of running out of
             * input before the scratch buffer is filled.  If this isn't a risk
             * then it's handled by the simple loop at 2:, otherwise the
             * horrible loop at 3:.
             */
1:          ldr         lr, [sp,#AVAIL_STORE]   /* get number of pixels available */
            vmov        d6, d8              /* put y scaling coefficients somewhere handy */
            subs        lr, #CHUNKSIZE
            bge         2f                  /* if at least CHUNKSIZE are available... */
            add         lr, #CHUNKSIZE      /* if they're not... */
            b           4f
            /* ..just sneaking a literal in here after this unconditional branch.. */
8:          .hword      0, 1, 2, 3, 4, 5, 6, 7
            /* basic fill loop, processing 8 bytes at a time until there are
             * fewer than eight bytes available.
             */
3:          vert8
            sub         lr, lr, #8 / COMPONENT_COUNT
            vst1.s16    {q12}, [r12]!
4:          cmp         lr, #8 / COMPONENT_COUNT - 1
            bgt         3b
.if \comp == 4
            blt         3f
            /* The last pixel (four bytes) if necessary */
            vert4
.else
            cmp         lr, #1
            blt         3f
            /* The last pixels if necessary */
            sub         r4, r4, #8
            sub         r5, r5, #8
            sub         r6, r6, #8
            sub         r7, r7, #8
            add         r4, r4, lr, LSL #COMPONENT_SHIFT
            add         r5, r5, lr, LSL #COMPONENT_SHIFT
            add         r6, r6, lr, LSL #COMPONENT_SHIFT
            add         r7, r7, lr, LSL #COMPONENT_SHIFT
            vert8
            sub         lr, sp, lr, LSL #COMPONENT_SHIFT + 1
            sub         sp, sp, #32
            sub         lr, lr, #16
.if \comp == 1
            vdup.s16    q13, d25[3]
.elseif \comp == 2
            vdup.u32    q13, d25[1]
.endif
            vst1.s16    {q12,q13}, [sp]
            vld1.s16    {q12}, [lr]
            add         sp, sp, #32
            b           4f
.endif
            /* Keep filling until we get to the end of this chunk of the buffer */
3:
.if \comp == 1
            vdup.s16    q12, d25[3]
.elseif \comp == 2
            vdup.u32    q12, d25[1]
.elseif \comp == 4
            vmov.u64    d24, d25
.endif
4:          vst1.s16    {q12}, [r12]!
            tst         r12, #(CHUNKSIZE - 1) * COMPONENT_COUNT * 2
            bne         3b
            b           4f

.align 4
2:          /* Quickly pull a chunk of data into the working buffer.
             */
            vert8
            vst1.s16    {q12}, [r12]!
            vert8
            vst1.s16    {q12}, [r12]!
            tst         r12, #(CHUNKSIZE - 1) * COMPONENT_COUNT * 2
            bne         2b
            cmp         lr, #0
            bne         3f
4:          /* if we end with 0 pixels left we'll have nothing handy to spread
             * across to the right, so we rewind a bit.
             */
            mov         lr, #1
            sub         r4, r4, #COMPONENT_COUNT
            sub         r5, r5, #COMPONENT_COUNT
            sub         r6, r6, #COMPONENT_COUNT
            sub         r7, r7, #COMPONENT_COUNT
3:          str         lr, [sp,#AVAIL_STORE]       /* done with available pixel count */
            add         lr, sp, #OSC_STORE
            ldrd        r8,r9, [lr,#0]              /* need osc, osc_step soon */
            ldrd        r10,r11, [lr,#OSCCTL_STORE-OSC_STORE] /* need osc_ctl too */

            /* copy four taps (width of cubic window) to far end for overflow
             * address handling
             */
            sub         lr, r12, #CHUNKSIZE * COMPONENT_COUNT * 2
            eor         r12, lr, #CHUNKSIZE * COMPONENT_COUNT * 2
.if \comp == 1
            vld1.s16    {d28}, [lr]
.elseif \comp == 2
            vld1.s16    {q14}, [lr]
.elseif \comp == 4
            vld1.s16    {q14,q15}, [lr]
.endif
            add         lr, r12, #CHUNKSIZE * COMPONENT_COUNT * 2
.if \comp == 1
            vst1.s16    {d28}, [lr]
.elseif \comp == 2
            vst1.s16    {q14}, [lr]
.elseif \comp == 4
            vst1.s16    {q14,q15}, [lr]
.endif
            /* r11 contains the maximum possible iteration count, but if r8 is
             * greater than r10 then this indicates that the count must be
             * reduced by one for this iteration to avoid reading past the end
             * of the available data.
             */
            cmp             r10, r8
            sbc         lr, r11, #0
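            /* (the carry from the compare is folded into the subtract:
             * lr = r11 - 1 if r8 > r10, otherwise lr = r11)
             */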

            mla         r8, lr, r9, r8
            sub         r8, r8, #(CHUNKSIZE << 16)

            str         r8, [sp,#OSC_STORE]         /* done with osc */

            /* prefer to count pixels, rather than vectors, to clarify the tail
             * store case on exit.
             */
            mov         lr, lr, LSL #VECSHIFT
            cmp         lr, r1
            movgt       lr, r1

            sub         r1, r1, lr

            mov         lr, lr, LSL #COMPONENT_SHIFT

            vmov.i16    d10, #3
            vmov.i16    d11, #0x8000

            cmp         lr, #0
            bgt         3f
            cmp         r1, #0
            bgt         1b     /* an extreme case where we shouldn't use code in this structure */
            b           9f

            .align 4
2:          /* Inner loop continues here, but starts at 3:, see end of loop
             * below for explanation. */
.if LOOP_OUTPUT_SIZE == 4
            vst1.u32    {d16[0]}, [r0]!
.elseif LOOP_OUTPUT_SIZE == 8
            vst1.u8     {d16}, [r0]!
.elseif LOOP_OUTPUT_SIZE == 16
            vst1.u8     {q8}, [r0]!
.elseif LOOP_OUTPUT_SIZE == 32
            vst1.u8     {q8,q9}, [r0]!
.endif
            /* Inner loop:  here the four x coefficients for each tap are
             * calculated in vector code, and the addresses are calculated in
             * scalar code, and these calculations are interleaved.
             */
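            /* For reference, assuming the usual Catmull-Rom (a = -1/2)
             * bicubic kernel of the scalar implementation, the four tap
             * weights for fraction t are:
             *
             *      w0 = (-t^3 + 2*t^2 - t) / 2
             *      w1 = ( 3*t^3 - 5*t^2 + 2) / 2
             *      w2 = (-3*t^3 + 4*t^2 + t) / 2
             *      w3 = (   t^3 -   t^2   ) / 2
             *
             * The vector code below evaluates equivalent Q15 polynomials,
             * with the signs of the negative taps folded into the later
             * multiply-and-subtract accumulation (see the comment further
             * down).
             */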
3:          vshr.u16    q8, q6, #1
            mov         r8, r2, LSR #(31 - CHUNKSHIFT)
            vqrdmulh.s16 q9, q8, q8
            add         r2, r2, r3
            vqrdmulh.s16 q10, q9, q8
            mov         r9, r2, LSR #(31 - CHUNKSHIFT)
            vshll.s16   q11, d18, #2
            vshll.s16   q12, d19, #2
            add         r2, r2, r3
            vmlsl.s16   q11, d20, d10
            vmlsl.s16   q12, d21, d10
            mov         r10, r2, LSR #(31 - CHUNKSHIFT)

            vhadd.s16   q0, q10, q8
            add         r2, r2, r3
            vsub.s16    q0, q9, q0
            mov         r11, r2, LSR #(31 - CHUNKSHIFT)

            vaddw.s16   q1, q11, d18
            vaddw.s16   q13, q12, d19
            add         r2, r2, r3
            vshrn.s32   d2, q1, #1
            vshrn.s32   d3, q13, #1
            add         r8, sp, r8, LSL #(COMPONENT_SHIFT + 1)
            vsub.s16    d2, d2, d11
            vsub.s16    d3, d3, d11 // TODO: find a wider d11 and use q-reg operation
            add         r9, sp, r9, LSL #(COMPONENT_SHIFT + 1)

            vaddw.s16   q2, q11, d16
            vaddw.s16   q13, q12, d17
            add         r10, sp, r10, LSL #(COMPONENT_SHIFT + 1)
            vshrn.s32   d4, q2, #1
            vshrn.s32   d5, q13, #1
            add         r11, sp, r11, LSL #(COMPONENT_SHIFT + 1)
            vneg.s16    q2, q2

            vhsub.s16   q3, q10, q9

            /* increment the x fractional parts (overflow is ignored, as the
             * scalar arithmetic shadows this addition with full precision).
             */
            vadd.s16    q6, q6, q7

            /* At this point we have four pointers in r8-r11, pointing to the
             * four taps in the scratch buffer that must be convolved together
             * to produce an output pixel (one output pixel per pointer).
             * These pointers usually overlap, but their spacing is irregular
             * so resolving the redundancy through L1 is a pragmatic solution.
             *
             * The scratch buffer is made of signed 16-bit data, holding over
             * some extra precision, and overshoot, from the vertical pass.
             *
             * We also have the 16-bit unsigned fixed-point weights for each
             * of the four taps in q0 - q3.  That's eight pixels worth of
             * coefficients when we have only four pointers, so calculations
             * for four more pixels are interleaved with the fetch and permute
             * code for each variant in the following code.
             *
             * The data arrangement is less than ideal for any pixel format,
             * but permuting loads help to mitigate most of the problems.
             *
             * Note also that the two outside taps of a bicubic are negative,
             * but these coefficients are unsigned.  The sign is hard-coded by
             * use of multiply-and-subtract operations.
             */
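            /* An illustrative per-output-pixel C sketch of the accumulation
             * and normalization below (documentation only; tap[] are the
             * VERTBITS-precision s16 values fetched from the scratch buffer,
             * w[] the Q15 weights with signs applied as described above, and
             * saturate_s16/saturate_u8 the saturations performed by
             * vqrshrn/vqrshrun):
             *
             *      int32_t acc = 0;
             *      for (int k = 0; k < 4; k++)
             *          acc += (int32_t)w[k] * tap[k];
             *      int16_t v   = saturate_s16((acc + (1 << 14)) >> 15);
             *      uint8_t out = saturate_u8((v + (1 << (VERTBITS - 9)))
             *                                       >> (VERTBITS - 8));
             */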
.if \comp == 1
            /* The uchar 1 case.
             * Issue one lanewise vld4.s16 to load four consecutive pixels from
             * one pointer (one pixel) into four different registers; then load
             * four consecutive s16 values from the next pointer (pixel) into
             * the next lane of those four registers, etc., so that we finish
             * with q12 - q15 representing the four taps, and each lane
             * representing a separate pixel.
             *
             * The first vld4 uses a splat to avoid any false dependency on
             * the previous state of the register.
             */
            vld4.s16    {d24[],d26[],d28[],d30[]}, [r8]
            mov         r8, r2, LSR #(31 - CHUNKSHIFT)
            add         r2, r2, r3
            vld4.s16    {d24[1],d26[1],d28[1],d30[1]}, [r9]
            add         r8, sp, r8, LSL #(COMPONENT_SHIFT + 1)
            mov         r9, r2, LSR #(31 - CHUNKSHIFT)
            add         r2, r2, r3
            vld4.s16    {d24[2],d26[2],d28[2],d30[2]}, [r10]
            add         r9, sp, r9, LSL #(COMPONENT_SHIFT + 1)
            mov         r10, r2, LSR #(31 - CHUNKSHIFT)
            add         r2, r2, r3
            vld4.s16    {d24[3],d26[3],d28[3],d30[3]}, [r11]
            add         r10, sp, r10, LSL #(COMPONENT_SHIFT + 1)
            mov         r11, r2, LSR #(31 - CHUNKSHIFT)
            add         r2, r2, r3
            vld4.s16    {d25[],d27[],d29[],d31[]}, [r8]
            add         r11, sp, r11, LSL #(COMPONENT_SHIFT + 1)
            vld4.s16    {d25[1],d27[1],d29[1],d31[1]}, [r9]
            vld4.s16    {d25[2],d27[2],d29[2],d31[2]}, [r10]
            vld4.s16    {d25[3],d27[3],d29[3],d31[3]}, [r11]

            vmull.s16   q8, d24, d0
            vmull.s16   q9, d25, d1
            vmlsl.s16   q8, d26, d2
            vmlsl.s16   q9, d27, d3
            vmlsl.s16   q8, d28, d4
            vmlsl.s16   q9, d29, d5
            vmlal.s16   q8, d30, d6
            vmlal.s16   q9, d31, d7

            subs        lr, lr, #LOOP_OUTPUT_SIZE

            vqrshrn.s32 d16, q8, #15
            vqrshrn.s32 d17, q9, #15

            vqrshrun.s16 d16, q8, #VERTBITS - 8
.elseif \comp == 2
            /* The uchar2 case:
             * This time load pairs of values into adjacent lanes in q12 - q15
             * by aliasing them as u32 data; leaving room for only four pixels,
             * so the process has to be done twice.  This also means that the
             * coefficient registers fail to align with the coefficient data
             * (eight separate pixels), so that has to be doubled-up to match.
             */
            vld4.u32    {d24[],d26[],d28[],d30[]}, [r8]
            mov         r8, r2, LSR #(31 - CHUNKSHIFT)
            add         r2, r2, r3
            vld4.u32    {d24[1],d26[1],d28[1],d30[1]}, [r9]
            add         r8, sp, r8, LSL #(COMPONENT_SHIFT + 1)
            mov         r9, r2, LSR #(31 - CHUNKSHIFT)
            add         r2, r2, r3
            vld4.u32    {d25[],d27[],d29[],d31[]}, [r10]
            add         r9, sp, r9, LSL #(COMPONENT_SHIFT + 1)
            mov         r10, r2, LSR #(31 - CHUNKSHIFT)
            add         r2, r2, r3
            vld4.u32    {d25[1],d27[1],d29[1],d31[1]}, [r11]
            add         r10, sp, r10, LSL #(COMPONENT_SHIFT + 1)
            mov         r11, r2, LSR #(31 - CHUNKSHIFT)
            add         r2, r2, r3

            /* double-up coefficients to align with component pairs */
            vmov        d20, d0
            add         r11, sp, r11, LSL #(COMPONENT_SHIFT + 1)
            vmov        d21, d2
            vmov        d22, d4
            vmov        d23, d6
            vzip.s16    d0, d20
            vzip.s16    d2, d21
            vzip.s16    d4, d22
            vzip.s16    d6, d23

            vmull.s16   q8, d24, d0
            vmull.s16   q9, d25, d20
            vmlsl.s16   q8, d26, d2
            vmlsl.s16   q9, d27, d21
            vmlsl.s16   q8, d28, d4
            vmlsl.s16   q9, d29, d22
            vmlal.s16   q8, d30, d6
            vmlal.s16   q9, d31, d23

            vqrshrn.s32 d16, q8, #15
            vqrshrn.s32 d17, q9, #15

            vld4.u32    {d24[],d26[],d28[],d30[]}, [r8]
            vld4.u32    {d24[1],d26[1],d28[1],d30[1]}, [r9]
            vld4.u32    {d25[],d27[],d29[],d31[]}, [r10]
            vld4.u32    {d25[1],d27[1],d29[1],d31[1]}, [r11]

            /* double-up coefficients to align with component pairs */
            vmov        d0, d1
            vmov        d2, d3
            vmov        d4, d5
            vmov        d6, d7
            vzip.s16    d0, d1
            vzip.s16    d2, d3
            vzip.s16    d4, d5
            vzip.s16    d6, d7

            vmull.s16   q10, d24, d0
            vmull.s16   q11, d25, d1
            vmlsl.s16   q10, d26, d2
            vmlsl.s16   q11, d27, d3
            vmlsl.s16   q10, d28, d4
            vmlsl.s16   q11, d29, d5
            vmlal.s16   q10, d30, d6
            vmlal.s16   q11, d31, d7

            subs        lr, lr, #LOOP_OUTPUT_SIZE

            vqrshrn.s32 d18, q10, #15
            vqrshrn.s32 d19, q11, #15

            vqrshrun.s16 d16, q8, #VERTBITS - 8
            vqrshrun.s16 d17, q9, #VERTBITS - 8
.elseif \comp == 4
            /* The uchar4 case.
             * This case is comparatively painless because four s16s are the
             * smallest addressable unit for a vmul-by-scalar.  Rather than
             * permute the data, simply arrange the multiplies to suit the way
             * the data comes in.  That's a lot of data, though, so things
             * progress in pairs of pixels at a time.
             */
            vld1.s16    {q12,q13}, [r8]
            mov         r8, r2, LSR #(31 - CHUNKSHIFT)
            add         r2, r2, r3
            vld1.s16    {q14,q15}, [r9]
            add         r8, sp, r8, LSL #(COMPONENT_SHIFT + 1)
            mov         r9, r2, LSR #(31 - CHUNKSHIFT)
            add         r2, r2, r3

            vmull.s16   q8, d24, d0[0]
            vmull.s16   q9, d28, d0[1]
            vmlsl.s16   q8, d25, d2[0]
            vmlsl.s16   q9, d29, d2[1]
            vmlsl.s16   q8, d26, d4[0]
            vmlsl.s16   q9, d30, d4[1]
            vmlal.s16   q8, d27, d6[0]
            vmlal.s16   q9, d31, d6[1]

            /* And two more...  */
            vld1.s16    {q12,q13}, [r10]
            add         r9, sp, r9, LSL #(COMPONENT_SHIFT + 1)
            mov         r10, r2, LSR #(31 - CHUNKSHIFT)
            add         r2, r2, r3
            vld1.s16    {q14,q15}, [r11]
            add         r10, sp, r10, LSL #(COMPONENT_SHIFT + 1)
            mov         r11, r2, LSR #(31 - CHUNKSHIFT)
            add         r2, r2, r3

            vqrshrn.s32 d16, q8, #15
            add         r11, sp, r11, LSL #(COMPONENT_SHIFT + 1)
            vqrshrn.s32 d17, q9, #15

            vmull.s16   q10, d24, d0[2]
            vmull.s16   q11, d28, d0[3]
            vmlsl.s16   q10, d25, d2[2]
            vmlsl.s16   q11, d29, d2[3]
            vmlsl.s16   q10, d26, d4[2]
            vmlsl.s16   q11, d30, d4[3]
            vmlal.s16   q10, d27, d6[2]
            vmlal.s16   q11, d31, d6[3]

            vqrshrn.s32 d18, q10, #15
            vqrshrn.s32 d19, q11, #15

            vqrshrun.s16 d16, q8, #VERTBITS - 8
            vqrshrun.s16 d17, q9, #VERTBITS - 8

            /* And two more...  */
            vld1.s16    {q12,q13}, [r8]
            vld1.s16    {q14,q15}, [r9]

            vmull.s16   q10, d24, d1[0]
            vmull.s16   q11, d28, d1[1]
            vmlsl.s16   q10, d25, d3[0]
            vmlsl.s16   q11, d29, d3[1]
            vmlsl.s16   q10, d26, d5[0]
            vmlsl.s16   q11, d30, d5[1]
            vmlal.s16   q10, d27, d7[0]
            vmlal.s16   q11, d31, d7[1]

            /* And two more...  */
            vld1.s16    {q12,q13}, [r10]
            vld1.s16    {q14,q15}, [r11]

            subs        lr, lr, #LOOP_OUTPUT_SIZE

            vqrshrn.s32 d18, q10, #15
            vqrshrn.s32 d19, q11, #15

            vmull.s16   q10, d24, d1[2]
            vmull.s16   q11, d28, d1[3]
            vmlsl.s16   q10, d25, d3[2]
            vmlsl.s16   q11, d29, d3[3]
            vmlsl.s16   q10, d26, d5[2]
            vmlsl.s16   q11, d30, d5[3]
            vmlal.s16   q10, d27, d7[2]
            vmlal.s16   q11, d31, d7[3]

            vqrshrn.s32 d20, q10, #15
            vqrshrn.s32 d21, q11, #15

            vqrshrun.s16 d18, q9, #VERTBITS - 8
            vqrshrun.s16 d19, q10, #VERTBITS - 8
.endif
            bgt         2b      /* continue inner loop */
            /* The inner loop has already been limited to ensure that none of
             * the earlier iterations could overfill the output, so the store
             * appears within the loop but after the conditional branch (at the
             * top).  At the end, provided it won't overfill, perform the final
             * store here.  If it would, then break out to the tricky tail case
             * instead.
             */
            blt         1f
            /* Store the amount of data appropriate to the configuration of the
             * instance being assembled.
             */
.if LOOP_OUTPUT_SIZE == 4
            vst1.u32    {d16[0]}, [r0]!
.elseif LOOP_OUTPUT_SIZE == 8
            vst1.u8     {d16}, [r0]!
.elseif LOOP_OUTPUT_SIZE == 16
            vst1.u8     {q8}, [r0]!
.elseif LOOP_OUTPUT_SIZE == 32
            vst1.u8     {q8,q9}, [r0]!
.endif
            b           1b              /* resume outer loop */
            /* Partial tail store case:
             * Different versions of the code need different subsets of the
             * following partial stores.  Here the number of components and the
             * size of the chunk of data produced by each inner loop iteration
             * is tested to figure out whether or not each phrase is relevant.
             */
.if 16 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 16
1:          tst         lr, #16
            beq         1f
            vst1.u8     {q8}, [r0]!
            vmov        q8, q9
.endif
.if 8 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 8
1:          tst         lr, #8
            beq         1f
            vst1.u8     {d16}, [r0]!
            vmov.u8     d16, d17
.endif
.if 4 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 4
1:          tst         lr, #4
            beq         1f
            vst1.u32    {d16[0]}, [r0]!
            vext.u32    d16, d16, d16, #1
.endif
.if 2 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 2
1:          tst         lr, #2
            beq         1f
            vst1.u16    {d16[0]}, [r0]!
            vext.u16    d16, d16, d16, #1
.endif
.if 1 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 1
1:          tst         lr, #1
            beq         1f
            vst1.u8     {d16[0]}, [r0]!
.endif
1:
9:          ldr         sp, [sp,#SP_STORE]
            vpop        {d8-d15}
            pop         {r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
END(rsdIntrinsicResizeB\comp\()_K)
.endr