aboutsummaryrefslogtreecommitdiff
path: root/jidctfst.S
blob: 34e1c24fd9000a9637e4d9cf57d4b46750d1e65d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
/*
 * Copyright (C) 2008 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <machine/cpu-features.h>

    // Code goes into the text section; .align with no operand is kept exactly
    // as in the original.
    .text
    .align

    // Export the entry point.  The .type directive marks the symbol as a
    // function in the ELF symbol table (it is otherwise emitted untyped), which
    // is what linkers and tools rely on to treat it as callable code, e.g. for
    // ARM/Thumb call fix-ups.  .func opens the debug-info function scope that
    // is closed by .endfunc at the end of the routine.
    .global jpeg_idct_ifast
    .type   jpeg_idct_ifast, %function
    .func   jpeg_idct_ifast

// NOTE: sb=r9, fp=r11 ip=r12, sp=r13, lr=r14, pc=r15

// jpeg_idct_ifast (j_decompress_ptr       cinfo,
//                 jpeg_component_info *   compptr,
//                 short*                  coef_block,
//                 unsigned char*          output_buf,
//                 int                     output_col)

// Stack frame layout, relative to sp after the prologue has subtracted
// local_SIZE.  The first four words hold tmp0..tmp3 of the even part while
// the odd part is being computed; local_TMP0123 is the base register used to
// store/load all four with a single stm/ldm.
#define  local_TMP0123       sp
#define  local_TMP0          [sp, #0]
#define  local_TMP1          [sp, #4]
#define  local_TMP2          [sp, #8]
#define  local_TMP3          [sp, #12]
#define  local_RANGE_TABLE   [sp, #16]
#define  local_OUTPUT_COL    [sp, #20]
#define  local_OUTPUT_BUF    [sp, #24]
#define  local_UNUSED        [sp, #28]
// The 8x8 workspace of 32-bit values starts right after the eight words above.
// local_WORKSPACE previously referred to the undefined name offWORKSPACE; it
// now uses off_WORKSPACE so that the macro expands to a valid operand.
#define  off_WORKSPACE       32
#define  local_WORKSPACE     [sp, #off_WORKSPACE]
#define  local_SIZE          (off_WORKSPACE + 8*8*4)

// Hard-coded byte offsets into the structures pointed to by cinfo and compptr.
// NOTE(review): these must match the C structure layout used by the build --
// confirm whenever the structure definitions change.
#define  off_DECOMPRESS_range_limit_base  324
#define  off_COMPINFO_quanttable          80

// VY(x): byte offset of row x within one column of the 16-bit coefficient block.
// QY(x): byte offset of row x within one column of a table of 32-bit entries
//        (used for both the quantization table and the workspace).
#define  DCTSIZE   8
#define  VY(x)   ((x)*DCTSIZE*2)
#define  QY(x)   ((x)*DCTSIZE*4)

// VX(x)/QX(x): byte offset of element x within a row of 16-bit / 32-bit values.
#define  VX(x)   ((x)*2)
#define  QX(x)   ((x)*4)

// Multiplier constants scaled by 256 (products are shifted right by 8 in the
// code).  The code builds these values with mov/add or mla instead of using
// these macros directly.
#define  FIX_1_414213562    #362
#define  FIX_1_082392200    #277
#define  FIX_1_847759065    #473
#define  FIX_2_613125930    #669

#define  RANGE_MASK   1023



// jpeg_idct_ifast -- inverse DCT of one 8x8 coefficient block (ARM code).
//
// Arguments (first four in r0-r3, the fifth is read from the caller stack):
//   r0   = cinfo       (only read at off_DECOMPRESS_range_limit_base)
//   r1   = compptr     (only read at off_COMPINFO_quanttable)
//   r2   = coef_block  (signed 16-bit coefficients, read with ldrsh)
//   r3   = output_buf  (array of row pointers, one consumed per output row)
//   [sp] = output_col  (added to every row pointer before storing)
//
// Pass 1 (VLoop*): for each of the 8 columns, dequantize and transform the
//   column and write eight 32-bit results into the stack workspace.
// Pass 2 (HLoop*): for each of the 8 workspace rows, transform the row, shift
//   right by 5, add 128, clamp to 0..255 and store 8 bytes to the output row.
// Each pass has a shortcut (VLoopHeadZero / HLoopTailZero) that is taken when
// elements 1..7 of the column / row are all zero.
//
// r4-r12 and lr are pushed on entry and restored at Exit; returns with bx lr.
jpeg_idct_ifast:
    // PLD is a macro, presumably provided by <machine/cpu-features.h>
    // (preload hint for the coefficient block) -- TODO confirm.
    PLD     (r2, #0)
    // Ten registers are pushed (40 bytes), so the fifth argument that was at
    // [sp] on entry is now at [sp, #4*10].
    stmdb   sp!, {r4,r5, r6,r7, r8,r9, r10,r11, r12,lr}
    ldr     r4, [sp, #4*10]
    sub     sp, #local_SIZE

    ldr     r10,[r1, #off_COMPINFO_quanttable]         // r10 = quanttable
    str     r4, local_OUTPUT_COL
    str     r3, local_OUTPUT_BUF
    // Pointer loaded from cinfo, advanced by 128 and saved in the frame.
    // NOTE(review): the saved value is reloaded into r10 at HLoopStart but is
    // not read by any later instruction in this function.
    ldr     r5, [r0, #off_DECOMPRESS_range_limit_base]
    add     r5, r5, #128
    str     r5, local_RANGE_TABLE
    mov     fp, r2                                      // fp = coef_block
    add     ip, sp, #off_WORKSPACE                      // ip = workspace write pointer

// ---- Pass 1: columns.  fp = input column, r10 = quant column, ip = workspace column.
VLoopTail:
    // Load the eight coefficients of the current column (rows 0..7) into r0..r7.
    ldrsh    r0, [fp, #VY(0)]
    ldrsh    r1, [fp, #VY(1)]
    ldrsh    r2, [fp, #VY(2)]
    ldrsh    r3, [fp, #VY(3)]
    ldrsh    r4, [fp, #VY(4)]
    ldrsh    r5, [fp, #VY(5)]
    ldrsh    r6, [fp, #VY(6)]
    ldrsh    r7, [fp, #VY(7)]

    // Z flag stays set only if r1 == 0 and (r2|r3), (r4|r5), (r6|r7) are all
    // zero, i.e. rows 1..7 of this column are zero.  r8 is scratch here.
    cmp      r1, #0
    orreqs   r8, r2, r3
    orreqs   r8, r4, r5
    orreqs   r8, r6, r7
    beq      VLoopHeadZero

VLoopHead:
    // tmp0 = DEQUANTIZE(in[DCTSIZE*0], quant[DCTSIZE*0])   (r0)
    // tmp2 = DEQUANTIZE(in[DCTSIZE*4], quant[DCTSIZE*4])   (r4)
    // tmp1 = DEQUANTIZE(in[DCTSIZE*2], quant[DCTSIZE*2])   (r2)
    // tmp3 = DEQUANTIZE(in[DCTSIZE*6], quant[DCTSIZE*6])   (r6)
    // tmp10 = tmp0 + tmp2   (r0)
    // tmp11 = tmp0 - tmp2   (r4)

    // Quantization entries are loaded as 32-bit words.  The halfword-multiply
    // variant uses only the low 16 bits of each operand; the fallback uses
    // full 32-bit multiplies.
    ldr      r9, [r10, #QY(4)]
    ldr      r8, [r10, #QY(0)]
#if __ARM_HAVE_HALFWORD_MULTIPLY
    smulbb   r4, r9, r4
    smlabb   r0, r8, r0, r4
#else
    mul      r4, r9, r4
    mul      r0, r8, r0
    add      r0, r4
#endif
    ldr      r9, [r10, #QY(6)]
    ldr      r8, [r10, #QY(2)]
    // r0 holds tmp0 + tmp2 and r4 holds tmp2, so r0 - 2*r4 = tmp0 - tmp2.
    sub      r4, r0, r4, lsl #1
#if __ARM_HAVE_HALFWORD_MULTIPLY
    smulbb   r6, r9, r6
    smlabb   r2, r8, r2, r6
#else
    mul      r6, r9, r6
    mul      r2, r8, r2
    add      r2, r6
#endif

    // tmp13 = tmp1 + tmp3                                       (r2)
    // tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13    (r6)
    // FIX_1_4142... = 362 = 45*8 + 2
    // r2 already holds tmp1 + tmp3 and r6 holds tmp3, so r2 - 2*r6 = tmp1 - tmp3.
    sub      r6, r2, r6, lsl #1
    mov      r8, #360
    add      r8, r8, #2
    mul      r9, r6, r8

    // tmp0 = tmp10 + tmp13;   (r0)
    // tmp3 = tmp10 - tmp13;   (r8)
    // tmp1 = tmp11 + tmp12;   (r4)
    // tmp2 = tmp11 - tmp12;   (r6)
    // The product in r9 is scaled back with asr #8 while forming tmp12.
    add     r0, r0, r2
    rsb     r6, r2, r9, asr #8
    sub     r8, r0, r2, lsl #1
    add     r4, r4, r6
    sub     r6, r4, r6, lsl #1

    // Spill tmp0..tmp3 to the frame so r0-r8 can be reused for the odd part.
    stmia   local_TMP0123, {r0, r4, r6, r8}

    // NOTE: be sure not to use r0,r4,r6,r8 soon after the stm above

    // odd part
    // tmp4 = DEQUANTIZE( in[DCTSIZE*1], quant[DCTSIZE*1] )   (r1)
    // tmp6 = DEQUANTIZE( in[DCTSIZE*5], quant[DCTSIZE*5] )   (r5)
    // tmp5 = DEQUANTIZE( in[DCTSIZE*3], quant[DCTSIZE*3] )   (r3)
    // tmp7 = DEQUANTIZE( in[DCTSIZE*7], quant[DCTSIZE*7] )   (r7)
    // z13 = tmp6 + tmp5;  (r0)
    // z10 = tmp6 - tmp5;  (r2)
    // z11 = tmp4 + tmp7;  (r4)
    // z12 = tmp4 - tmp7;  (r6)

    ldr     r2, [r10, #QY(1)]
    ldr     r9, [r10, #QY(5)]
#if __ARM_HAVE_HALFWORD_MULTIPLY
    smulbb  r1, r2, r1
#else
    mul     r1, r2, r1
#endif
    ldr     r2, [r10, #QY(3)]
#if __ARM_HAVE_HALFWORD_MULTIPLY
    smulbb  r5, r9, r5
#else
    mul     r5, r9, r5
#endif
    ldr     r9, [r10, #QY(7)]
    // r0 = tmp5 + tmp6 (z13) and r4 = tmp7 + tmp4 (z11): the products for
    // rows 3 and 7 are accumulated onto the already dequantized r5 and r1.
#if __ARM_HAVE_HALFWORD_MULTIPLY
    smlabb  r0, r2, r3, r5
    smlabb  r4, r9, r7, r1
#else
    mul     r0, r2, r3
    add     r0, r5
    mul     r4, r9, r7
    add     r4, r1
#endif
    // z10 = 2*tmp6 - z13 and z12 = 2*tmp4 - z11.
    rsb  r2, r0, r5, lsl #1
    rsb  r6, r4, r1, lsl #1

    // tmp7 = z11 + z13;                             (r7)
    // tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); (r1)
    // FIX_... = 360 + 2
    add   r7, r4, r0
    sub   r1, r4, r0
    mov   r8, #360
    add   r8, r8, #2
    mul   r1, r8, r1

    // z5 = MULTIPLY(z10 + z12, FIX_1_847759065);        (r8)
    // tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5;      (r0)
    // tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5;    (r2)
    // FIX_1_8477... = 473 = 472 + 1
    // FIX_1_082...  = 277 = 276 + 1
    // FIX_2_...     = 669 = 668 + 1
    // Each mla computes x*K + x, i.e. x*(K+1), using the constants above.
    add     r8, r2, r6
    mov     r9, #472
    mla     r8, r9, r8, r8
    mov     r9, #276
    mla     r0, r6, r9, r6
    mov     r9, #668
    mla     r2, r9, r2, r2
    sub     r0, r0, r8
    rsb     r2, r2, r8

    // tmp6 = tmp12 - tmp7;  (r6)
    // tmp5 = tmp11 - tmp6;  (r5)
    // tmp4 = tmp10 + tmp5;  (r4)
    // The three products are scaled back with asr #8 as they are consumed.
    rsb  r6, r7, r2, asr #8
    rsb  r5, r6, r1, asr #8
    add  r4, r5, r0, asr #8

    // Reload the even-part results: r0..r3 = tmp0..tmp3.
    ldmia local_TMP0123, {r0, r1, r2, r3}

    // wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7);
    // wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7);
    // wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6);
    // wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6);
    // wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5);
    // wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5);
    // wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4);
    // wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4);

    // Each pair is formed as a = a + b, then b = a - 2*b (= original a - b).
    // For the last pair the order is reversed so that r3 ends up holding
    // tmp3 - tmp4 (row 3) and r4 holds tmp3 + tmp4 (row 4).
    add   r0, r0, r7
    sub   r7, r0, r7, lsl #1
    add   r1, r1, r6
    sub   r6, r1, r6, lsl #1
    add   r2, r2, r5
    sub   r5, r2, r5, lsl #1
    sub   r3, r3, r4
    add   r4, r3, r4, lsl #1

    // After the swaps above register rN holds the value for workspace row N.
    str   r0, [ip, #QY(0)]
    str   r1, [ip, #QY(1)]
    str   r2, [ip, #QY(2)]
    str   r3, [ip, #QY(3)]
    str   r4, [ip, #QY(4)]
    str   r5, [ip, #QY(5)]
    str   r6, [ip, #QY(6)]
    str   r7, [ip, #QY(7)]

    // inptr++;                    /* advance pointers to next column */
    // quantptr++;
    // wsptr++;
    // The pass ends when ip has advanced over the 8 columns of the first
    // workspace row (8 entries of 4 bytes).
    add  fp, fp, #2
    add  r10, r10, #4
    add  ip, ip, #4
    add  r0, sp, #(off_WORKSPACE + 4*8)
    cmp  ip, r0
    bne  VLoopTail



// ---- Pass 2: rows.  ip = workspace read pointer, fp = output row pointer.
HLoopStart:
    // reset pointers
    PLD     (sp, #off_WORKSPACE)
    add     ip, sp, #off_WORKSPACE
    // NOTE(review): r10 is not referenced again after this load; the clamping
    // below uses usat or compare sequences, not a table lookup.
    ldr     r10, local_RANGE_TABLE

HLoopTail:
    // output = *output_buf++ + output_col
    // The advanced output_buf pointer is written back to the frame because
    // r0-r7 are overwritten by the row load below.
    ldr      r0, local_OUTPUT_BUF
    ldr      r1, local_OUTPUT_COL
    ldr      r2, [r0], #4
    str      r0, local_OUTPUT_BUF
    add      fp, r2, r1

    // Load one workspace row (8 words) into r0..r7 and advance ip to the next row.
    PLD      (ip, #32)
    ldmia    ip!, {r0-r7}

    // Same all-zero test as in pass 1, here for elements 1..7 of the row.
    cmp      r1, #0
    orreqs   r8, r2, r3
    orreqs   r8, r4, r5
    orreqs   r8, r6, r7
    beq      HLoopTailZero

HLoopHead:
    // tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]);    (r0)
    // tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]);    (r4)
    add     r0, r0, r4
    sub     r4, r0, r4, lsl #1

    // tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]);                                   (r2)
    // tmp12 = MULTIPLY((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6], FIX_1_414213562) - tmp13;  (r6)
    // FIX_... = 360 + 2
    add     r2, r2, r6
    sub     r6, r2, r6, lsl #1
    mov     r8, #360
    add     r8, r8, #2
    mul     r6, r8, r6

    // tmp0 = tmp10 + tmp13;   (r0)
    // tmp3 = tmp10 - tmp13;   (r8)
    // tmp1 = tmp11 + tmp12;   (r4)
    // tmp2 = tmp11 - tmp12;   (r6)
    add     r0, r0, r2
    rsb     r6, r2, r6, asr #8
    sub     r8, r0, r2, lsl #1
    add     r4, r4, r6
    sub     r6, r4, r6, lsl #1

    // Spill tmp0..tmp3 to the frame while the odd part is computed.
    stmia   local_TMP0123, {r0, r4, r6, r8}

    // Odd part

    // z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3];  (r0)
    // z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3];  (r2)
    // z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7];  (r4)
    // z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7];  (r6)
    add  r0, r5, r3
    sub  r2, r5, r3
    add  r4, r1, r7
    sub  r6, r1, r7

    // tmp7 = z11 + z13;                             (r7)
    // tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); (r1)
    // FIX_... = 360 + 2
    add   r7, r4, r0
    sub   r1, r4, r0
    mov   r8, #360
    add   r8, r8, #2
    mul   r1, r8, r1

    // z5 = MULTIPLY(z10 + z12, FIX_1_847759065);        (r8)
    // tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5;      (r0)
    // tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5;    (r2)
    // FIX_1_8477... = 473 = 472 + 1
    // FIX_1_082...  = 277 = 276 + 1
    // FIX_2_...     = 669 = 668 + 1
    add  r8, r2, r6
    mov  r9, #472
    mla  r8, r9, r8, r8
    mov  r9, #276
    mla  r0, r6, r9, r6
    mov  r9, #668
    mla  r2, r9, r2, r2
    sub  r0, r0, r8
    sub  r2, r8, r2

    // tmp6 = tmp12 - tmp7;  (r6)
    // tmp5 = tmp11 - tmp6;  (r5)
    // tmp4 = tmp10 + tmp5;  (r4)
    rsb  r6, r7, r2, asr #8
    rsb  r5, r6, r1, asr #8
    add  r4, r5, r0, asr #8

    // Reload the even-part results: r0..r3 = tmp0..tmp3.
    ldmia local_TMP0123, {r0, r1, r2, r3}

    // outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3) & RANGE_MASK];
    // outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3) & RANGE_MASK];
    // outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3) & RANGE_MASK];
    // outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3) & RANGE_MASK];
    // outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3) & RANGE_MASK];
    // outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3) & RANGE_MASK];
    // outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3) & RANGE_MASK];
    // outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3) & RANGE_MASK];

    // Instead of the table lookup described above, each result is computed as
    // 128 + (value >> 5) and then clamped to 0..255 below.  After this
    // sequence register rN holds the value for outptr[N].
    mov    r8, #128
    add    r0, r0, r7
    sub    r7, r0, r7, lsl #1
    add    r0, r8, r0, asr #5
    add    r7, r8, r7, asr #5
    add    r1, r1, r6
    sub    r6, r1, r6, lsl #1
    add    r1, r8, r1, asr #5
    add    r6, r8, r6, asr #5
    add    r2, r2, r5
    sub    r5, r2, r5, lsl #1
    add    r2, r8, r2, asr #5
    add    r5, r8, r5, asr #5
    sub    r3, r3, r4
    add    r4, r3, r4, lsl #1
    add    r3, r8, r3, asr #5
    add    r4, r8, r4, asr #5

#if __ARM_ARCH__ >= 6
    // Unsigned saturation to 8 bits.
    usat   r0, #8, r0
    usat   r1, #8, r1
    usat   r2, #8, r2
    usat   r3, #8, r3
    usat   r4, #8, r4
    usat   r5, #8, r5
    usat   r6, #8, r6
    usat   r7, #8, r7
#else
    // Manual clamp: "hi" is an unsigned compare, so it is true for values
    // above 255 and for negative values.  mvn of (x asr #31) yields 0 for a
    // negative x and all ones for a positive x; the and then gives 0 or 255.
    // r3 and r7 have no and: they are only used shifted left by 24 below,
    // which discards everything above their low 8 bits.
    cmp    r0, #255
    mvnhi  r0, r0, asr #31
    andhi  r0, #255
    cmp    r7, #255
    mvnhi  r7, r7, asr #31
    cmp    r1, #255
    mvnhi  r1, r1, asr #31
    andhi  r1, #255
    cmp    r6, #255
    mvnhi  r6, r6, asr #31
    andhi  r6, #255
    cmp    r2, #255
    mvnhi  r2, r2, asr #31
    andhi  r2, #255
    cmp    r5, #255
    mvnhi  r5, r5, asr #31
    andhi  r5, #255
    cmp    r3, #255
    mvnhi  r3, r3, asr #31
    cmp    r4, #255
    mvnhi  r4, r4, asr #31
    andhi  r4, #255
#endif

    // Pack the eight bytes into two words, lowest-numbered sample in the
    // least significant byte (assumes little-endian stores -- TODO confirm).
    // r3 r2 r1 r0
    orr    r0, r0, r1, lsl #8
    orr    r0, r0, r2, lsl #16
    orr    r0, r0, r3, lsl #24

    // r7 r6 r5 r4
    orr    r1, r4, r5, lsl #8
    orr    r1, r1, r6, lsl #16
    orr    r1, r1, r7, lsl #24
    stmia  fp, {r0, r1}

    // The pass ends when ip has moved past the whole 8x8 workspace.
    add    r0, sp, #(off_WORKSPACE + 8*8*4)
    cmp    ip, r0
    bne    HLoopTail

Exit:
    // Release the frame and restore the registers pushed in the prologue.
    add    sp, sp, #local_SIZE
    ldmia  sp!, {r4,r5, r6,r7, r8,r9, r10,r11, r12,lr}
    bx     lr


// Pass 1 shortcut: rows 1..7 of the column are zero, so the dequantized row 0
// value is written to all eight workspace rows of this column.
VLoopHeadZero:
// ok, all AC coefficients are 0
    ldr      r1, [r10, #QY(0)]
    add      fp, fp, #2
    add      r10, r10, #4
    mul      r0, r1, r0
    str      r0, [ip, #QY(0)]
    str      r0, [ip, #QY(1)]
    str      r0, [ip, #QY(2)]
    str      r0, [ip, #QY(3)]
    str      r0, [ip, #QY(4)]
    str      r0, [ip, #QY(5)]
    str      r0, [ip, #QY(6)]
    str      r0, [ip, #QY(7)]
    // Same end-of-pass test as in the general column path.
    add      ip, ip, #4
    add      r0, sp, #(off_WORKSPACE + 4*8)
    cmp      ip, r0
    beq      HLoopStart
    b        VLoopTail

// Pass 2 shortcut: elements 1..7 of the row are zero, so all eight output
// bytes equal clamp(128 + (element0 >> 5)).
HLoopTailZero:
    mov      r0, r0, asr #5
    add      r0, #128

#if __ARM_ARCH__ >= 6
    usat     r0, #8, r0
#else
    cmp      r0, #255
    mvnhi    r0, r0, asr #31
    andhi    r0, r0, #255
#endif

    // Replicate the byte into all four byte lanes of r0, copy to r1 and store
    // both words (8 bytes) to the output row.
    orr      r0, r0, lsl #8
    orr      r0, r0, lsl #16
    mov      r1, r0
    stmia    fp, {r0, r1}

    // Same end-of-pass test as in the general row path.
    add      r0, sp, #(off_WORKSPACE + 64*4)
    cmp      ip, r0
    beq      Exit
    b        HLoopTail

    .endfunc