diff options
Diffstat (limited to 'jidctfst.S')
-rw-r--r-- | jidctfst.S | 452 |
1 files changed, 452 insertions, 0 deletions
/*
 * Copyright (C) 2008 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * jpeg_idct_ifast: dequantize one 8x8 block of coefficients, apply a
 * two-pass (columns, then rows) integer inverse DCT, and write 8 rows of
 * 8 clamped bytes.
 *
 * Syntax: GNU as, ARM (A32) instruction set, run through the C
 * preprocessor (the #define / #if lines below).
 *
 * The "//" comments inside the routine quote the C statements that each
 * group of instructions implements.
 *
 * Arguments, as consumed by the code below:
 *   r0        cinfo       only the word at off_DECOMPRESS_range_limit_base
 *                         is read
 *   r1        compptr     only the word at off_COMPINFO_quanttable is read;
 *                         it is used as the base of a table of 32-bit
 *                         multipliers, 8 per row
 *   r2        coef_block  signed 16-bit coefficients, 8 per row
 *   r3        output_buf  read as an array of 32-bit row pointers, one
 *                         pointer per output row
 *   [sp, #0]  output_col  added to each row pointer to form the store
 *                         address
 *
 * Returns nothing.  r4-r12 and lr are saved on entry and restored on exit;
 * r0-r3 and the condition flags are clobbered.
 *
 * Stack frame (local_SIZE bytes below the saved registers):
 *   [sp, #0..15]   tmp0..tmp3 spill slots (local_TMP0123)
 *   [sp, #16]      range table pointer (stored, reloaded into r10, and not
 *                  otherwise referenced: clamping is done arithmetically)
 *   [sp, #20]      output_col
 *   [sp, #24]      running output_buf pointer
 *   [sp, #28]      unused
 *   [sp, #32...]   8x8 workspace of 32-bit intermediate values
 *
 * NOTE(review): the structure offsets 324 and 80 are hard-coded and must
 * match the layout of the C structures this is built against -- confirm
 * whenever those structures change.
 */

    .text
    .align

    .global jpeg_idct_ifast
    .func   jpeg_idct_ifast

// NOTE: sb=r9, fp=r11 ip=r12, sp=r13, lr=r14, pc=r15

// jpeg_idct_ifast (j_decompress_ptr cinfo,
//                  jpeg_component_info * compptr,
//                  short* coef_block,
//                  unsigned char* output_buf,
//                  int output_col)

#if defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__)
#define ARMv6 1
#endif

#define local_TMP0123       sp
#define local_TMP0          [sp, #0]
#define local_TMP1          [sp, #4]
#define local_TMP2          [sp, #8]
#define local_TMP3          [sp, #12]
#define local_RANGE_TABLE   [sp, #16]
#define local_OUTPUT_COL    [sp, #20]
#define local_OUTPUT_BUF    [sp, #24]
#define local_UNUSED        [sp, #28]
#define off_WORKSPACE       32
// Fixed: this previously expanded to the undefined symbol offWORKSPACE.
#define local_WORKSPACE     [sp, #off_WORKSPACE]
#define local_SIZE          (off_WORKSPACE + 8*8*4)

#define off_DECOMPRESS_range_limit_base 324
#define off_COMPINFO_quanttable         80

#define DCTSIZE 8
// Byte offset of row x in the 16-bit coefficient block / 32-bit tables.
#define VY(x)   ((x)*DCTSIZE*2)
#define QY(x)   ((x)*DCTSIZE*4)

// Byte offset of column x in the 16-bit coefficient block / 32-bit tables.
#define VX(x)   ((x)*2)
#define QX(x)   ((x)*4)

// Multipliers scaled by 256.  They are not referenced by name below; the
// code builds each one in a register (see the mov/add and mov/mla pairs).
#define FIX_1_414213562 #362
#define FIX_1_082392200 #277
#define FIX_1_847759065 #473
#define FIX_2_613125930 #669

#define RANGE_MASK 1023



jpeg_idct_ifast:
    pld     [r2, #0]
    stmdb   sp!, {r4,r5, r6,r7, r8,r9, r10,r11, r12,lr}
    // 10 registers were just pushed, so the fifth argument (output_col)
    // now sits 4*10 bytes above sp.
    ldr     r4, [sp, #4*10]
    sub     sp, #local_SIZE

    ldr     r10,[r1, #off_COMPINFO_quanttable]          // r10 = quanttable
    str     r4, local_OUTPUT_COL
    str     r3, local_OUTPUT_BUF
    ldr     r5, [r0, #off_DECOMPRESS_range_limit_base]
    add     r5, r5, #128
    str     r5, local_RANGE_TABLE
    mov     fp, r2                                      // fp = coef_block
    add     ip, sp, #off_WORKSPACE                      // ip = wsptr

// ---------------------------------------------------------------------
// Pass 1: one iteration per column.  fp = input column, r10 = quant
// column, ip = workspace column.  Despite its name, VLoopTail is the top
// of the loop.
// ---------------------------------------------------------------------
VLoopTail:
    ldrsh   r0, [fp, #VY(0)]
    ldrsh   r1, [fp, #VY(1)]
    ldrsh   r2, [fp, #VY(2)]
    ldrsh   r3, [fp, #VY(3)]
    ldrsh   r4, [fp, #VY(4)]
    ldrsh   r5, [fp, #VY(5)]
    ldrsh   r6, [fp, #VY(6)]
    ldrsh   r7, [fp, #VY(7)]

    // Z stays set only while every value tested so far is zero, so the
    // branch is taken when rows 1..7 of this column are all zero.
    cmp     r1, #0
    orreqs  r8, r2, r3
    orreqs  r8, r4, r5
    orreqs  r8, r6, r7
    beq     VLoopHeadZero

VLoopHead:
    // tmp0 = DEQUANTIZE(in[DCTSIZE*0], quant[DCTSIZE*0])   (r0)
    // tmp2 = DEQUANTIZE(in[DCTSIZE*4], quant[DCTSIZE*4])   (r4)
    // tmp1 = DEQUANTIZE(in[DCTSIZE*2], quant[DCTSIZE*2])   (r2)
    // tmp3 = DEQUANTIZE(in[DCTSIZE*6], quant[DCTSIZE*6])   (r6)
    // tmp10 = tmp0 + tmp2   (r0)
    // tmp11 = tmp0 - tmp2   (r4)

    // smulbb/smlabb multiply the low 16 bits of each operand.  A
    // difference a - b is formed from the sum as (a + b) - 2*b.
    ldr     r9, [r10, #QY(4)]
    ldr     r8, [r10, #QY(0)]
    smulbb  r4, r9, r4
    smlabb  r0, r8, r0, r4
    ldr     r9, [r10, #QY(6)]
    ldr     r8, [r10, #QY(2)]
    sub     r4, r0, r4, lsl #1
    smulbb  r6, r9, r6
    smlabb  r2, r8, r2, r6

    // tmp13 = tmp1 + tmp3                                       (r2)
    // tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13    (r6)
    // FIX_1_4142... = 362 = 45*8 + 2
    sub     r6, r2, r6, lsl #1
    mov     r8, #360
    add     r8, r8, #2
    mul     r9, r6, r8

    // tmp0 = tmp10 + tmp13;   (r0)
    // tmp3 = tmp10 - tmp13;   (r8)
    // tmp1 = tmp11 + tmp12;   (r4)
    // tmp2 = tmp11 - tmp12;   (r6)
    add     r0, r0, r2
    rsb     r6, r2, r9, asr #8
    sub     r8, r0, r2, lsl #1
    add     r4, r4, r6
    sub     r6, r4, r6, lsl #1

    stmia   local_TMP0123, {r0, r4, r6, r8}

    // NOTE: be sure to not use r0,r4,r6,r8 soon after stm above

    // odd part
    // tmp4 = DEQUANTIZE( in[DCTSIZE*1], quant[DCTSIZE*1] )   (r1)
    // tmp6 = DEQUANTIZE( in[DCTSIZE*5], quant[DCTSIZE*5] )   (r5)
    // tmp5 = DEQUANTIZE( in[DCTSIZE*3], quant[DCTSIZE*3] )   (r3)
    // tmp7 = DEQUANTIZE( in[DCTSIZE*7], quant[DCTSIZE*7] )   (r7)
    // z13 = tmp6 + tmp5;  (r0)
    // z10 = tmp6 - tmp5;  (r2)
    // z11 = tmp4 + tmp7;  (r4)
    // z12 = tmp4 - tmp7;  (r6)

    ldr     r2, [r10, #QY(1)]
    ldr     r9, [r10, #QY(5)]
    smulbb  r1, r2, r1
    ldr     r2, [r10, #QY(3)]
    smulbb  r5, r9, r5
    ldr     r9, [r10, #QY(7)]
    smlabb  r0, r2, r3, r5
    smlabb  r4, r9, r7, r1
    rsb     r2, r0, r5, lsl #1
    rsb     r6, r4, r1, lsl #1

    // tmp7 = z11 + z13;                             (r7)
    // tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); (r1)
    // FIX_... = 360 + 2
    add     r7, r4, r0
    sub     r1, r4, r0
    mov     r8, #360
    add     r8, r8, #2
    mul     r1, r8, r1

    // z5 = MULTIPLY(z10 + z12, FIX_1_847759065);        (r8)
    // tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5;      (r0)
    // tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5;    (r2)
    // FIX_1_8477... = 473 = 472 + 1
    // FIX_1_082...  = 277 = 276 + 1
    // FIX_2_...     = 669 = 668 + 1
    // Each mla computes x*K + x, which gives x*(K+1) from the constant K
    // loaded by the preceding mov.
    add     r8, r2, r6
    mov     r9, #472
    mla     r8, r9, r8, r8
    mov     r9, #276
    mla     r0, r6, r9, r6
    mov     r9, #668
    mla     r2, r9, r2, r2
    sub     r0, r0, r8
    rsb     r2, r2, r8

    // tmp6 = tmp12 - tmp7;  (r6)
    // tmp5 = tmp11 - tmp6;  (r5)
    // tmp4 = tmp10 + tmp5;  (r4)
    rsb     r6, r7, r2, asr #8
    rsb     r5, r6, r1, asr #8
    add     r4, r5, r0, asr #8

    ldmia   local_TMP0123, {r0, r1, r2, r3}

    // wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7);
    // wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7);
    // wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6);
    // wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6);
    // wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5);
    // wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5);
    // wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4);
    // wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4);

    add     r0, r0, r7
    sub     r7, r0, r7, lsl #1
    add     r1, r1, r6
    sub     r6, r1, r6, lsl #1
    add     r2, r2, r5
    sub     r5, r2, r5, lsl #1
    sub     r3, r3, r4
    add     r4, r3, r4, lsl #1

    str     r0, [ip, #QY(0)]
    str     r1, [ip, #QY(1)]
    str     r2, [ip, #QY(2)]
    str     r3, [ip, #QY(3)]
    str     r4, [ip, #QY(4)]
    str     r5, [ip, #QY(5)]
    str     r6, [ip, #QY(6)]
    str     r7, [ip, #QY(7)]

    // inptr++;                    /* advance pointers to next column */
    // quantptr++;
    // wsptr++;
    add     fp, fp, #2
    add     r10, r10, #4
    add     ip, ip, #4
    // Done once ip has advanced past the 8 words of the first workspace row.
    add     r0, sp, #(off_WORKSPACE + 4*8)
    cmp     ip, r0
    bne     VLoopTail



// ---------------------------------------------------------------------
// Pass 2: one iteration per workspace row.  ip = workspace row pointer,
// fp = output address for the row.
// ---------------------------------------------------------------------
HLoopStart:
    // reset pointers
    pld     [sp, #off_WORKSPACE]
    add     ip, sp, #off_WORKSPACE
    // r10 is reloaded here but is not referenced again before Exit.
    ldr     r10, local_RANGE_TABLE

HLoopTail:
    // output = *output_buf++ + output_col
    ldr     r0, local_OUTPUT_BUF
    ldr     r1, local_OUTPUT_COL
    ldr     r2, [r0], #4
    str     r0, local_OUTPUT_BUF
    add     fp, r2, r1

    pld     [ip, #32]
    ldmia   ip!, {r0-r7}

    // Same all-zero test as in pass 1, on wsptr[1..7].
    cmp     r1, #0
    orreqs  r8, r2, r3
    orreqs  r8, r4, r5
    orreqs  r8, r6, r7
    beq     HLoopTailZero

HLoopHead:
    // tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]);    (r0)
    // tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]);    (r4)
    add     r0, r0, r4
    sub     r4, r0, r4, lsl #1

    // tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]);    (r2)
    // tmp12 = MULTIPLY((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6], FIX_1_414213562) - tmp13;   (r6)
    // FIX_... = 360 + 2
    add     r2, r2, r6
    sub     r6, r2, r6, lsl #1
    mov     r8, #360
    add     r8, r8, #2
    mul     r6, r8, r6

    // tmp0 = tmp10 + tmp13;   (r0)
    // tmp3 = tmp10 - tmp13;   (r8)
    // tmp1 = tmp11 + tmp12;   (r4)
    // tmp2 = tmp11 - tmp12;   (r6)
    add     r0, r0, r2
    rsb     r6, r2, r6, asr #8
    sub     r8, r0, r2, lsl #1
    add     r4, r4, r6
    sub     r6, r4, r6, lsl #1

    stmia   local_TMP0123, {r0, r4, r6, r8}

    // Odd part

    // z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3];  (r0)
    // z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3];  (r2)
    // z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7];  (r4)
    // z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7];  (r6)
    add     r0, r5, r3
    sub     r2, r5, r3
    add     r4, r1, r7
    sub     r6, r1, r7

    // tmp7 = z11 + z13;                             (r7)
    // tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); (r1)
    // FIX_... = 360 + 2
    add     r7, r4, r0
    sub     r1, r4, r0
    mov     r8, #360
    add     r8, r8, #2
    mul     r1, r8, r1

    // z5 = MULTIPLY(z10 + z12, FIX_1_847759065);        (r8)
    // tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5;      (r0)
    // tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5;    (r2)
    // FIX_1_8477... = 473 = 472 + 1
    // FIX_1_082...  = 277 = 276 + 1
    // FIX_2_...     = 669 = 668 + 1
    add     r8, r2, r6
    mov     r9, #472
    mla     r8, r9, r8, r8
    mov     r9, #276
    mla     r0, r6, r9, r6
    mov     r9, #668
    mla     r2, r9, r2, r2
    sub     r0, r0, r8
    sub     r2, r8, r2

    // tmp6 = tmp12 - tmp7;  (r6)
    // tmp5 = tmp11 - tmp6;  (r5)
    // tmp4 = tmp10 + tmp5;  (r4)
    rsb     r6, r7, r2, asr #8
    rsb     r5, r6, r1, asr #8
    add     r4, r5, r0, asr #8

    ldmia   local_TMP0123, {r0, r1, r2, r3}

    // outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3) & RANGE_MASK];
    // outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3) & RANGE_MASK];
    // outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3) & RANGE_MASK];
    // outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3) & RANGE_MASK];
    // outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3) & RANGE_MASK];
    // outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3) & RANGE_MASK];
    // outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3) & RANGE_MASK];
    // outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3) & RANGE_MASK];

    // Instead of a table lookup, each value is shifted right by 5, biased
    // by 128 (r8) and clamped to 0..255 below.
    mov     r8, #128
    add     r0, r0, r7
    sub     r7, r0, r7, lsl #1
    add     r0, r8, r0, asr #5
    add     r7, r8, r7, asr #5
    add     r1, r1, r6
    sub     r6, r1, r6, lsl #1
    add     r1, r8, r1, asr #5
    add     r6, r8, r6, asr #5
    add     r2, r2, r5
    sub     r5, r2, r5, lsl #1
    add     r2, r8, r2, asr #5
    add     r5, r8, r5, asr #5
    sub     r3, r3, r4
    add     r4, r3, r4, lsl #1
    add     r3, r8, r3, asr #5
    add     r4, r8, r4, asr #5

#ifdef ARMv6
    usat    r0, #8, r0
    usat    r1, #8, r1
    usat    r2, #8, r2
    usat    r3, #8, r3
    usat    r4, #8, r4
    usat    r5, #8, r5
    usat    r6, #8, r6
    usat    r7, #8, r7
#else
    // Unsigned "higher than 255" is true for negative values and for
    // values above 255.  mvn of (x asr #31) is 0 for negative x and all
    // ones otherwise; the and keeps the low byte, giving 0 or 255.
    // r3 and r7 skip the and on purpose: they are shifted left by 24 when
    // packed below, which discards their upper 24 bits anyway.
    cmp     r0, #255
    mvnhi   r0, r0, asr #31
    andhi   r0, #255
    cmp     r7, #255
    mvnhi   r7, r7, asr #31
    cmp     r1, #255
    mvnhi   r1, r1, asr #31
    andhi   r1, #255
    cmp     r6, #255
    mvnhi   r6, r6, asr #31
    andhi   r6, #255
    cmp     r2, #255
    mvnhi   r2, r2, asr #31
    andhi   r2, #255
    cmp     r5, #255
    mvnhi   r5, r5, asr #31
    andhi   r5, #255
    cmp     r3, #255
    mvnhi   r3, r3, asr #31
    cmp     r4, #255
    mvnhi   r4, r4, asr #31
    andhi   r4, #255
#endif

    // r3 r2 r1 r0
    orr     r0, r0, r1, lsl #8
    orr     r0, r0, r2, lsl #16
    orr     r0, r0, r3, lsl #24

    // r7 r6 r5 r4
    orr     r1, r4, r5, lsl #8
    orr     r1, r1, r6, lsl #16
    orr     r1, r1, r7, lsl #24
    // NOTE(review): two-word store; presumably the row address plus
    // output_col is word aligned -- confirm against callers.
    stmia   fp, {r0, r1}

    // ip was advanced by 8 words by the ldmia above; stop at the end of
    // the 8x8 workspace.
    add     r0, sp, #(off_WORKSPACE + 8*8*4)
    cmp     ip, r0
    bne     HLoopTail

Exit:
    add     sp, sp, #local_SIZE
    ldmia   sp!, {r4,r5, r6,r7, r8,r9, r10,r11, r12,lr}
    bx      lr


// Pass 1 shortcut: rows 1..7 of this column are zero, so the dequantized
// row-0 value is written to all eight workspace rows of the column.
VLoopHeadZero:
// ok, all AC coefficients are 0
    ldr     r1, [r10, #QY(0)]
    add     fp, fp, #2
    add     r10, r10, #4
    mul     r0, r1, r0
    str     r0, [ip, #QY(0)]
    str     r0, [ip, #QY(1)]
    str     r0, [ip, #QY(2)]
    str     r0, [ip, #QY(3)]
    str     r0, [ip, #QY(4)]
    str     r0, [ip, #QY(5)]
    str     r0, [ip, #QY(6)]
    str     r0, [ip, #QY(7)]
    add     ip, ip, #4
    add     r0, sp, #(off_WORKSPACE + 4*8)
    cmp     ip, r0
    beq     HLoopStart
    b       VLoopTail

// Pass 2 shortcut: wsptr[1..7] are zero, so all eight output bytes of the
// row equal the scaled, biased and clamped wsptr[0].
HLoopTailZero:
    mov     r0, r0, asr #5
    add     r0, #128

#ifdef ARMv6
    usat    r0, #8, r0
#else
    cmp     r0, #255
    mvnhi   r0, r0, asr #31
    andhi   r0, r0, #255
#endif

    // Replicate the byte into all four byte lanes of r0, then copy to r1.
    orr     r0, r0, lsl #8
    orr     r0, r0, lsl #16
    mov     r1, r0
    stmia   fp, {r0, r1}

    add     r0, sp, #(off_WORKSPACE + 64*4)
    cmp     ip, r0
    beq     Exit
    b       HLoopTail

    .endfunc