diff options
Diffstat (limited to 'jidctfst.S')
-rw-r--r-- | jidctfst.S | 476 |
1 files changed, 0 insertions, 476 deletions
/*
 * Copyright (C) 2008 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <machine/cpu-features.h>

    .text
    .align

    .global jpeg_idct_ifast
    .func   jpeg_idct_ifast

// NOTE(review): there is no ".type jpeg_idct_ifast, %function" here; confirm
// whether callers built for Thumb rely on the symbol being typed as a function.

// NOTE: sb=r9, fp=r11 ip=r12, sp=r13, lr=r14, pc=r15

// jpeg_idct_ifast (j_decompress_ptr       cinfo,
//                  jpeg_component_info *   compptr,
//                  short*                  coef_block,
//                  unsigned char*          output_buf,
//                  int                     output_col)
//
// Syntax: GNU as, ARM state, pre-UAL mnemonics (orreqs, mvnhi, ...), passed
// through the C preprocessor.
//
// In:    r0      = cinfo
//        r1      = compptr
//        r2      = coef_block; read as signed 16-bit values (ldrsh)
//        r3      = output_buf; one pointer is loaded from it per output row
//                  (ldr r2, [r0], #4), so it is used as an array of row pointers
//        [sp,#0] = output_col; byte offset added to each row pointer
// Out:   8 rows of 8 bytes stored at (row pointer + output_col)
// Saves: r4-r12 and lr are pushed on entry and restored on exit.
// Stack: 10 saved registers (40 bytes) + local_SIZE (288 bytes).
//
// The routine runs two passes:
//   pass 1 (VLoop*): for each of 8 columns, multiply the coefficients by the
//                    matching quantization-table words and write 8 32-bit
//                    results into the workspace on the stack;
//   pass 2 (HLoop*): for each of 8 workspace rows, compute the outputs, shift
//                    right by 5, add 128, clamp to 0..255 and store 8 bytes.

// ---------------------------------------------------------------------------
// Stack frame (offsets from sp after the prologue)
// ---------------------------------------------------------------------------
#define local_TMP0123       sp
#define local_TMP0          [sp, #0]
#define local_TMP1          [sp, #4]
#define local_TMP2          [sp, #8]
#define local_TMP3          [sp, #12]
#define local_RANGE_TABLE   [sp, #16]
#define local_OUTPUT_COL    [sp, #20]
#define local_OUTPUT_BUF    [sp, #24]
#define local_UNUSED        [sp, #28]
#define off_WORKSPACE       32
// Fixed: this previously referenced the undefined name "offWORKSPACE".
#define local_WORKSPACE     [sp, #off_WORKSPACE]
#define local_SIZE          (off_WORKSPACE + 8*8*4)

// Hard-coded byte offsets into the structures behind cinfo and compptr.
// NOTE(review): these must match the C struct layout of the libjpeg build this
// file is linked into - confirm whenever those structs change.
#define off_DECOMPRESS_range_limit_base 324
#define off_COMPINFO_quanttable         80

#define DCTSIZE 8
#define VY(x)   ((x)*DCTSIZE*2)     // byte offset of row x with 16-bit elements
#define QY(x)   ((x)*DCTSIZE*4)     // byte offset of row x with 32-bit elements

#define VX(x)   ((x)*2)
#define QX(x)   ((x)*4)

// Multiplier constants. The code below does not use these macros; it builds
// the same values in registers (mov #360 / add #2, or mla with constant-1).
#define FIX_1_414213562 #362
#define FIX_1_082392200 #277
#define FIX_1_847759065 #473
#define FIX_2_613125930 #669

#define RANGE_MASK 1023


// NOTE(review): in every mul/mla below the destination register differs from
// the first source operand. This looks deliberate (older ARM cores require
// Rd != Rm for mul/mla) - confirm before reordering operands.

jpeg_idct_ifast:
    PLD     (r2, #0)
    stmdb   sp!, {r4,r5, r6,r7, r8,r9, r10,r11, r12,lr}
    ldr     r4, [sp, #4*10]                 // r4 = output_col (5th argument, above the 10 saved regs)
    sub     sp, sp, #local_SIZE

    ldr     r10,[r1, #off_COMPINFO_quanttable]  // r10 = quanttable
    str     r4, local_OUTPUT_COL
    str     r3, local_OUTPUT_BUF
    ldr     r5, [r0, #off_DECOMPRESS_range_limit_base]
    add     r5, r5, #128
    str     r5, local_RANGE_TABLE
    mov     fp, r2                          // fp = coef_block
    add     ip, sp, #off_WORKSPACE          // ip = workspace write pointer

// ---------------------------------------------------------------------------
// Pass 1: one column per iteration.
//   fp  = input column pointer  (advances by 2 bytes)
//   r10 = quant column pointer  (advances by 4 bytes)
//   ip  = workspace column ptr  (advances by 4 bytes)
// ---------------------------------------------------------------------------
VLoopTail:
    ldrsh   r0, [fp, #VY(0)]
    ldrsh   r1, [fp, #VY(1)]
    ldrsh   r2, [fp, #VY(2)]
    ldrsh   r3, [fp, #VY(3)]
    ldrsh   r4, [fp, #VY(4)]
    ldrsh   r5, [fp, #VY(5)]
    ldrsh   r6, [fp, #VY(6)]
    ldrsh   r7, [fp, #VY(7)]

    // Z stays set only while every value tested so far is zero, so the
    // branch is taken when r1..r7 are all zero.
    cmp     r1, #0
    orreqs  r8, r2, r3
    orreqs  r8, r4, r5
    orreqs  r8, r6, r7
    beq     VLoopHeadZero

VLoopHead:
    // tmp0 = DEQUANTIZE(in[DCTSIZE*0], quant[DCTSIZE*0])   (r0)
    // tmp2 = DEQUANTIZE(in[DCTSIZE*4], quant[DCTSIZE*4])   (r4)
    // tmp1 = DEQUANTIZE(in[DCTSIZE*2], quant[DCTSIZE*2])   (r2)
    // tmp3 = DEQUANTIZE(in[DCTSIZE*6], quant[DCTSIZE*6])   (r6)
    // tmp10 = tmp0 + tmp2   (r0)
    // tmp11 = tmp0 - tmp2   (r4)

    ldr     r9, [r10, #QY(4)]
    ldr     r8, [r10, #QY(0)]
#if __ARM_HAVE_HALFWORD_MULTIPLY
    smulbb  r4, r9, r4
    smlabb  r0, r8, r0, r4
#else
    mul     r4, r9, r4
    mul     r0, r8, r0
    add     r0, r0, r4
#endif
    ldr     r9, [r10, #QY(6)]
    ldr     r8, [r10, #QY(2)]
    sub     r4, r0, r4, lsl #1              // tmp11 = tmp10 - 2*tmp2
#if __ARM_HAVE_HALFWORD_MULTIPLY
    smulbb  r6, r9, r6
    smlabb  r2, r8, r2, r6
#else
    mul     r6, r9, r6
    mul     r2, r8, r2
    add     r2, r2, r6
#endif

    // tmp13 = tmp1 + tmp3                                       (r2)
    // tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13    (r6)
    // FIX_1_4142... = 362 = 45*8 + 2
    sub     r6, r2, r6, lsl #1              // r6 = tmp13 - 2*tmp3 = tmp1 - tmp3
    mov     r8, #360
    add     r8, r8, #2
    mul     r9, r6, r8

    // tmp0 = tmp10 + tmp13;   (r0)
    // tmp3 = tmp10 - tmp13;   (r8)
    // tmp1 = tmp11 + tmp12;   (r4)
    // tmp2 = tmp11 - tmp12;   (r6)
    add     r0, r0, r2
    rsb     r6, r2, r9, asr #8              // tmp12 = (product >> 8) - tmp13
    sub     r8, r0, r2, lsl #1
    add     r4, r4, r6
    sub     r6, r4, r6, lsl #1

    stmia   local_TMP0123, {r0, r4, r6, r8}

    // NOTE: be sure not to use r0,r4,r6,r8 soon after the stm above

    // odd part
    // tmp4 = DEQUANTIZE( in[DCTSIZE*1], quant[DCTSIZE*1] )   (r1)
    // tmp6 = DEQUANTIZE( in[DCTSIZE*5], quant[DCTSIZE*5] )   (r5)
    // tmp5 = DEQUANTIZE( in[DCTSIZE*3], quant[DCTSIZE*3] )   (r3)
    // tmp7 = DEQUANTIZE( in[DCTSIZE*7], quant[DCTSIZE*7] )   (r7)
    // z13 = tmp6 + tmp5;  (r0)
    // z10 = tmp6 - tmp5;  (r2)
    // z11 = tmp4 + tmp7;  (r4)
    // z12 = tmp4 - tmp7;  (r6)

    ldr     r2, [r10, #QY(1)]
    ldr     r9, [r10, #QY(5)]
#if __ARM_HAVE_HALFWORD_MULTIPLY
    smulbb  r1, r2, r1
#else
    mul     r1, r2, r1
#endif
    ldr     r2, [r10, #QY(3)]
#if __ARM_HAVE_HALFWORD_MULTIPLY
    smulbb  r5, r9, r5
#else
    mul     r5, r9, r5
#endif
    ldr     r9, [r10, #QY(7)]
#if __ARM_HAVE_HALFWORD_MULTIPLY
    smlabb  r0, r2, r3, r5
    smlabb  r4, r9, r7, r1
#else
    mul     r0, r2, r3
    add     r0, r0, r5
    mul     r4, r9, r7
    add     r4, r4, r1
#endif
    rsb     r2, r0, r5, lsl #1              // z10 = 2*tmp6 - z13
    rsb     r6, r4, r1, lsl #1              // z12 = 2*tmp4 - z11

    // tmp7 = z11 + z13;                                (r7)
    // tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562);    (r1)
    // FIX_... = 360 + 2
    add     r7, r4, r0
    sub     r1, r4, r0
    mov     r8, #360
    add     r8, r8, #2
    mul     r1, r8, r1

    // z5 = MULTIPLY(z10 + z12, FIX_1_847759065);        (r8)
    // tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5;      (r0)
    // tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5;    (r2)
    // FIX_1_8477... = 473 = 472 + 1
    // FIX_1_082...  = 277 = 276 + 1
    // FIX_2_...     = 669 = 668 + 1
    // Each mla computes x*c + x, i.e. x*(c+1).
    add     r8, r2, r6
    mov     r9, #472
    mla     r8, r9, r8, r8
    mov     r9, #276
    mla     r0, r6, r9, r6
    mov     r9, #668
    mla     r2, r9, r2, r2
    sub     r0, r0, r8
    rsb     r2, r2, r8

    // tmp6 = tmp12 - tmp7;  (r6)
    // tmp5 = tmp11 - tmp6;  (r5)
    // tmp4 = tmp10 + tmp5;  (r4)
    rsb     r6, r7, r2, asr #8
    rsb     r5, r6, r1, asr #8
    add     r4, r5, r0, asr #8

    ldmia   local_TMP0123, {r0, r1, r2, r3}  // reload tmp0..tmp3 saved by the stmia above

    // wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7);
    // wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7);
    // wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6);
    // wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6);
    // wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5);
    // wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5);
    // wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4);
    // wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4);

    add     r0, r0, r7
    sub     r7, r0, r7, lsl #1
    add     r1, r1, r6
    sub     r6, r1, r6, lsl #1
    add     r2, r2, r5
    sub     r5, r2, r5, lsl #1
    sub     r3, r3, r4
    add     r4, r3, r4, lsl #1

    str     r0, [ip, #QY(0)]
    str     r1, [ip, #QY(1)]
    str     r2, [ip, #QY(2)]
    str     r3, [ip, #QY(3)]
    str     r4, [ip, #QY(4)]
    str     r5, [ip, #QY(5)]
    str     r6, [ip, #QY(6)]
    str     r7, [ip, #QY(7)]

    // inptr++;                    /* advance pointers to next column */
    // quantptr++;
    // wsptr++;
    add     fp, fp, #2
    add     r10, r10, #4
    add     ip, ip, #4
    add     r0, sp, #(off_WORKSPACE + 4*8)  // end of the first workspace row = 8 columns done
    cmp     ip, r0
    bne     VLoopTail


// ---------------------------------------------------------------------------
// Pass 2: one workspace row per iteration.
//   ip = workspace read pointer (ldmia ip! advances it by 32 bytes)
//   fp = destination address for the 8 output bytes of this row
// ---------------------------------------------------------------------------
HLoopStart:
    // reset pointers
    PLD     (sp, #off_WORKSPACE)
    add     ip, sp, #off_WORKSPACE
    // r10 is loaded here but is not read by any instruction in pass 2;
    // the clamping below is done with usat or cmp/mvnhi/andhi.
    ldr     r10, local_RANGE_TABLE

HLoopTail:
    // output = *output_buf++ + output_col
    ldr     r0, local_OUTPUT_BUF
    ldr     r1, local_OUTPUT_COL
    ldr     r2, [r0], #4
    str     r0, local_OUTPUT_BUF
    add     fp, r2, r1

    PLD     (ip, #32)
    ldmia   ip!, {r0-r7}

    // Same all-zero test as in pass 1: taken when r1..r7 are all zero.
    cmp     r1, #0
    orreqs  r8, r2, r3
    orreqs  r8, r4, r5
    orreqs  r8, r6, r7
    beq     HLoopTailZero

HLoopHead:
    // tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]);   (r0)
    // tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]);   (r4)
    add     r0, r0, r4
    sub     r4, r0, r4, lsl #1

    // tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]);   (r2)
    // tmp12 = MULTIPLY((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6],
    //                  FIX_1_414213562) - tmp13;            (r6)
    // FIX_... = 360 + 2
    add     r2, r2, r6
    sub     r6, r2, r6, lsl #1
    mov     r8, #360
    add     r8, r8, #2
    mul     r6, r8, r6

    // tmp0 = tmp10 + tmp13;   (r0)
    // tmp3 = tmp10 - tmp13;   (r8)
    // tmp1 = tmp11 + tmp12;   (r4)
    // tmp2 = tmp11 - tmp12;   (r6)
    add     r0, r0, r2
    rsb     r6, r2, r6, asr #8
    sub     r8, r0, r2, lsl #1
    add     r4, r4, r6
    sub     r6, r4, r6, lsl #1

    stmia   local_TMP0123, {r0, r4, r6, r8}

    // Odd part

    // z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3];  (r0)
    // z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3];  (r2)
    // z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7];  (r4)
    // z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7];  (r6)
    add     r0, r5, r3
    sub     r2, r5, r3
    add     r4, r1, r7
    sub     r6, r1, r7

    // tmp7 = z11 + z13;                                (r7)
    // tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562);    (r1)
    // FIX_... = 360 + 2
    add     r7, r4, r0
    sub     r1, r4, r0
    mov     r8, #360
    add     r8, r8, #2
    mul     r1, r8, r1

    // z5 = MULTIPLY(z10 + z12, FIX_1_847759065);        (r8)
    // tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5;      (r0)
    // tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5;    (r2)
    // FIX_1_8477... = 473 = 472 + 1
    // FIX_1_082...  = 277 = 276 + 1
    // FIX_2_...     = 669 = 668 + 1
    add     r8, r2, r6
    mov     r9, #472
    mla     r8, r9, r8, r8
    mov     r9, #276
    mla     r0, r6, r9, r6
    mov     r9, #668
    mla     r2, r9, r2, r2
    sub     r0, r0, r8
    sub     r2, r8, r2

    // tmp6 = tmp12 - tmp7;  (r6)
    // tmp5 = tmp11 - tmp6;  (r5)
    // tmp4 = tmp10 + tmp5;  (r4)
    rsb     r6, r7, r2, asr #8
    rsb     r5, r6, r1, asr #8
    add     r4, r5, r0, asr #8

    ldmia   local_TMP0123, {r0, r1, r2, r3}

    // outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3) & RANGE_MASK];
    // outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3) & RANGE_MASK];
    // outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3) & RANGE_MASK];
    // outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3) & RANGE_MASK];
    // outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3) & RANGE_MASK];
    // outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3) & RANGE_MASK];
    // outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3) & RANGE_MASK];
    // outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3) & RANGE_MASK];
    //
    // Here the table lookup is replaced by arithmetic: each value is shifted
    // right by 5, has 128 added, and is then clamped to 0..255.

    mov     r8, #128
    add     r0, r0, r7
    sub     r7, r0, r7, lsl #1
    add     r0, r8, r0, asr #5
    add     r7, r8, r7, asr #5
    add     r1, r1, r6
    sub     r6, r1, r6, lsl #1
    add     r1, r8, r1, asr #5
    add     r6, r8, r6, asr #5
    add     r2, r2, r5
    sub     r5, r2, r5, lsl #1
    add     r2, r8, r2, asr #5
    add     r5, r8, r5, asr #5
    sub     r3, r3, r4
    add     r4, r3, r4, lsl #1
    add     r3, r8, r3, asr #5
    add     r4, r8, r4, asr #5

#if __ARM_ARCH__ >= 6
    usat    r0, #8, r0
    usat    r1, #8, r1
    usat    r2, #8, r2
    usat    r3, #8, r3
    usat    r4, #8, r4
    usat    r5, #8, r5
    usat    r6, #8, r6
    usat    r7, #8, r7
#else
    // Clamp without usat. "hi" is an unsigned compare, so it is true both for
    // values above 255 and for negative values. mvn of (x asr #31) yields 0
    // for negative x and 0xFFFFFFFF otherwise; the and trims that to 255.
    // r3 and r7 skip the and on purpose: they are shifted left by 24 when
    // packed below, which discards everything above their low 8 bits.
    cmp     r0, #255
    mvnhi   r0, r0, asr #31
    andhi   r0, r0, #255
    cmp     r7, #255
    mvnhi   r7, r7, asr #31
    cmp     r1, #255
    mvnhi   r1, r1, asr #31
    andhi   r1, r1, #255
    cmp     r6, #255
    mvnhi   r6, r6, asr #31
    andhi   r6, r6, #255
    cmp     r2, #255
    mvnhi   r2, r2, asr #31
    andhi   r2, r2, #255
    cmp     r5, #255
    mvnhi   r5, r5, asr #31
    andhi   r5, r5, #255
    cmp     r3, #255
    mvnhi   r3, r3, asr #31
    cmp     r4, #255
    mvnhi   r4, r4, asr #31
    andhi   r4, r4, #255
#endif

    // r3 r2 r1 r0
    orr     r0, r0, r1, lsl #8
    orr     r0, r0, r2, lsl #16
    orr     r0, r0, r3, lsl #24

    // r7 r6 r5 r4
    orr     r1, r4, r5, lsl #8
    orr     r1, r1, r6, lsl #16
    orr     r1, r1, r7, lsl #24
    // NOTE(review): two whole words are stored, with r0 in the low byte of the
    // first word. This presumably assumes a little-endian target and a
    // 4-byte-aligned (row pointer + output_col) - confirm against callers.
    stmia   fp, {r0, r1}

    add     r0, sp, #(off_WORKSPACE + 8*8*4)    // end of workspace = all 8 rows done
    cmp     ip, r0
    bne     HLoopTail

Exit:
    add     sp, sp, #local_SIZE
    ldmia   sp!, {r4,r5, r6,r7, r8,r9, r10,r11, r12,lr}
    bx      lr


// Pass 1 shortcut: rows 1..7 of this column are zero, so every workspace
// entry of the column is in[0] * quant[0].
VLoopHeadZero:
// ok, all AC coefficients are 0
    ldr     r1, [r10, #QY(0)]
    add     fp, fp, #2
    add     r10, r10, #4
    mul     r0, r1, r0
    str     r0, [ip, #QY(0)]
    str     r0, [ip, #QY(1)]
    str     r0, [ip, #QY(2)]
    str     r0, [ip, #QY(3)]
    str     r0, [ip, #QY(4)]
    str     r0, [ip, #QY(5)]
    str     r0, [ip, #QY(6)]
    str     r0, [ip, #QY(7)]
    add     ip, ip, #4
    add     r0, sp, #(off_WORKSPACE + 4*8)
    cmp     ip, r0
    beq     HLoopStart
    b       VLoopTail

// Pass 2 shortcut: elements 1..7 of this row are zero, so all 8 output bytes
// are the same clamped value derived from element 0.
HLoopTailZero:
    mov     r0, r0, asr #5
    add     r0, r0, #128

#if __ARM_ARCH__ >= 6
    usat    r0, #8, r0
#else
    cmp     r0, #255
    mvnhi   r0, r0, asr #31
    andhi   r0, r0, #255
#endif

    orr     r0, r0, r0, lsl #8              // replicate the byte into all 4 lanes
    orr     r0, r0, r0, lsl #16
    mov     r1, r0
    stmia   fp, {r0, r1}

    add     r0, sp, #(off_WORKSPACE + 64*4)
    cmp     ip, r0
    beq     Exit
    b       HLoopTail

    .endfunc