aboutsummaryrefslogtreecommitdiff
path: root/jidctfst.S
diff options
context:
space:
mode:
Diffstat (limited to 'jidctfst.S')
-rw-r--r--jidctfst.S452
1 files changed, 452 insertions, 0 deletions
diff --git a/jidctfst.S b/jidctfst.S
new file mode 100644
index 0000000..88fb661
--- /dev/null
+++ b/jidctfst.S
@@ -0,0 +1,452 @@
+/*
+ * Copyright (C) 2008 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+ .text
+ .align
+
+ .global jpeg_idct_ifast
+ .func jpeg_idct_ifast
+
+// NOTE: sb=r9, fp=r11 ip=r12, sp=r13, lr=r14, pc=r15
+
+// jpeg_idct_ifast (j_decompress_ptr cinfo,
+// jpeg_component_info * compptr,
+// short* coef_block,
+// unsigned char* output_buf,
+// int output_col)
+
+#if defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__)
+#define ARMv6 1
+#endif
+
+#define local_TMP0123 sp
+#define local_TMP0 [sp, #0]
+#define local_TMP1 [sp, #4]
+#define local_TMP2 [sp, #8]
+#define local_TMP3 [sp, #12]
+#define local_RANGE_TABLE [sp, #16]
+#define local_OUTPUT_COL [sp, #20]
+#define local_OUTPUT_BUF [sp, #24]
+#define local_UNUSED [sp, #28]
+#define off_WORKSPACE 32
+#define local_WORKSPACE [sp, #offWORKSPACE]
+#define local_SIZE (off_WORKSPACE + 8*8*4)
+
+#define off_DECOMPRESS_range_limit_base 324
+#define off_COMPINFO_quanttable 80
+
+#define DCTSIZE 8
+#define VY(x) ((x)*DCTSIZE*2)
+#define QY(x) ((x)*DCTSIZE*4)
+
+#define VX(x) ((x)*2)
+#define QX(x) ((x)*4)
+
+#define FIX_1_414213562 #362
+#define FIX_1_082392200 #277
+#define FIX_1_847759065 #473
+#define FIX_2_613125930 #669
+
+#define RANGE_MASK 1023
+
+
+
+jpeg_idct_ifast:
+ pld [r2, #0]
+ stmdb sp!, {r4,r5, r6,r7, r8,r9, r10,r11, r12,lr}
+ ldr r4, [sp, #4*10]
+ sub sp, #local_SIZE
+
+ ldr r10,[r1, #off_COMPINFO_quanttable] // r10 = quanttable
+ str r4, local_OUTPUT_COL
+ str r3, local_OUTPUT_BUF
+ ldr r5, [r0, #off_DECOMPRESS_range_limit_base]
+ add r5, r5, #128
+ str r5, local_RANGE_TABLE
+ mov fp, r2 // fp = coef_block
+ add ip, sp, #off_WORKSPACE
+
+VLoopTail:
+ ldrsh r0, [fp, #VY(0)]
+ ldrsh r1, [fp, #VY(1)]
+ ldrsh r2, [fp, #VY(2)]
+ ldrsh r3, [fp, #VY(3)]
+ ldrsh r4, [fp, #VY(4)]
+ ldrsh r5, [fp, #VY(5)]
+ ldrsh r6, [fp, #VY(6)]
+ ldrsh r7, [fp, #VY(7)]
+
+ cmp r1, #0
+ orreqs r8, r2, r3
+ orreqs r8, r4, r5
+ orreqs r8, r6, r7
+ beq VLoopHeadZero
+
+VLoopHead:
+ // tmp0 = DEQUANTIZE(in[DCTSIZE*0], quant[DCTSIZE*0] (r0)
+ // tmp2 = DEQUANTIZE(in[DCTSIZE*4], quant[DCTSIZE*4] (r4)
+ // tmp1 = DEQUANTIZE(in[DCTSIZE*2], quant[DCTSIZE*2] (r2)
+ // tmp3 = DEQUANTIZE(in[DCTSIZE*6], quant[DCTSIZE*6] (r6)
+ // tmp10 = tmp0 + tmp2 (r0)
+ // tmp11 = tmp0 - tmp2 (r4)
+
+ ldr r9, [r10, #QY(4)]
+ ldr r8, [r10, #QY(0)]
+ smulbb r4, r9, r4
+ smlabb r0, r8, r0, r4
+ ldr r9, [r10, #QY(6)]
+ ldr r8, [r10, #QY(2)]
+ sub r4, r0, r4, lsl #1
+ smulbb r6, r9, r6
+ smlabb r2, r8, r2, r6
+
+ // tmp13 = tmp1 + tmp3 (r2)
+ // tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13 (r6)
+ // FIX_1_4142... = 362 = 45*8 + 2
+ sub r6, r2, r6, lsl #1
+ mov r8, #360
+ add r8, r8, #2
+ mul r9, r6, r8
+
+ // tmp0 = tmp10 + tmp13; (r0)
+ // tmp3 = tmp10 - tmp13; (r8)
+ // tmp1 = tmp11 + tmp12; (r4)
+ // tmp2 = tmp11 - tmp12; (r6)
+ add r0, r0, r2
+ rsb r6, r2, r9, asr #8
+ sub r8, r0, r2, lsl #1
+ add r4, r4, r6
+ sub r6, r4, r6, lsl #1
+
+ stmia local_TMP0123, {r0, r4, r6, r8}
+
+ // NOTE: be sure to not user r0,r4,r6,r8 soon after stm above
+
+ // odd part
+ // tmp4 = DEQUANTIZE( in[DCTSIZE*1], quant[DCTSIZE*1] ) (r1)
+ // tmp6 = DEQUANTIZE( in[DCTSIZE*5], quant[DCTSIZE*5] ) (r5)
+ // tmp5 = DEQUANTIZE( in[DCTSIZE*3], quant[DCTSIZE*3] ) (r3)
+ // tmp7 = DEQUANTIZE( in[DCTSIZE*7], quant[DCTSIZE*7] ) (r7)
+ // z13 = tmp6 + tmp5; (r0)
+ // z10 = tmp6 - tmp5; (r2)
+ // z11 = tmp4 + tmp7; (r4)
+ // z12 = tmp4 - tmp7; (r6)
+
+ ldr r2, [r10, #QY(1)]
+ ldr r9, [r10, #QY(5)]
+ smulbb r1, r2, r1
+ ldr r2, [r10, #QY(3)]
+ smulbb r5, r9, r5
+ ldr r9, [r10, #QY(7)]
+ smlabb r0, r2, r3, r5
+ smlabb r4, r9, r7, r1
+ rsb r2, r0, r5, lsl #1
+ rsb r6, r4, r1, lsl #1
+
+ // tmp7 = z11 + z13; (r7)
+ // tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); (r1)
+ // FIX_... = 360 + 2
+ add r7, r4, r0
+ sub r1, r4, r0
+ mov r8, #360
+ add r8, r8, #2
+ mul r1, r8, r1
+
+ // z5 = MULTIPLY(z10 + z12, FIX_1_847759065); (r8)
+ // tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; (r0)
+ // tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; (r2)
+ // FIX_1_8477... = 473 = 472 + 1
+ // FIX_1_082... = 277 = 276 + 1
+ // FIX_2_... = 669 = 668 + 1
+ add r8, r2, r6
+ mov r9, #472
+ mla r8, r9, r8, r8
+ mov r9, #276
+ mla r0, r6, r9, r6
+ mov r9, #668
+ mla r2, r9, r2, r2
+ sub r0, r0, r8
+ rsb r2, r2, r8
+
+ // tmp6 = tmp12 - tmp7; (r6)
+ // tmp5 = tmp11 - tmp6; (r5)
+ // tmp4 = tmp10 + tmp5; (r4)
+ rsb r6, r7, r2, asr #8
+ rsb r5, r6, r1, asr #8
+ add r4, r5, r0, asr #8
+
+ ldmia local_TMP0123, {r0, r1, r2, r3}
+
+ // wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7);
+ // wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7);
+ // wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6);
+ // wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6);
+ // wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5);
+ // wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5);
+ // wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4);
+ // wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4);
+
+ add r0, r0, r7
+ sub r7, r0, r7, lsl #1
+ add r1, r1, r6
+ sub r6, r1, r6, lsl #1
+ add r2, r2, r5
+ sub r5, r2, r5, lsl #1
+ sub r3, r3, r4
+ add r4, r3, r4, lsl #1
+
+ str r0, [ip, #QY(0)]
+ str r1, [ip, #QY(1)]
+ str r2, [ip, #QY(2)]
+ str r3, [ip, #QY(3)]
+ str r4, [ip, #QY(4)]
+ str r5, [ip, #QY(5)]
+ str r6, [ip, #QY(6)]
+ str r7, [ip, #QY(7)]
+
+ // inptr++; /* advance pointers to next column */
+ // quantptr++;
+ // wsptr++;
+ add fp, fp, #2
+ add r10, r10, #4
+ add ip, ip, #4
+ add r0, sp, #(off_WORKSPACE + 4*8)
+ cmp ip, r0
+ bne VLoopTail
+
+
+
+HLoopStart:
+ // reset pointers
+ pld [sp, #off_WORKSPACE]
+ add ip, sp, #off_WORKSPACE
+ ldr r10, local_RANGE_TABLE
+
+HLoopTail:
+ // output = *output_buf++ + output_col
+ ldr r0, local_OUTPUT_BUF
+ ldr r1, local_OUTPUT_COL
+ ldr r2, [r0], #4
+ str r0, local_OUTPUT_BUF
+ add fp, r2, r1
+
+ pld [ip, #32]
+ ldmia ip!, {r0-r7}
+
+ cmp r1, #0
+ orreqs r8, r2, r3
+ orreqs r8, r4, r5
+ orreqs r8, r6, r7
+ beq HLoopTailZero
+
+HLoopHead:
+ // tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]); (r0)
+ // tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]); (r4)
+ add r0, r0, r4
+ sub r4, r0, r4, lsl #1
+
+ // tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]); (r2)
+ // tmp12 = MULTIPLY((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6], FIX_1_414213562) - tmp13; (r6)
+ // FIX_... = 360 + 2
+ add r2, r2, r6
+ sub r6, r2, r6, lsl #1
+ mov r8, #360
+ add r8, r8, #2
+ mul r6, r8, r6
+
+ // tmp0 = tmp10 + tmp13; (r0)
+ // tmp3 = tmp10 - tmp13; (r8)
+ // tmp1 = tmp11 + tmp12; (r4)
+ // tmp2 = tmp11 - tmp12; (r6)
+ add r0, r0, r2
+ rsb r6, r2, r6, asr #8
+ sub r8, r0, r2, lsl #1
+ add r4, r4, r6
+ sub r6, r4, r6, lsl #1
+
+ stmia local_TMP0123, {r0, r4, r6, r8}
+
+ // Odd part
+
+ // z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3]; (r0)
+ // z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3]; (r2)
+ // z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7]; (r4)
+ // z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7]; (r6)
+ add r0, r5, r3
+ sub r2, r5, r3
+ add r4, r1, r7
+ sub r6, r1, r7
+
+ // tmp7 = z11 + z13; (r7)
+ // tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); (r1)
+ // FIX_... = 360 + 2
+ add r7, r4, r0
+ sub r1, r4, r0
+ mov r8, #360
+ add r8, r8, #2
+ mul r1, r8, r1
+
+ // z5 = MULTIPLY(z10 + z12, FIX_1_847759065); (r8)
+ // tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; (r0)
+ // tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; (r2)
+ // FIX_1_8477... = 473 = 472 + 1
+ // FIX_1_082... = 277 = 276 + 1
+ // FIX_2_... = 669 = 668 + 1
+ add r8, r2, r6
+ mov r9, #472
+ mla r8, r9, r8, r8
+ mov r9, #276
+ mla r0, r6, r9, r6
+ mov r9, #668
+ mla r2, r9, r2, r2
+ sub r0, r0, r8
+ sub r2, r8, r2
+
+ // tmp6 = tmp12 - tmp7; (r6)
+ // tmp5 = tmp11 - tmp6; (r5)
+ // tmp4 = tmp10 + tmp5; (r4)
+ rsb r6, r7, r2, asr #8
+ rsb r5, r6, r1, asr #8
+ add r4, r5, r0, asr #8
+
+ ldmia local_TMP0123, {r0, r1, r2, r3}
+
+ // outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3) & RANGE_MASK];
+ // outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3) & RANGE_MASK];
+ // outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3) & RANGE_MASK];
+ // outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3) & RANGE_MASK];
+ // outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3) & RANGE_MASK];
+ // outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3) & RANGE_MASK];
+ // outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3) & RANGE_MASK];
+ // outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3) & RANGE_MASK];
+
+ mov r8, #128
+ add r0, r0, r7
+ sub r7, r0, r7, lsl #1
+ add r0, r8, r0, asr #5
+ add r7, r8, r7, asr #5
+ add r1, r1, r6
+ sub r6, r1, r6, lsl #1
+ add r1, r8, r1, asr #5
+ add r6, r8, r6, asr #5
+ add r2, r2, r5
+ sub r5, r2, r5, lsl #1
+ add r2, r8, r2, asr #5
+ add r5, r8, r5, asr #5
+ sub r3, r3, r4
+ add r4, r3, r4, lsl #1
+ add r3, r8, r3, asr #5
+ add r4, r8, r4, asr #5
+
+#ifdef ARMv6
+ usat r0, #8, r0
+ usat r1, #8, r1
+ usat r2, #8, r2
+ usat r3, #8, r3
+ usat r4, #8, r4
+ usat r5, #8, r5
+ usat r6, #8, r6
+ usat r7, #8, r7
+#else
+ cmp r0, #255
+ mvnhi r0, r0, asr #31
+ andhi r0, #255
+ cmp r7, #255
+ mvnhi r7, r7, asr #31
+ cmp r1, #255
+ mvnhi r1, r1, asr #31
+ andhi r1, #255
+ cmp r6, #255
+ mvnhi r6, r6, asr #31
+ andhi r6, #255
+ cmp r2, #255
+ mvnhi r2, r2, asr #31
+ andhi r2, #255
+ cmp r5, #255
+ mvnhi r5, r5, asr #31
+ andhi r5, #255
+ cmp r3, #255
+ mvnhi r3, r3, asr #31
+ cmp r4, #255
+ mvnhi r4, r4, asr #31
+ andhi r4, #255
+#endif
+
+ // r3 r2 r1 r0
+ orr r0, r0, r1, lsl #8
+ orr r0, r0, r2, lsl #16
+ orr r0, r0, r3, lsl #24
+
+ // r7 r6 r5 r4
+ orr r1, r4, r5, lsl #8
+ orr r1, r1, r6, lsl #16
+ orr r1, r1, r7, lsl #24
+ stmia fp, {r0, r1}
+
+ add r0, sp, #(off_WORKSPACE + 8*8*4)
+ cmp ip, r0
+ bne HLoopTail
+
+Exit:
+ add sp, sp, #local_SIZE
+ ldmia sp!, {r4,r5, r6,r7, r8,r9, r10,r11, r12,lr}
+ bx lr
+
+
+VLoopHeadZero:
+// ok, all AC coefficients are 0
+ ldr r1, [r10, #QY(0)]
+ add fp, fp, #2
+ add r10, r10, #4
+ mul r0, r1, r0
+ str r0, [ip, #QY(0)]
+ str r0, [ip, #QY(1)]
+ str r0, [ip, #QY(2)]
+ str r0, [ip, #QY(3)]
+ str r0, [ip, #QY(4)]
+ str r0, [ip, #QY(5)]
+ str r0, [ip, #QY(6)]
+ str r0, [ip, #QY(7)]
+ add ip, ip, #4
+ add r0, sp, #(off_WORKSPACE + 4*8)
+ cmp ip, r0
+ beq HLoopStart
+ b VLoopTail
+
+HLoopTailZero:
+ mov r0, r0, asr #5
+ add r0, #128
+
+#ifdef ARMv6
+ usat r0, #8, r0
+#else
+ cmp r0, #255
+ mvnhi r0, r0, asr #31
+ andhi r0, r0, #255
+#endif
+
+ orr r0, r0, lsl #8
+ orr r0, r0, lsl #16
+ mov r1, r0
+ stmia fp, {r0, r1}
+
+ add r0, sp, #(off_WORKSPACE + 64*4)
+ cmp ip, r0
+ beq Exit
+ b HLoopTail
+
+ .endfunc