aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChia-chi Yeh <chiachi@android.com>2010-12-10 16:16:23 +0800
committerChia-chi Yeh <chiachi@android.com>2010-12-10 16:18:46 +0800
commitc2cf571b568278c7a347e656bfd2522378b7112d (patch)
treec3d0d4f1ca5962ede4ac28bf1e4100b483243e3a
parent2fa18d458a5545e7f6cc431954f19e136dfedc9e (diff)
downloadjpeg-c2cf571b568278c7a347e656bfd2522378b7112d.tar.gz
libjpeg: Remove the old assembly code for ARM.
A much better one is coming. Change-Id: I60d8c227d573fcbff10af363d69405e9fbd0c147
-rw-r--r--Android.mk19
-rw-r--r--jidctfst.S476
2 files changed, 2 insertions, 493 deletions
diff --git a/Android.mk b/Android.mk
index 9e1c42e..1b2399d 100644
--- a/Android.mk
+++ b/Android.mk
@@ -10,8 +10,8 @@ LOCAL_SRC_FILES := \
jdatadst.c jdatasrc.c jdcoefct.c jdcolor.c jddctmgr.c jdhuff.c \
jdinput.c jdmainct.c jdmarker.c jdmaster.c jdmerge.c jdphuff.c \
jdpostct.c jdsample.c jdtrans.c jerror.c jfdctflt.c jfdctfst.c \
- jfdctint.c jidctflt.c jidctred.c jquant1.c \
- jquant2.c jutils.c jmemmgr.c \
+ jfdctint.c jidctflt.c jidctfst.c jidctint.c jidctred.c jquant1.c \
+ jquant2.c jutils.c jmemmgr.c
# use ashmem as libjpeg decoder's backing store
LOCAL_CFLAGS += -DUSE_ANDROID_ASHMEM
@@ -23,21 +23,6 @@ LOCAL_SRC_FILES += \
#LOCAL_SRC_FILES += \
# jmem-android.c
-
-# the assembler is only for the ARM version, don't break the Linux sim
-ifneq ($(TARGET_ARCH),arm)
-ANDROID_JPEG_NO_ASSEMBLER := true
-endif
-
-# temp fix until we understand why this broke cnn.com
-#ANDROID_JPEG_NO_ASSEMBLER := true
-
-ifeq ($(strip $(ANDROID_JPEG_NO_ASSEMBLER)),true)
-LOCAL_SRC_FILES += jidctint.c jidctfst.c
-else
-LOCAL_SRC_FILES += jidctint.c jidctfst.S
-endif
-
LOCAL_CFLAGS += -DAVOID_TABLES
LOCAL_CFLAGS += -O3 -fstrict-aliasing -fprefetch-loop-arrays
#LOCAL_CFLAGS += -march=armv6j
diff --git a/jidctfst.S b/jidctfst.S
deleted file mode 100644
index 34e1c24..0000000
--- a/jidctfst.S
+++ /dev/null
@@ -1,476 +0,0 @@
-/*
- * Copyright (C) 2008 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <machine/cpu-features.h>
-
- .text
- .align
-
- .global jpeg_idct_ifast
- .func jpeg_idct_ifast
-
-// NOTE: sb=r9, fp=r11, ip=r12, sp=r13, lr=r14, pc=r15
-
-// jpeg_idct_ifast (j_decompress_ptr cinfo,
-// jpeg_component_info * compptr,
-// short* coef_block,
-// unsigned char* output_buf,
-// int output_col)
-
-#define local_TMP0123 sp /* base register used by stmia/ldmia to spill tmp0..tmp3 */
-#define local_TMP0 [sp, #0] /* spill slot 0 (first word of the frame) */
-#define local_TMP1 [sp, #4] /* spill slot 1 */
-#define local_TMP2 [sp, #8] /* spill slot 2 */
-#define local_TMP3 [sp, #12] /* spill slot 3 */
-#define local_RANGE_TABLE [sp, #16] /* saved range-limit table pointer */
-#define local_OUTPUT_COL [sp, #20] /* saved output_col argument */
-#define local_OUTPUT_BUF [sp, #24] /* saved (and later advanced) output_buf argument */
-#define local_UNUSED [sp, #28] /* padding word before the workspace */
-#define off_WORKSPACE 32 /* byte offset of the workspace inside the frame */
-#define local_WORKSPACE [sp, #off_WORKSPACE] /* was #offWORKSPACE: undefined symbol (typo) */
-#define local_SIZE (off_WORKSPACE + 8*8*4) /* header + 8x8 words of 4 bytes each */
-
-#define off_DECOMPRESS_range_limit_base 324
-#define off_COMPINFO_quanttable 80
-
-#define DCTSIZE 8
-#define VY(x) ((x)*DCTSIZE*2)
-#define QY(x) ((x)*DCTSIZE*4)
-
-#define VX(x) ((x)*2)
-#define QX(x) ((x)*4)
-
-#define FIX_1_414213562 #362
-#define FIX_1_082392200 #277
-#define FIX_1_847759065 #473
-#define FIX_2_613125930 #669
-
-#define RANGE_MASK 1023
-
-
-
-jpeg_idct_ifast: // in: r0=cinfo, r1=compptr, r2=coef_block, r3=output_buf, 5th arg (output_col) on the stack
- PLD (r2, #0) // PLD macro presumably comes from <machine/cpu-features.h> (cache preload hint) - confirm
- stmdb sp!, {r4,r5, r6,r7, r8,r9, r10,r11, r12,lr} // push 10 registers = 40 bytes
- ldr r4, [sp, #4*10] // r4 = output_col: first stack argument, now 10 words above sp
- sub sp, #local_SIZE // reserve spill slots, saved arguments and the workspace
-
- ldr r10,[r1, #off_COMPINFO_quanttable] // r10 = quanttable
- str r4, local_OUTPUT_COL // keep output_col for the row pass
- str r3, local_OUTPUT_BUF // keep output_buf for the row pass
- ldr r5, [r0, #off_DECOMPRESS_range_limit_base] // hard-coded struct offset; must match the C struct layout
- add r5, r5, #128 // bias the table pointer by 128 (presumably the sample centre - confirm)
- str r5, local_RANGE_TABLE // stored for later; reloaded at HLoopStart
- mov fp, r2 // fp = coef_block
- add ip, sp, #off_WORKSPACE // ip = workspace write pointer for the column pass
-
-VLoopTail: // column pass: one iteration per column; fp=input, r10=quant table, ip=workspace
- ldrsh r0, [fp, #VY(0)] // r0..r7 = the 8 coefficients of this column, sign-extended from 16 bits
- ldrsh r1, [fp, #VY(1)]
- ldrsh r2, [fp, #VY(2)]
- ldrsh r3, [fp, #VY(3)]
- ldrsh r4, [fp, #VY(4)]
- ldrsh r5, [fp, #VY(5)]
- ldrsh r6, [fp, #VY(6)]
- ldrsh r7, [fp, #VY(7)]
-
- cmp r1, #0 // chained test: Z stays set only if r1..r7 are all zero
- orreqs r8, r2, r3 // each orreqs runs only while everything so far was zero
- orreqs r8, r4, r5
- orreqs r8, r6, r7
- beq VLoopHeadZero // every AC term is zero: take the DC-only shortcut
-
-VLoopHead:
- // tmp0 = DEQUANTIZE(in[DCTSIZE*0], quant[DCTSIZE*0]) (r0)
- // tmp2 = DEQUANTIZE(in[DCTSIZE*4], quant[DCTSIZE*4]) (r4)
- // tmp1 = DEQUANTIZE(in[DCTSIZE*2], quant[DCTSIZE*2]) (r2)
- // tmp3 = DEQUANTIZE(in[DCTSIZE*6], quant[DCTSIZE*6]) (r6)
- // tmp10 = tmp0 + tmp2 (r0)
- // tmp11 = tmp0 - tmp2 (r4)
-
- ldr r9, [r10, #QY(4)] // r9 = quant[4]
- ldr r8, [r10, #QY(0)] // r8 = quant[0]
-#if __ARM_HAVE_HALFWORD_MULTIPLY
- smulbb r4, r9, r4 // r4 = tmp2 (signed 16x16 multiply of the low halfwords)
- smlabb r0, r8, r0, r4 // r0 = tmp0 + tmp2 = tmp10 (multiply-accumulate)
-#else
- mul r4, r9, r4 // r4 = tmp2
- mul r0, r8, r0 // r0 = tmp0
- add r0, r4 // r0 = tmp10
-#endif
- ldr r9, [r10, #QY(6)] // r9 = quant[6]
- ldr r8, [r10, #QY(2)] // r8 = quant[2]
- sub r4, r0, r4, lsl #1 // r4 = tmp10 - 2*tmp2 = tmp0 - tmp2 = tmp11
-#if __ARM_HAVE_HALFWORD_MULTIPLY
- smulbb r6, r9, r6 // r6 = tmp3
- smlabb r2, r8, r2, r6 // r2 = tmp1 + tmp3 = tmp13
-#else
- mul r6, r9, r6 // r6 = tmp3
- mul r2, r8, r2 // r2 = tmp1
- add r2, r6 // r2 = tmp13
-#endif
-
- // tmp13 = tmp1 + tmp3 (r2)
- // tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13 (r6)
- // FIX_1_4142... = 362, built below as 360 + 2
- sub r6, r2, r6, lsl #1 // r6 = tmp13 - 2*tmp3 = tmp1 - tmp3
- mov r8, #360
- add r8, r8, #2 // r8 = 362
- mul r9, r6, r8 // r9 = (tmp1 - tmp3) * 362; the >>8 is applied at the rsb below
-
- // tmp0 = tmp10 + tmp13; (r0)
- // tmp3 = tmp10 - tmp13; (r8)
- // tmp1 = tmp11 + tmp12; (r4)
- // tmp2 = tmp11 - tmp12; (r6)
- add r0, r0, r2 // r0 = tmp0
- rsb r6, r2, r9, asr #8 // r6 = (r9 >> 8) - tmp13 = tmp12
- sub r8, r0, r2, lsl #1 // r8 = tmp0 - 2*tmp13 = tmp3
- add r4, r4, r6 // r4 = tmp1
- sub r6, r4, r6, lsl #1 // r6 = tmp1 - 2*tmp12 = tmp2
-
- stmia local_TMP0123, {r0, r4, r6, r8} // spill tmp0..tmp3 to the four slots at sp
-
- // NOTE: be sure not to use r0,r4,r6,r8 soon after the stm above
-
- // odd part
- // tmp4 = DEQUANTIZE( in[DCTSIZE*1], quant[DCTSIZE*1] ) (r1)
- // tmp6 = DEQUANTIZE( in[DCTSIZE*5], quant[DCTSIZE*5] ) (r5)
- // tmp5 = DEQUANTIZE( in[DCTSIZE*3], quant[DCTSIZE*3] ) (r3)
- // tmp7 = DEQUANTIZE( in[DCTSIZE*7], quant[DCTSIZE*7] ) (r7)
- // z13 = tmp6 + tmp5; (r0)
- // z10 = tmp6 - tmp5; (r2)
- // z11 = tmp4 + tmp7; (r4)
- // z12 = tmp4 - tmp7; (r6)
-
- ldr r2, [r10, #QY(1)] // r2 = quant[1]
- ldr r9, [r10, #QY(5)] // r9 = quant[5]
-#if __ARM_HAVE_HALFWORD_MULTIPLY
- smulbb r1, r2, r1 // r1 = tmp4
-#else
- mul r1, r2, r1 // r1 = tmp4
-#endif
- ldr r2, [r10, #QY(3)] // r2 = quant[3]
-#if __ARM_HAVE_HALFWORD_MULTIPLY
- smulbb r5, r9, r5 // r5 = tmp6
-#else
- mul r5, r9, r5 // r5 = tmp6
-#endif
- ldr r9, [r10, #QY(7)] // r9 = quant[7]
-#if __ARM_HAVE_HALFWORD_MULTIPLY
- smlabb r0, r2, r3, r5 // r0 = tmp5 + tmp6 = z13
- smlabb r4, r9, r7, r1 // r4 = tmp7 + tmp4 = z11
-#else
- mul r0, r2, r3 // r0 = tmp5
- add r0, r5 // r0 = z13
- mul r4, r9, r7 // r4 = tmp7
- add r4, r1 // r4 = z11
-#endif
- rsb r2, r0, r5, lsl #1 // r2 = 2*tmp6 - z13 = tmp6 - tmp5 = z10
- rsb r6, r4, r1, lsl #1 // r6 = 2*tmp4 - z11 = tmp4 - tmp7 = z12
-
- // tmp7 = z11 + z13; (r7)
- // tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); (r1)
- // FIX_... = 360 + 2
- add r7, r4, r0 // r7 = tmp7
- sub r1, r4, r0 // r1 = z11 - z13
- mov r8, #360
- add r8, r8, #2 // r8 = 362
- mul r1, r8, r1 // r1 = tmp11 << 8 (shifted down when consumed)
-
- // z5 = MULTIPLY(z10 + z12, FIX_1_847759065); (r8)
- // tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; (r0)
- // tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; (r2)
- // FIX_1_8477... = 473 = 472 + 1
- // FIX_1_082... = 277 = 276 + 1
- // FIX_2_... = 669 = 668 + 1
- add r8, r2, r6 // r8 = z10 + z12
- mov r9, #472
- mla r8, r9, r8, r8 // r8 = 472*r8 + r8 = 473*(z10 + z12) = z5 << 8
- mov r9, #276
- mla r0, r6, r9, r6 // r0 = 276*z12 + z12 = 277*z12
- mov r9, #668
- mla r2, r9, r2, r2 // r2 = 668*z10 + z10 = 669*z10
- sub r0, r0, r8 // r0 = tmp10 << 8 (subtraction done before the shift)
- rsb r2, r2, r8 // r2 = r8 - r2 = tmp12 << 8
-
- // tmp6 = tmp12 - tmp7; (r6)
- // tmp5 = tmp11 - tmp6; (r5)
- // tmp4 = tmp10 + tmp5; (r4)
- rsb r6, r7, r2, asr #8 // r6 = (r2 >> 8) - tmp7 = tmp6
- rsb r5, r6, r1, asr #8 // r5 = (r1 >> 8) - tmp6 = tmp5
- add r4, r5, r0, asr #8 // r4 = tmp5 + (r0 >> 8) = tmp4
-
- ldmia local_TMP0123, {r0, r1, r2, r3} // reload the even part: r0..r3 = tmp0..tmp3
-
- // wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7);
- // wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7);
- // wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6);
- // wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6);
- // wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5);
- // wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5);
- // wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4);
- // wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4);
-
- add r0, r0, r7 // r0 = tmp0 + tmp7 (row 0)
- sub r7, r0, r7, lsl #1 // r7 = r0 - 2*tmp7 = tmp0 - tmp7 (row 7)
- add r1, r1, r6 // r1 = tmp1 + tmp6 (row 1)
- sub r6, r1, r6, lsl #1 // r6 = tmp1 - tmp6 (row 6)
- add r2, r2, r5 // r2 = tmp2 + tmp5 (row 2)
- sub r5, r2, r5, lsl #1 // r5 = tmp2 - tmp5 (row 5)
- sub r3, r3, r4 // r3 = tmp3 - tmp4 (row 3)
- add r4, r3, r4, lsl #1 // r4 = r3 + 2*tmp4 = tmp3 + tmp4 (row 4)
-
- str r0, [ip, #QY(0)] // QY(n) = n*32 bytes: one workspace row of eight 32-bit words
- str r1, [ip, #QY(1)]
- str r2, [ip, #QY(2)]
- str r3, [ip, #QY(3)]
- str r4, [ip, #QY(4)]
- str r5, [ip, #QY(5)]
- str r6, [ip, #QY(6)]
- str r7, [ip, #QY(7)]
-
- // inptr++; /* advance pointers to next column */
- // quantptr++;
- // wsptr++;
- add fp, fp, #2 // input coefficients are 16-bit (loaded with ldrsh)
- add r10, r10, #4 // quant entries are read as 32-bit words (ldr)
- add ip, ip, #4 // workspace entries are 32-bit words
- add r0, sp, #(off_WORKSPACE + 4*8) // r0 = end of workspace row 0, i.e. after 8 columns
- cmp ip, r0
- bne VLoopTail // more columns left; otherwise fall through to HLoopStart
-
-
-
-HLoopStart: // row pass: turns each workspace row into 8 output bytes
- // reset pointers
- PLD (sp, #off_WORKSPACE)
- add ip, sp, #off_WORKSPACE // ip = workspace read pointer, back at row 0
- ldr r10, local_RANGE_TABLE // r10 is not read again in the row pass; clamping uses usat / cmp below
-
-HLoopTail:
- // output = *output_buf++ + output_col
- ldr r0, local_OUTPUT_BUF
- ldr r1, local_OUTPUT_COL
- ldr r2, [r0], #4 // r2 = current row pointer; post-increment r0 to the next entry
- str r0, local_OUTPUT_BUF // save the advanced output_buf for the next row
- add fp, r2, r1 // fp = destination address for this row's 8 bytes
-
- PLD (ip, #32)
- ldmia ip!, {r0-r7} // r0..r7 = wsptr[0..7]; ip advances to the next row
-
- cmp r1, #0 // chained test: Z stays set only if r1..r7 are all zero
- orreqs r8, r2, r3
- orreqs r8, r4, r5
- orreqs r8, r6, r7
- beq HLoopTailZero // only wsptr[0] may be non-zero: take the shortcut
-
-HLoopHead:
- // tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]); (r0)
- // tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]); (r4)
- add r0, r0, r4 // r0 = tmp10
- sub r4, r0, r4, lsl #1 // r4 = tmp10 - 2*wsptr[4] = tmp11
-
- // tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]); (r2)
- // tmp12 = MULTIPLY((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6], FIX_1_414213562) - tmp13; (r6)
- // FIX_... = 360 + 2
- add r2, r2, r6 // r2 = tmp13
- sub r6, r2, r6, lsl #1 // r6 = wsptr[2] - wsptr[6]
- mov r8, #360
- add r8, r8, #2 // r8 = 362
- mul r6, r8, r6 // r6 = (wsptr[2] - wsptr[6]) * 362; the >>8 is applied at the rsb below
-
- // tmp0 = tmp10 + tmp13; (r0)
- // tmp3 = tmp10 - tmp13; (r8)
- // tmp1 = tmp11 + tmp12; (r4)
- // tmp2 = tmp11 - tmp12; (r6)
- add r0, r0, r2 // r0 = tmp0
- rsb r6, r2, r6, asr #8 // r6 = (r6 >> 8) - tmp13 = tmp12
- sub r8, r0, r2, lsl #1 // r8 = tmp0 - 2*tmp13 = tmp3
- add r4, r4, r6 // r4 = tmp1
- sub r6, r4, r6, lsl #1 // r6 = tmp1 - 2*tmp12 = tmp2
-
- stmia local_TMP0123, {r0, r4, r6, r8} // spill tmp0..tmp3 to the four slots at sp
-
- // Odd part
-
- // z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3]; (r0)
- // z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3]; (r2)
- // z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7]; (r4)
- // z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7]; (r6)
- add r0, r5, r3 // r0 = z13
- sub r2, r5, r3 // r2 = z10
- add r4, r1, r7 // r4 = z11
- sub r6, r1, r7 // r6 = z12
-
- // tmp7 = z11 + z13; (r7)
- // tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); (r1)
- // FIX_... = 360 + 2
- add r7, r4, r0 // r7 = tmp7
- sub r1, r4, r0 // r1 = z11 - z13
- mov r8, #360
- add r8, r8, #2 // r8 = 362
- mul r1, r8, r1 // r1 = tmp11 << 8 (shifted down when consumed)
-
- // z5 = MULTIPLY(z10 + z12, FIX_1_847759065); (r8)
- // tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; (r0)
- // tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; (r2)
- // FIX_1_8477... = 473 = 472 + 1
- // FIX_1_082... = 277 = 276 + 1
- // FIX_2_... = 669 = 668 + 1
- add r8, r2, r6 // r8 = z10 + z12
- mov r9, #472
- mla r8, r9, r8, r8 // r8 = 473*(z10 + z12) = z5 << 8
- mov r9, #276
- mla r0, r6, r9, r6 // r0 = 277*z12
- mov r9, #668
- mla r2, r9, r2, r2 // r2 = 669*z10
- sub r0, r0, r8 // r0 = tmp10 << 8
- sub r2, r8, r2 // r2 = r8 - r2 = tmp12 << 8
-
- // tmp6 = tmp12 - tmp7; (r6)
- // tmp5 = tmp11 - tmp6; (r5)
- // tmp4 = tmp10 + tmp5; (r4)
- rsb r6, r7, r2, asr #8 // r6 = (r2 >> 8) - tmp7 = tmp6
- rsb r5, r6, r1, asr #8 // r5 = (r1 >> 8) - tmp6 = tmp5
- add r4, r5, r0, asr #8 // r4 = tmp5 + (r0 >> 8) = tmp4
-
- ldmia local_TMP0123, {r0, r1, r2, r3} // reload the even part: r0..r3 = tmp0..tmp3
-
- // outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3) & RANGE_MASK];
- // outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3) & RANGE_MASK];
- // outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3) & RANGE_MASK];
- // outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3) & RANGE_MASK];
- // outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3) & RANGE_MASK];
- // outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3) & RANGE_MASK];
- // outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3) & RANGE_MASK];
- // outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3) & RANGE_MASK];
-
- mov r8, #128 // offset added to every descaled value before clamping
- add r0, r0, r7 // r0 = tmp0 + tmp7
- sub r7, r0, r7, lsl #1 // r7 = tmp0 - tmp7
- add r0, r8, r0, asr #5 // outptr[0] candidate = 128 + (r0 >> 5)
- add r7, r8, r7, asr #5 // outptr[7] candidate
- add r1, r1, r6 // r1 = tmp1 + tmp6
- sub r6, r1, r6, lsl #1 // r6 = tmp1 - tmp6
- add r1, r8, r1, asr #5 // outptr[1] candidate
- add r6, r8, r6, asr #5 // outptr[6] candidate
- add r2, r2, r5 // r2 = tmp2 + tmp5
- sub r5, r2, r5, lsl #1 // r5 = tmp2 - tmp5
- add r2, r8, r2, asr #5 // outptr[2] candidate
- add r5, r8, r5, asr #5 // outptr[5] candidate
- sub r3, r3, r4 // r3 = tmp3 - tmp4
- add r4, r3, r4, lsl #1 // r4 = tmp3 + tmp4
- add r3, r8, r3, asr #5 // outptr[3] candidate
- add r4, r8, r4, asr #5 // outptr[4] candidate
-
-#if __ARM_ARCH__ >= 6
- usat r0, #8, r0 // saturate to the unsigned 8-bit range 0..255
- usat r1, #8, r1
- usat r2, #8, r2
- usat r3, #8, r3
- usat r4, #8, r4
- usat r5, #8, r5
- usat r6, #8, r6
- usat r7, #8, r7
-#else
- cmp r0, #255 // unsigned compare: "hi" is true for >255 and for negative values
- mvnhi r0, r0, asr #31 // negative -> 0, too large -> 0xFFFFFFFF
- andhi r0, #255 // trim the all-ones case to 255
- cmp r7, #255
- mvnhi r7, r7, asr #31 // no andhi: r7 is packed with lsl #24, which drops the upper bits
- cmp r1, #255
- mvnhi r1, r1, asr #31
- andhi r1, #255
- cmp r6, #255
- mvnhi r6, r6, asr #31
- andhi r6, #255
- cmp r2, #255
- mvnhi r2, r2, asr #31
- andhi r2, #255
- cmp r5, #255
- mvnhi r5, r5, asr #31
- andhi r5, #255
- cmp r3, #255
- mvnhi r3, r3, asr #31 // no andhi: r3 is packed with lsl #24, which drops the upper bits
- cmp r4, #255
- mvnhi r4, r4, asr #31
- andhi r4, #255
-#endif
-
- // r3 r2 r1 r0
- orr r0, r0, r1, lsl #8 // pack outptr[0..3] into one word, outptr[0] in the low byte
- orr r0, r0, r2, lsl #16
- orr r0, r0, r3, lsl #24
-
- // r7 r6 r5 r4
- orr r1, r4, r5, lsl #8 // pack outptr[4..7] into the second word
- orr r1, r1, r6, lsl #16
- orr r1, r1, r7, lsl #24
- stmia fp, {r0, r1} // two word stores; NOTE(review): assumes fp is word-aligned - confirm with callers
-
- add r0, sp, #(off_WORKSPACE + 8*8*4) // r0 = end of the workspace
- cmp ip, r0
- bne HLoopTail // more rows left; otherwise fall through to Exit
-
-Exit: // common epilogue, reached by fall-through from the row pass or from HLoopTailZero
- add sp, sp, #local_SIZE // release the local frame reserved in the prologue
- ldmia sp!, {r4,r5, r6,r7, r8,r9, r10,r11, r12,lr} // restore the same 10 registers the prologue pushed
- bx lr // return to the caller
-
-
-VLoopHeadZero: // column-pass shortcut; expects r0 = in[0], r10 = quant pointer, ip = workspace pointer
-// ok, all AC coefficients are 0
- ldr r1, [r10, #QY(0)] // r1 = quant[0]
- add fp, fp, #2 // advance the input pointer to the next column
- add r10, r10, #4 // advance the quant pointer to the next column
- mul r0, r1, r0 // r0 = dequantized DC term
- str r0, [ip, #QY(0)] // the same value goes to all 8 rows of this workspace column
- str r0, [ip, #QY(1)]
- str r0, [ip, #QY(2)]
- str r0, [ip, #QY(3)]
- str r0, [ip, #QY(4)]
- str r0, [ip, #QY(5)]
- str r0, [ip, #QY(6)]
- str r0, [ip, #QY(7)]
- add ip, ip, #4 // advance the workspace pointer to the next column
- add r0, sp, #(off_WORKSPACE + 4*8) // r0 = end of workspace row 0, i.e. after 8 columns
- cmp ip, r0
- beq HLoopStart // all 8 columns done: start the row pass
- b VLoopTail // otherwise process the next column
-
-HLoopTailZero: // row-pass shortcut; expects r0 = wsptr[0], fp = destination, ip = next workspace row
- mov r0, r0, asr #5 // same >>5 descale as the full row path
- add r0, #128 // same 128 offset as the full row path
-
-#if __ARM_ARCH__ >= 6
- usat r0, #8, r0 // saturate to 0..255
-#else
- cmp r0, #255 // unsigned compare: "hi" is true for >255 and for negative values
- mvnhi r0, r0, asr #31 // negative -> 0, too large -> 0xFFFFFFFF
- andhi r0, r0, #255 // trim the all-ones case to 255
-#endif
-
- orr r0, r0, lsl #8 // r0 = v | v<<8: the byte now fills the low halfword
- orr r0, r0, lsl #16 // copy the low halfword up: all four bytes equal v
- mov r1, r0 // second word: the same four bytes
- stmia fp, {r0, r1} // write 8 identical output bytes
-
- add r0, sp, #(off_WORKSPACE + 64*4) // r0 = end of the workspace (64 words)
- cmp ip, r0
- beq Exit // last row written: return
- b HLoopTail // otherwise process the next row
-
- .endfunc