aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorWei-Ta Chen <weita@google.com>2010-10-22 22:56:33 -0700
committerWei-Ta Chen <weita@google.com>2011-01-11 23:12:28 -0800
commit46c05833c257ab46c42182cbb2ae359892d14443 (patch)
treee366f62f3f33e52ae8a4bcc728b510dd64cbc5c1
parente2c386b408e808685833ba162457e6dbbb485d36 (diff)
downloadjpeg-46c05833c257ab46c42182cbb2ae359892d14443.tar.gz
Backport changes/fixes related to Jpeg tile-based decoder from Honeycomb to Gingerbread. Bug: 3309014 ////////////////////////////////////////////////////////////////////////// This is a combination of 5 commits. Fix 3118622, where tile-base jpeg decode does not handle the region width correctly in Progressive JPEG when the h_samp_factor is different from one color component to anothor. To decode a region in a progressive JPEG, each time we decode one iMCU row, the width of which equals to the region width. However, for each color component the region width in DCT blocks depends on its h_samp_factor. The change ensures we get a correct region width from our recorded MCU_column number. Bug: 3118622 Change-Id: I6d3e30f946e0395c0719aee0c8e694824ab3d27f libjpeg: Remove the old assembly code for ARM. A much better one is coming. Change-Id: I60d8c227d573fcbff10af363d69405e9fbd0c147 libjpeg: Use the new fast-and-accurate IDCT method for ARMv6+ devices. As another AA&N implementation, it runs 9-10% faster than jidctfst.S and 11-15% faster than jidctfst.c. As another IDCT method, it runs 17-20% faster than JDCT_ISLOW method and provides the same accuracy or even better. Change-Id: I81783c310d6dac5aaf84c03a4cf20662f466564c libjpeg: Make both JDCT_IFAST and JDCT_ISLOW use armv6_idct. Change-Id: Iae9c402ec7e1c6b078f404fec995162c8091f383 Fix the JPEG tile decode issue in the case of JPEGs having restart markers. The fix stores the restart information on each index point and restores the restart information when we do the tile decode. Bug: 3312406 ////////////////////////////////////////////////////////////////////////////
-rw-r--r--Android.mk22
-rw-r--r--armv6_idct.S366
-rw-r--r--jdcoefct.c33
-rw-r--r--jddctmgr.c59
-rw-r--r--jdhuff.c74
-rw-r--r--jdphuff.c114
-rw-r--r--jidctfst.S476
-rw-r--r--jpeglib.h4
8 files changed, 577 insertions, 571 deletions
diff --git a/Android.mk b/Android.mk
index 9e1c42e..2670652 100644
--- a/Android.mk
+++ b/Android.mk
@@ -10,8 +10,8 @@ LOCAL_SRC_FILES := \
jdatadst.c jdatasrc.c jdcoefct.c jdcolor.c jddctmgr.c jdhuff.c \
jdinput.c jdmainct.c jdmarker.c jdmaster.c jdmerge.c jdphuff.c \
jdpostct.c jdsample.c jdtrans.c jerror.c jfdctflt.c jfdctfst.c \
- jfdctint.c jidctflt.c jidctred.c jquant1.c \
- jquant2.c jutils.c jmemmgr.c \
+ jfdctint.c jidctflt.c jidctfst.c jidctint.c jidctred.c jquant1.c \
+ jquant2.c jutils.c jmemmgr.c armv6_idct.S
# use ashmem as libjpeg decoder's backing store
LOCAL_CFLAGS += -DUSE_ANDROID_ASHMEM
@@ -23,21 +23,6 @@ LOCAL_SRC_FILES += \
#LOCAL_SRC_FILES += \
# jmem-android.c
-
-# the assembler is only for the ARM version, don't break the Linux sim
-ifneq ($(TARGET_ARCH),arm)
-ANDROID_JPEG_NO_ASSEMBLER := true
-endif
-
-# temp fix until we understand why this broke cnn.com
-#ANDROID_JPEG_NO_ASSEMBLER := true
-
-ifeq ($(strip $(ANDROID_JPEG_NO_ASSEMBLER)),true)
-LOCAL_SRC_FILES += jidctint.c jidctfst.c
-else
-LOCAL_SRC_FILES += jidctint.c jidctfst.S
-endif
-
LOCAL_CFLAGS += -DAVOID_TABLES
LOCAL_CFLAGS += -O3 -fstrict-aliasing -fprefetch-loop-arrays
#LOCAL_CFLAGS += -march=armv6j
@@ -45,6 +30,9 @@ LOCAL_CFLAGS += -O3 -fstrict-aliasing -fprefetch-loop-arrays
# enable tile based decode
LOCAL_CFLAGS += -DANDROID_TILE_BASED_DECODE
+# enable armv6 idct assembly
+LOCAL_CFLAGS += -DANDROID_ARMV6_IDCT
+
LOCAL_MODULE:= libjpeg
LOCAL_SHARED_LIBRARIES := \
diff --git a/armv6_idct.S b/armv6_idct.S
new file mode 100644
index 0000000..18e4e8a
--- /dev/null
+++ b/armv6_idct.S
@@ -0,0 +1,366 @@
+/*
+ * Copyright (C) 2010 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * This is a fast-and-accurate implementation of inverse Discrete Cosine
+ * Transform (IDCT) for ARMv6+. It also performs dequantization of the input
+ * coefficients just like other methods.
+ *
+ * This implementation is based on the scaled 1-D DCT algorithm proposed by
+ * Arai, Agui, and Nakajima. The following code is based on the figure 4-8
+ * on page 52 of the JPEG textbook by Pennebaker and Mitchell. Coefficients
+ * are (almost) directly mapped into registers.
+ *
+ * The accuracy is achieved by using SMULWy and SMLAWy instructions. Both
+ * multiply 32 bits by 16 bits and store the top 32 bits of the result. It
+ * makes 32-bit fixed-point arithmetic possible without overflow. That is
+ * why jpeg_idct_ifast(), which is written in C, cannot be improved.
+ *
+ * More tricks are used to gain more speed. First of all, we use as many
+ * registers as possible. ARM processor has 16 registers including sp (r13)
+ * and pc (r15), so only 14 registers can be used without limitations. In
+ * general, we let r0 to r7 hold the coefficients; r10 and r11 hold four
+ * 16-bit constants; r12 and r14 hold two of the four arguments; and r8 hold
+ * intermediate value. In the second pass, r9 is the loop counter. In the
+ * first pass, r8 to r11 are used to hold quantization values, so the loop
+ * counter is held by sp. Yes, the stack pointer. Since it must be aligned
+ * to 4-byte boundary all the time, we align it to 32-byte boundary and use
+ * bit 3 to bit 5. As the result, we actually use 14.1 registers. :-)
+ *
+ * Second, we rearrange quantization values to access them sequentially. The
+ * table is first transposed, and the new columns are placed in the order of
+ * 7, 5, 1, 3, 0, 2, 4, 6. Thus we can use LDMDB to load four values at a
+ * time. Rearranging coefficients also helps, but that requires changing a
+ * dozen files, which does not seem worth it. In addition, we choose to scale
+ * up quantization values by 13 bits, so the coefficients are scaled up by
+ * 16 bits after both passes. Then we can pack and saturate them two at a
+ * time using PKHTB and USAT16 instructions.
+ *
+ * Third, we reorder the instructions to avoid bubbles in the pipeline. This
+ * is done by hand according to the cycle timings and the interlock behavior
+ * described in the technical reference manual of ARM1136JF-S. We also take
+ * advantage of dual issue processors by interleaving instructions with
+ * dependencies. It has been benchmarked on four devices and all the results
+ * showed distinguishable improvements. Note that PLD instructions actually
+ * slow things down, so they are removed at the last minute. In the future,
+ * this might be further improved using a system profiler.
+ */
+
+#ifdef __arm__
+#include <machine/cpu-features.h>
+#endif
+
+#if __ARM_ARCH__ >= 6
+
+// void armv6_idct(short *coefs, int *quans, unsigned char **rows, int col)
+ .arm
+ .text
+ .align
+ .global armv6_idct
+ .func armv6_idct
+
+armv6_idct:
+ // Push everything except sp (r13) and pc (r15).
+ stmdb sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, r14}
+
+ // r12 = quans, r14 = coefs.
+ sub r4, sp, #236
+ bic sp, r4, #31
+ add r5, sp, #224
+ add r12, r1, #256
+ stm r5, {r2, r3, r4}
+ add r14, r0, #16
+
+pass1_head:
+ // Load quantization values. (q[0, 2, 4, 6])
+ ldmdb r12!, {r8, r9, r10, r11}
+
+ // Load coefficients. (c[4, 1, 2, 3, 0, 5, 6, 7])
+ ldrsh r4, [r14, #-2] !
+ ldrsh r1, [r14, #16]
+ ldrsh r2, [r14, #32]
+ ldrsh r3, [r14, #48]
+ ldrsh r0, [r14, #64]
+ ldrsh r5, [r14, #80]
+ ldrsh r6, [r14, #96]
+ ldrsh r7, [r14, #112]
+
+ // r4 = q[0] * c[0];
+ mul r4, r8, r4
+
+ // Check if ACs are all zero.
+ cmp r0, #0
+ orreqs r8, r1, r2
+ orreqs r8, r3, r5
+ orreqs r8, r6, r7
+ beq pass1_zero
+
+ // Step 1: Dequantizations.
+
+ // r2 = q[2] * c[2];
+ // r0 = q[4] * c[4] + r4;
+ // r6 = q[6] * c[6] + r2;
+ mul r2, r9, r2
+ mla r0, r10, r0, r4
+ mla r6, r11, r6, r2
+
+ // Load quantization values. (q[7, 5, 1, 3])
+ ldmdb r12!, {r8, r9, r10, r11}
+
+ // r4 = r4 * 2 - r0 = -(r0 - r4 * 2);
+ // r2 = r2 * 2 - r6 = -(r6 - r2 * 2);
+ rsb r4, r0, r4, lsl #1
+ rsb r2, r6, r2, lsl #1
+
+ // r7 = q[7] * c[7];
+ // r5 = q[5] * c[5];
+ // r1 = q[1] * c[1] + r7;
+ // r3 = q[3] * c[3] + r5;
+ mul r7, r8, r7
+ mul r5, r9, r5
+ mla r1, r10, r1, r7
+ mla r3, r11, r3, r5
+
+ // Load constants.
+ ldrd r10, constants
+
+ // Step 2: Rotations and Butterflies.
+
+ // r7 = r1 - r7 * 2;
+ // r1 = r1 - r3;
+ // r5 = r5 * 2 - r3 = -(r3 - r5 * 2);
+ // r3 = r1 + r3 * 2;
+ // r8 = r5 + r7;
+ sub r7, r1, r7, lsl #1
+ sub r1, r1, r3
+ rsb r5, r3, r5, lsl #1
+ add r3, r1, r3, lsl #1
+ add r8, r5, r7
+
+ // r2 = r2 * 1.41421 = r2 * 27146 / 65536 + r2;
+ // r8 = r8 * 1.84776 / 8 = r8 * 15137 / 65536;
+ // r1 = r1 * 1.41421 = r1 * 27146 / 65536 + r1;
+ smlawt r2, r2, r10, r2
+ smulwb r8, r8, r10
+ smlawt r1, r1, r10, r1
+
+ // r0 = r0 + r6;
+ // r2 = r2 - r6;
+ // r6 = r0 - r6 * 2;
+ add r0, r0, r6
+ sub r2, r2, r6
+ sub r6, r0, r6, lsl #1
+
+ // r5 = r5 * -2.61313 / 8 + r8 = r5 * -21407 / 65536 + r8;
+ // r8 = r7 * -1.08239 / 8 + r8 = r7 * -8867 / 65536 + r8;
+ smlawt r5, r5, r11, r8
+ smlawb r8, r7, r11, r8
+
+ // r4 = r4 + r2;
+ // r0 = r0 + r3;
+ // r2 = r4 - r2 * 2;
+ add r4, r4, r2
+ add r0, r0, r3
+ sub r2, r4, r2, lsl #1
+
+ // r7 = r5 * 8 - r3 = -(r3 - r5 * 8);
+ // r3 = r0 - r3 * 2;
+ // r1 = r1 - r7;
+ // r4 = r4 + r7;
+ // r5 = r8 * 8 - r1 = -(r1 - r8 * 8);
+ // r7 = r4 - r7 * 2;
+ rsb r7, r3, r5, lsl #3
+ sub r3, r0, r3, lsl #1
+ sub r1, r1, r7
+ add r4, r4, r7
+ rsb r5, r1, r8, lsl #3
+ sub r7, r4, r7, lsl #1
+
+ // r2 = r2 + r1;
+ // r6 = r6 + r5;
+ // r1 = r2 - r1 * 2;
+ // r5 = r6 - r5 * 2;
+ add r2, r2, r1
+ add r6, r6, r5
+ sub r1, r2, r1, lsl #1
+ sub r5, r6, r5, lsl #1
+
+ // Step 3: Reorder and Save.
+
+ str r0, [sp, #-4] !
+ str r4, [sp, #32]
+ str r2, [sp, #64]
+ str r6, [sp, #96]
+ str r5, [sp, #128]
+ str r1, [sp, #160]
+ str r7, [sp, #192]
+ str r3, [sp, #224]
+ b pass1_tail
+
+ // Precomputed 16-bit constants: 27146, 15137, -21407, -8867.
+ // Put them in the middle since LDRD only accepts offsets from -255 to 255.
+ .align 3
+constants:
+ .word 0x6a0a3b21
+ .word 0xac61dd5d
+
+pass1_zero:
+ str r4, [sp, #-4] !
+ str r4, [sp, #32]
+ str r4, [sp, #64]
+ str r4, [sp, #96]
+ str r4, [sp, #128]
+ str r4, [sp, #160]
+ str r4, [sp, #192]
+ str r4, [sp, #224]
+ sub r12, r12, #16
+
+pass1_tail:
+ ands r9, sp, #31
+ bne pass1_head
+
+ // r12 = rows, r14 = col.
+ ldr r12, [sp, #256]
+ ldr r14, [sp, #260]
+
+ // Load constants.
+ ldrd r10, constants
+
+pass2_head:
+ // Load coefficients. (c[0, 1, 2, 3, 4, 5, 6, 7])
+ ldmia sp!, {r0, r1, r2, r3, r4, r5, r6, r7}
+
+ // r0 = r0 + 0x00808000;
+ add r0, r0, #0x00800000
+ add r0, r0, #0x00008000
+
+ // Step 1: Analog to the first pass.
+
+ // r0 = r0 + r4;
+ // r6 = r6 + r2;
+ add r0, r0, r4
+ add r6, r6, r2
+
+ // r4 = r0 - r4 * 2;
+ // r2 = r2 * 2 - r6 = -(r6 - r2 * 2);
+ sub r4, r0, r4, lsl #1
+ rsb r2, r6, r2, lsl #1
+
+ // r1 = r1 + r7;
+ // r3 = r3 + r5;
+ add r1, r1, r7
+ add r3, r3, r5
+
+ // Step 2: Rotations and Butterflies.
+
+ // r7 = r1 - r7 * 2;
+ // r1 = r1 - r3;
+ // r5 = r5 * 2 - r3 = -(r3 - r5 * 2);
+ // r3 = r1 + r3 * 2;
+ // r8 = r5 + r7;
+ sub r7, r1, r7, lsl #1
+ sub r1, r1, r3
+ rsb r5, r3, r5, lsl #1
+ add r3, r1, r3, lsl #1
+ add r8, r5, r7
+
+ // r2 = r2 * 1.41421 = r2 * 27146 / 65536 + r2;
+ // r8 = r8 * 1.84776 / 8 = r8 * 15137 / 65536;
+ // r1 = r1 * 1.41421 = r1 * 27146 / 65536 + r1;
+ smlawt r2, r2, r10, r2
+ smulwb r8, r8, r10
+ smlawt r1, r1, r10, r1
+
+ // r0 = r0 + r6;
+ // r2 = r2 - r6;
+ // r6 = r0 - r6 * 2;
+ add r0, r0, r6
+ sub r2, r2, r6
+ sub r6, r0, r6, lsl #1
+
+ // r5 = r5 * -2.61313 / 8 + r8 = r5 * -21407 / 65536 + r8;
+ // r8 = r7 * -1.08239 / 8 + r8 = r7 * -8867 / 65536 + r8;
+ smlawt r5, r5, r11, r8
+ smlawb r8, r7, r11, r8
+
+ // r4 = r4 + r2;
+ // r0 = r0 + r3;
+ // r2 = r4 - r2 * 2;
+ add r4, r4, r2
+ add r0, r0, r3
+ sub r2, r4, r2, lsl #1
+
+ // r7 = r5 * 8 - r3 = -(r3 - r5 * 8);
+ // r3 = r0 - r3 * 2;
+ // r1 = r1 - r7;
+ // r4 = r4 + r7;
+ // r5 = r8 * 8 - r1 = -(r1 - r8 * 8);
+ // r7 = r4 - r7 * 2;
+ rsb r7, r3, r5, lsl #3
+ sub r3, r0, r3, lsl #1
+ sub r1, r1, r7
+ add r4, r4, r7
+ rsb r5, r1, r8, lsl #3
+ sub r7, r4, r7, lsl #1
+
+ // r2 = r2 + r1;
+ // r6 = r6 + r5;
+ // r1 = r2 - r1 * 2;
+ // r5 = r6 - r5 * 2;
+ add r2, r2, r1
+ add r6, r6, r5
+ sub r1, r2, r1, lsl #1
+ sub r5, r6, r5, lsl #1
+
+ // Step 3: Reorder and Save.
+
+ // Load output pointer.
+ ldr r8, [r12], #4
+
+ // For little endian: r6, r2, r4, r0, r3, r7, r1, r5.
+ pkhtb r6, r6, r4, asr #16
+ pkhtb r2, r2, r0, asr #16
+ pkhtb r3, r3, r1, asr #16
+ pkhtb r7, r7, r5, asr #16
+ usat16 r6, #8, r6
+ usat16 r2, #8, r2
+ usat16 r3, #8, r3
+ usat16 r7, #8, r7
+ orr r0, r2, r6, lsl #8
+ orr r1, r7, r3, lsl #8
+
+#ifdef __ARMEB__
+ // Reverse bytes for big endian.
+ rev r0, r0
+ rev r1, r1
+#endif
+
+ // Use STR instead of STRD to support unaligned access.
+ str r0, [r8, r14] !
+ str r1, [r8, #4]
+
+pass2_tail:
+ adds r9, r9, #0x10000000
+ bpl pass2_head
+
+ ldr sp, [sp, #8]
+ add sp, sp, #236
+
+ ldmia sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, r14}
+ bx lr
+ .endfunc
+
+#endif
diff --git a/jdcoefct.c b/jdcoefct.c
index 9e8040b..e6e9506 100644
--- a/jdcoefct.c
+++ b/jdcoefct.c
@@ -277,15 +277,24 @@ consume_data (j_decompress_ptr cinfo)
unsigned int MCUs_per_row = cinfo->MCUs_per_row;
#ifdef ANDROID_TILE_BASED_DECODE
if (cinfo->tile_decode) {
+ int iMCU_width_To_MCU_width;
+ if (cinfo->comps_in_scan > 1) {
+ // Interleaved
+ iMCU_width_To_MCU_width = 1;
+ } else {
+ // Non-interleaved
+ iMCU_width_To_MCU_width = cinfo->cur_comp_info[0]->h_samp_factor;
+ }
MCUs_per_row = jmin(MCUs_per_row,
(cinfo->coef->column_right_boundary - cinfo->coef->column_left_boundary)
- * cinfo->entropy->index->MCU_sample_size * cinfo->max_h_samp_factor);
+ * cinfo->entropy->index->MCU_sample_size * iMCU_width_To_MCU_width);
}
#endif
/* Loop to process one whole iMCU row */
for (yoffset = coef->MCU_vert_offset; yoffset < coef->MCU_rows_per_iMCU_row;
yoffset++) {
+ // configure huffman decoder
#ifdef ANDROID_TILE_BASED_DECODE
if (cinfo->tile_decode) {
huffman_scan_header scan_header =
@@ -296,8 +305,10 @@ consume_data (j_decompress_ptr cinfo)
[col_offset + yoffset * scan_header.MCUs_per_row]);
}
#endif
+
+ // zero all blocks
for (MCU_col_num = coef->MCU_ctr; MCU_col_num < MCUs_per_row;
- MCU_col_num++) {
+ MCU_col_num++) {
/* Construct list of pointers to DCT blocks belonging to this MCU */
blkn = 0; /* index of current DCT block within MCU */
for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
@@ -309,7 +320,7 @@ consume_data (j_decompress_ptr cinfo)
coef->MCU_buffer[blkn++] = buffer_ptr++;
#ifdef ANDROID_TILE_BASED_DECODE
if (cinfo->tile_decode && cinfo->input_scan_number == 0) {
- // need to do pre-zero ourself.
+ // need to do pre-zero ourselves.
jzero_far((void FAR *) coef->MCU_buffer[blkn-1],
(size_t) (SIZEOF(JBLOCK)));
}
@@ -317,12 +328,14 @@ consume_data (j_decompress_ptr cinfo)
}
}
}
+
+
/* Try to fetch the MCU. */
if (! (*cinfo->entropy->decode_mcu) (cinfo, coef->MCU_buffer)) {
- /* Suspension forced; update state counters and exit */
- coef->MCU_vert_offset = yoffset;
- coef->MCU_ctr = MCU_col_num;
- return JPEG_SUSPENDED;
+ /* Suspension forced; update state counters and exit */
+ coef->MCU_vert_offset = yoffset;
+ coef->MCU_ctr = MCU_col_num;
+ return JPEG_SUSPENDED;
}
}
/* Completed an MCU row, but perhaps not an iMCU row */
@@ -584,14 +597,14 @@ decompress_data (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
int start_block = 0;
#if ANDROID_TILE_BASED_DECODE
if (cinfo->tile_decode) {
+ // width_in_blocks for a component depends on its h_samp_factor.
width_in_blocks = jmin(width_in_blocks,
(cinfo->coef->MCU_column_right_boundary -
cinfo->coef->MCU_column_left_boundary) *
- cinfo->max_h_samp_factor /
compptr->h_samp_factor);
start_block = coef->pub.MCU_columns_to_skip *
- cinfo->max_h_samp_factor / compptr->h_samp_factor;
- }
+ compptr->h_samp_factor;
+ }
#endif
/* Loop over all DCT blocks to be processed. */
for (block_row = 0; block_row < block_rows; block_row++) {
diff --git a/jddctmgr.c b/jddctmgr.c
index bbf8d0e..74a96db 100644
--- a/jddctmgr.c
+++ b/jddctmgr.c
@@ -20,6 +20,35 @@
#include "jpeglib.h"
#include "jdct.h" /* Private declarations for DCT subsystem */
+#ifdef ANDROID_ARMV6_IDCT
+ #undef ANDROID_ARMV6_IDCT
+ #ifdef __arm__
+ #include <machine/cpu-features.h>
+ #if __ARM_ARCH__ >= 6
+ #define ANDROID_ARMV6_IDCT
+ #else
+ #warning "ANDROID_ARMV6_IDCT is disabled"
+ #endif
+ #endif
+#endif
+
+#ifdef ANDROID_ARMV6_IDCT
+
+/* Intentionally declare the prototype with arguments of primitive types instead
+ * of type-defined ones. This will at least generate some warnings if jmorecfg.h
+ * is changed and becomes incompatible with the assembly code.
+ */
+extern void armv6_idct(short *coefs, int *quans, unsigned char **rows, int col);
+
+void jpeg_idct_armv6 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+ JCOEFPTR coef_block,
+ JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+ IFAST_MULT_TYPE *dct_table = (IFAST_MULT_TYPE *)compptr->dct_table;
+ armv6_idct(coef_block, dct_table, output_buf, output_col);
+}
+
+#endif
/*
* The decompressor input side (jdinput.c) saves away the appropriate
@@ -115,6 +144,13 @@ start_pass (j_decompress_ptr cinfo)
#endif
case DCTSIZE:
switch (cinfo->dct_method) {
+#ifdef ANDROID_ARMV6_IDCT
+ case JDCT_ISLOW:
+ case JDCT_IFAST:
+ method_ptr = jpeg_idct_armv6;
+ method = JDCT_IFAST;
+ break;
+#else /* ANDROID_ARMV6_IDCT */
#ifdef DCT_ISLOW_SUPPORTED
case JDCT_ISLOW:
method_ptr = jpeg_idct_islow;
@@ -127,6 +163,7 @@ start_pass (j_decompress_ptr cinfo)
method = JDCT_IFAST;
break;
#endif
+#endif /* ANDROID_ARMV6_IDCT */
#ifdef DCT_FLOAT_SUPPORTED
case JDCT_FLOAT:
method_ptr = jpeg_idct_float;
@@ -181,6 +218,27 @@ start_pass (j_decompress_ptr cinfo)
* IFAST_SCALE_BITS.
*/
IFAST_MULT_TYPE * ifmtbl = (IFAST_MULT_TYPE *) compptr->dct_table;
+#ifdef ANDROID_ARMV6_IDCT
+ /* Precomputed values scaled up by 15 bits. */
+ static const unsigned short scales[DCTSIZE2] = {
+ 32768, 45451, 42813, 38531, 32768, 25746, 17734, 9041,
+ 45451, 63042, 59384, 53444, 45451, 35710, 24598, 12540,
+ 42813, 59384, 55938, 50343, 42813, 33638, 23170, 11812,
+ 38531, 53444, 50343, 45308, 38531, 30274, 20853, 10631,
+ 32768, 45451, 42813, 38531, 32768, 25746, 17734, 9041,
+ 25746, 35710, 33638, 30274, 25746, 20228, 13933, 7103,
+ 17734, 24598, 23170, 20853, 17734, 13933, 9598, 4893,
+ 9041, 12540, 11812, 10631, 9041, 7103, 4893, 2494,
+ };
+ /* Inverse map of [7, 5, 1, 3, 0, 2, 4, 6]. */
+ static const char orders[DCTSIZE] = {4, 2, 5, 3, 6, 1, 7, 0};
+ /* Reorder the columns after transposing. */
+ for (i = 0; i < DCTSIZE2; ++i) {
+ int j = ((i & 7) << 3) + orders[i >> 3];
+ ifmtbl[j] = (qtbl->quantval[i] * scales[i] + 2) >> 2;
+ }
+#else /* ANDROID_ARMV6_IDCT */
+
#define CONST_BITS 14
static const INT16 aanscales[DCTSIZE2] = {
/* precomputed values scaled up by 14 bits */
@@ -201,6 +259,7 @@ start_pass (j_decompress_ptr cinfo)
(INT32) aanscales[i]),
CONST_BITS-IFAST_SCALE_BITS);
}
+#endif /* ANDROID_ARMV6_IDCT */
}
break;
#endif
diff --git a/jdhuff.c b/jdhuff.c
index 0d704a5..bc5d4fd 100644
--- a/jdhuff.c
+++ b/jdhuff.c
@@ -295,20 +295,10 @@ jpeg_fill_bit_buffer (bitread_working_state * state,
int nbits)
/* Load up the bit buffer to a depth of at least nbits */
{
- j_decompress_ptr cinfo = state->cinfo;
- if (cinfo->tile_decode &&
- cinfo->restart_interval == 0 &&
- cinfo->unread_marker >= 0xd0 &&
- cinfo->unread_marker <= 0xd7 &&
- nbits > bits_left
- ) {
- // Skip the restart marker.
- cinfo->marker->next_restart_num = cinfo->unread_marker - 0xd0;
- process_restart(cinfo);
- }
/* Copy heavily used state fields into locals (hopefully registers) */
register const JOCTET * next_input_byte = state->next_input_byte;
register size_t bytes_in_buffer = state->bytes_in_buffer;
+ j_decompress_ptr cinfo = state->cinfo;
/* Attempt to load at least MIN_GET_BITS bits into get_buffer. */
/* (It is assumed that no request will be for more than that many bits.) */
@@ -509,24 +499,20 @@ process_restart (j_decompress_ptr cinfo)
}
/*
- * Configure the Huffman decoder reader position and bit buffer.
+ * Save the current Huffman decode position and the DC coefficients
+ * for each component into bitstream_offset and dc_info[], respectively.
*/
-GLOBAL(void)
-jpeg_configure_huffman_decoder(j_decompress_ptr cinfo,
- huffman_offset_data offset)
+METHODDEF(void)
+get_huffman_decoder_configuration(j_decompress_ptr cinfo,
+ huffman_offset_data *offset)
{
- unsigned int bitstream_offset = offset.bitstream_offset;
- int blkn, i;
-
- cinfo->restart_interval = 0;
- cinfo->unread_marker = 0;
-
- unsigned int byte_offset = bitstream_offset >> LOG_TWO_BIT_BUF_SIZE;
- unsigned int bit_in_bit_buffer =
- bitstream_offset & ((1 << LOG_TWO_BIT_BUF_SIZE) - 1);
-
- jset_input_stream_position_bit(cinfo, byte_offset,
- bit_in_bit_buffer, offset.get_buffer);
+ huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
+ short int *dc_info = offset->prev_dc;
+ int i;
+ jpeg_get_huffman_decoder_configuration(cinfo, offset);
+ for (i = 0; i < cinfo->comps_in_scan; i++) {
+ dc_info[i] = entropy->saved.last_dc_val[i];
+ }
}
/*
@@ -546,6 +532,10 @@ jpeg_get_huffman_decoder_configuration(j_decompress_ptr cinfo,
return;
}
+ // Save restarts_to_go and next_restart_num
+ offset->restarts_to_go = (unsigned short) entropy->restarts_to_go;
+ offset->next_restart_num = cinfo->marker->next_restart_num;
+
offset->bitstream_offset =
(jget_input_stream_position(cinfo) << LOG_TWO_BIT_BUF_SIZE)
+ entropy->bitstate.bits_left;
@@ -570,20 +560,28 @@ configure_huffman_decoder(j_decompress_ptr cinfo, huffman_offset_data offset)
}
/*
- * Save the current Huffman deocde position and the DC coefficients
- * for each component into bitstream_offset and dc_info[], respectively.
+ * Configure the Huffman decoder reader position and bit buffer.
*/
-METHODDEF(void)
-get_huffman_decoder_configuration(j_decompress_ptr cinfo,
- huffman_offset_data *offset)
+GLOBAL(void)
+jpeg_configure_huffman_decoder(j_decompress_ptr cinfo,
+ huffman_offset_data offset)
{
huff_entropy_ptr entropy = (huff_entropy_ptr) cinfo->entropy;
- short int *dc_info = offset->prev_dc;
- int i;
- jpeg_get_huffman_decoder_configuration(cinfo, offset);
- for (i = 0; i < cinfo->comps_in_scan; i++) {
- dc_info[i] = entropy->saved.last_dc_val[i];
- }
+
+ // Restore restarts_to_go and next_restart_num
+ cinfo->unread_marker = 0;
+ entropy->restarts_to_go = offset.restarts_to_go;
+ cinfo->marker->next_restart_num = offset.next_restart_num;
+
+ unsigned int bitstream_offset = offset.bitstream_offset;
+ int blkn, i;
+
+ unsigned int byte_offset = bitstream_offset >> LOG_TWO_BIT_BUF_SIZE;
+ unsigned int bit_in_bit_buffer =
+ bitstream_offset & ((1 << LOG_TWO_BIT_BUF_SIZE) - 1);
+
+ jset_input_stream_position_bit(cinfo, byte_offset,
+ bit_in_bit_buffer, offset.get_buffer);
}
/*
diff --git a/jdphuff.c b/jdphuff.c
index a58cdd2..922017e 100644
--- a/jdphuff.c
+++ b/jdphuff.c
@@ -632,6 +632,52 @@ undoit:
}
/*
+ * Save the current Huffman decode position and the DC coefficients
+ * for each component into bitstream_offset and dc_info[], respectively.
+ */
+METHODDEF(void)
+get_huffman_decoder_configuration(j_decompress_ptr cinfo,
+ huffman_offset_data *offset)
+{
+ int i;
+ phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
+ jpeg_get_huffman_decoder_configuration_progressive(cinfo, offset);
+ offset->EOBRUN = entropy->saved.EOBRUN;
+ for (i = 0; i < cinfo->comps_in_scan; i++)
+ offset->prev_dc[i] = entropy->saved.last_dc_val[i];
+}
+
+
+/*
+ * Save the current Huffman decoder position and the bit buffer
+ * into bitstream_offset and get_buffer, respectively.
+ */
+GLOBAL(void)
+jpeg_get_huffman_decoder_configuration_progressive(j_decompress_ptr cinfo,
+ huffman_offset_data *offset)
+{
+ phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
+
+ if (cinfo->restart_interval) {
+ // We are at the end of a data segment
+ if (entropy->restarts_to_go == 0)
+ if (! process_restart(cinfo))
+ return;
+ }
+
+ // Save restarts_to_go and next_restart_num.
+ offset->restarts_to_go = (unsigned short) entropy->restarts_to_go;
+ offset->next_restart_num = cinfo->marker->next_restart_num;
+
+ offset->bitstream_offset =
+ (jget_input_stream_position(cinfo) << LOG_TWO_BIT_BUF_SIZE)
+ + entropy->bitstate.bits_left;
+
+ offset->get_buffer = entropy->bitstate.get_buffer;
+}
+
+
+/*
* Configure the Huffman decoder to decode the image
* starting from (iMCU_row_offset, iMCU_col_offset).
*/
@@ -640,32 +686,58 @@ configure_huffman_decoder(j_decompress_ptr cinfo, huffman_offset_data offset)
{
int i;
phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
- jpeg_configure_huffman_decoder(cinfo, offset);
+ jpeg_configure_huffman_decoder_progressive(cinfo, offset);
entropy->saved.EOBRUN = offset.EOBRUN;
for (i = 0; i < cinfo->comps_in_scan; i++)
entropy->saved.last_dc_val[i] = offset.prev_dc[i];
}
/*
- * Save the current Huffman deocde position and the DC coefficients
- * for each component into bitstream_offset and dc_info[], respectively.
+ * Configure the Huffman decoder reader position and bit buffer.
*/
-METHODDEF(void)
-get_huffman_decoder_configuration(j_decompress_ptr cinfo,
- huffman_offset_data *offset)
+GLOBAL(void)
+jpeg_configure_huffman_decoder_progressive(j_decompress_ptr cinfo,
+ huffman_offset_data offset)
+{
+ phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
+
+ // Restore restarts_to_go and next_restart_num
+ cinfo->unread_marker = 0;
+ entropy->restarts_to_go = offset.restarts_to_go;
+ cinfo->marker->next_restart_num = offset.next_restart_num;
+
+ unsigned int bitstream_offset = offset.bitstream_offset;
+ int blkn, i;
+
+ unsigned int byte_offset = bitstream_offset >> LOG_TWO_BIT_BUF_SIZE;
+ unsigned int bit_in_bit_buffer =
+ bitstream_offset & ((1 << LOG_TWO_BIT_BUF_SIZE) - 1);
+
+ jset_input_stream_position_bit(cinfo, byte_offset,
+ bit_in_bit_buffer, offset.get_buffer);
+}
+
+GLOBAL(void)
+jpeg_configure_huffman_index_scan(j_decompress_ptr cinfo,
+ huffman_index *index, int scan_no, int offset)
{
- int i;
phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
- jpeg_get_huffman_decoder_configuration(cinfo, offset);
- offset->EOBRUN = entropy->saved.EOBRUN;
- for (i = 0; i < cinfo->comps_in_scan; i++)
- offset->prev_dc[i] = entropy->saved.last_dc_val[i];
+ if (scan_no >= index->scan_count) {
+ index->scan = realloc(index->scan,
+ (scan_no + 1) * sizeof(huffman_scan_header));
+ index->mem_used += (scan_no - index->scan_count + 1)
+ * (sizeof(huffman_scan_header) + cinfo->total_iMCU_rows
+ * sizeof(huffman_offset_data*));
+ index->scan_count = scan_no + 1;
+ }
+ index->scan[scan_no].offset = (huffman_offset_data**)malloc(
+ cinfo->total_iMCU_rows * sizeof(huffman_offset_data*));
+ index->scan[scan_no].bitstream_offset = offset;
}
/*
* Module initialization routine for progressive Huffman entropy decoding.
*/
-
GLOBAL(void)
jinit_phuff_decoder (j_decompress_ptr cinfo)
{
@@ -697,22 +769,4 @@ jinit_phuff_decoder (j_decompress_ptr cinfo)
*coef_bit_ptr++ = -1;
}
-GLOBAL(void)
-jpeg_configure_huffman_index_scan(j_decompress_ptr cinfo,
- huffman_index *index, int scan_no, int offset)
-{
- phuff_entropy_ptr entropy = (phuff_entropy_ptr) cinfo->entropy;
- if (scan_no >= index->scan_count) {
- index->scan = realloc(index->scan,
- (scan_no + 1) * sizeof(huffman_scan_header));
- index->mem_used += (scan_no - index->scan_count + 1)
- * (sizeof(huffman_scan_header) + cinfo->total_iMCU_rows
- * sizeof(huffman_offset_data*));
- index->scan_count = scan_no + 1;
- }
- index->scan[scan_no].offset = (huffman_offset_data**)malloc(
- cinfo->total_iMCU_rows * sizeof(huffman_offset_data*));
- index->scan[scan_no].bitstream_offset = offset;
-}
-
#endif /* D_PROGRESSIVE_SUPPORTED */
diff --git a/jidctfst.S b/jidctfst.S
deleted file mode 100644
index 34e1c24..0000000
--- a/jidctfst.S
+++ /dev/null
@@ -1,476 +0,0 @@
-/*
- * Copyright (C) 2008 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <machine/cpu-features.h>
-
- .text
- .align
-
- .global jpeg_idct_ifast
- .func jpeg_idct_ifast
-
-// NOTE: sb=r9, fp=r11 ip=r12, sp=r13, lr=r14, pc=r15
-// C-level signature implemented below; the code reads the first four arguments from r0-r3 and output_col from the stack:
-// jpeg_idct_ifast (j_decompress_ptr cinfo,
-// jpeg_component_info * compptr,
-// short* coef_block,
-// unsigned char* output_buf,
-// int output_col)
-// Local frame relative to sp after the prologue: four temp words, three saved values, one pad word, then the 8x8 workspace of 32-bit words.
-#define local_TMP0123 sp
-#define local_TMP0 [sp, #0]
-#define local_TMP1 [sp, #4]
-#define local_TMP2 [sp, #8]
-#define local_TMP3 [sp, #12]
-#define local_RANGE_TABLE [sp, #16]
-#define local_OUTPUT_COL [sp, #20]
-#define local_OUTPUT_BUF [sp, #24]
-#define local_UNUSED [sp, #28]
-#define off_WORKSPACE 32
-#define local_WORKSPACE [sp, #offWORKSPACE]
-#define local_SIZE (off_WORKSPACE + 8*8*4)
-// NOTE(review): local_WORKSPACE expands to #offWORKSPACE (no underscore), which this file never defines; the macro is not used. The two offsets that follow are hard-coded struct offsets -- TODO confirm against the C struct layouts.
-#define off_DECOMPRESS_range_limit_base 324
-#define off_COMPINFO_quanttable 80
-// Row strides in bytes: VY(n) addresses row n of the 16-bit coefficient block, QY(n) row n of a table of 32-bit words.
-#define DCTSIZE 8
-#define VY(x) ((x)*DCTSIZE*2)
-#define QY(x) ((x)*DCTSIZE*4)
-// Byte offsets of element x within a row; not referenced by the code in this file.
-#define VX(x) ((x)*2)
-#define QX(x) ((x)*4)
-// Fixed-point multipliers scaled by 256; the code builds these values inline (360+2, 472+1, 276+1, 668+1) and does not use these macros.
-#define FIX_1_414213562 #362
-#define FIX_1_082392200 #277
-#define FIX_1_847759065 #473
-#define FIX_2_613125930 #669
-// Only mentioned in comments; no instruction in this file references it.
-#define RANGE_MASK 1023
-
-
-
-jpeg_idct_ifast: // Entry: save registers, build the local frame, set up pointers for the column pass
- PLD (r2, #0) // r2 = coef_block; PLD is presumably a preload macro from <machine/cpu-features.h> -- TODO confirm
- stmdb sp!, {r4,r5, r6,r7, r8,r9, r10,r11, r12,lr} // push ten registers (40 bytes)
- ldr r4, [sp, #4*10] // r4 = fifth argument (output_col), found just above the ten saved words
- sub sp, #local_SIZE // reserve 32 bytes of slots plus the 8*8*4-byte workspace
-
- ldr r10,[r1, #off_COMPINFO_quanttable] // r10 = quanttable
- str r4, local_OUTPUT_COL // spill output_col; r4 is reused as scratch in the loops
- str r3, local_OUTPUT_BUF // spill output_buf; r3 is reused as scratch in the loops
- ldr r5, [r0, #off_DECOMPRESS_range_limit_base] // r5 = word loaded from cinfo + 324
- add r5, r5, #128 // bias that pointer by 128 bytes
- str r5, local_RANGE_TABLE // kept for pass 2, which reloads it into r10 but never indexes through it
- mov fp, r2 // fp = coef_block
- add ip, sp, #off_WORKSPACE // ip = first word of the workspace
-
-VLoopTail: // Pass 1, one column per iteration: dequantize and run the 1-D IDCT into the workspace
- ldrsh r0, [fp, #VY(0)] // r0..r7 = the eight signed 16-bit coefficients of the current column
- ldrsh r1, [fp, #VY(1)]
- ldrsh r2, [fp, #VY(2)]
- ldrsh r3, [fp, #VY(3)]
- ldrsh r4, [fp, #VY(4)]
- ldrsh r5, [fp, #VY(5)]
- ldrsh r6, [fp, #VY(6)]
- ldrsh r7, [fp, #VY(7)]
-
- cmp r1, #0 // Z survives to the beq only when r1..r7 are all zero
- orreqs r8, r2, r3 // each orreqs runs only while Z is still set and re-tests the next pair
- orreqs r8, r4, r5
- orreqs r8, r6, r7
- beq VLoopHeadZero // shortcut: only the row-0 term of this column is non-zero
-
-VLoopHead:
- // tmp0 = DEQUANTIZE(in[DCTSIZE*0], quant[DCTSIZE*0]) (r0)
- // tmp2 = DEQUANTIZE(in[DCTSIZE*4], quant[DCTSIZE*4]) (r4)
- // tmp1 = DEQUANTIZE(in[DCTSIZE*2], quant[DCTSIZE*2]) (r2)
- // tmp3 = DEQUANTIZE(in[DCTSIZE*6], quant[DCTSIZE*6]) (r6)
- // tmp10 = tmp0 + tmp2 (r0)
- // tmp11 = tmp0 - tmp2 (r4)
-
- ldr r9, [r10, #QY(4)] // quant entries are loaded as 32-bit words
- ldr r8, [r10, #QY(0)]
-#if __ARM_HAVE_HALFWORD_MULTIPLY
- smulbb r4, r9, r4 // r4 = tmp2; multiplies the low halfwords only -- assumes quant values fit in 16 bits, TODO confirm
- smlabb r0, r8, r0, r4 // r0 = tmp0 + tmp2 = tmp10
-#else
- mul r4, r9, r4
- mul r0, r8, r0
- add r0, r4
-#endif
- ldr r9, [r10, #QY(6)]
- ldr r8, [r10, #QY(2)]
- sub r4, r0, r4, lsl #1 // r4 = tmp10 - 2*tmp2 = tmp11
-#if __ARM_HAVE_HALFWORD_MULTIPLY
- smulbb r6, r9, r6 // r6 = tmp3
- smlabb r2, r8, r2, r6 // r2 = tmp1 + tmp3 = tmp13
-#else
- mul r6, r9, r6
- mul r2, r8, r2
- add r2, r6
-#endif
-
- // tmp13 = tmp1 + tmp3 (r2)
- // tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13 (r6)
- // FIX_1_4142... = 362 = 45*8 + 2
- sub r6, r2, r6, lsl #1 // r6 = tmp13 - 2*tmp3 = tmp1 - tmp3
- mov r8, #360 // 362 is built in two steps, presumably because it is not a valid single ARM immediate
- add r8, r8, #2
- mul r9, r6, r8 // r9 = (tmp1 - tmp3) * 362, still scaled by 256
-
- // tmp0 = tmp10 + tmp13; (r0)
- // tmp3 = tmp10 - tmp13; (r8)
- // tmp1 = tmp11 + tmp12; (r4)
- // tmp2 = tmp11 - tmp12; (r6)
- add r0, r0, r2
- rsb r6, r2, r9, asr #8 // r6 = (r9 >> 8) - tmp13 = tmp12
- sub r8, r0, r2, lsl #1
- add r4, r4, r6
- sub r6, r4, r6, lsl #1
-
- stmia local_TMP0123, {r0, r4, r6, r8} // spill tmp0..tmp3 to the four temp words at sp
-
- // NOTE: be sure not to use r0,r4,r6,r8 soon after the stm above
-
- // odd part
- // tmp4 = DEQUANTIZE( in[DCTSIZE*1], quant[DCTSIZE*1] ) (r1)
- // tmp6 = DEQUANTIZE( in[DCTSIZE*5], quant[DCTSIZE*5] ) (r5)
- // tmp5 = DEQUANTIZE( in[DCTSIZE*3], quant[DCTSIZE*3] ) (r3)
- // tmp7 = DEQUANTIZE( in[DCTSIZE*7], quant[DCTSIZE*7] ) (r7)
- // z13 = tmp6 + tmp5; (r0)
- // z10 = tmp6 - tmp5; (r2)
- // z11 = tmp4 + tmp7; (r4)
- // z12 = tmp4 - tmp7; (r6)
-
- ldr r2, [r10, #QY(1)]
- ldr r9, [r10, #QY(5)]
-#if __ARM_HAVE_HALFWORD_MULTIPLY
- smulbb r1, r2, r1 // r1 = tmp4
-#else
- mul r1, r2, r1
-#endif
- ldr r2, [r10, #QY(3)]
-#if __ARM_HAVE_HALFWORD_MULTIPLY
- smulbb r5, r9, r5 // r5 = tmp6
-#else
- mul r5, r9, r5
-#endif
- ldr r9, [r10, #QY(7)]
-#if __ARM_HAVE_HALFWORD_MULTIPLY
- smlabb r0, r2, r3, r5 // r0 = tmp5 + tmp6 = z13 (tmp5 is never held on its own)
- smlabb r4, r9, r7, r1 // r4 = tmp7 + tmp4 = z11
-#else
- mul r0, r2, r3
- add r0, r5
- mul r4, r9, r7
- add r4, r1
-#endif
- rsb r2, r0, r5, lsl #1 // r2 = 2*tmp6 - z13 = z10
- rsb r6, r4, r1, lsl #1 // r6 = 2*tmp4 - z11 = z12
-
- // tmp7 = z11 + z13; (r7)
- // tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); (r1)
- // FIX_... = 360 + 2
- add r7, r4, r0
- sub r1, r4, r0
- mov r8, #360
- add r8, r8, #2
- mul r1, r8, r1 // r1 = tmp11 scaled by 256
-
- // z5 = MULTIPLY(z10 + z12, FIX_1_847759065); (r8)
- // tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; (r0)
- // tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; (r2)
- // FIX_1_8477... = 473 = 472 + 1
- // FIX_1_082... = 277 = 276 + 1
- // FIX_2_... = 669 = 668 + 1
- add r8, r2, r6
- mov r9, #472
- mla r8, r9, r8, r8 // r8 = 472*r8 + r8 = 473*(z10 + z12), i.e. z5 scaled by 256
- mov r9, #276
- mla r0, r6, r9, r6 // r0 = 277*z12
- mov r9, #668
- mla r2, r9, r2, r2 // r2 = 669*z10
- sub r0, r0, r8 // r0 = tmp10 scaled by 256
- rsb r2, r2, r8 // r2 = z5 - 669*z10 = tmp12 scaled by 256
-
- // tmp6 = tmp12 - tmp7; (r6)
- // tmp5 = tmp11 - tmp6; (r5)
- // tmp4 = tmp10 + tmp5; (r4)
- rsb r6, r7, r2, asr #8 // the asr #8 removes the 256 scale of each product
- rsb r5, r6, r1, asr #8
- add r4, r5, r0, asr #8
-
- ldmia local_TMP0123, {r0, r1, r2, r3} // reload tmp0..tmp3 of the even part
-
- // wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7);
- // wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7);
- // wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6);
- // wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6);
- // wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5);
- // wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5);
- // wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4);
- // wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4);
-
- add r0, r0, r7
- sub r7, r0, r7, lsl #1 // (a + b) - 2*b = a - b, reusing the sum just computed
- add r1, r1, r6
- sub r6, r1, r6, lsl #1
- add r2, r2, r5
- sub r5, r2, r5, lsl #1
- sub r3, r3, r4 // row 3 takes the difference, row 4 the sum
- add r4, r3, r4, lsl #1
-
- str r0, [ip, #QY(0)] // write the column into workspace rows 0..7
- str r1, [ip, #QY(1)]
- str r2, [ip, #QY(2)]
- str r3, [ip, #QY(3)]
- str r4, [ip, #QY(4)]
- str r5, [ip, #QY(5)]
- str r6, [ip, #QY(6)]
- str r7, [ip, #QY(7)]
-
- // inptr++; /* advance pointers to next column */
- // quantptr++;
- // wsptr++;
- add fp, fp, #2
- add r10, r10, #4
- add ip, ip, #4
- add r0, sp, #(off_WORKSPACE + 4*8) // r0 = address just past the first workspace row, reached after 8 columns
- cmp ip, r0
- bne VLoopTail
-
-
-
-HLoopStart: // Pass 2: one workspace row per iteration, producing 8 output bytes
- // reset pointers
- PLD (sp, #off_WORKSPACE)
- add ip, sp, #off_WORKSPACE // ip = first workspace row
- ldr r10, local_RANGE_TABLE // NOTE(review): r10 is not referenced again anywhere in pass 2
-
-HLoopTail:
- // output = *output_buf++ + output_col
- ldr r0, local_OUTPUT_BUF
- ldr r1, local_OUTPUT_COL
- ldr r2, [r0], #4 // r2 = next 32-bit row pointer read from output_buf; r0 advances by 4
- str r0, local_OUTPUT_BUF
- add fp, r2, r1 // fp = destination address for this row
-
- PLD (ip, #32)
- ldmia ip!, {r0-r7} // r0..r7 = the eight workspace words of this row; ip moves to the next row
-
- cmp r1, #0 // same all-zero test as in pass 1, applied to r1..r7
- orreqs r8, r2, r3
- orreqs r8, r4, r5
- orreqs r8, r6, r7
- beq HLoopTailZero
-
-HLoopHead:
- // tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]); (r0)
- // tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]); (r4)
- add r0, r0, r4
- sub r4, r0, r4, lsl #1
-
- // tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]); (r2)
- // tmp12 = MULTIPLY((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6], FIX_1_414213562) - tmp13; (r6)
- // FIX_... = 360 + 2
- add r2, r2, r6
- sub r6, r2, r6, lsl #1
- mov r8, #360
- add r8, r8, #2
- mul r6, r8, r6 // r6 = (wsptr[2] - wsptr[6]) * 362, scaled by 256
-
- // tmp0 = tmp10 + tmp13; (r0)
- // tmp3 = tmp10 - tmp13; (r8)
- // tmp1 = tmp11 + tmp12; (r4)
- // tmp2 = tmp11 - tmp12; (r6)
- add r0, r0, r2
- rsb r6, r2, r6, asr #8 // r6 = (r6 >> 8) - tmp13 = tmp12
- sub r8, r0, r2, lsl #1
- add r4, r4, r6
- sub r6, r4, r6, lsl #1
-
- stmia local_TMP0123, {r0, r4, r6, r8} // spill tmp0..tmp3 to the four temp words at sp
-
- // Odd part
-
- // z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3]; (r0)
- // z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3]; (r2)
- // z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7]; (r4)
- // z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7]; (r6)
- add r0, r5, r3
- sub r2, r5, r3
- add r4, r1, r7
- sub r6, r1, r7
-
- // tmp7 = z11 + z13; (r7)
- // tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); (r1)
- // FIX_... = 360 + 2
- add r7, r4, r0
- sub r1, r4, r0
- mov r8, #360
- add r8, r8, #2
- mul r1, r8, r1
-
- // z5 = MULTIPLY(z10 + z12, FIX_1_847759065); (r8)
- // tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5; (r0)
- // tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5; (r2)
- // FIX_1_8477... = 473 = 472 + 1
- // FIX_1_082... = 277 = 276 + 1
- // FIX_2_... = 669 = 668 + 1
- add r8, r2, r6
- mov r9, #472
- mla r8, r9, r8, r8 // r8 = 473*(z10 + z12)
- mov r9, #276
- mla r0, r6, r9, r6 // r0 = 277*z12
- mov r9, #668
- mla r2, r9, r2, r2 // r2 = 669*z10
- sub r0, r0, r8
- sub r2, r8, r2 // r2 = z5 - 669*z10 = tmp12 scaled by 256
-
- // tmp6 = tmp12 - tmp7; (r6)
- // tmp5 = tmp11 - tmp6; (r5)
- // tmp4 = tmp10 + tmp5; (r4)
- rsb r6, r7, r2, asr #8
- rsb r5, r6, r1, asr #8
- add r4, r5, r0, asr #8
-
- ldmia local_TMP0123, {r0, r1, r2, r3} // reload tmp0..tmp3 of the even part
-
- // outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3) & RANGE_MASK];
- // outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3) & RANGE_MASK];
- // outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3) & RANGE_MASK];
- // outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3) & RANGE_MASK];
- // outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3) & RANGE_MASK];
- // outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3) & RANGE_MASK];
- // outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3) & RANGE_MASK];
- // outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3) & RANGE_MASK];
-// The instructions that follow do not index a range_limit table: each value is shifted right by 5, offset by 128 and saturated to 0..255.
- mov r8, #128
- add r0, r0, r7
- sub r7, r0, r7, lsl #1
- add r0, r8, r0, asr #5 // out0 = 128 + ((tmp0 + tmp7) >> 5)
- add r7, r8, r7, asr #5
- add r1, r1, r6
- sub r6, r1, r6, lsl #1
- add r1, r8, r1, asr #5
- add r6, r8, r6, asr #5
- add r2, r2, r5
- sub r5, r2, r5, lsl #1
- add r2, r8, r2, asr #5
- add r5, r8, r5, asr #5
- sub r3, r3, r4 // r3 = tmp3 - tmp4 (out3), r4 = tmp3 + tmp4 (out4)
- add r4, r3, r4, lsl #1
- add r3, r8, r3, asr #5
- add r4, r8, r4, asr #5
-
-#if __ARM_ARCH__ >= 6
- usat r0, #8, r0 // saturate to the unsigned 8-bit range
- usat r1, #8, r1
- usat r2, #8, r2
- usat r3, #8, r3
- usat r4, #8, r4
- usat r5, #8, r5
- usat r6, #8, r6
- usat r7, #8, r7
-#else
- cmp r0, #255 // unsigned compare: "hi" holds for values above 255 and for negative values
- mvnhi r0, r0, asr #31 // negative -> 0, above 255 -> 0xFFFFFFFF
- andhi r0, #255 // 0xFFFFFFFF -> 255
- cmp r7, #255
- mvnhi r7, r7, asr #31 // no andhi for r7: it is packed with lsl #24, which discards the upper bits
- cmp r1, #255
- mvnhi r1, r1, asr #31
- andhi r1, #255
- cmp r6, #255
- mvnhi r6, r6, asr #31
- andhi r6, #255
- cmp r2, #255
- mvnhi r2, r2, asr #31
- andhi r2, #255
- cmp r5, #255
- mvnhi r5, r5, asr #31
- andhi r5, #255
- cmp r3, #255
- mvnhi r3, r3, asr #31 // no andhi for r3 either: it is also packed with lsl #24
- cmp r4, #255
- mvnhi r4, r4, asr #31
- andhi r4, #255
-#endif
-
- // r3 r2 r1 r0
- orr r0, r0, r1, lsl #8 // pack out0..out3 into one word, out0 in the low byte
- orr r0, r0, r2, lsl #16
- orr r0, r0, r3, lsl #24
-
- // r7 r6 r5 r4
- orr r1, r4, r5, lsl #8 // pack out4..out7 the same way
- orr r1, r1, r6, lsl #16
- orr r1, r1, r7, lsl #24
- stmia fp, {r0, r1} // two word stores; presumably relies on little-endian byte order and a word-aligned fp -- TODO confirm
-
- add r0, sp, #(off_WORKSPACE + 8*8*4) // r0 = end of the workspace
- cmp ip, r0
- bne HLoopTail
-
-Exit: // Common return path: drop the local frame and restore the saved registers
- add sp, sp, #local_SIZE // release the slots and workspace reserved in the prologue
- ldmia sp!, {r4,r5, r6,r7, r8,r9, r10,r11, r12,lr} // pop the same ten registers the prologue pushed
- bx lr
-
-
-VLoopHeadZero: // Pass-1 shortcut taken when r1..r7 of the column are all zero
-// ok, all AC coefficients are 0
- ldr r1, [r10, #QY(0)] // r1 = row-0 quant entry of this column
- add fp, fp, #2 // advance the coefficient and quant pointers to the next column
- add r10, r10, #4
- mul r0, r1, r0 // r0 = dequantized row-0 term
- str r0, [ip, #QY(0)] // the same value fills all eight workspace rows of this column
- str r0, [ip, #QY(1)]
- str r0, [ip, #QY(2)]
- str r0, [ip, #QY(3)]
- str r0, [ip, #QY(4)]
- str r0, [ip, #QY(5)]
- str r0, [ip, #QY(6)]
- str r0, [ip, #QY(7)]
- add ip, ip, #4
- add r0, sp, #(off_WORKSPACE + 4*8) // same end-of-pass-1 test as the general path
- cmp ip, r0
- beq HLoopStart
- b VLoopTail
-
-HLoopTailZero: // Pass-2 shortcut taken when r1..r7 of the row are all zero: all 8 output bytes are equal
- mov r0, r0, asr #5 // r0 = first workspace word of the row, descaled by 5 bits
- add r0, #128
-
-#if __ARM_ARCH__ >= 6
- usat r0, #8, r0 // saturate to the unsigned 8-bit range
-#else
- cmp r0, #255 // unsigned compare: "hi" holds for values above 255 and for negative values
- mvnhi r0, r0, asr #31 // negative -> 0, above 255 -> 0xFFFFFFFF
- andhi r0, r0, #255 // 0xFFFFFFFF -> 255
-#endif
-
- orr r0, r0, lsl #8 // replicate the byte into bits 8..15
- orr r0, r0, lsl #16 // and then into the upper halfword
- mov r1, r0
- stmia fp, {r0, r1} // store the same word twice: 8 identical bytes
-
- add r0, sp, #(off_WORKSPACE + 64*4) // r0 = end of the workspace
- cmp ip, r0
- beq Exit
- b HLoopTail
-
- .endfunc
diff --git a/jpeglib.h b/jpeglib.h
index 83bed4a..07e6872 100644
--- a/jpeglib.h
+++ b/jpeglib.h
@@ -649,6 +649,10 @@ typedef struct {
// save the decoder current bit buffer, entropy->bitstate.get_buffer.
INT32 get_buffer;
+
+ // save the restart info.
+ unsigned short restarts_to_go;
+ unsigned char next_restart_num;
} huffman_offset_data;
typedef struct {