diff options
-rw-r--r-- | Android.mk | 38 | ||||
-rw-r--r-- | jdct.h | 4 | ||||
-rw-r--r-- | jddctmgr.c | 14 | ||||
-rw-r--r-- | jmorecfg.h | 6 | ||||
-rw-r--r-- | mips_idct_le.S | 547 | ||||
-rw-r--r-- | mips_jidctfst.c | 208 |
6 files changed, 811 insertions, 6 deletions
@@ -36,11 +36,23 @@ LOCAL_CFLAGS += -O3 -fstrict-aliasing -fprefetch-loop-arrays LOCAL_CFLAGS += -DANDROID_TILE_BASED_DECODE ifeq ($(TARGET_ARCH_VARIANT),x86-atom) -LOCAL_CFLAGS += -DANDROID_INTELSSE2_IDCT -LOCAL_SRC_FILES += jidctintelsse.c -else + LOCAL_CFLAGS += -DANDROID_INTELSSE2_IDCT + LOCAL_SRC_FILES += jidctintelsse.c +endif + # enable armv6 idct assembly -LOCAL_CFLAGS += -DANDROID_ARMV6_IDCT +ifeq ($(strip $(TARGET_ARCH)),arm) + LOCAL_CFLAGS += -DANDROID_ARMV6_IDCT +endif + +# use mips assembler IDCT implementation if MIPS DSP-ASE is present +ifeq ($(strip $(TARGET_ARCH)),mips) + ifeq ($(strip $(ARCH_MIPS_HAS_DSP)),true) + LOCAL_CFLAGS += -DANDROID_MIPS_IDCT + LOCAL_SRC_FILES += \ + mips_jidctfst.c \ + mips_idct_le.S + endif endif LOCAL_MODULE := libjpeg_static @@ -67,3 +79,21 @@ LOCAL_SDK_VERSION := 17 endif include $(BUILD_SHARED_LIBRARY) + +include $(CLEAR_VARS) +LOCAL_ARM_MODE := arm +LOCAL_SRC_FILES := \ + cjpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h rdswitch.c cdjpeg.c rdtarga.c rdppm.c rdgif.c rdbmp.c +LOCAL_MODULE:= cjpeg +LOCAL_MODULE_TAGS := eng +LOCAL_SHARED_LIBRARIES := libc libcutils libjpeg +include $(BUILD_EXECUTABLE) + +include $(CLEAR_VARS) +LOCAL_ARM_MODE := arm +LOCAL_SRC_FILES := \ + djpeg.c cdjpeg.h jinclude.h jconfig.h jpeglib.h jmorecfg.h jerror.h cderror.h jversion.h cdjpeg.c wrppm.c wrgif.c wrbmp.c rdcolmap.c wrtarga.c +LOCAL_MODULE:= djpeg +LOCAL_MODULE_TAGS := eng +LOCAL_SHARED_LIBRARIES := libc libcutils libjpeg +include $(BUILD_EXECUTABLE) @@ -27,7 +27,11 @@ */ #if BITS_IN_JSAMPLE == 8 +#ifdef ANDROID_MIPS_IDCT +typedef short DCTELEM; /* 16 or 32 bits is fine */ +#else typedef int DCTELEM; /* 16 or 32 bits is fine */ +#endif #else typedef INT32 DCTELEM; /* must have 32 bits */ #endif @@ -57,6 +57,10 @@ extern void jpeg_idct_intelsse (j_decompress_ptr cinfo, jpeg_component_info * co JSAMPARRAY output_buf, JDIMENSION output_col); #endif +#ifdef ANDROID_MIPS_IDCT +extern void 
jpeg_idct_mips(j_decompress_ptr, jpeg_component_info *, JCOEFPTR, JSAMPARRAY, JDIMENSION); +#endif + /* * The decompressor input side (jdinput.c) saves away the appropriate * quantization table for each component at the start of the first scan @@ -164,7 +168,14 @@ start_pass (j_decompress_ptr cinfo) method_ptr = jpeg_idct_intelsse; method = JDCT_ISLOW; /* Use quant table of ISLOW.*/ break; -#else +#else /* ANDROID_INTELSSE2_IDCT */ +#ifdef ANDROID_MIPS_IDCT + case JDCT_ISLOW: + case JDCT_IFAST: + method_ptr = jpeg_idct_mips; + method = JDCT_IFAST; + break; +#else /* ANDROID_MIPS_IDCT */ #ifdef DCT_ISLOW_SUPPORTED case JDCT_ISLOW: method_ptr = jpeg_idct_islow; @@ -177,6 +188,7 @@ start_pass (j_decompress_ptr cinfo) method = JDCT_IFAST; break; #endif +#endif /* ANDROID_MIPS_IDCT */ #endif /* ANDROID_INTELSSE2_IDCT*/ #endif /* ANDROID_ARMV6_IDCT */ #ifdef DCT_FLOAT_SUPPORTED @@ -367,7 +367,11 @@ typedef int boolean; #ifdef ANDROID_INTELSSE2_IDCT #define MULTIPLIER short #else - #define MULTIPLIER int /* type for fastest integer multiply */ + #ifdef ANDROID_MIPS_IDCT + #define MULTIPLIER short + #else + #define MULTIPLIER int /* type for fastest integer multiply */ + #endif #endif #endif diff --git a/mips_idct_le.S b/mips_idct_le.S new file mode 100644 index 0000000..bdb6ffa --- /dev/null +++ b/mips_idct_le.S @@ -0,0 +1,547 @@ +# +# Copyright (C) 2011 The Android Open Source Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +# IDCT implementation using the MIPS DSP ASE (little endian version) +# +# See MIPS Technologies Inc documents: +# "JPEG Decoder Optimization for MIPS32(R) Cores" MD00483 +# +# "MIPS32(R) Architecture for Programmers Volume IV-e: The MIPS(R) DSP +# Application-Specific Extension to the MIPS32(R) Architecture" MD00374 +# + + .set noreorder + .set nomacro + .set noat + +# This table has been moved to mips_jidctfst.c to avoid having to mess +# with the global pointer to make this code PIC. +# .rdata +# +# mips_idct_coefs: +# # Constant table of scaled IDCT coefficients. +# +# .word 0x45464546 # FIX( 1.082392200 / 2) = 17734 = 0x4546 +# .word 0x5A825A82 # FIX( 1.414213562 / 2) = 23170 = 0x5A82 +# .word 0x76427642 # FIX( 1.847759065 / 2) = 30274 = 0x7642 +# .word 0xAC61AC61 # FIX(-2.613125930 / 4) = -21407 = 0xAC61 + + .text + + .global mips_idct_columns + .ent mips_idct_columns + +# void mips_idct_columns(JCOEF * inptr, IFAST_MULT_TYPE * quantptr, +# DCTELEM * wsptr, const int * mips_idct_coefs); + +mips_idct_columns: + +# $a0 - inptr +# $a1 - quantptr +# $a2 - wsptr +# $a3, $at - mips_idct_coefs +# $t0:7 - simd data +# $t8 - coefficients, temp +# $t9 - loop end address +# $s0:3 - simd quantization factors +# $s4:7 - temp results +# $v0:1 - temp results + + addiu $sp, $sp, -32 # reserve stack space for s0-s7 + + sw $s0, 28($sp) + sw $s1, 24($sp) + sw $s2, 20($sp) + sw $s3, 16($sp) + sw $s4, 12($sp) + sw $s5, 8($sp) + sw $s6, 4($sp) + sw $s7, 0($sp) + + addiu $t9, $a0, 16 # end address + + #lui $at, %hi(mips_idct_coefs) + #ori $at, %lo(mips_idct_coefs) + # move mips_idct_coefs address from $a3 into $at where the rest of this code expects it + or $at, $a3, $zero + +loop_columns: + + lw $s0, 0($a1) # quantptr[DCTSIZE*0] + + lw $t0, 0($a0) # inptr[DCTSIZE*0] + lw $t1, 16($a0) # inptr[DCTSIZE*1] + + muleq_s.w.phl $v0, $t0, $s0 # tmp0 ...
+ + lw $t2, 32($a0) # inptr[DCTSIZE*2] + lw $t3, 48($a0) # inptr[DCTSIZE*3] + lw $t4, 64($a0) # inptr[DCTSIZE*4] + lw $t5, 80($a0) # inptr[DCTSIZE*5] + + muleq_s.w.phr $t0, $t0, $s0 # ... tmp0 ... + + lw $t6, 96($a0) # inptr[DCTSIZE*6] + lw $t7, 112($a0) # inptr[DCTSIZE*7] + + or $s4, $t1, $t2 + or $s5, $t3, $t4 + + bnez $s4, full_column + ins $t0, $v0, 16, 16 # ... tmp0 + + bnez $s5, full_column + or $s6, $t5, $t6 + or $s6, $s6, $t7 + bnez $s6, full_column + + sw $t0, 0($a2) # wsptr[DCTSIZE*0] + sw $t0, 16($a2) # wsptr[DCTSIZE*1] + sw $t0, 32($a2) # wsptr[DCTSIZE*2] + sw $t0, 48($a2) # wsptr[DCTSIZE*3] + sw $t0, 64($a2) # wsptr[DCTSIZE*4] + sw $t0, 80($a2) # wsptr[DCTSIZE*5] + sw $t0, 96($a2) # wsptr[DCTSIZE*6] + sw $t0, 112($a2) # wsptr[DCTSIZE*7] + + addiu $a0, $a0, 4 + + b continue_columns + addiu $a1, $a1, 4 + + +full_column: + + lw $s1, 32($a1) # quantptr[DCTSIZE*2] + lw $s2, 64($a1) # quantptr[DCTSIZE*4] + + muleq_s.w.phl $v0, $t2, $s1 # tmp1 ... + muleq_s.w.phr $t2, $t2, $s1 # ... tmp1 ... + + lw $s0, 16($a1) # quantptr[DCTSIZE*1] + lw $s1, 48($a1) # quantptr[DCTSIZE*3] + lw $s3, 96($a1) # quantptr[DCTSIZE*6] + + muleq_s.w.phl $v1, $t4, $s2 # tmp2 ... + muleq_s.w.phr $t4, $t4, $s2 # ... tmp2 ... + + lw $s2, 80($a1) # quantptr[DCTSIZE*5] + lw $t8, 4($at) # FIX(1.414213562) + ins $t2, $v0, 16, 16 # ... tmp1 + + muleq_s.w.phl $v0, $t6, $s3 # tmp3 ... + muleq_s.w.phr $t6, $t6, $s3 # ... tmp3 ... + + ins $t4, $v1, 16, 16 # ... tmp2 + + addq.ph $s4, $t0, $t4 # tmp10 + subq.ph $s5, $t0, $t4 # tmp11 + + ins $t6, $v0, 16, 16 # ... tmp3 + + subq.ph $s6, $t2, $t6 # tmp12 ... + addq.ph $s7, $t2, $t6 # tmp13 + + mulq_rs.ph $s6, $s6, $t8 # ... tmp12 ... + + addq.ph $t0, $s4, $s7 # tmp0 + subq.ph $t6, $s4, $s7 # tmp3 + +################ + + muleq_s.w.phl $v0, $t1, $s0 # tmp4 ... + muleq_s.w.phr $t1, $t1, $s0 # ... tmp4 ... + + shll_s.ph $s6, $s6, 1 # x2 + + lw $s3, 112($a1) # quantptr[DCTSIZE*7] + + subq.ph $s6, $s6, $s7 # ... 
tmp12 + + muleq_s.w.phl $v1, $t7, $s3 # tmp7 ... + muleq_s.w.phr $t7, $t7, $s3 # ... tmp7 ... + + ins $t1, $v0, 16, 16 # ... tmp4 + + addq.ph $t2, $s5, $s6 # tmp1 + subq.ph $t4, $s5, $s6 # tmp2 + + muleq_s.w.phl $v0, $t5, $s2 # tmp6 ... + muleq_s.w.phr $t5, $t5, $s2 # ... tmp6 ... + + ins $t7, $v1, 16, 16 # ... tmp7 + + addq.ph $s5, $t1, $t7 # z11 + subq.ph $s6, $t1, $t7 # z12 + + muleq_s.w.phl $v1, $t3, $s1 # tmp5 ... + muleq_s.w.phr $t3, $t3, $s1 # ... tmp5 ... + + ins $t5, $v0, 16, 16 # ... tmp6 + +# stalls + + ins $t3, $v1, 16, 16 # ... tmp5 + + + addq.ph $s7, $t5, $t3 # z13 + subq.ph $v0, $t5, $t3 # z10 + + addq.ph $t7, $s5, $s7 # tmp7 + subq.ph $s5, $s5, $s7 # tmp11 ... + + addq.ph $v1, $v0, $s6 # z5 ... + + mulq_rs.ph $s5, $s5, $t8 # ... tmp11 + + lw $t8, 8($at) # FIX(1.847759065) + lw $s4, 0($at) # FIX(1.082392200) + + addq.ph $s0, $t0, $t7 + subq.ph $s1, $t0, $t7 + + mulq_rs.ph $v1, $v1, $t8 # ... z5 + + shll_s.ph $s5, $s5, 1 # x2 + + lw $t8, 12($at) # FIX(-2.613125930) + sw $s0, 0($a2) # wsptr[DCTSIZE*0] + + mulq_rs.ph $v0, $v0, $t8 # tmp12 ... + mulq_rs.ph $s4, $s6, $s4 # tmp10 ... + + shll_s.ph $v1, $v1, 1 # x2 + + addiu $a0, $a0, 4 + addiu $a1, $a1, 4 + + sw $s1, 112($a2) # wsptr[DCTSIZE*7] + + shll_s.ph $s6, $v0, 2 # x4 + shll_s.ph $s4, $s4, 1 # x2 + addq.ph $s6, $s6, $v1 # ... tmp12 + + subq.ph $t5, $s6, $t7 # tmp6 + subq.ph $s4, $s4, $v1 # ... 
tmp10 + subq.ph $t3, $s5, $t5 # tmp5 + addq.ph $s2, $t2, $t5 + addq.ph $t1, $s4, $t3 # tmp4 + subq.ph $s3, $t2, $t5 + + sw $s2, 16($a2) # wsptr[DCTSIZE*1] + sw $s3, 96($a2) # wsptr[DCTSIZE*6] + + addq.ph $v0, $t4, $t3 + subq.ph $v1, $t4, $t3 + + sw $v0, 32($a2) # wsptr[DCTSIZE*2] + sw $v1, 80($a2) # wsptr[DCTSIZE*5] + + addq.ph $v0, $t6, $t1 + subq.ph $v1, $t6, $t1 + + sw $v0, 64($a2) # wsptr[DCTSIZE*4] + sw $v1, 48($a2) # wsptr[DCTSIZE*3] + +continue_columns: + + bne $a0, $t9, loop_columns + addiu $a2, $a2, 4 + + + lw $s0, 28($sp) + lw $s1, 24($sp) + lw $s2, 20($sp) + lw $s3, 16($sp) + lw $s4, 12($sp) + lw $s5, 8($sp) + lw $s6, 4($sp) + lw $s7, 0($sp) + + jr $ra + addiu $sp, $sp, 32 + + + .end mips_idct_columns + + +################################################################## + + + .global mips_idct_rows + .ent mips_idct_rows + +# void mips_idct_rows(DCTELEM * wsptr, JSAMPARRAY output_buf, +# JDIMENSION output_col, const int * mips_idct_coefs); + +mips_idct_rows: + +# $a0 - wsptr +# $a1 - output_buf +# $a2 - output_col +# $a3 - outptr +# $a3, $at - mips_idct_coefs +# $t0:7 - simd data +# $t8 - coefficients, temp +# $t9 - loop end address +# $s0:3 - simd quantization factors +# $s4:7 - temp results +# s8 - const 0x80808080 +# $v0:1 - temp results + +SHIFT = 2 + + addiu $sp, $sp, -48 # reserve stack space for s0-s8 + + # save $a3 (mips_idct_coefs) because it might get clobbered below + sw $a3, 36($sp) + + sw $s0, 32($sp) + sw $s1, 28($sp) + sw $s2, 24($sp) + sw $s3, 20($sp) + sw $s4, 16($sp) + sw $s5, 12($sp) + sw $s6, 8($sp) + sw $s7, 4($sp) + sw $s8, 0($sp) + + addiu $t9, $a0, 128 # end address + + lui $s8, 0x8080 + ori $s8, $s8, 0x8080 + +loop_rows: + + lw $at, 36($sp) # restore saved $a3 (mips_idct_coefs) + + lw $t0, 0+0($a0) # wsptr[DCTSIZE*0+0/1] b a + lw $s0, 16+0($a0) # wsptr[DCTSIZE*1+0/1] B A + lw $t2, 0+4($a0) # wsptr[DCTSIZE*0+2/3] d c + lw $s2, 16+4($a0) # wsptr[DCTSIZE*1+2/3] D C + lw $t4, 0+8($a0) # wsptr[DCTSIZE*0+4/5] f e + lw $s4, 16+8($a0) # 
wsptr[DCTSIZE*1+4/5] F E + lw $t6, 0+12($a0) # wsptr[DCTSIZE*0+6/7] h g + lw $s6, 16+12($a0) # wsptr[DCTSIZE*1+6/7] H G + + precrq.ph.w $t1, $s0, $t0 # B b + ins $t0, $s0, 16, 16 # A a + + bnez $t1, full_row + or $s0, $t2, $s2 + bnez $s0, full_row + or $s0, $t4, $s4 + bnez $s0, full_row + or $s0, $t6, $s6 + bnez $s0, full_row + + shll_s.ph $s0, $t0, SHIFT # A a + + lw $a3, 0($a1) + lw $at, 4($a1) + + precrq.ph.w $t0, $s0, $s0 # A A + ins $s0, $s0, 16, 16 # a a + + addu $a3, $a3, $a2 + addu $at, $at, $a2 + + precrq.qb.ph $t0, $t0, $t0 # A A A A + precrq.qb.ph $s0, $s0, $s0 # a a a a + + + addu.qb $s0, $s0, $s8 + addu.qb $t0, $t0, $s8 + + + sw $s0, 0($a3) + sw $s0, 4($a3) + + sw $t0, 0($at) + sw $t0, 4($at) + + + addiu $a0, $a0, 32 + + bne $a0, $t9, loop_rows + addiu $a1, $a1, 8 + + b exit_rows + nop + + +full_row: + + precrq.ph.w $t3, $s2, $t2 + ins $t2, $s2, 16, 16 + + precrq.ph.w $t5, $s4, $t4 + ins $t4, $s4, 16, 16 + + precrq.ph.w $t7, $s6, $t6 + ins $t6, $s6, 16, 16 + + + lw $t8, 4($at) # FIX(1.414213562) + + addq.ph $s4, $t0, $t4 # tmp10 + subq.ph $s5, $t0, $t4 # tmp11 + + subq.ph $s6, $t2, $t6 # tmp12 ... + addq.ph $s7, $t2, $t6 # tmp13 + + mulq_rs.ph $s6, $s6, $t8 # ... tmp12 ... + + addq.ph $t0, $s4, $s7 # tmp0 + subq.ph $t6, $s4, $s7 # tmp3 + + shll_s.ph $s6, $s6, 1 # x2 + + subq.ph $s6, $s6, $s7 # ... tmp12 + + addq.ph $t2, $s5, $s6 # tmp1 + subq.ph $t4, $s5, $s6 # tmp2 + +################ + + addq.ph $s5, $t1, $t7 # z11 + subq.ph $s6, $t1, $t7 # z12 + + addq.ph $s7, $t5, $t3 # z13 + subq.ph $v0, $t5, $t3 # z10 + + addq.ph $t7, $s5, $s7 # tmp7 + subq.ph $s5, $s5, $s7 # tmp11 ... + + addq.ph $v1, $v0, $s6 # z5 ... + + mulq_rs.ph $s5, $s5, $t8 # ... tmp11 + + lw $t8, 8($at) # FIX(1.847759065) + lw $s4, 0($at) # FIX(1.082392200) + + addq.ph $s0, $t0, $t7 # tmp0 + tmp7 + subq.ph $s7, $t0, $t7 # tmp0 - tmp7 + + mulq_rs.ph $v1, $v1, $t8 # ... 
z5 + + lw $a3, 0($a1) + lw $t8, 12($at) # FIX(-2.613125930) + + shll_s.ph $s5, $s5, 1 # x2 + + addu $a3, $a3, $a2 + + mulq_rs.ph $v0, $v0, $t8 # tmp12 ... + mulq_rs.ph $s4, $s6, $s4 # tmp10 ... + + shll_s.ph $v1, $v1, 1 # x2 + + addiu $a0, $a0, 32 + addiu $a1, $a1, 8 + + + shll_s.ph $s6, $v0, 2 # x4 + shll_s.ph $s4, $s4, 1 # x2 + addq.ph $s6, $s6, $v1 # ... tmp12 + + shll_s.ph $s0, $s0, SHIFT + + subq.ph $t5, $s6, $t7 # tmp6 + subq.ph $s4, $s4, $v1 # ... tmp10 + subq.ph $t3, $s5, $t5 # tmp5 + + shll_s.ph $s7, $s7, SHIFT + + addq.ph $t1, $s4, $t3 # tmp4 + + + addq.ph $s1, $t2, $t5 # tmp1 + tmp6 + subq.ph $s6, $t2, $t5 # tmp1 - tmp6 + + addq.ph $s2, $t4, $t3 # tmp2 + tmp5 + subq.ph $s5, $t4, $t3 # tmp2 - tmp5 + + addq.ph $s4, $t6, $t1 # tmp3 + tmp4 + subq.ph $s3, $t6, $t1 # tmp3 - tmp4 + + + shll_s.ph $s1, $s1, SHIFT + shll_s.ph $s2, $s2, SHIFT + shll_s.ph $s3, $s3, SHIFT + shll_s.ph $s4, $s4, SHIFT + shll_s.ph $s5, $s5, SHIFT + shll_s.ph $s6, $s6, SHIFT + + + precrq.ph.w $t0, $s1, $s0 # B A + ins $s0, $s1, 16, 16 # b a + + precrq.ph.w $t2, $s3, $s2 # D C + ins $s2, $s3, 16, 16 # d c + + precrq.ph.w $t4, $s5, $s4 # F E + ins $s4, $s5, 16, 16 # f e + + precrq.ph.w $t6, $s7, $s6 # H G + ins $s6, $s7, 16, 16 # h g + + precrq.qb.ph $t0, $t2, $t0 # D C B A + precrq.qb.ph $s0, $s2, $s0 # d c b a + + precrq.qb.ph $t4, $t6, $t4 # H G F E + precrq.qb.ph $s4, $s6, $s4 # h g f e + + + addu.qb $s0, $s0, $s8 + addu.qb $s4, $s4, $s8 + + + sw $s0, 0($a3) # outptr[0/1/2/3] d c b a + sw $s4, 4($a3) # outptr[4/5/6/7] h g f e + + lw $a3, -4($a1) + + addu.qb $t0, $t0, $s8 + + addu $a3, $a3, $a2 + + addu.qb $t4, $t4, $s8 + + + sw $t0, 0($a3) # outptr[0/1/2/3] D C B A + + bne $a0, $t9, loop_rows + sw $t4, 4($a3) # outptr[4/5/6/7] H G F E + + +exit_rows: + + lw $s0, 32($sp) + lw $s1, 28($sp) + lw $s2, 24($sp) + lw $s3, 20($sp) + lw $s4, 16($sp) + lw $s5, 12($sp) + lw $s6, 8($sp) + lw $s7, 4($sp) + lw $s8, 0($sp) + + jr $ra + addiu $sp, $sp, 48 + + + .end mips_idct_rows diff --git 
a/mips_jidctfst.c b/mips_jidctfst.c new file mode 100644 index 0000000..1207575 --- /dev/null +++ b/mips_jidctfst.c @@ -0,0 +1,208 @@ +/* + * IDCT implementation using the MIPS DSP ASE (little endian version) + * + * jidctfst.c + * + * Copyright (C) 1994-1998, Thomas G. Lane. + * This file is part of the Independent JPEG Group's software. + * For conditions of distribution and use, see the accompanying README file. + * + * This file contains a fast, not so accurate integer implementation of the + * inverse DCT (Discrete Cosine Transform). In the IJG code, this routine + * must also perform dequantization of the input coefficients. + * + * A 2-D IDCT can be done by 1-D IDCT on each column followed by 1-D IDCT + * on each row (or vice versa, but it's more convenient to emit a row at + * a time). Direct algorithms are also available, but they are much more + * complex and seem not to be any faster when reduced to code. + * + * This implementation is based on Arai, Agui, and Nakajima's algorithm for + * scaled DCT. Their original paper (Trans. IEICE E-71(11):1095) is in + * Japanese, but the algorithm is described in the Pennebaker & Mitchell + * JPEG textbook (see REFERENCES section in file README). The following code + * is based directly on figure 4-8 in P&M. + * While an 8-point DCT cannot be done in less than 11 multiplies, it is + * possible to arrange the computation so that many of the multiplies are + * simple scalings of the final outputs. These multiplies can then be + * folded into the multiplications or divisions by the JPEG quantization + * table entries. The AA&N method leaves only 5 multiplies and 29 adds + * to be done in the DCT itself. + * The primary disadvantage of this method is that with fixed-point math, + * accuracy is lost due to imprecise representation of the scaled + * quantization values. 
The smaller the quantization table entry, the less + * precise the scaled value, so this implementation does worse with high- + * quality-setting files than with low-quality ones. + */ + +#define JPEG_INTERNALS +#include "jinclude.h" +#include "jpeglib.h" +#include "jdct.h" /* Private declarations for DCT subsystem */ + +#ifdef DCT_IFAST_SUPPORTED + + +/* + * This module is specialized to the case DCTSIZE = 8. + */ + +#if DCTSIZE != 8 + Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */ +#endif + + +/* Scaling decisions are generally the same as in the LL&M algorithm; + * see jidctint.c for more details. However, we choose to descale + * (right shift) multiplication products as soon as they are formed, + * rather than carrying additional fractional bits into subsequent additions. + * This compromises accuracy slightly, but it lets us save a few shifts. + * More importantly, 16-bit arithmetic is then adequate (for 8-bit samples) + * everywhere except in the multiplications proper; this saves a good deal + * of work on 16-bit-int machines. + * + * The dequantized coefficients are not integers because the AA&N scaling + * factors have been incorporated. We represent them scaled up by PASS1_BITS, + * so that the first and second IDCT rounds have the same input scaling. + * For 8-bit JSAMPLEs, we choose IFAST_SCALE_BITS = PASS1_BITS so as to + * avoid a descaling shift; this compromises accuracy rather drastically + * for small quantization table entries, but it saves a lot of shifts. + * For 12-bit JSAMPLEs, there's no hope of using 16x16 multiplies anyway, + * so we use a much larger scaling factor to preserve accuracy. + * + * A final compromise is to represent the multiplicative constants to only + * 8 fractional bits, rather than 13. This saves some shifting work on some + * machines, and may also reduce the cost of multiplication (since there + * are fewer one-bits in the constants). 
+ */ + +#if BITS_IN_JSAMPLE == 8 +#define CONST_BITS 8 +#define PASS1_BITS 2 +#else +#define CONST_BITS 8 +#define PASS1_BITS 1 /* lose a little precision to avoid overflow */ +#endif + +/* Some C compilers fail to reduce "FIX(constant)" at compile time, thus + * causing a lot of useless floating-point operations at run time. + * To get around this we use the following pre-calculated constants. + * If you change CONST_BITS you may want to add appropriate values. + * (With a reasonable C compiler, you can just rely on the FIX() macro...) + */ + +#if CONST_BITS == 8 +#define FIX_1_082392200 ((INT32) 277) /* FIX(1.082392200) */ +#define FIX_1_414213562 ((INT32) 362) /* FIX(1.414213562) */ +#define FIX_1_847759065 ((INT32) 473) /* FIX(1.847759065) */ +#define FIX_2_613125930 ((INT32) 669) /* FIX(2.613125930) */ +#else +#define FIX_1_082392200 FIX(1.082392200) +#define FIX_1_414213562 FIX(1.414213562) +#define FIX_1_847759065 FIX(1.847759065) +#define FIX_2_613125930 FIX(2.613125930) +#endif + + +/* We can gain a little more speed, with a further compromise in accuracy, + * by omitting the addition in a descaling shift. This yields an incorrectly + * rounded result half the time... + */ + +#ifndef USE_ACCURATE_ROUNDING +#undef DESCALE +#define DESCALE(x,n) RIGHT_SHIFT(x, n) +#endif + + +/* Multiply a DCTELEM variable by an INT32 constant, and immediately + * descale to yield a DCTELEM result. + */ + +#define MULTIPLY(var,const) ((DCTELEM) DESCALE((var) * (const), CONST_BITS)) + + +/* Dequantize a coefficient by multiplying it by the multiplier-table + * entry; produce a DCTELEM result. For 8-bit data a 16x16->16 + * multiplication will do. For 12-bit data, the multiplier table is + * declared INT32, so a 32-bit multiply will be used. 
+ */ + +#if BITS_IN_JSAMPLE == 8 +#define DEQUANTIZE(coef,quantval) (((IFAST_MULT_TYPE) (coef)) * (quantval)) +#else +#define DEQUANTIZE(coef,quantval) \ + DESCALE((coef)*(quantval), IFAST_SCALE_BITS-PASS1_BITS) +#endif + + +/* Like DESCALE, but applies to a DCTELEM and produces an int. + * We assume that int right shift is unsigned if INT32 right shift is. + */ + +#ifdef RIGHT_SHIFT_IS_UNSIGNED +#define ISHIFT_TEMPS DCTELEM ishift_temp; +#if BITS_IN_JSAMPLE == 8 +#define DCTELEMBITS 16 /* DCTELEM may be 16 or 32 bits */ +#else +#define DCTELEMBITS 32 /* DCTELEM must be 32 bits */ +#endif +#define IRIGHT_SHIFT(x,shft) \ + ((ishift_temp = (x)) < 0 ? \ + (ishift_temp >> (shft)) | ((~((DCTELEM) 0)) << (DCTELEMBITS-(shft))) : \ + (ishift_temp >> (shft))) +#else +#define ISHIFT_TEMPS +#define IRIGHT_SHIFT(x,shft) ((x) >> (shft)) +#endif + +#ifdef USE_ACCURATE_ROUNDING +#define IDESCALE(x,n) ((int) IRIGHT_SHIFT((x) + (1 << ((n)-1)), n)) +#else +#define IDESCALE(x,n) ((int) IRIGHT_SHIFT(x, n)) +#endif + + +// this table of constants has been moved from mips_idct_le/_be.s to +// avoid having to make the assembler code position independent +static const int mips_idct_coefs[4] = { + 0x45464546, // FIX( 1.082392200 / 2) = 17734 = 0x4546 + 0x5A825A82, // FIX( 1.414213562 / 2) = 23170 = 0x5A82 + 0x76427642, // FIX( 1.847759065 / 2) = 30274 = 0x7642 + 0xAC61AC61 // FIX(-2.613125930 / 4) = -21407 = 0xAC61 +}; + +void mips_idct_columns(JCOEF * inptr, IFAST_MULT_TYPE * quantptr, + DCTELEM * wsptr, const int * mips_idct_coefs); +void mips_idct_rows(DCTELEM * wsptr, JSAMPARRAY output_buf, + JDIMENSION output_col, const int * mips_idct_coefs); + + +/* + * Perform dequantization and inverse DCT on one block of coefficients. 
+ *
+ * Both passes are carried out by the assembly routines mips_idct_columns
+ * and mips_idct_rows, which share the mips_idct_coefs constant table.
+ *
+ * cinfo      - not used here; kept so the signature matches the other
+ *              jpeg_idct_* routines selected in jddctmgr.c
+ * compptr    - component whose dct_table holds the multiplier table
+ * coef_block - the DCTSIZE2 input coefficients of this block
+ * output_buf - array of output row pointers
+ * output_col - column offset added to every output row pointer
+ */
+
+GLOBAL(void)
+jpeg_idct_mips (j_decompress_ptr cinfo, jpeg_component_info * compptr,
+		JCOEFPTR coef_block,
+		JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  IFAST_MULT_TYPE * quantptr = (IFAST_MULT_TYPE *) compptr->dct_table;
+
+  /* Both assembly passes read and write the workspace with 32-bit lw/sw
+   * (two 16-bit elements per access), so it has to be word aligned.
+   * A plain array of DCTELEM (a short when ANDROID_MIPS_IDCT is defined)
+   * only promises 2-byte alignment; the INT32 member states the real
+   * requirement instead of relying on the compiler's stack layout.
+   */
+  union {
+    DCTELEM elem[DCTSIZE2];	/* buffers data between passes */
+    INT32 force_align;		/* never accessed; alignment only */
+  } workspace;
+
+  /* Pass 1: process columns from input, store into work array. */
+
+  mips_idct_columns(coef_block, quantptr, workspace.elem, mips_idct_coefs);
+
+  /* Pass 2: process rows from work array, store into output array. */
+  /* Note that we must descale the results by a factor of 8 == 2**3, */
+  /* and also undo the PASS1_BITS scaling.  Nothing further is done */
+  /* here after the call, so all of that is left to mips_idct_rows. */
+
+  mips_idct_rows(workspace.elem, output_buf, output_col, mips_idct_coefs);
+}
+
+#endif /* DCT_IFAST_SUPPORTED */
|