diff options
author | noel@chromium.org <noel@chromium.org@4ff67af0-8c30-449e-8e8b-ad334ec8d88c> | 2014-05-23 23:38:59 +0000 |
---|---|---|
committer | noel@chromium.org <noel@chromium.org@4ff67af0-8c30-449e-8e8b-ad334ec8d88c> | 2014-05-23 23:38:59 +0000 |
commit | 841fff8cddd73c0d6b966902f83bea7ad366bd4b (patch) | |
tree | e6fcd3fcd0400cbd15dff946aaf9e46e82bafbef | |
parent | 3395bcc26e390d2960d15020d4a4d27ae0c122fe (diff) | |
download | libjpeg_turbo-841fff8cddd73c0d6b966902f83bea7ad366bd4b.tar.gz |
Update libjpeg_turbo to use clz for bitcounting on ARM
Cherry-picked r1220 from upstream:
Use clz/bsr instructions on ARM for bit counting rather than the lookup table (reduces memory footprint and can improve performance in some cases.)
Upstream review:
http://sourceforge.net/p/libjpeg-turbo/patches/57/
Original review:
https://codereview.appspot.com/77480045/
Removing the lookup table saves 64k of data for each process that uses jpeg encoding. Benchmarks on a few ARM devices show encoding performance changes ranging from a slowdown of 3-4% on some devices to a speedup of 10-20% on other devices. On average, performance improves.
x86 will still use the lookup table because the bsr instruction was shown to be slower on some chips.
BUG=
R=noel@chromium.org
Review URL: https://codereview.appspot.com/97690043
git-svn-id: http://src.chromium.org/svn/trunk/deps/third_party/libjpeg_turbo@272637 4ff67af0-8c30-449e-8e8b-ad334ec8d88c
-rw-r--r-- | README.chromium | 1 | ||||
-rw-r--r-- | google.patch | 75 | ||||
-rw-r--r-- | jchuff.c | 34 |
3 files changed, 108 insertions, 2 deletions
diff --git a/README.chromium b/README.chromium index a595954..b01cb0a 100644 --- a/README.chromium +++ b/README.chromium @@ -9,6 +9,7 @@ Description: This consists of the components: * A partial copy of libjpeg-turbo 1.3.1 (r1219); * Revision r1188 cherry-picked from upstream trunk into config.h; +* Revision r1220 cherry-picked from upstream trunk into jchuff.c; * A build file (libjpeg.gyp), and; * Patched header files used by Chromium. diff --git a/google.patch b/google.patch index 9fcfe9b..8a9179d 100644 --- a/google.patch +++ b/google.patch @@ -1767,3 +1767,78 @@ Index: jdhuff.c METHODDEF(boolean) decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data) +Index: jchuff.c +=================================================================== +--- jchuff.c (revision 1219) ++++ jchuff.c (revision 1220) +@@ -22,8 +22,36 @@ + #include "jchuff.h" /* Declarations shared with jcphuff.c */ + #include <limits.h> + ++/* ++ * NOTE: If USE_CLZ_INTRINSIC is defined, then clz/bsr instructions will be ++ * used for bit counting rather than the lookup table. This will reduce the ++ * memory footprint by 64k, which is important for some mobile applications ++ * that create many isolated instances of libjpeg-turbo (web browsers, for ++ * instance.) This may improve performance on some mobile platforms as well. ++ * This feature is enabled by default only on ARM processors, because some x86 ++ * chips have a slow implementation of bsr, and the use of clz/bsr cannot be ++ * shown to have a significant performance impact even on the x86 chips that ++ * have a fast implementation of it. When building for ARMv6, you can ++ * explicitly disable the use of clz/bsr by adding -mthumb to the compiler ++ * flags (this defines __thumb__). 
++ */ ++ ++/* NOTE: Both GCC and Clang define __GNUC__ */ ++#if defined __GNUC__ && defined __arm__ ++#if !defined __thumb__ || defined __thumb2__ ++#define USE_CLZ_INTRINSIC ++#endif ++#endif ++ ++#ifdef USE_CLZ_INTRINSIC ++#define JPEG_NBITS_NONZERO(x) (32 - __builtin_clz(x)) ++#define JPEG_NBITS(x) (x ? JPEG_NBITS_NONZERO(x) : 0) ++#else + static unsigned char jpeg_nbits_table[65536]; + static int jpeg_nbits_table_init = 0; ++#define JPEG_NBITS(x) (jpeg_nbits_table[x]) ++#define JPEG_NBITS_NONZERO(x) JPEG_NBITS(x) ++#endif + + #ifndef min + #define min(a,b) ((a)<(b)?(a):(b)) +@@ -272,6 +300,7 @@ + dtbl->ehufsi[i] = huffsize[p]; + } + ++#ifndef USE_CLZ_INTRINSIC + if(!jpeg_nbits_table_init) { + for(i = 0; i < 65536; i++) { + int nbits = 0, temp = i; +@@ -280,6 +309,7 @@ + } + jpeg_nbits_table_init = 1; + } ++#endif + } + + +@@ -482,7 +512,7 @@ + temp2 += temp3; + + /* Find the number of bits needed for the magnitude of the coefficient */ +- nbits = jpeg_nbits_table[temp]; ++ nbits = JPEG_NBITS(temp); + + /* Emit the Huffman-coded symbol for the number of bits */ + code = dctbl->ehufco[nbits]; +@@ -516,7 +546,7 @@ + temp ^= temp3; \ + temp -= temp3; \ + temp2 += temp3; \ +- nbits = jpeg_nbits_table[temp]; \ ++ nbits = JPEG_NBITS_NONZERO(temp); \ + /* if run length > 15, must emit special run-length-16 codes (0xF0) */ \ + while (r > 15) { \ + EMIT_BITS(code_0xf0, size_0xf0) \ @@ -22,8 +22,36 @@ #include "jchuff.h" /* Declarations shared with jcphuff.c */ #include <limits.h> +/* + * NOTE: If USE_CLZ_INTRINSIC is defined, then clz/bsr instructions will be + * used for bit counting rather than the lookup table. This will reduce the + * memory footprint by 64k, which is important for some mobile applications + * that create many isolated instances of libjpeg-turbo (web browsers, for + * instance.) This may improve performance on some mobile platforms as well. 
+ * This feature is enabled by default only on ARM processors, because some x86 + * chips have a slow implementation of bsr, and the use of clz/bsr cannot be + * shown to have a significant performance impact even on the x86 chips that + * have a fast implementation of it. When building for ARMv6, you can + * explicitly disable the use of clz/bsr by adding -mthumb to the compiler + * flags (this defines __thumb__). + */ + +/* NOTE: Both GCC and Clang define __GNUC__ */ +#if defined __GNUC__ && defined __arm__ +#if !defined __thumb__ || defined __thumb2__ +#define USE_CLZ_INTRINSIC +#endif +#endif + +#ifdef USE_CLZ_INTRINSIC +#define JPEG_NBITS_NONZERO(x) (32 - __builtin_clz(x)) +#define JPEG_NBITS(x) (x ? JPEG_NBITS_NONZERO(x) : 0) +#else static unsigned char jpeg_nbits_table[65536]; static int jpeg_nbits_table_init = 0; +#define JPEG_NBITS(x) (jpeg_nbits_table[x]) +#define JPEG_NBITS_NONZERO(x) JPEG_NBITS(x) +#endif #ifndef min #define min(a,b) ((a)<(b)?(a):(b)) @@ -272,6 +300,7 @@ jpeg_make_c_derived_tbl (j_compress_ptr cinfo, boolean isDC, int tblno, dtbl->ehufsi[i] = huffsize[p]; } +#ifndef USE_CLZ_INTRINSIC if(!jpeg_nbits_table_init) { for(i = 0; i < 65536; i++) { int nbits = 0, temp = i; @@ -280,6 +309,7 @@ jpeg_make_c_derived_tbl (j_compress_ptr cinfo, boolean isDC, int tblno, } jpeg_nbits_table_init = 1; } +#endif } @@ -482,7 +512,7 @@ encode_one_block (working_state * state, JCOEFPTR block, int last_dc_val, temp2 += temp3; /* Find the number of bits needed for the magnitude of the coefficient */ - nbits = jpeg_nbits_table[temp]; + nbits = JPEG_NBITS(temp); /* Emit the Huffman-coded symbol for the number of bits */ code = dctbl->ehufco[nbits]; @@ -516,7 +546,7 @@ encode_one_block (working_state * state, JCOEFPTR block, int last_dc_val, temp ^= temp3; \ temp -= temp3; \ temp2 += temp3; \ - nbits = jpeg_nbits_table[temp]; \ + nbits = JPEG_NBITS_NONZERO(temp); \ /* if run length > 15, must emit special run-length-16 codes (0xF0) */ \ while (r > 15) { \ 
EMIT_BITS(code_0xf0, size_0xf0) \ |