aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author noel@chromium.org <noel@chromium.org@4ff67af0-8c30-449e-8e8b-ad334ec8d88c> 2014-05-23 23:38:59 +0000
committer noel@chromium.org <noel@chromium.org@4ff67af0-8c30-449e-8e8b-ad334ec8d88c> 2014-05-23 23:38:59 +0000
commit 841fff8cddd73c0d6b966902f83bea7ad366bd4b (patch)
tree e6fcd3fcd0400cbd15dff946aaf9e46e82bafbef
parent 3395bcc26e390d2960d15020d4a4d27ae0c122fe (diff)
download libjpeg_turbo-841fff8cddd73c0d6b966902f83bea7ad366bd4b.tar.gz
Update libjpeg_turbo to use clz for bitcounting on ARM
Cherry-picked r1220 from upstream: Use clz/bsr instructions on ARM for bit counting rather than the lookup table (reduces memory footprint and can improve performance in some cases). Upstream review: http://sourceforge.net/p/libjpeg-turbo/patches/57/ Original review: https://codereview.appspot.com/77480045/ Removing the lookup table saves 64k of data for each process that uses jpeg encoding. Benchmarks on a few ARM devices show encoding performance changes ranging from a slowdown of 3-4% on some devices to a speedup of 10-20% on other devices. On average, performance improves. x86 will still use the lookup table because the bsr instruction was shown to be slower on some chips. BUG= R=noel@chromium.org Review URL: https://codereview.appspot.com/97690043 git-svn-id: http://src.chromium.org/svn/trunk/deps/third_party/libjpeg_turbo@272637 4ff67af0-8c30-449e-8e8b-ad334ec8d88c
-rw-r--r-- README.chromium 1
-rw-r--r-- google.patch 75
-rw-r--r-- jchuff.c 34
3 files changed, 108 insertions, 2 deletions
diff --git a/README.chromium b/README.chromium
index a595954..b01cb0a 100644
--- a/README.chromium
+++ b/README.chromium
@@ -9,6 +9,7 @@ Description:
This consists of the components:
* A partial copy of libjpeg-turbo 1.3.1 (r1219);
* Revision r1188 cherry-picked from upstream trunk into config.h;
+* Revision r1220 cherry-picked from upstream trunk into jchuff.c;
* A build file (libjpeg.gyp), and;
* Patched header files used by Chromium.
diff --git a/google.patch b/google.patch
index 9fcfe9b..8a9179d 100644
--- a/google.patch
+++ b/google.patch
@@ -1767,3 +1767,78 @@ Index: jdhuff.c
METHODDEF(boolean)
decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
+Index: jchuff.c
+===================================================================
+--- jchuff.c (revision 1219)
++++ jchuff.c (revision 1220)
+@@ -22,8 +22,36 @@
+ #include "jchuff.h" /* Declarations shared with jcphuff.c */
+ #include <limits.h>
+
++/*
++ * NOTE: If USE_CLZ_INTRINSIC is defined, then clz/bsr instructions will be
++ * used for bit counting rather than the lookup table. This will reduce the
++ * memory footprint by 64k, which is important for some mobile applications
++ * that create many isolated instances of libjpeg-turbo (web browsers, for
++ * instance.) This may improve performance on some mobile platforms as well.
++ * This feature is enabled by default only on ARM processors, because some x86
++ * chips have a slow implementation of bsr, and the use of clz/bsr cannot be
++ * shown to have a significant performance impact even on the x86 chips that
++ * have a fast implementation of it. When building for ARMv6, you can
++ * explicitly disable the use of clz/bsr by adding -mthumb to the compiler
++ * flags (this defines __thumb__).
++ */
++
++/* NOTE: Both GCC and Clang define __GNUC__ */
++#if defined __GNUC__ && defined __arm__
++#if !defined __thumb__ || defined __thumb2__
++#define USE_CLZ_INTRINSIC
++#endif
++#endif
++
++#ifdef USE_CLZ_INTRINSIC
++#define JPEG_NBITS_NONZERO(x) (32 - __builtin_clz(x))
++#define JPEG_NBITS(x) (x ? JPEG_NBITS_NONZERO(x) : 0)
++#else
+ static unsigned char jpeg_nbits_table[65536];
+ static int jpeg_nbits_table_init = 0;
++#define JPEG_NBITS(x) (jpeg_nbits_table[x])
++#define JPEG_NBITS_NONZERO(x) JPEG_NBITS(x)
++#endif
+
+ #ifndef min
+ #define min(a,b) ((a)<(b)?(a):(b))
+@@ -272,6 +300,7 @@
+ dtbl->ehufsi[i] = huffsize[p];
+ }
+
++#ifndef USE_CLZ_INTRINSIC
+ if(!jpeg_nbits_table_init) {
+ for(i = 0; i < 65536; i++) {
+ int nbits = 0, temp = i;
+@@ -280,6 +309,7 @@
+ }
+ jpeg_nbits_table_init = 1;
+ }
++#endif
+ }
+
+
+@@ -482,7 +512,7 @@
+ temp2 += temp3;
+
+ /* Find the number of bits needed for the magnitude of the coefficient */
+- nbits = jpeg_nbits_table[temp];
++ nbits = JPEG_NBITS(temp);
+
+ /* Emit the Huffman-coded symbol for the number of bits */
+ code = dctbl->ehufco[nbits];
+@@ -516,7 +546,7 @@
+ temp ^= temp3; \
+ temp -= temp3; \
+ temp2 += temp3; \
+- nbits = jpeg_nbits_table[temp]; \
++ nbits = JPEG_NBITS_NONZERO(temp); \
+ /* if run length > 15, must emit special run-length-16 codes (0xF0) */ \
+ while (r > 15) { \
+ EMIT_BITS(code_0xf0, size_0xf0) \
diff --git a/jchuff.c b/jchuff.c
index 29bf389..68e4e0e 100644
--- a/jchuff.c
+++ b/jchuff.c
@@ -22,8 +22,36 @@
#include "jchuff.h" /* Declarations shared with jcphuff.c */
#include <limits.h>
+/*
+ * NOTE: If USE_CLZ_INTRINSIC is defined, then clz/bsr instructions will be
+ * used for bit counting rather than the lookup table. This will reduce the
+ * memory footprint by 64k, which is important for some mobile applications
+ * that create many isolated instances of libjpeg-turbo (web browsers, for
+ * instance.) This may improve performance on some mobile platforms as well.
+ * This feature is enabled by default only on ARM processors, because some x86
+ * chips have a slow implementation of bsr, and the use of clz/bsr cannot be
+ * shown to have a significant performance impact even on the x86 chips that
+ * have a fast implementation of it. When building for ARMv6, you can
+ * explicitly disable the use of clz/bsr by adding -mthumb to the compiler
+ * flags (this defines __thumb__).
+ */
+
+/* NOTE: Both GCC and Clang define __GNUC__ */
+#if defined __GNUC__ && defined __arm__
+#if !defined __thumb__ || defined __thumb2__
+#define USE_CLZ_INTRINSIC
+#endif
+#endif
+
+#ifdef USE_CLZ_INTRINSIC
+#define JPEG_NBITS_NONZERO(x) (32 - __builtin_clz(x))
+#define JPEG_NBITS(x) (x ? JPEG_NBITS_NONZERO(x) : 0)
+#else
static unsigned char jpeg_nbits_table[65536];
static int jpeg_nbits_table_init = 0;
+#define JPEG_NBITS(x) (jpeg_nbits_table[x])
+#define JPEG_NBITS_NONZERO(x) JPEG_NBITS(x)
+#endif
#ifndef min
#define min(a,b) ((a)<(b)?(a):(b))
@@ -272,6 +300,7 @@ jpeg_make_c_derived_tbl (j_compress_ptr cinfo, boolean isDC, int tblno,
dtbl->ehufsi[i] = huffsize[p];
}
+#ifndef USE_CLZ_INTRINSIC
if(!jpeg_nbits_table_init) {
for(i = 0; i < 65536; i++) {
int nbits = 0, temp = i;
@@ -280,6 +309,7 @@ jpeg_make_c_derived_tbl (j_compress_ptr cinfo, boolean isDC, int tblno,
}
jpeg_nbits_table_init = 1;
}
+#endif
}
@@ -482,7 +512,7 @@ encode_one_block (working_state * state, JCOEFPTR block, int last_dc_val,
temp2 += temp3;
/* Find the number of bits needed for the magnitude of the coefficient */
- nbits = jpeg_nbits_table[temp];
+ nbits = JPEG_NBITS(temp);
/* Emit the Huffman-coded symbol for the number of bits */
code = dctbl->ehufco[nbits];
@@ -516,7 +546,7 @@ encode_one_block (working_state * state, JCOEFPTR block, int last_dc_val,
temp ^= temp3; \
temp -= temp3; \
temp2 += temp3; \
- nbits = jpeg_nbits_table[temp]; \
+ nbits = JPEG_NBITS_NONZERO(temp); \
/* if run length > 15, must emit special run-length-16 codes (0xF0) */ \
while (r > 15) { \
EMIT_BITS(code_0xf0, size_0xf0) \