summaryrefslogtreecommitdiff
path: root/deflate.c
diff options
context:
space:
mode:
authorrobert.bradford <robert.bradford@intel.com>2014-11-05 06:59:34 -0800
committerCommit bot <commit-bot@chromium.org>2014-11-05 14:59:51 +0000
commit10dd686e77ff174530435aaed24160de9afb882d (patch)
treec1d1b7b2b66e4b8b2167986234382f9b93860b34 /deflate.c
parent3230118192e5332c934514c094f33581a355fb3d (diff)
downloadzlib-10dd686e77ff174530435aaed24160de9afb882d.tar.gz
Reland "Integrate SIMD optimisations for zlib"
This version uses a "pthread_once" implementation, using Windows synchronisation primitives, imported from tcmalloc. Previous CLs: https://codereview.chromium.org/677713002/ https://codereview.chromium.org/552123005 This version of the CL also runs fine on Windows Server 2003. These optimisations have been published on the zlib mailing list and at https://github.com/jtkukunas/zlib/ This change merges the following optimisation patches: - "For x86, add CPUID check." - "Adds SSE2 optimized hash shifting to fill_window." - "add SSE4.2 optimized hash function" - "add PCLMULQDQ optimized CRC folding" From Jim Kukunas <james.t.kukunas@linux.intel.com>; and adapts them to the current zlib version in Chromium. The optimisations are enabled at runtime if all the necessary CPU features are present. As the optimisations require extra cflags to enable the compiler to use the instructions, the optimisations are held in their own static library with a stub implementation to allow linking on other platforms. TEST=net_unittests(GZipUnitTest) passes, Chrome functions and performance improvement seen on RoboHornet benchmark on Linux Desktop BUG=401517 Review URL: https://codereview.chromium.org/678423002 Cr-Original-Commit-Position: refs/heads/master@{#302799} Cr-Mirrored-From: https://chromium.googlesource.com/chromium/src Cr-Mirrored-Commit: 02a95e3084f979084fa8586e1718a6e6dd4c22da
Diffstat (limited to 'deflate.c')
-rw-r--r--deflate.c139
1 files changed, 113 insertions, 26 deletions
diff --git a/deflate.c b/deflate.c
index 8043e5b..55ec215 100644
--- a/deflate.c
+++ b/deflate.c
@@ -49,7 +49,10 @@
/* @(#) $Id$ */
+#include <assert.h>
+
#include "deflate.h"
+#include "x86.h"
const char deflate_copyright[] =
" deflate 1.2.5 Copyright 1995-2010 Jean-loup Gailly and Mark Adler ";
@@ -85,7 +88,7 @@ local block_state deflate_huff OF((deflate_state *s, int flush));
local void lm_init OF((deflate_state *s));
local void putShortMSB OF((deflate_state *s, uInt b));
local void flush_pending OF((z_streamp strm));
-local int read_buf OF((z_streamp strm, Bytef *buf, unsigned size));
+
#ifdef ASMV
void match_init OF((void)); /* asm code initialization */
uInt longest_match OF((deflate_state *s, IPos cur_match, int clas));
@@ -98,6 +101,23 @@ local void check_match OF((deflate_state *s, IPos start, IPos match,
int length));
#endif
+/* For fill_window_sse.c to use */
+ZLIB_INTERNAL int read_buf OF((z_streamp strm, Bytef *buf, unsigned size));
+
+/* From crc32.c */
+extern void ZLIB_INTERNAL crc_reset(deflate_state *const s);
+extern void ZLIB_INTERNAL crc_finalize(deflate_state *const s);
+extern void ZLIB_INTERNAL copy_with_crc(z_streamp strm, Bytef *dst, long size);
+
+#ifdef _MSC_VER
+#define INLINE __inline
+#else
+#define INLINE inline
+#endif
+
+/* Inline optimisation */
+local INLINE Pos insert_string_sse(deflate_state *const s, const Pos str);
+
/* ===========================================================================
* Local data
*/
@@ -164,7 +184,6 @@ struct static_tree_desc_s {int dummy;}; /* for buggy compilers */
*/
#define UPDATE_HASH(s,h,c) (h = (((h)<<s->hash_shift) ^ (c)) & s->hash_mask)
-
/* ===========================================================================
* Insert string str in the dictionary and set match_head to the previous head
* of the hash chain (the most recent string with same hash key). Return
@@ -175,17 +194,28 @@ struct static_tree_desc_s {int dummy;}; /* for buggy compilers */
* input characters and the first MIN_MATCH bytes of str are valid
* (except for the last MIN_MATCH-1 bytes of the input file).
*/
+local INLINE Pos insert_string_c(deflate_state *const s, const Pos str)
+{ /* Function form of the INSERT_STRING macro removed below; returns the old chain head. */
+ Pos ret; /* previous head of the hash chain that str is inserted into */
+
+ UPDATE_HASH(s, s->ins_h, s->window[(str) + (MIN_MATCH-1)]); /* roll the byte at str + MIN_MATCH-1 into ins_h */
#ifdef FASTEST
-#define INSERT_STRING(s, str, match_head) \
- (UPDATE_HASH(s, s->ins_h, s->window[(str) + (MIN_MATCH-1)]), \
- match_head = s->head[s->ins_h], \
- s->head[s->ins_h] = (Pos)(str))
+ ret = s->head[s->ins_h]; /* FASTEST build: s->prev is not updated */
#else
-#define INSERT_STRING(s, str, match_head) \
- (UPDATE_HASH(s, s->ins_h, s->window[(str) + (MIN_MATCH-1)]), \
- match_head = s->prev[(str) & s->w_mask] = s->head[s->ins_h], \
- s->head[s->ins_h] = (Pos)(str))
+ ret = s->prev[str & s->w_mask] = s->head[s->ins_h]; /* link str to the old head via s->prev */
#endif
+ s->head[s->ins_h] = str; /* str becomes the new head for this hash value */
+
+ return ret;
+}
+
+local INLINE Pos insert_string(deflate_state *const s, const Pos str)
+{
+    /* Runtime dispatch on the x86_cpu_enable_simd flag. */
+    return x86_cpu_enable_simd ? insert_string_sse(s, str)
+                               : insert_string_c(s, str);
+}
+
/* ===========================================================================
* Initialize the hash table (avoiding 64K overflow for 16 bit systems).
@@ -219,6 +249,7 @@ int ZEXPORT deflateInit2_(strm, level, method, windowBits, memLevel, strategy,
const char *version;
int stream_size;
{
+ unsigned window_padding = 8;
deflate_state *s;
int wrap = 1;
static const char my_version[] = ZLIB_VERSION;
@@ -228,6 +259,8 @@ int ZEXPORT deflateInit2_(strm, level, method, windowBits, memLevel, strategy,
* output size for (length,distance) codes is <= 24 bits.
*/
+ x86_check_features();
+
if (version == Z_NULL || version[0] != my_version[0] ||
stream_size != sizeof(z_stream)) {
return Z_VERSION_ERROR;
@@ -274,12 +307,17 @@ int ZEXPORT deflateInit2_(strm, level, method, windowBits, memLevel, strategy,
s->w_size = 1 << s->w_bits;
s->w_mask = s->w_size - 1;
- s->hash_bits = memLevel + 7;
+ if (x86_cpu_enable_simd) {
+ s->hash_bits = 15;
+ } else {
+ s->hash_bits = memLevel + 7;
+ }
+
s->hash_size = 1 << s->hash_bits;
s->hash_mask = s->hash_size - 1;
s->hash_shift = ((s->hash_bits+MIN_MATCH-1)/MIN_MATCH);
- s->window = (Bytef *) ZALLOC(strm, s->w_size, 2*sizeof(Byte));
+ s->window = (Bytef *) ZALLOC(strm, s->w_size + window_padding, 2*sizeof(Byte));
s->prev = (Posf *) ZALLOC(strm, s->w_size, sizeof(Pos));
s->head = (Posf *) ZALLOC(strm, s->hash_size, sizeof(Pos));
s->class_bitmap = NULL;
@@ -347,7 +385,7 @@ int ZEXPORT deflateSetDictionary (strm, dictionary, dictLength)
s->ins_h = s->window[0];
UPDATE_HASH(s, s->ins_h, s->window[1]);
for (n = 0; n <= length - MIN_MATCH; n++) {
- INSERT_STRING(s, n, hash_head);
+ insert_string(s, n);
}
if (hash_head) hash_head = 0; /* to make compiler happy */
return Z_OK;
@@ -613,7 +651,7 @@ int ZEXPORT deflate (strm, flush)
if (s->status == INIT_STATE) {
#ifdef GZIP
if (s->wrap == 2) {
- strm->adler = crc32(0L, Z_NULL, 0);
+ crc_reset(s);
put_byte(s, 31);
put_byte(s, 139);
put_byte(s, 8);
@@ -891,6 +929,7 @@ int ZEXPORT deflate (strm, flush)
/* Write the trailer */
#ifdef GZIP
if (s->wrap == 2) {
+ crc_finalize(s);
put_byte(s, (Byte)(strm->adler & 0xff));
put_byte(s, (Byte)((strm->adler >> 8) & 0xff));
put_byte(s, (Byte)((strm->adler >> 16) & 0xff));
@@ -1013,7 +1052,7 @@ int ZEXPORT deflateCopy (dest, source)
* allocating a large strm->next_in buffer and copying from it.
* (See also flush_pending()).
*/
-local int read_buf(strm, buf, size)
+ZLIB_INTERNAL int read_buf(strm, buf, size)
z_streamp strm;
Bytef *buf;
unsigned size;
@@ -1025,15 +1064,17 @@ local int read_buf(strm, buf, size)
strm->avail_in -= len;
- if (strm->state->wrap == 1) {
- strm->adler = adler32(strm->adler, strm->next_in, len);
- }
#ifdef GZIP
- else if (strm->state->wrap == 2) {
- strm->adler = crc32(strm->adler, strm->next_in, len);
+ if (strm->state->wrap == 2) {
+ copy_with_crc(strm, buf, len);
}
+ else
#endif
- zmemcpy(buf, strm->next_in, len);
+ {
+ zmemcpy(buf, strm->next_in, len);
+ if (strm->state->wrap == 1)
+ strm->adler = adler32(strm->adler, buf, len);
+ }
strm->next_in += len;
strm->total_in += len;
@@ -1445,7 +1486,19 @@ local void check_match(s, start, match, length)
* performed for at least two bytes (required for the zip translate_eol
* option -- not supported here).
*/
-local void fill_window(s)
+local void fill_window_c(deflate_state *s);
+
+local void fill_window(deflate_state *s)
+{
+    /* Runtime dispatch on the x86_cpu_enable_simd flag. */
+    if (!x86_cpu_enable_simd) {
+        fill_window_c(s);
+    } else {
+        fill_window_sse(s);
+    }
+}
+
+local void fill_window_c(s)
deflate_state *s;
{
register unsigned n, m;
@@ -1711,7 +1764,7 @@ local block_state deflate_fast(s, flush, clas)
*/
hash_head = NIL;
if (s->lookahead >= MIN_MATCH) {
- INSERT_STRING(s, s->strstart, hash_head);
+ hash_head = insert_string(s, s->strstart);
}
/* Find the longest match, discarding those <= prev_length.
@@ -1742,7 +1795,7 @@ local block_state deflate_fast(s, flush, clas)
s->match_length--; /* string at strstart already in table */
do {
s->strstart++;
- INSERT_STRING(s, s->strstart, hash_head);
+ hash_head = insert_string(s, s->strstart);
/* strstart never exceeds WSIZE-MAX_MATCH, so there are
* always MIN_MATCH bytes ahead.
*/
@@ -1821,7 +1874,7 @@ local block_state deflate_slow(s, flush, clas)
*/
hash_head = NIL;
if (s->lookahead >= MIN_MATCH) {
- INSERT_STRING(s, s->strstart, hash_head);
+ hash_head = insert_string(s, s->strstart);
}
/* Find the longest match, discarding those <= prev_length.
@@ -1890,7 +1943,7 @@ local block_state deflate_slow(s, flush, clas)
s->prev_length -= 2;
do {
if (++s->strstart <= max_insert) {
- INSERT_STRING(s, s->strstart, hash_head);
+ hash_head = insert_string(s, s->strstart);
}
} while (--s->prev_length != 0);
s->match_available = 0;
@@ -2031,3 +2084,37 @@ local block_state deflate_huff(s, flush)
FLUSH_BLOCK(s, flush == Z_FINISH);
return flush == Z_FINISH ? finish_done : block_done;
}
+
+/* Insert str into the hash table, hashing the word at s->window[str] with the
+ * CRC32 instruction, and return the previous head of that hash chain.
+ * Safe to inline: GCC/clang use inline asm, Visual Studio an intrinsic. */
+local INLINE Pos insert_string_sse(deflate_state *const s, const Pos str)
+{
+    Pos ret;
+    unsigned val, h = 0;
+
+    /* Byte copy: window + str may be unaligned; avoids a type-punned load. */
+    zmemcpy((Bytef *)&val, &s->window[str], sizeof(val));
+
+    if (s->level >= 6)
+        val &= 0xFFFFFF; /* keep only the low 24 bits of the loaded word */
+
+/* Windows clang should use inline asm */
+#if defined(_MSC_VER) && !defined(__clang__)
+    h = _mm_crc32_u32(h, val);
+#elif defined(__i386__) || defined(__amd64__)
+    __asm__ __volatile__ (
+        "crc32 %1,%0\n\t"
+    : "+r" (h)
+    : "r" (val)
+    );
+#else
+    /* This should never happen */
+    assert(0);
+#endif
+
+    ret = s->head[h & s->hash_mask];
+    s->head[h & s->hash_mask] = str;
+    s->prev[str & s->w_mask] = ret;
+    return ret;
+}