summaryrefslogtreecommitdiff
path: root/contrib/optimizations
diff options
context:
space:
mode:
Diffstat (limited to 'contrib/optimizations')
-rw-r--r--contrib/optimizations/chunkcopy.h20
-rw-r--r--contrib/optimizations/inffast_chunk.c29
-rw-r--r--contrib/optimizations/inflate.c77
-rw-r--r--contrib/optimizations/insert_string.h62
-rw-r--r--contrib/optimizations/slide_hash_neon.h65
5 files changed, 132 insertions, 121 deletions
diff --git a/contrib/optimizations/chunkcopy.h b/contrib/optimizations/chunkcopy.h
index 9c0b7cb..f40546d 100644
--- a/contrib/optimizations/chunkcopy.h
+++ b/contrib/optimizations/chunkcopy.h
@@ -1,6 +1,6 @@
/* chunkcopy.h -- fast chunk copy and set operations
* Copyright (C) 2017 ARM, Inc.
- * Copyright 2017 The Chromium Authors. All rights reserved.
+ * Copyright 2017 The Chromium Authors
* Use of this source code is governed by a BSD-style license that can be
* found in the Chromium source repository LICENSE file.
*/
@@ -36,6 +36,17 @@ typedef __m128i z_vec128i_t;
#endif
/*
+ * Suppress MSan errors about copying uninitialized bytes (crbug.com/1376033).
+ */
+#define Z_DISABLE_MSAN
+#if defined(__has_feature)
+ #if __has_feature(memory_sanitizer)
+ #undef Z_DISABLE_MSAN
+ #define Z_DISABLE_MSAN __attribute__((no_sanitize("memory")))
+ #endif
+#endif
+
+/*
* chunk copy type: the z_vec128i_t type size should be exactly 128-bits
* and equal to CHUNKCOPY_CHUNK_SIZE.
*/
@@ -49,7 +60,7 @@ Z_STATIC_ASSERT(vector_128_bits_wide,
* instruction appropriate for the z_vec128i_t type.
*/
static inline z_vec128i_t loadchunk(
- const unsigned char FAR* s) {
+ const unsigned char FAR* s) Z_DISABLE_MSAN {
z_vec128i_t v;
Z_BUILTIN_MEMCPY(&v, s, sizeof(v));
return v;
@@ -82,7 +93,7 @@ static inline void storechunk(
static inline unsigned char FAR* chunkcopy_core(
unsigned char FAR* out,
const unsigned char FAR* from,
- unsigned len) {
+ unsigned len) Z_DISABLE_MSAN {
const int bump = (--len % CHUNKCOPY_CHUNK_SIZE) + 1;
storechunk(out, loadchunk(from));
out += bump;
@@ -152,7 +163,7 @@ static inline unsigned char FAR* chunkcopy_core_safe(
static inline unsigned char FAR* chunkunroll_relaxed(
unsigned char FAR* out,
unsigned FAR* dist,
- unsigned FAR* len) {
+ unsigned FAR* len) Z_DISABLE_MSAN {
const unsigned char FAR* from = out - *dist;
while (*dist < *len && *dist < CHUNKCOPY_CHUNK_SIZE) {
storechunk(out, loadchunk(from));
@@ -473,5 +484,6 @@ typedef unsigned long inflate_holder_t;
#undef Z_STATIC_ASSERT
#undef Z_RESTRICT
#undef Z_BUILTIN_MEMCPY
+#undef Z_DISABLE_MSAN
#endif /* CHUNKCOPY_H */
diff --git a/contrib/optimizations/inffast_chunk.c b/contrib/optimizations/inffast_chunk.c
index 4bacbc4..5b09487 100644
--- a/contrib/optimizations/inffast_chunk.c
+++ b/contrib/optimizations/inffast_chunk.c
@@ -95,7 +95,7 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */
code const FAR *dcode; /* local strm->distcode */
unsigned lmask; /* mask for first level of length codes */
unsigned dmask; /* mask for first level of distance codes */
- code here; /* retrieved table entry */
+ code const *here; /* retrieved table entry */
unsigned op; /* code bits, operation, extra bits, or */
/* window position, window bytes to copy */
unsigned len; /* match length, unused bytes */
@@ -139,20 +139,20 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */
bits += 8;
#endif
}
- here = lcode[hold & lmask];
+ here = lcode + (hold & lmask);
dolen:
- op = (unsigned)(here.bits);
+ op = (unsigned)(here->bits);
hold >>= op;
bits -= op;
- op = (unsigned)(here.op);
+ op = (unsigned)(here->op);
if (op == 0) { /* literal */
- Tracevv((stderr, here.val >= 0x20 && here.val < 0x7f ?
+ Tracevv((stderr, here->val >= 0x20 && here->val < 0x7f ?
"inflate: literal '%c'\n" :
- "inflate: literal 0x%02x\n", here.val));
- *out++ = (unsigned char)(here.val);
+ "inflate: literal 0x%02x\n", here->val));
+ *out++ = (unsigned char)(here->val);
}
else if (op & 16) { /* length base */
- len = (unsigned)(here.val);
+ len = (unsigned)(here->val);
op &= 15; /* number of extra bits */
if (op) {
if (bits < op) {
@@ -182,14 +182,14 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */
bits += 8;
#endif
}
- here = dcode[hold & dmask];
+ here = dcode + (hold & dmask);
dodist:
- op = (unsigned)(here.bits);
+ op = (unsigned)(here->bits);
hold >>= op;
bits -= op;
- op = (unsigned)(here.op);
+ op = (unsigned)(here->op);
if (op & 16) { /* distance base */
- dist = (unsigned)(here.val);
+ dist = (unsigned)(here->val);
op &= 15; /* number of extra bits */
if (bits < op) {
#ifdef INFLATE_CHUNK_READ_64LE
@@ -295,7 +295,7 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */
}
}
else if ((op & 64) == 0) { /* 2nd level distance code */
- here = dcode[here.val + (hold & ((1U << op) - 1))];
+ here = dcode + here->val + (hold & ((1U << op) - 1));
goto dodist;
}
else {
@@ -305,7 +305,7 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */
}
}
else if ((op & 64) == 0) { /* 2nd level length code */
- here = lcode[here.val + (hold & ((1U << op) - 1))];
+ here = lcode + here->val + (hold & ((1U << op) - 1));
goto dolen;
}
else if (op & 32) { /* end-of-block */
@@ -339,7 +339,6 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */
state->bits = bits;
Assert((state->hold >> state->bits) == 0, "invalid input data state");
- return;
}
/*
diff --git a/contrib/optimizations/inflate.c b/contrib/optimizations/inflate.c
index 81d558b..f3dfba8 100644
--- a/contrib/optimizations/inflate.c
+++ b/contrib/optimizations/inflate.c
@@ -1,5 +1,5 @@
/* inflate.c -- zlib decompression
- * Copyright (C) 1995-2016 Mark Adler
+ * Copyright (C) 1995-2022 Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*/
@@ -131,6 +131,7 @@ z_streamp strm;
state->mode = HEAD;
state->last = 0;
state->havedict = 0;
+ state->flags = -1;
state->dmax = 32768U;
state->head = Z_NULL;
state->hold = 0;
@@ -168,6 +169,8 @@ int windowBits;
/* extract wrap request from windowBits parameter */
if (windowBits < 0) {
+ if (windowBits < -15)
+ return Z_STREAM_ERROR;
wrap = 0;
windowBits = -windowBits;
}
@@ -459,10 +462,10 @@ unsigned copy;
/* check function to use adler32() for zlib or crc32() for gzip */
#ifdef GUNZIP
-# define UPDATE(check, buf, len) \
+# define UPDATE_CHECK(check, buf, len) \
(state->flags ? crc32(check, buf, len) : adler32(check, buf, len))
#else
-# define UPDATE(check, buf, len) adler32(check, buf, len)
+# define UPDATE_CHECK(check, buf, len) adler32(check, buf, len)
#endif
/* check macros for header crc */
@@ -682,7 +685,6 @@ int flush;
state->mode = FLAGS;
break;
}
- state->flags = 0; /* expect zlib header */
if (state->head != Z_NULL)
state->head->done = -1;
if (!(state->wrap & 1) || /* check if zlib header allowed */
@@ -709,6 +711,7 @@ int flush;
break;
}
state->dmax = 1U << len;
+ state->flags = 0; /* indicate zlib header */
Tracev((stderr, "inflate: zlib header ok\n"));
strm->adler = state->check = adler32(0L, Z_NULL, 0);
state->mode = hold & 0x200 ? DICTID : TYPE;
@@ -734,6 +737,7 @@ int flush;
CRC2(state->check, hold);
INITBITS();
state->mode = TIME;
+ /* fallthrough */
case TIME:
NEEDBITS(32);
if (state->head != Z_NULL)
@@ -742,6 +746,7 @@ int flush;
CRC4(state->check, hold);
INITBITS();
state->mode = OS;
+ /* fallthrough */
case OS:
NEEDBITS(16);
if (state->head != Z_NULL) {
@@ -752,6 +757,7 @@ int flush;
CRC2(state->check, hold);
INITBITS();
state->mode = EXLEN;
+ /* fallthrough */
case EXLEN:
if (state->flags & 0x0400) {
NEEDBITS(16);
@@ -765,14 +771,16 @@ int flush;
else if (state->head != Z_NULL)
state->head->extra = Z_NULL;
state->mode = EXTRA;
+ /* fallthrough */
case EXTRA:
if (state->flags & 0x0400) {
copy = state->length;
if (copy > have) copy = have;
if (copy) {
if (state->head != Z_NULL &&
- state->head->extra != Z_NULL) {
- len = state->head->extra_len - state->length;
+ state->head->extra != Z_NULL &&
+ (len = state->head->extra_len - state->length) <
+ state->head->extra_max) {
zmemcpy(state->head->extra + len, next,
len + copy > state->head->extra_max ?
state->head->extra_max - len : copy);
@@ -787,6 +795,7 @@ int flush;
}
state->length = 0;
state->mode = NAME;
+ /* fallthrough */
case NAME:
if (state->flags & 0x0800) {
if (have == 0) goto inf_leave;
@@ -808,6 +817,7 @@ int flush;
state->head->name = Z_NULL;
state->length = 0;
state->mode = COMMENT;
+ /* fallthrough */
case COMMENT:
if (state->flags & 0x1000) {
if (have == 0) goto inf_leave;
@@ -828,6 +838,7 @@ int flush;
else if (state->head != Z_NULL)
state->head->comment = Z_NULL;
state->mode = HCRC;
+ /* fallthrough */
case HCRC:
if (state->flags & 0x0200) {
NEEDBITS(16);
@@ -851,6 +862,7 @@ int flush;
strm->adler = state->check = ZSWAP32(hold);
INITBITS();
state->mode = DICT;
+ /* fallthrough */
case DICT:
if (state->havedict == 0) {
RESTORE();
@@ -858,8 +870,10 @@ int flush;
}
strm->adler = state->check = adler32(0L, Z_NULL, 0);
state->mode = TYPE;
+ /* fallthrough */
case TYPE:
if (flush == Z_BLOCK || flush == Z_TREES) goto inf_leave;
+ /* fallthrough */
case TYPEDO:
if (state->last) {
BYTEBITS();
@@ -910,8 +924,10 @@ int flush;
INITBITS();
state->mode = COPY_;
if (flush == Z_TREES) goto inf_leave;
+ /* fallthrough */
case COPY_:
state->mode = COPY;
+ /* fallthrough */
case COPY:
copy = state->length;
if (copy) {
@@ -947,6 +963,7 @@ int flush;
Tracev((stderr, "inflate: table sizes ok\n"));
state->have = 0;
state->mode = LENLENS;
+ /* fallthrough */
case LENLENS:
while (state->have < state->ncode) {
NEEDBITS(3);
@@ -968,6 +985,7 @@ int flush;
Tracev((stderr, "inflate: code lengths ok\n"));
state->have = 0;
state->mode = CODELENS;
+ /* fallthrough */
case CODELENS:
while (state->have < state->nlen + state->ndist) {
for (;;) {
@@ -1027,11 +1045,11 @@ int flush;
}
/* build code tables -- note: do not change the lenbits or distbits
- values here (9 and 6) without reading the comments in inftrees.h
+ values here (10 and 9) without reading the comments in inftrees.h
concerning the ENOUGH constants, which depend on those values */
state->next = state->codes;
state->lencode = (const code FAR *)(state->next);
- state->lenbits = 9;
+ state->lenbits = 10;
ret = inflate_table(LENS, state->lens, state->nlen, &(state->next),
&(state->lenbits), state->work);
if (ret) {
@@ -1040,7 +1058,7 @@ int flush;
break;
}
state->distcode = (const code FAR *)(state->next);
- state->distbits = 6;
+ state->distbits = 9;
ret = inflate_table(DISTS, state->lens + state->nlen, state->ndist,
&(state->next), &(state->distbits), state->work);
if (ret) {
@@ -1051,8 +1069,10 @@ int flush;
Tracev((stderr, "inflate: codes ok\n"));
state->mode = LEN_;
if (flush == Z_TREES) goto inf_leave;
+ /* fallthrough */
case LEN_:
state->mode = LEN;
+ /* fallthrough */
case LEN:
if (have >= INFLATE_FAST_MIN_INPUT &&
left >= INFLATE_FAST_MIN_OUTPUT) {
@@ -1103,6 +1123,7 @@ int flush;
}
state->extra = (unsigned)(here.op) & 15;
state->mode = LENEXT;
+ /* fallthrough */
case LENEXT:
if (state->extra) {
NEEDBITS(state->extra);
@@ -1113,6 +1134,7 @@ int flush;
Tracevv((stderr, "inflate: length %u\n", state->length));
state->was = state->length;
state->mode = DIST;
+ /* fallthrough */
case DIST:
for (;;) {
here = state->distcode[BITS(state->distbits)];
@@ -1140,6 +1162,7 @@ int flush;
state->offset = (unsigned)here.val;
state->extra = (unsigned)(here.op) & 15;
state->mode = DISTEXT;
+ /* fallthrough */
case DISTEXT:
if (state->extra) {
NEEDBITS(state->extra);
@@ -1156,6 +1179,7 @@ int flush;
#endif
Tracevv((stderr, "inflate: distance %u\n", state->offset));
state->mode = MATCH;
+ /* fallthrough */
case MATCH:
if (left == 0) goto inf_leave;
copy = out - left;
@@ -1214,7 +1238,7 @@ int flush;
state->total += out;
if ((state->wrap & 4) && out)
strm->adler = state->check =
- UPDATE(state->check, put - out, out);
+ UPDATE_CHECK(state->check, put - out, out);
out = left;
if ((state->wrap & 4) && (
#ifdef GUNZIP
@@ -1230,10 +1254,11 @@ int flush;
}
#ifdef GUNZIP
state->mode = LENGTH;
+ /* fallthrough */
case LENGTH:
if (state->wrap && state->flags) {
NEEDBITS(32);
- if (hold != (state->total & 0xffffffffUL)) {
+ if ((state->wrap & 4) && hold != (state->total & 0xffffffff)) {
strm->msg = (char *)"incorrect length check";
state->mode = BAD;
break;
@@ -1243,6 +1268,7 @@ int flush;
}
#endif
state->mode = DONE;
+ /* fallthrough */
case DONE:
ret = Z_STREAM_END;
goto inf_leave;
@@ -1252,6 +1278,7 @@ int flush;
case MEM:
return Z_MEM_ERROR;
case SYNC:
+ /* fallthrough */
default:
return Z_STREAM_ERROR;
}
@@ -1263,16 +1290,29 @@ int flush;
Note: a memory error from inflate() is non-recoverable.
*/
inf_leave:
- /* We write a defined value in the unused space to help mark
+#if defined(ZLIB_DEBUG)
+ /* XXX(cavalcantii): I put this in place back in 2017 to help debug faulty
+ * client code relying on undefined behavior when chunk_copy first landed.
+ *
+ * It is save to say after all these years that Chromium code is well
+ * behaved and works fine with the optimization, therefore we can enable
+ * this only for DEBUG builds.
+ *
+ * We write a defined value in the unused space to help mark
* where the stream has ended. We don't use zeros as that can
* mislead clients relying on undefined behavior (i.e. assuming
* that the data is over when the buffer has a zero/null value).
+ *
+ * The basic idea is that if client code is not relying on the zlib context
+ * to inform the amount of decompressed data, but instead reads the output
+ * buffer until a zero/null is found, it will fail faster and harder
+ * when the remaining of the buffer is marked with a symbol (e.g. 0x55).
*/
if (left >= CHUNKCOPY_CHUNK_SIZE)
memset(put, 0x55, CHUNKCOPY_CHUNK_SIZE);
else
memset(put, 0x55, left);
-
+#endif
RESTORE();
if (state->wsize || (out != strm->avail_out && state->mode < BAD &&
(state->mode < CHECK || flush != Z_FINISH)))
@@ -1287,7 +1327,7 @@ int flush;
state->total += out;
if ((state->wrap & 4) && out)
strm->adler = state->check =
- UPDATE(state->check, strm->next_out - out, out);
+ UPDATE_CHECK(state->check, strm->next_out - out, out);
strm->data_type = (int)state->bits + (state->last ? 64 : 0) +
(state->mode == TYPE ? 128 : 0) +
(state->mode == LEN_ || state->mode == COPY_ ? 256 : 0);
@@ -1423,6 +1463,7 @@ int ZEXPORT inflateSync(strm)
z_streamp strm;
{
unsigned len; /* number of bytes to look at or looked at */
+ int flags; /* temporary to save header status */
unsigned long in, out; /* temporary to save total_in and total_out */
unsigned char buf[4]; /* to restore bit buffer to byte string */
struct inflate_state FAR *state;
@@ -1455,9 +1496,15 @@ z_streamp strm;
/* return no joy or set up to restart inflate() on a new block */
if (state->have != 4) return Z_DATA_ERROR;
+ if (state->flags == -1)
+ state->wrap = 0; /* if no header yet, treat as raw */
+ else
+ state->wrap &= ~4; /* no point in computing a check value now */
+ flags = state->flags;
in = strm->total_in; out = strm->total_out;
inflateReset(strm);
strm->total_in = in; strm->total_out = out;
+ state->flags = flags;
state->mode = TYPE;
return Z_OK;
}
@@ -1553,7 +1600,7 @@ int check;
if (inflateStateCheck(strm)) return Z_STREAM_ERROR;
state = (struct inflate_state FAR *)strm->state;
- if (check)
+ if (check && state->wrap)
state->wrap |= 4;
else
state->wrap &= ~4;
diff --git a/contrib/optimizations/insert_string.h b/contrib/optimizations/insert_string.h
index 9f634ae..c6a296a 100644
--- a/contrib/optimizations/insert_string.h
+++ b/contrib/optimizations/insert_string.h
@@ -1,15 +1,20 @@
/* insert_string.h
*
- * Copyright 2019 The Chromium Authors. All rights reserved.
+ * Copyright 2019 The Chromium Authors
* Use of this source code is governed by a BSD-style license that can be
* found in the Chromium source repository LICENSE file.
*/
-#if defined(_MSC_VER)
+#ifndef INSERT_STRING_H
+#define INSERT_STRING_H
+
+#ifndef INLINE
+#if defined(_MSC_VER) && !defined(__clang__)
#define INLINE __inline
#else
#define INLINE inline
#endif
+#endif
#include "cpu_features.h"
@@ -23,7 +28,8 @@
#define TARGET_CPU_WITH_CRC
#endif
- #define _cpu_crc32_u32 _mm_crc32_u32
+ /* CRC32C uint32_t */
+ #define _cpu_crc32c_hash_u32 _mm_crc32_u32
#elif defined(CRC32_ARMV8_CRC32)
#if defined(__clang__)
@@ -40,7 +46,8 @@
#define TARGET_CPU_WITH_CRC __attribute__((target("armv8-a,crc")))
#endif // defined(__aarch64__)
- #define _cpu_crc32_u32 __crc32cw
+ /* CRC32C uint32_t */
+ #define _cpu_crc32c_hash_u32 __crc32cw
#endif
// clang-format on
@@ -50,20 +57,15 @@
TARGET_CPU_WITH_CRC
local INLINE Pos insert_string_simd(deflate_state* const s, const Pos str) {
Pos ret;
- unsigned *ip, val, h = 0;
+ unsigned val, h = 0;
- ip = (unsigned*)&s->window[str];
- val = *ip;
+ zmemcpy(&val, &s->window[str], sizeof(val));
if (s->level >= 6)
val &= 0xFFFFFF;
- /* Unlike the case of data integrity checks for GZIP format where the
- * polynomial used is defined (https://tools.ietf.org/html/rfc1952#page-11),
- * here it is just a hash function for the hash table used while
- * performing compression.
- */
- h = _cpu_crc32_u32(h, val);
+ /* Compute hash from the CRC32C of |val|. */
+ h = _cpu_crc32c_hash_u32(h, val);
ret = s->head[h & s->hash_mask];
s->head[h & s->hash_mask] = str;
@@ -73,8 +75,22 @@ local INLINE Pos insert_string_simd(deflate_state* const s, const Pos str) {
#endif // TARGET_CPU_WITH_CRC
+/**
+ * Some applications need to match zlib DEFLATE output exactly [3]. Use the
+ * canonical zlib Rabin-Karp rolling hash [1,2] in that case.
+ *
+ * [1] For a description of the Rabin and Karp algorithm, see "Algorithms"
+ * book by R. Sedgewick, Addison-Wesley, p252.
+ * [2] https://www.euccas.me/zlib/#zlib_rabin_karp and also "rolling hash"
+ * https://en.wikipedia.org/wiki/Rolling_hash
+ * [3] crbug.com/1316541 AOSP incremental client APK package OTA upgrades.
+ */
+#ifdef CHROMIUM_ZLIB_NO_CASTAGNOLI
+#define USE_ZLIB_RABIN_KARP_ROLLING_HASH
+#endif
+
/* ===========================================================================
- * Update a hash value with the given input byte
+ * Update a hash value with the given input byte (Rabin-Karp rolling hash).
* IN assertion: all calls to UPDATE_HASH are made with consecutive input
* characters, so that a running hash key can be computed from the previous
* key instead of complete recalculation each time.
@@ -106,16 +122,16 @@ local INLINE Pos insert_string_c(deflate_state* const s, const Pos str) {
}
local INLINE Pos insert_string(deflate_state* const s, const Pos str) {
-/* insert_string_simd string dictionary insertion: this SIMD symbol hashing
+/* insert_string_simd string dictionary insertion: SIMD crc32c symbol hasher
* significantly improves data compression speed.
*
- * Note: the generated compressed output is a valid DEFLATE stream but will
- * differ from vanilla zlib output ...
+ * Note: the generated compressed output is a valid DEFLATE stream, but will
+ * differ from canonical zlib output.
*/
-#if defined(CHROMIUM_ZLIB_NO_CASTAGNOLI)
-/* ... so this build-time option can used to disable the SIMD symbol hasher
- * if matching vanilla zlib DEFLATE output is required.
- */ (;) /* FALLTHOUGH */
+#if defined(USE_ZLIB_RABIN_KARP_ROLLING_HASH)
+/* So this build-time option can be used to disable the crc32c hash, and use
+ * the Rabin-Karp hash instead.
+ */ /* FALLTHROUGH Rabin-Karp */
#elif defined(TARGET_CPU_WITH_CRC) && defined(CRC32_SIMD_SSE42_PCLMUL)
if (x86_cpu_enable_simd)
return insert_string_simd(s, str);
@@ -123,5 +139,7 @@ local INLINE Pos insert_string(deflate_state* const s, const Pos str) {
if (arm_cpu_enable_crc32)
return insert_string_simd(s, str);
#endif
- return insert_string_c(s, str);
+ return insert_string_c(s, str); /* Rabin-Karp */
}
+
+#endif /* INSERT_STRING_H */
diff --git a/contrib/optimizations/slide_hash_neon.h b/contrib/optimizations/slide_hash_neon.h
deleted file mode 100644
index 26995d7..0000000
--- a/contrib/optimizations/slide_hash_neon.h
+++ /dev/null
@@ -1,65 +0,0 @@
-/* Copyright 2018 The Chromium Authors. All rights reserved.
- * Use of this source code is governed by a BSD-style license that can be
- * found in the Chromium source repository LICENSE file.
- */
-#ifndef __SLIDE_HASH__NEON__
-#define __SLIDE_HASH__NEON__
-
-#include "deflate.h"
-#include <arm_neon.h>
-
-inline static void ZLIB_INTERNAL neon_slide_hash_update(Posf *hash,
- const uInt hash_size,
- const ush w_size)
-{
- /* NEON 'Q' registers allow to store 128 bits, so we can load 8x16-bits
- * values. For further details, check:
- * ARM DHT 0002A, section 1.3.2 NEON Registers.
- */
- const size_t chunk = sizeof(uint16x8_t) / sizeof(uint16_t);
- /* Unrolling the operation yielded a compression performance boost in both
- * ARMv7 (from 11.7% to 13.4%) and ARMv8 (from 3.7% to 7.5%) for HTML4
- * content. For full benchmarking data, check: http://crbug.com/863257.
- */
- const size_t stride = 2*chunk;
- const uint16x8_t v = vdupq_n_u16(w_size);
-
- for (Posf *end = hash + hash_size; hash != end; hash += stride) {
- uint16x8_t m_low = vld1q_u16(hash);
- uint16x8_t m_high = vld1q_u16(hash + chunk);
-
- /* The first 'q' in vqsubq_u16 makes these subtracts saturate to zero,
- * replacing the ternary operator expression in the original code:
- * (m >= wsize ? m - wsize : NIL).
- */
- m_low = vqsubq_u16(m_low, v);
- m_high = vqsubq_u16(m_high, v);
-
- vst1q_u16(hash, m_low);
- vst1q_u16(hash + chunk, m_high);
- }
-}
-
-
-inline static void ZLIB_INTERNAL neon_slide_hash(Posf *head, Posf *prev,
- const unsigned short w_size,
- const uInt hash_size)
-{
- /*
- * SIMD implementation for hash table rebase assumes:
- * 1. hash chain offset (Pos) is 2 bytes.
- * 2. hash table size is multiple of 32 bytes.
- * #1 should be true as Pos is defined as "ush"
- * #2 should be true as hash_bits are greater than 7
- */
- const size_t size = hash_size * sizeof(head[0]);
- Assert(sizeof(Pos) == 2, "Wrong Pos size.");
- Assert((size % sizeof(uint16x8_t) * 2) == 0, "Hash table size error.");
-
- neon_slide_hash_update(head, hash_size, w_size);
-#ifndef FASTEST
- neon_slide_hash_update(prev, w_size, w_size);
-#endif
-}
-
-#endif