diff options
author | Chris Blume <cblume@chromium.org> | 2019-12-04 11:18:49 +0000 |
---|---|---|
committer | Commit Bot <commit-bot@chromium.org> | 2019-12-04 11:18:49 +0000 |
commit | f262c1b3c4196a2fee98c113142faff525b8d884 (patch) | |
tree | f1a4a54b58d010618cbeab48dad0596ec1321dd1 /deflate.c | |
parent | e77e1c06c8881abff0c7418368d147ff4a474d08 (diff) | |
download | zlib-f262c1b3c4196a2fee98c113142faff525b8d884.tar.gz |
Remove use of inline ASM in insert_string_sse
It seems that some years ago clang@Windows didn't have the
proper intrinsic required, which prompted the use of inline
ASM.
It has a side effect in that it will allow compilation of the
optimized function within the same compilation unit while using regular
compiler flags (i.e. 'crc32' instruction on x86 requires some special
compiler flags).
Main issue is that inline ASM is blocked on dependencies (e.g. 'base')
that will be linked to NaCl.
The main idea here is to allow the whole Chromium code base to use the
highly optimized checksums in zlib (e.g. crc32 and Adler-32), exported
through an interface (i.e. base::Crc32()).
This patch fixes this issue by removing the use of inline ASM.
The workaround is to use clang/gcc 'target attributes' to instruct the
backend to use different code generation options for the optimized
function, see:
https://clang.llvm.org/docs/AttributeReference.html#target
NOTE: While testing on my personal Windows PC, VS2019 including
smmintrin.h was insufficient. I needed to explicitly include either
immintrin.h or nmmintrin.h. I expected I would need that, but it seems
to be working with just smmintrin.h.
Bug: 902789
Change-Id: Id692fb839e20b26f9ba8b45538e652d5b140cd36
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/1941688
Commit-Queue: Chris Blume <cblume@chromium.org>
Reviewed-by: Adenilson Cavalcanti <cavalcantii@chromium.org>
Cr-Original-Commit-Position: refs/heads/master@{#721425}
Cr-Mirrored-From: https://chromium.googlesource.com/chromium/src
Cr-Mirrored-Commit: 1cebcd57bc3d09c39783395e6b173ff1f358a91b
Diffstat (limited to 'deflate.c')
-rw-r--r-- | deflate.c | 68 |
1 files changed, 31 insertions, 37 deletions
@@ -52,6 +52,10 @@ #include "deflate.h" #include "x86.h" +#if defined(CRC32_SIMD_SSE42_PCLMUL) +#include <smmintrin.h> +#endif + #if (defined(__ARM_NEON__) || defined(__ARM_NEON)) #include "contrib/optimizations/slide_hash_neon.h" #endif @@ -123,8 +127,31 @@ extern void ZLIB_INTERNAL copy_with_crc(z_streamp strm, Bytef *dst, long size); #define INLINE inline #endif -/* Inline optimisation */ -local INLINE Pos insert_string_sse(deflate_state *const s, const Pos str); +/* Intel optimized insert_string. */ +#if defined(CRC32_SIMD_SSE42_PCLMUL) + +#if defined(__GNUC__) || defined(__clang__) +__attribute__((target("sse4.2"))) +#endif +local INLINE Pos insert_string_sse(deflate_state *const s, const Pos str) +{ + Pos ret; + unsigned *ip, val, h = 0; + + ip = (unsigned *)&s->window[str]; + val = *ip; + + if (s->level >= 6) + val &= 0xFFFFFF; + + h = _mm_crc32_u32(h, val); + + ret = s->head[h & s->hash_mask]; + s->head[h & s->hash_mask] = str; + s->prev[str & s->w_mask] = ret; + return ret; +} +#endif /* =========================================================================== * Local data @@ -228,10 +255,11 @@ local INLINE Pos insert_string(deflate_state *const s, const Pos str) #if defined(CRC32_ARMV8_CRC32) if (arm_cpu_enable_crc32) return insert_string_arm(s, str); -#endif +#elif defined(CRC32_SIMD_SSE42_PCLMUL) if (x86_cpu_enable_simd) return insert_string_sse(s, str); #endif +#endif return insert_string_c(s, str); } @@ -2276,37 +2304,3 @@ local block_state deflate_huff(s, flush) FLUSH_BLOCK(s, 0); return block_done; } - -/* Safe to inline this as GCC/clang will use inline asm and Visual Studio will - * use intrinsic without extra params - */ -local INLINE Pos insert_string_sse(deflate_state *const s, const Pos str) -{ - Pos ret; - unsigned *ip, val, h = 0; - - ip = (unsigned *)&s->window[str]; - val = *ip; - - if (s->level >= 6) - val &= 0xFFFFFF; - -/* Windows clang should use inline asm */ -#if defined(_MSC_VER) && !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64)) - h = _mm_crc32_u32(h, val); -#elif defined(__i386__) || defined(__amd64__) - __asm__ __volatile__ ( - "crc32 %1,%0\n\t" - : "+r" (h) - : "r" (val) - ); -#else - /* This should never happen */ - assert(0); -#endif - - ret = s->head[h & s->hash_mask]; - s->head[h & s->hash_mask] = str; - s->prev[str & s->w_mask] = ret; - return ret; -} |