Diffstat (limited to 'src/zlib-ng/arch/x86')
29 files changed, 1818 insertions, 518 deletions
diff --git a/src/zlib-ng/arch/x86/INDEX.md b/src/zlib-ng/arch/x86/INDEX.md deleted file mode 100644 index 8bf6d08..0000000 --- a/src/zlib-ng/arch/x86/INDEX.md +++ /dev/null @@ -1,8 +0,0 @@ -Contents --------- - -|Name|Description| -|:-|:-| -|deflate_quick.c|SSE4 optimized deflate strategy for use as level 1| -|crc_folding.c|SSE4 + PCLMULQDQ optimized CRC folding implementation| -|slide_sse2.c|SSE2 optimized slide_hash| diff --git a/src/zlib-ng/arch/x86/Makefile.in b/src/zlib-ng/arch/x86/Makefile.in index 13c736c..f9aedf8 100644 --- a/src/zlib-ng/arch/x86/Makefile.in +++ b/src/zlib-ng/arch/x86/Makefile.in @@ -8,11 +8,15 @@ SFLAGS= INCLUDES= SUFFIX= +AVX512FLAG=-mavx512f -mavx512dq -mavx512vl -mavx512bw +AVX512VNNIFLAG=-mavx512vnni AVX2FLAG=-mavx2 SSE2FLAG=-msse2 SSSE3FLAG=-mssse3 -SSE4FLAG=-msse4 +SSE41FLAG=-msse4.1 +SSE42FLAG=-msse4.2 PCLMULFLAG=-mpclmul +VPCLMULFLAG=-mvpclmulqdq NOLTOFLAG= SRCDIR=. @@ -20,23 +24,28 @@ SRCTOP=../.. TOPDIR=$(SRCTOP) all: \ - x86.o x86.lo \ - adler32_avx.o adler32.lo \ + x86_features.o x86_features.lo \ + adler32_avx2.o adler32_avx2.lo \ + adler32_avx512.o adler32_avx512.lo \ + adler32_avx512_vnni.o adler32_avx512_vnni.lo \ + adler32_sse42.o adler32_sse42.lo \ adler32_ssse3.o adler32_ssse3.lo \ chunkset_avx.o chunkset_avx.lo \ - chunkset_sse.o chunkset_sse.lo \ - compare258_avx.o compare258_avx.lo \ - compare258_sse.o compare258_sse.lo \ - insert_string_sse.o insert_string_sse.lo \ - crc_folding.o crc_folding.lo \ - slide_avx.o slide_avx.lo \ - slide_sse.o slide_sse.lo - -x86.o: - $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/x86.c - -x86.lo: - $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/x86.c + chunkset_sse2.o chunkset_sse2.lo \ + chunkset_sse41.o chunkset_sse41.lo \ + compare256_avx2.o compare256_avx2.lo \ + compare256_sse2.o compare256_sse2.lo \ + insert_string_sse42.o insert_string_sse42.lo \ + crc32_fold_pclmulqdq.o crc32_fold_pclmulqdq.lo \ + crc32_fold_vpclmulqdq.o crc32_fold_vpclmulqdq.lo \ + slide_hash_avx2.o slide_hash_avx2.lo \ + slide_hash_sse2.o slide_hash_sse2.lo + +x86_features.o: + $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/x86_features.c + +x86_features.lo: + $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/x86_features.c chunkset_avx.o: $(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx.c @@ -44,59 +53,89 @@ chunkset_avx.o: chunkset_avx.lo: $(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx.c -chunkset_sse.o: - $(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse.c +chunkset_sse2.o: + $(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse2.c + +chunkset_sse2.lo: + $(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse2.c + +chunkset_sse41.o: + $(CC) $(CFLAGS) $(SSE41FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse41.c + +chunkset_sse41.lo: + $(CC) $(SFLAGS) $(SSE41FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse41.c + +compare256_avx2.o: + $(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx2.c -chunkset_sse.lo: - $(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse.c +compare256_avx2.lo: + $(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx2.c -compare258_avx.o: - $(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare258_avx.c +compare256_sse2.o: + $(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) 
$(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse2.c -compare258_avx.lo: - $(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare258_avx.c +compare256_sse2.lo: + $(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse2.c -compare258_sse.o: - $(CC) $(CFLAGS) $(SSE4FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare258_sse.c +insert_string_sse42.o: + $(CC) $(CFLAGS) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse42.c -compare258_sse.lo: - $(CC) $(SFLAGS) $(SSE4FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare258_sse.c +insert_string_sse42.lo: + $(CC) $(SFLAGS) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse42.c -insert_string_sse.o: - $(CC) $(CFLAGS) $(SSE4FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse.c +crc32_fold_pclmulqdq.o: + $(CC) $(CFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_fold_pclmulqdq.c -insert_string_sse.lo: - $(CC) $(SFLAGS) $(SSE4FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse.c +crc32_fold_pclmulqdq.lo: + $(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_fold_pclmulqdq.c -crc_folding.o: - $(CC) $(CFLAGS) $(PCLMULFLAG) $(SSE4FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc_folding.c +crc32_fold_vpclmulqdq.o: + $(CC) $(CFLAGS) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_fold_vpclmulqdq.c -crc_folding.lo: - $(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE4FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc_folding.c +crc32_fold_vpclmulqdq.lo: + $(CC) $(SFLAGS) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_fold_vpclmulqdq.c -slide_avx.o: - $(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_avx.c +slide_hash_avx2.o: + $(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_avx2.c -slide_avx.lo: - $(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_avx.c +slide_hash_avx2.lo: + $(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_avx2.c -slide_sse.o: - $(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_sse.c +slide_hash_sse2.o: + $(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_sse2.c -slide_sse.lo: - $(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_sse.c +slide_hash_sse2.lo: + $(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_sse2.c -adler32_avx.o: $(SRCDIR)/adler32_avx.c - $(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx.c +adler32_avx2.o: $(SRCDIR)/adler32_avx2.c + $(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx2.c -adler32_avx.lo: $(SRCDIR)/adler32_avx.c - $(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx.c +adler32_avx2.lo: $(SRCDIR)/adler32_avx2.c + $(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx2.c + +adler32_avx512.o: $(SRCDIR)/adler32_avx512.c + $(CC) $(CFLAGS) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512.c + +adler32_avx512.lo: $(SRCDIR)/adler32_avx512.c + $(CC) $(SFLAGS) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512.c + +adler32_avx512_vnni.o: 
$(SRCDIR)/adler32_avx512_vnni.c + $(CC) $(CFLAGS) $(AVX512VNNIFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512_vnni.c + +adler32_avx512_vnni.lo: $(SRCDIR)/adler32_avx512_vnni.c + $(CC) $(SFLAGS) $(AVX512VNNIFLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512_vnni.c adler32_ssse3.o: $(SRCDIR)/adler32_ssse3.c $(CC) $(CFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c adler32_ssse3.lo: $(SRCDIR)/adler32_ssse3.c - $(CC) $(SFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c + $(CC) $(SFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c + +adler32_sse42.o: $(SRCDIR)/adler32_sse42.c + $(CC) $(CFLAGS) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_sse42.c + +adler32_sse42.lo: $(SRCDIR)/adler32_sse42.c + $(CC) $(SFLAGS) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_sse42.c mostlyclean: clean clean: diff --git a/src/zlib-ng/arch/x86/adler32_avx.c b/src/zlib-ng/arch/x86/adler32_avx.c deleted file mode 100644 index 1063246..0000000 --- a/src/zlib-ng/arch/x86/adler32_avx.c +++ /dev/null @@ -1,117 +0,0 @@ -/* adler32.c -- compute the Adler-32 checksum of a data stream - * Copyright (C) 1995-2011 Mark Adler - * Authors: - * Brian Bockelman <bockelman@gmail.com> - * For conditions of distribution and use, see copyright notice in zlib.h - */ - -#include "../../zbuild.h" -#include "../../zutil.h" - -#include "../../adler32_p.h" - -#include <immintrin.h> - -#ifdef X86_AVX2_ADLER32 - -Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_t len) { - uint32_t sum2; - - /* split Adler-32 into component sums */ - sum2 = (adler >> 16) & 0xffff; - adler &= 0xffff; - - /* in case user likes doing a byte at a time, keep it fast */ - if (UNLIKELY(len == 1)) - return adler32_len_1(adler, buf, sum2); - - /* initial Adler-32 value (deferred check for len == 1 speed) */ - if (UNLIKELY(buf == NULL)) - return 1L; - - /* in case short lengths are provided, keep it somewhat fast */ - if (UNLIKELY(len < 16)) - return adler32_len_16(adler, buf, len, sum2); - - uint32_t ALIGNED_(32) s1[8], s2[8]; - - memset(s1, 0, sizeof(s1)); s1[7] = adler; // TODO: would a masked load be faster? - memset(s2, 0, sizeof(s2)); s2[7] = sum2; - - char ALIGNED_(32) dot1[32] = \ - {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - __m256i dot1v = _mm256_load_si256((__m256i*)dot1); - char ALIGNED_(32) dot2[32] = \ - {32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, - 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1}; - __m256i dot2v = _mm256_load_si256((__m256i*)dot2); - short ALIGNED_(32) dot3[16] = \ - {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - __m256i dot3v = _mm256_load_si256((__m256i*)dot3); - - // We will need to multiply by - char ALIGNED_(32) shift[16] = {5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; - __m128i shiftv = _mm_load_si128((__m128i*)shift); - - while (len >= 32) { - __m256i vs1 = _mm256_load_si256((__m256i*)s1); - __m256i vs2 = _mm256_load_si256((__m256i*)s2); - __m256i vs1_0 = vs1; - - int k = (len < NMAX ? (int)len : NMAX); - k -= k % 32; - len -= k; - - while (k >= 32) { - /* - vs1 = adler + sum(c[i]) - vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] ) - */ - __m256i vbuf = _mm256_loadu_si256((__m256i*)buf); - buf += 32; - k -= 32; - - __m256i v_short_sum1 = _mm256_maddubs_epi16(vbuf, dot1v); // multiply-add, resulting in 8 shorts. 
- __m256i vsum1 = _mm256_madd_epi16(v_short_sum1, dot3v); // sum 8 shorts to 4 int32_t; - __m256i v_short_sum2 = _mm256_maddubs_epi16(vbuf, dot2v); - vs1 = _mm256_add_epi32(vsum1, vs1); - __m256i vsum2 = _mm256_madd_epi16(v_short_sum2, dot3v); - vs1_0 = _mm256_sll_epi32(vs1_0, shiftv); - vsum2 = _mm256_add_epi32(vsum2, vs2); - vs2 = _mm256_add_epi32(vsum2, vs1_0); - vs1_0 = vs1; - } - - // At this point, we have partial sums stored in vs1 and vs2. There are AVX512 instructions that - // would allow us to sum these quickly (VP4DPWSSD). For now, just unpack and move on. - uint32_t ALIGNED_(32) s1_unpack[8]; - uint32_t ALIGNED_(32) s2_unpack[8]; - - _mm256_store_si256((__m256i*)s1_unpack, vs1); - _mm256_store_si256((__m256i*)s2_unpack, vs2); - - adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE) + - (s1_unpack[4] % BASE) + (s1_unpack[5] % BASE) + (s1_unpack[6] % BASE) + (s1_unpack[7] % BASE); - adler %= BASE; - s1[7] = adler; - - sum2 = (s2_unpack[0] % BASE) + (s2_unpack[1] % BASE) + (s2_unpack[2] % BASE) + (s2_unpack[3] % BASE) + - (s2_unpack[4] % BASE) + (s2_unpack[5] % BASE) + (s2_unpack[6] % BASE) + (s2_unpack[7] % BASE); - sum2 %= BASE; - s2[7] = sum2; - } - - while (len) { - len--; - adler += *buf++; - sum2 += adler; - } - adler %= BASE; - sum2 %= BASE; - - /* return recombined sums */ - return adler | (sum2 << 16); -} - -#endif diff --git a/src/zlib-ng/arch/x86/adler32_avx2.c b/src/zlib-ng/arch/x86/adler32_avx2.c new file mode 100644 index 0000000..dcd1166 --- /dev/null +++ b/src/zlib-ng/arch/x86/adler32_avx2.c @@ -0,0 +1,17 @@ +/* adler32_avx2.c -- compute the Adler-32 checksum of a data stream + * Copyright (C) 1995-2011 Mark Adler + * Authors: + * Brian Bockelman <bockelman@gmail.com> + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include <immintrin.h> + +#ifdef X86_AVX2_ADLER32 + +#include "adler32_avx2_tpl.h" + +#define COPY +#include "adler32_avx2_tpl.h" + +#endif diff --git a/src/zlib-ng/arch/x86/adler32_avx2_p.h b/src/zlib-ng/arch/x86/adler32_avx2_p.h new file mode 100644 index 0000000..f7079bf --- /dev/null +++ b/src/zlib-ng/arch/x86/adler32_avx2_p.h @@ -0,0 +1,32 @@ +/* adler32_avx2_p.h -- adler32 avx2 utility functions + * Copyright (C) 2022 Adam Stylinski + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef ADLER32_AVX2_P_H_ +#define ADLER32_AVX2_P_H_ + +#if defined(X86_AVX2_ADLER32) || defined(X86_AVX512VNNI_ADLER32) + +/* 32 bit horizontal sum, adapted from Agner Fog's vector library. */ +static inline uint32_t hsum256(__m256i x) { + __m128i sum1 = _mm_add_epi32(_mm256_extracti128_si256(x, 1), + _mm256_castsi256_si128(x)); + __m128i sum2 = _mm_add_epi32(sum1, _mm_unpackhi_epi64(sum1, sum1)); + __m128i sum3 = _mm_add_epi32(sum2, _mm_shuffle_epi32(sum2, 1)); + return (uint32_t)_mm_cvtsi128_si32(sum3); +} + +static inline uint32_t partial_hsum256(__m256i x) { + /* We need a permutation vector to extract every other integer. 
The + * rest are going to be zeros */ + const __m256i perm_vec = _mm256_setr_epi32(0, 2, 4, 6, 1, 1, 1, 1); + __m256i non_zero = _mm256_permutevar8x32_epi32(x, perm_vec); + __m128i non_zero_sse = _mm256_castsi256_si128(non_zero); + __m128i sum2 = _mm_add_epi32(non_zero_sse,_mm_unpackhi_epi64(non_zero_sse, non_zero_sse)); + __m128i sum3 = _mm_add_epi32(sum2, _mm_shuffle_epi32(sum2, 1)); + return (uint32_t)_mm_cvtsi128_si32(sum3); +} +#endif + +#endif diff --git a/src/zlib-ng/arch/x86/adler32_avx2_tpl.h b/src/zlib-ng/arch/x86/adler32_avx2_tpl.h new file mode 100644 index 0000000..59cacfa --- /dev/null +++ b/src/zlib-ng/arch/x86/adler32_avx2_tpl.h @@ -0,0 +1,140 @@ +/* adler32_avx2_tpl.h -- adler32 avx2 vectorized function templates + * Copyright (C) 2022 Adam Stylinski + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include "../../zbuild.h" +#include <immintrin.h> +#include "../../adler32_fold.h" +#include "../../adler32_p.h" +#include "../../fallback_builtins.h" +#include "adler32_avx2_p.h" + +#ifdef X86_SSE42_ADLER32 +extern uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len); +extern uint32_t adler32_ssse3(uint32_t adler, const uint8_t *src, size_t len); +#define copy_sub32(a, b, c, d) adler32_fold_copy_sse42(a, b, c, d) +#define sub32(a, b, c) adler32_ssse3(a, b, c) +#else +#define copy_sub32(a, b, c, d) adler32_copy_len_16(adler0, c, b, d, adler1) +#define sub32(a, b, c) adler32_len_16(adler0, b, c, adler1) +#endif + +#ifdef COPY +Z_INTERNAL uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) { +#else +Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const uint8_t *src, size_t len) { +#endif + if (src == NULL) return 1L; + if (len == 0) return adler; + + uint32_t adler0, adler1; + adler1 = (adler >> 16) & 0xffff; + adler0 = adler & 0xffff; + +rem_peel: + if (len < 16) { +#ifdef COPY + return adler32_copy_len_16(adler0, src, dst, len, adler1); +#else + return adler32_len_16(adler0, src, len, adler1); +#endif + } else if (len < 32) { +#ifdef COPY + return copy_sub32(adler, dst, src, len); +#else + return sub32(adler, src, len); +#endif + } + + __m256i vs1, vs2; + + const __m256i dot2v = _mm256_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, + 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); + const __m256i dot3v = _mm256_set1_epi16(1); + const __m256i zero = _mm256_setzero_si256(); + + while (len >= 32) { + vs1 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler0)); + vs2 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler1)); + __m256i vs1_0 = vs1; + __m256i vs3 = _mm256_setzero_si256(); + + size_t k = MIN(len, NMAX); + k -= k % 32; + len -= k; + + while (k >= 32) { + /* + vs1 = adler + sum(c[i]) + vs2 = sum2 + 32 vs1 + sum( (32-i+1) c[i] ) + */ + __m256i vbuf = _mm256_loadu_si256((__m256i*)src); + src += 32; + k -= 32; + + __m256i vs1_sad = _mm256_sad_epu8(vbuf, zero); // Sum of abs diff, resulting in 2 x int32's + // +#ifdef COPY + _mm256_storeu_si256((__m256i*)dst, vbuf); + dst += 32; +#endif + vs1 = _mm256_add_epi32(vs1, vs1_sad); + vs3 = _mm256_add_epi32(vs3, vs1_0); + __m256i v_short_sum2 = _mm256_maddubs_epi16(vbuf, dot2v); // sum 32 uint8s to 16 shorts + __m256i vsum2 = _mm256_madd_epi16(v_short_sum2, dot3v); // sum 16 shorts to 8 uint32s + vs2 = _mm256_add_epi32(vsum2, vs2); + vs1_0 = vs1; + } + + /* Defer the multiplication with 32 to outside of the loop */ + vs3 = _mm256_slli_epi32(vs3, 5); + vs2 = _mm256_add_epi32(vs2, vs3); + + 
+        /* The compiler is generating the following sequence for this integer modulus
+         * when done the scalar way, in GPRs:
+
+         adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE) +
+                 (s1_unpack[4] % BASE) + (s1_unpack[5] % BASE) + (s1_unpack[6] % BASE) + (s1_unpack[7] % BASE);
+
+         mov    $0x80078071,%edi      // move magic constant into 32 bit register %edi
+         ...
+         vmovd  %xmm1,%esi            // move vector lane 0 to 32 bit register %esi
+         mov    %rsi,%rax             // zero-extend this value to 64 bit precision in %rax
+         imul   %rdi,%rsi             // do a signed multiplication with magic constant and vector element
+         shr    $0x2f,%rsi            // shift right by 47
+         imul   $0xfff1,%esi,%esi     // do a signed multiplication with value truncated to 32 bits with 0xfff1
+         sub    %esi,%eax             // subtract lower 32 bits of original vector value from modified one above
+         ...
+         // repeats for each element with vpextract instructions
+
+         This is tricky with AVX2 for a number of reasons:
+             1.) There's no 64 bit multiplication instruction, but there is a sequence to get there
+             2.) There are ways to extend vectors to 64 bit precision, but no simple way to truncate
+                 back down to 32 bit precision later (there is in AVX512)
+             3.) Full width integer multiplications aren't cheap
+
+         We can, however, do a relatively cheap sequence for horizontal sums.
+         Then, we simply do the integer modulus on the resulting 64 bit GPR, on a scalar value. It was
+         previously thought that casting to 64 bit precision was needed prior to the horizontal sum, but
+         that is simply not the case, as NMAX is defined as the maximum number of scalar sums that can be
+         performed on the maximum possible inputs before overflow.
+        */
+
+        /* In AVX2-land, this trip through GPRs will probably be unavoidable, as there's no cheap and easy
+         * conversion from 64 bit integer to 32 bit (needed for the inexpensive modulus with a constant).
+         * This casting to 32 bit is cheap through GPRs (just register aliasing). See above for exactly
+         * what the compiler is doing to avoid integer divisions.
*/ + adler0 = partial_hsum256(vs1) % BASE; + adler1 = hsum256(vs2) % BASE; + } + + adler = adler0 | (adler1 << 16); + + if (len) { + goto rem_peel; + } + + return adler; +} diff --git a/src/zlib-ng/arch/x86/adler32_avx512.c b/src/zlib-ng/arch/x86/adler32_avx512.c new file mode 100644 index 0000000..c0bf072 --- /dev/null +++ b/src/zlib-ng/arch/x86/adler32_avx512.c @@ -0,0 +1,16 @@ +/* adler32_avx512.c -- compute the Adler-32 checksum of a data stream + * Copyright (C) 1995-2011 Mark Adler + * Authors: + * Adam Stylinski <kungfujesus06@gmail.com> + * Brian Bockelman <bockelman@gmail.com> + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef X86_AVX512_ADLER32 + +#include "adler32_avx512_tpl.h" + +#define COPY +#include "adler32_avx512_tpl.h" + +#endif diff --git a/src/zlib-ng/arch/x86/adler32_avx512_p.h b/src/zlib-ng/arch/x86/adler32_avx512_p.h new file mode 100644 index 0000000..5b79d2a --- /dev/null +++ b/src/zlib-ng/arch/x86/adler32_avx512_p.h @@ -0,0 +1,46 @@ +#ifndef AVX512_FUNCS_H +#define AVX512_FUNCS_H + +#include <immintrin.h> +#include <stdint.h> +/* Written because *_add_epi32(a) sets off ubsan */ +static inline uint32_t _mm512_reduce_add_epu32(__m512i x) { + __m256i a = _mm512_extracti64x4_epi64(x, 1); + __m256i b = _mm512_extracti64x4_epi64(x, 0); + + __m256i a_plus_b = _mm256_add_epi32(a, b); + __m128i c = _mm256_extracti128_si256(a_plus_b, 1); + __m128i d = _mm256_extracti128_si256(a_plus_b, 0); + __m128i c_plus_d = _mm_add_epi32(c, d); + + __m128i sum1 = _mm_unpackhi_epi64(c_plus_d, c_plus_d); + __m128i sum2 = _mm_add_epi32(sum1, c_plus_d); + __m128i sum3 = _mm_shuffle_epi32(sum2, 0x01); + __m128i sum4 = _mm_add_epi32(sum2, sum3); + + return _mm_cvtsi128_si32(sum4); +} + +static inline uint32_t partial_hsum(__m512i x) { + /* We need a permutation vector to extract every other integer. The + * rest are going to be zeros. Marking this const so the compiler stands + * a better chance of keeping this resident in a register through entire + * loop execution. We certainly have enough zmm registers (32) */ + const __m512i perm_vec = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, + 1, 1, 1, 1, 1, 1, 1, 1); + + __m512i non_zero = _mm512_permutexvar_epi32(perm_vec, x); + + /* From here, it's a simple 256 bit wide reduction sum */ + __m256i non_zero_avx = _mm512_castsi512_si256(non_zero); + + /* See Agner Fog's vectorclass for a decent reference. 
Essentially, phadd is + * pretty slow, much slower than the longer instruction sequence below */ + __m128i sum1 = _mm_add_epi32(_mm256_extracti128_si256(non_zero_avx, 1), + _mm256_castsi256_si128(non_zero_avx)); + __m128i sum2 = _mm_add_epi32(sum1,_mm_unpackhi_epi64(sum1, sum1)); + __m128i sum3 = _mm_add_epi32(sum2,_mm_shuffle_epi32(sum2, 1)); + return (uint32_t)_mm_cvtsi128_si32(sum3); +} + +#endif diff --git a/src/zlib-ng/arch/x86/adler32_avx512_tpl.h b/src/zlib-ng/arch/x86/adler32_avx512_tpl.h new file mode 100644 index 0000000..d324ce9 --- /dev/null +++ b/src/zlib-ng/arch/x86/adler32_avx512_tpl.h @@ -0,0 +1,106 @@ +/* adler32_avx512_tpl.h -- adler32 avx512 vectorized function templates + * Copyright (C) 2022 Adam Stylinski + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include "../../zbuild.h" +#include "../../adler32_p.h" +#include "../../adler32_fold.h" +#include "../../cpu_features.h" +#include "../../fallback_builtins.h" +#include <immintrin.h> +#include "adler32_avx512_p.h" + +#ifdef X86_AVX512_ADLER32 + +#ifdef COPY +Z_INTERNAL uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) { +#else +Z_INTERNAL uint32_t adler32_avx512(uint32_t adler, const uint8_t *src, size_t len) { +#endif + + if (src == NULL) return 1L; + if (len == 0) return adler; + + uint32_t adler0, adler1; + adler1 = (adler >> 16) & 0xffff; + adler0 = adler & 0xffff; + +rem_peel: + if (len < 64) { + /* This handles the remaining copies, just call normal adler checksum after this */ +#ifdef COPY + __mmask64 storemask = (0xFFFFFFFFFFFFFFFFUL >> (64 - len)); + __m512i copy_vec = _mm512_maskz_loadu_epi8(storemask, src); + _mm512_mask_storeu_epi8(dst, storemask, copy_vec); +#endif + +#ifdef X86_AVX2_ADLER32 + return adler32_avx2(adler, src, len); +#elif defined(X86_SSSE3_ADLER32) + return adler32_ssse3(adler, src, len); +#else + return adler32_len_16(adler0, src, len, adler1); +#endif + } + + __m512i vbuf, vs1_0, vs3; + + const __m512i dot2v = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, + 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64); + const __m512i dot3v = _mm512_set1_epi16(1); + const __m512i zero = _mm512_setzero_si512(); + size_t k; + + while (len >= 64) { + __m512i vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler0)); + __m512i vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler1)); + vs1_0 = vs1; + vs3 = _mm512_setzero_si512(); + + k = MIN(len, NMAX); + k -= k % 64; + len -= k; + + while (k >= 64) { + /* + vs1 = adler + sum(c[i]) + vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] ) + */ + vbuf = _mm512_loadu_si512(src); +#ifdef COPY + _mm512_storeu_si512(dst, vbuf); + dst += 64; +#endif + src += 64; + k -= 64; + + __m512i vs1_sad = _mm512_sad_epu8(vbuf, zero); + __m512i v_short_sum2 = _mm512_maddubs_epi16(vbuf, dot2v); + vs1 = _mm512_add_epi32(vs1_sad, vs1); + vs3 = _mm512_add_epi32(vs3, vs1_0); + __m512i vsum2 = _mm512_madd_epi16(v_short_sum2, dot3v); + vs2 = _mm512_add_epi32(vsum2, vs2); + vs1_0 = vs1; + } + + vs3 = _mm512_slli_epi32(vs3, 6); + vs2 = _mm512_add_epi32(vs2, vs3); + + adler0 = partial_hsum(vs1) % BASE; + adler1 = _mm512_reduce_add_epu32(vs2) % BASE; + } + + adler = adler0 | (adler1 << 16); + + /* Process tail (len < 64). 
*/ + if (len) { + goto rem_peel; + } + + return adler; +} + +#endif diff --git a/src/zlib-ng/arch/x86/adler32_avx512_vnni.c b/src/zlib-ng/arch/x86/adler32_avx512_vnni.c new file mode 100644 index 0000000..330bfe3 --- /dev/null +++ b/src/zlib-ng/arch/x86/adler32_avx512_vnni.c @@ -0,0 +1,225 @@ +/* adler32_avx512_vnni.c -- compute the Adler-32 checksum of a data stream + * Based on Brian Bockelman's AVX2 version + * Copyright (C) 1995-2011 Mark Adler + * Authors: + * Adam Stylinski <kungfujesus06@gmail.com> + * Brian Bockelman <bockelman@gmail.com> + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef X86_AVX512VNNI_ADLER32 + +#include "../../zbuild.h" +#include "../../adler32_p.h" +#include "../../cpu_features.h" +#include "../../fallback_builtins.h" +#include <immintrin.h> +#include "../../adler32_fold.h" +#include "adler32_avx512_p.h" +#include "adler32_avx2_p.h" + +Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *src, size_t len) { + if (src == NULL) return 1L; + if (len == 0) return adler; + + uint32_t adler0, adler1; + adler1 = (adler >> 16) & 0xffff; + adler0 = adler & 0xffff; + +rem_peel: + if (len < 32) +#if defined(X86_SSSE3_ADLER32) + return adler32_ssse3(adler, src, len); +#else + return adler32_len_16(adler0, src, len, adler1); +#endif + + if (len < 64) +#ifdef X86_AVX2_ADLER32 + return adler32_avx2(adler, src, len); +#elif defined(X86_SSE3_ADLER32) + return adler32_ssse3(adler, src, len); +#else + return adler32_len_16(adler0, src, len, adler1); +#endif + + const __m512i dot2v = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, + 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, 64); + + const __m512i zero = _mm512_setzero_si512(); + __m512i vs1, vs2; + + while (len >= 64) { + vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler0)); + vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler1)); + size_t k = MIN(len, NMAX); + k -= k % 64; + len -= k; + __m512i vs1_0 = vs1; + __m512i vs3 = _mm512_setzero_si512(); + /* We might get a tad bit more ILP here if we sum to a second register in the loop */ + __m512i vs2_1 = _mm512_setzero_si512(); + __m512i vbuf0, vbuf1; + + /* Remainder peeling */ + if (k % 128) { + vbuf1 = _mm512_loadu_si512((__m512i*)src); + + src += 64; + k -= 64; + + __m512i vs1_sad = _mm512_sad_epu8(vbuf1, zero); + vs1 = _mm512_add_epi32(vs1, vs1_sad); + vs3 = _mm512_add_epi32(vs3, vs1_0); + vs2 = _mm512_dpbusd_epi32(vs2, vbuf1, dot2v); + vs1_0 = vs1; + } + + /* Manually unrolled this loop by 2 for an decent amount of ILP */ + while (k >= 128) { + /* + vs1 = adler + sum(c[i]) + vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] ) + */ + vbuf0 = _mm512_loadu_si512((__m512i*)src); + vbuf1 = _mm512_loadu_si512((__m512i*)(src + 64)); + src += 128; + k -= 128; + + __m512i vs1_sad = _mm512_sad_epu8(vbuf0, zero); + vs1 = _mm512_add_epi32(vs1, vs1_sad); + vs3 = _mm512_add_epi32(vs3, vs1_0); + /* multiply-add, resulting in 16 ints. 
Fuse with sum stage from prior versions, as we now have the dp + * instructions to eliminate them */ + vs2 = _mm512_dpbusd_epi32(vs2, vbuf0, dot2v); + + vs3 = _mm512_add_epi32(vs3, vs1); + vs1_sad = _mm512_sad_epu8(vbuf1, zero); + vs1 = _mm512_add_epi32(vs1, vs1_sad); + vs2_1 = _mm512_dpbusd_epi32(vs2_1, vbuf1, dot2v); + vs1_0 = vs1; + } + + vs3 = _mm512_slli_epi32(vs3, 6); + vs2 = _mm512_add_epi32(vs2, vs3); + vs2 = _mm512_add_epi32(vs2, vs2_1); + + adler0 = partial_hsum(vs1) % BASE; + adler1 = _mm512_reduce_add_epu32(vs2) % BASE; + } + + adler = adler0 | (adler1 << 16); + + /* Process tail (len < 64). */ + if (len) { + goto rem_peel; + } + + return adler; +} + +Z_INTERNAL uint32_t adler32_fold_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) { + if (src == NULL) return 1L; + if (len == 0) return adler; + + uint32_t adler0, adler1; + adler1 = (adler >> 16) & 0xffff; + adler0 = adler & 0xffff; + +rem_peel_copy: + if (len < 32) { + /* This handles the remaining copies, just call normal adler checksum after this */ + __mmask32 storemask = (0xFFFFFFFFUL >> (32 - len)); + __m256i copy_vec = _mm256_maskz_loadu_epi8(storemask, src); + _mm256_mask_storeu_epi8(dst, storemask, copy_vec); + +#if defined(X86_SSSE3_ADLER32) + return adler32_ssse3(adler, src, len); +#else + return adler32_len_16(adler0, src, len, adler1); +#endif + } + + const __m256i dot2v = _mm256_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32); + + const __m256i zero = _mm256_setzero_si256(); + __m256i vs1, vs2; + + while (len >= 32) { + vs1 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler0)); + vs2 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler1)); + size_t k = MIN(len, NMAX); + k -= k % 32; + len -= k; + __m256i vs1_0 = vs1; + __m256i vs3 = _mm256_setzero_si256(); + /* We might get a tad bit more ILP here if we sum to a second register in the loop */ + __m256i vs2_1 = _mm256_setzero_si256(); + __m256i vbuf0, vbuf1; + + /* Remainder peeling */ + if (k % 64) { + vbuf1 = _mm256_loadu_si256((__m256i*)src); + _mm256_storeu_si256((__m256i*)dst, vbuf1); + dst += 32; + + src += 32; + k -= 32; + + __m256i vs1_sad = _mm256_sad_epu8(vbuf1, zero); + vs1 = _mm256_add_epi32(vs1, vs1_sad); + vs3 = _mm256_add_epi32(vs3, vs1_0); + vs2 = _mm256_dpbusd_epi32(vs2, vbuf1, dot2v); + vs1_0 = vs1; + } + + /* Manually unrolled this loop by 2 for an decent amount of ILP */ + while (k >= 64) { + /* + vs1 = adler + sum(c[i]) + vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] ) + */ + vbuf0 = _mm256_loadu_si256((__m256i*)src); + vbuf1 = _mm256_loadu_si256((__m256i*)(src + 32)); + _mm256_storeu_si256((__m256i*)dst, vbuf0); + _mm256_storeu_si256((__m256i*)(dst + 32), vbuf1); + dst += 64; + src += 64; + k -= 64; + + __m256i vs1_sad = _mm256_sad_epu8(vbuf0, zero); + vs1 = _mm256_add_epi32(vs1, vs1_sad); + vs3 = _mm256_add_epi32(vs3, vs1_0); + /* multiply-add, resulting in 16 ints. 
Fuse with sum stage from prior versions, as we now have the dp + * instructions to eliminate them */ + vs2 = _mm256_dpbusd_epi32(vs2, vbuf0, dot2v); + + vs3 = _mm256_add_epi32(vs3, vs1); + vs1_sad = _mm256_sad_epu8(vbuf1, zero); + vs1 = _mm256_add_epi32(vs1, vs1_sad); + vs2_1 = _mm256_dpbusd_epi32(vs2_1, vbuf1, dot2v); + vs1_0 = vs1; + } + + vs3 = _mm256_slli_epi32(vs3, 5); + vs2 = _mm256_add_epi32(vs2, vs3); + vs2 = _mm256_add_epi32(vs2, vs2_1); + + adler0 = partial_hsum256(vs1) % BASE; + adler1 = hsum256(vs2) % BASE; + } + + adler = adler0 | (adler1 << 16); + + /* Process tail (len < 64). */ + if (len) { + goto rem_peel_copy; + } + + return adler; +} + +#endif diff --git a/src/zlib-ng/arch/x86/adler32_sse42.c b/src/zlib-ng/arch/x86/adler32_sse42.c new file mode 100644 index 0000000..92efe4d --- /dev/null +++ b/src/zlib-ng/arch/x86/adler32_sse42.c @@ -0,0 +1,121 @@ +/* adler32_sse42.c -- compute the Adler-32 checksum of a data stream + * Copyright (C) 1995-2011 Mark Adler + * Authors: + * Adam Stylinski <kungfujesus06@gmail.com> + * Brian Bockelman <bockelman@gmail.com> + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include "../../zbuild.h" +#include "../../adler32_p.h" +#include "../../adler32_fold.h" +#include "adler32_ssse3_p.h" +#include <immintrin.h> + +#ifdef X86_SSE42_ADLER32 + +Z_INTERNAL uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) { + uint32_t adler0, adler1; + adler1 = (adler >> 16) & 0xffff; + adler0 = adler & 0xffff; + +rem_peel: + if (len < 16) { + return adler32_copy_len_16(adler0, src, dst, len, adler1); + } + + __m128i vbuf, vbuf_0; + __m128i vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0, + v_sad_sum2, vsum2, vsum2_0; + __m128i zero = _mm_setzero_si128(); + const __m128i dot2v = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17); + const __m128i dot2v_0 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); + const __m128i dot3v = _mm_set1_epi16(1); + size_t k; + + while (len >= 16) { + + k = MIN(len, NMAX); + k -= k % 16; + len -= k; + + vs1 = _mm_cvtsi32_si128(adler0); + vs2 = _mm_cvtsi32_si128(adler1); + + vs3 = _mm_setzero_si128(); + vs2_0 = _mm_setzero_si128(); + vs1_0 = vs1; + + while (k >= 32) { + /* + vs1 = adler + sum(c[i]) + vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] ) + */ + vbuf = _mm_loadu_si128((__m128i*)src); + vbuf_0 = _mm_loadu_si128((__m128i*)(src + 16)); + src += 32; + k -= 32; + + v_sad_sum1 = _mm_sad_epu8(vbuf, zero); + v_sad_sum2 = _mm_sad_epu8(vbuf_0, zero); + _mm_storeu_si128((__m128i*)dst, vbuf); + _mm_storeu_si128((__m128i*)(dst + 16), vbuf_0); + dst += 32; + + v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v); + v_short_sum2_0 = _mm_maddubs_epi16(vbuf_0, dot2v_0); + + vs1 = _mm_add_epi32(v_sad_sum1, vs1); + vs3 = _mm_add_epi32(vs1_0, vs3); + + vsum2 = _mm_madd_epi16(v_short_sum2, dot3v); + vsum2_0 = _mm_madd_epi16(v_short_sum2_0, dot3v); + vs1 = _mm_add_epi32(v_sad_sum2, vs1); + vs2 = _mm_add_epi32(vsum2, vs2); + vs2_0 = _mm_add_epi32(vsum2_0, vs2_0); + vs1_0 = vs1; + } + + vs2 = _mm_add_epi32(vs2_0, vs2); + vs3 = _mm_slli_epi32(vs3, 5); + vs2 = _mm_add_epi32(vs3, vs2); + vs3 = _mm_setzero_si128(); + + while (k >= 16) { + /* + vs1 = adler + sum(c[i]) + vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] ) + */ + vbuf = _mm_loadu_si128((__m128i*)src); + src += 16; + k -= 16; + + v_sad_sum1 = _mm_sad_epu8(vbuf, zero); + v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v_0); + + vs1 = _mm_add_epi32(v_sad_sum1, vs1); + 
vs3 = _mm_add_epi32(vs1_0, vs3); + vsum2 = _mm_madd_epi16(v_short_sum2, dot3v); + vs2 = _mm_add_epi32(vsum2, vs2); + vs1_0 = vs1; + + _mm_storeu_si128((__m128i*)dst, vbuf); + dst += 16; + } + + vs3 = _mm_slli_epi32(vs3, 4); + vs2 = _mm_add_epi32(vs2, vs3); + + adler0 = partial_hsum(vs1) % BASE; + adler1 = hsum(vs2) % BASE; + } + + /* If this is true, there's fewer than 16 elements remaining */ + if (len) { + goto rem_peel; + } + + return adler0 | (adler1 << 16); +} + +#endif diff --git a/src/zlib-ng/arch/x86/adler32_ssse3.c b/src/zlib-ng/arch/x86/adler32_ssse3.c index 101df4f..8c55bad 100644 --- a/src/zlib-ng/arch/x86/adler32_ssse3.c +++ b/src/zlib-ng/arch/x86/adler32_ssse3.c @@ -1,14 +1,14 @@ -/* adler32.c -- compute the Adler-32 checksum of a data stream +/* adler32_ssse3.c -- compute the Adler-32 checksum of a data stream * Copyright (C) 1995-2011 Mark Adler * Authors: + * Adam Stylinski <kungfujesus06@gmail.com> * Brian Bockelman <bockelman@gmail.com> * For conditions of distribution and use, see copyright notice in zlib.h */ #include "../../zbuild.h" -#include "../../zutil.h" - #include "../../adler32_p.h" +#include "adler32_ssse3_p.h" #ifdef X86_SSSE3_ADLER32 @@ -33,86 +33,124 @@ Z_INTERNAL uint32_t adler32_ssse3(uint32_t adler, const unsigned char *buf, size if (UNLIKELY(len < 16)) return adler32_len_16(adler, buf, len, sum2); - uint32_t ALIGNED_(16) s1[4], s2[4]; - - s1[0] = s1[1] = s1[2] = 0; s1[3] = adler; - s2[0] = s2[1] = s2[2] = 0; s2[3] = sum2; - - char ALIGNED_(16) dot1[16] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - __m128i dot1v = _mm_load_si128((__m128i*)dot1); - char ALIGNED_(16) dot2[16] = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1}; - __m128i dot2v = _mm_load_si128((__m128i*)dot2); - short ALIGNED_(16) dot3[8] = {1, 1, 1, 1, 1, 1, 1, 1}; - __m128i dot3v = _mm_load_si128((__m128i*)dot3); - - // We will need to multiply by - //char ALIGNED_(16) shift[4] = {0, 0, 0, 4}; //{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4}; + const __m128i dot2v = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17); + const __m128i dot2v_0 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1); + const __m128i dot3v = _mm_set1_epi16(1); + const __m128i zero = _mm_setzero_si128(); + + __m128i vbuf, vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0, + vbuf_0, v_sad_sum2, vsum2, vsum2_0; + + /* If our buffer is unaligned (likely), make the determination whether + * or not there's enough of a buffer to consume to make the scalar, aligning + * additions worthwhile or if it's worth it to just eat the cost of an unaligned + * load. This is a pretty simple test, just test if 16 - the remainder + len is + * < 16 */ + size_t max_iters = NMAX; + size_t rem = (uintptr_t)buf & 15; + size_t align_offset = 16 - rem; + size_t k = 0; + if (rem) { + if (len < 16 + align_offset) { + /* Let's eat the cost of this one unaligned load so that + * we don't completely skip over the vectorization. 
Doing + * 16 bytes at a time unaligned is is better than 16 + <= 15 + * sums */ + vbuf = _mm_loadu_si128((__m128i*)buf); + len -= 16; + buf += 16; + vs1 = _mm_cvtsi32_si128(adler); + vs2 = _mm_cvtsi32_si128(sum2); + vs3 = _mm_setzero_si128(); + vs1_0 = vs1; + goto unaligned_jmp; + } + + for (size_t i = 0; i < align_offset; ++i) { + adler += *(buf++); + sum2 += adler; + } + + /* lop off the max number of sums based on the scalar sums done + * above */ + len -= align_offset; + max_iters -= align_offset; + } - char ALIGNED_(16) shift[16] = {4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; - __m128i shiftv = _mm_load_si128((__m128i*)shift); while (len >= 16) { - __m128i vs1 = _mm_load_si128((__m128i*)s1); - __m128i vs2 = _mm_load_si128((__m128i*)s2); - __m128i vs1_0 = vs1; - - int k = (len < NMAX ? (int)len : NMAX); - k -= k % 16; - len -= k; - - while (k >= 16) { - /* - vs1 = adler + sum(c[i]) - vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] ) - - NOTE: 256-bit equivalents are: - _mm256_maddubs_epi16 <- operates on 32 bytes to 16 shorts - _mm256_madd_epi16 <- Sums 16 shorts to 8 int32_t. - We could rewrite the below to use 256-bit instructions instead of 128-bit. - */ - __m128i vbuf = _mm_loadu_si128((__m128i*)buf); - buf += 16; - k -= 16; - - __m128i v_short_sum1 = _mm_maddubs_epi16(vbuf, dot1v); // multiply-add, resulting in 8 shorts. - __m128i vsum1 = _mm_madd_epi16(v_short_sum1, dot3v); // sum 8 shorts to 4 int32_t; - __m128i v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v); - vs1 = _mm_add_epi32(vsum1, vs1); - __m128i vsum2 = _mm_madd_epi16(v_short_sum2, dot3v); - vs1_0 = _mm_sll_epi32(vs1_0, shiftv); - vsum2 = _mm_add_epi32(vsum2, vs2); - vs2 = _mm_add_epi32(vsum2, vs1_0); - vs1_0 = vs1; - } - - // At this point, we have partial sums stored in vs1 and vs2. There are AVX512 instructions that - // would allow us to sum these quickly (VP4DPWSSD). For now, just unpack and move on. - - uint32_t ALIGNED_(16) s1_unpack[4]; - uint32_t ALIGNED_(16) s2_unpack[4]; - - _mm_store_si128((__m128i*)s1_unpack, vs1); - _mm_store_si128((__m128i*)s2_unpack, vs2); - - adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE); - adler %= BASE; - s1[3] = adler; - - sum2 = (s2_unpack[0] % BASE) + (s2_unpack[1] % BASE) + (s2_unpack[2] % BASE) + (s2_unpack[3] % BASE); - sum2 %= BASE; - s2[3] = sum2; - } - - while (len) { - len--; - adler += *buf++; - sum2 += adler; + vs1 = _mm_cvtsi32_si128(adler); + vs2 = _mm_cvtsi32_si128(sum2); + vs3 = _mm_setzero_si128(); + vs2_0 = _mm_setzero_si128(); + vs1_0 = vs1; + + k = (len < max_iters ? 
len : max_iters); + k -= k % 16; + len -= k; + + while (k >= 32) { + /* + vs1 = adler + sum(c[i]) + vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] ) + */ + vbuf = _mm_load_si128((__m128i*)buf); + vbuf_0 = _mm_load_si128((__m128i*)(buf + 16)); + buf += 32; + k -= 32; + + v_sad_sum1 = _mm_sad_epu8(vbuf, zero); + v_sad_sum2 = _mm_sad_epu8(vbuf_0, zero); + vs1 = _mm_add_epi32(v_sad_sum1, vs1); + vs3 = _mm_add_epi32(vs1_0, vs3); + + vs1 = _mm_add_epi32(v_sad_sum2, vs1); + v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v); + vsum2 = _mm_madd_epi16(v_short_sum2, dot3v); + v_short_sum2_0 = _mm_maddubs_epi16(vbuf_0, dot2v_0); + vs2 = _mm_add_epi32(vsum2, vs2); + vsum2_0 = _mm_madd_epi16(v_short_sum2_0, dot3v); + vs2_0 = _mm_add_epi32(vsum2_0, vs2_0); + vs1_0 = vs1; + } + + vs2 = _mm_add_epi32(vs2_0, vs2); + vs3 = _mm_slli_epi32(vs3, 5); + vs2 = _mm_add_epi32(vs3, vs2); + vs3 = _mm_setzero_si128(); + + while (k >= 16) { + /* + vs1 = adler + sum(c[i]) + vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] ) + */ + vbuf = _mm_load_si128((__m128i*)buf); + buf += 16; + k -= 16; + +unaligned_jmp: + v_sad_sum1 = _mm_sad_epu8(vbuf, zero); + vs1 = _mm_add_epi32(v_sad_sum1, vs1); + vs3 = _mm_add_epi32(vs1_0, vs3); + v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v_0); + vsum2 = _mm_madd_epi16(v_short_sum2, dot3v); + vs2 = _mm_add_epi32(vsum2, vs2); + vs1_0 = vs1; + } + + vs3 = _mm_slli_epi32(vs3, 4); + vs2 = _mm_add_epi32(vs2, vs3); + + /* We don't actually need to do a full horizontal sum, since psadbw is actually doing + * a partial reduction sum implicitly and only summing to integers in vector positions + * 0 and 2. This saves us some contention on the shuffle port(s) */ + adler = partial_hsum(vs1) % BASE; + sum2 = hsum(vs2) % BASE; + max_iters = NMAX; } - adler %= BASE; - sum2 %= BASE; - /* return recombined sums */ - return adler | (sum2 << 16); + /* Process tail (len < 16). */ + return adler32_len_16(adler, buf, len, sum2); } #endif diff --git a/src/zlib-ng/arch/x86/adler32_ssse3_p.h b/src/zlib-ng/arch/x86/adler32_ssse3_p.h new file mode 100644 index 0000000..ba914e1 --- /dev/null +++ b/src/zlib-ng/arch/x86/adler32_ssse3_p.h @@ -0,0 +1,29 @@ +/* adler32_ssse3_p.h -- adler32 ssse3 utility functions + * Copyright (C) 2022 Adam Stylinski + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef ADLER32_SSSE3_P_H_ +#define ADLER32_SSSE3_P_H_ + +#ifdef X86_SSSE3_ADLER32 + +#include <immintrin.h> +#include <stdint.h> + +static inline uint32_t partial_hsum(__m128i x) { + __m128i second_int = _mm_bsrli_si128(x, 8); + __m128i sum = _mm_add_epi32(x, second_int); + return _mm_cvtsi128_si32(sum); +} + +static inline uint32_t hsum(__m128i x) { + __m128i sum1 = _mm_unpackhi_epi64(x, x); + __m128i sum2 = _mm_add_epi32(x, sum1); + __m128i sum3 = _mm_shuffle_epi32(sum2, 0x01); + __m128i sum4 = _mm_add_epi32(sum2, sum3); + return _mm_cvtsi128_si32(sum4); +} +#endif + +#endif diff --git a/src/zlib-ng/arch/x86/chunk_permute_table.h b/src/zlib-ng/arch/x86/chunk_permute_table.h new file mode 100644 index 0000000..c7b2d2d --- /dev/null +++ b/src/zlib-ng/arch/x86/chunk_permute_table.h @@ -0,0 +1,53 @@ +/* chunk_permute_table.h - shared AVX/SSE4 permutation table for use with chunkmemset family of functions. 
+ * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifndef CHUNK_PERMUTE_TABLE_H_ +#define CHUNK_PERMUTE_TABLE_H_ + +#include "zbuild.h" + +/* Need entries for all numbers not an even modulus for 1, 2, 4, 8, 16 & 32 */ +static const ALIGNED_(32) uint8_t permute_table[26*32] = { + 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, /* dist 3 */ + 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, /* dist 5 */ + 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, /* dist 6 */ + 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, /* dist 7 */ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, /* dist 9 */ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, /* dist 10 */ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, /* dist 11 */ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, /* dist 12 */ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2, 3, 4, 5, /* dist 13 */ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 1, 2, 3, /* dist 14 */ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, 1, /* dist 15 */ + + /* Beyond dists of 15 means we have to permute from a vector > len(m128i). Because AVX couldn't permute + * beyond 128 bit lanes until AVX512 for sub 4-byte sequences, we have to do some math here for an eventual + * blend with a comparison. That means we need to wrap the indices with yet another derived table. For simplicity, + * we'll use absolute indexing here to derive a blend vector. This is actually a lot simpler with ARM's TBL, but, + * this is what we're dealt. 
+ */ + + 16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, /* dist 17 */ + 16, 17, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, /* dist 18 */ + 16, 17, 18, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, /* dist 19 */ + 16, 17, 18, 19, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, /* dist 20 */ + 16, 17, 18, 19, 20, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, /* dist 21 */ + 16, 17, 18, 19, 20, 21, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, /* dist 22 */ + 16, 17, 18, 19, 20, 21, 22, 0, 1, 2, 3, 4, 5, 6, 7, 8, /* dist 23 */ + 16, 17, 18, 19, 20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6, 7, /* dist 24 */ + 16, 17, 18, 19, 20, 21, 22, 23, 24, 0, 1, 2, 3, 4, 5, 6, /* dist 25 */ + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 0, 1, 2, 3, 4, 5, /* dist 26 */ + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 0, 1, 2, 3, 4, /* dist 27 */ + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 0, 1, 2, 3, /* dist 28 */ + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 0, 1, 2, /* dist 29 */ + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 0, 1, /* dist 30 */ + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 0, /* dist 31 */ +}; + +typedef struct lut_rem_pair_s { + uint16_t idx; + uint16_t remval; +} lut_rem_pair; + +#endif diff --git a/src/zlib-ng/arch/x86/chunkset_avx.c b/src/zlib-ng/arch/x86/chunkset_avx.c index eb76c0d..91aaa45 100644 --- a/src/zlib-ng/arch/x86/chunkset_avx.c +++ b/src/zlib-ng/arch/x86/chunkset_avx.c @@ -2,32 +2,70 @@ * For conditions of distribution and use, see copyright notice in zlib.h */ #include "zbuild.h" -#include "zutil.h" #ifdef X86_AVX_CHUNKSET #include <immintrin.h> +#include "chunk_permute_table.h" typedef __m256i chunk_t; -#define HAVE_CHUNKMEMSET_1 +#define CHUNK_SIZE 32 + #define HAVE_CHUNKMEMSET_2 #define HAVE_CHUNKMEMSET_4 #define HAVE_CHUNKMEMSET_8 +#define HAVE_CHUNK_MAG -static inline void chunkmemset_1(uint8_t *from, chunk_t *chunk) { - *chunk = _mm256_set1_epi8(*(int8_t *)from); -} +/* Populate don't cares so that this is a direct lookup (with some indirection into the permute table), because dist can + * never be 0 - 2, we'll start with an offset, subtracting 3 from the input */ +static const lut_rem_pair perm_idx_lut[29] = { + { 0, 2}, /* 3 */ + { 0, 0}, /* don't care */ + { 1 * 32, 2}, /* 5 */ + { 2 * 32, 2}, /* 6 */ + { 3 * 32, 4}, /* 7 */ + { 0 * 32, 0}, /* don't care */ + { 4 * 32, 5}, /* 9 */ + { 5 * 32, 22}, /* 10 */ + { 6 * 32, 21}, /* 11 */ + { 7 * 32, 20}, /* 12 */ + { 8 * 32, 6}, /* 13 */ + { 9 * 32, 4}, /* 14 */ + {10 * 32, 2}, /* 15 */ + { 0 * 32, 0}, /* don't care */ + {11 * 32, 15}, /* 17 */ + {11 * 32 + 16, 14}, /* 18 */ + {11 * 32 + 16 * 2, 13}, /* 19 */ + {11 * 32 + 16 * 3, 12}, /* 20 */ + {11 * 32 + 16 * 4, 11}, /* 21 */ + {11 * 32 + 16 * 5, 10}, /* 22 */ + {11 * 32 + 16 * 6, 9}, /* 23 */ + {11 * 32 + 16 * 7, 8}, /* 24 */ + {11 * 32 + 16 * 8, 7}, /* 25 */ + {11 * 32 + 16 * 9, 6}, /* 26 */ + {11 * 32 + 16 * 10, 5}, /* 27 */ + {11 * 32 + 16 * 11, 4}, /* 28 */ + {11 * 32 + 16 * 12, 3}, /* 29 */ + {11 * 32 + 16 * 13, 2}, /* 30 */ + {11 * 32 + 16 * 14, 1} /* 31 */ +}; static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) { - *chunk = _mm256_set1_epi16(*(int16_t *)from); + int16_t tmp; + zmemcpy_2(&tmp, from); + *chunk = _mm256_set1_epi16(tmp); } static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) { - *chunk = _mm256_set1_epi32(*(int32_t *)from); + int32_t tmp; + zmemcpy_4(&tmp, from); + *chunk = _mm256_set1_epi32(tmp); } static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) { - *chunk = _mm256_set1_epi64x(*(int64_t *)from); + int64_t tmp; + 
zmemcpy_8(&tmp, from); + *chunk = _mm256_set1_epi64x(tmp); } static inline void loadchunk(uint8_t const *s, chunk_t *chunk) { @@ -38,6 +76,50 @@ static inline void storechunk(uint8_t *out, chunk_t *chunk) { _mm256_storeu_si256((__m256i *)out, *chunk); } +static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) { + lut_rem_pair lut_rem = perm_idx_lut[dist - 3]; + __m256i ret_vec; + /* While technically we only need to read 4 or 8 bytes into this vector register for a lot of cases, GCC is + * compiling this to a shared load for all branches, preferring the simpler code. Given that the buf value isn't in + * GPRs to begin with the 256 bit load is _probably_ just as inexpensive */ + *chunk_rem = lut_rem.remval; + +#ifdef Z_MEMORY_SANITIZER + /* See note in chunkset_sse4.c for why this is ok */ + __msan_unpoison(buf + dist, 32 - dist); +#endif + + if (dist < 16) { + /* This simpler case still requires us to shuffle in 128 bit lanes, so we must apply a static offset after + * broadcasting the first vector register to both halves. This is _marginally_ faster than doing two separate + * shuffles and combining the halves later */ + const __m256i permute_xform = + _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16); + __m256i perm_vec = _mm256_load_si256((__m256i*)(permute_table+lut_rem.idx)); + __m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf); + perm_vec = _mm256_add_epi8(perm_vec, permute_xform); + ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1); + ret_vec = _mm256_shuffle_epi8(ret_vec, perm_vec); + } else if (dist == 16) { + __m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf); + return _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1); + } else { + __m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf); + __m128i ret_vec1 = _mm_loadu_si128((__m128i*)(buf + 16)); + /* Take advantage of the fact that only the latter half of the 256 bit vector will actually differ */ + __m128i perm_vec1 = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx)); + __m128i xlane_permutes = _mm_cmpgt_epi8(_mm_set1_epi8(16), perm_vec1); + __m128i xlane_res = _mm_shuffle_epi8(ret_vec0, perm_vec1); + /* Since we can't wrap twice, we can simply keep the later half exactly how it is instead of having to _also_ + * shuffle those values */ + __m128i latter_half = _mm_blendv_epi8(ret_vec1, xlane_res, xlane_permutes); + ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), latter_half, 1); + } + + return ret_vec; +} + #define CHUNKSIZE chunksize_avx #define CHUNKCOPY chunkcopy_avx #define CHUNKCOPY_SAFE chunkcopy_safe_avx diff --git a/src/zlib-ng/arch/x86/chunkset_sse.c b/src/zlib-ng/arch/x86/chunkset_sse2.c index 1d5a0fa..be195cf 100644 --- a/src/zlib-ng/arch/x86/chunkset_sse.c +++ b/src/zlib-ng/arch/x86/chunkset_sse2.c @@ -1,34 +1,36 @@ -/* chunkset_sse.c -- SSE inline functions to copy small data chunks. +/* chunkset_sse2.c -- SSE2 inline functions to copy small data chunks. 
* For conditions of distribution and use, see copyright notice in zlib.h */ #include "zbuild.h" -#include "zutil.h" #ifdef X86_SSE2 #include <immintrin.h> typedef __m128i chunk_t; -#define HAVE_CHUNKMEMSET_1 +#define CHUNK_SIZE 16 + #define HAVE_CHUNKMEMSET_2 #define HAVE_CHUNKMEMSET_4 #define HAVE_CHUNKMEMSET_8 -static inline void chunkmemset_1(uint8_t *from, chunk_t *chunk) { - *chunk = _mm_set1_epi8(*(int8_t *)from); -} - static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) { - *chunk = _mm_set1_epi16(*(int16_t *)from); + int16_t tmp; + zmemcpy_2(&tmp, from); + *chunk = _mm_set1_epi16(tmp); } static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) { - *chunk = _mm_set1_epi32(*(int32_t *)from); + int32_t tmp; + zmemcpy_4(&tmp, from); + *chunk = _mm_set1_epi32(tmp); } static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) { - *chunk = _mm_set1_epi64x(*(int64_t *)from); + int64_t tmp; + zmemcpy_8(&tmp, from); + *chunk = _mm_set1_epi64x(tmp); } static inline void loadchunk(uint8_t const *s, chunk_t *chunk) { diff --git a/src/zlib-ng/arch/x86/chunkset_sse41.c b/src/zlib-ng/arch/x86/chunkset_sse41.c new file mode 100644 index 0000000..c6f9821 --- /dev/null +++ b/src/zlib-ng/arch/x86/chunkset_sse41.c @@ -0,0 +1,98 @@ +/* chunkset_sse41.c -- SSE4 inline functions to copy small data chunks. + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include "zbuild.h" + +/* This requires SSE2 support. While it's implicit with SSE4, we can minimize + * code size by sharing the chunkcopy functions, which will certainly compile + * to identical machine code */ +#if defined(X86_SSE41) && defined(X86_SSE2) +#include <immintrin.h> +#include "chunk_permute_table.h" + +typedef __m128i chunk_t; + +#define CHUNK_SIZE 16 + +#define HAVE_CHUNKMEMSET_2 +#define HAVE_CHUNKMEMSET_4 +#define HAVE_CHUNKMEMSET_8 +#define HAVE_CHUNK_MAG +#define HAVE_CHUNKCOPY +#define HAVE_CHUNKUNROLL + +static const lut_rem_pair perm_idx_lut[13] = { + {0, 1}, /* 3 */ + {0, 0}, /* don't care */ + {1 * 32, 1}, /* 5 */ + {2 * 32, 4}, /* 6 */ + {3 * 32, 2}, /* 7 */ + {0 * 32, 0}, /* don't care */ + {4 * 32, 7}, /* 9 */ + {5 * 32, 6}, /* 10 */ + {6 * 32, 5}, /* 11 */ + {7 * 32, 4}, /* 12 */ + {8 * 32, 3}, /* 13 */ + {9 * 32, 2}, /* 14 */ + {10 * 32, 1},/* 15 */ +}; + + +static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) { + int16_t tmp; + zmemcpy_2(&tmp, from); + *chunk = _mm_set1_epi16(tmp); +} + +static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) { + int32_t tmp; + zmemcpy_4(&tmp, from); + *chunk = _mm_set1_epi32(tmp); +} + +static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) { + int64_t tmp; + zmemcpy_8(&tmp, from); + *chunk = _mm_set1_epi64x(tmp); +} + +static inline void loadchunk(uint8_t const *s, chunk_t *chunk) { + *chunk = _mm_loadu_si128((__m128i *)s); +} + +static inline void storechunk(uint8_t *out, chunk_t *chunk) { + _mm_storeu_si128((__m128i *)out, *chunk); +} + +static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) { + lut_rem_pair lut_rem = perm_idx_lut[dist - 3]; + __m128i perm_vec, ret_vec; +#ifdef Z_MEMORY_SANITIZER + /* Important to note: + * This is _not_ to subvert the memory sanitizer but to instead unpoison some + * bytes we willingly and purposefully load unitialized that we swizzle over + * in a vector register, anyway. 
If what we assume is wrong about what is used, + * the memory sanitizer will still usefully flag it */ + __msan_unpoison(buf + dist, 16 - dist); +#endif + ret_vec = _mm_loadu_si128((__m128i*)buf); + *chunk_rem = lut_rem.remval; + + perm_vec = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx)); + ret_vec = _mm_shuffle_epi8(ret_vec, perm_vec); + + return ret_vec; +} + +extern uint8_t* chunkcopy_sse2(uint8_t *out, uint8_t const *from, unsigned len); + +#define CHUNKSIZE chunksize_sse41 +#define CHUNKMEMSET chunkmemset_sse41 +#define CHUNKMEMSET_SAFE chunkmemset_safe_sse41 +#define CHUNKCOPY(a, b, c) chunkcopy_sse2(a, b, c) +#define CHUNKUNROLL(a, b, c) chunkunroll_sse2(a, b, c) + +#include "chunkset_tpl.h" + +#endif diff --git a/src/zlib-ng/arch/x86/compare258_avx.c b/src/zlib-ng/arch/x86/compare256_avx2.c index d9108fd..1318a0e 100644 --- a/src/zlib-ng/arch/x86/compare258_avx.c +++ b/src/zlib-ng/arch/x86/compare256_avx2.c @@ -1,10 +1,9 @@ -/* compare258_avx.c -- AVX2 version of compare258 +/* compare256_avx2.c -- AVX2 version of compare256 * Copyright Mika T. Lindqvist <postmaster@raasu.org> * For conditions of distribution and use, see copyright notice in zlib.h */ #include "../../zbuild.h" -#include "../../zutil.h" #include "fallback_builtins.h" @@ -15,8 +14,7 @@ # include <nmmintrin.h> #endif -/* UNALIGNED_OK, AVX2 intrinsic comparison */ -static inline uint32_t compare256_unaligned_avx2_static(const unsigned char *src0, const unsigned char *src1) { +static inline uint32_t compare256_avx2_static(const uint8_t *src0, const uint8_t *src1) { uint32_t len = 0; do { @@ -47,20 +45,18 @@ static inline uint32_t compare256_unaligned_avx2_static(const unsigned char *src return 256; } -static inline uint32_t compare258_unaligned_avx2_static(const unsigned char *src0, const unsigned char *src1) { - if (*(uint16_t *)src0 != *(uint16_t *)src1) - return (*src0 == *src1); - - return compare256_unaligned_avx2_static(src0+2, src1+2) + 2; +Z_INTERNAL uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1) { + return compare256_avx2_static(src0, src1); } -Z_INTERNAL uint32_t compare258_unaligned_avx2(const unsigned char *src0, const unsigned char *src1) { - return compare258_unaligned_avx2_static(src0, src1); -} +#define LONGEST_MATCH longest_match_avx2 +#define COMPARE256 compare256_avx2_static + +#include "match_tpl.h" -#define LONGEST_MATCH longest_match_unaligned_avx2 -#define COMPARE256 compare256_unaligned_avx2_static -#define COMPARE258 compare258_unaligned_avx2_static +#define LONGEST_MATCH_SLOW +#define LONGEST_MATCH longest_match_slow_avx2 +#define COMPARE256 compare256_avx2_static #include "match_tpl.h" diff --git a/src/zlib-ng/arch/x86/compare256_sse2.c b/src/zlib-ng/arch/x86/compare256_sse2.c new file mode 100644 index 0000000..aad4bd2 --- /dev/null +++ b/src/zlib-ng/arch/x86/compare256_sse2.c @@ -0,0 +1,96 @@ +/* compare256_sse2.c -- SSE2 version of compare256 + * Copyright Adam Stylinski <kungfujesus06@gmail.com> + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#include "../../zbuild.h" + +#include "fallback_builtins.h" + +#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ) + +#include <emmintrin.h> + +static inline uint32_t compare256_sse2_static(const uint8_t *src0, const uint8_t *src1) { + uint32_t len = 0; + int align_offset = ((uintptr_t)src0) & 15; + const uint8_t *end0 = src0 + 256; + const uint8_t *end1 = src1 + 256; + __m128i xmm_src0, xmm_src1, xmm_cmp; + + /* Do the first load unaligned, than all subsequent ones we have at least + * 
one aligned load. Sadly aligning both loads is probably unrealistic */ + xmm_src0 = _mm_loadu_si128((__m128i*)src0); + xmm_src1 = _mm_loadu_si128((__m128i*)src1); + xmm_cmp = _mm_cmpeq_epi8(xmm_src0, xmm_src1); + + unsigned mask = (unsigned)_mm_movemask_epi8(xmm_cmp); + + /* Compiler _may_ turn this branch into a ptest + movemask, + * since a lot of those uops are shared and fused */ + if (mask != 0xFFFF) { + uint32_t match_byte = (uint32_t)__builtin_ctz(~mask); + return len + match_byte; + } + + int align_adv = 16 - align_offset; + len += align_adv; + src0 += align_adv; + src1 += align_adv; + + /* Do a flooring division (should just be a shift right) */ + int num_iter = (256 - len) / 16; + + for (int i = 0; i < num_iter; ++i) { + xmm_src0 = _mm_load_si128((__m128i*)src0); + xmm_src1 = _mm_loadu_si128((__m128i*)src1); + xmm_cmp = _mm_cmpeq_epi8(xmm_src0, xmm_src1); + + mask = (unsigned)_mm_movemask_epi8(xmm_cmp); + + /* Compiler _may_ turn this branch into a ptest + movemask, + * since a lot of those uops are shared and fused */ + if (mask != 0xFFFF) { + uint32_t match_byte = (uint32_t)__builtin_ctz(~mask); + return len + match_byte; + } + + len += 16, src0 += 16, src1 += 16; + } + + if (align_offset) { + src0 = end0 - 16; + src1 = end1 - 16; + len = 256 - 16; + + xmm_src0 = _mm_loadu_si128((__m128i*)src0); + xmm_src1 = _mm_loadu_si128((__m128i*)src1); + xmm_cmp = _mm_cmpeq_epi8(xmm_src0, xmm_src1); + + mask = (unsigned)_mm_movemask_epi8(xmm_cmp); + + if (mask != 0xFFFF) { + uint32_t match_byte = (uint32_t)__builtin_ctz(~mask); + return len + match_byte; + } + } + + return 256; +} + +Z_INTERNAL uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1) { + return compare256_sse2_static(src0, src1); +} + +#define LONGEST_MATCH longest_match_sse2 +#define COMPARE256 compare256_sse2_static + +#include "match_tpl.h" + +#define LONGEST_MATCH_SLOW +#define LONGEST_MATCH longest_match_slow_sse2 +#define COMPARE256 compare256_sse2_static + +#include "match_tpl.h" + +#endif diff --git a/src/zlib-ng/arch/x86/compare258_sse.c b/src/zlib-ng/arch/x86/compare258_sse.c deleted file mode 100644 index 17534c0..0000000 --- a/src/zlib-ng/arch/x86/compare258_sse.c +++ /dev/null @@ -1,74 +0,0 @@ -/* compare258_sse.c -- SSE4.2 version of compare258 - * - * Copyright (C) 2013 Intel Corporation. All rights reserved. - * Authors: - * Wajdi Feghali <wajdi.k.feghali@intel.com> - * Jim Guilford <james.guilford@intel.com> - * Vinodh Gopal <vinodh.gopal@intel.com> - * Erdinc Ozturk <erdinc.ozturk@intel.com> - * Jim Kukunas <james.t.kukunas@linux.intel.com> - * - * Portions are Copyright (C) 2016 12Sided Technology, LLC. 
- * Author: - * Phil Vachon <pvachon@12sidedtech.com> - * - * For conditions of distribution and use, see copyright notice in zlib.h - */ - -#include "../../zbuild.h" -#include "../../zutil.h" - -#ifdef X86_SSE42_CMP_STR - -#include <immintrin.h> -#ifdef _MSC_VER -# include <nmmintrin.h> -#endif - -/* UNALIGNED_OK, SSE4.2 intrinsic comparison */ -static inline uint32_t compare256_unaligned_sse4_static(const unsigned char *src0, const unsigned char *src1) { - uint32_t len = 0; - - do { - #define mode _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY - __m128i xmm_src0, xmm_src1; - uint32_t ret; - - xmm_src0 = _mm_loadu_si128((__m128i *)src0); - xmm_src1 = _mm_loadu_si128((__m128i *)src1); - ret = (uint32_t)_mm_cmpestri(xmm_src0, 16, xmm_src1, 16, mode); - if (_mm_cmpestrc(xmm_src0, 16, xmm_src1, 16, mode)) { - return len + ret; - } - src0 += 16, src1 += 16, len += 16; - - xmm_src0 = _mm_loadu_si128((__m128i *)src0); - xmm_src1 = _mm_loadu_si128((__m128i *)src1); - ret = (uint32_t)_mm_cmpestri(xmm_src0, 16, xmm_src1, 16, mode); - if (_mm_cmpestrc(xmm_src0, 16, xmm_src1, 16, mode)) { - return len + ret; - } - src0 += 16, src1 += 16, len += 16; - } while (len < 256); - - return 256; -} - -static inline uint32_t compare258_unaligned_sse4_static(const unsigned char *src0, const unsigned char *src1) { - if (*(uint16_t *)src0 != *(uint16_t *)src1) - return (*src0 == *src1); - - return compare256_unaligned_sse4_static(src0+2, src1+2) + 2; -} - -Z_INTERNAL uint32_t compare258_unaligned_sse4(const unsigned char *src0, const unsigned char *src1) { - return compare258_unaligned_sse4_static(src0, src1); -} - -#define LONGEST_MATCH longest_match_unaligned_sse4 -#define COMPARE256 compare256_unaligned_sse4_static -#define COMPARE258 compare258_unaligned_sse4_static - -#include "match_tpl.h" - -#endif diff --git a/src/zlib-ng/arch/x86/crc_folding.c b/src/zlib-ng/arch/x86/crc32_fold_pclmulqdq.c index 49cdc99..6bb2c98 100644 --- a/src/zlib-ng/arch/x86/crc_folding.c +++ b/src/zlib-ng/arch/x86/crc32_fold_pclmulqdq.c @@ -3,9 +3,10 @@ * instruction. * * A white paper describing this algorithm can be found at: - * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf + * https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf * * Copyright (C) 2013 Intel Corporation. All rights reserved. 
+ * Copyright (C) 2016 Marian Beermann (support for initial value) * Authors: * Wajdi Feghali <wajdi.k.feghali@intel.com> * Jim Guilford <james.guilford@intel.com> @@ -17,23 +18,26 @@ */ #ifdef X86_PCLMULQDQ_CRC - #include "../../zbuild.h" -#include <inttypes.h> + #include <immintrin.h> #include <wmmintrin.h> +#include <smmintrin.h> // _mm_extract_epi32 -#include "crc_folding.h" +#include "x86_features.h" +#include "cpu_features.h" -Z_INTERNAL void crc_fold_init(deflate_state *const s) { - /* CRC_SAVE */ - _mm_storeu_si128((__m128i *)s->crc0 + 0, _mm_cvtsi32_si128(0x9db42487)); - _mm_storeu_si128((__m128i *)s->crc0 + 1, _mm_setzero_si128()); - _mm_storeu_si128((__m128i *)s->crc0 + 2, _mm_setzero_si128()); - _mm_storeu_si128((__m128i *)s->crc0 + 3, _mm_setzero_si128()); +#include "../../crc32_fold.h" +#include "../../crc32_braid_p.h" +#include <assert.h> - s->strm->adler = 0; -} +#ifdef X86_VPCLMULQDQ_CRC +extern size_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1, + __m128i *xmm_crc2, __m128i *xmm_crc3, uint8_t *dst, const uint8_t *src, size_t len); +extern size_t fold_16_vpclmulqdq_nocp(__m128i *xmm_crc0, __m128i *xmm_crc1, + __m128i *xmm_crc2, __m128i *xmm_crc3, const uint8_t *src, size_t len, __m128i init_crc, + int32_t first); +#endif static void fold_1(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) { const __m128i xmm_fold4 = _mm_set_epi32( 0x00000001, 0x54442bd4, @@ -227,24 +231,45 @@ static void partial_fold(const size_t len, __m128i *xmm_crc0, __m128i *xmm_crc1, *xmm_crc3 = _mm_castps_si128(ps_res); } -Z_INTERNAL void crc_fold_copy(deflate_state *const s, unsigned char *dst, const unsigned char *src, long len) { +static inline void crc32_fold_load(__m128i *fold, __m128i *fold0, __m128i *fold1, __m128i *fold2, __m128i *fold3) { + *fold0 = _mm_load_si128(fold + 0); + *fold1 = _mm_load_si128(fold + 1); + *fold2 = _mm_load_si128(fold + 2); + *fold3 = _mm_load_si128(fold + 3); +} + +static inline void crc32_fold_save(__m128i *fold, __m128i fold0, __m128i fold1, __m128i fold2, __m128i fold3) { + _mm_storeu_si128(fold + 0, fold0); + _mm_storeu_si128(fold + 1, fold1); + _mm_storeu_si128(fold + 2, fold2); + _mm_storeu_si128(fold + 3, fold3); +} + +static inline void crc32_fold_save_partial(__m128i *fold, __m128i foldp) { + _mm_store_si128(fold + 4, foldp); +} + +Z_INTERNAL uint32_t crc32_fold_reset_pclmulqdq(crc32_fold *crc) { + __m128i xmm_crc0 = _mm_cvtsi32_si128(0x9db42487); + __m128i xmm_zero = _mm_setzero_si128(); + crc32_fold_save((__m128i *)crc->fold, xmm_crc0, xmm_zero, xmm_zero, xmm_zero); + return 0; +} + +Z_INTERNAL void crc32_fold_copy_pclmulqdq(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) { unsigned long algn_diff; __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3; + __m128i xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3, xmm_crc_part; char ALIGNED_(16) partial_buf[16] = { 0 }; - /* CRC_LOAD */ - __m128i xmm_crc0 = _mm_loadu_si128((__m128i *)s->crc0 + 0); - __m128i xmm_crc1 = _mm_loadu_si128((__m128i *)s->crc0 + 1); - __m128i xmm_crc2 = _mm_loadu_si128((__m128i *)s->crc0 + 2); - __m128i xmm_crc3 = _mm_loadu_si128((__m128i *)s->crc0 + 3); - __m128i xmm_crc_part; + crc32_fold_load((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); if (len < 16) { if (len == 0) return; memcpy(partial_buf, src, len); - xmm_crc_part = _mm_loadu_si128((const __m128i *)partial_buf); + xmm_crc_part = _mm_load_si128((const __m128i *)partial_buf); memcpy(dst, partial_buf, len); goto partial; } @@ -263,20 +288,22 @@ Z_INTERNAL void 
crc_fold_copy(deflate_state *const s, unsigned char *dst, const xmm_crc_part = _mm_setzero_si128(); } - while ((len -= 64) >= 0) { - /* CRC_LOAD */ - xmm_t0 = _mm_load_si128((__m128i *)src); - xmm_t1 = _mm_load_si128((__m128i *)src + 1); - xmm_t2 = _mm_load_si128((__m128i *)src + 2); - xmm_t3 = _mm_load_si128((__m128i *)src + 3); +#ifdef X86_VPCLMULQDQ_CRC + if (x86_cpu_has_vpclmulqdq && x86_cpu_has_avx512 && (len >= 256)) { + size_t n = fold_16_vpclmulqdq(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, dst, src, len); + + len -= n; + src += n; + dst += n; + } +#endif + + while (len >= 64) { + crc32_fold_load((__m128i *)src, &xmm_t0, &xmm_t1, &xmm_t2, &xmm_t3); fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); - /* CRC_SAVE */ - _mm_storeu_si128((__m128i *)dst, xmm_t0); - _mm_storeu_si128((__m128i *)dst + 1, xmm_t1); - _mm_storeu_si128((__m128i *)dst + 2, xmm_t2); - _mm_storeu_si128((__m128i *)dst + 3, xmm_t3); + crc32_fold_save((__m128i *)dst, xmm_t0, xmm_t1, xmm_t2, xmm_t3); xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0); xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1); @@ -285,14 +312,13 @@ Z_INTERNAL void crc_fold_copy(deflate_state *const s, unsigned char *dst, const src += 64; dst += 64; + len -= 64; } /* * len = num bytes left - 64 */ - if (len + 16 >= 0) { - len += 16; - + if (len >= 48) { xmm_t0 = _mm_load_si128((__m128i *)src); xmm_t1 = _mm_load_si128((__m128i *)src + 1); xmm_t2 = _mm_load_si128((__m128i *)src + 2); @@ -306,15 +332,13 @@ Z_INTERNAL void crc_fold_copy(deflate_state *const s, unsigned char *dst, const xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0); xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1); xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2); - + len -= 48; if (len == 0) goto done; dst += 48; memcpy(&xmm_crc_part, (__m128i *)src + 3, len); - } else if (len + 32 >= 0) { - len += 32; - + } else if (len >= 32) { xmm_t0 = _mm_load_si128((__m128i *)src); xmm_t1 = _mm_load_si128((__m128i *)src + 1); @@ -326,14 +350,13 @@ Z_INTERNAL void crc_fold_copy(deflate_state *const s, unsigned char *dst, const xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0); xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1); + len -= 32; if (len == 0) goto done; dst += 32; memcpy(&xmm_crc_part, (__m128i *)src + 2, len); - } else if (len + 48 >= 0) { - len += 48; - + } else if (len >= 16) { xmm_t0 = _mm_load_si128((__m128i *)src); fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); @@ -342,13 +365,13 @@ Z_INTERNAL void crc_fold_copy(deflate_state *const s, unsigned char *dst, const xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0); + len -= 16; if (len == 0) goto done; dst += 16; memcpy(&xmm_crc_part, (__m128i *)src + 1, len); } else { - len += 64; if (len == 0) goto done; memcpy(&xmm_crc_part, src, len); @@ -360,12 +383,132 @@ Z_INTERNAL void crc_fold_copy(deflate_state *const s, unsigned char *dst, const partial: partial_fold((size_t)len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part); done: - /* CRC_SAVE */ - _mm_storeu_si128((__m128i *)s->crc0 + 0, xmm_crc0); - _mm_storeu_si128((__m128i *)s->crc0 + 1, xmm_crc1); - _mm_storeu_si128((__m128i *)s->crc0 + 2, xmm_crc2); - _mm_storeu_si128((__m128i *)s->crc0 + 3, xmm_crc3); - _mm_storeu_si128((__m128i *)s->crc0 + 4, xmm_crc_part); + crc32_fold_save((__m128i *)crc->fold, xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3); + crc32_fold_save_partial((__m128i *)crc->fold, xmm_crc_part); +} + +#define ONCE(op) if (first) { \ + first = 0; \ + (op); \ +} +#define XOR_INITIAL(where) ONCE(where = _mm_xor_si128(where, xmm_initial)) + +Z_INTERNAL void crc32_fold_pclmulqdq(crc32_fold *crc, const uint8_t 
*src, size_t len, uint32_t init_crc) { + unsigned long algn_diff; + __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3; + __m128i xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3, xmm_crc_part; + __m128i xmm_initial = _mm_cvtsi32_si128(init_crc); + xmm_crc_part = _mm_setzero_si128(); + int32_t first = init_crc != 0; + + /* Technically the CRC functions don't even call this for input < 64, but a bare minimum of 31 + * bytes of input is needed for the aligning load that occurs. If there's an initial CRC, to + * carry it forward through the folded CRC there must be 16 - src % 16 + 16 bytes available, which + * by definition can be up to 15 bytes + one full vector load. */ + assert(len >= 31 || first == 0); + crc32_fold_load((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); + + if (len < 16) { + goto partial_nocpy; + } + + algn_diff = ((uintptr_t)16 - ((uintptr_t)src & 0xF)) & 0xF; + if (algn_diff) { + if (algn_diff >= 4 || init_crc == 0) { + xmm_crc_part = _mm_loadu_si128((__m128i *)src); + + src += algn_diff; + len -= algn_diff; + + XOR_INITIAL(xmm_crc_part); + partial_fold(algn_diff, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part); + } else { + xmm_t0 = _mm_loadu_si128((__m128i*)src); + xmm_crc_part = _mm_loadu_si128((__m128i*)src + 1); + XOR_INITIAL(xmm_t0); + fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); + xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0); + partial_fold(algn_diff, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part); + + src += (algn_diff + 16); + len -= (algn_diff + 16); + } + + xmm_crc_part = _mm_setzero_si128(); + } + +#ifdef X86_VPCLMULQDQ_CRC + if (x86_cpu_has_vpclmulqdq && x86_cpu_has_avx512 && (len >= 256)) { + size_t n = fold_16_vpclmulqdq_nocp(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, src, len, + xmm_initial, first); + first = 0; + + len -= n; + src += n; + } +#endif + + while (len >= 64) { + crc32_fold_load((__m128i *)src, &xmm_t0, &xmm_t1, &xmm_t2, &xmm_t3); + XOR_INITIAL(xmm_t0); + fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); + + xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0); + xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1); + xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t2); + xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t3); + + src += 64; + len -= 64; + } + + /* + * len = num bytes left - 64 + */ + if (len >= 48) { + xmm_t0 = _mm_load_si128((__m128i *)src); + xmm_t1 = _mm_load_si128((__m128i *)src + 1); + xmm_t2 = _mm_load_si128((__m128i *)src + 2); + XOR_INITIAL(xmm_t0); + + fold_3(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); + + xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0); + xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1); + xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2); + len -= 48; + src += 48; + } else if (len >= 32) { + xmm_t0 = _mm_load_si128((__m128i *)src); + xmm_t1 = _mm_load_si128((__m128i *)src + 1); + XOR_INITIAL(xmm_t0); + + fold_2(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); + + xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0); + xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1); + + len -= 32; + src += 32; + } else if (len >= 16) { + xmm_t0 = _mm_load_si128((__m128i *)src); + XOR_INITIAL(xmm_t0); + + fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); + + xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0); + + len -= 16; + src += 16; + } + +partial_nocpy: + if (len) { + memcpy(&xmm_crc_part, src, len); + partial_fold((size_t)len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part); + } + + crc32_fold_save((__m128i *)crc->fold, xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3); } static const unsigned ALIGNED_(16) crc_k[] = { @@ -385,18 +528,13 @@ static const unsigned 
ALIGNED_(16) crc_mask2[4] = { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }; -uint32_t Z_INTERNAL crc_fold_512to32(deflate_state *const s) { +Z_INTERNAL uint32_t crc32_fold_final_pclmulqdq(crc32_fold *crc) { const __m128i xmm_mask = _mm_load_si128((__m128i *)crc_mask); const __m128i xmm_mask2 = _mm_load_si128((__m128i *)crc_mask2); - - uint32_t crc; + __m128i xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3; __m128i x_tmp0, x_tmp1, x_tmp2, crc_fold; - /* CRC_LOAD */ - __m128i xmm_crc0 = _mm_loadu_si128((__m128i *)s->crc0 + 0); - __m128i xmm_crc1 = _mm_loadu_si128((__m128i *)s->crc0 + 1); - __m128i xmm_crc2 = _mm_loadu_si128((__m128i *)s->crc0 + 2); - __m128i xmm_crc3 = _mm_loadu_si128((__m128i *)s->crc0 + 3); + crc32_fold_load((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3); /* * k1 @@ -450,8 +588,21 @@ uint32_t Z_INTERNAL crc_fold_512to32(deflate_state *const s) { xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2); xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc1); - crc = (uint32_t)_mm_extract_epi32(xmm_crc3, 2); - return ~crc; + crc->value = ~((uint32_t)_mm_extract_epi32(xmm_crc3, 2)); + + return crc->value; +} + +uint32_t crc32_pclmulqdq(uint32_t crc32, const unsigned char* buf, uint64_t len) { + /* For lens < 64, crc32_braid method is faster. The CRC32 instruction for + * these short lengths might also prove to be effective */ + if (len < 64) + return crc32_braid(crc32, buf, len); + + crc32_fold ALIGNED_(16) crc_state; + crc32_fold_reset_pclmulqdq(&crc_state); + crc32_fold_pclmulqdq(&crc_state, buf, len, crc32); + return crc32_fold_final_pclmulqdq(&crc_state); } #endif diff --git a/src/zlib-ng/arch/x86/crc32_fold_vpclmulqdq.c b/src/zlib-ng/arch/x86/crc32_fold_vpclmulqdq.c new file mode 100644 index 0000000..dfcdc8a --- /dev/null +++ b/src/zlib-ng/arch/x86/crc32_fold_vpclmulqdq.c @@ -0,0 +1,206 @@ +/* crc32_fold_vpclmulqdq.c -- VPCMULQDQ-based CRC32 folding implementation. 
+ * Copyright Wangyang Guo (wangyang.guo@intel.com) + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +#ifdef X86_VPCLMULQDQ_CRC +#include "../../zbuild.h" +#include "../../fallback_builtins.h" + +#include <immintrin.h> + +#define ONCE(op) if (first) { \ + first = 0; \ + (op); \ +} +#define XOR_INITIAL(where) ONCE(where = _mm512_xor_si512(where, zmm_initial)) + +size_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1, + __m128i *xmm_crc2, __m128i *xmm_crc3, uint8_t *dst, const uint8_t *src, size_t len) { + size_t len_tmp = len; + __m512i zmm_t0, zmm_t1, zmm_t2, zmm_t3; + __m512i zmm_crc0, zmm_crc1, zmm_crc2, zmm_crc3; + __m512i z0, z1, z2, z3; + const __m512i zmm_fold4 = _mm512_set4_epi32( + 0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596); + const __m512i zmm_fold16 = _mm512_set4_epi32( + 0x00000001, 0x1542778a, 0x00000001, 0x322d1430); + + // zmm register init + zmm_crc0 = _mm512_setzero_si512(); + zmm_t0 = _mm512_loadu_si512((__m512i *)src); + zmm_crc1 = _mm512_loadu_si512((__m512i *)src + 1); + zmm_crc2 = _mm512_loadu_si512((__m512i *)src + 2); + zmm_crc3 = _mm512_loadu_si512((__m512i *)src + 3); + + /* already have intermediate CRC in xmm registers + * fold4 with 4 xmm_crc to get zmm_crc0 + */ + zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc0, 0); + zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc1, 1); + zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc2, 2); + zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc3, 3); + z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01); + zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10); + zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0); + zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_t0); + + _mm512_storeu_si512((__m512i *)dst, zmm_t0); + _mm512_storeu_si512((__m512i *)dst + 1, zmm_crc1); + _mm512_storeu_si512((__m512i *)dst + 2, zmm_crc2); + _mm512_storeu_si512((__m512i *)dst + 3, zmm_crc3); + len -= 256; + src += 256; + dst += 256; + + // fold-16 loops + while (len >= 256) { + zmm_t0 = _mm512_loadu_si512((__m512i *)src); + zmm_t1 = _mm512_loadu_si512((__m512i *)src + 1); + zmm_t2 = _mm512_loadu_si512((__m512i *)src + 2); + zmm_t3 = _mm512_loadu_si512((__m512i *)src + 3); + + z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x01); + z1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x01); + z2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x01); + z3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x01); + + zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x10); + zmm_crc1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x10); + zmm_crc2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x10); + zmm_crc3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x10); + + zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0); + zmm_crc1 = _mm512_xor_si512(z1, zmm_crc1); + zmm_crc2 = _mm512_xor_si512(z2, zmm_crc2); + zmm_crc3 = _mm512_xor_si512(z3, zmm_crc3); + + zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_t0); + zmm_crc1 = _mm512_xor_si512(zmm_crc1, zmm_t1); + zmm_crc2 = _mm512_xor_si512(zmm_crc2, zmm_t2); + zmm_crc3 = _mm512_xor_si512(zmm_crc3, zmm_t3); + + _mm512_storeu_si512((__m512i *)dst, zmm_t0); + _mm512_storeu_si512((__m512i *)dst + 1, zmm_t1); + _mm512_storeu_si512((__m512i *)dst + 2, zmm_t2); + _mm512_storeu_si512((__m512i *)dst + 3, zmm_t3); + len -= 256; + src += 256; + dst += 256; + } + // zmm_crc[0,1,2,3] -> zmm_crc0 + z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01); + zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10); + zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0); + 
zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc1); + + z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01); + zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10); + zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0); + zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc2); + + z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01); + zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10); + zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0); + zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc3); + + // zmm_crc0 -> xmm_crc[0, 1, 2, 3] + *xmm_crc0 = _mm512_extracti32x4_epi32(zmm_crc0, 0); + *xmm_crc1 = _mm512_extracti32x4_epi32(zmm_crc0, 1); + *xmm_crc2 = _mm512_extracti32x4_epi32(zmm_crc0, 2); + *xmm_crc3 = _mm512_extracti32x4_epi32(zmm_crc0, 3); + + return (len_tmp - len); // return n bytes processed +} + +size_t fold_16_vpclmulqdq_nocp(__m128i *xmm_crc0, __m128i *xmm_crc1, + __m128i *xmm_crc2, __m128i *xmm_crc3, const uint8_t *src, size_t len, + __m128i init_crc, int32_t first) { + size_t len_tmp = len; + __m512i zmm_t0, zmm_t1, zmm_t2, zmm_t3; + __m512i zmm_crc0, zmm_crc1, zmm_crc2, zmm_crc3; + __m512i z0, z1, z2, z3; + __m512i zmm_initial = _mm512_zextsi128_si512(init_crc); + const __m512i zmm_fold4 = _mm512_set4_epi32( + 0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596); + const __m512i zmm_fold16 = _mm512_set4_epi32( + 0x00000001, 0x1542778a, 0x00000001, 0x322d1430); + + // zmm register init + zmm_crc0 = _mm512_setzero_si512(); + zmm_t0 = _mm512_loadu_si512((__m512i *)src); + XOR_INITIAL(zmm_t0); + zmm_crc1 = _mm512_loadu_si512((__m512i *)src + 1); + zmm_crc2 = _mm512_loadu_si512((__m512i *)src + 2); + zmm_crc3 = _mm512_loadu_si512((__m512i *)src + 3); + + /* already have intermediate CRC in xmm registers + * fold4 with 4 xmm_crc to get zmm_crc0 + */ + zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc0, 0); + zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc1, 1); + zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc2, 2); + zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc3, 3); + z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01); + zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10); + zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0); + zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_t0); + + len -= 256; + src += 256; + + // fold-16 loops + while (len >= 256) { + zmm_t0 = _mm512_loadu_si512((__m512i *)src); + zmm_t1 = _mm512_loadu_si512((__m512i *)src + 1); + zmm_t2 = _mm512_loadu_si512((__m512i *)src + 2); + zmm_t3 = _mm512_loadu_si512((__m512i *)src + 3); + + z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x01); + z1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x01); + z2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x01); + z3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x01); + + zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x10); + zmm_crc1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x10); + zmm_crc2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x10); + zmm_crc3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x10); + + zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0); + zmm_crc1 = _mm512_xor_si512(z1, zmm_crc1); + zmm_crc2 = _mm512_xor_si512(z2, zmm_crc2); + zmm_crc3 = _mm512_xor_si512(z3, zmm_crc3); + + zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_t0); + zmm_crc1 = _mm512_xor_si512(zmm_crc1, zmm_t1); + zmm_crc2 = _mm512_xor_si512(zmm_crc2, zmm_t2); + zmm_crc3 = _mm512_xor_si512(zmm_crc3, zmm_t3); + + len -= 256; + src += 256; + } + // zmm_crc[0,1,2,3] -> zmm_crc0 + z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01); + zmm_crc0 = 
_mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10); + zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0); + zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc1); + + z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01); + zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10); + zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0); + zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc2); + + z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01); + zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10); + zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0); + zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc3); + + // zmm_crc0 -> xmm_crc[0, 1, 2, 3] + *xmm_crc0 = _mm512_extracti32x4_epi32(zmm_crc0, 0); + *xmm_crc1 = _mm512_extracti32x4_epi32(zmm_crc0, 1); + *xmm_crc2 = _mm512_extracti32x4_epi32(zmm_crc0, 2); + *xmm_crc3 = _mm512_extracti32x4_epi32(zmm_crc0, 3); + + return (len_tmp - len); // return n bytes processed +} +#endif diff --git a/src/zlib-ng/arch/x86/crc_folding.h b/src/zlib-ng/arch/x86/crc_folding.h deleted file mode 100644 index 0d3c24b..0000000 --- a/src/zlib-ng/arch/x86/crc_folding.h +++ /dev/null @@ -1,19 +0,0 @@ -/* crc_folding.h - * - * Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ - * instruction. - * - * Copyright (C) 2013 Intel Corporation Jim Kukunas - * For conditions of distribution and use, see copyright notice in zlib.h - */ - -#ifndef CRC_FOLDING_H_ -#define CRC_FOLDING_H_ - -#include "../../deflate.h" - -Z_INTERNAL void crc_fold_init(deflate_state *const); -Z_INTERNAL uint32_t crc_fold_512to32(deflate_state *const); -Z_INTERNAL void crc_fold_copy(deflate_state *const, unsigned char *, const unsigned char *, long); - -#endif diff --git a/src/zlib-ng/arch/x86/insert_string_sse.c b/src/zlib-ng/arch/x86/insert_string_sse42.c index d0c316b..6fe4c81 100644 --- a/src/zlib-ng/arch/x86/insert_string_sse.c +++ b/src/zlib-ng/arch/x86/insert_string_sse42.c @@ -1,4 +1,4 @@ -/* insert_string_sse -- insert_string variant using SSE4.2's CRC instructions +/* insert_string_sse42.c -- insert_string integer hash variant using SSE4.2's CRC instructions * * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler * For conditions of distribution and use, see copyright notice in zlib.h @@ -14,22 +14,22 @@ #ifdef X86_SSE42_CRC_INTRIN # ifdef _MSC_VER -# define UPDATE_HASH(s, h, val)\ +# define HASH_CALC(s, h, val)\ h = _mm_crc32_u32(h, val) # else -# define UPDATE_HASH(s, h, val)\ +# define HASH_CALC(s, h, val)\ h = __builtin_ia32_crc32si(h, val) # endif #else # ifdef _MSC_VER -# define UPDATE_HASH(s, h, val) {\ +# define HASH_CALC(s, h, val) {\ __asm mov edx, h\ __asm mov eax, val\ __asm crc32 eax, edx\ - __asm mov val, eax\ + __asm mov h, eax\ } # else -# define UPDATE_HASH(s, h, val) \ +# define HASH_CALC(s, h, val) \ __asm__ __volatile__ (\ "crc32 %1,%0\n\t"\ : "+r" (h)\ @@ -38,6 +38,10 @@ # endif #endif +#define HASH_CALC_VAR h +#define HASH_CALC_VAR_INIT uint32_t h = 0 + +#define UPDATE_HASH update_hash_sse4 #define INSERT_STRING insert_string_sse4 #define QUICK_INSERT_STRING quick_insert_string_sse4 diff --git a/src/zlib-ng/arch/x86/slide_avx.c b/src/zlib-ng/arch/x86/slide_hash_avx2.c index be9a9b7..94fe10c 100644 --- a/src/zlib-ng/arch/x86/slide_avx.c +++ b/src/zlib-ng/arch/x86/slide_hash_avx2.c @@ -14,34 +14,26 @@ #include <immintrin.h> -Z_INTERNAL void slide_hash_avx2(deflate_state *s) { - Pos *p; - unsigned n; - uint16_t wsize = (uint16_t)s->w_size; - const __m256i ymm_wsize = _mm256_set1_epi16((short)wsize); +static inline void slide_hash_chain(Pos *table, uint32_t 
entries, const __m256i wsize) { + table += entries; + table -= 16; - n = HASH_SIZE; - p = &s->head[n] - 16; do { __m256i value, result; - value = _mm256_loadu_si256((__m256i *)p); - result= _mm256_subs_epu16(value, ymm_wsize); - _mm256_storeu_si256((__m256i *)p, result); - p -= 16; - n -= 16; - } while (n > 0); + value = _mm256_loadu_si256((__m256i *)table); + result = _mm256_subs_epu16(value, wsize); + _mm256_storeu_si256((__m256i *)table, result); - n = wsize; - p = &s->prev[n] - 16; - do { - __m256i value, result; + table -= 16; + entries -= 16; + } while (entries > 0); +} - value = _mm256_loadu_si256((__m256i *)p); - result= _mm256_subs_epu16(value, ymm_wsize); - _mm256_storeu_si256((__m256i *)p, result); +Z_INTERNAL void slide_hash_avx2(deflate_state *s) { + uint16_t wsize = (uint16_t)s->w_size; + const __m256i ymm_wsize = _mm256_set1_epi16((short)wsize); - p -= 16; - n -= 16; - } while (n > 0); + slide_hash_chain(s->head, HASH_SIZE, ymm_wsize); + slide_hash_chain(s->prev, wsize, ymm_wsize); } diff --git a/src/zlib-ng/arch/x86/slide_hash_sse2.c b/src/zlib-ng/arch/x86/slide_hash_sse2.c new file mode 100644 index 0000000..5daac4a --- /dev/null +++ b/src/zlib-ng/arch/x86/slide_hash_sse2.c @@ -0,0 +1,62 @@ +/* + * SSE optimized hash slide + * + * Copyright (C) 2017 Intel Corporation + * Authors: + * Arjan van de Ven <arjan@linux.intel.com> + * Jim Kukunas <james.t.kukunas@linux.intel.com> + * + * For conditions of distribution and use, see copyright notice in zlib.h + */ +#include "../../zbuild.h" +#include "../../deflate.h" + +#include <immintrin.h> +#include <assert.h> + +static inline void slide_hash_chain(Pos *table0, Pos *table1, uint32_t entries0, + uint32_t entries1, const __m128i wsize) { + uint32_t entries; + Pos *table; + __m128i value0, value1, result0, result1; + + int on_chain = 0; + +next_chain: + table = (on_chain) ? table1 : table0; + entries = (on_chain) ? entries1 : entries0; + + table += entries; + table -= 16; + + /* ZALLOC allocates this pointer unless the user chose a custom allocator. 
+ * Our alloc function is aligned to 64 byte boundaries */ + do { + value0 = _mm_load_si128((__m128i *)table); + value1 = _mm_load_si128((__m128i *)(table + 8)); + result0 = _mm_subs_epu16(value0, wsize); + result1 = _mm_subs_epu16(value1, wsize); + _mm_store_si128((__m128i *)table, result0); + _mm_store_si128((__m128i *)(table + 8), result1); + + table -= 16; + entries -= 16; + } while (entries > 0); + + ++on_chain; + if (on_chain > 1) { + return; + } else { + goto next_chain; + } +} + +Z_INTERNAL void slide_hash_sse2(deflate_state *s) { + uint16_t wsize = (uint16_t)s->w_size; + const __m128i xmm_wsize = _mm_set1_epi16((short)wsize); + + assert(((uintptr_t)s->head & 15) == 0); + assert(((uintptr_t)s->prev & 15) == 0); + + slide_hash_chain(s->head, s->prev, HASH_SIZE, wsize, xmm_wsize); +} diff --git a/src/zlib-ng/arch/x86/slide_sse.c b/src/zlib-ng/arch/x86/slide_sse.c deleted file mode 100644 index abf4474..0000000 --- a/src/zlib-ng/arch/x86/slide_sse.c +++ /dev/null @@ -1,46 +0,0 @@ -/* - * SSE optimized hash slide - * - * Copyright (C) 2017 Intel Corporation - * Authors: - * Arjan van de Ven <arjan@linux.intel.com> - * Jim Kukunas <james.t.kukunas@linux.intel.com> - * - * For conditions of distribution and use, see copyright notice in zlib.h - */ -#include "../../zbuild.h" -#include "../../deflate.h" - -#include <immintrin.h> - -Z_INTERNAL void slide_hash_sse2(deflate_state *s) { - Pos *p; - unsigned n; - uint16_t wsize = (uint16_t)s->w_size; - const __m128i xmm_wsize = _mm_set1_epi16((short)wsize); - - n = HASH_SIZE; - p = &s->head[n] - 8; - do { - __m128i value, result; - - value = _mm_loadu_si128((__m128i *)p); - result= _mm_subs_epu16(value, xmm_wsize); - _mm_storeu_si128((__m128i *)p, result); - p -= 8; - n -= 8; - } while (n > 0); - - n = wsize; - p = &s->prev[n] - 8; - do { - __m128i value, result; - - value = _mm_loadu_si128((__m128i *)p); - result= _mm_subs_epu16(value, xmm_wsize); - _mm_storeu_si128((__m128i *)p, result); - - p -= 8; - n -= 8; - } while (n > 0); -} diff --git a/src/zlib-ng/arch/x86/x86.c b/src/zlib-ng/arch/x86/x86_features.c index e782cb8..72ef885 100644 --- a/src/zlib-ng/arch/x86/x86.c +++ b/src/zlib-ng/arch/x86/x86_features.c @@ -1,5 +1,4 @@ -/* - * x86 feature check +/* x86_features.c - x86 feature check * * Copyright (C) 2013 Intel Corporation. All rights reserved. 
* Author: @@ -8,7 +7,7 @@ * For conditions of distribution and use, see copyright notice in zlib.h */ -#include "../../zutil.h" +#include "../../zbuild.h" #ifdef _MSC_VER # include <intrin.h> @@ -17,11 +16,17 @@ # include <cpuid.h> #endif +#include <string.h> + Z_INTERNAL int x86_cpu_has_avx2; +Z_INTERNAL int x86_cpu_has_avx512; +Z_INTERNAL int x86_cpu_has_avx512vnni; Z_INTERNAL int x86_cpu_has_sse2; Z_INTERNAL int x86_cpu_has_ssse3; +Z_INTERNAL int x86_cpu_has_sse41; Z_INTERNAL int x86_cpu_has_sse42; Z_INTERNAL int x86_cpu_has_pclmulqdq; +Z_INTERNAL int x86_cpu_has_vpclmulqdq; Z_INTERNAL int x86_cpu_has_tzcnt; static void cpuid(int info, unsigned* eax, unsigned* ebx, unsigned* ecx, unsigned* edx) { @@ -57,11 +62,11 @@ void Z_INTERNAL x86_check_features(void) { unsigned maxbasic; cpuid(0, &maxbasic, &ebx, &ecx, &edx); - cpuid(1 /*CPU_PROCINFO_AND_FEATUREBITS*/, &eax, &ebx, &ecx, &edx); x86_cpu_has_sse2 = edx & 0x4000000; x86_cpu_has_ssse3 = ecx & 0x200; + x86_cpu_has_sse41 = ecx & 0x80000; x86_cpu_has_sse42 = ecx & 0x100000; x86_cpu_has_pclmulqdq = ecx & 0x2; @@ -73,8 +78,12 @@ void Z_INTERNAL x86_check_features(void) { x86_cpu_has_tzcnt = ebx & 0x8; // check AVX2 bit x86_cpu_has_avx2 = ebx & 0x20; + x86_cpu_has_avx512 = ebx & 0x00010000; + x86_cpu_has_avx512vnni = ecx & 0x800; + x86_cpu_has_vpclmulqdq = ecx & 0x400; } else { x86_cpu_has_tzcnt = 0; x86_cpu_has_avx2 = 0; + x86_cpu_has_vpclmulqdq = 0; } } diff --git a/src/zlib-ng/arch/x86/x86.h b/src/zlib-ng/arch/x86/x86_features.h index 8471e15..97630ab 100644 --- a/src/zlib-ng/arch/x86/x86.h +++ b/src/zlib-ng/arch/x86/x86_features.h @@ -1,16 +1,20 @@ -/* cpu.h -- check for CPU features +/* x86_features.h -- check for CPU features * Copyright (C) 2013 Intel Corporation Jim Kukunas * For conditions of distribution and use, see copyright notice in zlib.h */ -#ifndef CPU_H_ -#define CPU_H_ +#ifndef X86_FEATURES_H_ +#define X86_FEATURES_H_ extern int x86_cpu_has_avx2; +extern int x86_cpu_has_avx512; +extern int x86_cpu_has_avx512vnni; extern int x86_cpu_has_sse2; extern int x86_cpu_has_ssse3; +extern int x86_cpu_has_sse41; extern int x86_cpu_has_sse42; extern int x86_cpu_has_pclmulqdq; +extern int x86_cpu_has_vpclmulqdq; extern int x86_cpu_has_tzcnt; void Z_INTERNAL x86_check_features(void); |
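
Note on the feature detection in x86_features.c above: the legacy bits (SSE2, SSE4.1/4.2, PCLMULQDQ) come from CPUID leaf 1, while the extended bits (AVX2, AVX-512F, VPCLMULQDQ, AVX-512 VNNI) come from leaf 7, sub-leaf 0. A minimal standalone sketch of the same detection pattern, using GCC/Clang's <cpuid.h> and the bit positions that appear in the diff (illustration only, not zlib-ng's exact code; a production AVX path would also verify OS XSAVE support):

#include <cpuid.h>   /* GCC/Clang built-in CPUID helpers */
#include <stdio.h>

int main(void) {
    unsigned eax, ebx, ecx, edx;

    /* Leaf 1: legacy feature bits */
    __get_cpuid(1, &eax, &ebx, &ecx, &edx);
    int has_sse2      = !!(edx & 0x4000000);   /* bit 26 */
    int has_sse41     = !!(ecx & 0x80000);     /* bit 19 */
    int has_sse42     = !!(ecx & 0x100000);    /* bit 20 */
    int has_pclmulqdq = !!(ecx & 0x2);         /* bit 1  */

    /* Leaf 7, sub-leaf 0: extended feature bits */
    unsigned eax7, ebx7, ecx7, edx7;
    __get_cpuid_count(7, 0, &eax7, &ebx7, &ecx7, &edx7);
    int has_avx2        = !!(ebx7 & 0x20);        /* bit 5  */
    int has_avx512f     = !!(ebx7 & 0x00010000);  /* bit 16 */
    int has_avx512vnni  = !!(ecx7 & 0x800);       /* bit 11 */
    int has_vpclmulqdq  = !!(ecx7 & 0x400);       /* bit 10 */

    printf("sse2=%d sse4.1=%d sse4.2=%d pclmulqdq=%d avx2=%d avx512f=%d vnni=%d vpclmulqdq=%d\n",
           has_sse2, has_sse41, has_sse42, has_pclmulqdq,
           has_avx2, has_avx512f, has_avx512vnni, has_vpclmulqdq);
    return 0;
}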
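
The new compare256_sse2.c and compare256_avx2.c find the length of a match the same way: compare 16 (or 32) bytes at a time for equality, collapse the comparison to a bitmask with a movemask, and locate the first mismatching byte with a count-trailing-zeros on the inverted mask. A stripped-down sketch of that idea with SSE2 (unaligned loads only, no alignment bookkeeping, so it is not the function from the diff):

#include <emmintrin.h>   /* SSE2 intrinsics */
#include <stdint.h>

/* Sketch: length of the common prefix of two 256-byte buffers. */
static uint32_t compare256_sketch(const uint8_t *src0, const uint8_t *src1) {
    uint32_t len = 0;

    do {
        __m128i a  = _mm_loadu_si128((const __m128i *)(src0 + len));
        __m128i b  = _mm_loadu_si128((const __m128i *)(src1 + len));
        /* 0xFF in every byte lane that matches, 0x00 where the bytes differ */
        __m128i eq = _mm_cmpeq_epi8(a, b);
        unsigned mask = (unsigned)_mm_movemask_epi8(eq);

        if (mask != 0xFFFF)             /* at least one lane differed */
            return len + (uint32_t)__builtin_ctz(~mask);

        len += 16;
    } while (len < 256);

    return 256;
}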
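
Likewise, the rewritten slide_hash_sse2.c and slide_hash_avx2.c reduce the hash slide to one instruction per vector: an unsigned saturating subtract of w_size, so entries that would underflow clamp to zero instead of wrapping around. A hedged sketch of that single step for an arbitrary 16-bit table (zlib-ng's version additionally relies on 16-byte-aligned allocations and walks head[] and prev[] from the top down):

#include <emmintrin.h>   /* SSE2 intrinsics */
#include <stdint.h>
#include <stddef.h>

/* Sketch: subtract wsize from every 16-bit entry, saturating at zero.
 * Assumes entries is a multiple of 8; uses unaligned loads for simplicity. */
static void slide_table_sketch(uint16_t *table, size_t entries, uint16_t wsize) {
    const __m128i vwsize = _mm_set1_epi16((short)wsize);

    for (size_t i = 0; i < entries; i += 8) {
        __m128i v = _mm_loadu_si128((const __m128i *)(table + i));
        v = _mm_subs_epu16(v, vwsize);           /* PSUBUSW: clamps at 0 */
        _mm_storeu_si128((__m128i *)(table + i), v);
    }
}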