Diffstat (limited to 'src/zlib-ng/arch/x86')
-rw-r--r--  src/zlib-ng/arch/x86/INDEX.md  8
-rw-r--r--  src/zlib-ng/arch/x86/Makefile.in  137
-rw-r--r--  src/zlib-ng/arch/x86/adler32_avx.c  117
-rw-r--r--  src/zlib-ng/arch/x86/adler32_avx2.c  17
-rw-r--r--  src/zlib-ng/arch/x86/adler32_avx2_p.h  32
-rw-r--r--  src/zlib-ng/arch/x86/adler32_avx2_tpl.h  140
-rw-r--r--  src/zlib-ng/arch/x86/adler32_avx512.c  16
-rw-r--r--  src/zlib-ng/arch/x86/adler32_avx512_p.h  46
-rw-r--r--  src/zlib-ng/arch/x86/adler32_avx512_tpl.h  106
-rw-r--r--  src/zlib-ng/arch/x86/adler32_avx512_vnni.c  225
-rw-r--r--  src/zlib-ng/arch/x86/adler32_sse42.c  121
-rw-r--r--  src/zlib-ng/arch/x86/adler32_ssse3.c  194
-rw-r--r--  src/zlib-ng/arch/x86/adler32_ssse3_p.h  29
-rw-r--r--  src/zlib-ng/arch/x86/chunk_permute_table.h  53
-rw-r--r--  src/zlib-ng/arch/x86/chunkset_avx.c  98
-rw-r--r--  src/zlib-ng/arch/x86/chunkset_sse2.c (renamed from src/zlib-ng/arch/x86/chunkset_sse.c)  22
-rw-r--r--  src/zlib-ng/arch/x86/chunkset_sse41.c  98
-rw-r--r--  src/zlib-ng/arch/x86/compare256_avx2.c (renamed from src/zlib-ng/arch/x86/compare258_avx.c)  26
-rw-r--r--  src/zlib-ng/arch/x86/compare256_sse2.c  96
-rw-r--r--  src/zlib-ng/arch/x86/compare258_sse.c  74
-rw-r--r--  src/zlib-ng/arch/x86/crc32_fold_pclmulqdq.c (renamed from src/zlib-ng/arch/x86/crc_folding.c)  267
-rw-r--r--  src/zlib-ng/arch/x86/crc32_fold_vpclmulqdq.c  206
-rw-r--r--  src/zlib-ng/arch/x86/crc_folding.h  19
-rw-r--r--  src/zlib-ng/arch/x86/insert_string_sse42.c (renamed from src/zlib-ng/arch/x86/insert_string_sse.c)  16
-rw-r--r--  src/zlib-ng/arch/x86/slide_hash_avx2.c (renamed from src/zlib-ng/arch/x86/slide_avx.c)  38
-rw-r--r--  src/zlib-ng/arch/x86/slide_hash_sse2.c  62
-rw-r--r--  src/zlib-ng/arch/x86/slide_sse.c  46
-rw-r--r--  src/zlib-ng/arch/x86/x86_features.c (renamed from src/zlib-ng/arch/x86/x86.c)  17
-rw-r--r--  src/zlib-ng/arch/x86/x86_features.h (renamed from src/zlib-ng/arch/x86/x86.h)  10
29 files changed, 1818 insertions, 518 deletions
diff --git a/src/zlib-ng/arch/x86/INDEX.md b/src/zlib-ng/arch/x86/INDEX.md
deleted file mode 100644
index 8bf6d08..0000000
--- a/src/zlib-ng/arch/x86/INDEX.md
+++ /dev/null
@@ -1,8 +0,0 @@
-Contents
---------
-
-|Name|Description|
-|:-|:-|
-|deflate_quick.c|SSE4 optimized deflate strategy for use as level 1|
-|crc_folding.c|SSE4 + PCLMULQDQ optimized CRC folding implementation|
-|slide_sse2.c|SSE2 optimized slide_hash|
diff --git a/src/zlib-ng/arch/x86/Makefile.in b/src/zlib-ng/arch/x86/Makefile.in
index 13c736c..f9aedf8 100644
--- a/src/zlib-ng/arch/x86/Makefile.in
+++ b/src/zlib-ng/arch/x86/Makefile.in
@@ -8,11 +8,15 @@ SFLAGS=
INCLUDES=
SUFFIX=
+AVX512FLAG=-mavx512f -mavx512dq -mavx512vl -mavx512bw
+AVX512VNNIFLAG=-mavx512vnni
AVX2FLAG=-mavx2
SSE2FLAG=-msse2
SSSE3FLAG=-mssse3
-SSE4FLAG=-msse4
+SSE41FLAG=-msse4.1
+SSE42FLAG=-msse4.2
PCLMULFLAG=-mpclmul
+VPCLMULFLAG=-mvpclmulqdq
NOLTOFLAG=
SRCDIR=.
@@ -20,23 +24,28 @@ SRCTOP=../..
TOPDIR=$(SRCTOP)
all: \
- x86.o x86.lo \
- adler32_avx.o adler32.lo \
+ x86_features.o x86_features.lo \
+ adler32_avx2.o adler32_avx2.lo \
+ adler32_avx512.o adler32_avx512.lo \
+ adler32_avx512_vnni.o adler32_avx512_vnni.lo \
+ adler32_sse42.o adler32_sse42.lo \
adler32_ssse3.o adler32_ssse3.lo \
chunkset_avx.o chunkset_avx.lo \
- chunkset_sse.o chunkset_sse.lo \
- compare258_avx.o compare258_avx.lo \
- compare258_sse.o compare258_sse.lo \
- insert_string_sse.o insert_string_sse.lo \
- crc_folding.o crc_folding.lo \
- slide_avx.o slide_avx.lo \
- slide_sse.o slide_sse.lo
-
-x86.o:
- $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/x86.c
-
-x86.lo:
- $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/x86.c
+ chunkset_sse2.o chunkset_sse2.lo \
+ chunkset_sse41.o chunkset_sse41.lo \
+ compare256_avx2.o compare256_avx2.lo \
+ compare256_sse2.o compare256_sse2.lo \
+ insert_string_sse42.o insert_string_sse42.lo \
+ crc32_fold_pclmulqdq.o crc32_fold_pclmulqdq.lo \
+ crc32_fold_vpclmulqdq.o crc32_fold_vpclmulqdq.lo \
+ slide_hash_avx2.o slide_hash_avx2.lo \
+ slide_hash_sse2.o slide_hash_sse2.lo
+
+x86_features.o:
+ $(CC) $(CFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/x86_features.c
+
+x86_features.lo:
+ $(CC) $(SFLAGS) $(INCLUDES) -c -o $@ $(SRCDIR)/x86_features.c
chunkset_avx.o:
$(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx.c
@@ -44,59 +53,89 @@ chunkset_avx.o:
chunkset_avx.lo:
$(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_avx.c
-chunkset_sse.o:
- $(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse.c
+chunkset_sse2.o:
+ $(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse2.c
+
+chunkset_sse2.lo:
+ $(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse2.c
+
+chunkset_sse41.o:
+ $(CC) $(CFLAGS) $(SSE41FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse41.c
+
+chunkset_sse41.lo:
+ $(CC) $(SFLAGS) $(SSE41FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse41.c
+
+compare256_avx2.o:
+ $(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx2.c
-chunkset_sse.lo:
- $(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/chunkset_sse.c
+compare256_avx2.lo:
+ $(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_avx2.c
-compare258_avx.o:
- $(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare258_avx.c
+compare256_sse2.o:
+ $(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse2.c
-compare258_avx.lo:
- $(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare258_avx.c
+compare256_sse2.lo:
+ $(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare256_sse2.c
-compare258_sse.o:
- $(CC) $(CFLAGS) $(SSE4FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/compare258_sse.c
+insert_string_sse42.o:
+ $(CC) $(CFLAGS) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse42.c
-compare258_sse.lo:
- $(CC) $(SFLAGS) $(SSE4FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/compare258_sse.c
+insert_string_sse42.lo:
+ $(CC) $(SFLAGS) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse42.c
-insert_string_sse.o:
- $(CC) $(CFLAGS) $(SSE4FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse.c
+crc32_fold_pclmulqdq.o:
+ $(CC) $(CFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_fold_pclmulqdq.c
-insert_string_sse.lo:
- $(CC) $(SFLAGS) $(SSE4FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/insert_string_sse.c
+crc32_fold_pclmulqdq.lo:
+ $(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_fold_pclmulqdq.c
-crc_folding.o:
- $(CC) $(CFLAGS) $(PCLMULFLAG) $(SSE4FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc_folding.c
+crc32_fold_vpclmulqdq.o:
+ $(CC) $(CFLAGS) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_fold_vpclmulqdq.c
-crc_folding.lo:
- $(CC) $(SFLAGS) $(PCLMULFLAG) $(SSE4FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc_folding.c
+crc32_fold_vpclmulqdq.lo:
+ $(CC) $(SFLAGS) $(VPCLMULFLAG) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/crc32_fold_vpclmulqdq.c
-slide_avx.o:
- $(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_avx.c
+slide_hash_avx2.o:
+ $(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_avx2.c
-slide_avx.lo:
- $(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_avx.c
+slide_hash_avx2.lo:
+ $(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_avx2.c
-slide_sse.o:
- $(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_sse.c
+slide_hash_sse2.o:
+ $(CC) $(CFLAGS) $(SSE2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_sse2.c
-slide_sse.lo:
- $(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_sse.c
+slide_hash_sse2.lo:
+ $(CC) $(SFLAGS) $(SSE2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/slide_hash_sse2.c
-adler32_avx.o: $(SRCDIR)/adler32_avx.c
- $(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx.c
+adler32_avx2.o: $(SRCDIR)/adler32_avx2.c
+ $(CC) $(CFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx2.c
-adler32_avx.lo: $(SRCDIR)/adler32_avx.c
- $(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx.c
+adler32_avx2.lo: $(SRCDIR)/adler32_avx2.c
+ $(CC) $(SFLAGS) $(AVX2FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx2.c
+
+adler32_avx512.o: $(SRCDIR)/adler32_avx512.c
+ $(CC) $(CFLAGS) $(AVX512FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512.c
+
+adler32_avx512.lo: $(SRCDIR)/adler32_avx512.c
+ $(CC) $(SFLAGS) $(AVX512FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512.c
+
+adler32_avx512_vnni.o: $(SRCDIR)/adler32_avx512_vnni.c
+ $(CC) $(CFLAGS) $(AVX512VNNIFLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512_vnni.c
+
+adler32_avx512_vnni.lo: $(SRCDIR)/adler32_avx512_vnni.c
+ $(CC) $(SFLAGS) $(AVX512VNNIFLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_avx512_vnni.c
adler32_ssse3.o: $(SRCDIR)/adler32_ssse3.c
$(CC) $(CFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c
adler32_ssse3.lo: $(SRCDIR)/adler32_ssse3.c
- $(CC) $(SFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c
+ $(CC) $(SFLAGS) $(SSSE3FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_ssse3.c
+
+adler32_sse42.o: $(SRCDIR)/adler32_sse42.c
+ $(CC) $(CFLAGS) $(SSE42FLAG) $(NOLTOFLAG) $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_sse42.c
+
+adler32_sse42.lo: $(SRCDIR)/adler32_sse42.c
+ $(CC) $(SFLAGS) $(SSE42FLAG) $(NOLTOFLAG) -DPIC $(INCLUDES) -c -o $@ $(SRCDIR)/adler32_sse42.c
mostlyclean: clean
clean:
diff --git a/src/zlib-ng/arch/x86/adler32_avx.c b/src/zlib-ng/arch/x86/adler32_avx.c
deleted file mode 100644
index 1063246..0000000
--- a/src/zlib-ng/arch/x86/adler32_avx.c
+++ /dev/null
@@ -1,117 +0,0 @@
-/* adler32.c -- compute the Adler-32 checksum of a data stream
- * Copyright (C) 1995-2011 Mark Adler
- * Authors:
- * Brian Bockelman <bockelman@gmail.com>
- * For conditions of distribution and use, see copyright notice in zlib.h
- */
-
-#include "../../zbuild.h"
-#include "../../zutil.h"
-
-#include "../../adler32_p.h"
-
-#include <immintrin.h>
-
-#ifdef X86_AVX2_ADLER32
-
-Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_t len) {
- uint32_t sum2;
-
- /* split Adler-32 into component sums */
- sum2 = (adler >> 16) & 0xffff;
- adler &= 0xffff;
-
- /* in case user likes doing a byte at a time, keep it fast */
- if (UNLIKELY(len == 1))
- return adler32_len_1(adler, buf, sum2);
-
- /* initial Adler-32 value (deferred check for len == 1 speed) */
- if (UNLIKELY(buf == NULL))
- return 1L;
-
- /* in case short lengths are provided, keep it somewhat fast */
- if (UNLIKELY(len < 16))
- return adler32_len_16(adler, buf, len, sum2);
-
- uint32_t ALIGNED_(32) s1[8], s2[8];
-
- memset(s1, 0, sizeof(s1)); s1[7] = adler; // TODO: would a masked load be faster?
- memset(s2, 0, sizeof(s2)); s2[7] = sum2;
-
- char ALIGNED_(32) dot1[32] = \
- {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
- __m256i dot1v = _mm256_load_si256((__m256i*)dot1);
- char ALIGNED_(32) dot2[32] = \
- {32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
- 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
- __m256i dot2v = _mm256_load_si256((__m256i*)dot2);
- short ALIGNED_(32) dot3[16] = \
- {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
- __m256i dot3v = _mm256_load_si256((__m256i*)dot3);
-
- // We will need to multiply by
- char ALIGNED_(32) shift[16] = {5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
- __m128i shiftv = _mm_load_si128((__m128i*)shift);
-
- while (len >= 32) {
- __m256i vs1 = _mm256_load_si256((__m256i*)s1);
- __m256i vs2 = _mm256_load_si256((__m256i*)s2);
- __m256i vs1_0 = vs1;
-
- int k = (len < NMAX ? (int)len : NMAX);
- k -= k % 32;
- len -= k;
-
- while (k >= 32) {
- /*
- vs1 = adler + sum(c[i])
- vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
- */
- __m256i vbuf = _mm256_loadu_si256((__m256i*)buf);
- buf += 32;
- k -= 32;
-
- __m256i v_short_sum1 = _mm256_maddubs_epi16(vbuf, dot1v); // multiply-add, resulting in 8 shorts.
- __m256i vsum1 = _mm256_madd_epi16(v_short_sum1, dot3v); // sum 8 shorts to 4 int32_t;
- __m256i v_short_sum2 = _mm256_maddubs_epi16(vbuf, dot2v);
- vs1 = _mm256_add_epi32(vsum1, vs1);
- __m256i vsum2 = _mm256_madd_epi16(v_short_sum2, dot3v);
- vs1_0 = _mm256_sll_epi32(vs1_0, shiftv);
- vsum2 = _mm256_add_epi32(vsum2, vs2);
- vs2 = _mm256_add_epi32(vsum2, vs1_0);
- vs1_0 = vs1;
- }
-
- // At this point, we have partial sums stored in vs1 and vs2. There are AVX512 instructions that
- // would allow us to sum these quickly (VP4DPWSSD). For now, just unpack and move on.
- uint32_t ALIGNED_(32) s1_unpack[8];
- uint32_t ALIGNED_(32) s2_unpack[8];
-
- _mm256_store_si256((__m256i*)s1_unpack, vs1);
- _mm256_store_si256((__m256i*)s2_unpack, vs2);
-
- adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE) +
- (s1_unpack[4] % BASE) + (s1_unpack[5] % BASE) + (s1_unpack[6] % BASE) + (s1_unpack[7] % BASE);
- adler %= BASE;
- s1[7] = adler;
-
- sum2 = (s2_unpack[0] % BASE) + (s2_unpack[1] % BASE) + (s2_unpack[2] % BASE) + (s2_unpack[3] % BASE) +
- (s2_unpack[4] % BASE) + (s2_unpack[5] % BASE) + (s2_unpack[6] % BASE) + (s2_unpack[7] % BASE);
- sum2 %= BASE;
- s2[7] = sum2;
- }
-
- while (len) {
- len--;
- adler += *buf++;
- sum2 += adler;
- }
- adler %= BASE;
- sum2 %= BASE;
-
- /* return recombined sums */
- return adler | (sum2 << 16);
-}
-
-#endif
diff --git a/src/zlib-ng/arch/x86/adler32_avx2.c b/src/zlib-ng/arch/x86/adler32_avx2.c
new file mode 100644
index 0000000..dcd1166
--- /dev/null
+++ b/src/zlib-ng/arch/x86/adler32_avx2.c
@@ -0,0 +1,17 @@
+/* adler32_avx2.c -- compute the Adler-32 checksum of a data stream
+ * Copyright (C) 1995-2011 Mark Adler
+ * Authors:
+ * Brian Bockelman <bockelman@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include <immintrin.h>
+
+#ifdef X86_AVX2_ADLER32
+
+#include "adler32_avx2_tpl.h"
+
+#define COPY
+#include "adler32_avx2_tpl.h"
+
+#endif
diff --git a/src/zlib-ng/arch/x86/adler32_avx2_p.h b/src/zlib-ng/arch/x86/adler32_avx2_p.h
new file mode 100644
index 0000000..f7079bf
--- /dev/null
+++ b/src/zlib-ng/arch/x86/adler32_avx2_p.h
@@ -0,0 +1,32 @@
+/* adler32_avx2_p.h -- adler32 avx2 utility functions
+ * Copyright (C) 2022 Adam Stylinski
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef ADLER32_AVX2_P_H_
+#define ADLER32_AVX2_P_H_
+
+#if defined(X86_AVX2_ADLER32) || defined(X86_AVX512VNNI_ADLER32)
+
+/* 32 bit horizontal sum, adapted from Agner Fog's vector library. */
+static inline uint32_t hsum256(__m256i x) {
+ __m128i sum1 = _mm_add_epi32(_mm256_extracti128_si256(x, 1),
+ _mm256_castsi256_si128(x));
+ __m128i sum2 = _mm_add_epi32(sum1, _mm_unpackhi_epi64(sum1, sum1));
+ __m128i sum3 = _mm_add_epi32(sum2, _mm_shuffle_epi32(sum2, 1));
+ return (uint32_t)_mm_cvtsi128_si32(sum3);
+}
+
+static inline uint32_t partial_hsum256(__m256i x) {
+ /* We need a permutation vector to extract every other integer. The
+ * rest are going to be zeros */
+ const __m256i perm_vec = _mm256_setr_epi32(0, 2, 4, 6, 1, 1, 1, 1);
+ __m256i non_zero = _mm256_permutevar8x32_epi32(x, perm_vec);
+ __m128i non_zero_sse = _mm256_castsi256_si128(non_zero);
+ __m128i sum2 = _mm_add_epi32(non_zero_sse,_mm_unpackhi_epi64(non_zero_sse, non_zero_sse));
+ __m128i sum3 = _mm_add_epi32(sum2, _mm_shuffle_epi32(sum2, 1));
+ return (uint32_t)_mm_cvtsi128_si32(sum3);
+}
+#endif
+
+#endif
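
The two helpers above differ only in which lanes they assume are populated: hsum256 sums all eight 32-bit lanes, while partial_hsum256 only gathers lanes 0, 2, 4 and 6, which is enough when the vector was produced by _mm256_sad_epu8 (the sum-of-absolute-differences result leaves every other 32-bit lane zero). A scalar sketch of that contract, with illustrative names not taken from the patch:

    #include <stdint.h>

    /* Scalar sketch (not part of the patch) of the two helpers above:
     * hsum256_model sums all eight 32-bit lanes; partial_hsum256_model assumes
     * the vector came from _mm256_sad_epu8, so only lanes 0, 2, 4 and 6 carry data. */
    static uint32_t hsum256_model(const uint32_t lanes[8]) {
        uint32_t sum = 0;
        for (int i = 0; i < 8; ++i)
            sum += lanes[i];
        return sum;
    }

    static uint32_t partial_hsum256_model(const uint32_t lanes[8]) {
        /* Odd lanes are assumed to be zero, so skipping them is purely an optimization. */
        return lanes[0] + lanes[2] + lanes[4] + lanes[6];
    }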
diff --git a/src/zlib-ng/arch/x86/adler32_avx2_tpl.h b/src/zlib-ng/arch/x86/adler32_avx2_tpl.h
new file mode 100644
index 0000000..59cacfa
--- /dev/null
+++ b/src/zlib-ng/arch/x86/adler32_avx2_tpl.h
@@ -0,0 +1,140 @@
+/* adler32_avx2_tpl.h -- adler32 avx2 vectorized function templates
+ * Copyright (C) 2022 Adam Stylinski
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "../../zbuild.h"
+#include <immintrin.h>
+#include "../../adler32_fold.h"
+#include "../../adler32_p.h"
+#include "../../fallback_builtins.h"
+#include "adler32_avx2_p.h"
+
+#ifdef X86_SSE42_ADLER32
+extern uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len);
+extern uint32_t adler32_ssse3(uint32_t adler, const uint8_t *src, size_t len);
+#define copy_sub32(a, b, c, d) adler32_fold_copy_sse42(a, b, c, d)
+#define sub32(a, b, c) adler32_ssse3(a, b, c)
+#else
+#define copy_sub32(a, b, c, d) adler32_copy_len_16(adler0, c, b, d, adler1)
+#define sub32(a, b, c) adler32_len_16(adler0, b, c, adler1)
+#endif
+
+#ifdef COPY
+Z_INTERNAL uint32_t adler32_fold_copy_avx2(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+#else
+Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const uint8_t *src, size_t len) {
+#endif
+ if (src == NULL) return 1L;
+ if (len == 0) return adler;
+
+ uint32_t adler0, adler1;
+ adler1 = (adler >> 16) & 0xffff;
+ adler0 = adler & 0xffff;
+
+rem_peel:
+ if (len < 16) {
+#ifdef COPY
+ return adler32_copy_len_16(adler0, src, dst, len, adler1);
+#else
+ return adler32_len_16(adler0, src, len, adler1);
+#endif
+ } else if (len < 32) {
+#ifdef COPY
+ return copy_sub32(adler, dst, src, len);
+#else
+ return sub32(adler, src, len);
+#endif
+ }
+
+ __m256i vs1, vs2;
+
+ const __m256i dot2v = _mm256_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15,
+ 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
+ const __m256i dot3v = _mm256_set1_epi16(1);
+ const __m256i zero = _mm256_setzero_si256();
+
+ while (len >= 32) {
+ vs1 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler0));
+ vs2 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler1));
+ __m256i vs1_0 = vs1;
+ __m256i vs3 = _mm256_setzero_si256();
+
+ size_t k = MIN(len, NMAX);
+ k -= k % 32;
+ len -= k;
+
+ while (k >= 32) {
+ /*
+ vs1 = adler + sum(c[i])
+ vs2 = sum2 + 32 vs1 + sum( (32-i+1) c[i] )
+ */
+ __m256i vbuf = _mm256_loadu_si256((__m256i*)src);
+ src += 32;
+ k -= 32;
+
+        __m256i vs1_sad = _mm256_sad_epu8(vbuf, zero); // Sum of abs diff, resulting in 4 x 64-bit partial sums
+#ifdef COPY
+ _mm256_storeu_si256((__m256i*)dst, vbuf);
+ dst += 32;
+#endif
+ vs1 = _mm256_add_epi32(vs1, vs1_sad);
+ vs3 = _mm256_add_epi32(vs3, vs1_0);
+ __m256i v_short_sum2 = _mm256_maddubs_epi16(vbuf, dot2v); // sum 32 uint8s to 16 shorts
+ __m256i vsum2 = _mm256_madd_epi16(v_short_sum2, dot3v); // sum 16 shorts to 8 uint32s
+ vs2 = _mm256_add_epi32(vsum2, vs2);
+ vs1_0 = vs1;
+ }
+
+ /* Defer the multiplication with 32 to outside of the loop */
+ vs3 = _mm256_slli_epi32(vs3, 5);
+ vs2 = _mm256_add_epi32(vs2, vs3);
+
+ /* The compiler is generating the following sequence for this integer modulus
+ * when done the scalar way, in GPRs:
+
+ adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE) +
+ (s1_unpack[4] % BASE) + (s1_unpack[5] % BASE) + (s1_unpack[6] % BASE) + (s1_unpack[7] % BASE);
+
+ mov $0x80078071,%edi // move magic constant into 32 bit register %edi
+ ...
+ vmovd %xmm1,%esi // move vector lane 0 to 32 bit register %esi
+ mov %rsi,%rax // zero-extend this value to 64 bit precision in %rax
+ imul %rdi,%rsi // do a signed multiplication with magic constant and vector element
+ shr $0x2f,%rsi // shift right by 47
+ imul $0xfff1,%esi,%esi // do a signed multiplication with value truncated to 32 bits with 0xfff1
+ sub %esi,%eax // subtract lower 32 bits of original vector value from modified one above
+ ...
+ // repeats for each element with vpextract instructions
+
+ This is tricky with AVX2 for a number of reasons:
+ 1.) There's no 64 bit multiplication instruction, but there is a sequence to get there
+ 2.) There's ways to extend vectors to 64 bit precision, but no simple way to truncate
+ back down to 32 bit precision later (there is in AVX512)
+ 3.) Full width integer multiplications aren't cheap
+
+       We can, however, do a relatively cheap sequence of horizontal sums.
+       Then we simply do the integer modulus on the resulting scalar value in a GPR. It was
+ previously thought that casting to 64 bit precision was needed prior to the horizontal sum, but
+ that is simply not the case, as NMAX is defined as the maximum number of scalar sums that can be
+ performed on the maximum possible inputs before overflow
+ */
+
+
+    /* In AVX2-land, this trip through GPRs will probably be unavoidable, as there's no cheap and easy
+ * conversion from 64 bit integer to 32 bit (needed for the inexpensive modulus with a constant).
+ * This casting to 32 bit is cheap through GPRs (just register aliasing). See above for exactly
+ * what the compiler is doing to avoid integer divisions. */
+ adler0 = partial_hsum256(vs1) % BASE;
+ adler1 = hsum256(vs2) % BASE;
+ }
+
+ adler = adler0 | (adler1 << 16);
+
+ if (len) {
+ goto rem_peel;
+ }
+
+ return adler;
+}
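
The template above keeps three running vectors: vs1 accumulates plain byte sums via _mm256_sad_epu8, vs2 accumulates the position-weighted sums from maddubs/madd, and vs3 collects the previous vs1 once per 32-byte block so the multiplication by 32 can be applied once, as a shift, outside the inner loop. A scalar sketch of the same blocking scheme (names and structure are illustrative, assuming the usual Adler-32 BASE of 65521; the real code defers the modulo until an NMAX-sized chunk has been consumed):

    #include <stddef.h>
    #include <stdint.h>

    #define BASE 65521u   /* largest prime smaller than 65536 */

    /* Scalar model of the blocked loop above: s3 records s1 as it was before each
     * 32-byte block, so the factor of 32 is applied with one multiply (a shift in
     * the vector code) instead of inside the hot loop. */
    static uint32_t adler32_blocked_model(uint32_t adler, const uint8_t *buf, size_t len) {
        uint32_t s1 = adler & 0xffff;
        uint32_t s2 = (adler >> 16) & 0xffff;
        while (len >= 32) {
            uint32_t s3 = s1;                /* like vs1_0 feeding vs3 */
            uint32_t wsum = 0;               /* like the maddubs/madd result */
            for (int i = 0; i < 32; ++i) {
                s1 += buf[i];
                wsum += (uint32_t)(32 - i) * buf[i];   /* dot2v weights 32..1 */
            }
            s2 += 32 * s3 + wsum;            /* deferred multiply, vs3 << 5 */
            s1 %= BASE;                      /* the real code only reduces every NMAX bytes */
            s2 %= BASE;
            buf += 32;
            len -= 32;
        }
        while (len--) {
            s1 += *buf++;
            s2 += s1;
        }
        return (s1 % BASE) | ((s2 % BASE) << 16);
    }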
diff --git a/src/zlib-ng/arch/x86/adler32_avx512.c b/src/zlib-ng/arch/x86/adler32_avx512.c
new file mode 100644
index 0000000..c0bf072
--- /dev/null
+++ b/src/zlib-ng/arch/x86/adler32_avx512.c
@@ -0,0 +1,16 @@
+/* adler32_avx512.c -- compute the Adler-32 checksum of a data stream
+ * Copyright (C) 1995-2011 Mark Adler
+ * Authors:
+ * Adam Stylinski <kungfujesus06@gmail.com>
+ * Brian Bockelman <bockelman@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_AVX512_ADLER32
+
+#include "adler32_avx512_tpl.h"
+
+#define COPY
+#include "adler32_avx512_tpl.h"
+
+#endif
diff --git a/src/zlib-ng/arch/x86/adler32_avx512_p.h b/src/zlib-ng/arch/x86/adler32_avx512_p.h
new file mode 100644
index 0000000..5b79d2a
--- /dev/null
+++ b/src/zlib-ng/arch/x86/adler32_avx512_p.h
@@ -0,0 +1,46 @@
+#ifndef AVX512_FUNCS_H
+#define AVX512_FUNCS_H
+
+#include <immintrin.h>
+#include <stdint.h>
+/* Written because *_add_epi32(a) sets off ubsan */
+static inline uint32_t _mm512_reduce_add_epu32(__m512i x) {
+ __m256i a = _mm512_extracti64x4_epi64(x, 1);
+ __m256i b = _mm512_extracti64x4_epi64(x, 0);
+
+ __m256i a_plus_b = _mm256_add_epi32(a, b);
+ __m128i c = _mm256_extracti128_si256(a_plus_b, 1);
+ __m128i d = _mm256_extracti128_si256(a_plus_b, 0);
+ __m128i c_plus_d = _mm_add_epi32(c, d);
+
+ __m128i sum1 = _mm_unpackhi_epi64(c_plus_d, c_plus_d);
+ __m128i sum2 = _mm_add_epi32(sum1, c_plus_d);
+ __m128i sum3 = _mm_shuffle_epi32(sum2, 0x01);
+ __m128i sum4 = _mm_add_epi32(sum2, sum3);
+
+ return _mm_cvtsi128_si32(sum4);
+}
+
+static inline uint32_t partial_hsum(__m512i x) {
+ /* We need a permutation vector to extract every other integer. The
+ * rest are going to be zeros. Marking this const so the compiler stands
+ * a better chance of keeping this resident in a register through entire
+ * loop execution. We certainly have enough zmm registers (32) */
+ const __m512i perm_vec = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14,
+ 1, 1, 1, 1, 1, 1, 1, 1);
+
+ __m512i non_zero = _mm512_permutexvar_epi32(perm_vec, x);
+
+ /* From here, it's a simple 256 bit wide reduction sum */
+ __m256i non_zero_avx = _mm512_castsi512_si256(non_zero);
+
+ /* See Agner Fog's vectorclass for a decent reference. Essentially, phadd is
+ * pretty slow, much slower than the longer instruction sequence below */
+ __m128i sum1 = _mm_add_epi32(_mm256_extracti128_si256(non_zero_avx, 1),
+ _mm256_castsi256_si128(non_zero_avx));
+ __m128i sum2 = _mm_add_epi32(sum1,_mm_unpackhi_epi64(sum1, sum1));
+ __m128i sum3 = _mm_add_epi32(sum2,_mm_shuffle_epi32(sum2, 1));
+ return (uint32_t)_mm_cvtsi128_si32(sum3);
+}
+
+#endif
diff --git a/src/zlib-ng/arch/x86/adler32_avx512_tpl.h b/src/zlib-ng/arch/x86/adler32_avx512_tpl.h
new file mode 100644
index 0000000..d324ce9
--- /dev/null
+++ b/src/zlib-ng/arch/x86/adler32_avx512_tpl.h
@@ -0,0 +1,106 @@
+/* adler32_avx512_tpl.h -- adler32 avx512 vectorized function templates
+ * Copyright (C) 2022 Adam Stylinski
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "../../zbuild.h"
+#include "../../adler32_p.h"
+#include "../../adler32_fold.h"
+#include "../../cpu_features.h"
+#include "../../fallback_builtins.h"
+#include <immintrin.h>
+#include "adler32_avx512_p.h"
+
+#ifdef X86_AVX512_ADLER32
+
+#ifdef COPY
+Z_INTERNAL uint32_t adler32_fold_copy_avx512(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+#else
+Z_INTERNAL uint32_t adler32_avx512(uint32_t adler, const uint8_t *src, size_t len) {
+#endif
+
+ if (src == NULL) return 1L;
+ if (len == 0) return adler;
+
+ uint32_t adler0, adler1;
+ adler1 = (adler >> 16) & 0xffff;
+ adler0 = adler & 0xffff;
+
+rem_peel:
+ if (len < 64) {
+ /* This handles the remaining copies, just call normal adler checksum after this */
+#ifdef COPY
+ __mmask64 storemask = (0xFFFFFFFFFFFFFFFFUL >> (64 - len));
+ __m512i copy_vec = _mm512_maskz_loadu_epi8(storemask, src);
+ _mm512_mask_storeu_epi8(dst, storemask, copy_vec);
+#endif
+
+#ifdef X86_AVX2_ADLER32
+ return adler32_avx2(adler, src, len);
+#elif defined(X86_SSSE3_ADLER32)
+ return adler32_ssse3(adler, src, len);
+#else
+ return adler32_len_16(adler0, src, len, adler1);
+#endif
+ }
+
+ __m512i vbuf, vs1_0, vs3;
+
+ const __m512i dot2v = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+ 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
+ 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63, 64);
+ const __m512i dot3v = _mm512_set1_epi16(1);
+ const __m512i zero = _mm512_setzero_si512();
+ size_t k;
+
+ while (len >= 64) {
+ __m512i vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler0));
+ __m512i vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler1));
+ vs1_0 = vs1;
+ vs3 = _mm512_setzero_si512();
+
+ k = MIN(len, NMAX);
+ k -= k % 64;
+ len -= k;
+
+ while (k >= 64) {
+ /*
+ vs1 = adler + sum(c[i])
+ vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] )
+ */
+ vbuf = _mm512_loadu_si512(src);
+#ifdef COPY
+ _mm512_storeu_si512(dst, vbuf);
+ dst += 64;
+#endif
+ src += 64;
+ k -= 64;
+
+ __m512i vs1_sad = _mm512_sad_epu8(vbuf, zero);
+ __m512i v_short_sum2 = _mm512_maddubs_epi16(vbuf, dot2v);
+ vs1 = _mm512_add_epi32(vs1_sad, vs1);
+ vs3 = _mm512_add_epi32(vs3, vs1_0);
+ __m512i vsum2 = _mm512_madd_epi16(v_short_sum2, dot3v);
+ vs2 = _mm512_add_epi32(vsum2, vs2);
+ vs1_0 = vs1;
+ }
+
+ vs3 = _mm512_slli_epi32(vs3, 6);
+ vs2 = _mm512_add_epi32(vs2, vs3);
+
+ adler0 = partial_hsum(vs1) % BASE;
+ adler1 = _mm512_reduce_add_epu32(vs2) % BASE;
+ }
+
+ adler = adler0 | (adler1 << 16);
+
+ /* Process tail (len < 64). */
+ if (len) {
+ goto rem_peel;
+ }
+
+ return adler;
+}
+
+#endif
diff --git a/src/zlib-ng/arch/x86/adler32_avx512_vnni.c b/src/zlib-ng/arch/x86/adler32_avx512_vnni.c
new file mode 100644
index 0000000..330bfe3
--- /dev/null
+++ b/src/zlib-ng/arch/x86/adler32_avx512_vnni.c
@@ -0,0 +1,225 @@
+/* adler32_avx512_vnni.c -- compute the Adler-32 checksum of a data stream
+ * Based on Brian Bockelman's AVX2 version
+ * Copyright (C) 1995-2011 Mark Adler
+ * Authors:
+ * Adam Stylinski <kungfujesus06@gmail.com>
+ * Brian Bockelman <bockelman@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_AVX512VNNI_ADLER32
+
+#include "../../zbuild.h"
+#include "../../adler32_p.h"
+#include "../../cpu_features.h"
+#include "../../fallback_builtins.h"
+#include <immintrin.h>
+#include "../../adler32_fold.h"
+#include "adler32_avx512_p.h"
+#include "adler32_avx2_p.h"
+
+Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const uint8_t *src, size_t len) {
+ if (src == NULL) return 1L;
+ if (len == 0) return adler;
+
+ uint32_t adler0, adler1;
+ adler1 = (adler >> 16) & 0xffff;
+ adler0 = adler & 0xffff;
+
+rem_peel:
+ if (len < 32)
+#if defined(X86_SSSE3_ADLER32)
+ return adler32_ssse3(adler, src, len);
+#else
+ return adler32_len_16(adler0, src, len, adler1);
+#endif
+
+ if (len < 64)
+#ifdef X86_AVX2_ADLER32
+ return adler32_avx2(adler, src, len);
+#elif defined(X86_SSSE3_ADLER32)
+ return adler32_ssse3(adler, src, len);
+#else
+ return adler32_len_16(adler0, src, len, adler1);
+#endif
+
+ const __m512i dot2v = _mm512_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+ 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
+ 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63, 64);
+
+ const __m512i zero = _mm512_setzero_si512();
+ __m512i vs1, vs2;
+
+ while (len >= 64) {
+ vs1 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler0));
+ vs2 = _mm512_zextsi128_si512(_mm_cvtsi32_si128(adler1));
+ size_t k = MIN(len, NMAX);
+ k -= k % 64;
+ len -= k;
+ __m512i vs1_0 = vs1;
+ __m512i vs3 = _mm512_setzero_si512();
+ /* We might get a tad bit more ILP here if we sum to a second register in the loop */
+ __m512i vs2_1 = _mm512_setzero_si512();
+ __m512i vbuf0, vbuf1;
+
+ /* Remainder peeling */
+ if (k % 128) {
+ vbuf1 = _mm512_loadu_si512((__m512i*)src);
+
+ src += 64;
+ k -= 64;
+
+ __m512i vs1_sad = _mm512_sad_epu8(vbuf1, zero);
+ vs1 = _mm512_add_epi32(vs1, vs1_sad);
+ vs3 = _mm512_add_epi32(vs3, vs1_0);
+ vs2 = _mm512_dpbusd_epi32(vs2, vbuf1, dot2v);
+ vs1_0 = vs1;
+ }
+
+        /* Manually unrolled this loop by 2 for a decent amount of ILP */
+ while (k >= 128) {
+ /*
+ vs1 = adler + sum(c[i])
+ vs2 = sum2 + 64 vs1 + sum( (64-i+1) c[i] )
+ */
+ vbuf0 = _mm512_loadu_si512((__m512i*)src);
+ vbuf1 = _mm512_loadu_si512((__m512i*)(src + 64));
+ src += 128;
+ k -= 128;
+
+ __m512i vs1_sad = _mm512_sad_epu8(vbuf0, zero);
+ vs1 = _mm512_add_epi32(vs1, vs1_sad);
+ vs3 = _mm512_add_epi32(vs3, vs1_0);
+ /* multiply-add, resulting in 16 ints. Fuse with sum stage from prior versions, as we now have the dp
+ * instructions to eliminate them */
+ vs2 = _mm512_dpbusd_epi32(vs2, vbuf0, dot2v);
+
+ vs3 = _mm512_add_epi32(vs3, vs1);
+ vs1_sad = _mm512_sad_epu8(vbuf1, zero);
+ vs1 = _mm512_add_epi32(vs1, vs1_sad);
+ vs2_1 = _mm512_dpbusd_epi32(vs2_1, vbuf1, dot2v);
+ vs1_0 = vs1;
+ }
+
+ vs3 = _mm512_slli_epi32(vs3, 6);
+ vs2 = _mm512_add_epi32(vs2, vs3);
+ vs2 = _mm512_add_epi32(vs2, vs2_1);
+
+ adler0 = partial_hsum(vs1) % BASE;
+ adler1 = _mm512_reduce_add_epu32(vs2) % BASE;
+ }
+
+ adler = adler0 | (adler1 << 16);
+
+ /* Process tail (len < 64). */
+ if (len) {
+ goto rem_peel;
+ }
+
+ return adler;
+}
+
+Z_INTERNAL uint32_t adler32_fold_copy_avx512_vnni(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+ if (src == NULL) return 1L;
+ if (len == 0) return adler;
+
+ uint32_t adler0, adler1;
+ adler1 = (adler >> 16) & 0xffff;
+ adler0 = adler & 0xffff;
+
+rem_peel_copy:
+ if (len < 32) {
+ /* This handles the remaining copies, just call normal adler checksum after this */
+ __mmask32 storemask = (0xFFFFFFFFUL >> (32 - len));
+ __m256i copy_vec = _mm256_maskz_loadu_epi8(storemask, src);
+ _mm256_mask_storeu_epi8(dst, storemask, copy_vec);
+
+#if defined(X86_SSSE3_ADLER32)
+ return adler32_ssse3(adler, src, len);
+#else
+ return adler32_len_16(adler0, src, len, adler1);
+#endif
+ }
+
+ const __m256i dot2v = _mm256_set_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+ 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);
+
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i vs1, vs2;
+
+ while (len >= 32) {
+ vs1 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler0));
+ vs2 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler1));
+ size_t k = MIN(len, NMAX);
+ k -= k % 32;
+ len -= k;
+ __m256i vs1_0 = vs1;
+ __m256i vs3 = _mm256_setzero_si256();
+ /* We might get a tad bit more ILP here if we sum to a second register in the loop */
+ __m256i vs2_1 = _mm256_setzero_si256();
+ __m256i vbuf0, vbuf1;
+
+ /* Remainder peeling */
+ if (k % 64) {
+ vbuf1 = _mm256_loadu_si256((__m256i*)src);
+ _mm256_storeu_si256((__m256i*)dst, vbuf1);
+ dst += 32;
+
+ src += 32;
+ k -= 32;
+
+ __m256i vs1_sad = _mm256_sad_epu8(vbuf1, zero);
+ vs1 = _mm256_add_epi32(vs1, vs1_sad);
+ vs3 = _mm256_add_epi32(vs3, vs1_0);
+ vs2 = _mm256_dpbusd_epi32(vs2, vbuf1, dot2v);
+ vs1_0 = vs1;
+ }
+
+        /* Manually unrolled this loop by 2 for a decent amount of ILP */
+ while (k >= 64) {
+ /*
+ vs1 = adler + sum(c[i])
+            vs2 = sum2 + 32 vs1 + sum( (32-i+1) c[i] )
+ */
+ vbuf0 = _mm256_loadu_si256((__m256i*)src);
+ vbuf1 = _mm256_loadu_si256((__m256i*)(src + 32));
+ _mm256_storeu_si256((__m256i*)dst, vbuf0);
+ _mm256_storeu_si256((__m256i*)(dst + 32), vbuf1);
+ dst += 64;
+ src += 64;
+ k -= 64;
+
+ __m256i vs1_sad = _mm256_sad_epu8(vbuf0, zero);
+ vs1 = _mm256_add_epi32(vs1, vs1_sad);
+ vs3 = _mm256_add_epi32(vs3, vs1_0);
+            /* multiply-add, resulting in 8 ints. Fuse with sum stage from prior versions, as we now have the dp
+ * instructions to eliminate them */
+ vs2 = _mm256_dpbusd_epi32(vs2, vbuf0, dot2v);
+
+ vs3 = _mm256_add_epi32(vs3, vs1);
+ vs1_sad = _mm256_sad_epu8(vbuf1, zero);
+ vs1 = _mm256_add_epi32(vs1, vs1_sad);
+ vs2_1 = _mm256_dpbusd_epi32(vs2_1, vbuf1, dot2v);
+ vs1_0 = vs1;
+ }
+
+ vs3 = _mm256_slli_epi32(vs3, 5);
+ vs2 = _mm256_add_epi32(vs2, vs3);
+ vs2 = _mm256_add_epi32(vs2, vs2_1);
+
+ adler0 = partial_hsum256(vs1) % BASE;
+ adler1 = hsum256(vs2) % BASE;
+ }
+
+ adler = adler0 | (adler1 << 16);
+
+    /* Process tail (len < 32). */
+ if (len) {
+ goto rem_peel_copy;
+ }
+
+ return adler;
+}
+
+#endif
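
The VNNI variants above replace the _mm512_maddubs_epi16 / _mm512_madd_epi16 pair used in the AVX2 and AVX-512 templates with a single _mm512_dpbusd_epi32 (or _mm256_dpbusd_epi32) per vector of input. Per 32-bit lane, that instruction behaves like the following scalar sketch (illustrative only, not code from the patch):

    #include <stdint.h>

    /* Scalar model of one 32-bit lane of VPDPBUSD: accumulate the dot product of
     * four unsigned bytes from 'a' with four signed bytes from 'b'. In the loops
     * above, 'a' is the input data and 'b' holds the descending weights. */
    static int32_t dpbusd_lane_model(int32_t acc, const uint8_t a[4], const int8_t b[4]) {
        for (int j = 0; j < 4; ++j)
            acc += (int32_t)a[j] * (int32_t)b[j];
        return acc;
    }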
diff --git a/src/zlib-ng/arch/x86/adler32_sse42.c b/src/zlib-ng/arch/x86/adler32_sse42.c
new file mode 100644
index 0000000..92efe4d
--- /dev/null
+++ b/src/zlib-ng/arch/x86/adler32_sse42.c
@@ -0,0 +1,121 @@
+/* adler32_sse42.c -- compute the Adler-32 checksum of a data stream
+ * Copyright (C) 1995-2011 Mark Adler
+ * Authors:
+ * Adam Stylinski <kungfujesus06@gmail.com>
+ * Brian Bockelman <bockelman@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "../../zbuild.h"
+#include "../../adler32_p.h"
+#include "../../adler32_fold.h"
+#include "adler32_ssse3_p.h"
+#include <immintrin.h>
+
+#ifdef X86_SSE42_ADLER32
+
+Z_INTERNAL uint32_t adler32_fold_copy_sse42(uint32_t adler, uint8_t *dst, const uint8_t *src, size_t len) {
+ uint32_t adler0, adler1;
+ adler1 = (adler >> 16) & 0xffff;
+ adler0 = adler & 0xffff;
+
+rem_peel:
+ if (len < 16) {
+ return adler32_copy_len_16(adler0, src, dst, len, adler1);
+ }
+
+ __m128i vbuf, vbuf_0;
+ __m128i vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0,
+ v_sad_sum2, vsum2, vsum2_0;
+ __m128i zero = _mm_setzero_si128();
+ const __m128i dot2v = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
+ const __m128i dot2v_0 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
+ const __m128i dot3v = _mm_set1_epi16(1);
+ size_t k;
+
+ while (len >= 16) {
+
+ k = MIN(len, NMAX);
+ k -= k % 16;
+ len -= k;
+
+ vs1 = _mm_cvtsi32_si128(adler0);
+ vs2 = _mm_cvtsi32_si128(adler1);
+
+ vs3 = _mm_setzero_si128();
+ vs2_0 = _mm_setzero_si128();
+ vs1_0 = vs1;
+
+ while (k >= 32) {
+ /*
+ vs1 = adler + sum(c[i])
+ vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
+ */
+ vbuf = _mm_loadu_si128((__m128i*)src);
+ vbuf_0 = _mm_loadu_si128((__m128i*)(src + 16));
+ src += 32;
+ k -= 32;
+
+ v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
+ v_sad_sum2 = _mm_sad_epu8(vbuf_0, zero);
+ _mm_storeu_si128((__m128i*)dst, vbuf);
+ _mm_storeu_si128((__m128i*)(dst + 16), vbuf_0);
+ dst += 32;
+
+ v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v);
+ v_short_sum2_0 = _mm_maddubs_epi16(vbuf_0, dot2v_0);
+
+ vs1 = _mm_add_epi32(v_sad_sum1, vs1);
+ vs3 = _mm_add_epi32(vs1_0, vs3);
+
+ vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
+ vsum2_0 = _mm_madd_epi16(v_short_sum2_0, dot3v);
+ vs1 = _mm_add_epi32(v_sad_sum2, vs1);
+ vs2 = _mm_add_epi32(vsum2, vs2);
+ vs2_0 = _mm_add_epi32(vsum2_0, vs2_0);
+ vs1_0 = vs1;
+ }
+
+ vs2 = _mm_add_epi32(vs2_0, vs2);
+ vs3 = _mm_slli_epi32(vs3, 5);
+ vs2 = _mm_add_epi32(vs3, vs2);
+ vs3 = _mm_setzero_si128();
+
+ while (k >= 16) {
+ /*
+ vs1 = adler + sum(c[i])
+ vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
+ */
+ vbuf = _mm_loadu_si128((__m128i*)src);
+ src += 16;
+ k -= 16;
+
+ v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
+ v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v_0);
+
+ vs1 = _mm_add_epi32(v_sad_sum1, vs1);
+ vs3 = _mm_add_epi32(vs1_0, vs3);
+ vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
+ vs2 = _mm_add_epi32(vsum2, vs2);
+ vs1_0 = vs1;
+
+ _mm_storeu_si128((__m128i*)dst, vbuf);
+ dst += 16;
+ }
+
+ vs3 = _mm_slli_epi32(vs3, 4);
+ vs2 = _mm_add_epi32(vs2, vs3);
+
+ adler0 = partial_hsum(vs1) % BASE;
+ adler1 = hsum(vs2) % BASE;
+ }
+
+    /* If this is true, there are fewer than 16 bytes remaining */
+ if (len) {
+ goto rem_peel;
+ }
+
+ return adler0 | (adler1 << 16);
+}
+
+#endif
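
adler32_fold_copy_sse42 interleaves the copy with the checksum: each vector is stored to dst right after it is loaded, so the data only has to be read once. A scalar sketch of that fold-copy idea (illustrative names; the modulo-BASE reduction is left to the caller here):

    #include <stddef.h>
    #include <stdint.h>

    /* Scalar model of the "fold copy": a single pass both copies the data and
     * updates the two Adler-32 partial sums (reduction mod BASE omitted). */
    static void adler32_fold_copy_model(uint32_t *s1, uint32_t *s2,
                                        uint8_t *dst, const uint8_t *src, size_t len) {
        while (len--) {
            uint8_t c = *src++;
            *dst++ = c;        /* the SSE4.2 code stores 16 or 32 bytes at a time */
            *s1 += c;
            *s2 += *s1;
        }
    }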
diff --git a/src/zlib-ng/arch/x86/adler32_ssse3.c b/src/zlib-ng/arch/x86/adler32_ssse3.c
index 101df4f..8c55bad 100644
--- a/src/zlib-ng/arch/x86/adler32_ssse3.c
+++ b/src/zlib-ng/arch/x86/adler32_ssse3.c
@@ -1,14 +1,14 @@
-/* adler32.c -- compute the Adler-32 checksum of a data stream
+/* adler32_ssse3.c -- compute the Adler-32 checksum of a data stream
* Copyright (C) 1995-2011 Mark Adler
* Authors:
+ * Adam Stylinski <kungfujesus06@gmail.com>
* Brian Bockelman <bockelman@gmail.com>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "../../zbuild.h"
-#include "../../zutil.h"
-
#include "../../adler32_p.h"
+#include "adler32_ssse3_p.h"
#ifdef X86_SSSE3_ADLER32
@@ -33,86 +33,124 @@ Z_INTERNAL uint32_t adler32_ssse3(uint32_t adler, const unsigned char *buf, size
if (UNLIKELY(len < 16))
return adler32_len_16(adler, buf, len, sum2);
- uint32_t ALIGNED_(16) s1[4], s2[4];
-
- s1[0] = s1[1] = s1[2] = 0; s1[3] = adler;
- s2[0] = s2[1] = s2[2] = 0; s2[3] = sum2;
-
- char ALIGNED_(16) dot1[16] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
- __m128i dot1v = _mm_load_si128((__m128i*)dot1);
- char ALIGNED_(16) dot2[16] = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
- __m128i dot2v = _mm_load_si128((__m128i*)dot2);
- short ALIGNED_(16) dot3[8] = {1, 1, 1, 1, 1, 1, 1, 1};
- __m128i dot3v = _mm_load_si128((__m128i*)dot3);
-
- // We will need to multiply by
- //char ALIGNED_(16) shift[4] = {0, 0, 0, 4}; //{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4};
+ const __m128i dot2v = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
+ const __m128i dot2v_0 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
+ const __m128i dot3v = _mm_set1_epi16(1);
+ const __m128i zero = _mm_setzero_si128();
+
+ __m128i vbuf, vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0,
+ vbuf_0, v_sad_sum2, vsum2, vsum2_0;
+
+    /* If our buffer is unaligned (likely), determine whether there is enough of a
+     * buffer to consume to make the scalar, aligning additions worthwhile, or
+     * whether it's better to just eat the cost of an unaligned load. This is a
+     * pretty simple test: just check whether len is smaller than 16 plus the
+     * number of bytes needed to reach 16-byte alignment. */
+ size_t max_iters = NMAX;
+ size_t rem = (uintptr_t)buf & 15;
+ size_t align_offset = 16 - rem;
+ size_t k = 0;
+ if (rem) {
+ if (len < 16 + align_offset) {
+ /* Let's eat the cost of this one unaligned load so that
+ * we don't completely skip over the vectorization. Doing
+             * 16 bytes at a time unaligned is better than 16 + <= 15
+ * sums */
+ vbuf = _mm_loadu_si128((__m128i*)buf);
+ len -= 16;
+ buf += 16;
+ vs1 = _mm_cvtsi32_si128(adler);
+ vs2 = _mm_cvtsi32_si128(sum2);
+ vs3 = _mm_setzero_si128();
+ vs1_0 = vs1;
+ goto unaligned_jmp;
+ }
+
+ for (size_t i = 0; i < align_offset; ++i) {
+ adler += *(buf++);
+ sum2 += adler;
+ }
+
+ /* lop off the max number of sums based on the scalar sums done
+ * above */
+ len -= align_offset;
+ max_iters -= align_offset;
+ }
- char ALIGNED_(16) shift[16] = {4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
- __m128i shiftv = _mm_load_si128((__m128i*)shift);
while (len >= 16) {
- __m128i vs1 = _mm_load_si128((__m128i*)s1);
- __m128i vs2 = _mm_load_si128((__m128i*)s2);
- __m128i vs1_0 = vs1;
-
- int k = (len < NMAX ? (int)len : NMAX);
- k -= k % 16;
- len -= k;
-
- while (k >= 16) {
- /*
- vs1 = adler + sum(c[i])
- vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
-
- NOTE: 256-bit equivalents are:
- _mm256_maddubs_epi16 <- operates on 32 bytes to 16 shorts
- _mm256_madd_epi16 <- Sums 16 shorts to 8 int32_t.
- We could rewrite the below to use 256-bit instructions instead of 128-bit.
- */
- __m128i vbuf = _mm_loadu_si128((__m128i*)buf);
- buf += 16;
- k -= 16;
-
- __m128i v_short_sum1 = _mm_maddubs_epi16(vbuf, dot1v); // multiply-add, resulting in 8 shorts.
- __m128i vsum1 = _mm_madd_epi16(v_short_sum1, dot3v); // sum 8 shorts to 4 int32_t;
- __m128i v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v);
- vs1 = _mm_add_epi32(vsum1, vs1);
- __m128i vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
- vs1_0 = _mm_sll_epi32(vs1_0, shiftv);
- vsum2 = _mm_add_epi32(vsum2, vs2);
- vs2 = _mm_add_epi32(vsum2, vs1_0);
- vs1_0 = vs1;
- }
-
- // At this point, we have partial sums stored in vs1 and vs2. There are AVX512 instructions that
- // would allow us to sum these quickly (VP4DPWSSD). For now, just unpack and move on.
-
- uint32_t ALIGNED_(16) s1_unpack[4];
- uint32_t ALIGNED_(16) s2_unpack[4];
-
- _mm_store_si128((__m128i*)s1_unpack, vs1);
- _mm_store_si128((__m128i*)s2_unpack, vs2);
-
- adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE);
- adler %= BASE;
- s1[3] = adler;
-
- sum2 = (s2_unpack[0] % BASE) + (s2_unpack[1] % BASE) + (s2_unpack[2] % BASE) + (s2_unpack[3] % BASE);
- sum2 %= BASE;
- s2[3] = sum2;
- }
-
- while (len) {
- len--;
- adler += *buf++;
- sum2 += adler;
+ vs1 = _mm_cvtsi32_si128(adler);
+ vs2 = _mm_cvtsi32_si128(sum2);
+ vs3 = _mm_setzero_si128();
+ vs2_0 = _mm_setzero_si128();
+ vs1_0 = vs1;
+
+ k = (len < max_iters ? len : max_iters);
+ k -= k % 16;
+ len -= k;
+
+ while (k >= 32) {
+ /*
+ vs1 = adler + sum(c[i])
+ vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
+ */
+ vbuf = _mm_load_si128((__m128i*)buf);
+ vbuf_0 = _mm_load_si128((__m128i*)(buf + 16));
+ buf += 32;
+ k -= 32;
+
+ v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
+ v_sad_sum2 = _mm_sad_epu8(vbuf_0, zero);
+ vs1 = _mm_add_epi32(v_sad_sum1, vs1);
+ vs3 = _mm_add_epi32(vs1_0, vs3);
+
+ vs1 = _mm_add_epi32(v_sad_sum2, vs1);
+ v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v);
+ vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
+ v_short_sum2_0 = _mm_maddubs_epi16(vbuf_0, dot2v_0);
+ vs2 = _mm_add_epi32(vsum2, vs2);
+ vsum2_0 = _mm_madd_epi16(v_short_sum2_0, dot3v);
+ vs2_0 = _mm_add_epi32(vsum2_0, vs2_0);
+ vs1_0 = vs1;
+ }
+
+ vs2 = _mm_add_epi32(vs2_0, vs2);
+ vs3 = _mm_slli_epi32(vs3, 5);
+ vs2 = _mm_add_epi32(vs3, vs2);
+ vs3 = _mm_setzero_si128();
+
+ while (k >= 16) {
+ /*
+ vs1 = adler + sum(c[i])
+ vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
+ */
+ vbuf = _mm_load_si128((__m128i*)buf);
+ buf += 16;
+ k -= 16;
+
+unaligned_jmp:
+ v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
+ vs1 = _mm_add_epi32(v_sad_sum1, vs1);
+ vs3 = _mm_add_epi32(vs1_0, vs3);
+ v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v_0);
+ vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
+ vs2 = _mm_add_epi32(vsum2, vs2);
+ vs1_0 = vs1;
+ }
+
+ vs3 = _mm_slli_epi32(vs3, 4);
+ vs2 = _mm_add_epi32(vs2, vs3);
+
+ /* We don't actually need to do a full horizontal sum, since psadbw is actually doing
+ * a partial reduction sum implicitly and only summing to integers in vector positions
+ * 0 and 2. This saves us some contention on the shuffle port(s) */
+ adler = partial_hsum(vs1) % BASE;
+ sum2 = hsum(vs2) % BASE;
+ max_iters = NMAX;
}
- adler %= BASE;
- sum2 %= BASE;
- /* return recombined sums */
- return adler | (sum2 << 16);
+ /* Process tail (len < 16). */
+ return adler32_len_16(adler, buf, len, sum2);
}
#endif
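
The rewritten SSSE3 loop leans on the fact that _mm_sad_epu8 against a zero vector already performs a partial reduction: each 64-bit lane receives the sum of its eight bytes, so only 32-bit lanes 0 and 2 are nonzero and partial_hsum() needs just one shift and one add. A scalar sketch of that behavior (illustrative only):

    #include <stdint.h>

    /* Scalar model of _mm_sad_epu8(x, zero): each 64-bit half of the vector gets
     * the sum of its eight bytes, leaving 32-bit lanes 1 and 3 zero. */
    static void psadbw_zero_model(const uint8_t bytes[16], uint32_t out32[4]) {
        out32[0] = out32[1] = out32[2] = out32[3] = 0;
        for (int i = 0; i < 8; ++i)
            out32[0] += bytes[i];
        for (int i = 8; i < 16; ++i)
            out32[2] += bytes[i];
    }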
diff --git a/src/zlib-ng/arch/x86/adler32_ssse3_p.h b/src/zlib-ng/arch/x86/adler32_ssse3_p.h
new file mode 100644
index 0000000..ba914e1
--- /dev/null
+++ b/src/zlib-ng/arch/x86/adler32_ssse3_p.h
@@ -0,0 +1,29 @@
+/* adler32_ssse3_p.h -- adler32 ssse3 utility functions
+ * Copyright (C) 2022 Adam Stylinski
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef ADLER32_SSSE3_P_H_
+#define ADLER32_SSSE3_P_H_
+
+#ifdef X86_SSSE3_ADLER32
+
+#include <immintrin.h>
+#include <stdint.h>
+
+static inline uint32_t partial_hsum(__m128i x) {
+ __m128i second_int = _mm_bsrli_si128(x, 8);
+ __m128i sum = _mm_add_epi32(x, second_int);
+ return _mm_cvtsi128_si32(sum);
+}
+
+static inline uint32_t hsum(__m128i x) {
+ __m128i sum1 = _mm_unpackhi_epi64(x, x);
+ __m128i sum2 = _mm_add_epi32(x, sum1);
+ __m128i sum3 = _mm_shuffle_epi32(sum2, 0x01);
+ __m128i sum4 = _mm_add_epi32(sum2, sum3);
+ return _mm_cvtsi128_si32(sum4);
+}
+#endif
+
+#endif
diff --git a/src/zlib-ng/arch/x86/chunk_permute_table.h b/src/zlib-ng/arch/x86/chunk_permute_table.h
new file mode 100644
index 0000000..c7b2d2d
--- /dev/null
+++ b/src/zlib-ng/arch/x86/chunk_permute_table.h
@@ -0,0 +1,53 @@
+/* chunk_permute_table.h - shared AVX/SSE4 permutation table for use with chunkmemset family of functions.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifndef CHUNK_PERMUTE_TABLE_H_
+#define CHUNK_PERMUTE_TABLE_H_
+
+#include "zbuild.h"
+
+/* Need entries for all numbers not an even modulus for 1, 2, 4, 8, 16 & 32 */
+static const ALIGNED_(32) uint8_t permute_table[26*32] = {
+ 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, /* dist 3 */
+ 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, /* dist 5 */
+ 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, /* dist 6 */
+ 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, /* dist 7 */
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, /* dist 9 */
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, /* dist 10 */
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, /* dist 11 */
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, /* dist 12 */
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2, 3, 4, 5, /* dist 13 */
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 1, 2, 3, /* dist 14 */
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, 1, /* dist 15 */
+
+    /* Distances beyond 15 mean we have to permute from a vector longer than an __m128i. Because AVX couldn't permute
+     * across 128 bit lanes until AVX512 for sub 4-byte sequences, we have to do some math here for an eventual
+     * blend with a comparison. That means we need to wrap the indices with yet another derived table. For simplicity,
+     * we'll use absolute indexing here to derive a blend vector. This is actually a lot simpler with ARM's TBL, but
+     * this is what we're dealt.
+ */
+
+ 16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, /* dist 17 */
+ 16, 17, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, /* dist 18 */
+ 16, 17, 18, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, /* dist 19 */
+ 16, 17, 18, 19, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, /* dist 20 */
+ 16, 17, 18, 19, 20, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, /* dist 21 */
+ 16, 17, 18, 19, 20, 21, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, /* dist 22 */
+ 16, 17, 18, 19, 20, 21, 22, 0, 1, 2, 3, 4, 5, 6, 7, 8, /* dist 23 */
+ 16, 17, 18, 19, 20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6, 7, /* dist 24 */
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 0, 1, 2, 3, 4, 5, 6, /* dist 25 */
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 0, 1, 2, 3, 4, 5, /* dist 26 */
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 0, 1, 2, 3, 4, /* dist 27 */
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 0, 1, 2, 3, /* dist 28 */
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 0, 1, 2, /* dist 29 */
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 0, 1, /* dist 30 */
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 0, /* dist 31 */
+};
+
+typedef struct lut_rem_pair_s {
+ uint16_t idx;
+ uint16_t remval;
+} lut_rem_pair;
+
+#endif
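
Each 32-byte row of permute_table is a shuffle mask that replicates a dist-byte pattern across a vector, and the remval field of lut_rem_pair records how many pattern bytes are left over after the last full repetition. For the rows up to dist 15 the effect is the vector form of the scalar copy sketched below (illustrative only; the dist 17-31 rows instead describe just the wrapped upper half and rely on the blend discussed in the comment above):

    #include <stdint.h>

    /* Scalar model of what a permute_table row encodes for dist <= 15: lane i of
     * an n-byte chunk selects byte (i % dist) of the repeating pattern. */
    static void permute_row_model(uint8_t *out, const uint8_t *pattern,
                                  uint32_t dist, uint32_t n) {
        for (uint32_t i = 0; i < n; ++i)
            out[i] = pattern[i % dist];
    }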
diff --git a/src/zlib-ng/arch/x86/chunkset_avx.c b/src/zlib-ng/arch/x86/chunkset_avx.c
index eb76c0d..91aaa45 100644
--- a/src/zlib-ng/arch/x86/chunkset_avx.c
+++ b/src/zlib-ng/arch/x86/chunkset_avx.c
@@ -2,32 +2,70 @@
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "zbuild.h"
-#include "zutil.h"
#ifdef X86_AVX_CHUNKSET
#include <immintrin.h>
+#include "chunk_permute_table.h"
typedef __m256i chunk_t;
-#define HAVE_CHUNKMEMSET_1
+#define CHUNK_SIZE 32
+
#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8
+#define HAVE_CHUNK_MAG
-static inline void chunkmemset_1(uint8_t *from, chunk_t *chunk) {
- *chunk = _mm256_set1_epi8(*(int8_t *)from);
-}
+/* Populate the don't-care entries so that this is a direct lookup (with some indirection into the permute table).
+ * Because dist can never be 0 - 2, we start with an offset, subtracting 3 from the input */
+static const lut_rem_pair perm_idx_lut[29] = {
+ { 0, 2}, /* 3 */
+ { 0, 0}, /* don't care */
+ { 1 * 32, 2}, /* 5 */
+ { 2 * 32, 2}, /* 6 */
+ { 3 * 32, 4}, /* 7 */
+ { 0 * 32, 0}, /* don't care */
+ { 4 * 32, 5}, /* 9 */
+ { 5 * 32, 22}, /* 10 */
+ { 6 * 32, 21}, /* 11 */
+ { 7 * 32, 20}, /* 12 */
+ { 8 * 32, 6}, /* 13 */
+ { 9 * 32, 4}, /* 14 */
+ {10 * 32, 2}, /* 15 */
+ { 0 * 32, 0}, /* don't care */
+ {11 * 32, 15}, /* 17 */
+ {11 * 32 + 16, 14}, /* 18 */
+ {11 * 32 + 16 * 2, 13}, /* 19 */
+ {11 * 32 + 16 * 3, 12}, /* 20 */
+ {11 * 32 + 16 * 4, 11}, /* 21 */
+ {11 * 32 + 16 * 5, 10}, /* 22 */
+ {11 * 32 + 16 * 6, 9}, /* 23 */
+ {11 * 32 + 16 * 7, 8}, /* 24 */
+ {11 * 32 + 16 * 8, 7}, /* 25 */
+ {11 * 32 + 16 * 9, 6}, /* 26 */
+ {11 * 32 + 16 * 10, 5}, /* 27 */
+ {11 * 32 + 16 * 11, 4}, /* 28 */
+ {11 * 32 + 16 * 12, 3}, /* 29 */
+ {11 * 32 + 16 * 13, 2}, /* 30 */
+ {11 * 32 + 16 * 14, 1} /* 31 */
+};
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
- *chunk = _mm256_set1_epi16(*(int16_t *)from);
+ int16_t tmp;
+ zmemcpy_2(&tmp, from);
+ *chunk = _mm256_set1_epi16(tmp);
}
static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
- *chunk = _mm256_set1_epi32(*(int32_t *)from);
+ int32_t tmp;
+ zmemcpy_4(&tmp, from);
+ *chunk = _mm256_set1_epi32(tmp);
}
static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
- *chunk = _mm256_set1_epi64x(*(int64_t *)from);
+ int64_t tmp;
+ zmemcpy_8(&tmp, from);
+ *chunk = _mm256_set1_epi64x(tmp);
}
static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
@@ -38,6 +76,50 @@ static inline void storechunk(uint8_t *out, chunk_t *chunk) {
_mm256_storeu_si256((__m256i *)out, *chunk);
}
+static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
+ lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
+ __m256i ret_vec;
+ /* While technically we only need to read 4 or 8 bytes into this vector register for a lot of cases, GCC is
+     * compiling this to a shared load for all branches, preferring the simpler code. Given that the buf value isn't in
+     * GPRs to begin with, the 256 bit load is _probably_ just as inexpensive */
+ *chunk_rem = lut_rem.remval;
+
+#ifdef Z_MEMORY_SANITIZER
+    /* See note in chunkset_sse41.c for why this is ok */
+ __msan_unpoison(buf + dist, 32 - dist);
+#endif
+
+ if (dist < 16) {
+ /* This simpler case still requires us to shuffle in 128 bit lanes, so we must apply a static offset after
+ * broadcasting the first vector register to both halves. This is _marginally_ faster than doing two separate
+ * shuffles and combining the halves later */
+ const __m256i permute_xform =
+ _mm256_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16);
+ __m256i perm_vec = _mm256_load_si256((__m256i*)(permute_table+lut_rem.idx));
+ __m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
+ perm_vec = _mm256_add_epi8(perm_vec, permute_xform);
+ ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1);
+ ret_vec = _mm256_shuffle_epi8(ret_vec, perm_vec);
+ } else if (dist == 16) {
+ __m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
+ return _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), ret_vec0, 1);
+ } else {
+ __m128i ret_vec0 = _mm_loadu_si128((__m128i*)buf);
+ __m128i ret_vec1 = _mm_loadu_si128((__m128i*)(buf + 16));
+ /* Take advantage of the fact that only the latter half of the 256 bit vector will actually differ */
+ __m128i perm_vec1 = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
+ __m128i xlane_permutes = _mm_cmpgt_epi8(_mm_set1_epi8(16), perm_vec1);
+ __m128i xlane_res = _mm_shuffle_epi8(ret_vec0, perm_vec1);
+        /* Since we can't wrap twice, we can simply keep the latter half exactly how it is instead of having to _also_
+ * shuffle those values */
+ __m128i latter_half = _mm_blendv_epi8(ret_vec1, xlane_res, xlane_permutes);
+ ret_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(ret_vec0), latter_half, 1);
+ }
+
+ return ret_vec;
+}
+
#define CHUNKSIZE chunksize_avx
#define CHUNKCOPY chunkcopy_avx
#define CHUNKCOPY_SAFE chunkcopy_safe_avx
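
For distances of 17-31, GET_CHUNK_MAG above only has to repair the upper 16 bytes of the 256-bit chunk: mask entries below 16 wrap around into the first 16-byte load (shuffled), while entries of 16 or more keep the corresponding byte of the second load, selected with the compare-and-blend pair. A scalar sketch of that decision (illustrative only):

    #include <stdint.h>

    /* Scalar model of the dist > 16 path: build the upper 16 bytes of the chunk.
     * Mask entries < 16 wrap into the first load (lo); entries >= 16 keep the
     * byte already present in the second load (hi). */
    static void upper_half_blend_model(uint8_t out[16], const uint8_t lo[16],
                                       const uint8_t hi[16], const uint8_t mask[16]) {
        for (int i = 0; i < 16; ++i)
            out[i] = (mask[i] < 16) ? lo[mask[i]] : hi[i];
    }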
diff --git a/src/zlib-ng/arch/x86/chunkset_sse.c b/src/zlib-ng/arch/x86/chunkset_sse2.c
index 1d5a0fa..be195cf 100644
--- a/src/zlib-ng/arch/x86/chunkset_sse.c
+++ b/src/zlib-ng/arch/x86/chunkset_sse2.c
@@ -1,34 +1,36 @@
-/* chunkset_sse.c -- SSE inline functions to copy small data chunks.
+/* chunkset_sse2.c -- SSE2 inline functions to copy small data chunks.
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "zbuild.h"
-#include "zutil.h"
#ifdef X86_SSE2
#include <immintrin.h>
typedef __m128i chunk_t;
-#define HAVE_CHUNKMEMSET_1
+#define CHUNK_SIZE 16
+
#define HAVE_CHUNKMEMSET_2
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8
-static inline void chunkmemset_1(uint8_t *from, chunk_t *chunk) {
- *chunk = _mm_set1_epi8(*(int8_t *)from);
-}
-
static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
- *chunk = _mm_set1_epi16(*(int16_t *)from);
+ int16_t tmp;
+ zmemcpy_2(&tmp, from);
+ *chunk = _mm_set1_epi16(tmp);
}
static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
- *chunk = _mm_set1_epi32(*(int32_t *)from);
+ int32_t tmp;
+ zmemcpy_4(&tmp, from);
+ *chunk = _mm_set1_epi32(tmp);
}
static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
- *chunk = _mm_set1_epi64x(*(int64_t *)from);
+ int64_t tmp;
+ zmemcpy_8(&tmp, from);
+ *chunk = _mm_set1_epi64x(tmp);
}
static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
diff --git a/src/zlib-ng/arch/x86/chunkset_sse41.c b/src/zlib-ng/arch/x86/chunkset_sse41.c
new file mode 100644
index 0000000..c6f9821
--- /dev/null
+++ b/src/zlib-ng/arch/x86/chunkset_sse41.c
@@ -0,0 +1,98 @@
+/* chunkset_sse41.c -- SSE4 inline functions to copy small data chunks.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "zbuild.h"
+
+/* This requires SSE2 support. While it's implicit with SSE4, we can minimize
+ * code size by sharing the chunkcopy functions, which will certainly compile
+ * to identical machine code */
+#if defined(X86_SSE41) && defined(X86_SSE2)
+#include <immintrin.h>
+#include "chunk_permute_table.h"
+
+typedef __m128i chunk_t;
+
+#define CHUNK_SIZE 16
+
+#define HAVE_CHUNKMEMSET_2
+#define HAVE_CHUNKMEMSET_4
+#define HAVE_CHUNKMEMSET_8
+#define HAVE_CHUNK_MAG
+#define HAVE_CHUNKCOPY
+#define HAVE_CHUNKUNROLL
+
+static const lut_rem_pair perm_idx_lut[13] = {
+ {0, 1}, /* 3 */
+ {0, 0}, /* don't care */
+ {1 * 32, 1}, /* 5 */
+ {2 * 32, 4}, /* 6 */
+ {3 * 32, 2}, /* 7 */
+ {0 * 32, 0}, /* don't care */
+ {4 * 32, 7}, /* 9 */
+ {5 * 32, 6}, /* 10 */
+ {6 * 32, 5}, /* 11 */
+ {7 * 32, 4}, /* 12 */
+ {8 * 32, 3}, /* 13 */
+ {9 * 32, 2}, /* 14 */
+ {10 * 32, 1},/* 15 */
+};
+
+
+static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) {
+ int16_t tmp;
+ zmemcpy_2(&tmp, from);
+ *chunk = _mm_set1_epi16(tmp);
+}
+
+static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) {
+ int32_t tmp;
+ zmemcpy_4(&tmp, from);
+ *chunk = _mm_set1_epi32(tmp);
+}
+
+static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) {
+ int64_t tmp;
+ zmemcpy_8(&tmp, from);
+ *chunk = _mm_set1_epi64x(tmp);
+}
+
+static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
+ *chunk = _mm_loadu_si128((__m128i *)s);
+}
+
+static inline void storechunk(uint8_t *out, chunk_t *chunk) {
+ _mm_storeu_si128((__m128i *)out, *chunk);
+}
+
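+/* Broadcast a repeating pattern of length dist from buf across a 16 byte chunk: load 16 bytes
+ * (only the first dist of which are initialized) and shuffle them with the precomputed mask for
+ * this distance. *chunk_rem is set to 16 % dist, i.e. how far into the pattern the chunk ends. */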
+static inline chunk_t GET_CHUNK_MAG(uint8_t *buf, uint32_t *chunk_rem, uint32_t dist) {
+ lut_rem_pair lut_rem = perm_idx_lut[dist - 3];
+ __m128i perm_vec, ret_vec;
+#ifdef Z_MEMORY_SANITIZER
+ /* Important to note:
+ * This is _not_ to subvert the memory sanitizer but to instead unpoison some
+     * bytes we willingly and purposefully load uninitialized and then swizzle over in a vector
+     * register anyway. If our assumption about which bytes are used turns out to be wrong,
+     * the memory sanitizer will still usefully flag it */
+ __msan_unpoison(buf + dist, 16 - dist);
+#endif
+ ret_vec = _mm_loadu_si128((__m128i*)buf);
+ *chunk_rem = lut_rem.remval;
+
+ perm_vec = _mm_load_si128((__m128i*)(permute_table + lut_rem.idx));
+ ret_vec = _mm_shuffle_epi8(ret_vec, perm_vec);
+
+ return ret_vec;
+}
+
+extern uint8_t* chunkcopy_sse2(uint8_t *out, uint8_t const *from, unsigned len);
+
+#define CHUNKSIZE chunksize_sse41
+#define CHUNKMEMSET chunkmemset_sse41
+#define CHUNKMEMSET_SAFE chunkmemset_safe_sse41
+#define CHUNKCOPY(a, b, c) chunkcopy_sse2(a, b, c)
+#define CHUNKUNROLL(a, b, c) chunkunroll_sse2(a, b, c)
+
+#include "chunkset_tpl.h"
+
+#endif
diff --git a/src/zlib-ng/arch/x86/compare258_avx.c b/src/zlib-ng/arch/x86/compare256_avx2.c
index d9108fd..1318a0e 100644
--- a/src/zlib-ng/arch/x86/compare258_avx.c
+++ b/src/zlib-ng/arch/x86/compare256_avx2.c
@@ -1,10 +1,9 @@
-/* compare258_avx.c -- AVX2 version of compare258
+/* compare256_avx2.c -- AVX2 version of compare256
* Copyright Mika T. Lindqvist <postmaster@raasu.org>
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "../../zbuild.h"
-#include "../../zutil.h"
#include "fallback_builtins.h"
@@ -15,8 +14,7 @@
# include <nmmintrin.h>
#endif
-/* UNALIGNED_OK, AVX2 intrinsic comparison */
-static inline uint32_t compare256_unaligned_avx2_static(const unsigned char *src0, const unsigned char *src1) {
+static inline uint32_t compare256_avx2_static(const uint8_t *src0, const uint8_t *src1) {
uint32_t len = 0;
do {
@@ -47,20 +45,18 @@ static inline uint32_t compare256_unaligned_avx2_static(const unsigned char *src
return 256;
}
-static inline uint32_t compare258_unaligned_avx2_static(const unsigned char *src0, const unsigned char *src1) {
- if (*(uint16_t *)src0 != *(uint16_t *)src1)
- return (*src0 == *src1);
-
- return compare256_unaligned_avx2_static(src0+2, src1+2) + 2;
+Z_INTERNAL uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1) {
+ return compare256_avx2_static(src0, src1);
}
-Z_INTERNAL uint32_t compare258_unaligned_avx2(const unsigned char *src0, const unsigned char *src1) {
- return compare258_unaligned_avx2_static(src0, src1);
-}
+#define LONGEST_MATCH longest_match_avx2
+#define COMPARE256 compare256_avx2_static
+
+#include "match_tpl.h"
-#define LONGEST_MATCH longest_match_unaligned_avx2
-#define COMPARE256 compare256_unaligned_avx2_static
-#define COMPARE258 compare258_unaligned_avx2_static
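+/* Re-include the match template with LONGEST_MATCH_SLOW defined so the same compare256 kernel
+ * also generates longest_match_slow_avx2. */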
+#define LONGEST_MATCH_SLOW
+#define LONGEST_MATCH longest_match_slow_avx2
+#define COMPARE256 compare256_avx2_static
#include "match_tpl.h"
diff --git a/src/zlib-ng/arch/x86/compare256_sse2.c b/src/zlib-ng/arch/x86/compare256_sse2.c
new file mode 100644
index 0000000..aad4bd2
--- /dev/null
+++ b/src/zlib-ng/arch/x86/compare256_sse2.c
@@ -0,0 +1,96 @@
+/* compare256_sse2.c -- SSE2 version of compare256
+ * Copyright Adam Stylinski <kungfujesus06@gmail.com>
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#include "../../zbuild.h"
+
+#include "fallback_builtins.h"
+
+#if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ)
+
+#include <emmintrin.h>
+
+static inline uint32_t compare256_sse2_static(const uint8_t *src0, const uint8_t *src1) {
+ uint32_t len = 0;
+ int align_offset = ((uintptr_t)src0) & 15;
+ const uint8_t *end0 = src0 + 256;
+ const uint8_t *end1 = src1 + 256;
+ __m128i xmm_src0, xmm_src1, xmm_cmp;
+
+    /* Do the first load unaligned; after that, every iteration has at least one aligned load.
+     * Sadly, aligning both loads is probably unrealistic */
+ xmm_src0 = _mm_loadu_si128((__m128i*)src0);
+ xmm_src1 = _mm_loadu_si128((__m128i*)src1);
+ xmm_cmp = _mm_cmpeq_epi8(xmm_src0, xmm_src1);
+
+ unsigned mask = (unsigned)_mm_movemask_epi8(xmm_cmp);
+
+ /* Compiler _may_ turn this branch into a ptest + movemask,
+ * since a lot of those uops are shared and fused */
+ if (mask != 0xFFFF) {
+ uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
+ return len + match_byte;
+ }
+
+ int align_adv = 16 - align_offset;
+ len += align_adv;
+ src0 += align_adv;
+ src1 += align_adv;
+
+ /* Do a flooring division (should just be a shift right) */
+ int num_iter = (256 - len) / 16;
+
+ for (int i = 0; i < num_iter; ++i) {
+ xmm_src0 = _mm_load_si128((__m128i*)src0);
+ xmm_src1 = _mm_loadu_si128((__m128i*)src1);
+ xmm_cmp = _mm_cmpeq_epi8(xmm_src0, xmm_src1);
+
+ mask = (unsigned)_mm_movemask_epi8(xmm_cmp);
+
+ /* Compiler _may_ turn this branch into a ptest + movemask,
+ * since a lot of those uops are shared and fused */
+ if (mask != 0xFFFF) {
+ uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
+ return len + match_byte;
+ }
+
+ len += 16, src0 += 16, src1 += 16;
+ }
+
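+    /* If src0 was misaligned, the aligned loop stops short of the full 256 bytes; re-compare the
+     * final (possibly overlapping) 16 bytes with unaligned loads to cover the remainder. */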
+ if (align_offset) {
+ src0 = end0 - 16;
+ src1 = end1 - 16;
+ len = 256 - 16;
+
+ xmm_src0 = _mm_loadu_si128((__m128i*)src0);
+ xmm_src1 = _mm_loadu_si128((__m128i*)src1);
+ xmm_cmp = _mm_cmpeq_epi8(xmm_src0, xmm_src1);
+
+ mask = (unsigned)_mm_movemask_epi8(xmm_cmp);
+
+ if (mask != 0xFFFF) {
+ uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
+ return len + match_byte;
+ }
+ }
+
+ return 256;
+}
+
+Z_INTERNAL uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1) {
+ return compare256_sse2_static(src0, src1);
+}
+
+#define LONGEST_MATCH longest_match_sse2
+#define COMPARE256 compare256_sse2_static
+
+#include "match_tpl.h"
+
+#define LONGEST_MATCH_SLOW
+#define LONGEST_MATCH longest_match_slow_sse2
+#define COMPARE256 compare256_sse2_static
+
+#include "match_tpl.h"
+
+#endif
diff --git a/src/zlib-ng/arch/x86/compare258_sse.c b/src/zlib-ng/arch/x86/compare258_sse.c
deleted file mode 100644
index 17534c0..0000000
--- a/src/zlib-ng/arch/x86/compare258_sse.c
+++ /dev/null
@@ -1,74 +0,0 @@
-/* compare258_sse.c -- SSE4.2 version of compare258
- *
- * Copyright (C) 2013 Intel Corporation. All rights reserved.
- * Authors:
- * Wajdi Feghali <wajdi.k.feghali@intel.com>
- * Jim Guilford <james.guilford@intel.com>
- * Vinodh Gopal <vinodh.gopal@intel.com>
- * Erdinc Ozturk <erdinc.ozturk@intel.com>
- * Jim Kukunas <james.t.kukunas@linux.intel.com>
- *
- * Portions are Copyright (C) 2016 12Sided Technology, LLC.
- * Author:
- * Phil Vachon <pvachon@12sidedtech.com>
- *
- * For conditions of distribution and use, see copyright notice in zlib.h
- */
-
-#include "../../zbuild.h"
-#include "../../zutil.h"
-
-#ifdef X86_SSE42_CMP_STR
-
-#include <immintrin.h>
-#ifdef _MSC_VER
-# include <nmmintrin.h>
-#endif
-
-/* UNALIGNED_OK, SSE4.2 intrinsic comparison */
-static inline uint32_t compare256_unaligned_sse4_static(const unsigned char *src0, const unsigned char *src1) {
- uint32_t len = 0;
-
- do {
- #define mode _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY
- __m128i xmm_src0, xmm_src1;
- uint32_t ret;
-
- xmm_src0 = _mm_loadu_si128((__m128i *)src0);
- xmm_src1 = _mm_loadu_si128((__m128i *)src1);
- ret = (uint32_t)_mm_cmpestri(xmm_src0, 16, xmm_src1, 16, mode);
- if (_mm_cmpestrc(xmm_src0, 16, xmm_src1, 16, mode)) {
- return len + ret;
- }
- src0 += 16, src1 += 16, len += 16;
-
- xmm_src0 = _mm_loadu_si128((__m128i *)src0);
- xmm_src1 = _mm_loadu_si128((__m128i *)src1);
- ret = (uint32_t)_mm_cmpestri(xmm_src0, 16, xmm_src1, 16, mode);
- if (_mm_cmpestrc(xmm_src0, 16, xmm_src1, 16, mode)) {
- return len + ret;
- }
- src0 += 16, src1 += 16, len += 16;
- } while (len < 256);
-
- return 256;
-}
-
-static inline uint32_t compare258_unaligned_sse4_static(const unsigned char *src0, const unsigned char *src1) {
- if (*(uint16_t *)src0 != *(uint16_t *)src1)
- return (*src0 == *src1);
-
- return compare256_unaligned_sse4_static(src0+2, src1+2) + 2;
-}
-
-Z_INTERNAL uint32_t compare258_unaligned_sse4(const unsigned char *src0, const unsigned char *src1) {
- return compare258_unaligned_sse4_static(src0, src1);
-}
-
-#define LONGEST_MATCH longest_match_unaligned_sse4
-#define COMPARE256 compare256_unaligned_sse4_static
-#define COMPARE258 compare258_unaligned_sse4_static
-
-#include "match_tpl.h"
-
-#endif
diff --git a/src/zlib-ng/arch/x86/crc_folding.c b/src/zlib-ng/arch/x86/crc32_fold_pclmulqdq.c
index 49cdc99..6bb2c98 100644
--- a/src/zlib-ng/arch/x86/crc_folding.c
+++ b/src/zlib-ng/arch/x86/crc32_fold_pclmulqdq.c
@@ -3,9 +3,10 @@
* instruction.
*
* A white paper describing this algorithm can be found at:
- * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
+ * https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
*
* Copyright (C) 2013 Intel Corporation. All rights reserved.
+ * Copyright (C) 2016 Marian Beermann (support for initial value)
* Authors:
* Wajdi Feghali <wajdi.k.feghali@intel.com>
* Jim Guilford <james.guilford@intel.com>
@@ -17,23 +18,26 @@
*/
#ifdef X86_PCLMULQDQ_CRC
-
#include "../../zbuild.h"
-#include <inttypes.h>
+
#include <immintrin.h>
#include <wmmintrin.h>
+#include <smmintrin.h> // _mm_extract_epi32
-#include "crc_folding.h"
+#include "x86_features.h"
+#include "cpu_features.h"
-Z_INTERNAL void crc_fold_init(deflate_state *const s) {
- /* CRC_SAVE */
- _mm_storeu_si128((__m128i *)s->crc0 + 0, _mm_cvtsi32_si128(0x9db42487));
- _mm_storeu_si128((__m128i *)s->crc0 + 1, _mm_setzero_si128());
- _mm_storeu_si128((__m128i *)s->crc0 + 2, _mm_setzero_si128());
- _mm_storeu_si128((__m128i *)s->crc0 + 3, _mm_setzero_si128());
+#include "../../crc32_fold.h"
+#include "../../crc32_braid_p.h"
+#include <assert.h>
- s->strm->adler = 0;
-}
+#ifdef X86_VPCLMULQDQ_CRC
+extern size_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1,
+ __m128i *xmm_crc2, __m128i *xmm_crc3, uint8_t *dst, const uint8_t *src, size_t len);
+extern size_t fold_16_vpclmulqdq_nocp(__m128i *xmm_crc0, __m128i *xmm_crc1,
+ __m128i *xmm_crc2, __m128i *xmm_crc3, const uint8_t *src, size_t len, __m128i init_crc,
+ int32_t first);
+#endif
static void fold_1(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
const __m128i xmm_fold4 = _mm_set_epi32( 0x00000001, 0x54442bd4,
@@ -227,24 +231,45 @@ static void partial_fold(const size_t len, __m128i *xmm_crc0, __m128i *xmm_crc1,
*xmm_crc3 = _mm_castps_si128(ps_res);
}
-Z_INTERNAL void crc_fold_copy(deflate_state *const s, unsigned char *dst, const unsigned char *src, long len) {
+static inline void crc32_fold_load(__m128i *fold, __m128i *fold0, __m128i *fold1, __m128i *fold2, __m128i *fold3) {
+ *fold0 = _mm_load_si128(fold + 0);
+ *fold1 = _mm_load_si128(fold + 1);
+ *fold2 = _mm_load_si128(fold + 2);
+ *fold3 = _mm_load_si128(fold + 3);
+}
+
+static inline void crc32_fold_save(__m128i *fold, __m128i fold0, __m128i fold1, __m128i fold2, __m128i fold3) {
+ _mm_storeu_si128(fold + 0, fold0);
+ _mm_storeu_si128(fold + 1, fold1);
+ _mm_storeu_si128(fold + 2, fold2);
+ _mm_storeu_si128(fold + 3, fold3);
+}
+
+static inline void crc32_fold_save_partial(__m128i *fold, __m128i foldp) {
+ _mm_store_si128(fold + 4, foldp);
+}
+
+Z_INTERNAL uint32_t crc32_fold_reset_pclmulqdq(crc32_fold *crc) {
+ __m128i xmm_crc0 = _mm_cvtsi32_si128(0x9db42487);
+ __m128i xmm_zero = _mm_setzero_si128();
+ crc32_fold_save((__m128i *)crc->fold, xmm_crc0, xmm_zero, xmm_zero, xmm_zero);
+ return 0;
+}
+
+Z_INTERNAL void crc32_fold_copy_pclmulqdq(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) {
unsigned long algn_diff;
__m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
+ __m128i xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3, xmm_crc_part;
char ALIGNED_(16) partial_buf[16] = { 0 };
- /* CRC_LOAD */
- __m128i xmm_crc0 = _mm_loadu_si128((__m128i *)s->crc0 + 0);
- __m128i xmm_crc1 = _mm_loadu_si128((__m128i *)s->crc0 + 1);
- __m128i xmm_crc2 = _mm_loadu_si128((__m128i *)s->crc0 + 2);
- __m128i xmm_crc3 = _mm_loadu_si128((__m128i *)s->crc0 + 3);
- __m128i xmm_crc_part;
+ crc32_fold_load((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
if (len < 16) {
if (len == 0)
return;
memcpy(partial_buf, src, len);
- xmm_crc_part = _mm_loadu_si128((const __m128i *)partial_buf);
+ xmm_crc_part = _mm_load_si128((const __m128i *)partial_buf);
memcpy(dst, partial_buf, len);
goto partial;
}
@@ -263,20 +288,22 @@ Z_INTERNAL void crc_fold_copy(deflate_state *const s, unsigned char *dst, const
xmm_crc_part = _mm_setzero_si128();
}
- while ((len -= 64) >= 0) {
- /* CRC_LOAD */
- xmm_t0 = _mm_load_si128((__m128i *)src);
- xmm_t1 = _mm_load_si128((__m128i *)src + 1);
- xmm_t2 = _mm_load_si128((__m128i *)src + 2);
- xmm_t3 = _mm_load_si128((__m128i *)src + 3);
+#ifdef X86_VPCLMULQDQ_CRC
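+    /* If the CPU supports AVX512 and VPCLMULQDQ, let the 512 bit kernel fold and copy whole
+     * 256 byte blocks first; it returns the number of bytes it consumed. */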
+ if (x86_cpu_has_vpclmulqdq && x86_cpu_has_avx512 && (len >= 256)) {
+ size_t n = fold_16_vpclmulqdq(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, dst, src, len);
+
+ len -= n;
+ src += n;
+ dst += n;
+ }
+#endif
+
+ while (len >= 64) {
+ crc32_fold_load((__m128i *)src, &xmm_t0, &xmm_t1, &xmm_t2, &xmm_t3);
fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
- /* CRC_SAVE */
- _mm_storeu_si128((__m128i *)dst, xmm_t0);
- _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
- _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
- _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
+ crc32_fold_save((__m128i *)dst, xmm_t0, xmm_t1, xmm_t2, xmm_t3);
xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0);
xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1);
@@ -285,14 +312,13 @@ Z_INTERNAL void crc_fold_copy(deflate_state *const s, unsigned char *dst, const
src += 64;
dst += 64;
+ len -= 64;
}
/*
* len = num bytes left - 64
*/
- if (len + 16 >= 0) {
- len += 16;
-
+ if (len >= 48) {
xmm_t0 = _mm_load_si128((__m128i *)src);
xmm_t1 = _mm_load_si128((__m128i *)src + 1);
xmm_t2 = _mm_load_si128((__m128i *)src + 2);
@@ -306,15 +332,13 @@ Z_INTERNAL void crc_fold_copy(deflate_state *const s, unsigned char *dst, const
xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0);
xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2);
-
+ len -= 48;
if (len == 0)
goto done;
dst += 48;
memcpy(&xmm_crc_part, (__m128i *)src + 3, len);
- } else if (len + 32 >= 0) {
- len += 32;
-
+ } else if (len >= 32) {
xmm_t0 = _mm_load_si128((__m128i *)src);
xmm_t1 = _mm_load_si128((__m128i *)src + 1);
@@ -326,14 +350,13 @@ Z_INTERNAL void crc_fold_copy(deflate_state *const s, unsigned char *dst, const
xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1);
+ len -= 32;
if (len == 0)
goto done;
dst += 32;
memcpy(&xmm_crc_part, (__m128i *)src + 2, len);
- } else if (len + 48 >= 0) {
- len += 48;
-
+ } else if (len >= 16) {
xmm_t0 = _mm_load_si128((__m128i *)src);
fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
@@ -342,13 +365,13 @@ Z_INTERNAL void crc_fold_copy(deflate_state *const s, unsigned char *dst, const
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
+ len -= 16;
if (len == 0)
goto done;
dst += 16;
memcpy(&xmm_crc_part, (__m128i *)src + 1, len);
} else {
- len += 64;
if (len == 0)
goto done;
memcpy(&xmm_crc_part, src, len);
@@ -360,12 +383,132 @@ Z_INTERNAL void crc_fold_copy(deflate_state *const s, unsigned char *dst, const
partial:
partial_fold((size_t)len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
done:
- /* CRC_SAVE */
- _mm_storeu_si128((__m128i *)s->crc0 + 0, xmm_crc0);
- _mm_storeu_si128((__m128i *)s->crc0 + 1, xmm_crc1);
- _mm_storeu_si128((__m128i *)s->crc0 + 2, xmm_crc2);
- _mm_storeu_si128((__m128i *)s->crc0 + 3, xmm_crc3);
- _mm_storeu_si128((__m128i *)s->crc0 + 4, xmm_crc_part);
+ crc32_fold_save((__m128i *)crc->fold, xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3);
+ crc32_fold_save_partial((__m128i *)crc->fold, xmm_crc_part);
+}
+
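+/* XOR_INITIAL mixes the caller-supplied starting CRC into the first block of data that is
+ * processed; ONCE ensures this happens exactly once. */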
+#define ONCE(op) if (first) { \
+ first = 0; \
+ (op); \
+}
+#define XOR_INITIAL(where) ONCE(where = _mm_xor_si128(where, xmm_initial))
+
+Z_INTERNAL void crc32_fold_pclmulqdq(crc32_fold *crc, const uint8_t *src, size_t len, uint32_t init_crc) {
+ unsigned long algn_diff;
+ __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
+ __m128i xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3, xmm_crc_part;
+ __m128i xmm_initial = _mm_cvtsi32_si128(init_crc);
+ xmm_crc_part = _mm_setzero_si128();
+ int32_t first = init_crc != 0;
+
+    /* Technically the CRC functions never call this for inputs shorter than 64 bytes, but the
+     * aligning load that occurs below needs a bare minimum of 31 bytes. If there is an initial CRC
+     * to carry forward through the fold, 16 - (src % 16) + 16 bytes must be available, which by
+     * definition can be up to 15 bytes plus one full vector load. */
+ assert(len >= 31 || first == 0);
+ crc32_fold_load((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
+
+ if (len < 16) {
+ goto partial_nocpy;
+ }
+
+ algn_diff = ((uintptr_t)16 - ((uintptr_t)src & 0xF)) & 0xF;
+ if (algn_diff) {
+ if (algn_diff >= 4 || init_crc == 0) {
+ xmm_crc_part = _mm_loadu_si128((__m128i *)src);
+
+ src += algn_diff;
+ len -= algn_diff;
+
+ XOR_INITIAL(xmm_crc_part);
+ partial_fold(algn_diff, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
+ } else {
+ xmm_t0 = _mm_loadu_si128((__m128i*)src);
+ xmm_crc_part = _mm_loadu_si128((__m128i*)src + 1);
+ XOR_INITIAL(xmm_t0);
+ fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
+ xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
+ partial_fold(algn_diff, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
+
+ src += (algn_diff + 16);
+ len -= (algn_diff + 16);
+ }
+
+ xmm_crc_part = _mm_setzero_si128();
+ }
+
+#ifdef X86_VPCLMULQDQ_CRC
+ if (x86_cpu_has_vpclmulqdq && x86_cpu_has_avx512 && (len >= 256)) {
+ size_t n = fold_16_vpclmulqdq_nocp(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, src, len,
+ xmm_initial, first);
+ first = 0;
+
+ len -= n;
+ src += n;
+ }
+#endif
+
+ while (len >= 64) {
+ crc32_fold_load((__m128i *)src, &xmm_t0, &xmm_t1, &xmm_t2, &xmm_t3);
+ XOR_INITIAL(xmm_t0);
+ fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
+
+ xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0);
+ xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1);
+ xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t2);
+ xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t3);
+
+ src += 64;
+ len -= 64;
+ }
+
+    /*
+     * len = num bytes left (fewer than 64 at this point)
+     */
+ if (len >= 48) {
+ xmm_t0 = _mm_load_si128((__m128i *)src);
+ xmm_t1 = _mm_load_si128((__m128i *)src + 1);
+ xmm_t2 = _mm_load_si128((__m128i *)src + 2);
+ XOR_INITIAL(xmm_t0);
+
+ fold_3(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
+
+ xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0);
+ xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1);
+ xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2);
+ len -= 48;
+ src += 48;
+ } else if (len >= 32) {
+ xmm_t0 = _mm_load_si128((__m128i *)src);
+ xmm_t1 = _mm_load_si128((__m128i *)src + 1);
+ XOR_INITIAL(xmm_t0);
+
+ fold_2(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
+
+ xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0);
+ xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1);
+
+ len -= 32;
+ src += 32;
+ } else if (len >= 16) {
+ xmm_t0 = _mm_load_si128((__m128i *)src);
+ XOR_INITIAL(xmm_t0);
+
+ fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
+
+ xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
+
+ len -= 16;
+ src += 16;
+ }
+
+partial_nocpy:
+ if (len) {
+ memcpy(&xmm_crc_part, src, len);
+ partial_fold((size_t)len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
+ }
+
+ crc32_fold_save((__m128i *)crc->fold, xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3);
}
static const unsigned ALIGNED_(16) crc_k[] = {
@@ -385,18 +528,13 @@ static const unsigned ALIGNED_(16) crc_mask2[4] = {
0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
};
-uint32_t Z_INTERNAL crc_fold_512to32(deflate_state *const s) {
+Z_INTERNAL uint32_t crc32_fold_final_pclmulqdq(crc32_fold *crc) {
const __m128i xmm_mask = _mm_load_si128((__m128i *)crc_mask);
const __m128i xmm_mask2 = _mm_load_si128((__m128i *)crc_mask2);
-
- uint32_t crc;
+ __m128i xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3;
__m128i x_tmp0, x_tmp1, x_tmp2, crc_fold;
- /* CRC_LOAD */
- __m128i xmm_crc0 = _mm_loadu_si128((__m128i *)s->crc0 + 0);
- __m128i xmm_crc1 = _mm_loadu_si128((__m128i *)s->crc0 + 1);
- __m128i xmm_crc2 = _mm_loadu_si128((__m128i *)s->crc0 + 2);
- __m128i xmm_crc3 = _mm_loadu_si128((__m128i *)s->crc0 + 3);
+ crc32_fold_load((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
/*
* k1
@@ -450,8 +588,21 @@ uint32_t Z_INTERNAL crc_fold_512to32(deflate_state *const s) {
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc1);
- crc = (uint32_t)_mm_extract_epi32(xmm_crc3, 2);
- return ~crc;
+ crc->value = ~((uint32_t)_mm_extract_epi32(xmm_crc3, 2));
+
+ return crc->value;
+}
+
+uint32_t crc32_pclmulqdq(uint32_t crc32, const unsigned char* buf, uint64_t len) {
+    /* For lengths < 64, the crc32_braid method is faster. The CRC32 instruction might also
+     * prove effective for these short lengths */
+ if (len < 64)
+ return crc32_braid(crc32, buf, len);
+
+ crc32_fold ALIGNED_(16) crc_state;
+ crc32_fold_reset_pclmulqdq(&crc_state);
+ crc32_fold_pclmulqdq(&crc_state, buf, len, crc32);
+ return crc32_fold_final_pclmulqdq(&crc_state);
}
#endif
diff --git a/src/zlib-ng/arch/x86/crc32_fold_vpclmulqdq.c b/src/zlib-ng/arch/x86/crc32_fold_vpclmulqdq.c
new file mode 100644
index 0000000..dfcdc8a
--- /dev/null
+++ b/src/zlib-ng/arch/x86/crc32_fold_vpclmulqdq.c
@@ -0,0 +1,206 @@
+/* crc32_fold_vpclmulqdq.c -- VPCLMULQDQ-based CRC32 folding implementation.
+ * Copyright Wangyang Guo (wangyang.guo@intel.com)
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef X86_VPCLMULQDQ_CRC
+#include "../../zbuild.h"
+#include "../../fallback_builtins.h"
+
+#include <immintrin.h>
+
+#define ONCE(op) if (first) { \
+ first = 0; \
+ (op); \
+}
+#define XOR_INITIAL(where) ONCE(where = _mm512_xor_si512(where, zmm_initial))
+
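+/* Fold (and copy) data in 256 byte strides using 512 bit carry-less multiplies: the four 128 bit
+ * CRC accumulators are packed into zmm registers, folded against the input, then unpacked back
+ * into the xmm state on exit. Returns the number of bytes consumed. */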
+size_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1,
+ __m128i *xmm_crc2, __m128i *xmm_crc3, uint8_t *dst, const uint8_t *src, size_t len) {
+ size_t len_tmp = len;
+ __m512i zmm_t0, zmm_t1, zmm_t2, zmm_t3;
+ __m512i zmm_crc0, zmm_crc1, zmm_crc2, zmm_crc3;
+ __m512i z0, z1, z2, z3;
+ const __m512i zmm_fold4 = _mm512_set4_epi32(
+ 0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596);
+ const __m512i zmm_fold16 = _mm512_set4_epi32(
+ 0x00000001, 0x1542778a, 0x00000001, 0x322d1430);
+
+ // zmm register init
+ zmm_crc0 = _mm512_setzero_si512();
+ zmm_t0 = _mm512_loadu_si512((__m512i *)src);
+ zmm_crc1 = _mm512_loadu_si512((__m512i *)src + 1);
+ zmm_crc2 = _mm512_loadu_si512((__m512i *)src + 2);
+ zmm_crc3 = _mm512_loadu_si512((__m512i *)src + 3);
+
+ /* already have intermediate CRC in xmm registers
+ * fold4 with 4 xmm_crc to get zmm_crc0
+ */
+ zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc0, 0);
+ zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc1, 1);
+ zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc2, 2);
+ zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc3, 3);
+ z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
+ zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
+ zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
+ zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_t0);
+
+ _mm512_storeu_si512((__m512i *)dst, zmm_t0);
+ _mm512_storeu_si512((__m512i *)dst + 1, zmm_crc1);
+ _mm512_storeu_si512((__m512i *)dst + 2, zmm_crc2);
+ _mm512_storeu_si512((__m512i *)dst + 3, zmm_crc3);
+ len -= 256;
+ src += 256;
+ dst += 256;
+
+ // fold-16 loops
+ while (len >= 256) {
+ zmm_t0 = _mm512_loadu_si512((__m512i *)src);
+ zmm_t1 = _mm512_loadu_si512((__m512i *)src + 1);
+ zmm_t2 = _mm512_loadu_si512((__m512i *)src + 2);
+ zmm_t3 = _mm512_loadu_si512((__m512i *)src + 3);
+
+ z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x01);
+ z1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x01);
+ z2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x01);
+ z3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x01);
+
+ zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x10);
+ zmm_crc1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x10);
+ zmm_crc2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x10);
+ zmm_crc3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x10);
+
+ zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
+ zmm_crc1 = _mm512_xor_si512(z1, zmm_crc1);
+ zmm_crc2 = _mm512_xor_si512(z2, zmm_crc2);
+ zmm_crc3 = _mm512_xor_si512(z3, zmm_crc3);
+
+ zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_t0);
+ zmm_crc1 = _mm512_xor_si512(zmm_crc1, zmm_t1);
+ zmm_crc2 = _mm512_xor_si512(zmm_crc2, zmm_t2);
+ zmm_crc3 = _mm512_xor_si512(zmm_crc3, zmm_t3);
+
+ _mm512_storeu_si512((__m512i *)dst, zmm_t0);
+ _mm512_storeu_si512((__m512i *)dst + 1, zmm_t1);
+ _mm512_storeu_si512((__m512i *)dst + 2, zmm_t2);
+ _mm512_storeu_si512((__m512i *)dst + 3, zmm_t3);
+ len -= 256;
+ src += 256;
+ dst += 256;
+ }
+ // zmm_crc[0,1,2,3] -> zmm_crc0
+ z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
+ zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
+ zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
+ zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc1);
+
+ z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
+ zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
+ zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
+ zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc2);
+
+ z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
+ zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
+ zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
+ zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc3);
+
+ // zmm_crc0 -> xmm_crc[0, 1, 2, 3]
+ *xmm_crc0 = _mm512_extracti32x4_epi32(zmm_crc0, 0);
+ *xmm_crc1 = _mm512_extracti32x4_epi32(zmm_crc0, 1);
+ *xmm_crc2 = _mm512_extracti32x4_epi32(zmm_crc0, 2);
+ *xmm_crc3 = _mm512_extracti32x4_epi32(zmm_crc0, 3);
+
+ return (len_tmp - len); // return n bytes processed
+}
+
+size_t fold_16_vpclmulqdq_nocp(__m128i *xmm_crc0, __m128i *xmm_crc1,
+ __m128i *xmm_crc2, __m128i *xmm_crc3, const uint8_t *src, size_t len,
+ __m128i init_crc, int32_t first) {
+ size_t len_tmp = len;
+ __m512i zmm_t0, zmm_t1, zmm_t2, zmm_t3;
+ __m512i zmm_crc0, zmm_crc1, zmm_crc2, zmm_crc3;
+ __m512i z0, z1, z2, z3;
+ __m512i zmm_initial = _mm512_zextsi128_si512(init_crc);
+ const __m512i zmm_fold4 = _mm512_set4_epi32(
+ 0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596);
+ const __m512i zmm_fold16 = _mm512_set4_epi32(
+ 0x00000001, 0x1542778a, 0x00000001, 0x322d1430);
+
+ // zmm register init
+ zmm_crc0 = _mm512_setzero_si512();
+ zmm_t0 = _mm512_loadu_si512((__m512i *)src);
+ XOR_INITIAL(zmm_t0);
+ zmm_crc1 = _mm512_loadu_si512((__m512i *)src + 1);
+ zmm_crc2 = _mm512_loadu_si512((__m512i *)src + 2);
+ zmm_crc3 = _mm512_loadu_si512((__m512i *)src + 3);
+
+ /* already have intermediate CRC in xmm registers
+ * fold4 with 4 xmm_crc to get zmm_crc0
+ */
+ zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc0, 0);
+ zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc1, 1);
+ zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc2, 2);
+ zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc3, 3);
+ z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
+ zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
+ zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
+ zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_t0);
+
+ len -= 256;
+ src += 256;
+
+ // fold-16 loops
+ while (len >= 256) {
+ zmm_t0 = _mm512_loadu_si512((__m512i *)src);
+ zmm_t1 = _mm512_loadu_si512((__m512i *)src + 1);
+ zmm_t2 = _mm512_loadu_si512((__m512i *)src + 2);
+ zmm_t3 = _mm512_loadu_si512((__m512i *)src + 3);
+
+ z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x01);
+ z1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x01);
+ z2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x01);
+ z3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x01);
+
+ zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x10);
+ zmm_crc1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x10);
+ zmm_crc2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x10);
+ zmm_crc3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x10);
+
+ zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
+ zmm_crc1 = _mm512_xor_si512(z1, zmm_crc1);
+ zmm_crc2 = _mm512_xor_si512(z2, zmm_crc2);
+ zmm_crc3 = _mm512_xor_si512(z3, zmm_crc3);
+
+ zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_t0);
+ zmm_crc1 = _mm512_xor_si512(zmm_crc1, zmm_t1);
+ zmm_crc2 = _mm512_xor_si512(zmm_crc2, zmm_t2);
+ zmm_crc3 = _mm512_xor_si512(zmm_crc3, zmm_t3);
+
+ len -= 256;
+ src += 256;
+ }
+ // zmm_crc[0,1,2,3] -> zmm_crc0
+ z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
+ zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
+ zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
+ zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc1);
+
+ z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
+ zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
+ zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
+ zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc2);
+
+ z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
+ zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
+ zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
+ zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc3);
+
+ // zmm_crc0 -> xmm_crc[0, 1, 2, 3]
+ *xmm_crc0 = _mm512_extracti32x4_epi32(zmm_crc0, 0);
+ *xmm_crc1 = _mm512_extracti32x4_epi32(zmm_crc0, 1);
+ *xmm_crc2 = _mm512_extracti32x4_epi32(zmm_crc0, 2);
+ *xmm_crc3 = _mm512_extracti32x4_epi32(zmm_crc0, 3);
+
+ return (len_tmp - len); // return n bytes processed
+}
+#endif
diff --git a/src/zlib-ng/arch/x86/crc_folding.h b/src/zlib-ng/arch/x86/crc_folding.h
deleted file mode 100644
index 0d3c24b..0000000
--- a/src/zlib-ng/arch/x86/crc_folding.h
+++ /dev/null
@@ -1,19 +0,0 @@
-/* crc_folding.h
- *
- * Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
- * instruction.
- *
- * Copyright (C) 2013 Intel Corporation Jim Kukunas
- * For conditions of distribution and use, see copyright notice in zlib.h
- */
-
-#ifndef CRC_FOLDING_H_
-#define CRC_FOLDING_H_
-
-#include "../../deflate.h"
-
-Z_INTERNAL void crc_fold_init(deflate_state *const);
-Z_INTERNAL uint32_t crc_fold_512to32(deflate_state *const);
-Z_INTERNAL void crc_fold_copy(deflate_state *const, unsigned char *, const unsigned char *, long);
-
-#endif
diff --git a/src/zlib-ng/arch/x86/insert_string_sse.c b/src/zlib-ng/arch/x86/insert_string_sse42.c
index d0c316b..6fe4c81 100644
--- a/src/zlib-ng/arch/x86/insert_string_sse.c
+++ b/src/zlib-ng/arch/x86/insert_string_sse42.c
@@ -1,4 +1,4 @@
-/* insert_string_sse -- insert_string variant using SSE4.2's CRC instructions
+/* insert_string_sse42.c -- insert_string integer hash variant using SSE4.2's CRC instructions
*
* Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
@@ -14,22 +14,22 @@
#ifdef X86_SSE42_CRC_INTRIN
# ifdef _MSC_VER
-# define UPDATE_HASH(s, h, val)\
+# define HASH_CALC(s, h, val)\
h = _mm_crc32_u32(h, val)
# else
-# define UPDATE_HASH(s, h, val)\
+# define HASH_CALC(s, h, val)\
h = __builtin_ia32_crc32si(h, val)
# endif
#else
# ifdef _MSC_VER
-# define UPDATE_HASH(s, h, val) {\
+# define HASH_CALC(s, h, val) {\
__asm mov edx, h\
__asm mov eax, val\
__asm crc32 eax, edx\
- __asm mov val, eax\
+ __asm mov h, eax\
}
# else
-# define UPDATE_HASH(s, h, val) \
+# define HASH_CALC(s, h, val) \
__asm__ __volatile__ (\
"crc32 %1,%0\n\t"\
: "+r" (h)\
@@ -38,6 +38,10 @@
# endif
#endif
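+/* Names consumed by the shared insert_string template: HASH_CALC updates HASH_CALC_VAR, the
+ * running hash value that HASH_CALC_VAR_INIT declares and zero-initializes. */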
+#define HASH_CALC_VAR h
+#define HASH_CALC_VAR_INIT uint32_t h = 0
+
+#define UPDATE_HASH update_hash_sse4
#define INSERT_STRING insert_string_sse4
#define QUICK_INSERT_STRING quick_insert_string_sse4
diff --git a/src/zlib-ng/arch/x86/slide_avx.c b/src/zlib-ng/arch/x86/slide_hash_avx2.c
index be9a9b7..94fe10c 100644
--- a/src/zlib-ng/arch/x86/slide_avx.c
+++ b/src/zlib-ng/arch/x86/slide_hash_avx2.c
@@ -14,34 +14,26 @@
#include <immintrin.h>
-Z_INTERNAL void slide_hash_avx2(deflate_state *s) {
- Pos *p;
- unsigned n;
- uint16_t wsize = (uint16_t)s->w_size;
- const __m256i ymm_wsize = _mm256_set1_epi16((short)wsize);
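+/* Slide one hash table: subtract wsize from every 16 bit entry with unsigned saturation, so
+ * entries older than the new window clamp to zero. */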
+static inline void slide_hash_chain(Pos *table, uint32_t entries, const __m256i wsize) {
+ table += entries;
+ table -= 16;
- n = HASH_SIZE;
- p = &s->head[n] - 16;
do {
__m256i value, result;
- value = _mm256_loadu_si256((__m256i *)p);
- result= _mm256_subs_epu16(value, ymm_wsize);
- _mm256_storeu_si256((__m256i *)p, result);
- p -= 16;
- n -= 16;
- } while (n > 0);
+ value = _mm256_loadu_si256((__m256i *)table);
+ result = _mm256_subs_epu16(value, wsize);
+ _mm256_storeu_si256((__m256i *)table, result);
- n = wsize;
- p = &s->prev[n] - 16;
- do {
- __m256i value, result;
+ table -= 16;
+ entries -= 16;
+ } while (entries > 0);
+}
- value = _mm256_loadu_si256((__m256i *)p);
- result= _mm256_subs_epu16(value, ymm_wsize);
- _mm256_storeu_si256((__m256i *)p, result);
+Z_INTERNAL void slide_hash_avx2(deflate_state *s) {
+ uint16_t wsize = (uint16_t)s->w_size;
+ const __m256i ymm_wsize = _mm256_set1_epi16((short)wsize);
- p -= 16;
- n -= 16;
- } while (n > 0);
+ slide_hash_chain(s->head, HASH_SIZE, ymm_wsize);
+ slide_hash_chain(s->prev, wsize, ymm_wsize);
}
diff --git a/src/zlib-ng/arch/x86/slide_hash_sse2.c b/src/zlib-ng/arch/x86/slide_hash_sse2.c
new file mode 100644
index 0000000..5daac4a
--- /dev/null
+++ b/src/zlib-ng/arch/x86/slide_hash_sse2.c
@@ -0,0 +1,62 @@
+/*
+ * SSE2 optimized hash slide
+ *
+ * Copyright (C) 2017 Intel Corporation
+ * Authors:
+ * Arjan van de Ven <arjan@linux.intel.com>
+ * Jim Kukunas <james.t.kukunas@linux.intel.com>
+ *
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+#include "../../zbuild.h"
+#include "../../deflate.h"
+
+#include <immintrin.h>
+#include <assert.h>
+
+static inline void slide_hash_chain(Pos *table0, Pos *table1, uint32_t entries0,
+ uint32_t entries1, const __m128i wsize) {
+ uint32_t entries;
+ Pos *table;
+ __m128i value0, value1, result0, result1;
+
+ int on_chain = 0;
+
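+    /* Two passes through the same loop body: the first pass processes table0/entries0 (the head
+     * table), the second re-enters at next_chain with table1/entries1 (the prev chain). */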
+next_chain:
+ table = (on_chain) ? table1 : table0;
+ entries = (on_chain) ? entries1 : entries0;
+
+ table += entries;
+ table -= 16;
+
+    /* ZALLOC allocates these tables unless the user chose a custom allocator; our allocation
+     * function aligns to 64 byte boundaries, so the aligned loads below are safe */
+ do {
+ value0 = _mm_load_si128((__m128i *)table);
+ value1 = _mm_load_si128((__m128i *)(table + 8));
+ result0 = _mm_subs_epu16(value0, wsize);
+ result1 = _mm_subs_epu16(value1, wsize);
+ _mm_store_si128((__m128i *)table, result0);
+ _mm_store_si128((__m128i *)(table + 8), result1);
+
+ table -= 16;
+ entries -= 16;
+ } while (entries > 0);
+
+ ++on_chain;
+ if (on_chain > 1) {
+ return;
+ } else {
+ goto next_chain;
+ }
+}
+
+Z_INTERNAL void slide_hash_sse2(deflate_state *s) {
+ uint16_t wsize = (uint16_t)s->w_size;
+ const __m128i xmm_wsize = _mm_set1_epi16((short)wsize);
+
+ assert(((uintptr_t)s->head & 15) == 0);
+ assert(((uintptr_t)s->prev & 15) == 0);
+
+ slide_hash_chain(s->head, s->prev, HASH_SIZE, wsize, xmm_wsize);
+}
diff --git a/src/zlib-ng/arch/x86/slide_sse.c b/src/zlib-ng/arch/x86/slide_sse.c
deleted file mode 100644
index abf4474..0000000
--- a/src/zlib-ng/arch/x86/slide_sse.c
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * SSE optimized hash slide
- *
- * Copyright (C) 2017 Intel Corporation
- * Authors:
- * Arjan van de Ven <arjan@linux.intel.com>
- * Jim Kukunas <james.t.kukunas@linux.intel.com>
- *
- * For conditions of distribution and use, see copyright notice in zlib.h
- */
-#include "../../zbuild.h"
-#include "../../deflate.h"
-
-#include <immintrin.h>
-
-Z_INTERNAL void slide_hash_sse2(deflate_state *s) {
- Pos *p;
- unsigned n;
- uint16_t wsize = (uint16_t)s->w_size;
- const __m128i xmm_wsize = _mm_set1_epi16((short)wsize);
-
- n = HASH_SIZE;
- p = &s->head[n] - 8;
- do {
- __m128i value, result;
-
- value = _mm_loadu_si128((__m128i *)p);
- result= _mm_subs_epu16(value, xmm_wsize);
- _mm_storeu_si128((__m128i *)p, result);
- p -= 8;
- n -= 8;
- } while (n > 0);
-
- n = wsize;
- p = &s->prev[n] - 8;
- do {
- __m128i value, result;
-
- value = _mm_loadu_si128((__m128i *)p);
- result= _mm_subs_epu16(value, xmm_wsize);
- _mm_storeu_si128((__m128i *)p, result);
-
- p -= 8;
- n -= 8;
- } while (n > 0);
-}
diff --git a/src/zlib-ng/arch/x86/x86.c b/src/zlib-ng/arch/x86/x86_features.c
index e782cb8..72ef885 100644
--- a/src/zlib-ng/arch/x86/x86.c
+++ b/src/zlib-ng/arch/x86/x86_features.c
@@ -1,5 +1,4 @@
-/*
- * x86 feature check
+/* x86_features.c -- x86 feature check
*
* Copyright (C) 2013 Intel Corporation. All rights reserved.
* Author:
@@ -8,7 +7,7 @@
* For conditions of distribution and use, see copyright notice in zlib.h
*/
-#include "../../zutil.h"
+#include "../../zbuild.h"
#ifdef _MSC_VER
# include <intrin.h>
@@ -17,11 +16,17 @@
# include <cpuid.h>
#endif
+#include <string.h>
+
Z_INTERNAL int x86_cpu_has_avx2;
+Z_INTERNAL int x86_cpu_has_avx512;
+Z_INTERNAL int x86_cpu_has_avx512vnni;
Z_INTERNAL int x86_cpu_has_sse2;
Z_INTERNAL int x86_cpu_has_ssse3;
+Z_INTERNAL int x86_cpu_has_sse41;
Z_INTERNAL int x86_cpu_has_sse42;
Z_INTERNAL int x86_cpu_has_pclmulqdq;
+Z_INTERNAL int x86_cpu_has_vpclmulqdq;
Z_INTERNAL int x86_cpu_has_tzcnt;
static void cpuid(int info, unsigned* eax, unsigned* ebx, unsigned* ecx, unsigned* edx) {
@@ -57,11 +62,11 @@ void Z_INTERNAL x86_check_features(void) {
unsigned maxbasic;
cpuid(0, &maxbasic, &ebx, &ecx, &edx);
-
cpuid(1 /*CPU_PROCINFO_AND_FEATUREBITS*/, &eax, &ebx, &ecx, &edx);
x86_cpu_has_sse2 = edx & 0x4000000;
x86_cpu_has_ssse3 = ecx & 0x200;
+ x86_cpu_has_sse41 = ecx & 0x80000;
x86_cpu_has_sse42 = ecx & 0x100000;
x86_cpu_has_pclmulqdq = ecx & 0x2;
@@ -73,8 +78,12 @@ void Z_INTERNAL x86_check_features(void) {
x86_cpu_has_tzcnt = ebx & 0x8;
// check AVX2 bit
x86_cpu_has_avx2 = ebx & 0x20;
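+        // CPUID leaf 7: EBX bit 16 = AVX512F, ECX bit 11 = AVX512_VNNI, ECX bit 10 = VPCLMULQDQ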
+ x86_cpu_has_avx512 = ebx & 0x00010000;
+ x86_cpu_has_avx512vnni = ecx & 0x800;
+ x86_cpu_has_vpclmulqdq = ecx & 0x400;
} else {
x86_cpu_has_tzcnt = 0;
x86_cpu_has_avx2 = 0;
+ x86_cpu_has_vpclmulqdq = 0;
}
}
diff --git a/src/zlib-ng/arch/x86/x86.h b/src/zlib-ng/arch/x86/x86_features.h
index 8471e15..97630ab 100644
--- a/src/zlib-ng/arch/x86/x86.h
+++ b/src/zlib-ng/arch/x86/x86_features.h
@@ -1,16 +1,20 @@
-/* cpu.h -- check for CPU features
+/* x86_features.h -- check for CPU features
* Copyright (C) 2013 Intel Corporation Jim Kukunas
* For conditions of distribution and use, see copyright notice in zlib.h
*/
-#ifndef CPU_H_
-#define CPU_H_
+#ifndef X86_FEATURES_H_
+#define X86_FEATURES_H_
extern int x86_cpu_has_avx2;
+extern int x86_cpu_has_avx512;
+extern int x86_cpu_has_avx512vnni;
extern int x86_cpu_has_sse2;
extern int x86_cpu_has_ssse3;
+extern int x86_cpu_has_sse41;
extern int x86_cpu_has_sse42;
extern int x86_cpu_has_pclmulqdq;
+extern int x86_cpu_has_vpclmulqdq;
extern int x86_cpu_has_tzcnt;
void Z_INTERNAL x86_check_features(void);