Diffstat (limited to 'src/zlib-ng/arch/x86/adler32_ssse3.c')
-rw-r--r-- | src/zlib-ng/arch/x86/adler32_ssse3.c | 194 |
1 files changed, 116 insertions, 78 deletions
diff --git a/src/zlib-ng/arch/x86/adler32_ssse3.c b/src/zlib-ng/arch/x86/adler32_ssse3.c
index 101df4f..8c55bad 100644
--- a/src/zlib-ng/arch/x86/adler32_ssse3.c
+++ b/src/zlib-ng/arch/x86/adler32_ssse3.c
@@ -1,14 +1,14 @@
-/* adler32.c -- compute the Adler-32 checksum of a data stream
+/* adler32_ssse3.c -- compute the Adler-32 checksum of a data stream
  * Copyright (C) 1995-2011 Mark Adler
  * Authors:
+ *   Adam Stylinski <kungfujesus06@gmail.com>
  *   Brian Bockelman <bockelman@gmail.com>
  * For conditions of distribution and use, see copyright notice in zlib.h
  */
 
 #include "../../zbuild.h"
-#include "../../zutil.h"
-
 #include "../../adler32_p.h"
+#include "adler32_ssse3_p.h"
 
 #ifdef X86_SSSE3_ADLER32
 
@@ -33,86 +33,124 @@ Z_INTERNAL uint32_t adler32_ssse3(uint32_t adler, const unsigned char *buf, size
     if (UNLIKELY(len < 16))
         return adler32_len_16(adler, buf, len, sum2);
 
-    uint32_t ALIGNED_(16) s1[4], s2[4];
-
-    s1[0] = s1[1] = s1[2] = 0; s1[3] = adler;
-    s2[0] = s2[1] = s2[2] = 0; s2[3] = sum2;
-
-    char ALIGNED_(16) dot1[16] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
-    __m128i dot1v = _mm_load_si128((__m128i*)dot1);
-    char ALIGNED_(16) dot2[16] = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
-    __m128i dot2v = _mm_load_si128((__m128i*)dot2);
-    short ALIGNED_(16) dot3[8] = {1, 1, 1, 1, 1, 1, 1, 1};
-    __m128i dot3v = _mm_load_si128((__m128i*)dot3);
-
-    // We will need to multiply by
-    //char ALIGNED_(16) shift[4] = {0, 0, 0, 4}; //{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4};
+    const __m128i dot2v = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
+    const __m128i dot2v_0 = _mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
+    const __m128i dot3v = _mm_set1_epi16(1);
+    const __m128i zero = _mm_setzero_si128();
+
+    __m128i vbuf, vs1_0, vs3, vs1, vs2, vs2_0, v_sad_sum1, v_short_sum2, v_short_sum2_0,
+            vbuf_0, v_sad_sum2, vsum2, vsum2_0;
+
+    /* If our buffer is unaligned (likely), make the determination whether
+     * or not there's enough of a buffer to consume to make the scalar, aligning
+     * additions worthwhile or if it's worth it to just eat the cost of an unaligned
+     * load. This is a pretty simple test, just test if 16 - the remainder + len is
+     * < 16 */
+    size_t max_iters = NMAX;
+    size_t rem = (uintptr_t)buf & 15;
+    size_t align_offset = 16 - rem;
+    size_t k = 0;
+    if (rem) {
+        if (len < 16 + align_offset) {
+            /* Let's eat the cost of this one unaligned load so that
+             * we don't completely skip over the vectorization. Doing
+             * 16 bytes at a time unaligned is is better than 16 + <= 15
+             * sums */
+            vbuf = _mm_loadu_si128((__m128i*)buf);
+            len -= 16;
+            buf += 16;
+            vs1 = _mm_cvtsi32_si128(adler);
+            vs2 = _mm_cvtsi32_si128(sum2);
+            vs3 = _mm_setzero_si128();
+            vs1_0 = vs1;
+            goto unaligned_jmp;
+        }
+
+        for (size_t i = 0; i < align_offset; ++i) {
+            adler += *(buf++);
+            sum2 += adler;
+        }
+
+        /* lop off the max number of sums based on the scalar sums done
+         * above */
+        len -= align_offset;
+        max_iters -= align_offset;
+    }
 
-    char ALIGNED_(16) shift[16] = {4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
-    __m128i shiftv = _mm_load_si128((__m128i*)shift);
     while (len >= 16) {
-        __m128i vs1 = _mm_load_si128((__m128i*)s1);
-        __m128i vs2 = _mm_load_si128((__m128i*)s2);
-        __m128i vs1_0 = vs1;
-
-        int k = (len < NMAX ? (int)len : NMAX);
-        k -= k % 16;
-        len -= k;
-
-        while (k >= 16) {
-            /*
-               vs1 = adler + sum(c[i])
-               vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
-
-               NOTE: 256-bit equivalents are:
-                 _mm256_maddubs_epi16 <- operates on 32 bytes to 16 shorts
-                 _mm256_madd_epi16    <- Sums 16 shorts to 8 int32_t.
-               We could rewrite the below to use 256-bit instructions instead of 128-bit.
-            */
-            __m128i vbuf = _mm_loadu_si128((__m128i*)buf);
-            buf += 16;
-            k -= 16;
-
-            __m128i v_short_sum1 = _mm_maddubs_epi16(vbuf, dot1v); // multiply-add, resulting in 8 shorts.
-            __m128i vsum1 = _mm_madd_epi16(v_short_sum1, dot3v);   // sum 8 shorts to 4 int32_t;
-            __m128i v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v);
-            vs1 = _mm_add_epi32(vsum1, vs1);
-            __m128i vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
-            vs1_0 = _mm_sll_epi32(vs1_0, shiftv);
-            vsum2 = _mm_add_epi32(vsum2, vs2);
-            vs2 = _mm_add_epi32(vsum2, vs1_0);
-            vs1_0 = vs1;
-        }
-
-        // At this point, we have partial sums stored in vs1 and vs2. There are AVX512 instructions that
-        // would allow us to sum these quickly (VP4DPWSSD). For now, just unpack and move on.
-
-        uint32_t ALIGNED_(16) s1_unpack[4];
-        uint32_t ALIGNED_(16) s2_unpack[4];
-
-        _mm_store_si128((__m128i*)s1_unpack, vs1);
-        _mm_store_si128((__m128i*)s2_unpack, vs2);
-
-        adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE);
-        adler %= BASE;
-        s1[3] = adler;
-
-        sum2 = (s2_unpack[0] % BASE) + (s2_unpack[1] % BASE) + (s2_unpack[2] % BASE) + (s2_unpack[3] % BASE);
-        sum2 %= BASE;
-        s2[3] = sum2;
-    }
-
-    while (len) {
-        len--;
-        adler += *buf++;
-        sum2 += adler;
+        vs1 = _mm_cvtsi32_si128(adler);
+        vs2 = _mm_cvtsi32_si128(sum2);
+        vs3 = _mm_setzero_si128();
+        vs2_0 = _mm_setzero_si128();
+        vs1_0 = vs1;
+
+        k = (len < max_iters ? len : max_iters);
+        k -= k % 16;
+        len -= k;
+
+        while (k >= 32) {
+            /*
+               vs1 = adler + sum(c[i])
+               vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
+            */
+            vbuf = _mm_load_si128((__m128i*)buf);
+            vbuf_0 = _mm_load_si128((__m128i*)(buf + 16));
+            buf += 32;
+            k -= 32;
+
+            v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
+            v_sad_sum2 = _mm_sad_epu8(vbuf_0, zero);
+            vs1 = _mm_add_epi32(v_sad_sum1, vs1);
+            vs3 = _mm_add_epi32(vs1_0, vs3);
+
+            vs1 = _mm_add_epi32(v_sad_sum2, vs1);
+            v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v);
+            vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
+            v_short_sum2_0 = _mm_maddubs_epi16(vbuf_0, dot2v_0);
+            vs2 = _mm_add_epi32(vsum2, vs2);
+            vsum2_0 = _mm_madd_epi16(v_short_sum2_0, dot3v);
+            vs2_0 = _mm_add_epi32(vsum2_0, vs2_0);
+            vs1_0 = vs1;
+        }
+
+        vs2 = _mm_add_epi32(vs2_0, vs2);
+        vs3 = _mm_slli_epi32(vs3, 5);
+        vs2 = _mm_add_epi32(vs3, vs2);
+        vs3 = _mm_setzero_si128();
+
+        while (k >= 16) {
+            /*
+               vs1 = adler + sum(c[i])
+               vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )
+            */
+            vbuf = _mm_load_si128((__m128i*)buf);
+            buf += 16;
+            k -= 16;
+
+unaligned_jmp:
+            v_sad_sum1 = _mm_sad_epu8(vbuf, zero);
+            vs1 = _mm_add_epi32(v_sad_sum1, vs1);
+            vs3 = _mm_add_epi32(vs1_0, vs3);
+            v_short_sum2 = _mm_maddubs_epi16(vbuf, dot2v_0);
+            vsum2 = _mm_madd_epi16(v_short_sum2, dot3v);
+            vs2 = _mm_add_epi32(vsum2, vs2);
+            vs1_0 = vs1;
+        }
+
+        vs3 = _mm_slli_epi32(vs3, 4);
+        vs2 = _mm_add_epi32(vs2, vs3);
+
+        /* We don't actually need to do a full horizontal sum, since psadbw is actually doing
+         * a partial reduction sum implicitly and only summing to integers in vector positions
+         * 0 and 2. This saves us some contention on the shuffle port(s) */
+        adler = partial_hsum(vs1) % BASE;
+        sum2 = hsum(vs2) % BASE;
+        max_iters = NMAX;
     }
-    adler %= BASE;
-    sum2 %= BASE;
-
-    /* return recombined sums */
-    return adler | (sum2 << 16);
+    /* Process tail (len < 16). */
+    return adler32_len_16(adler, buf, len, sum2);
 }
 
 #endif
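
For reference, the recurrence quoted in the loop comments (vs1 = adler + sum(c[i]), vs2 = sum2 + 16 vs1 + sum( (16-i+1) c[i] )) is the same update a scalar Adler-32 would make for one 16-byte chunk; the vector code only defers the "16 * old s1" term by accumulating old vs1 values into vs3 and shifting left by 4 (or by 5 in the two-block loop, since 32 bytes elapse per iteration). A minimal scalar sketch of that per-block update, not part of the commit and with illustrative names only:

#include <stdint.h>

/* One 16-byte block of Adler-32, written the way the vector loop's comment
 * describes it: s1 gains the plain byte sum, s2 gains 16*old_s1 plus the
 * weighted sum with weights 16..1 (the dot2v_0 constant in the diff). */
static void adler32_block16_scalar(uint32_t *s1, uint32_t *s2, const unsigned char *c) {
    uint32_t sum = 0, weighted = 0;
    for (int i = 0; i < 16; ++i) {
        sum      += c[i];                        /* what psadbw accumulates into vs1 */
        weighted += (uint32_t)(16 - i) * c[i];   /* what maddubs/madd accumulate into vs2 */
    }
    *s2 += 16 * *s1 + weighted;                  /* the 16*old_s1 term is what vs3 << 4 adds back */
    *s1 += sum;
}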
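
The partial_hsum() and hsum() reductions come from the new adler32_ssse3_p.h header, which is not shown in this diff. A sketch of what such helpers could look like, assuming the layout the final comment describes (psadbw leaves its byte sums only in 32-bit lanes 0 and 2, and adler is seeded into lane 0, so vs1 never has live data in lanes 1 and 3, while vs2 does in all four):

#include <stdint.h>
#include <immintrin.h>

/* vs1 is non-zero only in lanes 0 and 2, so one 64-bit byte shift plus an
 * add reduces it without touching the shuffle port. */
static inline uint32_t partial_hsum(__m128i x) {
    __m128i upper = _mm_srli_si128(x, 8);          /* bring lane 2 down to lane 0 */
    return (uint32_t)_mm_cvtsi128_si32(_mm_add_epi32(x, upper));
}

/* vs2 has live data in all four lanes, so it needs a full horizontal sum. */
static inline uint32_t hsum(__m128i x) {
    __m128i hi  = _mm_unpackhi_epi64(x, x);        /* lanes {2, 3, 2, 3} */
    __m128i s02 = _mm_add_epi32(x, hi);            /* lane0+lane2, lane1+lane3, ... */
    __m128i s13 = _mm_shuffle_epi32(s02, 0x01);    /* move lane 1 down to lane 0 */
    return (uint32_t)_mm_cvtsi128_si32(_mm_add_epi32(s02, s13));
}

The actual header in the commit may differ in detail; this only illustrates why the cheaper partial reduction is sufficient for vs1 but not for vs2.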