diff options
author | Adenilson Cavalcanti <cavalcantii@chromium.org> | 2024-04-05 21:50:32 +0000 |
---|---|---|
committer | Copybara-Service <copybara-worker@google.com> | 2024-04-05 14:55:41 -0700 |
commit | d076d8bd089843ae105b1aeeda32dbeb667402ef (patch) | |
tree | 2970399bd7f74cb806889708f0e2d560bedcc801 | |
parent | 0e58d440d69cc64ccbe542ea1a968b9befb01544 (diff) | |
download | zlib-d076d8bd089843ae105b1aeeda32dbeb667402ef.tar.gz |
[zlib][riscv] Implement generic chunk_copy
Back in 2017, Simon Hosie implemented chunk_copy for Arm using NEON
instructions, which was later ported to x86-64 by Noel Gordon.
The basic idea is to perform wide loads and stores while doing
data decompression (i.e. load a single wide vector instead of a single byte).
The current chunk_copy can be easily ported to other architectures that use
fixed length vectors/registers, but doesn't scale so well for architectures
with varied vector lengths (e.g. Arm SVE or RISCV RVV 1.0).
In any case, it is possible to have a *generic* chunk_copy** relying on the
compiler builtins memcpy/memset, and this patch introduces that functionality
in Chromium zlib.
One important detail is that chunk_copy was coded *before* read64le (an
optimization suggested by Nigel Tao that requires unaligned loads) and it is
a requirement for both read64le and unconditional decoding of literals
(suggested by Dougall Johnson).
The penalty of unaligned loads in read64le can actually negate the benefits of chunk_copy,
which is why we rely on clang flags to allow code generation that deals with
the issue.
The current patch yielded an average gain of +9.5% on a K230 board, with higher
gains for some important content like HTML (+16%) and source code (+11.6%).
** Link:
https://github.com/cloudflare/zlib/commit/063def93f91a3f5e463646fb3fe6da5c8705f8e8
Bug: 329282661
Change-Id: Ia32a4a1fed16169a59cd39775fa68f4e675dac09
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/5402331
Reviewed-by: Chris Blume <cblume@chromium.org>
Commit-Queue: Adenilson Cavalcanti <cavalcantii@chromium.org>
Cr-Commit-Position: refs/heads/main@{#1283414}
NOKEYCHECK=True
GitOrigin-RevId: cb959c56ec21abb0526f52b5f66a07fba7b6b145
-rw-r--r-- | CMakeLists.txt | 18 | ||||
-rw-r--r-- | contrib/optimizations/chunkcopy.h | 75 |
2 files changed, 90 insertions, 3 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index c3f4247..5db4a6e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -79,9 +79,16 @@ if (ENABLE_SIMD_OPTIMIZATIONS) add_definitions(-DRISCV_RVV) add_definitions(-DDEFLATE_SLIDE_HASH_RVV) add_definitions(-DADLER32_SIMD_RVV) - #TODO(cavalcantii): add remaining flags as we port optimizations to RVV. - # Required by CPU features detection code. - SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} --target=riscv64-unknown-linux-gnu -march=rv64gcv") + + # TODO(cavalcantii): add remaining flags as we port optimizations to RVV. + # chunk_copy is required for READ64 and unconditional decode of literals. + add_definitions(-DINFLATE_CHUNK_GENERIC) + add_definitions(-DINFLATE_CHUNK_READ_64LE) + + # Tested with clang-17, unaligned loads are required by read64 & chunk_copy. + # TODO(cavalcantii): replace internal clang flags for -munaligned-access + # when we have a newer compiler available. + SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} --target=riscv64-unknown-linux-gnu -march=rv64gcv -Xclang -target-feature -Xclang +unaligned-scalar-mem") endif() endif() @@ -192,9 +199,14 @@ set(ZLIB_SRCS if (ENABLE_SIMD_OPTIMIZATIONS) if (CMAKE_SYSTEM_PROCESSOR STREQUAL "riscv64") message("RISCVV: Add optimizations.") + list(REMOVE_ITEM ZLIB_SRCS inflate.c) list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/adler32_simd.h) + list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/contrib/optimizations/chunkcopy.h) list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/cpu_features.h) + list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/adler32_simd.c) + list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/contrib/optimizations/inffast_chunk.c) + list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/contrib/optimizations/inflate.c) list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/cpu_features.c) else() list(REMOVE_ITEM ZLIB_SRCS inflate.c) diff --git a/contrib/optimizations/chunkcopy.h b/contrib/optimizations/chunkcopy.h index f40546d..97efff3 100644 --- 
a/contrib/optimizations/chunkcopy.h +++ b/contrib/optimizations/chunkcopy.h @@ -21,8 +21,10 @@ #if defined(__clang__) || defined(__GNUC__) || defined(__llvm__) #define Z_BUILTIN_MEMCPY __builtin_memcpy +#define Z_BUILTIN_MEMSET __builtin_memset #else #define Z_BUILTIN_MEMCPY zmemcpy +#define Z_BUILTIN_MEMSET zmemset #endif #if defined(INFLATE_CHUNK_SIMD_NEON) @@ -31,6 +33,8 @@ typedef uint8x16_t z_vec128i_t; #elif defined(INFLATE_CHUNK_SIMD_SSE2) #include <emmintrin.h> typedef __m128i z_vec128i_t; +#elif defined(INFLATE_CHUNK_GENERIC) +typedef struct { uint8_t x[16]; } z_vec128i_t; #else #error chunkcopy.h inflate chunk SIMD is not defined for your build target #endif @@ -265,6 +269,77 @@ static inline z_vec128i_t v_load8_dup(const void* src) { static inline void v_store_128(void* out, const z_vec128i_t vec) { _mm_storeu_si128((__m128i*)out, vec); } +#elif defined(INFLATE_CHUNK_GENERIC) +/* + * Default implementations for chunk-copy functions rely on memcpy() being + * inlined by the compiler for best performance. This is most likely to work + * as expected when the length argument is constant (as is the case here) and + * the target supports unaligned loads and stores. Since that's not always a + * safe assumption, this may need extra compiler arguments such as + * `-mno-strict-align` or `-munaligned-access`, or the availability of + * extensions like SIMD. + */ + +/* + * v_load64_dup(): load *src as an unaligned 64-bit int and duplicate it in + * every 64-bit component of the 128-bit result (64-bit int splat). + */ +static inline z_vec128i_t v_load64_dup(const void* src) { + int64_t in; + Z_BUILTIN_MEMCPY(&in, src, sizeof(in)); + z_vec128i_t out; + for (int i = 0; i < sizeof(out); i += sizeof(in)) { + Z_BUILTIN_MEMCPY((uint8_t*)&out + i, &in, sizeof(in)); + } + return out; +} + +/* + * v_load32_dup(): load *src as an unaligned 32-bit int and duplicate it in + * every 32-bit component of the 128-bit result (32-bit int splat). 
+ */ +static inline z_vec128i_t v_load32_dup(const void* src) { + int32_t in; + Z_BUILTIN_MEMCPY(&in, src, sizeof(in)); + z_vec128i_t out; + for (int i = 0; i < sizeof(out); i += sizeof(in)) { + Z_BUILTIN_MEMCPY((uint8_t*)&out + i, &in, sizeof(in)); + } + return out; +} + +/* + * v_load16_dup(): load *src as an unaligned 16-bit int and duplicate it in + * every 16-bit component of the 128-bit result (16-bit int splat). + */ +static inline z_vec128i_t v_load16_dup(const void* src) { + int16_t in; + Z_BUILTIN_MEMCPY(&in, src, sizeof(in)); + z_vec128i_t out; + for (int i = 0; i < sizeof(out); i += sizeof(in)) { + Z_BUILTIN_MEMCPY((uint8_t*)&out + i, &in, sizeof(in)); + } + return out; +} + +/* + * v_load8_dup(): load the 8-bit int *src and duplicate it in every 8-bit + * component of the 128-bit result (8-bit int splat). + */ +static inline z_vec128i_t v_load8_dup(const void* src) { + int8_t in = *(const uint8_t*)src; + z_vec128i_t out; + Z_BUILTIN_MEMSET(&out, in, sizeof(out)); + return out; +} + +/* + * v_store_128(): store the 128-bit vec in a memory destination (that might + * not be 16-byte aligned) void* out. + */ +static inline void v_store_128(void* out, const z_vec128i_t vec) { + Z_BUILTIN_MEMCPY(out, &vec, sizeof(vec)); +} #endif /* |