diff options
author | Adenilson Cavalcanti <cavalcantii@chromium.org> | 2024-03-21 19:34:47 +0000 |
---|---|---|
committer | Copybara-Service <copybara-worker@google.com> | 2024-03-26 12:04:09 -0700 |
commit | 30bf3a72e77abb71568fa1e6258a0a731fef9ba3 (patch) | |
tree | fc44a0344e82eaa898405fb2722ecff5e02bac23 | |
parent | 24c07df5033183efad8607cba62e746bea7180bf (diff) | |
download | zlib-30bf3a72e77abb71568fa1e6258a0a731fef9ba3.tar.gz |
[zlib][riscv] Import RVV 1.0 based version of Adler-32.
Adding a vectorized version of Adler-32, as it should help
with the zlib wrapper for DEFLATE and PNG decoding.
The original code was written by Alex Chiang and imported
into the Cloudflare zlib fork by Simon Hosie.
Average decompression gain was +14.4% (and only +1% for compression)
with the zlib wrapper running on a Kendryte K230 board.
Bug: 329282661
Change-Id: I9ccae19c46240c6ee517e24ce142e0fe600f4321
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/5378739
Reviewed-by: Hans Wennborg <hans@chromium.org>
Commit-Queue: Adenilson Cavalcanti <cavalcantii@chromium.org>
Cr-Commit-Position: refs/heads/main@{#1276438}
NOKEYCHECK=True
GitOrigin-RevId: c0e7820262df6b9e69252babe4ffc1cccc1af135
-rw-r--r-- | CMakeLists.txt | 2 | ||||
-rw-r--r-- | adler32.c | 10 | ||||
-rw-r--r-- | adler32_simd.c | 104 | ||||
-rw-r--r-- | cpu_features.h | 3 |
4 files changed, 116 insertions, 3 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index 34175a7..c3f4247 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -192,7 +192,9 @@ set(ZLIB_SRCS if (ENABLE_SIMD_OPTIMIZATIONS) if (CMAKE_SYSTEM_PROCESSOR STREQUAL "riscv64") message("RISCVV: Add optimizations.") + list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/adler32_simd.h) list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/cpu_features.h) + list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/adler32_simd.c) list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/cpu_features.c) else() list(REMOVE_ITEM ZLIB_SRCS inflate.c) @@ -58,7 +58,7 @@ #endif #include "cpu_features.h" -#if defined(ADLER32_SIMD_SSSE3) || defined(ADLER32_SIMD_NEON) +#if defined(ADLER32_SIMD_SSSE3) || defined(ADLER32_SIMD_NEON) || defined(ADLER32_SIMD_RVV) #include "adler32_simd.h" #endif @@ -66,12 +66,16 @@ uLong ZEXPORT adler32_z(uLong adler, const Bytef *buf, z_size_t len) { unsigned long sum2; unsigned n; - + /* TODO(cavalcantii): verify if this lengths are optimal for current CPUs. */ +#if defined(ADLER32_SIMD_SSSE3) || defined(ADLER32_SIMD_NEON) \ + || defined(ADLER32_SIMD_RVV) #if defined(ADLER32_SIMD_SSSE3) if (buf != Z_NULL && len >= 64 && x86_cpu_enable_ssse3) - return adler32_simd_(adler, buf, len); #elif defined(ADLER32_SIMD_NEON) if (buf != Z_NULL && len >= 64) +#elif defined(ADLER32_SIMD_RVV) + if (buf != Z_NULL && len >= 32 && riscv_cpu_enable_rvv) +#endif return adler32_simd_(adler, buf, len); #endif diff --git a/adler32_simd.c b/adler32_simd.c index 58966ee..9970ea9 100644 --- a/adler32_simd.c +++ b/adler32_simd.c @@ -41,6 +41,9 @@ * [2] zlib adler32_z() uses this fact to implement NMAX-block-based updates * of the adler s1 s2 of uint32_t type (see adler32.c). */ +/* Copyright (C) 2023 SiFive, Inc. All rights reserved. 
+ * For conditions of distribution and use, see copyright notice in zlib.h + */ #include "adler32_simd.h" @@ -363,4 +366,105 @@ uint32_t ZLIB_INTERNAL adler32_simd_( /* NEON */ return s1 | (s2 << 16); } +#elif defined(ADLER32_SIMD_RVV) +#include <riscv_vector.h> +/* adler32_rvv.c - RVV version of Adler-32 + * RVV 1.0 code contributed by Alex Chiang <alex.chiang@sifive.com> + * on https://github.com/zlib-ng/zlib-ng/pull/1532 + * Port from Simon Hosie's fork: + * https://github.com/cloudflare/zlib/commit/40688b53c61cb9bfc36471acd2dc0800b7ebcab1 + */ + +uint32_t ZLIB_INTERNAL adler32_simd_( /* RVV */ + uint32_t adler, + const unsigned char *buf, + unsigned long len) +{ + /* split Adler-32 into component sums */ + uint32_t sum2 = (adler >> 16) & 0xffff; + adler &= 0xffff; + + size_t left = len; + size_t vl = __riscv_vsetvlmax_e8m1(); + vl = vl > 256 ? 256 : vl; + vuint32m4_t v_buf32_accu = __riscv_vmv_v_x_u32m4(0, vl); + vuint32m4_t v_adler32_prev_accu = __riscv_vmv_v_x_u32m4(0, vl); + vuint16m2_t v_buf16_accu; + + /* + * We accumulate 8-bit data, and to prevent overflow, we have to use a 32-bit accumulator. + * However, adding 8-bit data into a 32-bit accumulator isn't efficient. We use 16-bit & 32-bit + * accumulators to boost performance. + * + * The block_size is the largest multiple of vl that <= 256, because overflow would occur when + * vl > 256 (255 * 256 <= UINT16_MAX). + * + * We accumulate 8-bit data into a 16-bit accumulator and then + * move the data into the 32-bit accumulator at the last iteration. 
+ */ + size_t block_size = (256 / vl) * vl; + size_t nmax_limit = (NMAX / block_size); + size_t cnt = 0; + while (left >= block_size) { + v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl); + size_t subprob = block_size; + while (subprob > 0) { + vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(buf, vl); + v_adler32_prev_accu = __riscv_vwaddu_wv_u32m4(v_adler32_prev_accu, v_buf16_accu, vl); + v_buf16_accu = __riscv_vwaddu_wv_u16m2(v_buf16_accu, v_buf8, vl); + buf += vl; + subprob -= vl; + } + v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, block_size / vl, v_buf32_accu, vl); + v_buf32_accu = __riscv_vwaddu_wv_u32m4(v_buf32_accu, v_buf16_accu, vl); + left -= block_size; + /* do modulo once each block of NMAX size */ + if (++cnt >= nmax_limit) { + v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl); + cnt = 0; + } + } + /* the left len <= 256 now, we can use 16-bit accum safely */ + v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl); + size_t res = left; + while (left >= vl) { + vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(buf, vl); + v_adler32_prev_accu = __riscv_vwaddu_wv_u32m4(v_adler32_prev_accu, v_buf16_accu, vl); + v_buf16_accu = __riscv_vwaddu_wv_u16m2(v_buf16_accu, v_buf8, vl); + buf += vl; + left -= vl; + } + v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, res / vl, v_buf32_accu, vl); + v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl); + v_buf32_accu = __riscv_vwaddu_wv_u32m4(v_buf32_accu, v_buf16_accu, vl); + + vuint32m4_t v_seq = __riscv_vid_v_u32m4(vl); + vuint32m4_t v_rev_seq = __riscv_vrsub_vx_u32m4(v_seq, vl, vl); + vuint32m4_t v_sum32_accu = __riscv_vmul_vv_u32m4(v_buf32_accu, v_rev_seq, vl); + + v_sum32_accu = __riscv_vadd_vv_u32m4(v_sum32_accu, __riscv_vmul_vx_u32m4(v_adler32_prev_accu, vl, vl), vl); + + vuint32m1_t v_sum2_sum = __riscv_vmv_s_x_u32m1(0, vl); + v_sum2_sum = __riscv_vredsum_vs_u32m4_u32m1(v_sum32_accu, v_sum2_sum, vl); + uint32_t sum2_sum = 
__riscv_vmv_x_s_u32m1_u32(v_sum2_sum); + + sum2 += (sum2_sum + adler * (len - left)); + + vuint32m1_t v_adler_sum = __riscv_vmv_s_x_u32m1(0, vl); + v_adler_sum = __riscv_vredsum_vs_u32m4_u32m1(v_buf32_accu, v_adler_sum, vl); + uint32_t adler_sum = __riscv_vmv_x_s_u32m1_u32(v_adler_sum); + + adler += adler_sum; + + while (left--) { + adler += *buf++; + sum2 += adler; + } + + sum2 %= BASE; + adler %= BASE; + + return adler | (sum2 << 16); +} + #endif /* ADLER32_SIMD_SSSE3 */ diff --git a/cpu_features.h b/cpu_features.h index aed3e83..6092c7e 100644 --- a/cpu_features.h +++ b/cpu_features.h @@ -16,4 +16,7 @@ extern int x86_cpu_enable_ssse3; extern int x86_cpu_enable_simd; extern int x86_cpu_enable_avx512; +extern int riscv_cpu_enable_rvv; +extern int riscv_cpu_enable_vclmul; + void cpu_check_features(void); |