summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTreehugger Robot <android-test-infra-autosubmit@system.gserviceaccount.com>2024-03-27 02:29:48 +0000
committerGerrit Code Review <noreply-gerritcodereview@google.com>2024-03-27 02:29:48 +0000
commit9c39311b68019eae34a7d094c1570682da3c15e2 (patch)
tree341695e6b1d6f6ca93b9f9b1c864f70735852b6e
parentbe9c729ed5f2c69d98ca1711652121f7ff7cfb5b (diff)
parent0e8a9da276950c61cf8df1be1ea5f22d037005fe (diff)
downloadzlib-9c39311b68019eae34a7d094c1570682da3c15e2.tar.gz
Merge "Upgrade zlib to 30bf3a72e77abb71568fa1e6258a0a731fef9ba3" into main
-rw-r--r--CMakeLists.txt2
-rw-r--r--METADATA4
-rw-r--r--adler32.c10
-rw-r--r--adler32_simd.c104
-rw-r--r--cpu_features.h3
5 files changed, 118 insertions, 5 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 34175a7..c3f4247 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -192,7 +192,9 @@ set(ZLIB_SRCS
if (ENABLE_SIMD_OPTIMIZATIONS)
if (CMAKE_SYSTEM_PROCESSOR STREQUAL "riscv64")
message("RISCVV: Add optimizations.")
+ list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/adler32_simd.h)
list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/cpu_features.h)
+ list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/adler32_simd.c)
list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/cpu_features.c)
else()
list(REMOVE_ITEM ZLIB_SRCS inflate.c)
diff --git a/METADATA b/METADATA
index 3d557a3..370833b 100644
--- a/METADATA
+++ b/METADATA
@@ -9,11 +9,11 @@ third_party {
last_upgrade_date {
year: 2024
month: 3
- day: 18
+ day: 26
}
identifier {
type: "Git"
value: "https://chromium.googlesource.com/chromium/src/third_party/zlib/"
- version: "24c07df5033183efad8607cba62e746bea7180bf"
+ version: "30bf3a72e77abb71568fa1e6258a0a731fef9ba3"
}
}
diff --git a/adler32.c b/adler32.c
index ebd1889..de78b4e 100644
--- a/adler32.c
+++ b/adler32.c
@@ -58,7 +58,7 @@
#endif
#include "cpu_features.h"
-#if defined(ADLER32_SIMD_SSSE3) || defined(ADLER32_SIMD_NEON)
+#if defined(ADLER32_SIMD_SSSE3) || defined(ADLER32_SIMD_NEON) || defined(ADLER32_SIMD_RVV)
#include "adler32_simd.h"
#endif
@@ -66,12 +66,16 @@
uLong ZEXPORT adler32_z(uLong adler, const Bytef *buf, z_size_t len) {
unsigned long sum2;
unsigned n;
-
+ /* TODO(cavalcantii): verify if these lengths are optimal for current CPUs. */
+#if defined(ADLER32_SIMD_SSSE3) || defined(ADLER32_SIMD_NEON) \
+ || defined(ADLER32_SIMD_RVV)
#if defined(ADLER32_SIMD_SSSE3)
if (buf != Z_NULL && len >= 64 && x86_cpu_enable_ssse3)
- return adler32_simd_(adler, buf, len);
#elif defined(ADLER32_SIMD_NEON)
if (buf != Z_NULL && len >= 64)
+#elif defined(ADLER32_SIMD_RVV)
+ if (buf != Z_NULL && len >= 32 && riscv_cpu_enable_rvv)
+#endif
return adler32_simd_(adler, buf, len);
#endif
diff --git a/adler32_simd.c b/adler32_simd.c
index 58966ee..9970ea9 100644
--- a/adler32_simd.c
+++ b/adler32_simd.c
@@ -41,6 +41,9 @@
* [2] zlib adler32_z() uses this fact to implement NMAX-block-based updates
* of the adler s1 s2 of uint32_t type (see adler32.c).
*/
+/* Copyright (C) 2023 SiFive, Inc. All rights reserved.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
#include "adler32_simd.h"
@@ -363,4 +366,105 @@ uint32_t ZLIB_INTERNAL adler32_simd_( /* NEON */
return s1 | (s2 << 16);
}
+#elif defined(ADLER32_SIMD_RVV)
+#include <riscv_vector.h>
+/* adler32_rvv.c - RVV version of Adler-32
+ * RVV 1.0 code contributed by Alex Chiang <alex.chiang@sifive.com>
+ * on https://github.com/zlib-ng/zlib-ng/pull/1532
+ * Port from Simon Hosie's fork:
+ * https://github.com/cloudflare/zlib/commit/40688b53c61cb9bfc36471acd2dc0800b7ebcab1
+ */
+
+uint32_t ZLIB_INTERNAL adler32_simd_( /* RVV */
+ uint32_t adler,
+ const unsigned char *buf,
+ unsigned long len)
+{
+ /* split Adler-32 into component sums */
+ uint32_t sum2 = (adler >> 16) & 0xffff;
+ adler &= 0xffff;
+
+ size_t left = len;
+ size_t vl = __riscv_vsetvlmax_e8m1();
+ vl = vl > 256 ? 256 : vl;
+ vuint32m4_t v_buf32_accu = __riscv_vmv_v_x_u32m4(0, vl);
+ vuint32m4_t v_adler32_prev_accu = __riscv_vmv_v_x_u32m4(0, vl);
+ vuint16m2_t v_buf16_accu;
+
+ /*
+ * We accumulate 8-bit data, and to prevent overflow, we have to use a 32-bit accumulator.
+ * However, adding 8-bit data into a 32-bit accumulator isn't efficient. We use 16-bit & 32-bit
+ * accumulators to boost performance.
+ *
+ * The block_size is the largest multiple of vl that is <= 256; beyond 256
+ * accumulated bytes a 16-bit lane could overflow (255 * 256 <= UINT16_MAX, so 256 is safe).
+ *
+ * We accumulate 8-bit data into a 16-bit accumulator and then
+ * move the data into the 32-bit accumulator at the last iteration.
+ */
+ size_t block_size = (256 / vl) * vl;
+ size_t nmax_limit = (NMAX / block_size);
+ size_t cnt = 0;
+ while (left >= block_size) {
+ v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl);
+ size_t subprob = block_size;
+ while (subprob > 0) {
+ vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(buf, vl);
+ v_adler32_prev_accu = __riscv_vwaddu_wv_u32m4(v_adler32_prev_accu, v_buf16_accu, vl);
+ v_buf16_accu = __riscv_vwaddu_wv_u16m2(v_buf16_accu, v_buf8, vl);
+ buf += vl;
+ subprob -= vl;
+ }
+ v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, block_size / vl, v_buf32_accu, vl);
+ v_buf32_accu = __riscv_vwaddu_wv_u32m4(v_buf32_accu, v_buf16_accu, vl);
+ left -= block_size;
+ /* do modulo once each block of NMAX size */
+ if (++cnt >= nmax_limit) {
+ v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl);
+ cnt = 0;
+ }
+ }
+ /* the remaining length is <= 256 now, so the 16-bit accumulator is safe */
+ v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl);
+ size_t res = left;
+ while (left >= vl) {
+ vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(buf, vl);
+ v_adler32_prev_accu = __riscv_vwaddu_wv_u32m4(v_adler32_prev_accu, v_buf16_accu, vl);
+ v_buf16_accu = __riscv_vwaddu_wv_u16m2(v_buf16_accu, v_buf8, vl);
+ buf += vl;
+ left -= vl;
+ }
+ v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, res / vl, v_buf32_accu, vl);
+ v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl);
+ v_buf32_accu = __riscv_vwaddu_wv_u32m4(v_buf32_accu, v_buf16_accu, vl);
+
+ vuint32m4_t v_seq = __riscv_vid_v_u32m4(vl);
+ vuint32m4_t v_rev_seq = __riscv_vrsub_vx_u32m4(v_seq, vl, vl);
+ vuint32m4_t v_sum32_accu = __riscv_vmul_vv_u32m4(v_buf32_accu, v_rev_seq, vl);
+
+ v_sum32_accu = __riscv_vadd_vv_u32m4(v_sum32_accu, __riscv_vmul_vx_u32m4(v_adler32_prev_accu, vl, vl), vl);
+
+ vuint32m1_t v_sum2_sum = __riscv_vmv_s_x_u32m1(0, vl);
+ v_sum2_sum = __riscv_vredsum_vs_u32m4_u32m1(v_sum32_accu, v_sum2_sum, vl);
+ uint32_t sum2_sum = __riscv_vmv_x_s_u32m1_u32(v_sum2_sum);
+
+ sum2 += (sum2_sum + adler * (len - left));
+
+ vuint32m1_t v_adler_sum = __riscv_vmv_s_x_u32m1(0, vl);
+ v_adler_sum = __riscv_vredsum_vs_u32m4_u32m1(v_buf32_accu, v_adler_sum, vl);
+ uint32_t adler_sum = __riscv_vmv_x_s_u32m1_u32(v_adler_sum);
+
+ adler += adler_sum;
+
+ while (left--) {
+ adler += *buf++;
+ sum2 += adler;
+ }
+
+ sum2 %= BASE;
+ adler %= BASE;
+
+ return adler | (sum2 << 16);
+}
+
#endif /* ADLER32_SIMD_SSSE3 */
diff --git a/cpu_features.h b/cpu_features.h
index aed3e83..6092c7e 100644
--- a/cpu_features.h
+++ b/cpu_features.h
@@ -16,4 +16,7 @@ extern int x86_cpu_enable_ssse3;
extern int x86_cpu_enable_simd;
extern int x86_cpu_enable_avx512;
+extern int riscv_cpu_enable_rvv;
+extern int riscv_cpu_enable_vclmul;
+
void cpu_check_features(void);