aboutsummaryrefslogtreecommitdiff
path: root/third_party/crc32c/src/src/crc32c_arm64.cc
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/crc32c/src/src/crc32c_arm64.cc')
-rw-r--r--third_party/crc32c/src/src/crc32c_arm64.cc124
1 files changed, 124 insertions, 0 deletions
diff --git a/third_party/crc32c/src/src/crc32c_arm64.cc b/third_party/crc32c/src/src/crc32c_arm64.cc
new file mode 100644
index 0000000000..9a988c1eed
--- /dev/null
+++ b/third_party/crc32c/src/src/crc32c_arm64.cc
@@ -0,0 +1,124 @@
+// Copyright 2017 The CRC32C Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "./crc32c_arm64.h"
+
+// In a separate source file to allow this accelerated CRC32C function to be
+// compiled with the appropriate compiler flags to enable ARM NEON CRC32C
+// instructions.
+
+// This implementation is based on https://github.com/google/leveldb/pull/490.
+
+#include <cstddef>
+#include <cstdint>
+
+#include "./crc32c_internal.h"
+#include "crc32c/crc32c_config.h"
+
+#if HAVE_ARM64_CRC32C
+
+#include <arm_acle.h>
+#include <arm_neon.h>
+
+#define KBYTES 1032
+#define SEGMENTBYTES 256
+
+// compute 8bytes for each segment parallelly
+#define CRC32C32BYTES(P, IND) \
+ do { \
+ crc1 = __crc32cd( \
+ crc1, *((const uint64_t *)(P) + (SEGMENTBYTES / 8) * 1 + (IND))); \
+ crc2 = __crc32cd( \
+ crc2, *((const uint64_t *)(P) + (SEGMENTBYTES / 8) * 2 + (IND))); \
+ crc3 = __crc32cd( \
+ crc3, *((const uint64_t *)(P) + (SEGMENTBYTES / 8) * 3 + (IND))); \
+ crc0 = __crc32cd( \
+ crc0, *((const uint64_t *)(P) + (SEGMENTBYTES / 8) * 0 + (IND))); \
+ } while (0);
+
+// compute 8*8 bytes for each segment parallelly
+#define CRC32C256BYTES(P, IND) \
+ do { \
+ CRC32C32BYTES((P), (IND)*8 + 0) \
+ CRC32C32BYTES((P), (IND)*8 + 1) \
+ CRC32C32BYTES((P), (IND)*8 + 2) \
+ CRC32C32BYTES((P), (IND)*8 + 3) \
+ CRC32C32BYTES((P), (IND)*8 + 4) \
+ CRC32C32BYTES((P), (IND)*8 + 5) \
+ CRC32C32BYTES((P), (IND)*8 + 6) \
+ CRC32C32BYTES((P), (IND)*8 + 7) \
+ } while (0);
+
+// compute 4*8*8 bytes for each segment parallelly
+#define CRC32C1024BYTES(P) \
+ do { \
+ CRC32C256BYTES((P), 0) \
+ CRC32C256BYTES((P), 1) \
+ CRC32C256BYTES((P), 2) \
+ CRC32C256BYTES((P), 3) \
+ (P) += 4 * SEGMENTBYTES; \
+ } while (0)
+
+namespace crc32c {
+
+uint32_t ExtendArm64(uint32_t crc, const uint8_t *buf, size_t size) {
+ int64_t length = size;
+ uint32_t crc0, crc1, crc2, crc3;
+ uint64_t t0, t1, t2;
+
+ // k0=CRC(x^(3*SEGMENTBYTES*8)), k1=CRC(x^(2*SEGMENTBYTES*8)),
+ // k2=CRC(x^(SEGMENTBYTES*8))
+ const poly64_t k0 = 0x8d96551c, k1 = 0xbd6f81f8, k2 = 0xdcb17aa4;
+
+ crc = crc ^ kCRC32Xor;
+ const uint8_t *p = reinterpret_cast<const uint8_t *>(buf);
+
+ while (length >= KBYTES) {
+ crc0 = crc;
+ crc1 = 0;
+ crc2 = 0;
+ crc3 = 0;
+
+ // Process 1024 bytes in parallel.
+ CRC32C1024BYTES(p);
+
+ // Merge the 4 partial CRC32C values.
+ t2 = (uint64_t)vmull_p64(crc2, k2);
+ t1 = (uint64_t)vmull_p64(crc1, k1);
+ t0 = (uint64_t)vmull_p64(crc0, k0);
+ crc = __crc32cd(crc3, *(uint64_t *)p);
+ p += sizeof(uint64_t);
+ crc ^= __crc32cd(0, t2);
+ crc ^= __crc32cd(0, t1);
+ crc ^= __crc32cd(0, t0);
+
+ length -= KBYTES;
+ }
+
+ while (length >= 8) {
+ crc = __crc32cd(crc, *(uint64_t *)p);
+ p += 8;
+ length -= 8;
+ }
+
+ if (length & 4) {
+ crc = __crc32cw(crc, *(uint32_t *)p);
+ p += 4;
+ }
+
+ if (length & 2) {
+ crc = __crc32ch(crc, *(uint16_t *)p);
+ p += 2;
+ }
+
+ if (length & 1) {
+ crc = __crc32cb(crc, *p);
+ }
+
+ return crc ^ kCRC32Xor;
+}
+
+} // namespace crc32c
+
+#endif // HAVE_ARM64_CRC32C