Diffstat (limited to 'crc32_simd.c')
-rw-r--r--  crc32_simd.c  202
1 file changed, 195 insertions, 7 deletions
diff --git a/crc32_simd.c b/crc32_simd.c
index c8e5592..d80beba 100644
--- a/crc32_simd.c
+++ b/crc32_simd.c
@@ -1,6 +1,6 @@
 /* crc32_simd.c
  *
- * Copyright 2017 The Chromium Authors. All rights reserved.
+ * Copyright 2017 The Chromium Authors
  * Use of this source code is governed by a BSD-style license that can be
  * found in the Chromium source repository LICENSE file.
  */
@@ -157,11 +157,16 @@ uint32_t ZLIB_INTERNAL crc32_sse42_simd_(  /* SSE4.2+PCLMUL */
 #elif defined(CRC32_ARMV8_CRC32)
 
 /* CRC32 checksums using ARMv8-a crypto instructions.
- *
- * TODO: implement a version using the PMULL instruction.
  */
 
 #if defined(__clang__)
+/* We need some extra types for using PMULL.
+ */
+#if defined(__aarch64__)
+#include <arm_neon.h>
+#include <arm_acle.h>
+#endif
+
 /* CRC32 intrinsics are #ifdef'ed out of arm_acle.h unless we build with an
  * armv8 target, which is incompatible with ThinLTO optimizations on Android.
  * (Namely, mixing and matching different module-level targets makes ThinLTO
@@ -177,14 +182,21 @@ uint32_t ZLIB_INTERNAL crc32_sse42_simd_(  /* SSE4.2+PCLMUL */
  * NOTE: clang currently complains that "'+soft-float-abi' is not a recognized
  * feature for this target (ignoring feature)." This appears to be a harmless
  * bug in clang.
+ *
+ * These definitions must appear *after* including arm_acle.h otherwise that
+ * header may end up defining functions named __builtin_arm_crc32* that call
+ * themselves, creating an infinite loop when the intrinsic is called.
  */
+/* XXX: Cannot hook into builtins with XCode for arm64. */
+#if !defined(ARMV8_OS_MACOS)
 #define __crc32b __builtin_arm_crc32b
 #define __crc32d __builtin_arm_crc32d
 #define __crc32w __builtin_arm_crc32w
 #define __crc32cw __builtin_arm_crc32cw
+#endif
 
 #if defined(__aarch64__)
-#define TARGET_ARMV8_WITH_CRC __attribute__((target("crc")))
+#define TARGET_ARMV8_WITH_CRC __attribute__((target("aes,crc")))
 #else  // !defined(__aarch64__)
 #define TARGET_ARMV8_WITH_CRC __attribute__((target("armv8-a,crc")))
 #endif  // defined(__aarch64__)
@@ -194,15 +206,17 @@ uint32_t ZLIB_INTERNAL crc32_sse42_simd_(  /* SSE4.2+PCLMUL */
  * allowed. We can just include arm_acle.h.
  */
 #include <arm_acle.h>
+#include <arm_neon.h>
 #define TARGET_ARMV8_WITH_CRC
 #else  // !defined(__GNUC__) && !defined(_aarch64__)
 #error ARM CRC32 SIMD extensions only supported for Clang and GCC
 #endif
 
 TARGET_ARMV8_WITH_CRC
-uint32_t ZLIB_INTERNAL armv8_crc32_little(unsigned long crc,
-                                          const unsigned char *buf,
-                                          z_size_t len)
+uint32_t ZLIB_INTERNAL armv8_crc32_little(
+    const unsigned char *buf,
+    z_size_t len,
+    uint32_t crc)
 {
     uint32_t c = (uint32_t) ~crc;
 
@@ -240,4 +254,178 @@ uint32_t ZLIB_INTERNAL armv8_crc32_little(unsigned long crc,
     return ~c;
 }
 
+#if defined(__aarch64__) || defined(ARMV8_OS_MACOS) /* aarch64 specific code. */
+
+/*
+ * crc32_pmull_simd_(): compute the crc32 of the buffer, where the buffer
+ * length must be at least 64, and a multiple of 16. Based on:
+ *
+ * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
+ *  V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0
+ */
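The three inline-assembly helpers that follow wrap the AArch64 PMULL/PMULL2 instructions, which carry-less multiply two 64-bit polynomials into a 128-bit product; the patch uses inline assembly presumably so the file builds without the crypto-guarded intrinsics discussed above. For reference, the same low-lane multiply can be written with the ACLE intrinsic vmull_p64. This is a minimal sketch for illustration only, assuming the same clang-style target("aes") attribute the patch itself uses; pmull_lo_sketch is a hypothetical name, not part of the patch:

#include <arm_neon.h>

/* Carry-less multiply of the low 64-bit lanes of a and b; vmull_p64
 * maps to the same PMULL instruction as the pmull_lo() helper below. */
__attribute__((target("aes")))
static inline uint8x16_t pmull_lo_sketch(uint64x2_t a, uint64x2_t b)
{
    poly64_t a0 = (poly64_t) vgetq_lane_u64(a, 0);
    poly64_t b0 = (poly64_t) vgetq_lane_u64(b, 0);
    return vreinterpretq_u8_p128(vmull_p64(a0, b0));
}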
+TARGET_ARMV8_WITH_CRC
+static inline uint8x16_t pmull_lo(const uint64x2_t a, const uint64x2_t b)
+{
+    uint8x16_t r;
+    __asm__ __volatile__ ("pmull  %0.1q, %1.1d, %2.1d \n\t"
+        : "=w" (r) : "w" (a), "w" (b) );
+    return r;
+}
+
+TARGET_ARMV8_WITH_CRC
+static inline uint8x16_t pmull_01(const uint64x2_t a, const uint64x2_t b)
+{
+    uint8x16_t r;
+    __asm__ __volatile__ ("pmull  %0.1q, %1.1d, %2.1d \n\t"
+        : "=w" (r) : "w" (a), "w" (vgetq_lane_u64(b, 1)) );
+    return r;
+}
+
+TARGET_ARMV8_WITH_CRC
+static inline uint8x16_t pmull_hi(const uint64x2_t a, const uint64x2_t b)
+{
+    uint8x16_t r;
+    __asm__ __volatile__ ("pmull2 %0.1q, %1.2d, %2.2d \n\t"
+        : "=w" (r) : "w" (a), "w" (b) );
+    return r;
+}
+
+TARGET_ARMV8_WITH_CRC
+uint32_t ZLIB_INTERNAL armv8_crc32_pmull_little(
+    const unsigned char *buf,
+    z_size_t len,
+    uint32_t crc)
+{
+    /*
+     * Definitions of the bit-reflected domain constants k1,k2,k3, etc and
+     * the CRC32+Barrett polynomials given at the end of the paper.
+     */
+    static const uint64_t zalign(16) k1k2[] = { 0x0154442bd4, 0x01c6e41596 };
+    static const uint64_t zalign(16) k3k4[] = { 0x01751997d0, 0x00ccaa009e };
+    static const uint64_t zalign(16) k5k0[] = { 0x0163cd6124, 0x0000000000 };
+    static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 };
+
+    uint64x2_t x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
+
+    /*
+     * There's at least one block of 64.
+     */
+    x1 = vld1q_u64((const uint64_t *)(buf + 0x00));
+    x2 = vld1q_u64((const uint64_t *)(buf + 0x10));
+    x3 = vld1q_u64((const uint64_t *)(buf + 0x20));
+    x4 = vld1q_u64((const uint64_t *)(buf + 0x30));
+
+    x1 = veorq_u64(x1, (uint64x2_t) vsetq_lane_u32(crc, vdupq_n_u32(0), 0));
+
+    x0 = vld1q_u64(k1k2);
+
+    buf += 64;
+    len -= 64;
+
+    /*
+     * Parallel fold blocks of 64, if any.
+     */
+    while (len >= 64)
+    {
+        x5 = (uint64x2_t) pmull_lo(x1, x0);
+        x6 = (uint64x2_t) pmull_lo(x2, x0);
+        x7 = (uint64x2_t) pmull_lo(x3, x0);
+        x8 = (uint64x2_t) pmull_lo(x4, x0);
+
+        y5 = vld1q_u64((const uint64_t *)(buf + 0x00));
+        y6 = vld1q_u64((const uint64_t *)(buf + 0x10));
+        y7 = vld1q_u64((const uint64_t *)(buf + 0x20));
+        y8 = vld1q_u64((const uint64_t *)(buf + 0x30));
+
+        x1 = (uint64x2_t) pmull_hi(x1, x0);
+        x2 = (uint64x2_t) pmull_hi(x2, x0);
+        x3 = (uint64x2_t) pmull_hi(x3, x0);
+        x4 = (uint64x2_t) pmull_hi(x4, x0);
+
+        x1 = veorq_u64(x1, x5);
+        x2 = veorq_u64(x2, x6);
+        x3 = veorq_u64(x3, x7);
+        x4 = veorq_u64(x4, x8);
+
+        x1 = veorq_u64(x1, y5);
+        x2 = veorq_u64(x2, y6);
+        x3 = veorq_u64(x3, y7);
+        x4 = veorq_u64(x4, y8);
+
+        buf += 64;
+        len -= 64;
+    }
+
+    /*
+     * Fold into 128-bits.
+     */
+    x0 = vld1q_u64(k3k4);
+
+    x5 = (uint64x2_t) pmull_lo(x1, x0);
+    x1 = (uint64x2_t) pmull_hi(x1, x0);
+    x1 = veorq_u64(x1, x2);
+    x1 = veorq_u64(x1, x5);
+
+    x5 = (uint64x2_t) pmull_lo(x1, x0);
+    x1 = (uint64x2_t) pmull_hi(x1, x0);
+    x1 = veorq_u64(x1, x3);
+    x1 = veorq_u64(x1, x5);
+
+    x5 = (uint64x2_t) pmull_lo(x1, x0);
+    x1 = (uint64x2_t) pmull_hi(x1, x0);
+    x1 = veorq_u64(x1, x4);
+    x1 = veorq_u64(x1, x5);
+
+    /*
+     * Single fold blocks of 16, if any.
+     */
+    while (len >= 16)
+    {
+        x2 = vld1q_u64((const uint64_t *)buf);
+
+        x5 = (uint64x2_t) pmull_lo(x1, x0);
+        x1 = (uint64x2_t) pmull_hi(x1, x0);
+        x1 = veorq_u64(x1, x2);
+        x1 = veorq_u64(x1, x5);
+
+        buf += 16;
+        len -= 16;
+    }
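Each fold step in the loops above computes x' = clmul(x.lo, k_lo) ^ clmul(x.hi, k_hi) ^ next: CRC is linear over GF(2), so multiplying the running remainder by a precomputed power of x reduced modulo the CRC polynomial advances it past the bytes being folded in. A portable scalar model of one 16-byte fold, for illustration only; clmul64 and fold16 are hypothetical helpers, not part of the patch:

#include <stdint.h>

/* 64x64 -> 128 bit carry-less multiply, bit by bit; the patch does
 * this in a single PMULL instruction. */
static void clmul64(uint64_t a, uint64_t b, uint64_t *lo, uint64_t *hi)
{
    uint64_t l = 0, h = 0;
    int i;
    for (i = 0; i < 64; i++) {
        if ((b >> i) & 1) {
            l ^= a << i;
            if (i)
                h ^= a >> (64 - i);
        }
    }
    *lo = l;
    *hi = h;
}

/* One fold-by-16 step, mirroring the pmull_lo/pmull_hi pair in the
 * while (len >= 16) loop above: k_lo and k_hi are the two halves of
 * k3k4, each a power of x reduced modulo the CRC polynomial. */
static void fold16(uint64_t x[2], const uint64_t next[2],
                   uint64_t k_lo, uint64_t k_hi)
{
    uint64_t a0, a1, b0, b1;
    clmul64(x[0], k_lo, &a0, &a1);  /* pmull:  low lanes  */
    clmul64(x[1], k_hi, &b0, &b1);  /* pmull2: high lanes */
    x[0] = a0 ^ b0 ^ next[0];
    x[1] = a1 ^ b1 ^ next[1];
}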
+
+    /*
+     * Fold 128-bits to 64-bits.
+     */
+    static uint32_t zalign(16) mask[] = { ~0u, 0u, ~0u, 0u };
+
+    x2 = (uint64x2_t) pmull_01(x1, x0);
+    x1 = (uint64x2_t) vextq_u8(vreinterpretq_u8_u64(x1), vdupq_n_u8(0), 8);
+    x3 = (uint64x2_t) vld1q_u32(mask);
+    x1 = veorq_u64(x1, x2);
+
+    x0 = vld1q_u64(k5k0);
+
+    x2 = (uint64x2_t) pmull_01(x2, x0);
+    x2 = (uint64x2_t) vextq_u8(vreinterpretq_u8_u64(x1), vdupq_n_u8(0), 4);
+    x1 = vandq_u64(x1, x3);
+    x1 = (uint64x2_t) pmull_lo(x1, x0);
+    x1 = veorq_u64(x1, x2);
+
+    /*
+     * Barrett reduce to 32-bits.
+     */
+    x0 = vld1q_u64(poly);
+
+    x2 = vandq_u64(x1, x3);
+    x2 = (uint64x2_t) pmull_01(x2, x0);
+    x2 = vandq_u64(x2, x3);
+    x2 = (uint64x2_t) pmull_lo(x2, x0);
+    x1 = veorq_u64(x1, x2);
+
+    /*
+     * Return the crc32.
+     */
+    return vgetq_lane_u32(vreinterpretq_u32_u64(x1), 1);
+}
+#endif /* aarch64 specific code. */
+
 #endif
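armv8_crc32_pmull_little() requires len >= 64 and a multiple of 16, and, unlike armv8_crc32_little() above, it neither inverts the incoming crc nor the result, so the caller supplies the pre-inverted state and re-inverts the return value, as the SSE4.2+PCLMUL path's callers do. A hypothetical dispatch sketch under those assumptions; crc32_armv8_dispatch is an illustrative name, and the real wiring in zlib's crc32.c is not part of this diff:

static uint32_t crc32_armv8_dispatch(uint32_t crc, const unsigned char *buf,
                                     z_size_t len)
{
    if (len >= 64) {
        /* Largest prefix that is a multiple of 16 bytes. */
        z_size_t blocks = len & ~(z_size_t)15;
        crc = ~armv8_crc32_pmull_little(buf, blocks, ~crc);
        buf += blocks;
        len -= blocks;
    }
    if (len)  /* Tail: the CRC32-instruction path handles any length. */
        crc = armv8_crc32_little(buf, len, crc);
    return crc;
}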