about | summary | refs | log | tree | commit | diff
path: root/include/fxdiv.h
diff options
context:
space:
mode:
Diffstat (limited to 'include/fxdiv.h')
-rw-r--r-- include/fxdiv.h 36
1 files changed, 26 insertions, 10 deletions
diff --git a/include/fxdiv.h b/include/fxdiv.h
index 21a3dc1..f5a09d0 100644
--- a/include/fxdiv.h
+++ b/include/fxdiv.h
@@ -14,10 +14,13 @@
#if defined(_MSC_VER)
#include <intrin.h>
+ #if defined(_M_IX86) || defined(_M_X64)
+ #include <immintrin.h>
+ #endif
#endif
#ifndef FXDIV_USE_INLINE_ASSEMBLY
- #define FXDIV_USE_INLINE_ASSEMBLY 1
+ #define FXDIV_USE_INLINE_ASSEMBLY 0
#endif
static inline uint64_t fxdiv_mulext_uint32_t(uint32_t a, uint32_t b) {
@@ -121,14 +124,15 @@ static inline struct fxdiv_divisor_uint32_t fxdiv_init_uint32_t(uint32_t d) {
const uint32_t l_minus_1 = 31 - clz(d - 1);
#elif defined(__CUDA_ARCH__)
const uint32_t l_minus_1 = 31 - __clz((int) (d - 1));
- #elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
+ #elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM) || defined(_M_ARM64))
unsigned long l_minus_1;
_BitScanReverse(&l_minus_1, (unsigned long) (d - 1));
#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) && FXDIV_USE_INLINE_ASSEMBLY
uint32_t l_minus_1;
__asm__("BSRL %[d_minus_1], %[l_minus_1]"
: [l_minus_1] "=r" (l_minus_1)
- : [d_minus_1] "r" (d - 1));
+ : [d_minus_1] "r" (d - 1)
+ : "cc");
#elif defined(__GNUC__)
const uint32_t l_minus_1 = 31 - __builtin_clz(d - 1);
#else
@@ -167,7 +171,11 @@ static inline struct fxdiv_divisor_uint32_t fxdiv_init_uint32_t(uint32_t d) {
uint32_t q;
__asm__("DIVL %[d]"
: "=a" (q), "+d" (u_hi)
- : [d] "r" (d), "a" (0));
+ : [d] "r" (d), "a" (0)
+ : "cc");
+ #elif (defined(_MSC_VER) && _MSC_VER >= 1920) && (defined(_M_IX86) || defined(_M_X64))
+ unsigned int remainder;
+ const uint32_t q = (uint32_t) _udiv64((unsigned __int64) ((uint64_t) u_hi << 32), (unsigned int) d, &remainder);
#else
const uint32_t q = ((uint64_t) u_hi << 32) / d;
#endif
@@ -192,13 +200,13 @@ static inline struct fxdiv_divisor_uint64_t fxdiv_init_uint64_t(uint64_t d) {
#elif defined(__CUDA_ARCH__)
const uint32_t nlz_d = __clzll((long long) d);
const uint32_t l_minus_1 = 63 - __clzll((long long) (d - 1));
- #elif defined(_MSC_VER) && defined(_M_X64)
+ #elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_ARM64))
unsigned long l_minus_1;
_BitScanReverse64(&l_minus_1, (unsigned __int64) (d - 1));
unsigned long bsr_d;
_BitScanReverse64(&bsr_d, (unsigned __int64) d);
const uint32_t nlz_d = bsr_d ^ 0x3F;
- #elif defined(_MSC_VER) && defined(_M_IX86)
+ #elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_ARM))
const uint64_t d_minus_1 = d - 1;
const uint8_t d_is_power_of_2 = (d & d_minus_1) == 0;
unsigned long l_minus_1;
@@ -213,7 +221,8 @@ static inline struct fxdiv_divisor_uint64_t fxdiv_init_uint64_t(uint64_t d) {
uint64_t l_minus_1;
__asm__("BSRQ %[d_minus_1], %[l_minus_1]"
: [l_minus_1] "=r" (l_minus_1)
- : [d_minus_1] "r" (d - 1));
+ : [d_minus_1] "r" (d - 1)
+ : "cc");
#elif defined(__GNUC__)
const uint32_t l_minus_1 = 63 - __builtin_clzll(d - 1);
const uint32_t nlz_d = __builtin_clzll(d);
@@ -221,8 +230,8 @@ static inline struct fxdiv_divisor_uint64_t fxdiv_init_uint64_t(uint64_t d) {
/* Based on Algorithm 2 from Hacker's delight */
const uint64_t d_minus_1 = d - 1;
const uint32_t d_is_power_of_2 = (d & d_minus_1) == 0;
- uint64_t l_minus_1 = 0;
- uint32_t x = d_minus_1;
+ uint32_t l_minus_1 = 0;
+ uint32_t x = (uint32_t) d_minus_1;
uint32_t y = d_minus_1 >> 32;
if (y != 0) {
l_minus_1 += 32;
@@ -260,7 +269,14 @@ static inline struct fxdiv_divisor_uint64_t fxdiv_init_uint64_t(uint64_t d) {
uint64_t q;
__asm__("DIVQ %[d]"
: "=a" (q), "+d" (u_hi)
- : [d] "r" (d), "a" (UINT64_C(0)));
+ : [d] "r" (d), "a" (UINT64_C(0))
+ : "cc");
+ #elif 0 && defined(__GNUC__) && defined(__SIZEOF_INT128__)
+ /* GCC, Clang, and Intel Compiler fail to inline optimized implementation and call into support library for 128-bit division */
+ const uint64_t q = (uint64_t) (((unsigned __int128) u_hi << 64) / ((unsigned __int128) d));
+ #elif (defined(_MSC_VER) && _MSC_VER >= 1920) && defined(_M_X64)
+ unsigned __int64 remainder;
+ const uint64_t q = (uint64_t) _udiv128((unsigned __int64) u_hi, 0, (unsigned __int64) d, &remainder);
#else
/* Implementation based on code from Hacker's delight */