diff options
author | Marat Dukhan <maratek@google.com> | 2020-05-08 20:45:20 -0700 |
---|---|---|
committer | Marat Dukhan <maratek@google.com> | 2020-05-08 20:45:20 -0700 |
commit | 6ae95d3d7b407ac310e59958a92bc8be4583340e (patch) | |
tree | fb60f8d8c67d7c3290158d9926d4b4944d1758f0 | |
parent | 5690b5ceada160444a916d31ef72e381f5e52d67 (diff) | |
download | pthreadpool-6ae95d3d7b407ac310e59958a92bc8be4583340e.tar.gz |
Use platform-specific yield/pause instructions
-rw-r--r-- | src/pthreads.c | 6 | ||||
-rw-r--r-- | src/threadpool-atomics.h | 33 | ||||
-rw-r--r-- | src/threadpool-utils.h | 13 | ||||
-rw-r--r-- | src/windows.c | 6 |
4 files changed, 38 insertions, 20 deletions
diff --git a/src/pthreads.c b/src/pthreads.c index 2d945a0..430ca79 100644 --- a/src/pthreads.c +++ b/src/pthreads.c @@ -108,8 +108,7 @@ static void wait_worker_threads(struct pthreadpool* threadpool) { /* Spin-wait */ for (uint32_t i = PTHREADPOOL_SPIN_WAIT_ITERATIONS; i != 0; i--) { - /* This fence serves as a sleep instruction */ - pthreadpool_fence_acquire(); + pthreadpool_yield(); #if PTHREADPOOL_USE_FUTEX has_active_threads = pthreadpool_load_acquire_uint32_t(&threadpool->has_active_threads); @@ -151,8 +150,7 @@ static uint32_t wait_for_new_command( if ((last_flags & PTHREADPOOL_FLAG_YIELD_WORKERS) == 0) { /* Spin-wait loop */ for (uint32_t i = PTHREADPOOL_SPIN_WAIT_ITERATIONS; i != 0; i--) { - /* This fence serves as a sleep instruction */ - pthreadpool_fence_acquire(); + pthreadpool_yield(); command = pthreadpool_load_acquire_uint32_t(&threadpool->command); if (command != last_command) { diff --git a/src/threadpool-atomics.h b/src/threadpool-atomics.h index 135166f..a8491e4 100644 --- a/src/threadpool-atomics.h +++ b/src/threadpool-atomics.h @@ -4,12 +4,19 @@ #include <stddef.h> #include <stdint.h> +/* SSE-specific headers */ +#if defined(__i386__) || defined(__i686__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) + #include <xmmintrin.h> +#endif + +/* ARM-specific headers */ +#if defined(__ARM_ACLE) + #include <arm_acle.h> +#endif + /* MSVC-specific headers */ #ifdef _MSC_VER #include <intrin.h> - #if defined(_M_IX86) || defined(_M_X64) || defined(_M_AMD64) - #include <immintrin.h> - #endif #endif @@ -123,7 +130,7 @@ static inline void pthreadpool_fence_release() { __c11_atomic_thread_fence(__ATOMIC_RELEASE); } -#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64)) +#elif defined(_MSC_VER) && defined(_M_X64) typedef volatile uint32_t pthreadpool_atomic_uint32_t; typedef volatile size_t pthreadpool_atomic_size_t; typedef void *volatile pthreadpool_atomic_void_p; @@ -701,3 +708,21 @@ #else #error "Platform-specific implementation of threadpool-atomics.h required" #endif + +#if defined(__i386__) || defined(__i686__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) + static inline void pthreadpool_yield() { + _mm_pause(); + } +#elif defined(__ARM_ACLE) || defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64)) + static inline void pthreadpool_yield() { + __yield(); + } +#elif defined(__GNUC__) && (defined(__arm__) || defined(__aarch64__)) + static inline void pthreadpool_yield() { + __asm__ __volatile__("yield"); + } +#else + static inline void pthreadpool_yield() { + pthreadpool_fence_acquire(); + } +#endif diff --git a/src/threadpool-utils.h b/src/threadpool-utils.h index d95e3a5..91e2445 100644 --- a/src/threadpool-utils.h +++ b/src/threadpool-utils.h @@ -4,21 +4,18 @@ #include <stddef.h> /* SSE-specific headers */ -#if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1) +#if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1) #include <xmmintrin.h> #endif /* MSVC-specific headers */ #if defined(_MSC_VER) #include <intrin.h> - #if defined(_M_IX86) || defined(_M_X64) || defined(_M_AMD64) - #include <immintrin.h> - #endif #endif struct fpu_state { -#if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1) +#if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1) uint32_t mxcsr; #elif defined(__GNUC__) && defined(__arm__) && defined(__ARM_FP) && (__ARM_FP != 0) || defined(_MSC_VER) && defined(_M_ARM) uint32_t fpscr; @@ -31,7 +28,7 @@ struct fpu_state { static inline struct fpu_state get_fpu_state() { struct fpu_state state = { 0 }; -#if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1) +#if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1) state.mxcsr = (uint32_t) _mm_getcsr(); #elif defined(_MSC_VER) && defined(_M_ARM) state.fpscr = (uint32_t) _MoveFromCoprocessor(10, 7, 1, 0, 0); @@ -46,7 +43,7 @@ static inline struct fpu_state get_fpu_state() { } static inline void set_fpu_state(const struct fpu_state state) { -#if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1) +#if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1) _mm_setcsr((unsigned int) state.mxcsr); #elif defined(_MSC_VER) && defined(_M_ARM) _MoveToCoprocessor((int) state.fpscr, 10, 7, 1, 0, 0); @@ -60,7 +57,7 @@ static inline void set_fpu_state(const struct fpu_state state) { } static inline void disable_fpu_denormals() { -#if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1) +#if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1) _mm_setcsr(_mm_getcsr() | 0x8040); #elif defined(_MSC_VER) && defined(_M_ARM) int fpscr = _MoveFromCoprocessor(10, 7, 1, 0, 0); diff --git a/src/windows.c b/src/windows.c index 19e534f..c9b88f7 100644 --- a/src/windows.c +++ b/src/windows.c @@ -35,8 +35,7 @@ static void wait_worker_threads(struct pthreadpool* threadpool, uint32_t event_i /* Spin-wait */ for (uint32_t i = PTHREADPOOL_SPIN_WAIT_ITERATIONS; i != 0; i--) { - /* This fence serves as a sleep instruction */ - pthreadpool_fence_acquire(); + pthreadpool_yield(); active_threads = pthreadpool_load_acquire_size_t(&threadpool->active_threads); if (active_threads == 0) { @@ -63,8 +62,7 @@ static uint32_t wait_for_new_command( if ((last_flags & PTHREADPOOL_FLAG_YIELD_WORKERS) == 0) { /* Spin-wait loop */ for (uint32_t i = PTHREADPOOL_SPIN_WAIT_ITERATIONS; i != 0; i--) { - /* This fence serves as a sleep instruction */ - pthreadpool_fence_acquire(); + pthreadpool_yield(); command = pthreadpool_load_acquire_uint32_t(&threadpool->command); if (command != last_command) { |