aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMarat Dukhan <maratek@google.com>2020-05-08 20:45:20 -0700
committerMarat Dukhan <maratek@google.com>2020-05-08 20:45:20 -0700
commit6ae95d3d7b407ac310e59958a92bc8be4583340e (patch)
treefb60f8d8c67d7c3290158d9926d4b4944d1758f0
parent5690b5ceada160444a916d31ef72e381f5e52d67 (diff)
downloadpthreadpool-6ae95d3d7b407ac310e59958a92bc8be4583340e.tar.gz
Use platform-specific yield/pause instructions
-rw-r--r--src/pthreads.c6
-rw-r--r--src/threadpool-atomics.h33
-rw-r--r--src/threadpool-utils.h13
-rw-r--r--src/windows.c6
4 files changed, 38 insertions, 20 deletions
diff --git a/src/pthreads.c b/src/pthreads.c
index 2d945a0..430ca79 100644
--- a/src/pthreads.c
+++ b/src/pthreads.c
@@ -108,8 +108,7 @@ static void wait_worker_threads(struct pthreadpool* threadpool) {
/* Spin-wait */
for (uint32_t i = PTHREADPOOL_SPIN_WAIT_ITERATIONS; i != 0; i--) {
- /* This fence serves as a sleep instruction */
- pthreadpool_fence_acquire();
+ pthreadpool_yield();
#if PTHREADPOOL_USE_FUTEX
has_active_threads = pthreadpool_load_acquire_uint32_t(&threadpool->has_active_threads);
@@ -151,8 +150,7 @@ static uint32_t wait_for_new_command(
if ((last_flags & PTHREADPOOL_FLAG_YIELD_WORKERS) == 0) {
/* Spin-wait loop */
for (uint32_t i = PTHREADPOOL_SPIN_WAIT_ITERATIONS; i != 0; i--) {
- /* This fence serves as a sleep instruction */
- pthreadpool_fence_acquire();
+ pthreadpool_yield();
command = pthreadpool_load_acquire_uint32_t(&threadpool->command);
if (command != last_command) {
diff --git a/src/threadpool-atomics.h b/src/threadpool-atomics.h
index 135166f..a8491e4 100644
--- a/src/threadpool-atomics.h
+++ b/src/threadpool-atomics.h
@@ -4,12 +4,19 @@
#include <stddef.h>
#include <stdint.h>
+/* SSE-specific headers */
+#if defined(__i386__) || defined(__i686__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
+ #include <xmmintrin.h>
+#endif
+
+/* ARM-specific headers */
+#if defined(__ARM_ACLE)
+ #include <arm_acle.h>
+#endif
+
/* MSVC-specific headers */
#ifdef _MSC_VER
#include <intrin.h>
- #if defined(_M_IX86) || defined(_M_X64) || defined(_M_AMD64)
- #include <immintrin.h>
- #endif
#endif
@@ -123,7 +130,7 @@
static inline void pthreadpool_fence_release() {
__c11_atomic_thread_fence(__ATOMIC_RELEASE);
}
-#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64))
+#elif defined(_MSC_VER) && defined(_M_X64)
typedef volatile uint32_t pthreadpool_atomic_uint32_t;
typedef volatile size_t pthreadpool_atomic_size_t;
typedef void *volatile pthreadpool_atomic_void_p;
@@ -701,3 +708,21 @@
#else
#error "Platform-specific implementation of threadpool-atomics.h required"
#endif
+
+#if defined(__i386__) || defined(__i686__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
+ static inline void pthreadpool_yield() {
+ _mm_pause();
+ }
+#elif defined(__ARM_ACLE) || defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64))
+ static inline void pthreadpool_yield() {
+ __yield();
+ }
+#elif defined(__GNUC__) && (defined(__arm__) || defined(__aarch64__))
+ static inline void pthreadpool_yield() {
+ __asm__ __volatile__("yield");
+ }
+#else
+ static inline void pthreadpool_yield() {
+ pthreadpool_fence_acquire();
+ }
+#endif
diff --git a/src/threadpool-utils.h b/src/threadpool-utils.h
index d95e3a5..91e2445 100644
--- a/src/threadpool-utils.h
+++ b/src/threadpool-utils.h
@@ -4,21 +4,18 @@
#include <stddef.h>
/* SSE-specific headers */
-#if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1)
+#if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1)
#include <xmmintrin.h>
#endif
/* MSVC-specific headers */
#if defined(_MSC_VER)
#include <intrin.h>
- #if defined(_M_IX86) || defined(_M_X64) || defined(_M_AMD64)
- #include <immintrin.h>
- #endif
#endif
struct fpu_state {
-#if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1)
+#if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1)
uint32_t mxcsr;
#elif defined(__GNUC__) && defined(__arm__) && defined(__ARM_FP) && (__ARM_FP != 0) || defined(_MSC_VER) && defined(_M_ARM)
uint32_t fpscr;
@@ -31,7 +28,7 @@ struct fpu_state {
static inline struct fpu_state get_fpu_state() {
struct fpu_state state = { 0 };
-#if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1)
+#if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1)
state.mxcsr = (uint32_t) _mm_getcsr();
#elif defined(_MSC_VER) && defined(_M_ARM)
state.fpscr = (uint32_t) _MoveFromCoprocessor(10, 7, 1, 0, 0);
@@ -46,7 +43,7 @@ static inline struct fpu_state get_fpu_state() {
}
static inline void set_fpu_state(const struct fpu_state state) {
-#if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1)
+#if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1)
_mm_setcsr((unsigned int) state.mxcsr);
#elif defined(_MSC_VER) && defined(_M_ARM)
_MoveToCoprocessor((int) state.fpscr, 10, 7, 1, 0, 0);
@@ -60,7 +57,7 @@ static inline void set_fpu_state(const struct fpu_state state) {
}
static inline void disable_fpu_denormals() {
-#if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1)
+#if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1)
_mm_setcsr(_mm_getcsr() | 0x8040);
#elif defined(_MSC_VER) && defined(_M_ARM)
int fpscr = _MoveFromCoprocessor(10, 7, 1, 0, 0);
diff --git a/src/windows.c b/src/windows.c
index 19e534f..c9b88f7 100644
--- a/src/windows.c
+++ b/src/windows.c
@@ -35,8 +35,7 @@ static void wait_worker_threads(struct pthreadpool* threadpool, uint32_t event_i
/* Spin-wait */
for (uint32_t i = PTHREADPOOL_SPIN_WAIT_ITERATIONS; i != 0; i--) {
- /* This fence serves as a sleep instruction */
- pthreadpool_fence_acquire();
+ pthreadpool_yield();
active_threads = pthreadpool_load_acquire_size_t(&threadpool->active_threads);
if (active_threads == 0) {
@@ -63,8 +62,7 @@ static uint32_t wait_for_new_command(
if ((last_flags & PTHREADPOOL_FLAG_YIELD_WORKERS) == 0) {
/* Spin-wait loop */
for (uint32_t i = PTHREADPOOL_SPIN_WAIT_ITERATIONS; i != 0; i--) {
- /* This fence serves as a sleep instruction */
- pthreadpool_fence_acquire();
+ pthreadpool_yield();
command = pthreadpool_load_acquire_uint32_t(&threadpool->command);
if (command != last_command) {