From 0672a7e1eb789b35709037ab5231d8d8a0f3bafc Mon Sep 17 00:00:00 2001 From: Marat Dukhan Date: Sat, 2 May 2020 22:29:51 -0700 Subject: Fast path using atomic decrement instead of atomic compare-and-swap. 50% higher throughput on x86 (disabled on other platforms). --- src/threadpool-object.h | 52 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) (limited to 'src/threadpool-object.h') diff --git a/src/threadpool-object.h b/src/threadpool-object.h index 239d116..7b643c6 100644 --- a/src/threadpool-object.h +++ b/src/threadpool-object.h @@ -526,3 +526,55 @@ PTHREADPOOL_INTERNAL void pthreadpool_parallelize( void* context, size_t linear_range, uint32_t flags); + +PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_1d_fastpath( + struct pthreadpool* threadpool, + struct thread_info* thread); + +PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_1d_with_uarch_fastpath( + struct pthreadpool* threadpool, + struct thread_info* thread); + +PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_1d_tile_1d_fastpath( + struct pthreadpool* threadpool, + struct thread_info* thread); + +PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_2d_fastpath( + struct pthreadpool* threadpool, + struct thread_info* thread); + +PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_2d_tile_1d_fastpath( + struct pthreadpool* threadpool, + struct thread_info* thread); + +PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_2d_tile_2d_fastpath( + struct pthreadpool* threadpool, + struct thread_info* thread); + +PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_2d_tile_2d_with_uarch_fastpath( + struct pthreadpool* threadpool, + struct thread_info* thread); + +PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_3d_tile_2d_fastpath( + struct pthreadpool* threadpool, + struct thread_info* thread); + +PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_3d_tile_2d_with_uarch_fastpath( + struct pthreadpool* threadpool, + struct 
thread_info* thread); + +PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_4d_tile_2d_fastpath( + struct pthreadpool* threadpool, + struct thread_info* thread); + +PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_4d_tile_2d_with_uarch_fastpath( + struct pthreadpool* threadpool, + struct thread_info* thread); + +PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_5d_tile_2d_fastpath( + struct pthreadpool* threadpool, + struct thread_info* thread); + +PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_6d_tile_2d_fastpath( + struct pthreadpool* threadpool, + struct thread_info* thread); -- cgit v1.2.3 From bfe07ff3d9ed6eb5e7803b9761c85b254a417742 Mon Sep 17 00:00:00 2001 From: Marat Dukhan Date: Tue, 26 May 2020 09:41:08 -0700 Subject: 3D/4D/5D parallelization functions with 1D or no tiling --- src/threadpool-object.h | 160 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 160 insertions(+) (limited to 'src/threadpool-object.h') diff --git a/src/threadpool-object.h b/src/threadpool-object.h index 7b643c6..9870e8a 100644 --- a/src/threadpool-object.h +++ b/src/threadpool-object.h @@ -179,6 +179,36 @@ struct pthreadpool_2d_tile_2d_with_uarch_params { struct fxdiv_divisor_size_t tile_range_j; }; +struct pthreadpool_3d_params { + /** + * FXdiv divisor for the range_j argument passed to the pthreadpool_parallelize_3d function. + */ + struct fxdiv_divisor_size_t range_j; + /** + * FXdiv divisor for the range_k argument passed to the pthreadpool_parallelize_3d function. + */ + struct fxdiv_divisor_size_t range_k; +}; + +struct pthreadpool_3d_tile_1d_params { + /** + * Copy of the range_k argument passed to the pthreadpool_parallelize_3d_tile_1d function. + */ + size_t range_k; + /** + * Copy of the tile_k argument passed to the pthreadpool_parallelize_3d_tile_1d function. + */ + size_t tile_k; + /** + * FXdiv divisor for the range_j argument passed to the pthreadpool_parallelize_3d_tile_1d function. 
+ */ + struct fxdiv_divisor_size_t range_j; + /** + * FXdiv divisor for the divide_round_up(range_k, tile_k) value. + */ + struct fxdiv_divisor_size_t tile_range_k; +}; + struct pthreadpool_3d_tile_2d_params { /** * Copy of the range_j argument passed to the pthreadpool_parallelize_3d_tile_2d function. @@ -241,6 +271,52 @@ struct pthreadpool_3d_tile_2d_with_uarch_params { struct fxdiv_divisor_size_t tile_range_k; }; +struct pthreadpool_4d_params { + /** + * Copy of the range_k argument passed to the pthreadpool_parallelize_4d function. + */ + size_t range_k; + /** + * FXdiv divisor for the range_j argument passed to the pthreadpool_parallelize_4d function. + */ + struct fxdiv_divisor_size_t range_j; + /** + * FXdiv divisor for the range_k * range_l value. + */ + struct fxdiv_divisor_size_t range_kl; + /** + * FXdiv divisor for the range_l argument passed to the pthreadpool_parallelize_4d function. + */ + struct fxdiv_divisor_size_t range_l; +}; + +struct pthreadpool_4d_tile_1d_params { + /** + * Copy of the range_k argument passed to the pthreadpool_parallelize_4d_tile_1d function. + */ + size_t range_k; + /** + * Copy of the range_l argument passed to the pthreadpool_parallelize_4d_tile_1d function. + */ + size_t range_l; + /** + * Copy of the tile_l argument passed to the pthreadpool_parallelize_4d_tile_1d function. + */ + size_t tile_l; + /** + * FXdiv divisor for the range_j argument passed to the pthreadpool_parallelize_4d_tile_1d function. + */ + struct fxdiv_divisor_size_t range_j; + /** + * FXdiv divisor for the range_k * divide_round_up(range_l, tile_l) value. + */ + struct fxdiv_divisor_size_t tile_range_kl; + /** + * FXdiv divisor for the divide_round_up(range_l, tile_l) value. + */ + struct fxdiv_divisor_size_t tile_range_l; +}; + struct pthreadpool_4d_tile_2d_params { /** * Copy of the range_k argument passed to the pthreadpool_parallelize_4d_tile_2d function. 
@@ -311,6 +387,60 @@ struct pthreadpool_4d_tile_2d_with_uarch_params { struct fxdiv_divisor_size_t tile_range_l; }; +struct pthreadpool_5d_params { + /** + * Copy of the range_l argument passed to the pthreadpool_parallelize_5d function. + */ + size_t range_l; + /** + * FXdiv divisor for the range_j argument passed to the pthreadpool_parallelize_5d function. + */ + struct fxdiv_divisor_size_t range_j; + /** + * FXdiv divisor for the range_k argument passed to the pthreadpool_parallelize_5d function. + */ + struct fxdiv_divisor_size_t range_k; + /** + * FXdiv divisor for the range_l * range_m value. + */ + struct fxdiv_divisor_size_t range_lm; + /** + * FXdiv divisor for the range_m argument passed to the pthreadpool_parallelize_5d function. + */ + struct fxdiv_divisor_size_t range_m; +}; + +struct pthreadpool_5d_tile_1d_params { + /** + * Copy of the range_k argument passed to the pthreadpool_parallelize_5d_tile_1d function. + */ + size_t range_k; + /** + * Copy of the range_m argument passed to the pthreadpool_parallelize_5d_tile_1d function. + */ + size_t range_m; + /** + * Copy of the tile_m argument passed to the pthreadpool_parallelize_5d_tile_1d function. + */ + size_t tile_m; + /** + * FXdiv divisor for the range_j argument passed to the pthreadpool_parallelize_5d_tile_1d function. + */ + struct fxdiv_divisor_size_t range_j; + /** + * FXdiv divisor for the range_k * range_l value. + */ + struct fxdiv_divisor_size_t range_kl; + /** + * FXdiv divisor for the range_l argument passed to the pthreadpool_parallelize_5d_tile_1d function. + */ + struct fxdiv_divisor_size_t range_l; + /** + * FXdiv divisor for the divide_round_up(range_m, tile_m) value. + */ + struct fxdiv_divisor_size_t tile_range_m; +}; + struct pthreadpool_5d_tile_2d_params { /** * Copy of the range_l argument passed to the pthreadpool_parallelize_5d_tile_2d function. 
@@ -434,10 +564,16 @@ struct PTHREADPOOL_CACHELINE_ALIGNED pthreadpool { struct pthreadpool_2d_tile_1d_params parallelize_2d_tile_1d; struct pthreadpool_2d_tile_2d_params parallelize_2d_tile_2d; struct pthreadpool_2d_tile_2d_with_uarch_params parallelize_2d_tile_2d_with_uarch; + struct pthreadpool_3d_params parallelize_3d; + struct pthreadpool_3d_tile_1d_params parallelize_3d_tile_1d; struct pthreadpool_3d_tile_2d_params parallelize_3d_tile_2d; struct pthreadpool_3d_tile_2d_with_uarch_params parallelize_3d_tile_2d_with_uarch; + struct pthreadpool_4d_params parallelize_4d; + struct pthreadpool_4d_tile_1d_params parallelize_4d_tile_1d; struct pthreadpool_4d_tile_2d_params parallelize_4d_tile_2d; struct pthreadpool_4d_tile_2d_with_uarch_params parallelize_4d_tile_2d_with_uarch; + struct pthreadpool_5d_params parallelize_5d; + struct pthreadpool_5d_tile_1d_params parallelize_5d_tile_1d; struct pthreadpool_5d_tile_2d_params parallelize_5d_tile_2d; struct pthreadpool_6d_tile_2d_params parallelize_6d_tile_2d; } params; @@ -555,6 +691,14 @@ PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_2d_tile_2d_with_uarch_f struct pthreadpool* threadpool, struct thread_info* thread); +PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_3d_fastpath( + struct pthreadpool* threadpool, + struct thread_info* thread); + +PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_3d_tile_1d_fastpath( + struct pthreadpool* threadpool, + struct thread_info* thread); + PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_3d_tile_2d_fastpath( struct pthreadpool* threadpool, struct thread_info* thread); @@ -563,6 +707,14 @@ PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_3d_tile_2d_with_uarch_f struct pthreadpool* threadpool, struct thread_info* thread); +PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_4d_fastpath( + struct pthreadpool* threadpool, + struct thread_info* thread); + +PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_4d_tile_1d_fastpath( + 
struct pthreadpool* threadpool, + struct thread_info* thread); + PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_4d_tile_2d_fastpath( struct pthreadpool* threadpool, struct thread_info* thread); @@ -571,6 +723,14 @@ PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_4d_tile_2d_with_uarch_f struct pthreadpool* threadpool, struct thread_info* thread); +PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_5d_fastpath( + struct pthreadpool* threadpool, + struct thread_info* thread); + +PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_5d_tile_1d_fastpath( + struct pthreadpool* threadpool, + struct thread_info* thread); + PTHREADPOOL_INTERNAL void pthreadpool_thread_parallelize_5d_tile_2d_fastpath( struct pthreadpool* threadpool, struct thread_info* thread); -- cgit v1.2.3