diff options
author | Marat Dukhan <maratek@google.com> | 2020-05-26 09:41:08 -0700 |
---|---|---|
committer | Marat Dukhan <maratek@google.com> | 2020-05-26 09:41:08 -0700 |
commit | bfe07ff3d9ed6eb5e7803b9761c85b254a417742 (patch) | |
tree | 67d36b2fdee49d80dfdba2e9be39251da7a7cd42 /src/portable-api.c | |
parent | afb880df0639945c854d70269f6b403cb81518b5 (diff) | |
download | pthreadpool-bfe07ff3d9ed6eb5e7803b9761c85b254a417742.tar.gz |
3D/4D/5D parallelization functions with 1D or no tiling
Diffstat (limited to 'src/portable-api.c')
-rw-r--r-- | src/portable-api.c | 667 |
1 files changed, 667 insertions, 0 deletions
diff --git a/src/portable-api.c b/src/portable-api.c index 6b16674..6a8ccf2 100644 --- a/src/portable-api.c +++ b/src/portable-api.c @@ -328,6 +328,106 @@ static void thread_parallelize_2d_tile_2d_with_uarch(struct pthreadpool* threadp pthreadpool_fence_release(); } +static void thread_parallelize_3d(struct pthreadpool* threadpool, struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + const pthreadpool_task_3d_t task = (pthreadpool_task_3d_t) pthreadpool_load_relaxed_void_p(&threadpool->task); + void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + + /* Process thread's own range of items */ + const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start); + const struct fxdiv_divisor_size_t range_k = threadpool->params.parallelize_3d.range_k; + const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(range_start, range_k); + const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_3d.range_j; + const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j); + size_t i = index_i_j.quotient; + size_t j = index_i_j.remainder; + size_t k = index_ij_k.remainder; + + while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) { + task(argument, i, j, k); + if (++k == range_k.value) { + k = 0; + if (++j == range_j.value) { + j = 0; + i += 1; + } + } + } + + /* There still may be other threads with work */ + const size_t thread_number = thread->thread_number; + const size_t threads_count = threadpool->threads_count.value; + for (size_t tid = modulo_decrement(thread_number, threads_count); + tid != thread_number; + tid = modulo_decrement(tid, threads_count)) + { + struct thread_info* other_thread = &threadpool->threads[tid]; + while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) { + const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); + const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(linear_index, range_k); + const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j); + task(argument, index_i_j.quotient, index_i_j.remainder, index_ij_k.remainder); + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); +} + +static void thread_parallelize_3d_tile_1d(struct pthreadpool* threadpool, struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + const pthreadpool_task_3d_tile_1d_t task = (pthreadpool_task_3d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task); + void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + + /* Process thread's own range of items */ + const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start); + const struct fxdiv_divisor_size_t tile_range_k = threadpool->params.parallelize_3d_tile_1d.tile_range_k; + const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(range_start, tile_range_k); + const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_3d_tile_1d.range_j; + const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, range_j); + const size_t tile_k = threadpool->params.parallelize_3d_tile_1d.tile_k; + size_t i = index_i_j.quotient; + size_t j = index_i_j.remainder; + size_t start_k = tile_index_ij_k.remainder * tile_k; + + const size_t range_k = threadpool->params.parallelize_3d_tile_1d.range_k; + while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) { + task(argument, i, j, start_k, min(range_k - start_k, tile_k)); + start_k += tile_k; + if (start_k >= range_k) { + start_k = 0; + if (++j == range_j.value) { + j = 0; + i += 1; + } + } + } + + /* There still may be other threads with work */ + const size_t thread_number = thread->thread_number; + const size_t threads_count = threadpool->threads_count.value; + for (size_t tid = modulo_decrement(thread_number, threads_count); + tid != thread_number; + tid = modulo_decrement(tid, threads_count)) + { + struct thread_info* other_thread = &threadpool->threads[tid]; + while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) { + const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); + const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(linear_index, tile_range_k); + const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, range_j); + const size_t start_k = tile_index_ij_k.remainder * tile_k; + task(argument, index_i_j.quotient, index_i_j.remainder, start_k, min(range_k - start_k, tile_k)); + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); +} + static void thread_parallelize_3d_tile_2d(struct pthreadpool* threadpool, struct thread_info* thread) { assert(threadpool != NULL); assert(thread != NULL); @@ -449,6 +549,122 @@ static void thread_parallelize_3d_tile_2d_with_uarch(struct pthreadpool* threadp pthreadpool_fence_release(); } +static void thread_parallelize_4d(struct pthreadpool* threadpool, struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + const pthreadpool_task_4d_t task = (pthreadpool_task_4d_t) pthreadpool_load_relaxed_void_p(&threadpool->task); + void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + + /* Process thread's own range of items */ + const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start); + const struct fxdiv_divisor_size_t range_kl = threadpool->params.parallelize_4d.range_kl; + const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(range_start, range_kl); + const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_4d.range_j; + const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j); + const struct fxdiv_divisor_size_t range_l = threadpool->params.parallelize_4d.range_l; + const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l); + size_t i = index_i_j.quotient; + size_t j = index_i_j.remainder; + size_t k = index_k_l.quotient; + size_t l = index_k_l.remainder; + + const size_t range_k = threadpool->params.parallelize_4d.range_k; + while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) { + task(argument, i, j, k, l); + if (++l == range_l.value) { + l = 0; + if (++k == range_k) { + k = 0; + if (++j == range_j.value) { + j = 0; + i += 1; + } + } + } + } + + /* There still may be other threads with work */ + const size_t thread_number = thread->thread_number; + const size_t threads_count = threadpool->threads_count.value; + for (size_t tid = modulo_decrement(thread_number, threads_count); + tid != thread_number; + tid = modulo_decrement(tid, threads_count)) + { + struct thread_info* other_thread = &threadpool->threads[tid]; + while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) { + const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); + const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(linear_index, range_kl); + const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j); + const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l); + task(argument, index_i_j.quotient, index_i_j.remainder, index_k_l.quotient, index_k_l.remainder); + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); +} + +static void thread_parallelize_4d_tile_1d(struct pthreadpool* threadpool, struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + const pthreadpool_task_4d_tile_1d_t task = (pthreadpool_task_4d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task); + void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + + /* Process thread's own range of items */ + const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start); + const struct fxdiv_divisor_size_t tile_range_kl = threadpool->params.parallelize_4d_tile_1d.tile_range_kl; + const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(range_start, tile_range_kl); + const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_4d_tile_1d.range_j; + const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j); + const struct fxdiv_divisor_size_t tile_range_l = threadpool->params.parallelize_4d_tile_1d.tile_range_l; + const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l); + const size_t tile_l = threadpool->params.parallelize_4d_tile_1d.tile_l; + size_t i = index_i_j.quotient; + size_t j = index_i_j.remainder; + size_t k = tile_index_k_l.quotient; + size_t start_l = tile_index_k_l.remainder * tile_l; + + const size_t range_k = threadpool->params.parallelize_4d_tile_1d.range_k; + const size_t range_l = threadpool->params.parallelize_4d_tile_1d.range_l; + while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) { + task(argument, i, j, k, start_l, min(range_l - start_l, tile_l)); + start_l += tile_l; + if (start_l >= range_l) { + start_l = 0; + if (++k == range_k) { + k = 0; + if (++j == range_j.value) { + j = 0; + i += 1; + } + } + } + } + + /* There still may be other threads with work */ + const size_t thread_number = thread->thread_number; + const size_t threads_count = threadpool->threads_count.value; + for (size_t tid = modulo_decrement(thread_number, threads_count); + tid != thread_number; + tid = modulo_decrement(tid, threads_count)) + { + struct thread_info* other_thread = &threadpool->threads[tid]; + while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) { + const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); + const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(linear_index, tile_range_kl); + const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j); + const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l); + const size_t start_l = tile_index_k_l.remainder * tile_l; + task(argument, index_i_j.quotient, index_i_j.remainder, tile_index_k_l.quotient, start_l, min(range_l - start_l, tile_l)); + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); +} + static void thread_parallelize_4d_tile_2d(struct pthreadpool* threadpool, struct thread_info* thread) { assert(threadpool != NULL); assert(thread != NULL); @@ -584,6 +800,137 @@ static void thread_parallelize_4d_tile_2d_with_uarch(struct pthreadpool* threadp pthreadpool_fence_release(); } +static void thread_parallelize_5d(struct pthreadpool* threadpool, struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + const pthreadpool_task_5d_t task = (pthreadpool_task_5d_t) pthreadpool_load_relaxed_void_p(&threadpool->task); + void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + + /* Process thread's own range of items */ + const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start); + const struct fxdiv_divisor_size_t range_lm = threadpool->params.parallelize_5d.range_lm; + const struct fxdiv_result_size_t index_ijk_lm = fxdiv_divide_size_t(range_start, range_lm); + const struct fxdiv_divisor_size_t range_k = threadpool->params.parallelize_5d.range_k; + const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(index_ijk_lm.quotient, range_k); + const struct fxdiv_divisor_size_t range_m = threadpool->params.parallelize_5d.range_m; + const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(index_ijk_lm.remainder, range_m); + const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_5d.range_j; + const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j); + size_t i = index_i_j.quotient; + size_t j = index_i_j.remainder; + size_t k = index_ij_k.remainder; + size_t l = index_l_m.quotient; + size_t m = index_l_m.remainder; + + const size_t range_l = threadpool->params.parallelize_5d.range_l; + while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) { + task(argument, i, j, k, l, m); + if (++m == range_m.value) { + m = 0; + if (++l == range_l) { + l = 0; + if (++k == range_k.value) { + k = 0; + if (++j == range_j.value) { + j = 0; + i += 1; + } + } + } + } + } + + /* There still may be other threads with work */ + const size_t thread_number = thread->thread_number; + const size_t threads_count = threadpool->threads_count.value; + for (size_t tid = modulo_decrement(thread_number, threads_count); + tid != thread_number; + tid = modulo_decrement(tid, threads_count)) + { + struct thread_info* other_thread = &threadpool->threads[tid]; + while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) { + const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); + const struct fxdiv_result_size_t index_ijk_lm = fxdiv_divide_size_t(linear_index, range_lm); + const struct fxdiv_result_size_t index_ij_k = fxdiv_divide_size_t(index_ijk_lm.quotient, range_k); + const struct fxdiv_result_size_t index_l_m = fxdiv_divide_size_t(index_ijk_lm.remainder, range_m); + const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_k.quotient, range_j); + task(argument, index_i_j.quotient, index_i_j.remainder, index_ij_k.remainder, index_l_m.quotient, index_l_m.remainder); + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); +} + +static void thread_parallelize_5d_tile_1d(struct pthreadpool* threadpool, struct thread_info* thread) { + assert(threadpool != NULL); + assert(thread != NULL); + + const pthreadpool_task_5d_tile_1d_t task = (pthreadpool_task_5d_tile_1d_t) pthreadpool_load_relaxed_void_p(&threadpool->task); + void *const argument = pthreadpool_load_relaxed_void_p(&threadpool->argument); + + /* Process thread's own range of items */ + const size_t range_start = pthreadpool_load_relaxed_size_t(&thread->range_start); + const struct fxdiv_divisor_size_t tile_range_m = threadpool->params.parallelize_5d_tile_1d.tile_range_m; + const struct fxdiv_result_size_t tile_index_ijkl_m = fxdiv_divide_size_t(range_start, tile_range_m); + const struct fxdiv_divisor_size_t range_kl = threadpool->params.parallelize_5d_tile_1d.range_kl; + const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(tile_index_ijkl_m.quotient, range_kl); + const struct fxdiv_divisor_size_t range_j = threadpool->params.parallelize_5d_tile_1d.range_j; + const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j); + const struct fxdiv_divisor_size_t range_l = threadpool->params.parallelize_5d_tile_1d.range_l; + const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l); + const size_t tile_m = threadpool->params.parallelize_5d_tile_1d.tile_m; + size_t i = index_i_j.quotient; + size_t j = index_i_j.remainder; + size_t k = index_k_l.quotient; + size_t l = index_k_l.remainder; + size_t start_m = tile_index_ijkl_m.remainder * tile_m; + + const size_t range_m = threadpool->params.parallelize_5d_tile_1d.range_m; + const size_t range_k = threadpool->params.parallelize_5d_tile_1d.range_k; + while (pthreadpool_try_decrement_relaxed_size_t(&thread->range_length)) { + task(argument, i, j, k, l, start_m, min(range_m - start_m, tile_m)); + start_m += tile_m; + if (start_m >= range_m) { + start_m = 0; + if (++l == range_l.value) { + l = 0; + if (++k == range_k) { + k = 0; + if (++j == range_j.value) { + j = 0; + i += 1; + } + } + } + } + } + + /* There still may be other threads with work */ + const size_t thread_number = thread->thread_number; + const size_t threads_count = threadpool->threads_count.value; + for (size_t tid = modulo_decrement(thread_number, threads_count); + tid != thread_number; + tid = modulo_decrement(tid, threads_count)) + { + struct thread_info* other_thread = &threadpool->threads[tid]; + while (pthreadpool_try_decrement_relaxed_size_t(&other_thread->range_length)) { + const size_t linear_index = pthreadpool_decrement_fetch_relaxed_size_t(&other_thread->range_end); + const struct fxdiv_result_size_t tile_index_ijkl_m = fxdiv_divide_size_t(linear_index, tile_range_m); + const struct fxdiv_result_size_t index_ij_kl = fxdiv_divide_size_t(tile_index_ijkl_m.quotient, range_kl); + const struct fxdiv_result_size_t index_i_j = fxdiv_divide_size_t(index_ij_kl.quotient, range_j); + const struct fxdiv_result_size_t index_k_l = fxdiv_divide_size_t(index_ij_kl.remainder, range_l); + size_t start_m = tile_index_ijkl_m.remainder * tile_m; + task(argument, index_i_j.quotient, index_i_j.remainder, index_k_l.quotient, index_k_l.remainder, start_m, + min(range_m - start_m, tile_m)); + } + } + + /* Make changes by this thread visible to other threads */ + pthreadpool_fence_release(); +} + static void thread_parallelize_5d_tile_2d(struct pthreadpool* threadpool, struct thread_info* thread) { assert(threadpool != NULL); assert(thread != NULL); @@ -1061,6 +1408,102 @@ void pthreadpool_parallelize_2d_tile_2d_with_uarch( } } +void pthreadpool_parallelize_3d( + pthreadpool_t threadpool, + pthreadpool_task_3d_t task, + void* argument, + size_t range_i, + size_t range_j, + size_t range_k, + uint32_t flags) +{ + size_t threads_count; + if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || (range_i | range_j | range_k) <= 1) { + /* No thread pool used: execute task sequentially on the calling thread */ + struct fpu_state saved_fpu_state = { 0 }; + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + saved_fpu_state = get_fpu_state(); + disable_fpu_denormals(); + } + for (size_t i = 0; i < range_i; i++) { + for (size_t j = 0; j < range_j; j++) { + for (size_t k = 0; k < range_k; k++) { + task(argument, i, j, k); + } + } + } + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + set_fpu_state(saved_fpu_state); + } + } else { + const size_t range = range_i * range_j * range_k; + const struct pthreadpool_3d_params params = { + .range_j = fxdiv_init_size_t(range_j), + .range_k = fxdiv_init_size_t(range_k), + }; + thread_function_t parallelize_3d = &thread_parallelize_3d; + #if PTHREADPOOL_USE_FASTPATH + const size_t range_threshold = -threads_count; + if (range < range_threshold) { + parallelize_3d = &pthreadpool_thread_parallelize_3d_fastpath; + } + #endif + pthreadpool_parallelize( + threadpool, parallelize_3d, ¶ms, sizeof(params), + task, argument, range, flags); + } +} + +void pthreadpool_parallelize_3d_tile_1d( + pthreadpool_t threadpool, + pthreadpool_task_3d_tile_1d_t task, + void* argument, + size_t range_i, + size_t range_j, + size_t range_k, + size_t tile_k, + uint32_t flags) +{ + size_t threads_count; + if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || ((range_i | range_j) <= 1 && range_k <= tile_k)) { + /* No thread pool used: execute task sequentially on the calling thread */ + struct fpu_state saved_fpu_state = { 0 }; + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + saved_fpu_state = get_fpu_state(); + disable_fpu_denormals(); + } + for (size_t i = 0; i < range_i; i++) { + for (size_t j = 0; j < range_j; j++) { + for (size_t k = 0; k < range_k; k += tile_k) { + task(argument, i, j, k, min(range_k - k, tile_k)); + } + } + } + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + set_fpu_state(saved_fpu_state); + } + } else { + const size_t tile_range_k = divide_round_up(range_k, tile_k); + const size_t tile_range = range_i * range_j * tile_range_k; + const struct pthreadpool_3d_tile_1d_params params = { + .range_k = range_k, + .tile_k = tile_k, + .range_j = fxdiv_init_size_t(range_j), + .tile_range_k = fxdiv_init_size_t(tile_range_k), + }; + thread_function_t parallelize_3d_tile_1d = &thread_parallelize_3d_tile_1d; + #if PTHREADPOOL_USE_FASTPATH + const size_t range_threshold = -threads_count; + if (tile_range < range_threshold) { + parallelize_3d_tile_1d = &pthreadpool_thread_parallelize_3d_tile_1d_fastpath; + } + #endif + pthreadpool_parallelize( + threadpool, parallelize_3d_tile_1d, ¶ms, sizeof(params), + task, argument, tile_range, flags); + } +} + void pthreadpool_parallelize_3d_tile_2d( pthreadpool_t threadpool, pthreadpool_task_3d_tile_2d_t task, @@ -1182,6 +1625,114 @@ void pthreadpool_parallelize_3d_tile_2d_with_uarch( } } +void pthreadpool_parallelize_4d( + pthreadpool_t threadpool, + pthreadpool_task_4d_t task, + void* argument, + size_t range_i, + size_t range_j, + size_t range_k, + size_t range_l, + uint32_t flags) +{ + size_t threads_count; + if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || (range_i | range_j | range_k | range_l) <= 1) { + /* No thread pool used: execute task sequentially on the calling thread */ + struct fpu_state saved_fpu_state = { 0 }; + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + saved_fpu_state = get_fpu_state(); + disable_fpu_denormals(); + } + for (size_t i = 0; i < range_i; i++) { + for (size_t j = 0; j < range_j; j++) { + for (size_t k = 0; k < range_k; k++) { + for (size_t l = 0; l < range_l; l++) { + task(argument, i, j, k, l); + } + } + } + } + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + set_fpu_state(saved_fpu_state); + } + } else { + const size_t range_kl = range_k * range_l; + const size_t range = range_i * range_j * range_kl; + const struct pthreadpool_4d_params params = { + .range_k = range_k, + .range_j = fxdiv_init_size_t(range_j), + .range_kl = fxdiv_init_size_t(range_kl), + .range_l = fxdiv_init_size_t(range_l), + }; + thread_function_t parallelize_4d = &thread_parallelize_4d; + #if PTHREADPOOL_USE_FASTPATH + const size_t range_threshold = -threads_count; + if (range < range_threshold) { + parallelize_4d = &pthreadpool_thread_parallelize_4d_fastpath; + } + #endif + pthreadpool_parallelize( + threadpool, parallelize_4d, ¶ms, sizeof(params), + task, argument, range, flags); + } +} + +void pthreadpool_parallelize_4d_tile_1d( + pthreadpool_t threadpool, + pthreadpool_task_4d_tile_1d_t task, + void* argument, + size_t range_i, + size_t range_j, + size_t range_k, + size_t range_l, + size_t tile_l, + uint32_t flags) +{ + size_t threads_count; + if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || ((range_i | range_j | range_k) <= 1 && range_l <= tile_l)) { + /* No thread pool used: execute task sequentially on the calling thread */ + struct fpu_state saved_fpu_state = { 0 }; + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + saved_fpu_state = get_fpu_state(); + disable_fpu_denormals(); + } + for (size_t i = 0; i < range_i; i++) { + for (size_t j = 0; j < range_j; j++) { + for (size_t k = 0; k < range_k; k++) { + for (size_t l = 0; l < range_l; l += tile_l) { + task(argument, i, j, k, l, min(range_l - l, tile_l)); + } + } + } + } + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + set_fpu_state(saved_fpu_state); + } + } else { + const size_t tile_range_l = divide_round_up(range_l, tile_l); + const size_t tile_range_kl = range_k * tile_range_l; + const size_t tile_range = range_i * range_j * tile_range_kl; + const struct pthreadpool_4d_tile_1d_params params = { + .range_k = range_k, + .range_l = range_l, + .tile_l = tile_l, + .range_j = fxdiv_init_size_t(range_j), + .tile_range_kl = fxdiv_init_size_t(tile_range_kl), + .tile_range_l = fxdiv_init_size_t(tile_range_l), + }; + thread_function_t parallelize_4d_tile_1d = &thread_parallelize_4d_tile_1d; + #if PTHREADPOOL_USE_FASTPATH + const size_t range_threshold = -threads_count; + if (tile_range < range_threshold) { + parallelize_4d_tile_1d = &pthreadpool_thread_parallelize_4d_tile_1d_fastpath; + } + #endif + pthreadpool_parallelize( + threadpool, parallelize_4d_tile_1d, ¶ms, sizeof(params), + task, argument, tile_range, flags); + } +} + void pthreadpool_parallelize_4d_tile_2d( pthreadpool_t threadpool, pthreadpool_task_4d_tile_2d_t task, @@ -1313,6 +1864,122 @@ void pthreadpool_parallelize_4d_tile_2d_with_uarch( } } +void pthreadpool_parallelize_5d( + pthreadpool_t threadpool, + pthreadpool_task_5d_t task, + void* argument, + size_t range_i, + size_t range_j, + size_t range_k, + size_t range_l, + size_t range_m, + uint32_t flags) +{ + size_t threads_count; + if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || (range_i | range_j | range_k | range_l | range_m) <= 1) { + /* No thread pool used: execute task sequentially on the calling thread */ + struct fpu_state saved_fpu_state = { 0 }; + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + saved_fpu_state = get_fpu_state(); + disable_fpu_denormals(); + } + for (size_t i = 0; i < range_i; i++) { + for (size_t j = 0; j < range_j; j++) { + for (size_t k = 0; k < range_k; k++) { + for (size_t l = 0; l < range_l; l++) { + for (size_t m = 0; m < range_m; m++) { + task(argument, i, j, k, l, m); + } + } + } + } + } + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + set_fpu_state(saved_fpu_state); + } + } else { + const size_t range_lm = range_l * range_m; + const size_t range = range_i * range_j * range_k * range_lm; + const struct pthreadpool_5d_params params = { + .range_l = range_l, + .range_j = fxdiv_init_size_t(range_j), + .range_k = fxdiv_init_size_t(range_k), + .range_lm = fxdiv_init_size_t(range_lm), + .range_m = fxdiv_init_size_t(range_m), + }; + thread_function_t parallelize_5d = &thread_parallelize_5d; + #if PTHREADPOOL_USE_FASTPATH + const size_t range_threshold = -threads_count; + if (range < range_threshold) { + parallelize_5d = &pthreadpool_thread_parallelize_5d_fastpath; + } + #endif + pthreadpool_parallelize( + threadpool, parallelize_5d, ¶ms, sizeof(params), + task, argument, range, flags); + } +} + +void pthreadpool_parallelize_5d_tile_1d( + pthreadpool_t threadpool, + pthreadpool_task_5d_tile_1d_t task, + void* argument, + size_t range_i, + size_t range_j, + size_t range_k, + size_t range_l, + size_t range_m, + size_t tile_m, + uint32_t flags) +{ + size_t threads_count; + if (threadpool == NULL || (threads_count = threadpool->threads_count.value) <= 1 || ((range_i | range_j | range_k | range_l) <= 1 && range_m <= tile_m)) { + /* No thread pool used: execute task sequentially on the calling thread */ + struct fpu_state saved_fpu_state = { 0 }; + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + saved_fpu_state = get_fpu_state(); + disable_fpu_denormals(); + } + for (size_t i = 0; i < range_i; i++) { + for (size_t j = 0; j < range_j; j++) { + for (size_t k = 0; k < range_k; k++) { + for (size_t l = 0; l < range_l; l++) { + for (size_t m = 0; m < range_m; m += tile_m) { + task(argument, i, j, k, l, m, min(range_m - m, tile_m)); + } + } + } + } + } + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + set_fpu_state(saved_fpu_state); + } + } else { + const size_t tile_range_m = divide_round_up(range_m, tile_m); + const size_t range_kl = range_k * range_l; + const size_t tile_range = range_i * range_j * range_kl * tile_range_m; + const struct pthreadpool_5d_tile_1d_params params = { + .range_k = range_k, + .range_m = range_m, + .tile_m = tile_m, + .range_j = fxdiv_init_size_t(range_j), + .range_kl = fxdiv_init_size_t(range_kl), + .range_l = fxdiv_init_size_t(range_l), + .tile_range_m = fxdiv_init_size_t(tile_range_m), + }; + thread_function_t parallelize_5d_tile_1d = &thread_parallelize_5d_tile_1d; + #if PTHREADPOOL_USE_FASTPATH + const size_t range_threshold = -threads_count; + if (tile_range < range_threshold) { + parallelize_5d_tile_1d = &pthreadpool_thread_parallelize_5d_tile_1d_fastpath; + } + #endif + pthreadpool_parallelize( + threadpool, parallelize_5d_tile_1d, ¶ms, sizeof(params), + task, argument, tile_range, flags); + } +} + void pthreadpool_parallelize_5d_tile_2d( pthreadpool_t threadpool, pthreadpool_task_5d_tile_2d_t task, |