diff options
Diffstat (limited to 'src/threadpool-pthreads.c')
-rw-r--r-- | src/threadpool-pthreads.c | 538 |
1 files changed, 425 insertions, 113 deletions
diff --git a/src/threadpool-pthreads.c b/src/threadpool-pthreads.c index ea6d6ae..b9b5e01 100644 --- a/src/threadpool-pthreads.c +++ b/src/threadpool-pthreads.c @@ -34,6 +34,9 @@ /* Library header */ #include <pthreadpool.h> +/* Internal headers */ +#include "threadpool-utils.h" + /* Number of iterations in spin-wait loop before going into futex/mutex wait */ #define PTHREADPOOL_SPIN_WAIT_ITERATIONS 1000000 @@ -170,13 +173,17 @@ struct PTHREADPOOL_CACHELINE_ALIGNED pthreadpool { /** * The function to call for each item. */ - volatile void* function; + volatile void* task; /** * The first argument to the item processing function. */ void *volatile argument; /** - * Serializes concurrent calls to @a pthreadpool_compute_* from different threads. + * Copy of the flags passed to parallelization function. + */ + uint32_t flags; + /** + * Serializes concurrent calls to @a pthreadpool_parallelize_* from different threads. */ pthread_mutex_t execution_mutex; #if !PTHREADPOOL_USE_FUTEX @@ -265,13 +272,13 @@ inline static size_t modulo_increment(uint32_t i, uint32_t n) { return i; } -static void thread_compute_1d(struct pthreadpool* threadpool, struct thread_info* thread) { - const pthreadpool_function_1d_t function = (pthreadpool_function_1d_t) threadpool->function; +static void thread_parallelize_1d(struct pthreadpool* threadpool, struct thread_info* thread) { + const pthreadpool_task_1d_t task = (pthreadpool_task_1d_t) threadpool->task; void *const argument = threadpool->argument; /* Process thread's own range of items */ size_t range_start = thread->range_start; while (atomic_decrement(&thread->range_length)) { - function(argument, range_start++); + task(argument, range_start++); } /* There still may be other threads with work */ @@ -284,7 +291,7 @@ static void thread_compute_1d(struct pthreadpool* threadpool, struct thread_info struct thread_info* other_thread = &threadpool->threads[tid]; while (atomic_decrement(&other_thread->range_length)) { const size_t item_id = __sync_sub_and_fetch(&other_thread->range_end, 1); - function(argument, item_id); + task(argument, item_id); } } } @@ -341,6 +348,7 @@ static void* thread_main(void* arg) { struct thread_info* thread = (struct thread_info*) arg; struct pthreadpool* threadpool = ((struct pthreadpool*) (thread - thread->thread_number)) - 1; uint32_t last_command = threadpool_command_init; + struct fpu_state saved_fpu_state = { 0 }; /* Check in */ checkin_worker_thread(threadpool); @@ -352,8 +360,18 @@ static void* thread_main(void* arg) { /* Process command */ switch (command & THREADPOOL_COMMAND_MASK) { case threadpool_command_compute_1d: - thread_compute_1d(threadpool, thread); + { + const uint32_t flags = threadpool->flags; + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + saved_fpu_state = get_fpu_state(); + disable_fpu_denormals(); + } + thread_parallelize_1d(threadpool, thread); + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + set_fpu_state(saved_fpu_state); + } break; + } case threadpool_command_shutdown: /* Exit immediately: the master thread is waiting on pthread_join */ return NULL; @@ -438,16 +456,25 @@ size_t pthreadpool_get_threads_count(struct pthreadpool* threadpool) { } } -void pthreadpool_compute_1d( +void pthreadpool_parallelize_1d( struct pthreadpool* threadpool, - pthreadpool_function_1d_t function, + pthreadpool_task_1d_t task, void* argument, - size_t range) + size_t range, + uint32_t flags) { if (threadpool == NULL || threadpool->threads_count <= 1) { - /* No thread pool used: execute function sequentially on the calling thread */ + /* No thread pool used: execute task sequentially on the calling thread */ + struct fpu_state saved_fpu_state = { 0 }; + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + saved_fpu_state = get_fpu_state(); + disable_fpu_denormals(); + } for (size_t i = 0; i < range; i++) { - function(argument, i); + task(argument, i); + } + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + set_fpu_state(saved_fpu_state); } } else { /* Protect the global threadpool structures */ @@ -455,8 +482,9 @@ void pthreadpool_compute_1d( #if PTHREADPOOL_USE_FUTEX /* Setup global arguments */ - threadpool->function = function; + threadpool->task = task; threadpool->argument = argument; + threadpool->flags = flags; threadpool->active_threads = threadpool->threads_count - 1 /* caller thread */; threadpool->has_active_threads = 1; @@ -472,7 +500,7 @@ void pthreadpool_compute_1d( /* * Update the threadpool command. - * Imporantly, do it after initializing command parameters (range, function, argument) + * Imporantly, do it after initializing command parameters (range, task, argument) * ~(threadpool->command | THREADPOOL_COMMAND_MASK) flips the bits not in command mask * to ensure the unmasked command is different then the last command, because worker threads * monitor for change in the unmasked command. @@ -486,8 +514,9 @@ void pthreadpool_compute_1d( pthread_mutex_lock(&threadpool->command_mutex); /* Setup global arguments */ - threadpool->function = function; + threadpool->task = task; threadpool->argument = argument; + threadpool->flags = flags; /* Locking of completion_mutex not needed: readers are sleeping on command_condvar */ threadpool->active_threads = threadpool->threads_count - 1 /* caller thread */; @@ -502,7 +531,7 @@ void pthreadpool_compute_1d( /* * Update the threadpool command. - * Imporantly, do it after initializing command parameters (range, function, argument) + * Imporantly, do it after initializing command parameters (range, task, argument) * ~(threadpool->command | THREADPOOL_COMMAND_MASK) flips the bits not in command mask * to ensure the unmasked command is different then the last command, because worker threads * monitor for change in the unmasked command. @@ -516,8 +545,20 @@ void pthreadpool_compute_1d( pthread_cond_broadcast(&threadpool->command_condvar); #endif + /* Save and modify FPU denormals control, if needed */ + struct fpu_state saved_fpu_state = { 0 }; + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + saved_fpu_state = get_fpu_state(); + disable_fpu_denormals(); + } + /* Do computations as worker #0 */ - thread_compute_1d(threadpool, &threadpool->threads[0]); + thread_parallelize_1d(threadpool, &threadpool->threads[0]); + + /* Restore FPU denormals control, if needed */ + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + set_fpu_state(saved_fpu_state); + } /* Wait until the threads finish computation */ wait_worker_threads(threadpool); @@ -527,47 +568,56 @@ void pthreadpool_compute_1d( } } -struct compute_1d_tiled_context { - pthreadpool_function_1d_tiled_t function; +struct compute_1d_tile_1d_context { + pthreadpool_task_1d_tile_1d_t task; void* argument; size_t range; size_t tile; }; -static void compute_1d_tiled(const struct compute_1d_tiled_context* context, size_t linear_index) { +static void compute_1d_tile_1d(const struct compute_1d_tile_1d_context* context, size_t linear_index) { const size_t tile_index = linear_index; const size_t index = tile_index * context->tile; const size_t tile = min(context->tile, context->range - index); - context->function(context->argument, index, tile); + context->task(context->argument, index, tile); } -void pthreadpool_compute_1d_tiled( +void pthreadpool_parallelize_1d_tile_1d( pthreadpool_t threadpool, - pthreadpool_function_1d_tiled_t function, + pthreadpool_task_1d_tile_1d_t task, void* argument, size_t range, - size_t tile) + size_t tile, + uint32_t flags) { if (threadpool == NULL || threadpool->threads_count <= 1) { - /* No thread pool used: execute function sequentially on the calling thread */ + /* No thread pool used: execute task sequentially on the calling thread */ + struct fpu_state saved_fpu_state = { 0 }; + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + saved_fpu_state = get_fpu_state(); + disable_fpu_denormals(); + } for (size_t i = 0; i < range; i += tile) { - function(argument, i, min(range - i, tile)); + task(argument, i, min(range - i, tile)); + } + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + set_fpu_state(saved_fpu_state); } } else { /* Execute in parallel on the thread pool using linearized index */ const size_t tile_range = divide_round_up(range, tile); - struct compute_1d_tiled_context context = { - .function = function, + struct compute_1d_tile_1d_context context = { + .task = task, .argument = argument, .range = range, .tile = tile }; - pthreadpool_compute_1d(threadpool, (pthreadpool_function_1d_t) compute_1d_tiled, &context, tile_range); + pthreadpool_parallelize_1d(threadpool, (pthreadpool_task_1d_t) compute_1d_tile_1d, &context, tile_range, flags); } } struct compute_2d_context { - pthreadpool_function_2d_t function; + pthreadpool_task_2d_t task; void* argument; struct fxdiv_divisor_size_t range_j; }; @@ -575,36 +625,103 @@ struct compute_2d_context { static void compute_2d(const struct compute_2d_context* context, size_t linear_index) { const struct fxdiv_divisor_size_t range_j = context->range_j; const struct fxdiv_result_size_t index = fxdiv_divide_size_t(linear_index, range_j); - context->function(context->argument, index.quotient, index.remainder); + context->task(context->argument, index.quotient, index.remainder); } -void pthreadpool_compute_2d( +void pthreadpool_parallelize_2d( struct pthreadpool* threadpool, - pthreadpool_function_2d_t function, + pthreadpool_task_2d_t task, void* argument, size_t range_i, - size_t range_j) + size_t range_j, + uint32_t flags) { if (threadpool == NULL || threadpool->threads_count <= 1) { - /* No thread pool used: execute function sequentially on the calling thread */ + /* No thread pool used: execute task sequentially on the calling thread */ + struct fpu_state saved_fpu_state = { 0 }; + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + saved_fpu_state = get_fpu_state(); + disable_fpu_denormals(); + } for (size_t i = 0; i < range_i; i++) { for (size_t j = 0; j < range_j; j++) { - function(argument, i, j); + task(argument, i, j); } } + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + set_fpu_state(saved_fpu_state); + } } else { /* Execute in parallel on the thread pool using linearized index */ struct compute_2d_context context = { - .function = function, + .task = task, .argument = argument, .range_j = fxdiv_init_size_t(range_j) }; - pthreadpool_compute_1d(threadpool, (pthreadpool_function_1d_t) compute_2d, &context, range_i * range_j); + pthreadpool_parallelize_1d(threadpool, (pthreadpool_task_1d_t) compute_2d, &context, range_i * range_j, flags); } } -struct compute_2d_tiled_context { - pthreadpool_function_2d_tiled_t function; +struct compute_2d_tile_1d_context { + pthreadpool_task_2d_tile_1d_t task; + void* argument; + struct fxdiv_divisor_size_t tile_range_j; + size_t range_i; + size_t range_j; + size_t tile_j; +}; + +static void compute_2d_tile_1d(const struct compute_2d_tile_1d_context* context, size_t linear_index) { + const struct fxdiv_divisor_size_t tile_range_j = context->tile_range_j; + const struct fxdiv_result_size_t tile_index = fxdiv_divide_size_t(linear_index, tile_range_j); + const size_t max_tile_j = context->tile_j; + const size_t index_i = tile_index.quotient; + const size_t index_j = tile_index.remainder * max_tile_j; + const size_t tile_j = min(max_tile_j, context->range_j - index_j); + context->task(context->argument, index_i, index_j, tile_j); +} + +void pthreadpool_parallelize_2d_tile_1d( + pthreadpool_t threadpool, + pthreadpool_task_2d_tile_1d_t task, + void* argument, + size_t range_i, + size_t range_j, + size_t tile_j, + uint32_t flags) +{ + if (threadpool == NULL || threadpool->threads_count <= 1) { + /* No thread pool used: execute task sequentially on the calling thread */ + struct fpu_state saved_fpu_state = { 0 }; + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + saved_fpu_state = get_fpu_state(); + disable_fpu_denormals(); + } + for (size_t i = 0; i < range_i; i++) { + for (size_t j = 0; j < range_j; j += tile_j) { + task(argument, i, j, min(range_j - j, tile_j)); + } + } + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + set_fpu_state(saved_fpu_state); + } + } else { + /* Execute in parallel on the thread pool using linearized index */ + const size_t tile_range_j = divide_round_up(range_j, tile_j); + struct compute_2d_tile_1d_context context = { + .task = task, + .argument = argument, + .tile_range_j = fxdiv_init_size_t(tile_range_j), + .range_i = range_i, + .range_j = range_j, + .tile_j = tile_j + }; + pthreadpool_parallelize_1d(threadpool, (pthreadpool_task_1d_t) compute_2d_tile_1d, &context, range_i * tile_range_j, flags); + } +} + +struct compute_2d_tile_2d_context { + pthreadpool_task_2d_tile_2d_t task; void* argument; struct fxdiv_divisor_size_t tile_range_j; size_t range_i; @@ -613,7 +730,7 @@ struct compute_2d_tiled_context { size_t tile_j; }; -static void compute_2d_tiled(const struct compute_2d_tiled_context* context, size_t linear_index) { +static void compute_2d_tile_2d(const struct compute_2d_tile_2d_context* context, size_t linear_index) { const struct fxdiv_divisor_size_t tile_range_j = context->tile_range_j; const struct fxdiv_result_size_t tile_index = fxdiv_divide_size_t(linear_index, tile_range_j); const size_t max_tile_i = context->tile_i; @@ -622,31 +739,40 @@ static void compute_2d_tiled(const struct compute_2d_tiled_context* context, siz const size_t index_j = tile_index.remainder * max_tile_j; const size_t tile_i = min(max_tile_i, context->range_i - index_i); const size_t tile_j = min(max_tile_j, context->range_j - index_j); - context->function(context->argument, index_i, index_j, tile_i, tile_j); + context->task(context->argument, index_i, index_j, tile_i, tile_j); } -void pthreadpool_compute_2d_tiled( +void pthreadpool_parallelize_2d_tile_2d( pthreadpool_t threadpool, - pthreadpool_function_2d_tiled_t function, + pthreadpool_task_2d_tile_2d_t task, void* argument, size_t range_i, size_t range_j, size_t tile_i, - size_t tile_j) + size_t tile_j, + uint32_t flags) { if (threadpool == NULL || threadpool->threads_count <= 1) { - /* No thread pool used: execute function sequentially on the calling thread */ + /* No thread pool used: execute task sequentially on the calling thread */ + struct fpu_state saved_fpu_state = { 0 }; + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + saved_fpu_state = get_fpu_state(); + disable_fpu_denormals(); + } for (size_t i = 0; i < range_i; i += tile_i) { for (size_t j = 0; j < range_j; j += tile_j) { - function(argument, i, j, min(range_i - i, tile_i), min(range_j - j, tile_j)); + task(argument, i, j, min(range_i - i, tile_i), min(range_j - j, tile_j)); } } + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + set_fpu_state(saved_fpu_state); + } } else { /* Execute in parallel on the thread pool using linearized index */ const size_t tile_range_i = divide_round_up(range_i, tile_i); const size_t tile_range_j = divide_round_up(range_j, tile_j); - struct compute_2d_tiled_context context = { - .function = function, + struct compute_2d_tile_2d_context context = { + .task = task, .argument = argument, .tile_range_j = fxdiv_init_size_t(tile_range_j), .range_i = range_i, @@ -654,170 +780,356 @@ void pthreadpool_compute_2d_tiled( .tile_i = tile_i, .tile_j = tile_j }; - pthreadpool_compute_1d(threadpool, (pthreadpool_function_1d_t) compute_2d_tiled, &context, tile_range_i * tile_range_j); + pthreadpool_parallelize_1d(threadpool, (pthreadpool_task_1d_t) compute_2d_tile_2d, &context, tile_range_i * tile_range_j, flags); } } -struct compute_3d_tiled_context { - pthreadpool_function_3d_tiled_t function; +struct compute_3d_tile_2d_context { + pthreadpool_task_3d_tile_2d_t task; void* argument; struct fxdiv_divisor_size_t tile_range_j; struct fxdiv_divisor_size_t tile_range_k; - size_t range_i; size_t range_j; size_t range_k; - size_t tile_i; size_t tile_j; size_t tile_k; }; -static void compute_3d_tiled(const struct compute_3d_tiled_context* context, size_t linear_index) { +static void compute_3d_tile_2d(const struct compute_3d_tile_2d_context* context, size_t linear_index) { const struct fxdiv_divisor_size_t tile_range_k = context->tile_range_k; const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(linear_index, tile_range_k); const struct fxdiv_divisor_size_t tile_range_j = context->tile_range_j; const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, tile_range_j); - const size_t max_tile_i = context->tile_i; const size_t max_tile_j = context->tile_j; const size_t max_tile_k = context->tile_k; - const size_t index_i = tile_index_i_j.quotient * max_tile_i; + const size_t index_i = tile_index_i_j.quotient; const size_t index_j = tile_index_i_j.remainder * max_tile_j; const size_t index_k = tile_index_ij_k.remainder * max_tile_k; - const size_t tile_i = min(max_tile_i, context->range_i - index_i); const size_t tile_j = min(max_tile_j, context->range_j - index_j); const size_t tile_k = min(max_tile_k, context->range_k - index_k); - context->function(context->argument, index_i, index_j, index_k, tile_i, tile_j, tile_k); + context->task(context->argument, index_i, index_j, index_k, tile_j, tile_k); } -void pthreadpool_compute_3d_tiled( +void pthreadpool_parallelize_3d_tile_2d( pthreadpool_t threadpool, - pthreadpool_function_3d_tiled_t function, + pthreadpool_task_3d_tile_2d_t task, void* argument, size_t range_i, size_t range_j, size_t range_k, - size_t tile_i, size_t tile_j, - size_t tile_k) + size_t tile_k, + uint32_t flags) { if (threadpool == NULL || threadpool->threads_count <= 1) { - /* No thread pool used: execute function sequentially on the calling thread */ - for (size_t i = 0; i < range_i; i += tile_i) { + /* No thread pool used: execute task sequentially on the calling thread */ + struct fpu_state saved_fpu_state = { 0 }; + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + saved_fpu_state = get_fpu_state(); + disable_fpu_denormals(); + } + for (size_t i = 0; i < range_i; i++) { for (size_t j = 0; j < range_j; j += tile_j) { for (size_t k = 0; k < range_k; k += tile_k) { - function(argument, i, j, k, min(range_i - i, tile_i), min(range_j - j, tile_j), min(range_k - k, tile_k)); + task(argument, i, j, k, min(range_j - j, tile_j), min(range_k - k, tile_k)); } } } + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + set_fpu_state(saved_fpu_state); + } } else { /* Execute in parallel on the thread pool using linearized index */ - const size_t tile_range_i = divide_round_up(range_i, tile_i); const size_t tile_range_j = divide_round_up(range_j, tile_j); const size_t tile_range_k = divide_round_up(range_k, tile_k); - struct compute_3d_tiled_context context = { - .function = function, + struct compute_3d_tile_2d_context context = { + .task = task, .argument = argument, .tile_range_j = fxdiv_init_size_t(tile_range_j), .tile_range_k = fxdiv_init_size_t(tile_range_k), - .range_i = range_i, .range_j = range_j, .range_k = range_k, - .tile_i = tile_i, .tile_j = tile_j, .tile_k = tile_k }; - pthreadpool_compute_1d(threadpool, - (pthreadpool_function_1d_t) compute_3d_tiled, &context, - tile_range_i * tile_range_j * tile_range_k); + pthreadpool_parallelize_1d(threadpool, + (pthreadpool_task_1d_t) compute_3d_tile_2d, &context, + range_i * tile_range_j * tile_range_k, flags); } } -struct compute_4d_tiled_context { - pthreadpool_function_4d_tiled_t function; +struct compute_4d_tile_2d_context { + pthreadpool_task_4d_tile_2d_t task; void* argument; struct fxdiv_divisor_size_t tile_range_kl; - struct fxdiv_divisor_size_t tile_range_j; + struct fxdiv_divisor_size_t range_j; struct fxdiv_divisor_size_t tile_range_l; - size_t range_i; - size_t range_j; size_t range_k; size_t range_l; - size_t tile_i; - size_t tile_j; size_t tile_k; size_t tile_l; }; -static void compute_4d_tiled(const struct compute_4d_tiled_context* context, size_t linear_index) { +static void compute_4d_tile_2d(const struct compute_4d_tile_2d_context* context, size_t linear_index) { const struct fxdiv_divisor_size_t tile_range_kl = context->tile_range_kl; const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(linear_index, tile_range_kl); - const struct fxdiv_divisor_size_t tile_range_j = context->tile_range_j; - const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, tile_range_j); + const struct fxdiv_divisor_size_t range_j = context->range_j; + const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j); const struct fxdiv_divisor_size_t tile_range_l = context->tile_range_l; const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l); - const size_t max_tile_i = context->tile_i; - const size_t max_tile_j = context->tile_j; const size_t max_tile_k = context->tile_k; const size_t max_tile_l = context->tile_l; - const size_t index_i = tile_index_i_j.quotient * max_tile_i; - const size_t index_j = tile_index_i_j.remainder * max_tile_j; + const size_t index_i = tile_index_i_j.quotient; + const size_t index_j = tile_index_i_j.remainder; const size_t index_k = tile_index_k_l.quotient * max_tile_k; const size_t index_l = tile_index_k_l.remainder * max_tile_l; - const size_t tile_i = min(max_tile_i, context->range_i - index_i); - const size_t tile_j = min(max_tile_j, context->range_j - index_j); const size_t tile_k = min(max_tile_k, context->range_k - index_k); const size_t tile_l = min(max_tile_l, context->range_l - index_l); - context->function(context->argument, index_i, index_j, index_k, index_l, tile_i, tile_j, tile_k, tile_l); + context->task(context->argument, index_i, index_j, index_k, index_l, tile_k, tile_l); } -void pthreadpool_compute_4d_tiled( +void pthreadpool_parallelize_4d_tile_2d( pthreadpool_t threadpool, - pthreadpool_function_4d_tiled_t function, + pthreadpool_task_4d_tile_2d_t task, void* argument, size_t range_i, size_t range_j, size_t range_k, size_t range_l, - size_t tile_i, - size_t tile_j, size_t tile_k, - size_t tile_l) + size_t tile_l, + uint32_t flags) { if (threadpool == NULL || threadpool->threads_count <= 1) { - /* No thread pool used: execute function sequentially on the calling thread */ - for (size_t i = 0; i < range_i; i += tile_i) { - for (size_t j = 0; j < range_j; j += tile_j) { + /* No thread pool used: execute task sequentially on the calling thread */ + struct fpu_state saved_fpu_state = { 0 }; + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + saved_fpu_state = get_fpu_state(); + disable_fpu_denormals(); + } + for (size_t i = 0; i < range_i; i++) { + for (size_t j = 0; j < range_j; j++) { for (size_t k = 0; k < range_k; k += tile_k) { for (size_t l = 0; l < range_l; l += tile_l) { - function(argument, i, j, k, l, - min(range_i - i, tile_i), min(range_j - j, tile_j), min(range_k - k, tile_k), min(range_l - l, tile_l)); + task(argument, i, j, k, l, + min(range_k - k, tile_k), min(range_l - l, tile_l)); } } } } + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + set_fpu_state(saved_fpu_state); + } } else { /* Execute in parallel on the thread pool using linearized index */ - const size_t tile_range_i = divide_round_up(range_i, tile_i); - const size_t tile_range_j = divide_round_up(range_j, tile_j); const size_t tile_range_k = divide_round_up(range_k, tile_k); const size_t tile_range_l = divide_round_up(range_l, tile_l); - struct compute_4d_tiled_context context = { - .function = function, + struct compute_4d_tile_2d_context context = { + .task = task, .argument = argument, .tile_range_kl = fxdiv_init_size_t(tile_range_k * tile_range_l), - .tile_range_j = fxdiv_init_size_t(tile_range_j), + .range_j = fxdiv_init_size_t(range_j), .tile_range_l = fxdiv_init_size_t(tile_range_l), - .range_i = range_i, - .range_j = range_j, .range_k = range_k, .range_l = range_l, - .tile_i = tile_i, - .tile_j = tile_j, .tile_k = tile_k, .tile_l = tile_l }; - pthreadpool_compute_1d(threadpool, - (pthreadpool_function_1d_t) compute_4d_tiled, &context, - tile_range_i * tile_range_j * tile_range_k * tile_range_l); + pthreadpool_parallelize_1d(threadpool, + (pthreadpool_task_1d_t) compute_4d_tile_2d, &context, + range_i * range_j * tile_range_k * tile_range_l, flags); + } +} + +struct compute_5d_tile_2d_context { + pthreadpool_task_5d_tile_2d_t task; + void* argument; + struct fxdiv_divisor_size_t tile_range_lm; + struct fxdiv_divisor_size_t range_k; + struct fxdiv_divisor_size_t tile_range_m; + struct fxdiv_divisor_size_t range_j; + size_t range_l; + size_t range_m; + size_t tile_l; + size_t tile_m; +}; + +static void compute_5d_tile_2d(const struct compute_5d_tile_2d_context* context, size_t linear_index) { + const struct fxdiv_divisor_size_t tile_range_lm = context->tile_range_lm; + const struct fxdiv_result_size_t tile_index_ijk_lm = fxdiv_divide_size_t(linear_index, tile_range_lm); + const struct fxdiv_divisor_size_t range_k = context->range_k; + const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(tile_index_ijk_lm.quotient, range_k); + const struct fxdiv_divisor_size_t tile_range_m = context->tile_range_m; + const struct fxdiv_result_size_t tile_index_l_m = fxdiv_divide_size_t(tile_index_ijk_lm.remainder, tile_range_m); + const struct fxdiv_divisor_size_t range_j = context->range_j; + const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, range_j); + + const size_t max_tile_l = context->tile_l; + const size_t max_tile_m = context->tile_m; + const size_t index_i = tile_index_i_j.quotient; + const size_t index_j = tile_index_i_j.remainder; + const size_t index_k = tile_index_ij_k.remainder; + const size_t index_l = tile_index_l_m.quotient * max_tile_l; + const size_t index_m = tile_index_l_m.remainder * max_tile_m; + const size_t tile_l = min(max_tile_l, context->range_l - index_l); + const size_t tile_m = min(max_tile_m, context->range_m - index_m); + context->task(context->argument, index_i, index_j, index_k, index_l, index_m, tile_l, tile_m); +} + +void pthreadpool_parallelize_5d_tile_2d( + pthreadpool_t threadpool, + pthreadpool_task_5d_tile_2d_t task, + void* argument, + size_t range_i, + size_t range_j, + size_t range_k, + size_t range_l, + size_t range_m, + size_t tile_l, + size_t tile_m, + uint32_t flags) +{ + if (threadpool == NULL || threadpool->threads_count <= 1) { + /* No thread pool used: execute task sequentially on the calling thread */ + struct fpu_state saved_fpu_state = { 0 }; + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + saved_fpu_state = get_fpu_state(); + disable_fpu_denormals(); + } + for (size_t i = 0; i < range_i; i++) { + for (size_t j = 0; j < range_j; j++) { + for (size_t k = 0; k < range_k; k++) { + for (size_t l = 0; l < range_l; l += tile_l) { + for (size_t m = 0; m < range_m; m += tile_m) { + task(argument, i, j, k, l, m, + min(range_l - l, tile_l), min(range_m - m, tile_m)); + } + } + } + } + } + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + set_fpu_state(saved_fpu_state); + } + } else { + /* Execute in parallel on the thread pool using linearized index */ + const size_t tile_range_l = divide_round_up(range_l, tile_l); + const size_t tile_range_m = divide_round_up(range_m, tile_m); + struct compute_5d_tile_2d_context context = { + .task = task, + .argument = argument, + .tile_range_lm = fxdiv_init_size_t(tile_range_l * tile_range_m), + .range_k = fxdiv_init_size_t(range_k), + .tile_range_m = fxdiv_init_size_t(tile_range_m), + .range_j = fxdiv_init_size_t(range_j), + .range_l = range_l, + .range_m = range_m, + .tile_l = tile_l, + .tile_m = tile_m, + }; + pthreadpool_parallelize_1d(threadpool, + (pthreadpool_task_1d_t) compute_5d_tile_2d, &context, + range_i * range_j * range_k * tile_range_l * tile_range_m, flags); + } +} + +struct compute_6d_tile_2d_context { + pthreadpool_task_6d_tile_2d_t task; + void* argument; + struct fxdiv_divisor_size_t tile_range_lmn; + struct fxdiv_divisor_size_t range_k; + struct fxdiv_divisor_size_t tile_range_n; + struct fxdiv_divisor_size_t range_j; + struct fxdiv_divisor_size_t tile_range_m; + size_t range_m; + size_t range_n; + size_t tile_m; + size_t tile_n; +}; + +static void compute_6d_tile_2d(const struct compute_6d_tile_2d_context* context, size_t linear_index) { + const struct fxdiv_divisor_size_t tile_range_lmn = context->tile_range_lmn; + const struct fxdiv_result_size_t tile_index_ijk_lmn = fxdiv_divide_size_t(linear_index, tile_range_lmn); + const struct fxdiv_divisor_size_t range_k = context->range_k; + const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(tile_index_ijk_lmn.quotient, range_k); + const struct fxdiv_divisor_size_t tile_range_n = context->tile_range_n; + const struct fxdiv_result_size_t tile_index_lm_n = fxdiv_divide_size_t(tile_index_ijk_lmn.remainder, tile_range_n); + const struct fxdiv_divisor_size_t range_j = context->range_j; + const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, range_j); + const struct fxdiv_divisor_size_t tile_range_m = context->tile_range_m; + const struct fxdiv_result_size_t tile_index_l_m = fxdiv_divide_size_t(tile_index_lm_n.quotient, tile_range_m); + + const size_t max_tile_m = context->tile_m; + const size_t max_tile_n = context->tile_n; + const size_t index_i = tile_index_i_j.quotient; + const size_t index_j = tile_index_i_j.remainder; + const size_t index_k = tile_index_ij_k.remainder; + const size_t index_l = tile_index_l_m.quotient; + const size_t index_m = tile_index_l_m.remainder * max_tile_m; + const size_t index_n = tile_index_lm_n.remainder * max_tile_n; + const size_t tile_m = min(max_tile_m, context->range_m - index_m); + const size_t tile_n = min(max_tile_n, context->range_n - index_n); + context->task(context->argument, index_i, index_j, index_k, index_l, index_m, index_n, tile_m, tile_n); +} + +void pthreadpool_parallelize_6d_tile_2d( + pthreadpool_t threadpool, + pthreadpool_task_6d_tile_2d_t task, + void* argument, + size_t range_i, + size_t range_j, + size_t range_k, + size_t range_l, + size_t range_m, + size_t range_n, + size_t tile_m, + size_t tile_n, + uint32_t flags) +{ + if (threadpool == NULL || threadpool->threads_count <= 1) { + /* No thread pool used: execute task sequentially on the calling thread */ + struct fpu_state saved_fpu_state = { 0 }; + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + saved_fpu_state = get_fpu_state(); + disable_fpu_denormals(); + } + for (size_t i = 0; i < range_i; i++) { + for (size_t j = 0; j < range_j; j++) { + for (size_t k = 0; k < range_k; k++) { + for (size_t l = 0; l < range_l; l++) { + for (size_t m = 0; m < range_m; m += tile_m) { + for (size_t n = 0; n < range_n; n += tile_n) { + task(argument, i, j, k, l, m, n, + min(range_m - m, tile_m), min(range_n - n, tile_n)); + } + } + } + } + } + } + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + set_fpu_state(saved_fpu_state); + } + } else { + /* Execute in parallel on the thread pool using linearized index */ + const size_t tile_range_m = divide_round_up(range_m, tile_m); + const size_t tile_range_n = divide_round_up(range_n, tile_n); + struct compute_6d_tile_2d_context context = { + .task = task, + .argument = argument, + .tile_range_lmn = fxdiv_init_size_t(range_l * tile_range_m * tile_range_n), + .range_k = fxdiv_init_size_t(range_k), + .tile_range_n = fxdiv_init_size_t(tile_range_n), + .range_j = fxdiv_init_size_t(range_j), + .tile_range_m = fxdiv_init_size_t(tile_range_m), + .range_m = range_m, + .range_n = range_n, + .tile_m = tile_m, + .tile_n = tile_n, + }; + pthreadpool_parallelize_1d(threadpool, + (pthreadpool_task_1d_t) compute_6d_tile_2d, &context, + range_i * range_j * range_k * range_l * tile_range_m * tile_range_n, flags); } } |