#ifndef PTHREADPOOL_H_ #define PTHREADPOOL_H_ #include #include typedef struct pthreadpool* pthreadpool_t; typedef void (*pthreadpool_task_1d_t)(void*, size_t); typedef void (*pthreadpool_task_1d_tile_1d_t)(void*, size_t, size_t); typedef void (*pthreadpool_task_2d_t)(void*, size_t, size_t); typedef void (*pthreadpool_task_2d_tile_1d_t)(void*, size_t, size_t, size_t); typedef void (*pthreadpool_task_2d_tile_2d_t)(void*, size_t, size_t, size_t, size_t); typedef void (*pthreadpool_task_3d_t)(void*, size_t, size_t, size_t); typedef void (*pthreadpool_task_3d_tile_1d_t)(void*, size_t, size_t, size_t, size_t); typedef void (*pthreadpool_task_3d_tile_2d_t)(void*, size_t, size_t, size_t, size_t, size_t); typedef void (*pthreadpool_task_4d_t)(void*, size_t, size_t, size_t, size_t); typedef void (*pthreadpool_task_4d_tile_1d_t)(void*, size_t, size_t, size_t, size_t, size_t); typedef void (*pthreadpool_task_4d_tile_2d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t); typedef void (*pthreadpool_task_5d_t)(void*, size_t, size_t, size_t, size_t, size_t); typedef void (*pthreadpool_task_5d_tile_1d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t); typedef void (*pthreadpool_task_5d_tile_2d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t, size_t); typedef void (*pthreadpool_task_6d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t); typedef void (*pthreadpool_task_6d_tile_1d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t, size_t); typedef void (*pthreadpool_task_6d_tile_2d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t, size_t, size_t); typedef void (*pthreadpool_task_1d_with_id_t)(void*, uint32_t, size_t); typedef void (*pthreadpool_task_2d_tile_2d_with_id_t)(void*, uint32_t, size_t, size_t, size_t, size_t); typedef void (*pthreadpool_task_3d_tile_2d_with_id_t)(void*, uint32_t, size_t, size_t, size_t, size_t, size_t); typedef void (*pthreadpool_task_4d_tile_2d_with_id_t)(void*, uint32_t, size_t, size_t, size_t, size_t, size_t, 
size_t); /** * Disable support for denormalized numbers to the maximum extent possible for * the duration of the computation. * * Handling denormalized floating-point numbers is often implemented in * microcode, and incurs significant performance degradation. This hint * instructs the thread pool to disable support for denormalized numbers before * running the computation by manipulating architecture-specific control * registers, and restore the initial value of control registers after the * computation is complete. The thread pool temporary disables denormalized * numbers on all threads involved in the computation (i.e. the caller threads, * and potentially worker threads). * * Disabling denormalized numbers may have a small negative effect on results' * accuracy. As various architectures differ in capabilities to control * processing of denormalized numbers, using this flag may also hurt results' * reproducibility across different instruction set architectures. */ #define PTHREADPOOL_FLAG_DISABLE_DENORMALS 0x00000001 /** * Yield worker threads to the system scheduler after the operation is finished. * * Force workers to use kernel wait (instead of active spin-wait by default) for * new commands after this command is processed. This flag affects only the * immediate next operation on this thread pool. To make the thread pool always * use kernel wait, pass this flag to all parallelization functions. */ #define PTHREADPOOL_FLAG_YIELD_WORKERS 0x00000002 #ifdef __cplusplus extern "C" { #endif /** * Create a thread pool with the specified number of threads. * * @param threads_count the number of threads in the thread pool. * A value of 0 has special interpretation: it creates a thread pool with as * many threads as there are logical processors in the system. * * @returns A pointer to an opaque thread pool object if the call is * successful, or NULL pointer if the call failed. 
*/
pthreadpool_t pthreadpool_create(size_t threads_count);

/**
 * Query the number of threads in a thread pool.
 *
 * @param threadpool the thread pool to query.
 *
 * @returns The number of threads in the thread pool.
 */
size_t pthreadpool_get_threads_count(pthreadpool_t threadpool);

/**
 * Process items on a 1D grid.
 *
 * The function implements a parallel version of the following snippet:
 *
 *   for (size_t i = 0; i < range; i++)
 *     function(context, i);
 *
 * When the function returns, all items have been processed and the thread pool
 * is ready for a new task.
 *
 * @note If multiple threads call this function with the same thread pool, the
 *    calls are serialized.
 *
 * @param threadpool the thread pool to use for parallelisation. If threadpool
 *    is NULL, all items are processed serially on the calling thread.
 * @param function the function to call for each item.
 * @param context the first argument passed to the specified function.
 * @param range the number of items on the 1D grid to process. The
 *    specified function will be called once for each item.
 * @param flags a bitwise combination of zero or more optional flags
 *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
 */
void pthreadpool_parallelize_1d(
    pthreadpool_t threadpool,
    pthreadpool_task_1d_t function,
    void* context,
    size_t range,
    uint32_t flags);

/**
 * Process items on a 1D grid using a microarchitecture-aware task function.
 *
 * The function implements a parallel version of the following snippet:
 *
 *   uint32_t uarch_index = cpuinfo_initialize() ?
 *       cpuinfo_get_current_uarch_index() : default_uarch_index;
 *   if (uarch_index > max_uarch_index) uarch_index = default_uarch_index;
 *   for (size_t i = 0; i < range; i++)
 *     function(context, uarch_index, i);
 *
 * When the function returns, all items have been processed and the thread pool
 * is ready for a new task.
 *
 * @note If multiple threads call this function with the same thread pool, the
 *    calls are serialized.
*
 * @param threadpool the thread pool to use for parallelisation. If
 *    threadpool is NULL, all items are processed serially on the calling
 *    thread.
 * @param function the function to call for each item.
 * @param context the first argument passed to the specified
 *    function.
 * @param default_uarch_index the microarchitecture index to use when
 *    pthreadpool is configured without cpuinfo, cpuinfo initialization failed,
 *    or index returned by cpuinfo_get_current_uarch_index() exceeds the
 *    max_uarch_index value.
 * @param max_uarch_index the maximum microarchitecture index expected by
 *    the specified function. If the index returned by
 *    cpuinfo_get_current_uarch_index() exceeds this value, default_uarch_index
 *    will be used instead. default_uarch_index can exceed max_uarch_index.
 * @param range the number of items on the 1D grid to process.
 *    The specified function will be called once for each item.
 * @param flags a bitwise combination of zero or more optional
 *    flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
 *    PTHREADPOOL_FLAG_YIELD_WORKERS)
 */
void pthreadpool_parallelize_1d_with_uarch(
    pthreadpool_t threadpool,
    pthreadpool_task_1d_with_id_t function,
    void* context,
    uint32_t default_uarch_index,
    uint32_t max_uarch_index,
    size_t range,
    uint32_t flags);

/**
 * Process items on a 1D grid with the specified maximum tile size.
 *
 * The function implements a parallel version of the following snippet:
 *
 *   for (size_t i = 0; i < range; i += tile)
 *     function(context, i, min(range - i, tile));
 *
 * When the function returns, all items have been processed and the thread pool
 * is ready for a new task.
 *
 * @note If multiple threads call this function with the same thread pool,
 *    the calls are serialized.
 *
 * @param threadpool the thread pool to use for parallelisation. If threadpool
 *    is NULL, all items are processed serially on the calling thread.
 * @param function the function to call for each tile.
 * @param context the first argument passed to the specified function.
* @param range the number of items on the 1D grid to process.
 * @param tile the maximum number of items on the 1D grid to process in
 *    one function call.
 * @param flags a bitwise combination of zero or more optional flags
 *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
 */
void pthreadpool_parallelize_1d_tile_1d(
    pthreadpool_t threadpool,
    pthreadpool_task_1d_tile_1d_t function,
    void* context,
    size_t range,
    size_t tile,
    uint32_t flags);

/**
 * Process items on a 2D grid.
 *
 * The function implements a parallel version of the following snippet:
 *
 *   for (size_t i = 0; i < range_i; i++)
 *     for (size_t j = 0; j < range_j; j++)
 *       function(context, i, j);
 *
 * When the function returns, all items have been processed and the thread pool
 * is ready for a new task.
 *
 * @note If multiple threads call this function with the same thread pool, the
 *    calls are serialized.
 *
 * @param threadpool the thread pool to use for parallelisation. If threadpool
 *    is NULL, all items are processed serially on the calling thread.
 * @param function the function to call for each item.
 * @param context the first argument passed to the specified function.
 * @param range_i the number of items to process along the first dimension
 *    of the 2D grid.
 * @param range_j the number of items to process along the second dimension
 *    of the 2D grid.
 * @param flags a bitwise combination of zero or more optional flags
 *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
 */
void pthreadpool_parallelize_2d(
    pthreadpool_t threadpool,
    pthreadpool_task_2d_t function,
    void* context,
    size_t range_i,
    size_t range_j,
    uint32_t flags);

/**
 * Process items on a 2D grid with the specified maximum tile size along the
 * last grid dimension.
*
 * The function implements a parallel version of the following snippet:
 *
 *   for (size_t i = 0; i < range_i; i++)
 *     for (size_t j = 0; j < range_j; j += tile_j)
 *       function(context, i, j, min(range_j - j, tile_j));
 *
 * When the function returns, all items have been processed and the thread pool
 * is ready for a new task.
 *
 * @note If multiple threads call this function with the same thread pool, the
 *    calls are serialized.
 *
 * @param threadpool the thread pool to use for parallelisation. If threadpool
 *    is NULL, all items are processed serially on the calling thread.
 * @param function the function to call for each tile.
 * @param context the first argument passed to the specified function.
 * @param range_i the number of items to process along the first dimension
 *    of the 2D grid.
 * @param range_j the number of items to process along the second dimension
 *    of the 2D grid.
 * @param tile_j the maximum number of items along the second dimension of
 *    the 2D grid to process in one function call.
 * @param flags a bitwise combination of zero or more optional flags
 *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
 */
void pthreadpool_parallelize_2d_tile_1d(
    pthreadpool_t threadpool,
    pthreadpool_task_2d_tile_1d_t function,
    void* context,
    size_t range_i,
    size_t range_j,
    size_t tile_j,
    uint32_t flags);

/**
 * Process items on a 2D grid with the specified maximum tile size along each
 * grid dimension.
 *
 * The function implements a parallel version of the following snippet:
 *
 *   for (size_t i = 0; i < range_i; i += tile_i)
 *     for (size_t j = 0; j < range_j; j += tile_j)
 *       function(context, i, j,
 *         min(range_i - i, tile_i), min(range_j - j, tile_j));
 *
 * When the function returns, all items have been processed and the thread pool
 * is ready for a new task.
 *
 * @note If multiple threads call this function with the same thread pool, the
 *    calls are serialized.
 *
 * @param threadpool the thread pool to use for parallelisation.
If threadpool * is NULL, all items are processed serially on the calling thread. * @param function the function to call for each tile. * @param context the first argument passed to the specified function. * @param range_i the number of items to process along the first dimension * of the 2D grid. * @param range_j the number of items to process along the second dimension * of the 2D grid. * @param tile_j the maximum number of items along the first dimension of * the 2D grid to process in one function call. * @param tile_j the maximum number of items along the second dimension of * the 2D grid to process in one function call. * @param flags a bitwise combination of zero or more optional flags * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) */ void pthreadpool_parallelize_2d_tile_2d( pthreadpool_t threadpool, pthreadpool_task_2d_tile_2d_t function, void* context, size_t range_i, size_t range_j, size_t tile_i, size_t tile_j, uint32_t flags); /** * Process items on a 2D grid with the specified maximum tile size along each * grid dimension using a microarchitecture-aware task function. * * The function implements a parallel version of the following snippet: * * uint32_t uarch_index = cpuinfo_initialize() ? * cpuinfo_get_current_uarch_index() : default_uarch_index; * if (uarch_index > max_uarch_index) uarch_index = default_uarch_index; * for (size_t i = 0; i < range_i; i += tile_i) * for (size_t j = 0; j < range_j; j += tile_j) * function(context, uarch_index, i, j, * min(range_i - i, tile_i), min(range_j - j, tile_j)); * * When the function returns, all items have been processed and the thread pool * is ready for a new task. * * @note If multiple threads call this function with the same thread pool, the * calls are serialized. * * @param threadpool the thread pool to use for parallelisation. If * threadpool is NULL, all items are processed serially on the calling * thread. * @param function the function to call for each tile. 
* @param context the first argument passed to the specified * function. * @param default_uarch_index the microarchitecture index to use when * pthreadpool is configured without cpuinfo, * cpuinfo initialization failed, or index returned * by cpuinfo_get_current_uarch_index() exceeds * the max_uarch_index value. * @param max_uarch_index the maximum microarchitecture index expected * by the specified function. If the index returned * by cpuinfo_get_current_uarch_index() exceeds this * value, default_uarch_index will be used instead. * default_uarch_index can exceed max_uarch_index. * @param range_i the number of items to process along the first * dimension of the 2D grid. * @param range_j the number of items to process along the second * dimension of the 2D grid. * @param tile_j the maximum number of items along the first * dimension of the 2D grid to process in one function call. * @param tile_j the maximum number of items along the second * dimension of the 2D grid to process in one function call. * @param flags a bitwise combination of zero or more optional * flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or * PTHREADPOOL_FLAG_YIELD_WORKERS) */ void pthreadpool_parallelize_2d_tile_2d_with_uarch( pthreadpool_t threadpool, pthreadpool_task_2d_tile_2d_with_id_t function, void* context, uint32_t default_uarch_index, uint32_t max_uarch_index, size_t range_i, size_t range_j, size_t tile_i, size_t tile_j, uint32_t flags); /** * Process items on a 3D grid. * * The function implements a parallel version of the following snippet: * * for (size_t i = 0; i < range_i; i++) * for (size_t j = 0; j < range_j; j++) * for (size_t k = 0; k < range_k; k++) * function(context, i, j, k); * * When the function returns, all items have been processed and the thread pool * is ready for a new task. * * @note If multiple threads call this function with the same thread pool, the * calls are serialized. * * @param threadpool the thread pool to use for parallelisation. 
If threadpool * is NULL, all items are processed serially on the calling thread. * @param function the function to call for each tile. * @param context the first argument passed to the specified function. * @param range_i the number of items to process along the first dimension * of the 3D grid. * @param range_j the number of items to process along the second dimension * of the 3D grid. * @param range_k the number of items to process along the third dimension * of the 3D grid. * @param flags a bitwise combination of zero or more optional flags * (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS) */ void pthreadpool_parallelize_3d( pthreadpool_t threadpool, pthreadpool_task_3d_t function, void* context, size_t range_i, size_t range_j, size_t range_k, uint32_t flags); /** * Process items on a 3D grid with the specified maximum tile size along the * last grid dimension. * * The function implements a parallel version of the following snippet: * * for (size_t i = 0; i < range_i; i++) * for (size_t j = 0; j < range_j; j++) * for (size_t k = 0; k < range_k; k += tile_k) * function(context, i, j, k, min(range_k - k, tile_k)); * * When the function returns, all items have been processed and the thread pool * is ready for a new task. * * @note If multiple threads call this function with the same thread pool, the * calls are serialized. * * @param threadpool the thread pool to use for parallelisation. If threadpool * is NULL, all items are processed serially on the calling thread. * @param function the function to call for each tile. * @param context the first argument passed to the specified function. * @param range_i the number of items to process along the first dimension * of the 3D grid. * @param range_j the number of items to process along the second dimension * of the 3D grid. * @param range_k the number of items to process along the third dimension * of the 3D grid. 
* @param tile_k the maximum number of items along the third dimension of
 *    the 3D grid to process in one function call.
 * @param flags a bitwise combination of zero or more optional flags
 *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
 */
void pthreadpool_parallelize_3d_tile_1d(
    pthreadpool_t threadpool,
    pthreadpool_task_3d_tile_1d_t function,
    void* context,
    size_t range_i,
    size_t range_j,
    size_t range_k,
    size_t tile_k,
    uint32_t flags);

/**
 * Process items on a 3D grid with the specified maximum tile size along the
 * last two grid dimensions.
 *
 * The function implements a parallel version of the following snippet:
 *
 *   for (size_t i = 0; i < range_i; i++)
 *     for (size_t j = 0; j < range_j; j += tile_j)
 *       for (size_t k = 0; k < range_k; k += tile_k)
 *         function(context, i, j, k,
 *           min(range_j - j, tile_j), min(range_k - k, tile_k));
 *
 * When the function returns, all items have been processed and the thread pool
 * is ready for a new task.
 *
 * @note If multiple threads call this function with the same thread pool, the
 *    calls are serialized.
 *
 * @param threadpool the thread pool to use for parallelisation. If threadpool
 *    is NULL, all items are processed serially on the calling thread.
 * @param function the function to call for each tile.
 * @param context the first argument passed to the specified function.
 * @param range_i the number of items to process along the first dimension
 *    of the 3D grid.
 * @param range_j the number of items to process along the second dimension
 *    of the 3D grid.
 * @param range_k the number of items to process along the third dimension
 *    of the 3D grid.
 * @param tile_j the maximum number of items along the second dimension of
 *    the 3D grid to process in one function call.
 * @param tile_k the maximum number of items along the third dimension of
 *    the 3D grid to process in one function call.
* @param flags a bitwise combination of zero or more optional flags
 *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
 */
void pthreadpool_parallelize_3d_tile_2d(
    pthreadpool_t threadpool,
    pthreadpool_task_3d_tile_2d_t function,
    void* context,
    size_t range_i,
    size_t range_j,
    size_t range_k,
    size_t tile_j,
    size_t tile_k,
    uint32_t flags);

/**
 * Process items on a 3D grid with the specified maximum tile size along the
 * last two grid dimensions using a microarchitecture-aware task function.
 *
 * The function implements a parallel version of the following snippet:
 *
 *   uint32_t uarch_index = cpuinfo_initialize() ?
 *       cpuinfo_get_current_uarch_index() : default_uarch_index;
 *   if (uarch_index > max_uarch_index) uarch_index = default_uarch_index;
 *   for (size_t i = 0; i < range_i; i++)
 *     for (size_t j = 0; j < range_j; j += tile_j)
 *       for (size_t k = 0; k < range_k; k += tile_k)
 *         function(context, uarch_index, i, j, k,
 *           min(range_j - j, tile_j), min(range_k - k, tile_k));
 *
 * When the function returns, all items have been processed and the thread pool
 * is ready for a new task.
 *
 * @note If multiple threads call this function with the same thread pool, the
 *    calls are serialized.
 *
 * @param threadpool the thread pool to use for parallelisation. If
 *    threadpool is NULL, all items are processed serially on the calling
 *    thread.
 * @param function the function to call for each tile.
 * @param context the first argument passed to the specified
 *    function.
 * @param default_uarch_index the microarchitecture index to use when
 *    pthreadpool is configured without cpuinfo, cpuinfo initialization failed,
 *    or index returned by cpuinfo_get_current_uarch_index() exceeds the
 *    max_uarch_index value.
 * @param max_uarch_index the maximum microarchitecture index expected by
 *    the specified function. If the index returned by
 *    cpuinfo_get_current_uarch_index() exceeds this value, default_uarch_index
 *    will be used instead. default_uarch_index can exceed max_uarch_index.
* @param range_i the number of items to process along the first * dimension of the 3D grid. * @param range_j the number of items to process along the second * dimension of the 3D grid. * @param range_k the number of items to process along the third * dimension of the 3D grid. * @param tile_j the maximum number of items along the second * dimension of the 3D grid to process in one function call. * @param tile_k the maximum number of items along the third * dimension of the 3D grid to process in one function call. * @param flags a bitwise combination of zero or more optional * flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or * PTHREADPOOL_FLAG_YIELD_WORKERS) */ void pthreadpool_parallelize_3d_tile_2d_with_uarch( pthreadpool_t threadpool, pthreadpool_task_3d_tile_2d_with_id_t function, void* context, uint32_t default_uarch_index, uint32_t max_uarch_index, size_t range_i, size_t range_j, size_t range_k, size_t tile_j, size_t tile_k, uint32_t flags); /** * Process items on a 4D grid. * * The function implements a parallel version of the following snippet: * * for (size_t i = 0; i < range_i; i++) * for (size_t j = 0; j < range_j; j++) * for (size_t k = 0; k < range_k; k++) * for (size_t l = 0; l < range_l; l++) * function(context, i, j, k, l); * * When the function returns, all items have been processed and the thread pool * is ready for a new task. * * @note If multiple threads call this function with the same thread pool, the * calls are serialized. * * @param threadpool the thread pool to use for parallelisation. If threadpool * is NULL, all items are processed serially on the calling thread. * @param function the function to call for each tile. * @param context the first argument passed to the specified function. * @param range_i the number of items to process along the first dimension * of the 4D grid. * @param range_j the number of items to process along the second dimension * of the 4D grid. 
* @param range_k the number of items to process along the third dimension
 *    of the 4D grid.
 * @param range_l the number of items to process along the fourth dimension
 *    of the 4D grid.
 * @param flags a bitwise combination of zero or more optional flags
 *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
 */
void pthreadpool_parallelize_4d(
    pthreadpool_t threadpool,
    pthreadpool_task_4d_t function,
    void* context,
    size_t range_i,
    size_t range_j,
    size_t range_k,
    size_t range_l,
    uint32_t flags);

/**
 * Process items on a 4D grid with the specified maximum tile size along the
 * last grid dimension.
 *
 * The function implements a parallel version of the following snippet:
 *
 *   for (size_t i = 0; i < range_i; i++)
 *     for (size_t j = 0; j < range_j; j++)
 *       for (size_t k = 0; k < range_k; k++)
 *         for (size_t l = 0; l < range_l; l += tile_l)
 *           function(context, i, j, k, l, min(range_l - l, tile_l));
 *
 * When the function returns, all items have been processed and the thread pool
 * is ready for a new task.
 *
 * @note If multiple threads call this function with the same thread pool, the
 *    calls are serialized.
 *
 * @param threadpool the thread pool to use for parallelisation. If threadpool
 *    is NULL, all items are processed serially on the calling thread.
 * @param function the function to call for each tile.
 * @param context the first argument passed to the specified function.
 * @param range_i the number of items to process along the first dimension
 *    of the 4D grid.
 * @param range_j the number of items to process along the second dimension
 *    of the 4D grid.
 * @param range_k the number of items to process along the third dimension
 *    of the 4D grid.
 * @param range_l the number of items to process along the fourth dimension
 *    of the 4D grid.
 * @param tile_l the maximum number of items along the fourth dimension of
 *    the 4D grid to process in one function call.
* @param flags a bitwise combination of zero or more optional flags
 *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
 */
void pthreadpool_parallelize_4d_tile_1d(
    pthreadpool_t threadpool,
    pthreadpool_task_4d_tile_1d_t function,
    void* context,
    size_t range_i,
    size_t range_j,
    size_t range_k,
    size_t range_l,
    size_t tile_l,
    uint32_t flags);

/**
 * Process items on a 4D grid with the specified maximum tile size along the
 * last two grid dimensions.
 *
 * The function implements a parallel version of the following snippet:
 *
 *   for (size_t i = 0; i < range_i; i++)
 *     for (size_t j = 0; j < range_j; j++)
 *       for (size_t k = 0; k < range_k; k += tile_k)
 *         for (size_t l = 0; l < range_l; l += tile_l)
 *           function(context, i, j, k, l,
 *             min(range_k - k, tile_k), min(range_l - l, tile_l));
 *
 * When the function returns, all items have been processed and the thread pool
 * is ready for a new task.
 *
 * @note If multiple threads call this function with the same thread pool, the
 *    calls are serialized.
 *
 * @param threadpool the thread pool to use for parallelisation. If threadpool
 *    is NULL, all items are processed serially on the calling thread.
 * @param function the function to call for each tile.
 * @param context the first argument passed to the specified function.
 * @param range_i the number of items to process along the first dimension
 *    of the 4D grid.
 * @param range_j the number of items to process along the second dimension
 *    of the 4D grid.
 * @param range_k the number of items to process along the third dimension
 *    of the 4D grid.
 * @param range_l the number of items to process along the fourth dimension
 *    of the 4D grid.
 * @param tile_k the maximum number of items along the third dimension of
 *    the 4D grid to process in one function call.
 * @param tile_l the maximum number of items along the fourth dimension of
 *    the 4D grid to process in one function call.
* @param flags a bitwise combination of zero or more optional flags
 *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
 */
void pthreadpool_parallelize_4d_tile_2d(
    pthreadpool_t threadpool,
    pthreadpool_task_4d_tile_2d_t function,
    void* context,
    size_t range_i,
    size_t range_j,
    size_t range_k,
    size_t range_l,
    size_t tile_k,
    size_t tile_l,
    uint32_t flags);

/**
 * Process items on a 4D grid with the specified maximum tile size along the
 * last two grid dimensions using a microarchitecture-aware task function.
 *
 * The function implements a parallel version of the following snippet:
 *
 *   uint32_t uarch_index = cpuinfo_initialize() ?
 *       cpuinfo_get_current_uarch_index() : default_uarch_index;
 *   if (uarch_index > max_uarch_index) uarch_index = default_uarch_index;
 *   for (size_t i = 0; i < range_i; i++)
 *     for (size_t j = 0; j < range_j; j++)
 *       for (size_t k = 0; k < range_k; k += tile_k)
 *         for (size_t l = 0; l < range_l; l += tile_l)
 *           function(context, uarch_index, i, j, k, l,
 *             min(range_k - k, tile_k), min(range_l - l, tile_l));
 *
 * When the function returns, all items have been processed and the thread pool
 * is ready for a new task.
 *
 * @note If multiple threads call this function with the same thread pool, the
 *    calls are serialized.
 *
 * @param threadpool the thread pool to use for parallelisation. If
 *    threadpool is NULL, all items are processed serially on the calling
 *    thread.
 * @param function the function to call for each tile.
 * @param context the first argument passed to the specified
 *    function.
 * @param default_uarch_index the microarchitecture index to use when
 *    pthreadpool is configured without cpuinfo, cpuinfo initialization failed,
 *    or index returned by cpuinfo_get_current_uarch_index() exceeds the
 *    max_uarch_index value.
 * @param max_uarch_index the maximum microarchitecture index expected by
 *    the specified function.
If the index returned by * cpuinfo_get_current_uarch_index() exceeds this value, default_uarch_index * will be used instead. default_uarch_index can exceed max_uarch_index. * @param range_i the number of items to process along the first * dimension of the 4D grid. * @param range_j the number of items to process along the second * dimension of the 4D grid. * @param range_k the number of items to process along the third * dimension of the 4D grid. * @param range_l the number of items to process along the fourth * dimension of the 4D grid. * @param tile_k the maximum number of items along the third * dimension of the 4D grid to process in one function call. * @param tile_l the maximum number of items along the fourth * dimension of the 4D grid to process in one function call. * @param flags a bitwise combination of zero or more optional * flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or * PTHREADPOOL_FLAG_YIELD_WORKERS) */ void pthreadpool_parallelize_4d_tile_2d_with_uarch( pthreadpool_t threadpool, pthreadpool_task_4d_tile_2d_with_id_t function, void* context, uint32_t default_uarch_index, uint32_t max_uarch_index, size_t range_i, size_t range_j, size_t range_k, size_t range_l, size_t tile_k, size_t tile_l, uint32_t flags); /** * Process items on a 5D grid. * * The function implements a parallel version of the following snippet: * * for (size_t i = 0; i < range_i; i++) * for (size_t j = 0; j < range_j; j++) * for (size_t k = 0; k < range_k; k++) * for (size_t l = 0; l < range_l; l++) * for (size_t m = 0; m < range_m; m++) * function(context, i, j, k, l, m); * * When the function returns, all items have been processed and the thread pool * is ready for a new task. * * @note If multiple threads call this function with the same thread pool, the * calls are serialized. * * @param threadpool the thread pool to use for parallelisation. If threadpool * is NULL, all items are processed serially on the calling thread. * @param function the function to call for each tile. 
* @param context the first argument passed to the specified function.
 * @param range_i the number of items to process along the first dimension
 *    of the 5D grid.
 * @param range_j the number of items to process along the second dimension
 *    of the 5D grid.
 * @param range_k the number of items to process along the third dimension
 *    of the 5D grid.
 * @param range_l the number of items to process along the fourth dimension
 *    of the 5D grid.
 * @param range_m the number of items to process along the fifth dimension
 *    of the 5D grid.
 * @param flags a bitwise combination of zero or more optional flags
 *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
 */
void pthreadpool_parallelize_5d(
    pthreadpool_t threadpool,
    pthreadpool_task_5d_t function,
    void* context,
    size_t range_i,
    size_t range_j,
    size_t range_k,
    size_t range_l,
    size_t range_m,
    uint32_t flags);

/**
 * Process items on a 5D grid with the specified maximum tile size along the
 * last grid dimension.
 *
 * The function implements a parallel version of the following snippet:
 *
 *   for (size_t i = 0; i < range_i; i++)
 *     for (size_t j = 0; j < range_j; j++)
 *       for (size_t k = 0; k < range_k; k++)
 *         for (size_t l = 0; l < range_l; l++)
 *           for (size_t m = 0; m < range_m; m += tile_m)
 *             function(context, i, j, k, l, m, min(range_m - m, tile_m));
 *
 * When the function returns, all items have been processed and the thread pool
 * is ready for a new task.
 *
 * @note If multiple threads call this function with the same thread pool, the
 *    calls are serialized.
 *
 * @param threadpool the thread pool to use for parallelisation. If threadpool
 *    is NULL, all items are processed serially on the calling thread.
 * @param function the function to call for each tile.
 * @param context the first argument passed to the specified function.
 * @param range_i the number of items to process along the first dimension
 *    of the 5D grid.
 * @param range_j the number of items to process along the second dimension
 *    of the 5D grid.
* @param range_k     the number of items to process along the third dimension
 *    of the 5D grid.
 * @param range_l     the number of items to process along the fourth dimension
 *    of the 5D grid.
 * @param range_m     the number of items to process along the fifth dimension
 *    of the 5D grid.
 * @param tile_m      the maximum number of items along the fifth dimension of
 *    the 5D grid to process in one function call.
 * @param flags       a bitwise combination of zero or more optional flags
 *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
 */
void pthreadpool_parallelize_5d_tile_1d(
    pthreadpool_t threadpool,
    pthreadpool_task_5d_tile_1d_t function,
    void* context,
    size_t range_i,
    size_t range_j,
    size_t range_k,
    size_t range_l,
    size_t range_m,
    size_t tile_m,
    uint32_t flags);

/**
 * Process items on a 5D grid with the specified maximum tile size along the
 * last two grid dimensions.
 *
 * The function implements a parallel version of the following snippet:
 *
 *   for (size_t i = 0; i < range_i; i++)
 *     for (size_t j = 0; j < range_j; j++)
 *       for (size_t k = 0; k < range_k; k++)
 *         for (size_t l = 0; l < range_l; l += tile_l)
 *           for (size_t m = 0; m < range_m; m += tile_m)
 *             function(context, i, j, k, l, m,
 *               min(range_l - l, tile_l), min(range_m - m, tile_m));
 *
 * When the function returns, all items have been processed and the thread pool
 * is ready for a new task.
 *
 * @note If multiple threads call this function with the same thread pool, the
 *    calls are serialized.
 *
 * @param threadpool  the thread pool to use for parallelisation. If threadpool
 *    is NULL, all items are processed serially on the calling thread.
 * @param function    the function to call for each tile.
 * @param context     the first argument passed to the specified function.
 * @param range_i     the number of items to process along the first dimension
 *    of the 5D grid.
 * @param range_j     the number of items to process along the second dimension
 *    of the 5D grid.
* @param range_k     the number of items to process along the third dimension
 *    of the 5D grid.
 * @param range_l     the number of items to process along the fourth dimension
 *    of the 5D grid.
 * @param range_m     the number of items to process along the fifth dimension
 *    of the 5D grid.
 * @param tile_l      the maximum number of items along the fourth dimension of
 *    the 5D grid to process in one function call.
 * @param tile_m      the maximum number of items along the fifth dimension of
 *    the 5D grid to process in one function call.
 * @param flags       a bitwise combination of zero or more optional flags
 *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
 */
void pthreadpool_parallelize_5d_tile_2d(
    pthreadpool_t threadpool,
    pthreadpool_task_5d_tile_2d_t function,
    void* context,
    size_t range_i,
    size_t range_j,
    size_t range_k,
    size_t range_l,
    size_t range_m,
    size_t tile_l,
    size_t tile_m,
    uint32_t flags);

/**
 * Process items on a 6D grid.
 *
 * The function implements a parallel version of the following snippet:
 *
 *   for (size_t i = 0; i < range_i; i++)
 *     for (size_t j = 0; j < range_j; j++)
 *       for (size_t k = 0; k < range_k; k++)
 *         for (size_t l = 0; l < range_l; l++)
 *           for (size_t m = 0; m < range_m; m++)
 *             for (size_t n = 0; n < range_n; n++)
 *               function(context, i, j, k, l, m, n);
 *
 * When the function returns, all items have been processed and the thread pool
 * is ready for a new task.
 *
 * @note If multiple threads call this function with the same thread pool, the
 *    calls are serialized.
 *
 * @param threadpool  the thread pool to use for parallelisation. If threadpool
 *    is NULL, all items are processed serially on the calling thread.
 * @param function    the function to call for each item.
 * @param context     the first argument passed to the specified function.
 * @param range_i     the number of items to process along the first dimension
 *    of the 6D grid.
 * @param range_j     the number of items to process along the second dimension
 *    of the 6D grid.
 * @param range_k     the number of items to process along the third dimension
 *    of the 6D grid.
 * @param range_l     the number of items to process along the fourth dimension
 *    of the 6D grid.
 * @param range_m     the number of items to process along the fifth dimension
 *    of the 6D grid.
 * @param range_n     the number of items to process along the sixth dimension
 *    of the 6D grid.
 * @param flags       a bitwise combination of zero or more optional flags
 *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
 */
void pthreadpool_parallelize_6d(
    pthreadpool_t threadpool,
    pthreadpool_task_6d_t function,
    void* context,
    size_t range_i,
    size_t range_j,
    size_t range_k,
    size_t range_l,
    size_t range_m,
    size_t range_n,
    uint32_t flags);

/**
 * Process items on a 6D grid with the specified maximum tile size along the
 * last grid dimension.
 *
 * The function implements a parallel version of the following snippet:
 *
 *   for (size_t i = 0; i < range_i; i++)
 *     for (size_t j = 0; j < range_j; j++)
 *       for (size_t k = 0; k < range_k; k++)
 *         for (size_t l = 0; l < range_l; l++)
 *           for (size_t m = 0; m < range_m; m++)
 *             for (size_t n = 0; n < range_n; n += tile_n)
 *               function(context, i, j, k, l, m, n, min(range_n - n, tile_n));
 *
 * When the function returns, all items have been processed and the thread pool
 * is ready for a new task.
 *
 * @note If multiple threads call this function with the same thread pool, the
 *    calls are serialized.
 *
 * @param threadpool  the thread pool to use for parallelisation. If threadpool
 *    is NULL, all items are processed serially on the calling thread.
 * @param function    the function to call for each tile.
 * @param context     the first argument passed to the specified function.
 * @param range_i     the number of items to process along the first dimension
 *    of the 6D grid.
* @param range_j     the number of items to process along the second dimension
 *    of the 6D grid.
 * @param range_k     the number of items to process along the third dimension
 *    of the 6D grid.
 * @param range_l     the number of items to process along the fourth dimension
 *    of the 6D grid.
 * @param range_m     the number of items to process along the fifth dimension
 *    of the 6D grid.
 * @param range_n     the number of items to process along the sixth dimension
 *    of the 6D grid.
 * @param tile_n      the maximum number of items along the sixth dimension of
 *    the 6D grid to process in one function call.
 * @param flags       a bitwise combination of zero or more optional flags
 *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
 */
void pthreadpool_parallelize_6d_tile_1d(
    pthreadpool_t threadpool,
    pthreadpool_task_6d_tile_1d_t function,
    void* context,
    size_t range_i,
    size_t range_j,
    size_t range_k,
    size_t range_l,
    size_t range_m,
    size_t range_n,
    size_t tile_n,
    uint32_t flags);

/**
 * Process items on a 6D grid with the specified maximum tile size along the
 * last two grid dimensions.
 *
 * The function implements a parallel version of the following snippet:
 *
 *   for (size_t i = 0; i < range_i; i++)
 *     for (size_t j = 0; j < range_j; j++)
 *       for (size_t k = 0; k < range_k; k++)
 *         for (size_t l = 0; l < range_l; l++)
 *           for (size_t m = 0; m < range_m; m += tile_m)
 *             for (size_t n = 0; n < range_n; n += tile_n)
 *               function(context, i, j, k, l, m, n,
 *                 min(range_m - m, tile_m), min(range_n - n, tile_n));
 *
 * When the function returns, all items have been processed and the thread pool
 * is ready for a new task.
 *
 * @note If multiple threads call this function with the same thread pool, the
 *    calls are serialized.
 *
 * @param threadpool  the thread pool to use for parallelisation. If threadpool
 *    is NULL, all items are processed serially on the calling thread.
 * @param function    the function to call for each tile.
 * @param context     the first argument passed to the specified function.
* @param range_i     the number of items to process along the first dimension
 *    of the 6D grid.
 * @param range_j     the number of items to process along the second dimension
 *    of the 6D grid.
 * @param range_k     the number of items to process along the third dimension
 *    of the 6D grid.
 * @param range_l     the number of items to process along the fourth dimension
 *    of the 6D grid.
 * @param range_m     the number of items to process along the fifth dimension
 *    of the 6D grid.
 * @param range_n     the number of items to process along the sixth dimension
 *    of the 6D grid.
 * @param tile_m      the maximum number of items along the fifth dimension of
 *    the 6D grid to process in one function call.
 * @param tile_n      the maximum number of items along the sixth dimension of
 *    the 6D grid to process in one function call.
 * @param flags       a bitwise combination of zero or more optional flags
 *    (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
 */
void pthreadpool_parallelize_6d_tile_2d(
    pthreadpool_t threadpool,
    pthreadpool_task_6d_tile_2d_t function,
    void* context,
    size_t range_i,
    size_t range_j,
    size_t range_k,
    size_t range_l,
    size_t range_m,
    size_t range_n,
    size_t tile_m,
    size_t tile_n,
    uint32_t flags);

/**
 * Terminates threads in the thread pool and releases associated resources.
 *
 * @warning Accessing the thread pool after a call to this function constitutes
 *    undefined behaviour and may cause data corruption.
 *
 * @param[in,out]  threadpool  The thread pool to destroy.
 */
void pthreadpool_destroy(pthreadpool_t threadpool);

#ifndef PTHREADPOOL_NO_DEPRECATED_API

/* Legacy API for compatibility with pre-existing users (e.g.
NNPACK) */

/*
 * PTHREADPOOL_DEPRECATED is appended to every legacy declaration in this
 * section. It expands to the `deprecated` attribute on compilers that define
 * __GNUC__ and to nothing elsewhere.
 */
#if defined(__GNUC__)
#define PTHREADPOOL_DEPRECATED __attribute__((__deprecated__))
#else
#define PTHREADPOOL_DEPRECATED
#endif

/*
 * Callback signatures accepted by the legacy pthreadpool_compute_* entry
 * points declared below. The first parameter of each is an untyped pointer;
 * the remaining parameters are all size_t.
 */
typedef void (*pthreadpool_function_1d_t)(void*, size_t) PTHREADPOOL_DEPRECATED;
typedef void (*pthreadpool_function_1d_tiled_t)(void*, size_t, size_t) PTHREADPOOL_DEPRECATED;
typedef void (*pthreadpool_function_2d_t)(void*, size_t, size_t) PTHREADPOOL_DEPRECATED;
typedef void (*pthreadpool_function_2d_tiled_t)(void*, size_t, size_t, size_t, size_t) PTHREADPOOL_DEPRECATED;
typedef void (*pthreadpool_function_3d_tiled_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t) PTHREADPOOL_DEPRECATED;
typedef void (*pthreadpool_function_4d_tiled_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t, size_t, size_t) PTHREADPOOL_DEPRECATED;

/*
 * Deprecated entry points. Each takes a thread pool, a legacy callback, an
 * opaque argument pointer and the range (plus tile sizes for the *_tiled
 * variants). Define PTHREADPOOL_NO_DEPRECATED_API before including this
 * header to hide all declarations in this section.
 */
void pthreadpool_compute_1d(
    pthreadpool_t threadpool,
    pthreadpool_function_1d_t function,
    void* argument,
    size_t range) PTHREADPOOL_DEPRECATED;

void pthreadpool_compute_1d_tiled(
    pthreadpool_t threadpool,
    pthreadpool_function_1d_tiled_t function,
    void* argument,
    size_t range,
    size_t tile) PTHREADPOOL_DEPRECATED;

void pthreadpool_compute_2d(
    pthreadpool_t threadpool,
    pthreadpool_function_2d_t function,
    void* argument,
    size_t range_i,
    size_t range_j) PTHREADPOOL_DEPRECATED;

void pthreadpool_compute_2d_tiled(
    pthreadpool_t threadpool,
    pthreadpool_function_2d_tiled_t function,
    void* argument,
    size_t range_i,
    size_t range_j,
    size_t tile_i,
    size_t tile_j) PTHREADPOOL_DEPRECATED;

void pthreadpool_compute_3d_tiled(
    pthreadpool_t threadpool,
    pthreadpool_function_3d_tiled_t function,
    void* argument,
    size_t range_i,
    size_t range_j,
    size_t range_k,
    size_t tile_i,
    size_t tile_j,
    size_t tile_k) PTHREADPOOL_DEPRECATED;

void pthreadpool_compute_4d_tiled(
    pthreadpool_t threadpool,
    pthreadpool_function_4d_tiled_t function,
    void* argument,
    size_t range_i,
    size_t range_j,
    size_t range_k,
    size_t range_l,
    size_t tile_i,
    size_t tile_j,
    size_t tile_k,
    size_t tile_l) PTHREADPOOL_DEPRECATED;

#endif /* PTHREADPOOL_NO_DEPRECATED_API */

#ifdef __cplusplus
} /* extern "C" */
#endif

#endif /* PTHREADPOOL_H_ */