diff options
-rw-r--r-- | CMakeLists.txt | 6 | ||||
-rw-r--r-- | bench/latency.cc | 65 | ||||
-rw-r--r-- | bench/throughput.cc | 51 | ||||
-rw-r--r-- | cmake/DownloadGoogleTest.cmake | 4 | ||||
-rwxr-xr-x | configure.py | 7 | ||||
-rw-r--r-- | include/pthreadpool.h | 166 | ||||
-rw-r--r-- | src/threadpool-legacy.c | 235 | ||||
-rw-r--r-- | src/threadpool-pthreads.c | 538 | ||||
-rw-r--r-- | src/threadpool-shim.c | 140 | ||||
-rw-r--r-- | src/threadpool-utils.h | 62 | ||||
-rw-r--r-- | test/pthreadpool.cc | 2871 |
11 files changed, 3863 insertions, 282 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index 2d66c07..0622cb9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -65,10 +65,11 @@ IF(PTHREADPOOL_BUILD_BENCHMARKS AND NOT DEFINED GOOGLEBENCHMARK_SOURCE_DIR) ENDIF() # ---[ pthreadpool library +SET(PTHREADPOOL_SRCS src/threadpool-legacy.c) IF(CMAKE_SYSTEM_NAME STREQUAL "Emscripten") - SET(PTHREADPOOL_SRCS src/threadpool-shim.c) + LIST(APPEND PTHREADPOOL_SRCS src/threadpool-shim.c) ELSE() - SET(PTHREADPOOL_SRCS src/threadpool-pthreads.c) + LIST(APPEND PTHREADPOOL_SRCS src/threadpool-pthreads.c) ENDIF() IF(${CMAKE_VERSION} VERSION_LESS "3.0") @@ -92,6 +93,7 @@ ENDIF() PTHREADPOOL_TARGET_ENABLE_C99(pthreadpool) TARGET_LINK_LIBRARIES(pthreadpool PUBLIC pthreadpool_interface) +TARGET_INCLUDE_DIRECTORIES(pthreadpool PRIVATE src) IF(NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten") SET(CMAKE_THREAD_PREFER_PTHREAD TRUE) IF(NOT CMAKE_GENERATOR STREQUAL "Xcode") diff --git a/bench/latency.cc b/bench/latency.cc index f20a794..f500cdf 100644 --- a/bench/latency.cc +++ b/bench/latency.cc @@ -6,67 +6,88 @@ static void SetNumberOfThreads(benchmark::internal::Benchmark* benchmark) { - const int maxThreads = sysconf(_SC_NPROCESSORS_ONLN); - for (int t = 0; t <= maxThreads; t++) { + const int max_threads = sysconf(_SC_NPROCESSORS_ONLN); + for (int t = 1; t <= max_threads; t++) { benchmark->Arg(t); } } -static void compute_1d(void* context, size_t x) { +static void compute_1d(void*, size_t x) { } -static void pthreadpool_compute_1d(benchmark::State& state) { +static void pthreadpool_parallelize_1d(benchmark::State& state) { const uint32_t threads = static_cast<uint32_t>(state.range(0)); - pthreadpool_t threadpool = threads == 0 ? NULL : pthreadpool_create(threads); + pthreadpool_t threadpool = pthreadpool_create(threads); while (state.KeepRunning()) { - pthreadpool_compute_1d(threadpool, compute_1d, NULL, threads); + pthreadpool_parallelize_1d( + threadpool, + compute_1d, + nullptr /* context */, + threads, + 0 /* flags */); } pthreadpool_destroy(threadpool); } -BENCHMARK(pthreadpool_compute_1d)->UseRealTime()->Apply(SetNumberOfThreads); +BENCHMARK(pthreadpool_parallelize_1d)->UseRealTime()->Apply(SetNumberOfThreads); -static void compute_1d_tiled(void* context, size_t x0, size_t xn) { +static void compute_1d_tile_1d(void*, size_t, size_t) { } -static void pthreadpool_compute_1d_tiled(benchmark::State& state) { +static void pthreadpool_parallelize_1d_tile_1d(benchmark::State& state) { const uint32_t threads = static_cast<uint32_t>(state.range(0)); - pthreadpool_t threadpool = threads == 0 ? NULL : pthreadpool_create(threads); + pthreadpool_t threadpool = pthreadpool_create(threads); while (state.KeepRunning()) { - pthreadpool_compute_1d_tiled(threadpool, compute_1d_tiled, NULL, threads, 1); + pthreadpool_parallelize_1d_tile_1d( + threadpool, + compute_1d_tile_1d, + nullptr /* context */, + threads, 1, + 0 /* flags */); } pthreadpool_destroy(threadpool); } -BENCHMARK(pthreadpool_compute_1d_tiled)->UseRealTime()->Apply(SetNumberOfThreads); +BENCHMARK(pthreadpool_parallelize_1d_tile_1d)->UseRealTime()->Apply(SetNumberOfThreads); -static void compute_2d(void* context, size_t x, size_t y) { +static void compute_2d(void*, size_t, size_t) { } -static void pthreadpool_compute_2d(benchmark::State& state) { +static void pthreadpool_parallelize_2d(benchmark::State& state) { const uint32_t threads = static_cast<uint32_t>(state.range(0)); - pthreadpool_t threadpool = threads == 0 ? NULL : pthreadpool_create(threads); + pthreadpool_t threadpool = pthreadpool_create(threads); while (state.KeepRunning()) { - pthreadpool_compute_2d(threadpool, compute_2d, NULL, 1, threads); + pthreadpool_parallelize_2d( + threadpool, + compute_2d, + nullptr /* context */, + 1, threads, + 0 /* flags */); } pthreadpool_destroy(threadpool); } -BENCHMARK(pthreadpool_compute_2d)->UseRealTime()->Apply(SetNumberOfThreads); +BENCHMARK(pthreadpool_parallelize_2d)->UseRealTime()->Apply(SetNumberOfThreads); -static void compute_2d_tiled(void* context, size_t x0, size_t y0, size_t xn, size_t yn) { +static void compute_2d_tile_2d(void*, size_t, size_t, size_t, size_t) { } -static void pthreadpool_compute_2d_tiled(benchmark::State& state) { +static void pthreadpool_parallelize_2d_tile_2d(benchmark::State& state) { const uint32_t threads = static_cast<uint32_t>(state.range(0)); - pthreadpool_t threadpool = threads == 0 ? NULL : pthreadpool_create(threads); + pthreadpool_t threadpool = pthreadpool_create(threads); while (state.KeepRunning()) { - pthreadpool_compute_2d_tiled(threadpool, compute_2d_tiled, NULL, 1, threads, 1, 1); + pthreadpool_parallelize_2d_tile_2d( + threadpool, + compute_2d_tile_2d, + nullptr /* context */, + 1, threads, + 1, 1, + 0 /* flags */); } pthreadpool_destroy(threadpool); } -BENCHMARK(pthreadpool_compute_2d_tiled)->UseRealTime()->Apply(SetNumberOfThreads); +BENCHMARK(pthreadpool_parallelize_2d_tile_2d)->UseRealTime()->Apply(SetNumberOfThreads); BENCHMARK_MAIN(); diff --git a/bench/throughput.cc b/bench/throughput.cc index cef3442..2242ccb 100644 --- a/bench/throughput.cc +++ b/bench/throughput.cc @@ -1,80 +1,99 @@ #include <benchmark/benchmark.h> -#include <unistd.h> - #include <pthreadpool.h> -static void compute_1d(void* context, size_t x) { +static void compute_1d(void*, size_t) { } -static void pthreadpool_compute_1d(benchmark::State& state) { +static void pthreadpool_parallelize_1d(benchmark::State& state) { pthreadpool_t threadpool = pthreadpool_create(0); const size_t threads = pthreadpool_get_threads_count(threadpool); const size_t items = static_cast<size_t>(state.range(0)); while (state.KeepRunning()) { - pthreadpool_compute_1d(threadpool, compute_1d, NULL, items * threads); + pthreadpool_parallelize_1d( + threadpool, + compute_1d, + nullptr /* context */, + items * threads, + 0 /* flags */); } pthreadpool_destroy(threadpool); /* Do not normalize by thread */ state.SetItemsProcessed(int64_t(state.iterations()) * items); } -BENCHMARK(pthreadpool_compute_1d)->UseRealTime()->RangeMultiplier(10)->Range(10, 1000000); +BENCHMARK(pthreadpool_parallelize_1d)->UseRealTime()->RangeMultiplier(10)->Range(10, 1000000); -static void compute_1d_tiled(void* context, size_t x0, size_t xn) { +static void compute_1d_tile_1d(void*, size_t, size_t) { } -static void pthreadpool_compute_1d_tiled(benchmark::State& state) { +static void pthreadpool_parallelize_1d_tile_1d(benchmark::State& state) { pthreadpool_t threadpool = pthreadpool_create(0); const size_t threads = pthreadpool_get_threads_count(threadpool); const size_t items = static_cast<size_t>(state.range(0)); while (state.KeepRunning()) { - pthreadpool_compute_1d_tiled(threadpool, compute_1d_tiled, NULL, items * threads, 1); + pthreadpool_parallelize_1d_tile_1d( + threadpool, + compute_1d_tile_1d, + nullptr /* context */, + items * threads, 1, + 0 /* flags */); } pthreadpool_destroy(threadpool); /* Do not normalize by thread */ state.SetItemsProcessed(int64_t(state.iterations()) * items); } -BENCHMARK(pthreadpool_compute_1d_tiled)->UseRealTime()->RangeMultiplier(10)->Range(10, 1000000); +BENCHMARK(pthreadpool_parallelize_1d_tile_1d)->UseRealTime()->RangeMultiplier(10)->Range(10, 1000000); static void compute_2d(void* context, size_t x, size_t y) { } -static void pthreadpool_compute_2d(benchmark::State& state) { +static void pthreadpool_parallelize_2d(benchmark::State& state) { pthreadpool_t threadpool = pthreadpool_create(0); const size_t threads = pthreadpool_get_threads_count(threadpool); const size_t items = static_cast<size_t>(state.range(0)); while (state.KeepRunning()) { - pthreadpool_compute_2d(threadpool, compute_2d, NULL, threads, items); + pthreadpool_parallelize_2d( + threadpool, + compute_2d, + nullptr /* context */, + threads, items, + 0 /* flags */); } pthreadpool_destroy(threadpool); /* Do not normalize by thread */ state.SetItemsProcessed(int64_t(state.iterations()) * items); } -BENCHMARK(pthreadpool_compute_2d)->UseRealTime()->RangeMultiplier(10)->Range(10, 1000000); +BENCHMARK(pthreadpool_parallelize_2d)->UseRealTime()->RangeMultiplier(10)->Range(10, 1000000); static void compute_2d_tiled(void* context, size_t x0, size_t y0, size_t xn, size_t yn) { } -static void pthreadpool_compute_2d_tiled(benchmark::State& state) { +static void pthreadpool_parallelize_2d_tile_2d(benchmark::State& state) { pthreadpool_t threadpool = pthreadpool_create(0); const size_t threads = pthreadpool_get_threads_count(threadpool); const size_t items = static_cast<size_t>(state.range(0)); while (state.KeepRunning()) { - pthreadpool_compute_2d_tiled(threadpool, compute_2d_tiled, NULL, threads, items, 1, 1); + pthreadpool_parallelize_2d_tile_2d( + threadpool, + compute_2d_tiled, + nullptr /* context */, + threads, items, + 1, 1, + 0 /* flags */); } pthreadpool_destroy(threadpool); /* Do not normalize by thread */ state.SetItemsProcessed(int64_t(state.iterations()) * items); } -BENCHMARK(pthreadpool_compute_2d_tiled)->UseRealTime()->RangeMultiplier(10)->Range(10, 1000000); +BENCHMARK(pthreadpool_parallelize_2d_tile_2d)->UseRealTime()->RangeMultiplier(10)->Range(10, 1000000); BENCHMARK_MAIN(); diff --git a/cmake/DownloadGoogleTest.cmake b/cmake/DownloadGoogleTest.cmake index d69d19a..40d66bc 100644 --- a/cmake/DownloadGoogleTest.cmake +++ b/cmake/DownloadGoogleTest.cmake @@ -4,8 +4,8 @@ PROJECT(googletest-download NONE) INCLUDE(ExternalProject) ExternalProject_Add(googletest - URL https://github.com/google/googletest/archive/release-1.8.0.zip - URL_HASH SHA256=f3ed3b58511efd272eb074a3a6d6fb79d7c2e6a0e374323d1e6bcbcc1ef141bf + URL https://github.com/google/googletest/archive/dc1ca9ae4c206434e450ed4ff535ca7c20c79e3c.zip + URL_HASH SHA256=d8376d6283e15ffd317646052233c88e2044cd61453619315e6fc139dc1b5d76 SOURCE_DIR "${CONFU_DEPENDENCIES_SOURCE_DIR}/googletest" BINARY_DIR "${CONFU_DEPENDENCIES_BINARY_DIR}/googletest" CONFIGURE_COMMAND "" diff --git a/configure.py b/configure.py index 75fa736..e0f0992 100755 --- a/configure.py +++ b/configure.py @@ -12,11 +12,12 @@ def main(args): build.export_cpath("include", ["pthreadpool.h"]) with build.options(source_dir="src", extra_include_dirs="src", deps=build.deps.fxdiv): + sources = ["threadpool-legacy.c"] if build.target.is_emscripten: - source = "threadpool-shim.c" + source.append("threadpool-shim.c") else: - source = "threadpool-pthreads.c" - build.static_library("pthreadpool", build.cc(source)) + source.append("threadpool-pthreads.c") + build.static_library("pthreadpool", [build.cc(src) for src in sources]) with build.options(source_dir="test", deps=[build, build.deps.googletest]): build.unittest("pthreadpool-test", build.cxx("pthreadpool.cc")) diff --git a/include/pthreadpool.h b/include/pthreadpool.h index a99105e..9f7f4b3 100644 --- a/include/pthreadpool.h +++ b/include/pthreadpool.h @@ -1,16 +1,23 @@ -#include <stddef.h> +#ifndef PTHREADPOOL_H_ +#define PTHREADPOOL_H_ -#ifndef PTHREADPOOL_H -#define PTHREADPOOL_H +#include <stddef.h> +#include <stdint.h> typedef struct pthreadpool* pthreadpool_t; -typedef void (*pthreadpool_function_1d_t)(void*, size_t); -typedef void (*pthreadpool_function_1d_tiled_t)(void*, size_t, size_t); -typedef void (*pthreadpool_function_2d_t)(void*, size_t, size_t); -typedef void (*pthreadpool_function_2d_tiled_t)(void*, size_t, size_t, size_t, size_t); -typedef void (*pthreadpool_function_3d_tiled_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t); -typedef void (*pthreadpool_function_4d_tiled_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t, size_t, size_t); +typedef void (*pthreadpool_task_1d_t)(void*, size_t); +typedef void (*pthreadpool_task_1d_tile_1d_t)(void*, size_t, size_t); +typedef void (*pthreadpool_task_2d_t)(void*, size_t, size_t); +typedef void (*pthreadpool_task_2d_tile_1d_t)(void*, size_t, size_t, size_t); +typedef void (*pthreadpool_task_2d_tile_2d_t)(void*, size_t, size_t, size_t, size_t); +typedef void (*pthreadpool_task_3d_tile_2d_t)(void*, size_t, size_t, size_t, size_t, size_t); +typedef void (*pthreadpool_task_4d_tile_2d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t); +typedef void (*pthreadpool_task_5d_tile_2d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t, size_t); +typedef void (*pthreadpool_task_6d_tile_2d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t, size_t, size_t); + + +#define PTHREADPOOL_FLAG_DISABLE_DENORMALS 0x00000001 #ifdef __cplusplus extern "C" { @@ -37,7 +44,6 @@ pthreadpool_t pthreadpool_create(size_t threads_count); */ size_t pthreadpool_get_threads_count(pthreadpool_t threadpool); - /** * Processes items in parallel using threads from a thread pool. * @@ -53,27 +59,143 @@ size_t pthreadpool_get_threads_count(pthreadpool_t threadpool); * @param[in] items The number of items to process. The @a function * will be called once for each item. */ -void pthreadpool_compute_1d( +void pthreadpool_parallelize_1d( + pthreadpool_t threadpool, + pthreadpool_task_1d_t function, + void* argument, + size_t range, + uint32_t flags); + +void pthreadpool_parallelize_1d_tile_1d( + pthreadpool_t threadpool, + pthreadpool_task_1d_tile_1d_t function, + void* argument, + size_t range, + size_t tile, + uint32_t flags); + +void pthreadpool_parallelize_2d( + pthreadpool_t threadpool, + pthreadpool_task_2d_t function, + void* argument, + size_t range_i, + size_t range_j, + uint32_t flags); + +void pthreadpool_parallelize_2d_tile_1d( + pthreadpool_t threadpool, + pthreadpool_task_2d_tile_1d_t function, + void* argument, + size_t range_i, + size_t range_j, + size_t tile_j, + uint32_t flags); + +void pthreadpool_parallelize_2d_tile_2d( + pthreadpool_t threadpool, + pthreadpool_task_2d_tile_2d_t function, + void* argument, + size_t range_i, + size_t range_j, + size_t tile_i, + size_t tile_j, + uint32_t flags); + +void pthreadpool_parallelize_3d_tile_2d( + pthreadpool_t threadpool, + pthreadpool_task_3d_tile_2d_t function, + void* argument, + size_t range_i, + size_t range_j, + size_t range_k, + size_t tile_j, + size_t tile_k, + uint32_t flags); + +void pthreadpool_parallelize_4d_tile_2d( + pthreadpool_t threadpool, + pthreadpool_task_4d_tile_2d_t function, + void* argument, + size_t range_i, + size_t range_j, + size_t range_k, + size_t range_l, + size_t tile_k, + size_t tile_l, + uint32_t flags); + +void pthreadpool_parallelize_5d_tile_2d( + pthreadpool_t threadpool, + pthreadpool_task_5d_tile_2d_t function, + void* argument, + size_t range_i, + size_t range_j, + size_t range_k, + size_t range_l, + size_t range_m, + size_t tile_l, + size_t tile_m, + uint32_t flags); + +void pthreadpool_parallelize_6d_tile_2d( + pthreadpool_t threadpool, + pthreadpool_task_6d_tile_2d_t function, + void* argument, + size_t range_i, + size_t range_j, + size_t range_k, + size_t range_l, + size_t range_m, + size_t range_n, + size_t tile_m, + size_t tile_n, + uint32_t flags); + +/** + * Terminates threads in the thread pool and releases associated resources. + * + * @warning Accessing the thread pool after a call to this function constitutes + * undefined behaviour and may cause data corruption. + * + * @param[in,out] threadpool The thread pool to destroy. + */ +void pthreadpool_destroy(pthreadpool_t threadpool); + +/* Legacy API for compatibility with pre-existing users (e.g. NNPACK) */ +#if defined(__GNUC__) + #define PTHREADPOOL_DEPRECATED __attribute__((__deprecated__)) +#else + #define PTHREADPOOL_DEPRECATED +#endif + +typedef PTHREADPOOL_DEPRECATED void (*pthreadpool_function_1d_t)(void*, size_t); +typedef PTHREADPOOL_DEPRECATED void (*pthreadpool_function_1d_tiled_t)(void*, size_t, size_t); +typedef PTHREADPOOL_DEPRECATED void (*pthreadpool_function_2d_t)(void*, size_t, size_t); +typedef PTHREADPOOL_DEPRECATED void (*pthreadpool_function_2d_tiled_t)(void*, size_t, size_t, size_t, size_t); +typedef PTHREADPOOL_DEPRECATED void (*pthreadpool_function_3d_tiled_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t); +typedef PTHREADPOOL_DEPRECATED void (*pthreadpool_function_4d_tiled_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t, size_t, size_t); + +PTHREADPOOL_DEPRECATED void pthreadpool_compute_1d( pthreadpool_t threadpool, pthreadpool_function_1d_t function, void* argument, size_t range); -void pthreadpool_compute_1d_tiled( +PTHREADPOOL_DEPRECATED void pthreadpool_compute_1d_tiled( pthreadpool_t threadpool, pthreadpool_function_1d_tiled_t function, void* argument, size_t range, size_t tile); -void pthreadpool_compute_2d( +PTHREADPOOL_DEPRECATED void pthreadpool_compute_2d( pthreadpool_t threadpool, pthreadpool_function_2d_t function, void* argument, size_t range_i, size_t range_j); -void pthreadpool_compute_2d_tiled( +PTHREADPOOL_DEPRECATED void pthreadpool_compute_2d_tiled( pthreadpool_t threadpool, pthreadpool_function_2d_tiled_t function, void* argument, @@ -82,7 +204,7 @@ void pthreadpool_compute_2d_tiled( size_t tile_i, size_t tile_j); -void pthreadpool_compute_3d_tiled( +PTHREADPOOL_DEPRECATED void pthreadpool_compute_3d_tiled( pthreadpool_t threadpool, pthreadpool_function_3d_tiled_t function, void* argument, @@ -93,7 +215,7 @@ void pthreadpool_compute_3d_tiled( size_t tile_j, size_t tile_k); -void pthreadpool_compute_4d_tiled( +PTHREADPOOL_DEPRECATED void pthreadpool_compute_4d_tiled( pthreadpool_t threadpool, pthreadpool_function_4d_tiled_t function, void* argument, @@ -106,18 +228,8 @@ void pthreadpool_compute_4d_tiled( size_t tile_k, size_t tile_l); -/** - * Terminates threads in the thread pool and releases associated resources. - * - * @warning Accessing the thread pool after a call to this function constitutes - * undefined behaviour and may cause data corruption. - * - * @param[in,out] threadpool The thread pool to destroy. - */ -void pthreadpool_destroy(pthreadpool_t threadpool); - #ifdef __cplusplus } /* extern "C" */ #endif -#endif /* PTHREADPOOL_H */ +#endif /* PTHREADPOOL_H_ */ diff --git a/src/threadpool-legacy.c b/src/threadpool-legacy.c new file mode 100644 index 0000000..43fb798 --- /dev/null +++ b/src/threadpool-legacy.c @@ -0,0 +1,235 @@ +/* Standard C headers */ +#include <stddef.h> + +/* Dependencies */ +#include <fxdiv.h> + +/* Library header */ +#include <pthreadpool.h> + + +static inline size_t divide_round_up(size_t dividend, size_t divisor) { + if (dividend % divisor == 0) { + return dividend / divisor; + } else { + return dividend / divisor + 1; + } +} + +static inline size_t min(size_t a, size_t b) { + return a < b ? a : b; +} + +void pthreadpool_compute_1d( + pthreadpool_t threadpool, + pthreadpool_function_1d_t function, + void* argument, + size_t range) +{ + pthreadpool_parallelize_1d(threadpool, + (pthreadpool_task_1d_t) function, argument, + range, 0 /* flags */); +} + +void pthreadpool_compute_1d_tiled( + pthreadpool_t threadpool, + pthreadpool_function_1d_tiled_t function, + void* argument, + size_t range, + size_t tile) +{ + pthreadpool_parallelize_1d_tile_1d(threadpool, + (pthreadpool_task_1d_tile_1d_t) function, argument, + range, tile, 0 /* flags */); +} + +void pthreadpool_compute_2d( + pthreadpool_t threadpool, + pthreadpool_function_2d_t function, + void* argument, + size_t range_i, + size_t range_j) +{ + pthreadpool_parallelize_2d(threadpool, + (pthreadpool_task_2d_t) function, argument, + range_i, range_j, 0 /* flags */); +} + +void pthreadpool_compute_2d_tiled( + pthreadpool_t threadpool, + pthreadpool_function_2d_tiled_t function, + void* argument, + size_t range_i, + size_t range_j, + size_t tile_i, + size_t tile_j) +{ + pthreadpool_parallelize_2d_tile_2d(threadpool, + (pthreadpool_task_2d_tile_2d_t) function, argument, + range_i, range_j, tile_i, tile_j, 0 /* flags */); +} + +struct compute_3d_tiled_context { + pthreadpool_function_3d_tiled_t function; + void* argument; + struct fxdiv_divisor_size_t tile_range_j; + struct fxdiv_divisor_size_t tile_range_k; + size_t range_i; + size_t range_j; + size_t range_k; + size_t tile_i; + size_t tile_j; + size_t tile_k; +}; + +static void compute_3d_tiled(const struct compute_3d_tiled_context* context, size_t linear_index) { + const struct fxdiv_divisor_size_t tile_range_k = context->tile_range_k; + const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(linear_index, tile_range_k); + const struct fxdiv_divisor_size_t tile_range_j = context->tile_range_j; + const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, tile_range_j); + const size_t max_tile_i = context->tile_i; + const size_t max_tile_j = context->tile_j; + const size_t max_tile_k = context->tile_k; + const size_t index_i = tile_index_i_j.quotient * max_tile_i; + const size_t index_j = tile_index_i_j.remainder * max_tile_j; + const size_t index_k = tile_index_ij_k.remainder * max_tile_k; + const size_t tile_i = min(max_tile_i, context->range_i - index_i); + const size_t tile_j = min(max_tile_j, context->range_j - index_j); + const size_t tile_k = min(max_tile_k, context->range_k - index_k); + context->function(context->argument, index_i, index_j, index_k, tile_i, tile_j, tile_k); +} + +void pthreadpool_compute_3d_tiled( + pthreadpool_t threadpool, + pthreadpool_function_3d_tiled_t function, + void* argument, + size_t range_i, + size_t range_j, + size_t range_k, + size_t tile_i, + size_t tile_j, + size_t tile_k) +{ + if (pthreadpool_get_threads_count(threadpool) <= 1) { + /* No thread pool used: execute function sequentially on the calling thread */ + for (size_t i = 0; i < range_i; i += tile_i) { + for (size_t j = 0; j < range_j; j += tile_j) { + for (size_t k = 0; k < range_k; k += tile_k) { + function(argument, i, j, k, min(range_i - i, tile_i), min(range_j - j, tile_j), min(range_k - k, tile_k)); + } + } + } + } else { + /* Execute in parallel on the thread pool using linearized index */ + const size_t tile_range_i = divide_round_up(range_i, tile_i); + const size_t tile_range_j = divide_round_up(range_j, tile_j); + const size_t tile_range_k = divide_round_up(range_k, tile_k); + struct compute_3d_tiled_context context = { + .function = function, + .argument = argument, + .tile_range_j = fxdiv_init_size_t(tile_range_j), + .tile_range_k = fxdiv_init_size_t(tile_range_k), + .range_i = range_i, + .range_j = range_j, + .range_k = range_k, + .tile_i = tile_i, + .tile_j = tile_j, + .tile_k = tile_k + }; + pthreadpool_parallelize_1d(threadpool, + (pthreadpool_task_1d_t) compute_3d_tiled, &context, + tile_range_i * tile_range_j * tile_range_k, + 0 /* flags */); + } +} + +struct compute_4d_tiled_context { + pthreadpool_function_4d_tiled_t function; + void* argument; + struct fxdiv_divisor_size_t tile_range_kl; + struct fxdiv_divisor_size_t tile_range_j; + struct fxdiv_divisor_size_t tile_range_l; + size_t range_i; + size_t range_j; + size_t range_k; + size_t range_l; + size_t tile_i; + size_t tile_j; + size_t tile_k; + size_t tile_l; +}; + +static void compute_4d_tiled(const struct compute_4d_tiled_context* context, size_t linear_index) { + const struct fxdiv_divisor_size_t tile_range_kl = context->tile_range_kl; + const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(linear_index, tile_range_kl); + const struct fxdiv_divisor_size_t tile_range_j = context->tile_range_j; + const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, tile_range_j); + const struct fxdiv_divisor_size_t tile_range_l = context->tile_range_l; + const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l); + const size_t max_tile_i = context->tile_i; + const size_t max_tile_j = context->tile_j; + const size_t max_tile_k = context->tile_k; + const size_t max_tile_l = context->tile_l; + const size_t index_i = tile_index_i_j.quotient * max_tile_i; + const size_t index_j = tile_index_i_j.remainder * max_tile_j; + const size_t index_k = tile_index_k_l.quotient * max_tile_k; + const size_t index_l = tile_index_k_l.remainder * max_tile_l; + const size_t tile_i = min(max_tile_i, context->range_i - index_i); + const size_t tile_j = min(max_tile_j, context->range_j - index_j); + const size_t tile_k = min(max_tile_k, context->range_k - index_k); + const size_t tile_l = min(max_tile_l, context->range_l - index_l); + context->function(context->argument, index_i, index_j, index_k, index_l, tile_i, tile_j, tile_k, tile_l); +} + +void pthreadpool_compute_4d_tiled( + pthreadpool_t threadpool, + pthreadpool_function_4d_tiled_t function, + void* argument, + size_t range_i, + size_t range_j, + size_t range_k, + size_t range_l, + size_t tile_i, + size_t tile_j, + size_t tile_k, + size_t tile_l) +{ + if (pthreadpool_get_threads_count(threadpool) <= 1) { + /* No thread pool used: execute function sequentially on the calling thread */ + for (size_t i = 0; i < range_i; i += tile_i) { + for (size_t j = 0; j < range_j; j += tile_j) { + for (size_t k = 0; k < range_k; k += tile_k) { + for (size_t l = 0; l < range_l; l += tile_l) { + function(argument, i, j, k, l, + min(range_i - i, tile_i), min(range_j - j, tile_j), min(range_k - k, tile_k), min(range_l - l, tile_l)); + } + } + } + } + } else { + /* Execute in parallel on the thread pool using linearized index */ + const size_t tile_range_i = divide_round_up(range_i, tile_i); + const size_t tile_range_j = divide_round_up(range_j, tile_j); + const size_t tile_range_k = divide_round_up(range_k, tile_k); + const size_t tile_range_l = divide_round_up(range_l, tile_l); + struct compute_4d_tiled_context context = { + .function = function, + .argument = argument, + .tile_range_kl = fxdiv_init_size_t(tile_range_k * tile_range_l), + .tile_range_j = fxdiv_init_size_t(tile_range_j), + .tile_range_l = fxdiv_init_size_t(tile_range_l), + .range_i = range_i, + .range_j = range_j, + .range_k = range_k, + .range_l = range_l, + .tile_i = tile_i, + .tile_j = tile_j, + .tile_k = tile_k, + .tile_l = tile_l + }; + pthreadpool_parallelize_1d(threadpool, + (pthreadpool_task_1d_t) compute_4d_tiled, &context, + tile_range_i * tile_range_j * tile_range_k * tile_range_l, + 0 /* flags */); + } +} diff --git a/src/threadpool-pthreads.c b/src/threadpool-pthreads.c index ea6d6ae..b9b5e01 100644 --- a/src/threadpool-pthreads.c +++ b/src/threadpool-pthreads.c @@ -34,6 +34,9 @@ /* Library header */ #include <pthreadpool.h> +/* Internal headers */ +#include "threadpool-utils.h" + /* Number of iterations in spin-wait loop before going into futex/mutex wait */ #define PTHREADPOOL_SPIN_WAIT_ITERATIONS 1000000 @@ -170,13 +173,17 @@ struct PTHREADPOOL_CACHELINE_ALIGNED pthreadpool { /** * The function to call for each item. */ - volatile void* function; + volatile void* task; /** * The first argument to the item processing function. */ void *volatile argument; /** - * Serializes concurrent calls to @a pthreadpool_compute_* from different threads. + * Copy of the flags passed to parallelization function. + */ + uint32_t flags; + /** + * Serializes concurrent calls to @a pthreadpool_parallelize_* from different threads. */ pthread_mutex_t execution_mutex; #if !PTHREADPOOL_USE_FUTEX @@ -265,13 +272,13 @@ inline static size_t modulo_increment(uint32_t i, uint32_t n) { return i; } -static void thread_compute_1d(struct pthreadpool* threadpool, struct thread_info* thread) { - const pthreadpool_function_1d_t function = (pthreadpool_function_1d_t) threadpool->function; +static void thread_parallelize_1d(struct pthreadpool* threadpool, struct thread_info* thread) { + const pthreadpool_task_1d_t task = (pthreadpool_task_1d_t) threadpool->task; void *const argument = threadpool->argument; /* Process thread's own range of items */ size_t range_start = thread->range_start; while (atomic_decrement(&thread->range_length)) { - function(argument, range_start++); + task(argument, range_start++); } /* There still may be other threads with work */ @@ -284,7 +291,7 @@ static void thread_compute_1d(struct pthreadpool* threadpool, struct thread_info struct thread_info* other_thread = &threadpool->threads[tid]; while (atomic_decrement(&other_thread->range_length)) { const size_t item_id = __sync_sub_and_fetch(&other_thread->range_end, 1); - function(argument, item_id); + task(argument, item_id); } } } @@ -341,6 +348,7 @@ static void* thread_main(void* arg) { struct thread_info* thread = (struct thread_info*) arg; struct pthreadpool* threadpool = ((struct pthreadpool*) (thread - thread->thread_number)) - 1; uint32_t last_command = threadpool_command_init; + struct fpu_state saved_fpu_state = { 0 }; /* Check in */ checkin_worker_thread(threadpool); @@ -352,8 +360,18 @@ static void* thread_main(void* arg) { /* Process command */ switch (command & THREADPOOL_COMMAND_MASK) { case threadpool_command_compute_1d: - thread_compute_1d(threadpool, thread); + { + const uint32_t flags = threadpool->flags; + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + saved_fpu_state = get_fpu_state(); + disable_fpu_denormals(); + } + thread_parallelize_1d(threadpool, thread); + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + set_fpu_state(saved_fpu_state); + } break; + } case threadpool_command_shutdown: /* Exit immediately: the master thread is waiting on pthread_join */ return NULL; @@ -438,16 +456,25 @@ size_t pthreadpool_get_threads_count(struct pthreadpool* threadpool) { } } -void pthreadpool_compute_1d( +void pthreadpool_parallelize_1d( struct pthreadpool* threadpool, - pthreadpool_function_1d_t function, + pthreadpool_task_1d_t task, void* argument, - size_t range) + size_t range, + uint32_t flags) { if (threadpool == NULL || threadpool->threads_count <= 1) { - /* No thread pool used: execute function sequentially on the calling thread */ + /* No thread pool used: execute task sequentially on the calling thread */ + struct fpu_state saved_fpu_state = { 0 }; + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + saved_fpu_state = get_fpu_state(); + disable_fpu_denormals(); + } for (size_t i = 0; i < range; i++) { - function(argument, i); + task(argument, i); + } + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + set_fpu_state(saved_fpu_state); } } else { /* Protect the global threadpool structures */ @@ -455,8 +482,9 @@ void pthreadpool_compute_1d( #if PTHREADPOOL_USE_FUTEX /* Setup global arguments */ - threadpool->function = function; + threadpool->task = task; threadpool->argument = argument; + threadpool->flags = flags; threadpool->active_threads = threadpool->threads_count - 1 /* caller thread */; threadpool->has_active_threads = 1; @@ -472,7 +500,7 @@ void pthreadpool_compute_1d( /* * Update the threadpool command. - * Imporantly, do it after initializing command parameters (range, function, argument) + * Imporantly, do it after initializing command parameters (range, task, argument) * ~(threadpool->command | THREADPOOL_COMMAND_MASK) flips the bits not in command mask * to ensure the unmasked command is different then the last command, because worker threads * monitor for change in the unmasked command. @@ -486,8 +514,9 @@ void pthreadpool_compute_1d( pthread_mutex_lock(&threadpool->command_mutex); /* Setup global arguments */ - threadpool->function = function; + threadpool->task = task; threadpool->argument = argument; + threadpool->flags = flags; /* Locking of completion_mutex not needed: readers are sleeping on command_condvar */ threadpool->active_threads = threadpool->threads_count - 1 /* caller thread */; @@ -502,7 +531,7 @@ void pthreadpool_compute_1d( /* * Update the threadpool command. - * Imporantly, do it after initializing command parameters (range, function, argument) + * Imporantly, do it after initializing command parameters (range, task, argument) * ~(threadpool->command | THREADPOOL_COMMAND_MASK) flips the bits not in command mask * to ensure the unmasked command is different then the last command, because worker threads * monitor for change in the unmasked command. @@ -516,8 +545,20 @@ void pthreadpool_compute_1d( pthread_cond_broadcast(&threadpool->command_condvar); #endif + /* Save and modify FPU denormals control, if needed */ + struct fpu_state saved_fpu_state = { 0 }; + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + saved_fpu_state = get_fpu_state(); + disable_fpu_denormals(); + } + /* Do computations as worker #0 */ - thread_compute_1d(threadpool, &threadpool->threads[0]); + thread_parallelize_1d(threadpool, &threadpool->threads[0]); + + /* Restore FPU denormals control, if needed */ + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + set_fpu_state(saved_fpu_state); + } /* Wait until the threads finish computation */ wait_worker_threads(threadpool); @@ -527,47 +568,56 @@ void pthreadpool_compute_1d( } } -struct compute_1d_tiled_context { - pthreadpool_function_1d_tiled_t function; +struct compute_1d_tile_1d_context { + pthreadpool_task_1d_tile_1d_t task; void* argument; size_t range; size_t tile; }; -static void compute_1d_tiled(const struct compute_1d_tiled_context* context, size_t linear_index) { +static void compute_1d_tile_1d(const struct compute_1d_tile_1d_context* context, size_t linear_index) { const size_t tile_index = linear_index; const size_t index = tile_index * context->tile; const size_t tile = min(context->tile, context->range - index); - context->function(context->argument, index, tile); + context->task(context->argument, index, tile); } -void pthreadpool_compute_1d_tiled( +void pthreadpool_parallelize_1d_tile_1d( pthreadpool_t threadpool, - pthreadpool_function_1d_tiled_t function, + pthreadpool_task_1d_tile_1d_t task, void* argument, size_t range, - size_t tile) + size_t tile, + uint32_t flags) { if (threadpool == NULL || threadpool->threads_count <= 1) { - /* No thread pool used: execute function sequentially on the calling thread */ + /* No thread pool used: execute task sequentially on the calling thread */ + struct fpu_state saved_fpu_state = { 0 }; + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + saved_fpu_state = get_fpu_state(); + disable_fpu_denormals(); + } for (size_t i = 0; i < range; i += tile) { - function(argument, i, min(range - i, tile)); + task(argument, i, min(range - i, tile)); + } + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + set_fpu_state(saved_fpu_state); } } else { /* Execute in parallel on the thread pool using linearized index */ const size_t tile_range = divide_round_up(range, tile); - struct compute_1d_tiled_context context = { - .function = function, + struct compute_1d_tile_1d_context context = { + .task = task, .argument = argument, .range = range, .tile = tile }; - pthreadpool_compute_1d(threadpool, (pthreadpool_function_1d_t) compute_1d_tiled, &context, tile_range); + pthreadpool_parallelize_1d(threadpool, (pthreadpool_task_1d_t) compute_1d_tile_1d, &context, tile_range, flags); } } struct compute_2d_context { - pthreadpool_function_2d_t function; + pthreadpool_task_2d_t task; void* argument; struct fxdiv_divisor_size_t range_j; }; @@ -575,36 +625,103 @@ struct compute_2d_context { static void compute_2d(const struct compute_2d_context* context, size_t linear_index) { const struct fxdiv_divisor_size_t range_j = context->range_j; const struct fxdiv_result_size_t index = fxdiv_divide_size_t(linear_index, range_j); - context->function(context->argument, index.quotient, index.remainder); + context->task(context->argument, index.quotient, index.remainder); } -void pthreadpool_compute_2d( +void pthreadpool_parallelize_2d( struct pthreadpool* threadpool, - pthreadpool_function_2d_t function, + pthreadpool_task_2d_t task, void* argument, size_t range_i, - size_t range_j) + size_t range_j, + uint32_t flags) { if (threadpool == NULL || threadpool->threads_count <= 1) { - /* No thread pool used: execute function sequentially on the calling thread */ + /* No thread pool used: execute task sequentially on the calling thread */ + struct fpu_state saved_fpu_state = { 0 }; + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + saved_fpu_state = get_fpu_state(); + disable_fpu_denormals(); + } for (size_t i = 0; i < range_i; i++) { for (size_t j = 0; j < range_j; j++) { - function(argument, i, j); + task(argument, i, j); } } + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + set_fpu_state(saved_fpu_state); + } } else { /* Execute in parallel on the thread pool using linearized index */ struct compute_2d_context context = { - .function = function, + .task = task, .argument = argument, .range_j = fxdiv_init_size_t(range_j) }; - pthreadpool_compute_1d(threadpool, (pthreadpool_function_1d_t) compute_2d, &context, range_i * range_j); + pthreadpool_parallelize_1d(threadpool, (pthreadpool_task_1d_t) compute_2d, &context, range_i * range_j, flags); } } -struct compute_2d_tiled_context { - pthreadpool_function_2d_tiled_t function; +struct compute_2d_tile_1d_context { + pthreadpool_task_2d_tile_1d_t task; + void* argument; + struct fxdiv_divisor_size_t tile_range_j; + size_t range_i; + size_t range_j; + size_t tile_j; +}; + +static void compute_2d_tile_1d(const struct compute_2d_tile_1d_context* context, size_t linear_index) { + const struct fxdiv_divisor_size_t tile_range_j = context->tile_range_j; + const struct fxdiv_result_size_t tile_index = fxdiv_divide_size_t(linear_index, tile_range_j); + const size_t max_tile_j = context->tile_j; + const size_t index_i = tile_index.quotient; + const size_t index_j = tile_index.remainder * max_tile_j; + const size_t tile_j = min(max_tile_j, context->range_j - index_j); + context->task(context->argument, index_i, index_j, tile_j); +} + +void pthreadpool_parallelize_2d_tile_1d( + pthreadpool_t threadpool, + pthreadpool_task_2d_tile_1d_t task, + void* argument, + size_t range_i, + size_t range_j, + size_t tile_j, + uint32_t flags) +{ + if (threadpool == NULL || threadpool->threads_count <= 1) { + /* No thread pool used: execute task sequentially on the calling thread */ + struct fpu_state saved_fpu_state = { 0 }; + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + saved_fpu_state = get_fpu_state(); + disable_fpu_denormals(); + } + for (size_t i = 0; i < range_i; i++) { + for (size_t j = 0; j < range_j; j += tile_j) { + task(argument, i, j, min(range_j - j, tile_j)); + } + } + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + set_fpu_state(saved_fpu_state); + } + } else { + /* Execute in parallel on the thread pool using linearized index */ + const size_t tile_range_j = divide_round_up(range_j, tile_j); + struct compute_2d_tile_1d_context context = { + .task = task, + .argument = argument, + .tile_range_j = fxdiv_init_size_t(tile_range_j), + .range_i = range_i, + .range_j = range_j, + .tile_j = tile_j + }; + pthreadpool_parallelize_1d(threadpool, (pthreadpool_task_1d_t) compute_2d_tile_1d, &context, range_i * tile_range_j, flags); + } +} + +struct compute_2d_tile_2d_context { + pthreadpool_task_2d_tile_2d_t task; void* argument; struct fxdiv_divisor_size_t tile_range_j; size_t range_i; @@ -613,7 +730,7 @@ struct compute_2d_tiled_context { size_t tile_j; }; -static void compute_2d_tiled(const struct compute_2d_tiled_context* context, size_t linear_index) { +static void compute_2d_tile_2d(const struct compute_2d_tile_2d_context* context, size_t linear_index) { const struct fxdiv_divisor_size_t tile_range_j = context->tile_range_j; const struct fxdiv_result_size_t tile_index = fxdiv_divide_size_t(linear_index, tile_range_j); const size_t max_tile_i = context->tile_i; @@ -622,31 +739,40 @@ static void compute_2d_tiled(const struct compute_2d_tiled_context* context, siz const size_t index_j = tile_index.remainder * max_tile_j; const size_t tile_i = min(max_tile_i, context->range_i - index_i); const size_t tile_j = min(max_tile_j, context->range_j - index_j); - context->function(context->argument, index_i, index_j, tile_i, tile_j); + context->task(context->argument, index_i, index_j, tile_i, tile_j); } -void pthreadpool_compute_2d_tiled( +void pthreadpool_parallelize_2d_tile_2d( pthreadpool_t threadpool, - pthreadpool_function_2d_tiled_t function, + pthreadpool_task_2d_tile_2d_t task, void* argument, size_t range_i, size_t range_j, size_t tile_i, - size_t tile_j) + size_t tile_j, + uint32_t flags) { if (threadpool == NULL || threadpool->threads_count <= 1) { - /* No thread pool used: execute function sequentially on the calling thread */ + /* No thread pool used: execute task sequentially on the calling thread */ + struct fpu_state saved_fpu_state = { 0 }; + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + saved_fpu_state = get_fpu_state(); + disable_fpu_denormals(); + } for (size_t i = 0; i < range_i; i += tile_i) { for (size_t j = 0; j < range_j; j += tile_j) { - function(argument, i, j, min(range_i - i, tile_i), min(range_j - j, tile_j)); + task(argument, i, j, min(range_i - i, tile_i), min(range_j - j, tile_j)); } } + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + set_fpu_state(saved_fpu_state); + } } else { /* Execute in parallel on the thread pool using linearized index */ const size_t tile_range_i = divide_round_up(range_i, tile_i); const size_t tile_range_j = divide_round_up(range_j, tile_j); - struct compute_2d_tiled_context context = { - .function = function, + struct compute_2d_tile_2d_context context = { + .task = task, .argument = argument, .tile_range_j = fxdiv_init_size_t(tile_range_j), .range_i = range_i, @@ -654,170 +780,356 @@ void pthreadpool_compute_2d_tiled( .tile_i = tile_i, .tile_j = tile_j }; - pthreadpool_compute_1d(threadpool, (pthreadpool_function_1d_t) compute_2d_tiled, &context, tile_range_i * tile_range_j); + pthreadpool_parallelize_1d(threadpool, (pthreadpool_task_1d_t) compute_2d_tile_2d, &context, tile_range_i * tile_range_j, flags); } } -struct compute_3d_tiled_context { - pthreadpool_function_3d_tiled_t function; +struct compute_3d_tile_2d_context { + pthreadpool_task_3d_tile_2d_t task; void* argument; struct fxdiv_divisor_size_t tile_range_j; struct fxdiv_divisor_size_t tile_range_k; - size_t range_i; size_t range_j; size_t range_k; - size_t tile_i; size_t tile_j; size_t tile_k; }; -static void compute_3d_tiled(const struct compute_3d_tiled_context* context, size_t linear_index) { +static void compute_3d_tile_2d(const struct compute_3d_tile_2d_context* context, size_t linear_index) { const struct fxdiv_divisor_size_t tile_range_k = context->tile_range_k; const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(linear_index, tile_range_k); const struct fxdiv_divisor_size_t tile_range_j = context->tile_range_j; const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, tile_range_j); - const size_t max_tile_i = context->tile_i; const size_t max_tile_j = context->tile_j; const size_t max_tile_k = context->tile_k; - const size_t index_i = tile_index_i_j.quotient * max_tile_i; + const size_t index_i = tile_index_i_j.quotient; const size_t index_j = tile_index_i_j.remainder * max_tile_j; const size_t index_k = tile_index_ij_k.remainder * max_tile_k; - const size_t tile_i = min(max_tile_i, context->range_i - index_i); const size_t tile_j = min(max_tile_j, context->range_j - index_j); const size_t tile_k = min(max_tile_k, context->range_k - index_k); - context->function(context->argument, index_i, index_j, index_k, tile_i, tile_j, tile_k); + context->task(context->argument, index_i, index_j, index_k, tile_j, tile_k); } -void pthreadpool_compute_3d_tiled( +void pthreadpool_parallelize_3d_tile_2d( pthreadpool_t threadpool, - pthreadpool_function_3d_tiled_t function, + pthreadpool_task_3d_tile_2d_t task, void* argument, size_t range_i, size_t range_j, size_t range_k, - size_t tile_i, size_t tile_j, - size_t tile_k) + size_t tile_k, + uint32_t flags) { if (threadpool == NULL || threadpool->threads_count <= 1) { - /* No thread pool used: execute function sequentially on the calling thread */ - for (size_t i = 0; i < range_i; i += tile_i) { + /* No thread pool used: execute task sequentially on the calling thread */ + struct fpu_state saved_fpu_state = { 0 }; + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + saved_fpu_state = get_fpu_state(); + disable_fpu_denormals(); + } + for (size_t i = 0; i < range_i; i++) { for (size_t j = 0; j < range_j; j += tile_j) { for (size_t k = 0; k < range_k; k += tile_k) { - function(argument, i, j, k, min(range_i - i, tile_i), min(range_j - j, tile_j), min(range_k - k, tile_k)); + task(argument, i, j, k, min(range_j - j, tile_j), min(range_k - k, tile_k)); } } } + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + set_fpu_state(saved_fpu_state); + } } else { /* Execute in parallel on the thread pool using linearized index */ - const size_t tile_range_i = divide_round_up(range_i, tile_i); const size_t tile_range_j = divide_round_up(range_j, tile_j); const size_t tile_range_k = divide_round_up(range_k, tile_k); - struct compute_3d_tiled_context context = { - .function = function, + struct compute_3d_tile_2d_context context = { + .task = task, .argument = argument, .tile_range_j = fxdiv_init_size_t(tile_range_j), .tile_range_k = fxdiv_init_size_t(tile_range_k), - .range_i = range_i, .range_j = range_j, .range_k = range_k, - .tile_i = tile_i, .tile_j = tile_j, .tile_k = tile_k }; - pthreadpool_compute_1d(threadpool, - (pthreadpool_function_1d_t) compute_3d_tiled, &context, - tile_range_i * tile_range_j * tile_range_k); + pthreadpool_parallelize_1d(threadpool, + (pthreadpool_task_1d_t) compute_3d_tile_2d, &context, + range_i * tile_range_j * tile_range_k, flags); } } -struct compute_4d_tiled_context { - pthreadpool_function_4d_tiled_t function; +struct compute_4d_tile_2d_context { + pthreadpool_task_4d_tile_2d_t task; void* argument; struct fxdiv_divisor_size_t tile_range_kl; - struct fxdiv_divisor_size_t tile_range_j; + struct fxdiv_divisor_size_t range_j; struct fxdiv_divisor_size_t tile_range_l; - size_t range_i; - size_t range_j; size_t range_k; size_t range_l; - size_t tile_i; - size_t tile_j; size_t tile_k; size_t tile_l; }; -static void compute_4d_tiled(const struct compute_4d_tiled_context* context, size_t linear_index) { +static void compute_4d_tile_2d(const struct compute_4d_tile_2d_context* context, size_t linear_index) { const struct fxdiv_divisor_size_t tile_range_kl = context->tile_range_kl; const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(linear_index, tile_range_kl); - const struct fxdiv_divisor_size_t tile_range_j = context->tile_range_j; - const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, tile_range_j); + const struct fxdiv_divisor_size_t range_j = context->range_j; + const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, range_j); const struct fxdiv_divisor_size_t tile_range_l = context->tile_range_l; const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l); - const size_t max_tile_i = context->tile_i; - const size_t max_tile_j = context->tile_j; const size_t max_tile_k = context->tile_k; const size_t max_tile_l = context->tile_l; - const size_t index_i = tile_index_i_j.quotient * max_tile_i; - const size_t index_j = tile_index_i_j.remainder * max_tile_j; + const size_t index_i = tile_index_i_j.quotient; + const size_t index_j = tile_index_i_j.remainder; const size_t index_k = tile_index_k_l.quotient * max_tile_k; const size_t index_l = tile_index_k_l.remainder * max_tile_l; - const size_t tile_i = min(max_tile_i, context->range_i - index_i); - const size_t tile_j = min(max_tile_j, context->range_j - index_j); const size_t tile_k = min(max_tile_k, context->range_k - index_k); const size_t tile_l = min(max_tile_l, context->range_l - index_l); - context->function(context->argument, index_i, index_j, index_k, index_l, tile_i, tile_j, tile_k, tile_l); + context->task(context->argument, index_i, index_j, index_k, index_l, tile_k, tile_l); } -void pthreadpool_compute_4d_tiled( +void pthreadpool_parallelize_4d_tile_2d( pthreadpool_t threadpool, - pthreadpool_function_4d_tiled_t function, + pthreadpool_task_4d_tile_2d_t task, void* argument, size_t range_i, size_t range_j, size_t range_k, size_t range_l, - size_t tile_i, - size_t tile_j, size_t tile_k, - size_t tile_l) + size_t tile_l, + uint32_t flags) { if (threadpool == NULL || threadpool->threads_count <= 1) { - /* No thread pool used: execute function sequentially on the calling thread */ - for (size_t i = 0; i < range_i; i += tile_i) { - for (size_t j = 0; j < range_j; j += tile_j) { + /* No thread pool used: execute task sequentially on the calling thread */ + struct fpu_state saved_fpu_state = { 0 }; + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + saved_fpu_state = get_fpu_state(); + disable_fpu_denormals(); + } + for (size_t i = 0; i < range_i; i++) { + for (size_t j = 0; j < range_j; j++) { for (size_t k = 0; k < range_k; k += tile_k) { for (size_t l = 0; l < range_l; l += tile_l) { - function(argument, i, j, k, l, - min(range_i - i, tile_i), min(range_j - j, tile_j), min(range_k - k, tile_k), min(range_l - l, tile_l)); + task(argument, i, j, k, l, + min(range_k - k, tile_k), min(range_l - l, tile_l)); } } } } + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + set_fpu_state(saved_fpu_state); + } } else { /* Execute in parallel on the thread pool using linearized index */ - const size_t tile_range_i = divide_round_up(range_i, tile_i); - const size_t tile_range_j = divide_round_up(range_j, tile_j); const size_t tile_range_k = divide_round_up(range_k, tile_k); const size_t tile_range_l = divide_round_up(range_l, tile_l); - struct compute_4d_tiled_context context = { - .function = function, + struct compute_4d_tile_2d_context context = { + .task = task, .argument = argument, .tile_range_kl = fxdiv_init_size_t(tile_range_k * tile_range_l), - .tile_range_j = fxdiv_init_size_t(tile_range_j), + .range_j = fxdiv_init_size_t(range_j), .tile_range_l = fxdiv_init_size_t(tile_range_l), - .range_i = range_i, - .range_j = range_j, .range_k = range_k, .range_l = range_l, - .tile_i = tile_i, - .tile_j = tile_j, .tile_k = tile_k, .tile_l = tile_l }; - pthreadpool_compute_1d(threadpool, - (pthreadpool_function_1d_t) compute_4d_tiled, &context, - tile_range_i * tile_range_j * tile_range_k * tile_range_l); + pthreadpool_parallelize_1d(threadpool, + (pthreadpool_task_1d_t) compute_4d_tile_2d, &context, + range_i * range_j * tile_range_k * tile_range_l, flags); + } +} + +struct compute_5d_tile_2d_context { + pthreadpool_task_5d_tile_2d_t task; + void* argument; + struct fxdiv_divisor_size_t tile_range_lm; + struct fxdiv_divisor_size_t range_k; + struct fxdiv_divisor_size_t tile_range_m; + struct fxdiv_divisor_size_t range_j; + size_t range_l; + size_t range_m; + size_t tile_l; + size_t tile_m; +}; + +static void compute_5d_tile_2d(const struct compute_5d_tile_2d_context* context, size_t linear_index) { + const struct fxdiv_divisor_size_t tile_range_lm = context->tile_range_lm; + const struct fxdiv_result_size_t tile_index_ijk_lm = fxdiv_divide_size_t(linear_index, tile_range_lm); + const struct fxdiv_divisor_size_t range_k = context->range_k; + const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(tile_index_ijk_lm.quotient, range_k); + const struct fxdiv_divisor_size_t tile_range_m = context->tile_range_m; + const struct fxdiv_result_size_t tile_index_l_m = fxdiv_divide_size_t(tile_index_ijk_lm.remainder, tile_range_m); + const struct fxdiv_divisor_size_t range_j = context->range_j; + const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, range_j); + + const size_t max_tile_l = context->tile_l; + const size_t max_tile_m = context->tile_m; + const size_t index_i = tile_index_i_j.quotient; + const size_t index_j = tile_index_i_j.remainder; + const size_t index_k = tile_index_ij_k.remainder; + const size_t index_l = tile_index_l_m.quotient * max_tile_l; + const size_t index_m = tile_index_l_m.remainder * max_tile_m; + const size_t tile_l = min(max_tile_l, context->range_l - index_l); + const size_t tile_m = min(max_tile_m, context->range_m - index_m); + context->task(context->argument, index_i, index_j, index_k, index_l, index_m, tile_l, tile_m); +} + +void pthreadpool_parallelize_5d_tile_2d( + pthreadpool_t threadpool, + pthreadpool_task_5d_tile_2d_t task, + void* argument, + size_t range_i, + size_t range_j, + size_t range_k, + size_t range_l, + size_t range_m, + size_t tile_l, + size_t tile_m, + uint32_t flags) +{ + if (threadpool == NULL || threadpool->threads_count <= 1) { + /* No thread pool used: execute task sequentially on the calling thread */ + struct fpu_state saved_fpu_state = { 0 }; + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + saved_fpu_state = get_fpu_state(); + disable_fpu_denormals(); + } + for (size_t i = 0; i < range_i; i++) { + for (size_t j = 0; j < range_j; j++) { + for (size_t k = 0; k < range_k; k++) { + for (size_t l = 0; l < range_l; l += tile_l) { + for (size_t m = 0; m < range_m; m += tile_m) { + task(argument, i, j, k, l, m, + min(range_l - l, tile_l), min(range_m - m, tile_m)); + } + } + } + } + } + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + set_fpu_state(saved_fpu_state); + } + } else { + /* Execute in parallel on the thread pool using linearized index */ + const size_t tile_range_l = divide_round_up(range_l, tile_l); + const size_t tile_range_m = divide_round_up(range_m, tile_m); + struct compute_5d_tile_2d_context context = { + .task = task, + .argument = argument, + .tile_range_lm = fxdiv_init_size_t(tile_range_l * tile_range_m), + .range_k = fxdiv_init_size_t(range_k), + .tile_range_m = fxdiv_init_size_t(tile_range_m), + .range_j = fxdiv_init_size_t(range_j), + .range_l = range_l, + .range_m = range_m, + .tile_l = tile_l, + .tile_m = tile_m, + }; + pthreadpool_parallelize_1d(threadpool, + (pthreadpool_task_1d_t) compute_5d_tile_2d, &context, + range_i * range_j * range_k * tile_range_l * tile_range_m, flags); + } +} + +struct compute_6d_tile_2d_context { + pthreadpool_task_6d_tile_2d_t task; + void* argument; + struct fxdiv_divisor_size_t tile_range_lmn; + struct fxdiv_divisor_size_t range_k; + struct fxdiv_divisor_size_t tile_range_n; + struct fxdiv_divisor_size_t range_j; + struct fxdiv_divisor_size_t tile_range_m; + size_t range_m; + size_t range_n; + size_t tile_m; + size_t tile_n; +}; + +static void compute_6d_tile_2d(const struct compute_6d_tile_2d_context* context, size_t linear_index) { + const struct fxdiv_divisor_size_t tile_range_lmn = context->tile_range_lmn; + const struct fxdiv_result_size_t tile_index_ijk_lmn = fxdiv_divide_size_t(linear_index, tile_range_lmn); + const struct fxdiv_divisor_size_t range_k = context->range_k; + const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(tile_index_ijk_lmn.quotient, range_k); + const struct fxdiv_divisor_size_t tile_range_n = context->tile_range_n; + const struct fxdiv_result_size_t tile_index_lm_n = fxdiv_divide_size_t(tile_index_ijk_lmn.remainder, tile_range_n); + const struct fxdiv_divisor_size_t range_j = context->range_j; + const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, range_j); + const struct fxdiv_divisor_size_t tile_range_m = context->tile_range_m; + const struct fxdiv_result_size_t tile_index_l_m = fxdiv_divide_size_t(tile_index_lm_n.quotient, tile_range_m); + + const size_t max_tile_m = context->tile_m; + const size_t max_tile_n = context->tile_n; + const size_t index_i = tile_index_i_j.quotient; + const size_t index_j = tile_index_i_j.remainder; + const size_t index_k = tile_index_ij_k.remainder; + const size_t index_l = tile_index_l_m.quotient; + const size_t index_m = tile_index_l_m.remainder * max_tile_m; + const size_t index_n = tile_index_lm_n.remainder * max_tile_n; + const size_t tile_m = min(max_tile_m, context->range_m - index_m); + const size_t tile_n = min(max_tile_n, context->range_n - index_n); + context->task(context->argument, index_i, index_j, index_k, index_l, index_m, index_n, tile_m, tile_n); +} + +void pthreadpool_parallelize_6d_tile_2d( + pthreadpool_t threadpool, + pthreadpool_task_6d_tile_2d_t task, + void* argument, + size_t range_i, + size_t range_j, + size_t range_k, + size_t range_l, + size_t range_m, + size_t range_n, + size_t tile_m, + size_t tile_n, + uint32_t flags) +{ + if (threadpool == NULL || threadpool->threads_count <= 1) { + /* No thread pool used: execute task sequentially on the calling thread */ + struct fpu_state saved_fpu_state = { 0 }; + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + saved_fpu_state = get_fpu_state(); + disable_fpu_denormals(); + } + for (size_t i = 0; i < range_i; i++) { + for (size_t j = 0; j < range_j; j++) { + for (size_t k = 0; k < range_k; k++) { + for (size_t l = 0; l < range_l; l++) { + for (size_t m = 0; m < range_m; m += tile_m) { + for (size_t n = 0; n < range_n; n += tile_n) { + task(argument, i, j, k, l, m, n, + min(range_m - m, tile_m), min(range_n - n, tile_n)); + } + } + } + } + } + } + if (flags & PTHREADPOOL_FLAG_DISABLE_DENORMALS) { + set_fpu_state(saved_fpu_state); + } + } else { + /* Execute in parallel on the thread pool using linearized index */ + const size_t tile_range_m = divide_round_up(range_m, tile_m); + const size_t tile_range_n = divide_round_up(range_n, tile_n); + struct compute_6d_tile_2d_context context = { + .task = task, + .argument = argument, + .tile_range_lmn = fxdiv_init_size_t(range_l * tile_range_m * tile_range_n), + .range_k = fxdiv_init_size_t(range_k), + .tile_range_n = fxdiv_init_size_t(tile_range_n), + .range_j = fxdiv_init_size_t(range_j), + .tile_range_m = fxdiv_init_size_t(tile_range_m), + .range_m = range_m, + .range_n = range_n, + .tile_m = tile_m, + .tile_n = tile_n, + }; + pthreadpool_parallelize_1d(threadpool, + (pthreadpool_task_1d_t) compute_6d_tile_2d, &context, + range_i * range_j * range_k * range_l * tile_range_m * tile_range_n, flags); } } diff --git a/src/threadpool-shim.c b/src/threadpool-shim.c index d4d8498..c8ef51d 100644 --- a/src/threadpool-shim.c +++ b/src/threadpool-shim.c @@ -16,99 +16,175 @@ size_t pthreadpool_get_threads_count(struct pthreadpool* threadpool) { return 1; } -void pthreadpool_compute_1d( +void pthreadpool_parallelize_1d( struct pthreadpool* threadpool, - pthreadpool_function_1d_t function, + pthreadpool_task_1d_t task, void* argument, - size_t range) + size_t range, + uint32_t flags) { for (size_t i = 0; i < range; i++) { - function(argument, i); + task(argument, i); } } -void pthreadpool_compute_1d_tiled( +void pthreadpool_parallelize_1d_tile_1d( pthreadpool_t threadpool, - pthreadpool_function_1d_tiled_t function, + pthreadpool_task_1d_tile_1d_t task, void* argument, size_t range, - size_t tile) + size_t tile, + uint32_t flags) { for (size_t i = 0; i < range; i += tile) { - function(argument, i, min(range - i, tile)); + task(argument, i, min(range - i, tile)); } } -void pthreadpool_compute_2d( +void pthreadpool_parallelize_2d( struct pthreadpool* threadpool, - pthreadpool_function_2d_t function, + pthreadpool_task_2d_t task, void* argument, size_t range_i, - size_t range_j) + size_t range_j, + uint32_t flags) { for (size_t i = 0; i < range_i; i++) { for (size_t j = 0; j < range_j; j++) { - function(argument, i, j); + task(argument, i, j); + } + } +} + +void pthreadpool_parallelize_2d_tile_1d( + pthreadpool_t threadpool, + pthreadpool_task_2d_tile_1d_t task, + void* argument, + size_t range_i, + size_t range_j, + size_t tile_j, + uint32_t flags) +{ + for (size_t i = 0; i < range_i; i++) { + for (size_t j = 0; j < range_j; j += tile_j) { + task(argument, i, j, min(range_j - j, tile_j)); } } } -void pthreadpool_compute_2d_tiled( +void pthreadpool_parallelize_2d_tile_2d( pthreadpool_t threadpool, - pthreadpool_function_2d_tiled_t function, + pthreadpool_task_2d_tile_2d_t task, void* argument, size_t range_i, size_t range_j, size_t tile_i, - size_t tile_j) + size_t tile_j, + uint32_t flags) { for (size_t i = 0; i < range_i; i += tile_i) { for (size_t j = 0; j < range_j; j += tile_j) { - function(argument, i, j, min(range_i - i, tile_i), min(range_j - j, tile_j)); + task(argument, i, j, min(range_i - i, tile_i), min(range_j - j, tile_j)); } } } -void pthreadpool_compute_3d_tiled( +void pthreadpool_parallelize_3d_tile_2d( pthreadpool_t threadpool, - pthreadpool_function_3d_tiled_t function, + pthreadpool_task_3d_tile_2d_t task, void* argument, size_t range_i, size_t range_j, size_t range_k, - size_t tile_i, size_t tile_j, - size_t tile_k) + size_t tile_k, + uint32_t flags) { - for (size_t i = 0; i < range_i; i += tile_i) { + for (size_t i = 0; i < range_i; i++) { for (size_t j = 0; j < range_j; j += tile_j) { for (size_t k = 0; k < range_k; k += tile_k) { - function(argument, i, j, k, - min(range_i - i, tile_i), min(range_j - j, tile_j), min(range_k - k, tile_k)); + task(argument, i, j, k, + min(range_j - j, tile_j), min(range_k - k, tile_k)); } } } } -void pthreadpool_compute_4d_tiled( +void pthreadpool_parallelize_4d_tile_2d( pthreadpool_t threadpool, - pthreadpool_function_4d_tiled_t function, + pthreadpool_task_4d_tile_2d_t task, void* argument, size_t range_i, size_t range_j, size_t range_k, size_t range_l, - size_t tile_i, - size_t tile_j, size_t tile_k, - size_t tile_l) + size_t tile_l, + uint32_t flags) { - for (size_t i = 0; i < range_i; i += tile_i) { - for (size_t j = 0; j < range_j; j += tile_j) { + for (size_t i = 0; i < range_i; i++) { + for (size_t j = 0; j < range_j; j++) { for (size_t k = 0; k < range_k; k += tile_k) { for (size_t l = 0; l < range_l; l += tile_l) { - function(argument, i, j, k, l, - min(range_i - i, tile_i), min(range_j - j, tile_j), min(range_k - k, tile_k), min(range_l - l, tile_l)); + task(argument, i, j, k, l, + min(range_k - k, tile_k), min(range_l - l, tile_l)); + } + } + } + } +} + +void pthreadpool_parallelize_5d_tile_2d( + pthreadpool_t threadpool, + pthreadpool_task_5d_tile_2d_t task, + void* argument, + size_t range_i, + size_t range_j, + size_t range_k, + size_t range_l, + size_t range_m, + size_t tile_l, + size_t tile_m, + uint32_t flags) +{ + for (size_t i = 0; i < range_i; i++) { + for (size_t j = 0; j < range_j; j++) { + for (size_t k = 0; k < range_k; k++) { + for (size_t l = 0; l < range_l; l += tile_l) { + for (size_t m = 0; m < range_m; m += tile_m) { + task(argument, i, j, k, l, m, + min(range_l - l, tile_l), min(range_m - m, tile_m)); + } + } + } + } + } +} + +void pthreadpool_parallelize_6d_tile_2d( + pthreadpool_t threadpool, + pthreadpool_task_6d_tile_2d_t task, + void* argument, + size_t range_i, + size_t range_j, + size_t range_k, + size_t range_l, + size_t range_m, + size_t range_n, + size_t tile_m, + size_t tile_n, + uint32_t flags) +{ + for (size_t i = 0; i < range_i; i++) { + for (size_t j = 0; j < range_j; j++) { + for (size_t k = 0; k < range_k; k++) { + for (size_t l = 0; l < range_l; l++) { + for (size_t m = 0; m < range_m; m += tile_m) { + for (size_t n = 0; n < range_n; n += tile_n) { + task(argument, i, j, k, l, m, n, + min(range_m - m, tile_m), min(range_n - n, tile_n)); + } + } } } } diff --git a/src/threadpool-utils.h b/src/threadpool-utils.h new file mode 100644 index 0000000..882c596 --- /dev/null +++ b/src/threadpool-utils.h @@ -0,0 +1,62 @@ +#pragma once + +#include <stdint.h> + +#if defined(__SSE__) || defined(__x86_64__) +#include <xmmintrin.h> +#endif + +struct fpu_state { +#if defined(__SSE__) || defined(__x86_64__) + uint32_t mxcsr; +#elif defined(__arm__) && defined(__ARM_FP) && (__ARM_FP != 0) + uint32_t fpscr; +#elif defined(__aarch64__) + uint64_t fpcr; +#else + char unused; +#endif +}; + +static inline struct fpu_state get_fpu_state() { + struct fpu_state state = { 0 }; +#if defined(__SSE__) || defined(__x86_64__) + state.mxcsr = (uint32_t) _mm_getcsr(); +#elif defined(__arm__) && defined(__ARM_FP) && (__ARM_FP != 0) + __asm__ __volatile__("VMRS %[fpscr], fpscr" : [fpscr] "=r" (state.fpscr)); +#elif defined(__aarch64__) + __asm__ __volatile__("MRS %[fpcr], fpcr" : [fpcr] "=r" (state.fpcr)); +#endif + return state; +} + +static inline void set_fpu_state(const struct fpu_state state) { +#if defined(__SSE__) || defined(__x86_64__) + _mm_setcsr((unsigned int) state.mxcsr); +#elif defined(__arm__) && defined(__ARM_FP) && (__ARM_FP != 0) + __asm__ __volatile__("VMSR fpscr, %[fpscr]" : : [fpscr] "r" (state.fpscr)); +#elif defined(__aarch64__) + __asm__ __volatile__("MSR fpcr, %[fpcr]" : : [fpcr] "r" (state.fpcr)); +#endif +} + +static inline void disable_fpu_denormals() { +#if defined(__SSE__) || defined(__x86_64__) + _mm_setcsr(_mm_getcsr() | 0x8040); +#elif defined(__arm__) && defined(__ARM_FP) && (__ARM_FP != 0) + uint32_t fpscr; + __asm__ __volatile__( + "VMRS %[fpscr], fpscr\n" + "ORR %[fpscr], 0x1000000\n" + "VMSR fpscr, %[fpscr]\n" + : [fpscr] "=r" (fpscr)); +#elif defined(__aarch64__) + uint64_t fpcr; + __asm__ __volatile__( + "MRS %[fpcr], fpcr\n" + "ORR %w[fpcr], %w[fpcr], 0x1000000\n" + "ORR %w[fpcr], %w[fpcr], 0x80000\n" + "MSR fpcr, %[fpcr]\n" + : [fpcr] "=r" (fpcr)); +#endif +} diff --git a/test/pthreadpool.cc b/test/pthreadpool.cc index e1c2559..4faf3be 100644 --- a/test/pthreadpool.cc +++ b/test/pthreadpool.cc @@ -2,110 +2,2851 @@ #include <pthreadpool.h> -const size_t itemsCount1D = 1024; +#include <algorithm> +#include <atomic> +#include <cstddef> +#include <memory> -TEST(SetupAndShutdown, Basic) { - pthreadpool* threadpool = pthreadpool_create(0); - EXPECT_TRUE(threadpool != nullptr); + +typedef std::unique_ptr<pthreadpool, decltype(&pthreadpool_destroy)> auto_pthreadpool_t; + + +const size_t kParallelize1DRange = 1223; +const size_t kParallelize1DTile1DRange = 1303; +const size_t kParallelize1DTile1DTile = 11; +const size_t kParallelize2DRangeI = 41; +const size_t kParallelize2DRangeJ = 43; +const size_t kParallelize2DTile1DRangeI = 43; +const size_t kParallelize2DTile1DRangeJ = 53; +const size_t kParallelize2DTile1DTileJ = 5; +const size_t kParallelize2DTile2DRangeI = 53; +const size_t kParallelize2DTile2DRangeJ = 59; +const size_t kParallelize2DTile2DTileI = 5; +const size_t kParallelize2DTile2DTileJ = 7; +const size_t kParallelize3DTile2DRangeI = 19; +const size_t kParallelize3DTile2DRangeJ = 23; +const size_t kParallelize3DTile2DRangeK = 29; +const size_t kParallelize3DTile2DTileJ = 2; +const size_t kParallelize3DTile2DTileK = 3; +const size_t kParallelize4DTile2DRangeI = 17; +const size_t kParallelize4DTile2DRangeJ = 19; +const size_t kParallelize4DTile2DRangeK = 23; +const size_t kParallelize4DTile2DRangeL = 29; +const size_t kParallelize4DTile2DTileK = 2; +const size_t kParallelize4DTile2DTileL = 3; +const size_t kParallelize5DTile2DRangeI = 13; +const size_t kParallelize5DTile2DRangeJ = 17; +const size_t kParallelize5DTile2DRangeK = 19; +const size_t kParallelize5DTile2DRangeL = 23; +const size_t kParallelize5DTile2DRangeM = 29; +const size_t kParallelize5DTile2DTileL = 3; +const size_t kParallelize5DTile2DTileM = 2; +const size_t kParallelize6DTile2DRangeI = 7; +const size_t kParallelize6DTile2DRangeJ = 11; +const size_t kParallelize6DTile2DRangeK = 13; +const size_t kParallelize6DTile2DRangeL = 17; +const size_t kParallelize6DTile2DRangeM = 19; +const size_t kParallelize6DTile2DRangeN = 23; +const size_t kParallelize6DTile2DTileM = 3; +const size_t kParallelize6DTile2DTileN = 2; + +const size_t kIncrementIterations = 101; +const size_t kIncrementIterations5D = 7; +const size_t kIncrementIterations6D = 3; + + +TEST(CreateAndDestroy, NullThreadPool) { + pthreadpool* threadpool = nullptr; pthreadpool_destroy(threadpool); } -static void computeNothing1D(void*, size_t) { +TEST(CreateAndDestroy, SingleThreadPool) { + pthreadpool* threadpool = pthreadpool_create(1); + ASSERT_TRUE(threadpool); + pthreadpool_destroy(threadpool); } -TEST(Compute1D, Basic) { +TEST(CreateAndDestroy, MultiThreadPool) { pthreadpool* threadpool = pthreadpool_create(0); - EXPECT_TRUE(threadpool != nullptr); - pthreadpool_compute_1d(threadpool, computeNothing1D, NULL, itemsCount1D); + ASSERT_TRUE(threadpool); pthreadpool_destroy(threadpool); } -static void checkRange1D(void*, size_t itemId) { - EXPECT_LT(itemId, itemsCount1D); +static void ComputeNothing1D(void*, size_t) { } -TEST(Compute1D, ValidRange) { - pthreadpool* threadpool = pthreadpool_create(0); - EXPECT_TRUE(threadpool != nullptr); - pthreadpool_compute_1d(threadpool, checkRange1D, NULL, itemsCount1D); - pthreadpool_destroy(threadpool); +TEST(Parallelize1D, SingleThreadPoolCompletes) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_1d(threadpool.get(), + ComputeNothing1D, + nullptr, + kParallelize1DRange, + 0 /* flags */); +} + +TEST(Parallelize1D, MultiThreadPoolCompletes) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_1d( + threadpool.get(), + ComputeNothing1D, + nullptr, + kParallelize1DRange, + 0 /* flags */); } -static void setTrue1D(bool indicators[], size_t itemId) { - indicators[itemId] = true; +static void CheckBounds1D(void*, size_t i) { + EXPECT_LT(i, kParallelize1DRange); } -TEST(Compute1D, AllItemsProcessed) { - bool processed[itemsCount1D]; - memset(processed, 0, sizeof(processed)); +TEST(Parallelize1D, SingleThreadPoolAllItemsInBounds) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool* threadpool = pthreadpool_create(0); - EXPECT_TRUE(threadpool != nullptr); - pthreadpool_compute_1d(threadpool, reinterpret_cast<pthreadpool_function_1d_t>(setTrue1D), processed, itemsCount1D); - for (size_t itemId = 0; itemId < itemsCount1D; itemId++) { - EXPECT_TRUE(processed[itemId]) << "Item " << itemId << " not processed"; + pthreadpool_parallelize_1d( + threadpool.get(), + CheckBounds1D, + nullptr, + kParallelize1DRange, + 0 /* flags */); +} + +TEST(Parallelize1D, MultiThreadPoolAllItemsInBounds) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); } - pthreadpool_destroy(threadpool); + + pthreadpool_parallelize_1d( + threadpool.get(), + CheckBounds1D, + nullptr, + kParallelize1DRange, + 0 /* flags */); } -static void increment1D(int counters[], size_t itemId) { - counters[itemId] += 1; +static void SetTrue1D(std::atomic_bool* processed_indicators, size_t i) { + processed_indicators[i].store(true, std::memory_order_relaxed); } -TEST(Compute1D, EachItemProcessedOnce) { - int processedCount[itemsCount1D]; - memset(processedCount, 0, sizeof(processedCount)); +TEST(Parallelize1D, SingleThreadPoolAllItemsProcessed) { + std::vector<std::atomic_bool> indicators(kParallelize1DRange); - pthreadpool* threadpool = pthreadpool_create(0); - EXPECT_TRUE(threadpool != nullptr); - pthreadpool_compute_1d(threadpool, reinterpret_cast<pthreadpool_function_1d_t>(increment1D), processedCount, itemsCount1D); - for (size_t itemId = 0; itemId < itemsCount1D; itemId++) { - EXPECT_EQ(1, processedCount[itemId]) << "Item " << itemId << " processed " << processedCount[itemId] << " times"; + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_1d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_1d_t>(SetTrue1D), + static_cast<void*>(indicators.data()), + kParallelize1DRange, + 0 /* flags */); + + for (size_t i = 0; i < kParallelize1DRange; i++) { + EXPECT_TRUE(indicators[i].load(std::memory_order_relaxed)) + << "Element " << i << " not processed"; } - pthreadpool_destroy(threadpool); } -TEST(Compute1D, EachItemProcessedMultipleTimes) { - int processedCount[itemsCount1D]; - memset(processedCount, 0, sizeof(processedCount)); - const size_t iterations = 100; +TEST(Parallelize1D, MultiThreadPoolAllItemsProcessed) { + std::vector<std::atomic_bool> indicators(kParallelize1DRange); - pthreadpool* threadpool = pthreadpool_create(0); - EXPECT_TRUE(threadpool != nullptr); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - for (size_t iteration = 0; iteration < iterations; iteration++) { - pthreadpool_compute_1d(threadpool, reinterpret_cast<pthreadpool_function_1d_t>(increment1D), processedCount, itemsCount1D); + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); } - for (size_t itemId = 0; itemId < itemsCount1D; itemId++) { - EXPECT_EQ(iterations, processedCount[itemId]) << "Item " << itemId << " processed " << processedCount[itemId] << " times"; + + pthreadpool_parallelize_1d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_1d_t>(SetTrue1D), + static_cast<void*>(indicators.data()), + kParallelize1DRange, + 0 /* flags */); + + for (size_t i = 0; i < kParallelize1DRange; i++) { + EXPECT_TRUE(indicators[i].load(std::memory_order_relaxed)) + << "Element " << i << " not processed"; + } +} + +static void Increment1D(std::atomic_int* processed_counters, size_t i) { + processed_counters[i].fetch_add(1, std::memory_order_relaxed); +} + +TEST(Parallelize1D, SingleThreadPoolEachItemProcessedOnce) { + std::vector<std::atomic_int> counters(kParallelize1DRange); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_1d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_1d_t>(Increment1D), + static_cast<void*>(counters.data()), + kParallelize1DRange, + 0 /* flags */); + + for (size_t i = 0; i < kParallelize1DRange; i++) { + EXPECT_EQ(counters[i].load(std::memory_order_relaxed), 1) + << "Element " << i << " was processed " << counters[i].load(std::memory_order_relaxed) << " times (expected: 1)"; + } +} + +TEST(Parallelize1D, MultiThreadPoolEachItemProcessedOnce) { + std::vector<std::atomic_int> counters(kParallelize1DRange); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_1d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_1d_t>(Increment1D), + static_cast<void*>(counters.data()), + kParallelize1DRange, + 0 /* flags */); + + for (size_t i = 0; i < kParallelize1DRange; i++) { + EXPECT_EQ(counters[i].load(std::memory_order_relaxed), 1) + << "Element " << i << " was processed " << counters[i].load(std::memory_order_relaxed) << " times (expected: 1)"; + } +} + +TEST(Parallelize1D, SingleThreadPoolEachItemProcessedMultipleTimes) { + std::vector<std::atomic_int> counters(kParallelize1DRange); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_1d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_1d_t>(Increment1D), + static_cast<void*>(counters.data()), + kParallelize1DRange, + 0 /* flags */); + } + + for (size_t i = 0; i < kParallelize1DRange; i++) { + EXPECT_EQ(counters[i].load(std::memory_order_relaxed), kIncrementIterations) + << "Element " << i << " was processed " << counters[i].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; } - pthreadpool_destroy(threadpool); } -static void workImbalance1D(volatile size_t* computedItems, size_t itemId) { - __sync_fetch_and_add(computedItems, 1); - if (itemId == 0) { - /* Wait until all items are computed */ - while (*computedItems != itemsCount1D) { - __sync_synchronize(); +TEST(Parallelize1D, MultiThreadPoolEachItemProcessedMultipleTimes) { + std::vector<std::atomic_int> counters(kParallelize1DRange); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_1d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_1d_t>(Increment1D), + static_cast<void*>(counters.data()), + kParallelize1DRange, + 0 /* flags */); + } + + for (size_t i = 0; i < kParallelize1DRange; i++) { + EXPECT_EQ(counters[i].load(std::memory_order_relaxed), kIncrementIterations) + << "Element " << i << " was processed " << counters[i].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } +} + +static void WorkImbalance1D(std::atomic_int* num_processed_items, size_t i) { + num_processed_items->fetch_add(1, std::memory_order_relaxed); + if (i == 0) { + /* Spin-wait until all items are computed */ + while (num_processed_items->load(std::memory_order_relaxed) != kParallelize1DRange) { + std::atomic_thread_fence(std::memory_order_acquire); } } } -TEST(Compute1D, WorkStealing) { - volatile size_t computedItems = 0; +TEST(Parallelize1D, MultiThreadPoolWorkStealing) { + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); - pthreadpool* threadpool = pthreadpool_create(0); - EXPECT_TRUE(threadpool != nullptr); + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); - pthreadpool_compute_1d(threadpool, reinterpret_cast<pthreadpool_function_1d_t>(workImbalance1D), reinterpret_cast<void*>(const_cast<size_t*>(&computedItems)), itemsCount1D); - EXPECT_EQ(computedItems, itemsCount1D); + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } - pthreadpool_destroy(threadpool); + pthreadpool_parallelize_1d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_1d_t>(WorkImbalance1D), + static_cast<void*>(&num_processed_items), + kParallelize1DRange, + 0 /* flags */); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize1DRange); +} + +static void ComputeNothing1DTile1D(void*, size_t, size_t) { +} + +TEST(Parallelize1DTile1D, SingleThreadPoolCompletes) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_1d_tile_1d(threadpool.get(), + ComputeNothing1DTile1D, + nullptr, + kParallelize1DTile1DRange, kParallelize1DTile1DTile, + 0 /* flags */); +} + +TEST(Parallelize1DTile1D, MultiThreadPoolCompletes) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_1d_tile_1d( + threadpool.get(), + ComputeNothing1DTile1D, + nullptr, + kParallelize1DTile1DRange, kParallelize1DTile1DTile, + 0 /* flags */); +} + +static void CheckBounds1DTile1D(void*, size_t start_i, size_t tile_i) { + EXPECT_LT(start_i, kParallelize1DTile1DRange); + EXPECT_LE(start_i + tile_i, kParallelize1DTile1DRange); +} + +TEST(Parallelize1DTile1D, SingleThreadPoolAllItemsInBounds) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_1d_tile_1d( + threadpool.get(), + CheckBounds1DTile1D, + nullptr, + kParallelize1DTile1DRange, kParallelize1DTile1DTile, + 0 /* flags */); +} + +TEST(Parallelize1DTile1D, MultiThreadPoolAllItemsInBounds) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_1d_tile_1d( + threadpool.get(), + CheckBounds1DTile1D, + nullptr, + kParallelize1DTile1DRange, kParallelize1DTile1DTile, + 0 /* flags */); +} + +static void CheckTiling1DTile1D(void*, size_t start_i, size_t tile_i) { + EXPECT_GT(tile_i, 0); + EXPECT_LE(tile_i, kParallelize1DTile1DTile); + EXPECT_EQ(start_i % kParallelize1DTile1DTile, 0); + EXPECT_EQ(tile_i, std::min<size_t>(kParallelize1DTile1DTile, kParallelize1DTile1DRange - start_i)); +} + +TEST(Parallelize1DTile1D, SingleThreadPoolUniformTiling) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_1d_tile_1d( + threadpool.get(), + CheckTiling1DTile1D, + nullptr, + kParallelize1DTile1DRange, kParallelize1DTile1DTile, + 0 /* flags */); +} + +TEST(Parallelize1DTile1D, MultiThreadPoolUniformTiling) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_1d_tile_1d( + threadpool.get(), + CheckTiling1DTile1D, + nullptr, + kParallelize1DTile1DRange, kParallelize1DTile1DTile, + 0 /* flags */); +} + +static void SetTrue1DTile1D(std::atomic_bool* processed_indicators, size_t start_i, size_t tile_i) { + for (size_t i = start_i; i < start_i + tile_i; i++) { + processed_indicators[i].store(true, std::memory_order_relaxed); + } +} + +TEST(Parallelize1DTile1D, SingleThreadPoolAllItemsProcessed) { + std::vector<std::atomic_bool> indicators(kParallelize1DTile1DRange); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_1d_tile_1d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_1d_tile_1d_t>(SetTrue1DTile1D), + static_cast<void*>(indicators.data()), + kParallelize1DTile1DRange, kParallelize1DTile1DTile, + 0 /* flags */); + + for (size_t i = 0; i < kParallelize1DTile1DRange; i++) { + EXPECT_TRUE(indicators[i].load(std::memory_order_relaxed)) + << "Element " << i << " not processed"; + } +} + +TEST(Parallelize1DTile1D, MultiThreadPoolAllItemsProcessed) { + std::vector<std::atomic_bool> indicators(kParallelize1DTile1DRange); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_1d_tile_1d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_1d_tile_1d_t>(SetTrue1DTile1D), + static_cast<void*>(indicators.data()), + kParallelize1DTile1DRange, kParallelize1DTile1DTile, + 0 /* flags */); + + for (size_t i = 0; i < kParallelize1DTile1DRange; i++) { + EXPECT_TRUE(indicators[i].load(std::memory_order_relaxed)) + << "Element " << i << " not processed"; + } +} + +static void Increment1DTile1D(std::atomic_int* processed_counters, size_t start_i, size_t tile_i) { + for (size_t i = start_i; i < start_i + tile_i; i++) { + processed_counters[i].fetch_add(1, std::memory_order_relaxed); + } +} + +TEST(Parallelize1DTile1D, SingleThreadPoolEachItemProcessedOnce) { + std::vector<std::atomic_int> counters(kParallelize1DTile1DRange); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_1d_tile_1d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_1d_tile_1d_t>(Increment1DTile1D), + static_cast<void*>(counters.data()), + kParallelize1DTile1DRange, kParallelize1DTile1DTile, + 0 /* flags */); + + for (size_t i = 0; i < kParallelize1DTile1DRange; i++) { + EXPECT_EQ(counters[i].load(std::memory_order_relaxed), 1) + << "Element " << i << " was processed " << counters[i].load(std::memory_order_relaxed) << " times (expected: 1)"; + } +} + +TEST(Parallelize1DTile1D, MultiThreadPoolEachItemProcessedOnce) { + std::vector<std::atomic_int> counters(kParallelize1DTile1DRange); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_1d_tile_1d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_1d_tile_1d_t>(Increment1DTile1D), + static_cast<void*>(counters.data()), + kParallelize1DTile1DRange, kParallelize1DTile1DTile, + 0 /* flags */); + + for (size_t i = 0; i < kParallelize1DTile1DRange; i++) { + EXPECT_EQ(counters[i].load(std::memory_order_relaxed), 1) + << "Element " << i << " was processed " << counters[i].load(std::memory_order_relaxed) << " times (expected: 1)"; + } +} + +TEST(Parallelize1DTile1D, SingleThreadPoolEachItemProcessedMultipleTimes) { + std::vector<std::atomic_int> counters(kParallelize1DTile1DRange); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_1d_tile_1d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_1d_tile_1d_t>(Increment1DTile1D), + static_cast<void*>(counters.data()), + kParallelize1DTile1DRange, kParallelize1DTile1DTile, + 0 /* flags */); + } + + for (size_t i = 0; i < kParallelize1DTile1DRange; i++) { + EXPECT_EQ(counters[i].load(std::memory_order_relaxed), kIncrementIterations) + << "Element " << i << " was processed " << counters[i].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } +} + +TEST(Parallelize1DTile1D, MultiThreadPoolEachItemProcessedMultipleTimes) { + std::vector<std::atomic_int> counters(kParallelize1DTile1DRange); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_1d_tile_1d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_1d_tile_1d_t>(Increment1DTile1D), + static_cast<void*>(counters.data()), + kParallelize1DTile1DRange, kParallelize1DTile1DTile, + 0 /* flags */); + } + + for (size_t i = 0; i < kParallelize1DTile1DRange; i++) { + EXPECT_EQ(counters[i].load(std::memory_order_relaxed), kIncrementIterations) + << "Element " << i << " was processed " << counters[i].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } +} + +static void WorkImbalance1DTile1D(std::atomic_int* num_processed_items, size_t start_i, size_t tile_i) { + num_processed_items->fetch_add(tile_i, std::memory_order_relaxed); + if (start_i == 0) { + /* Spin-wait until all items are computed */ + while (num_processed_items->load(std::memory_order_relaxed) != kParallelize1DTile1DRange) { + std::atomic_thread_fence(std::memory_order_acquire); + } + } +} + +TEST(Parallelize1DTile1D, MultiThreadPoolWorkStealing) { + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_1d_tile_1d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_1d_tile_1d_t>(WorkImbalance1DTile1D), + static_cast<void*>(&num_processed_items), + kParallelize1DTile1DRange, kParallelize1DTile1DTile, + 0 /* flags */); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize1DTile1DRange); +} + +static void ComputeNothing2D(void*, size_t, size_t) { +} + +TEST(Parallelize2D, SingleThreadPoolCompletes) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_2d(threadpool.get(), + ComputeNothing2D, + nullptr, + kParallelize2DRangeI, kParallelize2DRangeJ, + 0 /* flags */); +} + +TEST(Parallelize2D, MultiThreadPoolCompletes) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_2d( + threadpool.get(), + ComputeNothing2D, + nullptr, + kParallelize2DRangeI, kParallelize2DRangeJ, + 0 /* flags */); +} + +static void CheckBounds2D(void*, size_t i, size_t j) { + EXPECT_LT(i, kParallelize2DRangeI); + EXPECT_LT(j, kParallelize2DRangeJ); +} + +TEST(Parallelize2D, SingleThreadPoolAllItemsInBounds) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_2d( + threadpool.get(), + CheckBounds2D, + nullptr, + kParallelize2DRangeI, kParallelize2DRangeJ, + 0 /* flags */); +} + +TEST(Parallelize2D, MultiThreadPoolAllItemsInBounds) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_2d( + threadpool.get(), + CheckBounds2D, + nullptr, + kParallelize2DRangeI, kParallelize2DRangeJ, + 0 /* flags */); +} + +static void SetTrue2D(std::atomic_bool* processed_indicators, size_t i, size_t j) { + const size_t linear_idx = i * kParallelize2DRangeJ + j; + processed_indicators[linear_idx].store(true, std::memory_order_relaxed); +} + +TEST(Parallelize2D, SingleThreadPoolAllItemsProcessed) { + std::vector<std::atomic_bool> indicators(kParallelize2DRangeI * kParallelize2DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_2d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_2d_t>(SetTrue2D), + static_cast<void*>(indicators.data()), + kParallelize2DRangeI, kParallelize2DRangeJ, + 0 /* flags */); + + for (size_t i = 0; i < kParallelize2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DRangeJ + j; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ") not processed"; + } + } +} + +TEST(Parallelize2D, MultiThreadPoolAllItemsProcessed) { + std::vector<std::atomic_bool> indicators(kParallelize2DRangeI * kParallelize2DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_2d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_2d_t>(SetTrue2D), + static_cast<void*>(indicators.data()), + kParallelize2DRangeI, kParallelize2DRangeJ, + 0 /* flags */); + + for (size_t i = 0; i < kParallelize2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DRangeJ + j; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ") not processed"; + } + } +} + +static void Increment2D(std::atomic_int* processed_counters, size_t i, size_t j) { + const size_t linear_idx = i * kParallelize2DRangeJ + j; + processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); +} + +TEST(Parallelize2D, SingleThreadPoolEachItemProcessedOnce) { + std::vector<std::atomic_int> counters(kParallelize2DRangeI * kParallelize2DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_2d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_2d_t>(Increment2D), + static_cast<void*>(counters.data()), + kParallelize2DRangeI, kParallelize2DRangeJ, + 0 /* flags */); + + for (size_t i = 0; i < kParallelize2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; + } + } +} + +TEST(Parallelize2D, MultiThreadPoolEachItemProcessedOnce) { + std::vector<std::atomic_int> counters(kParallelize2DRangeI * kParallelize2DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_2d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_2d_t>(Increment2D), + static_cast<void*>(counters.data()), + kParallelize2DRangeI, kParallelize2DRangeJ, + 0 /* flags */); + + for (size_t i = 0; i < kParallelize2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; + } + } +} + +TEST(Parallelize2D, SingleThreadPoolEachItemProcessedMultipleTimes) { + std::vector<std::atomic_int> counters(kParallelize2DRangeI * kParallelize2DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_2d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_2d_t>(Increment2D), + static_cast<void*>(counters.data()), + kParallelize2DRangeI, kParallelize2DRangeJ, + 0 /* flags */); + } + + for (size_t i = 0; i < kParallelize2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } +} + +TEST(Parallelize2D, MultiThreadPoolEachItemProcessedMultipleTimes) { + std::vector<std::atomic_int> counters(kParallelize2DRangeI * kParallelize2DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_2d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_2d_t>(Increment2D), + static_cast<void*>(counters.data()), + kParallelize2DRangeI, kParallelize2DRangeJ, + 0 /* flags */); + } + + for (size_t i = 0; i < kParallelize2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } +} + +static void WorkImbalance2D(std::atomic_int* num_processed_items, size_t i, size_t j) { + num_processed_items->fetch_add(1, std::memory_order_relaxed); + if (i == 0 && j == 0) { + /* Spin-wait until all items are computed */ + while (num_processed_items->load(std::memory_order_relaxed) != kParallelize2DRangeI * kParallelize2DRangeJ) { + std::atomic_thread_fence(std::memory_order_acquire); + } + } +} + +TEST(Parallelize2D, MultiThreadPoolWorkStealing) { + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_2d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_2d_t>(WorkImbalance2D), + static_cast<void*>(&num_processed_items), + kParallelize2DRangeI, kParallelize2DRangeJ, + 0 /* flags */); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize2DRangeI * kParallelize2DRangeJ); +} + +static void ComputeNothing2DTile1D(void*, size_t, size_t, size_t) { +} + +TEST(Parallelize2DTile1D, SingleThreadPoolCompletes) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_2d_tile_1d(threadpool.get(), + ComputeNothing2DTile1D, + nullptr, + kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, + 0 /* flags */); +} + +TEST(Parallelize2DTile1D, MultiThreadPoolCompletes) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_2d_tile_1d( + threadpool.get(), + ComputeNothing2DTile1D, + nullptr, + kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, + 0 /* flags */); +} + +static void CheckBounds2DTile1D(void*, size_t i, size_t start_j, size_t tile_j) { + EXPECT_LT(i, kParallelize2DTile1DRangeI); + EXPECT_LT(start_j, kParallelize2DTile1DRangeJ); + EXPECT_LE(start_j + tile_j, kParallelize2DTile1DRangeJ); +} + +TEST(Parallelize2DTile1D, SingleThreadPoolAllItemsInBounds) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_2d_tile_1d( + threadpool.get(), + CheckBounds2DTile1D, + nullptr, + kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, + 0 /* flags */); +} + +TEST(Parallelize2DTile1D, MultiThreadPoolAllItemsInBounds) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_2d_tile_1d( + threadpool.get(), + CheckBounds2DTile1D, + nullptr, + kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, + 0 /* flags */); +} + +static void CheckTiling2DTile1D(void*, size_t i, size_t start_j, size_t tile_j) { + EXPECT_GT(tile_j, 0); + EXPECT_LE(tile_j, kParallelize2DTile1DTileJ); + EXPECT_EQ(start_j % kParallelize2DTile1DTileJ, 0); + EXPECT_EQ(tile_j, std::min<size_t>(kParallelize2DTile1DTileJ, kParallelize2DTile1DRangeJ - start_j)); +} + +TEST(Parallelize2DTile1D, SingleThreadPoolUniformTiling) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_2d_tile_1d( + threadpool.get(), + CheckTiling2DTile1D, + nullptr, + kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, + 0 /* flags */); +} + +TEST(Parallelize2DTile1D, MultiThreadPoolUniformTiling) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_2d_tile_1d( + threadpool.get(), + CheckTiling2DTile1D, + nullptr, + kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, + 0 /* flags */); +} + +static void SetTrue2DTile1D(std::atomic_bool* processed_indicators, size_t i, size_t start_j, size_t tile_j) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; + processed_indicators[linear_idx].store(true, std::memory_order_relaxed); + } +} + +TEST(Parallelize2DTile1D, SingleThreadPoolAllItemsProcessed) { + std::vector<std::atomic_bool> indicators(kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_2d_tile_1d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_2d_tile_1d_t>(SetTrue2DTile1D), + static_cast<void*>(indicators.data()), + kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, + 0 /* flags */); + + for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ") not processed"; + } + } +} + +TEST(Parallelize2DTile1D, MultiThreadPoolAllItemsProcessed) { + std::vector<std::atomic_bool> indicators(kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_2d_tile_1d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_2d_tile_1d_t>(SetTrue2DTile1D), + static_cast<void*>(indicators.data()), + kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, + 0 /* flags */); + + for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ") not processed"; + } + } +} + +static void Increment2DTile1D(std::atomic_int* processed_counters, size_t i, size_t start_j, size_t tile_j) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; + processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); + } +} + +TEST(Parallelize2DTile1D, SingleThreadPoolEachItemProcessedOnce) { + std::vector<std::atomic_int> counters(kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_2d_tile_1d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_2d_tile_1d_t>(Increment2DTile1D), + static_cast<void*>(counters.data()), + kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, + 0 /* flags */); + + for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; + } + } +} + +TEST(Parallelize2DTile1D, MultiThreadPoolEachItemProcessedOnce) { + std::vector<std::atomic_int> counters(kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_2d_tile_1d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_2d_tile_1d_t>(Increment2DTile1D), + static_cast<void*>(counters.data()), + kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, + 0 /* flags */); + + for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; + } + } +} + +TEST(Parallelize2DTile1D, SingleThreadPoolEachItemProcessedMultipleTimes) { + std::vector<std::atomic_int> counters(kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_2d_tile_1d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_2d_tile_1d_t>(Increment2DTile1D), + static_cast<void*>(counters.data()), + kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, + 0 /* flags */); + } + + for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } +} + +TEST(Parallelize2DTile1D, MultiThreadPoolEachItemProcessedMultipleTimes) { + std::vector<std::atomic_int> counters(kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_2d_tile_1d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_2d_tile_1d_t>(Increment2DTile1D), + static_cast<void*>(counters.data()), + kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, + 0 /* flags */); + } + + for (size_t i = 0; i < kParallelize2DTile1DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile1DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile1DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } +} + +static void WorkImbalance2DTile1D(std::atomic_int* num_processed_items, size_t i, size_t start_j, size_t tile_j) { + num_processed_items->fetch_add(tile_j, std::memory_order_relaxed); + if (i == 0 && start_j == 0) { + /* Spin-wait until all items are computed */ + while (num_processed_items->load(std::memory_order_relaxed) != kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ) { + std::atomic_thread_fence(std::memory_order_acquire); + } + } +} + +TEST(Parallelize2DTile1D, MultiThreadPoolWorkStealing) { + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_2d_tile_1d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_2d_tile_1d_t>(WorkImbalance2DTile1D), + static_cast<void*>(&num_processed_items), + kParallelize2DTile1DRangeI, kParallelize2DTile1DRangeJ, kParallelize2DTile1DTileJ, + 0 /* flags */); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize2DTile1DRangeI * kParallelize2DTile1DRangeJ); +} + +static void ComputeNothing2DTile2D(void*, size_t, size_t, size_t, size_t) { +} + +TEST(Parallelize2DTile2D, SingleThreadPoolCompletes) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_2d_tile_2d(threadpool.get(), + ComputeNothing2DTile2D, + nullptr, + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, + 0 /* flags */); +} + +TEST(Parallelize2DTile2D, MultiThreadPoolCompletes) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_2d_tile_2d( + threadpool.get(), + ComputeNothing2DTile2D, + nullptr, + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, + 0 /* flags */); +} + +static void CheckBounds2DTile2D(void*, size_t start_i, size_t start_j, size_t tile_i, size_t tile_j) { + EXPECT_LT(start_i, kParallelize2DTile2DRangeI); + EXPECT_LT(start_j, kParallelize2DTile2DRangeJ); + EXPECT_LE(start_i + tile_i, kParallelize2DTile2DRangeI); + EXPECT_LE(start_j + tile_j, kParallelize2DTile2DRangeJ); +} + +TEST(Parallelize2DTile2D, SingleThreadPoolAllItemsInBounds) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_2d_tile_2d( + threadpool.get(), + CheckBounds2DTile2D, + nullptr, + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, + 0 /* flags */); +} + +TEST(Parallelize2DTile2D, MultiThreadPoolAllItemsInBounds) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_2d_tile_2d( + threadpool.get(), + CheckBounds2DTile2D, + nullptr, + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, + 0 /* flags */); +} + +static void CheckTiling2DTile2D(void*, size_t start_i, size_t start_j, size_t tile_i, size_t tile_j) { + EXPECT_GT(tile_i, 0); + EXPECT_LE(tile_i, kParallelize2DTile2DTileI); + EXPECT_EQ(start_i % kParallelize2DTile2DTileI, 0); + EXPECT_EQ(tile_i, std::min<size_t>(kParallelize2DTile2DTileI, kParallelize2DTile2DRangeI - start_i)); + + EXPECT_GT(tile_j, 0); + EXPECT_LE(tile_j, kParallelize2DTile2DTileJ); + EXPECT_EQ(start_j % kParallelize2DTile2DTileJ, 0); + EXPECT_EQ(tile_j, std::min<size_t>(kParallelize2DTile2DTileJ, kParallelize2DTile2DRangeJ - start_j)); +} + +TEST(Parallelize2DTile2D, SingleThreadPoolUniformTiling) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_2d_tile_2d( + threadpool.get(), + CheckTiling2DTile2D, + nullptr, + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, + 0 /* flags */); +} + +TEST(Parallelize2DTile2D, MultiThreadPoolUniformTiling) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_2d_tile_2d( + threadpool.get(), + CheckTiling2DTile2D, + nullptr, + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, + 0 /* flags */); +} + +static void SetTrue2DTile2D(std::atomic_bool* processed_indicators, size_t start_i, size_t start_j, size_t tile_i, size_t tile_j) { + for (size_t i = start_i; i < start_i + tile_i; i++) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + processed_indicators[linear_idx].store(true, std::memory_order_relaxed); + } + } +} + +TEST(Parallelize2DTile2D, SingleThreadPoolAllItemsProcessed) { + std::vector<std::atomic_bool> indicators(kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_2d_tile_2d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_2d_tile_2d_t>(SetTrue2DTile2D), + static_cast<void*>(indicators.data()), + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, + 0 /* flags */); + + for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ") not processed"; + } + } +} + +TEST(Parallelize2DTile2D, MultiThreadPoolAllItemsProcessed) { + std::vector<std::atomic_bool> indicators(kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_2d_tile_2d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_2d_tile_2d_t>(SetTrue2DTile2D), + static_cast<void*>(indicators.data()), + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, + 0 /* flags */); + + for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ") not processed"; + } + } +} + +static void Increment2DTile2D(std::atomic_int* processed_counters, size_t start_i, size_t start_j, size_t tile_i, size_t tile_j) { + for (size_t i = start_i; i < start_i + tile_i; i++) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); + } + } +} + +TEST(Parallelize2DTile2D, SingleThreadPoolEachItemProcessedOnce) { + std::vector<std::atomic_int> counters(kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_2d_tile_2d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_2d_tile_2d_t>(Increment2DTile2D), + static_cast<void*>(counters.data()), + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, + 0 /* flags */); + + for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; + } + } +} + +TEST(Parallelize2DTile2D, MultiThreadPoolEachItemProcessedOnce) { + std::vector<std::atomic_int> counters(kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_2d_tile_2d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_2d_tile_2d_t>(Increment2DTile2D), + static_cast<void*>(counters.data()), + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, + 0 /* flags */); + + for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; + } + } +} + +TEST(Parallelize2DTile2D, SingleThreadPoolEachItemProcessedMultipleTimes) { + std::vector<std::atomic_int> counters(kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_2d_tile_2d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_2d_tile_2d_t>(Increment2DTile2D), + static_cast<void*>(counters.data()), + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, + 0 /* flags */); + } + + for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } } -int main(int argc, char* argv[]) { - setenv("TERM", "xterm-256color", 0); - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); +TEST(Parallelize2DTile2D, MultiThreadPoolEachItemProcessedMultipleTimes) { + std::vector<std::atomic_int> counters(kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_2d_tile_2d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_2d_tile_2d_t>(Increment2DTile2D), + static_cast<void*>(counters.data()), + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, + 0 /* flags */); + } + + for (size_t i = 0; i < kParallelize2DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize2DTile2DRangeJ; j++) { + const size_t linear_idx = i * kParallelize2DTile2DRangeJ + j; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations) + << "Element (" << i << ", " << j << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } +} + +static void WorkImbalance2DTile2D(std::atomic_int* num_processed_items, size_t start_i, size_t start_j, size_t tile_i, size_t tile_j) { + num_processed_items->fetch_add(tile_i * tile_j, std::memory_order_relaxed); + if (start_i == 0 && start_j == 0) { + /* Spin-wait until all items are computed */ + while (num_processed_items->load(std::memory_order_relaxed) != kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ) { + std::atomic_thread_fence(std::memory_order_acquire); + } + } +} + +TEST(Parallelize2DTile2D, MultiThreadPoolWorkStealing) { + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_2d_tile_2d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_2d_tile_2d_t>(WorkImbalance2DTile2D), + static_cast<void*>(&num_processed_items), + kParallelize2DTile2DRangeI, kParallelize2DTile2DRangeJ, + kParallelize2DTile2DTileI, kParallelize2DTile2DTileJ, + 0 /* flags */); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize2DTile2DRangeI * kParallelize2DTile2DRangeJ); +} + +static void ComputeNothing3DTile2D(void*, size_t, size_t, size_t, size_t, size_t) { +} + +TEST(Parallelize3DTile2D, SingleThreadPoolCompletes) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_3d_tile_2d(threadpool.get(), + ComputeNothing3DTile2D, + nullptr, + kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, + kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, + 0 /* flags */); +} + +TEST(Parallelize3DTile2D, MultiThreadPoolCompletes) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_3d_tile_2d( + threadpool.get(), + ComputeNothing3DTile2D, + nullptr, + kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, + kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, + 0 /* flags */); +} + +static void CheckBounds3DTile2D(void*, size_t i, size_t start_j, size_t start_k, size_t tile_j, size_t tile_k) { + EXPECT_LT(i, kParallelize3DTile2DRangeI); + EXPECT_LT(start_j, kParallelize3DTile2DRangeJ); + EXPECT_LT(start_k, kParallelize3DTile2DRangeK); + EXPECT_LE(start_j + tile_j, kParallelize3DTile2DRangeJ); + EXPECT_LE(start_k + tile_k, kParallelize3DTile2DRangeK); +} + +TEST(Parallelize3DTile2D, SingleThreadPoolAllItemsInBounds) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_3d_tile_2d( + threadpool.get(), + CheckBounds3DTile2D, + nullptr, + kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, + kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, + 0 /* flags */); +} + +TEST(Parallelize3DTile2D, MultiThreadPoolAllItemsInBounds) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_3d_tile_2d( + threadpool.get(), + CheckBounds3DTile2D, + nullptr, + kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, + kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, + 0 /* flags */); +} + +static void CheckTiling3DTile2D(void*, size_t i, size_t start_j, size_t start_k, size_t tile_j, size_t tile_k) { + EXPECT_GT(tile_j, 0); + EXPECT_LE(tile_j, kParallelize3DTile2DTileJ); + EXPECT_EQ(start_j % kParallelize3DTile2DTileJ, 0); + EXPECT_EQ(tile_j, std::min<size_t>(kParallelize3DTile2DTileJ, kParallelize3DTile2DRangeJ - start_j)); + + EXPECT_GT(tile_k, 0); + EXPECT_LE(tile_k, kParallelize3DTile2DTileK); + EXPECT_EQ(start_k % kParallelize3DTile2DTileK, 0); + EXPECT_EQ(tile_k, std::min<size_t>(kParallelize3DTile2DTileK, kParallelize3DTile2DRangeK - start_k)); +} + +TEST(Parallelize3DTile2D, SingleThreadPoolUniformTiling) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_3d_tile_2d( + threadpool.get(), + CheckTiling3DTile2D, + nullptr, + kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, + kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, + 0 /* flags */); +} + +TEST(Parallelize3DTile2D, MultiThreadPoolUniformTiling) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_3d_tile_2d( + threadpool.get(), + CheckTiling3DTile2D, + nullptr, + kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, + kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, + 0 /* flags */); +} + +static void SetTrue3DTile2D(std::atomic_bool* processed_indicators, size_t i, size_t start_j, size_t start_k, size_t tile_j, size_t tile_k) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + for (size_t k = start_k; k < start_k + tile_k; k++) { + const size_t linear_idx = (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + k; + processed_indicators[linear_idx].store(true, std::memory_order_relaxed); + } + } +} + +TEST(Parallelize3DTile2D, SingleThreadPoolAllItemsProcessed) { + std::vector<std::atomic_bool> indicators(kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_3d_tile_2d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_3d_tile_2d_t>(SetTrue3DTile2D), + static_cast<void*>(indicators.data()), + kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, + kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, + 0 /* flags */); + + for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++) { + const size_t linear_idx = (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + k; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ") not processed"; + } + } + } +} + +TEST(Parallelize3DTile2D, MultiThreadPoolAllItemsProcessed) { + std::vector<std::atomic_bool> indicators(kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_3d_tile_2d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_3d_tile_2d_t>(SetTrue3DTile2D), + static_cast<void*>(indicators.data()), + kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, + kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, + 0 /* flags */); + + for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++) { + const size_t linear_idx = (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + k; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ") not processed"; + } + } + } +} + +static void Increment3DTile2D(std::atomic_int* processed_counters, size_t i, size_t start_j, size_t start_k, size_t tile_j, size_t tile_k) { + for (size_t j = start_j; j < start_j + tile_j; j++) { + for (size_t k = start_k; k < start_k + tile_k; k++) { + const size_t linear_idx = (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + k; + processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); + } + } +} + +TEST(Parallelize3DTile2D, SingleThreadPoolEachItemProcessedOnce) { + std::vector<std::atomic_int> counters(kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_3d_tile_2d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_3d_tile_2d_t>(Increment3DTile2D), + static_cast<void*>(counters.data()), + kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, + kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, + 0 /* flags */); + + for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++) { + const size_t linear_idx = (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + k; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; + } + } + } +} + +TEST(Parallelize3DTile2D, MultiThreadPoolEachItemProcessedOnce) { + std::vector<std::atomic_int> counters(kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_3d_tile_2d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_3d_tile_2d_t>(Increment3DTile2D), + static_cast<void*>(counters.data()), + kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, + kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, + 0 /* flags */); + + for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++) { + const size_t linear_idx = (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + k; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; + } + } + } +} + +TEST(Parallelize3DTile2D, SingleThreadPoolEachItemProcessedMultipleTimes) { + std::vector<std::atomic_int> counters(kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_3d_tile_2d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_3d_tile_2d_t>(Increment3DTile2D), + static_cast<void*>(counters.data()), + kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, + kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, + 0 /* flags */); + } + + for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++) { + const size_t linear_idx = (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + k; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations) + << "Element (" << i << ", " << j << ", " << k << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } + } +} + +TEST(Parallelize3DTile2D, MultiThreadPoolEachItemProcessedMultipleTimes) { + std::vector<std::atomic_int> counters(kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_3d_tile_2d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_3d_tile_2d_t>(Increment3DTile2D), + static_cast<void*>(counters.data()), + kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, + kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, + 0 /* flags */); + } + + for (size_t i = 0; i < kParallelize3DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize3DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize3DTile2DRangeK; k++) { + const size_t linear_idx = (i * kParallelize3DTile2DRangeJ + j) * kParallelize3DTile2DRangeK + k; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations) + << "Element (" << i << ", " << j << ", " << k << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } + } +} + +static void WorkImbalance3DTile2D(std::atomic_int* num_processed_items, size_t i, size_t start_j, size_t start_k, size_t tile_j, size_t tile_k) { + num_processed_items->fetch_add(tile_j * tile_k, std::memory_order_relaxed); + if (i == 0 && start_j == 0 && start_k == 0) { + /* Spin-wait until all items are computed */ + while (num_processed_items->load(std::memory_order_relaxed) != kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK) { + std::atomic_thread_fence(std::memory_order_acquire); + } + } +} + +TEST(Parallelize3DTile2D, MultiThreadPoolWorkStealing) { + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_3d_tile_2d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_3d_tile_2d_t>(WorkImbalance3DTile2D), + static_cast<void*>(&num_processed_items), + kParallelize3DTile2DRangeI, kParallelize3DTile2DRangeJ, kParallelize3DTile2DRangeK, + kParallelize3DTile2DTileJ, kParallelize3DTile2DTileK, + 0 /* flags */); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize3DTile2DRangeI * kParallelize3DTile2DRangeJ * kParallelize3DTile2DRangeK); +} + +static void ComputeNothing4DTile2D(void*, size_t, size_t, size_t, size_t, size_t, size_t) { +} + +TEST(Parallelize4DTile2D, SingleThreadPoolCompletes) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_4d_tile_2d(threadpool.get(), + ComputeNothing4DTile2D, + nullptr, + kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, + kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, + 0 /* flags */); +} + +TEST(Parallelize4DTile2D, MultiThreadPoolCompletes) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_4d_tile_2d( + threadpool.get(), + ComputeNothing4DTile2D, + nullptr, + kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, + kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, + 0 /* flags */); +} + +static void CheckBounds4DTile2D(void*, size_t i, size_t j, size_t start_k, size_t start_l, size_t tile_k, size_t tile_l) { + EXPECT_LT(i, kParallelize4DTile2DRangeI); + EXPECT_LT(j, kParallelize4DTile2DRangeJ); + EXPECT_LT(start_k, kParallelize4DTile2DRangeK); + EXPECT_LT(start_l, kParallelize4DTile2DRangeL); + EXPECT_LE(start_k + tile_k, kParallelize4DTile2DRangeK); + EXPECT_LE(start_l + tile_l, kParallelize4DTile2DRangeL); +} + +TEST(Parallelize4DTile2D, SingleThreadPoolAllItemsInBounds) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_4d_tile_2d( + threadpool.get(), + CheckBounds4DTile2D, + nullptr, + kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, + kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, + 0 /* flags */); +} + +TEST(Parallelize4DTile2D, MultiThreadPoolAllItemsInBounds) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_4d_tile_2d( + threadpool.get(), + CheckBounds4DTile2D, + nullptr, + kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, + kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, + 0 /* flags */); +} + +static void CheckTiling4DTile2D(void*, size_t i, size_t j, size_t start_k, size_t start_l, size_t tile_k, size_t tile_l) { + EXPECT_GT(tile_k, 0); + EXPECT_LE(tile_k, kParallelize4DTile2DTileK); + EXPECT_EQ(start_k % kParallelize4DTile2DTileK, 0); + EXPECT_EQ(tile_k, std::min<size_t>(kParallelize4DTile2DTileK, kParallelize4DTile2DRangeK - start_k)); + + EXPECT_GT(tile_l, 0); + EXPECT_LE(tile_l, kParallelize4DTile2DTileL); + EXPECT_EQ(start_l % kParallelize4DTile2DTileL, 0); + EXPECT_EQ(tile_l, std::min<size_t>(kParallelize4DTile2DTileL, kParallelize4DTile2DRangeL - start_l)); +} + +TEST(Parallelize4DTile2D, SingleThreadPoolUniformTiling) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_4d_tile_2d( + threadpool.get(), + CheckTiling4DTile2D, + nullptr, + kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, + kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, + 0 /* flags */); +} + +TEST(Parallelize4DTile2D, MultiThreadPoolUniformTiling) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_4d_tile_2d( + threadpool.get(), + CheckTiling4DTile2D, + nullptr, + kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, + kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, + 0 /* flags */); +} + +static void SetTrue4DTile2D(std::atomic_bool* processed_indicators, size_t i, size_t j, size_t start_k, size_t start_l, size_t tile_k, size_t tile_l) { + for (size_t k = start_k; k < start_k + tile_k; k++) { + for (size_t l = start_l; l < start_l + tile_l; l++) { + const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * kParallelize4DTile2DRangeK + k) * kParallelize4DTile2DRangeL + l; + processed_indicators[linear_idx].store(true, std::memory_order_relaxed); + } + } +} + +TEST(Parallelize4DTile2D, SingleThreadPoolAllItemsProcessed) { + std::vector<std::atomic_bool> indicators(kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_4d_tile_2d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_4d_tile_2d_t>(SetTrue4DTile2D), + static_cast<void*>(indicators.data()), + kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, + kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, + 0 /* flags */); + + for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) { + for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++) { + const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * kParallelize4DTile2DRangeK + k) * kParallelize4DTile2DRangeL + l; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ", " << l << ") not processed"; + } + } + } + } +} + +TEST(Parallelize4DTile2D, MultiThreadPoolAllItemsProcessed) { + std::vector<std::atomic_bool> indicators(kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_4d_tile_2d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_4d_tile_2d_t>(SetTrue4DTile2D), + static_cast<void*>(indicators.data()), + kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, + kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, + 0 /* flags */); + + for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) { + for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++) { + const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * kParallelize4DTile2DRangeK + k) * kParallelize4DTile2DRangeL + l; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ", " << l << ") not processed"; + } + } + } + } +} + +static void Increment4DTile2D(std::atomic_int* processed_counters, size_t i, size_t j, size_t start_k, size_t start_l, size_t tile_k, size_t tile_l) { + for (size_t k = start_k; k < start_k + tile_k; k++) { + for (size_t l = start_l; l < start_l + tile_l; l++) { + const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * kParallelize4DTile2DRangeK + k) * kParallelize4DTile2DRangeL + l; + processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); + } + } +} + +TEST(Parallelize4DTile2D, SingleThreadPoolEachItemProcessedOnce) { + std::vector<std::atomic_int> counters(kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_4d_tile_2d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_4d_tile_2d_t>(Increment4DTile2D), + static_cast<void*>(counters.data()), + kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, + kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, + 0 /* flags */); + + for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) { + for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++) { + const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * kParallelize4DTile2DRangeK + k) * kParallelize4DTile2DRangeL + l; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ", " << l << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; + } + } + } + } +} + +TEST(Parallelize4DTile2D, MultiThreadPoolEachItemProcessedOnce) { + std::vector<std::atomic_int> counters(kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_4d_tile_2d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_4d_tile_2d_t>(Increment4DTile2D), + static_cast<void*>(counters.data()), + kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, + kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, + 0 /* flags */); + + for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) { + for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++) { + const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * kParallelize4DTile2DRangeK + k) * kParallelize4DTile2DRangeL + l; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ", " << l << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; + } + } + } + } +} + +TEST(Parallelize4DTile2D, SingleThreadPoolEachItemProcessedMultipleTimes) { + std::vector<std::atomic_int> counters(kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_4d_tile_2d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_4d_tile_2d_t>(Increment4DTile2D), + static_cast<void*>(counters.data()), + kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, + kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, + 0 /* flags */); + } + + for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) { + for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++) { + const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * kParallelize4DTile2DRangeK + k) * kParallelize4DTile2DRangeL + l; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations) + << "Element (" << i << ", " << j << ", " << k << ", " << l << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } + } + } +} + +TEST(Parallelize4DTile2D, MultiThreadPoolEachItemProcessedMultipleTimes) { + std::vector<std::atomic_int> counters(kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + for (size_t iteration = 0; iteration < kIncrementIterations; iteration++) { + pthreadpool_parallelize_4d_tile_2d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_4d_tile_2d_t>(Increment4DTile2D), + static_cast<void*>(counters.data()), + kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, + kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, + 0 /* flags */); + } + + for (size_t i = 0; i < kParallelize4DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize4DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize4DTile2DRangeK; k++) { + for (size_t l = 0; l < kParallelize4DTile2DRangeL; l++) { + const size_t linear_idx = ((i * kParallelize4DTile2DRangeJ + j) * kParallelize4DTile2DRangeK + k) * kParallelize4DTile2DRangeL + l; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations) + << "Element (" << i << ", " << j << ", " << k << ", " << l << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations << ")"; + } + } + } + } +} + +static void WorkImbalance4DTile2D(std::atomic_int* num_processed_items, size_t i, size_t j, size_t start_k, size_t start_l, size_t tile_k, size_t tile_l) { + num_processed_items->fetch_add(tile_k * tile_l, std::memory_order_relaxed); + if (i == 0 && j == 0 && start_k == 0 && start_l == 0) { + /* Spin-wait until all items are computed */ + while (num_processed_items->load(std::memory_order_relaxed) != kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL) { + std::atomic_thread_fence(std::memory_order_acquire); + } + } +} + +TEST(Parallelize4DTile2D, MultiThreadPoolWorkStealing) { + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_4d_tile_2d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_4d_tile_2d_t>(WorkImbalance4DTile2D), + static_cast<void*>(&num_processed_items), + kParallelize4DTile2DRangeI, kParallelize4DTile2DRangeJ, kParallelize4DTile2DRangeK, kParallelize4DTile2DRangeL, + kParallelize4DTile2DTileK, kParallelize4DTile2DTileL, + 0 /* flags */); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize4DTile2DRangeI * kParallelize4DTile2DRangeJ * kParallelize4DTile2DRangeK * kParallelize4DTile2DRangeL); +} + +static void ComputeNothing5DTile2D(void*, size_t, size_t, size_t, size_t, size_t, size_t, size_t) { +} + +TEST(Parallelize5DTile2D, SingleThreadPoolCompletes) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_5d_tile_2d(threadpool.get(), + ComputeNothing5DTile2D, + nullptr, + kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, kParallelize5DTile2DRangeK, kParallelize5DTile2DRangeL, kParallelize5DTile2DRangeM, + kParallelize5DTile2DTileL, kParallelize5DTile2DTileM, + 0 /* flags */); +} + +TEST(Parallelize5DTile2D, MultiThreadPoolCompletes) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_5d_tile_2d( + threadpool.get(), + ComputeNothing5DTile2D, + nullptr, + kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, kParallelize5DTile2DRangeK, kParallelize5DTile2DRangeL, kParallelize5DTile2DRangeM, + kParallelize5DTile2DTileL, kParallelize5DTile2DTileM, + 0 /* flags */); +} + +static void CheckBounds5DTile2D(void*, size_t i, size_t j, size_t k, size_t start_l, size_t start_m, size_t tile_l, size_t tile_m) { + EXPECT_LT(i, kParallelize5DTile2DRangeI); + EXPECT_LT(j, kParallelize5DTile2DRangeJ); + EXPECT_LT(k, kParallelize5DTile2DRangeK); + EXPECT_LT(start_l, kParallelize5DTile2DRangeL); + EXPECT_LT(start_m, kParallelize5DTile2DRangeM); + EXPECT_LE(start_l + tile_l, kParallelize5DTile2DRangeL); + EXPECT_LE(start_m + tile_m, kParallelize5DTile2DRangeM); +} + +TEST(Parallelize5DTile2D, SingleThreadPoolAllItemsInBounds) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_5d_tile_2d( + threadpool.get(), + CheckBounds5DTile2D, + nullptr, + kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, kParallelize5DTile2DRangeK, kParallelize5DTile2DRangeL, kParallelize5DTile2DRangeM, + kParallelize5DTile2DTileL, kParallelize5DTile2DTileM, + 0 /* flags */); +} + +TEST(Parallelize5DTile2D, MultiThreadPoolAllItemsInBounds) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_5d_tile_2d( + threadpool.get(), + CheckBounds5DTile2D, + nullptr, + kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, kParallelize5DTile2DRangeK, kParallelize5DTile2DRangeL, kParallelize5DTile2DRangeM, + kParallelize5DTile2DTileL, kParallelize5DTile2DTileM, + 0 /* flags */); +} + +static void CheckTiling5DTile2D(void*, size_t i, size_t j, size_t k, size_t start_l, size_t start_m, size_t tile_l, size_t tile_m) { + EXPECT_GT(tile_l, 0); + EXPECT_LE(tile_l, kParallelize5DTile2DTileL); + EXPECT_EQ(start_l % kParallelize5DTile2DTileL, 0); + EXPECT_EQ(tile_l, std::min<size_t>(kParallelize5DTile2DTileL, kParallelize5DTile2DRangeL - start_l)); + + EXPECT_GT(tile_m, 0); + EXPECT_LE(tile_m, kParallelize5DTile2DTileM); + EXPECT_EQ(start_m % kParallelize5DTile2DTileM, 0); + EXPECT_EQ(tile_m, std::min<size_t>(kParallelize5DTile2DTileM, kParallelize5DTile2DRangeM - start_m)); +} + +TEST(Parallelize5DTile2D, SingleThreadPoolUniformTiling) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_5d_tile_2d( + threadpool.get(), + CheckTiling5DTile2D, + nullptr, + kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, kParallelize5DTile2DRangeK, kParallelize5DTile2DRangeL, kParallelize5DTile2DRangeM, + kParallelize5DTile2DTileL, kParallelize5DTile2DTileM, + 0 /* flags */); +} + +TEST(Parallelize5DTile2D, MultiThreadPoolUniformTiling) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_5d_tile_2d( + threadpool.get(), + CheckTiling5DTile2D, + nullptr, + kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, kParallelize5DTile2DRangeK, kParallelize5DTile2DRangeL, kParallelize5DTile2DRangeM, + kParallelize5DTile2DTileL, kParallelize5DTile2DTileM, + 0 /* flags */); +} + +static void SetTrue5DTile2D(std::atomic_bool* processed_indicators, size_t i, size_t j, size_t k, size_t start_l, size_t start_m, size_t tile_l, size_t tile_m) { + for (size_t l = start_l; l < start_l + tile_l; l++) { + for (size_t m = start_m; m < start_m + tile_m; m++) { + const size_t linear_idx = (((i * kParallelize5DTile2DRangeJ + j) * kParallelize5DTile2DRangeK + k) * kParallelize5DTile2DRangeL + l) * kParallelize5DTile2DRangeM + m; + processed_indicators[linear_idx].store(true, std::memory_order_relaxed); + } + } +} + +TEST(Parallelize5DTile2D, SingleThreadPoolAllItemsProcessed) { + std::vector<std::atomic_bool> indicators(kParallelize5DTile2DRangeI * kParallelize5DTile2DRangeJ * kParallelize5DTile2DRangeK * kParallelize5DTile2DRangeL * kParallelize5DTile2DRangeM); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_5d_tile_2d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_5d_tile_2d_t>(SetTrue5DTile2D), + static_cast<void*>(indicators.data()), + kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, kParallelize5DTile2DRangeK, kParallelize5DTile2DRangeL, kParallelize5DTile2DRangeM, + kParallelize5DTile2DTileL, kParallelize5DTile2DTileM, + 0 /* flags */); + + for (size_t i = 0; i < kParallelize5DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize5DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize5DTile2DRangeK; k++) { + for (size_t l = 0; l < kParallelize5DTile2DRangeL; l++) { + for (size_t m = 0; m < kParallelize5DTile2DRangeM; m++) { + const size_t linear_idx = (((i * kParallelize5DTile2DRangeJ + j) * kParallelize5DTile2DRangeK + k) * kParallelize5DTile2DRangeL + l) * kParallelize5DTile2DRangeM + m; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") not processed"; + } + } + } + } + } +} + +TEST(Parallelize5DTile2D, MultiThreadPoolAllItemsProcessed) { + std::vector<std::atomic_bool> indicators(kParallelize5DTile2DRangeI * kParallelize5DTile2DRangeJ * kParallelize5DTile2DRangeK * kParallelize5DTile2DRangeL * kParallelize5DTile2DRangeM); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_5d_tile_2d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_5d_tile_2d_t>(SetTrue5DTile2D), + static_cast<void*>(indicators.data()), + kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, kParallelize5DTile2DRangeK, kParallelize5DTile2DRangeL, kParallelize5DTile2DRangeM, + kParallelize5DTile2DTileL, kParallelize5DTile2DTileM, + 0 /* flags */); + + for (size_t i = 0; i < kParallelize5DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize5DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize5DTile2DRangeK; k++) { + for (size_t l = 0; l < kParallelize5DTile2DRangeL; l++) { + for (size_t m = 0; m < kParallelize5DTile2DRangeM; m++) { + const size_t linear_idx = (((i * kParallelize5DTile2DRangeJ + j) * kParallelize5DTile2DRangeK + k) * kParallelize5DTile2DRangeL + l) * kParallelize5DTile2DRangeM + m; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") not processed"; + } + } + } + } + } +} + +static void Increment5DTile2D(std::atomic_int* processed_counters, size_t i, size_t j, size_t k, size_t start_l, size_t start_m, size_t tile_l, size_t tile_m) { + for (size_t l = start_l; l < start_l + tile_l; l++) { + for (size_t m = start_m; m < start_m + tile_m; m++) { + const size_t linear_idx = (((i * kParallelize5DTile2DRangeJ + j) * kParallelize5DTile2DRangeK + k) * kParallelize5DTile2DRangeL + l) * kParallelize5DTile2DRangeM + m; + processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); + } + } +} + +TEST(Parallelize5DTile2D, SingleThreadPoolEachItemProcessedOnce) { + std::vector<std::atomic_int> counters(kParallelize5DTile2DRangeI * kParallelize5DTile2DRangeJ * kParallelize5DTile2DRangeK * kParallelize5DTile2DRangeL * kParallelize5DTile2DRangeM); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_5d_tile_2d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_5d_tile_2d_t>(Increment5DTile2D), + static_cast<void*>(counters.data()), + kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, kParallelize5DTile2DRangeK, kParallelize5DTile2DRangeL, kParallelize5DTile2DRangeM, + kParallelize5DTile2DTileL, kParallelize5DTile2DTileM, + 0 /* flags */); + + for (size_t i = 0; i < kParallelize5DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize5DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize5DTile2DRangeK; k++) { + for (size_t l = 0; l < kParallelize5DTile2DRangeL; l++) { + for (size_t m = 0; m < kParallelize5DTile2DRangeM; m++) { + const size_t linear_idx = (((i * kParallelize5DTile2DRangeJ + j) * kParallelize5DTile2DRangeK + k) * kParallelize5DTile2DRangeL + l) * kParallelize5DTile2DRangeM + m; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; + } + } + } + } + } +} + +TEST(Parallelize5DTile2D, MultiThreadPoolEachItemProcessedOnce) { + std::vector<std::atomic_int> counters(kParallelize5DTile2DRangeI * kParallelize5DTile2DRangeJ * kParallelize5DTile2DRangeK * kParallelize5DTile2DRangeL * kParallelize5DTile2DRangeM); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_5d_tile_2d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_5d_tile_2d_t>(Increment5DTile2D), + static_cast<void*>(counters.data()), + kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, kParallelize5DTile2DRangeK, kParallelize5DTile2DRangeL, kParallelize5DTile2DRangeM, + kParallelize5DTile2DTileL, kParallelize5DTile2DTileM, + 0 /* flags */); + + for (size_t i = 0; i < kParallelize5DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize5DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize5DTile2DRangeK; k++) { + for (size_t l = 0; l < kParallelize5DTile2DRangeL; l++) { + for (size_t m = 0; m < kParallelize5DTile2DRangeM; m++) { + const size_t linear_idx = (((i * kParallelize5DTile2DRangeJ + j) * kParallelize5DTile2DRangeK + k) * kParallelize5DTile2DRangeL + l) * kParallelize5DTile2DRangeM + m; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; + } + } + } + } + } +} + +TEST(Parallelize5DTile2D, SingleThreadPoolEachItemProcessedMultipleTimes) { + std::vector<std::atomic_int> counters(kParallelize5DTile2DRangeI * kParallelize5DTile2DRangeJ * kParallelize5DTile2DRangeK * kParallelize5DTile2DRangeL * kParallelize5DTile2DRangeM); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + for (size_t iteration = 0; iteration < kIncrementIterations5D; iteration++) { + pthreadpool_parallelize_5d_tile_2d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_5d_tile_2d_t>(Increment5DTile2D), + static_cast<void*>(counters.data()), + kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, kParallelize5DTile2DRangeK, kParallelize5DTile2DRangeL, kParallelize5DTile2DRangeM, + kParallelize5DTile2DTileL, kParallelize5DTile2DTileM, + 0 /* flags */); + } + + for (size_t i = 0; i < kParallelize5DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize5DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize5DTile2DRangeK; k++) { + for (size_t l = 0; l < kParallelize5DTile2DRangeL; l++) { + for (size_t m = 0; m < kParallelize5DTile2DRangeM; m++) { + const size_t linear_idx = (((i * kParallelize5DTile2DRangeJ + j) * kParallelize5DTile2DRangeK + k) * kParallelize5DTile2DRangeL + l) * kParallelize5DTile2DRangeM + m; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations5D) + << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations5D << ")"; + } + } + } + } + } +} + +TEST(Parallelize5DTile2D, MultiThreadPoolEachItemProcessedMultipleTimes) { + std::vector<std::atomic_int> counters(kParallelize5DTile2DRangeI * kParallelize5DTile2DRangeJ * kParallelize5DTile2DRangeK * kParallelize5DTile2DRangeL * kParallelize5DTile2DRangeM); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + for (size_t iteration = 0; iteration < kIncrementIterations5D; iteration++) { + pthreadpool_parallelize_5d_tile_2d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_5d_tile_2d_t>(Increment5DTile2D), + static_cast<void*>(counters.data()), + kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, kParallelize5DTile2DRangeK, kParallelize5DTile2DRangeL, kParallelize5DTile2DRangeM, + kParallelize5DTile2DTileL, kParallelize5DTile2DTileM, + 0 /* flags */); + } + + for (size_t i = 0; i < kParallelize5DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize5DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize5DTile2DRangeK; k++) { + for (size_t l = 0; l < kParallelize5DTile2DRangeL; l++) { + for (size_t m = 0; m < kParallelize5DTile2DRangeM; m++) { + const size_t linear_idx = (((i * kParallelize5DTile2DRangeJ + j) * kParallelize5DTile2DRangeK + k) * kParallelize5DTile2DRangeL + l) * kParallelize5DTile2DRangeM + m; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations5D) + << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations5D << ")"; + } + } + } + } + } +} + +static void WorkImbalance5DTile2D(std::atomic_int* num_processed_items, size_t i, size_t j, size_t k, size_t start_l, size_t start_m, size_t tile_l, size_t tile_m) { + num_processed_items->fetch_add(tile_l * tile_m, std::memory_order_relaxed); + if (i == 0 && j == 0 && k == 0 && start_l == 0 && start_m == 0) { + /* Spin-wait until all items are computed */ + while (num_processed_items->load(std::memory_order_relaxed) != kParallelize5DTile2DRangeI * kParallelize5DTile2DRangeJ * kParallelize5DTile2DRangeK * kParallelize5DTile2DRangeL * kParallelize5DTile2DRangeM) { + std::atomic_thread_fence(std::memory_order_acquire); + } + } +} + +TEST(Parallelize5DTile2D, MultiThreadPoolWorkStealing) { + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_5d_tile_2d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_5d_tile_2d_t>(WorkImbalance5DTile2D), + static_cast<void*>(&num_processed_items), + kParallelize5DTile2DRangeI, kParallelize5DTile2DRangeJ, kParallelize5DTile2DRangeK, kParallelize5DTile2DRangeL, kParallelize5DTile2DRangeM, + kParallelize5DTile2DTileL, kParallelize5DTile2DTileM, + 0 /* flags */); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize5DTile2DRangeI * kParallelize5DTile2DRangeJ * kParallelize5DTile2DRangeK * kParallelize5DTile2DRangeL * kParallelize5DTile2DRangeM); +} + +static void ComputeNothing6DTile2D(void*, size_t, size_t, size_t, size_t, size_t, size_t, size_t, size_t) { +} + +TEST(Parallelize6DTile2D, SingleThreadPoolCompletes) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_6d_tile_2d(threadpool.get(), + ComputeNothing6DTile2D, + nullptr, + kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, kParallelize6DTile2DRangeK, kParallelize6DTile2DRangeL, kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN, + kParallelize6DTile2DTileM, kParallelize6DTile2DTileN, + 0 /* flags */); +} + +TEST(Parallelize6DTile2D, MultiThreadPoolCompletes) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_6d_tile_2d( + threadpool.get(), + ComputeNothing6DTile2D, + nullptr, + kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, kParallelize6DTile2DRangeK, kParallelize6DTile2DRangeL, kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN, + kParallelize6DTile2DTileM, kParallelize6DTile2DTileN, + 0 /* flags */); +} + +static void CheckBounds6DTile2D(void*, size_t i, size_t j, size_t k, size_t l, size_t start_m, size_t start_n, size_t tile_m, size_t tile_n) { + EXPECT_LT(i, kParallelize6DTile2DRangeI); + EXPECT_LT(j, kParallelize6DTile2DRangeJ); + EXPECT_LT(k, kParallelize6DTile2DRangeK); + EXPECT_LT(l, kParallelize6DTile2DRangeL); + EXPECT_LT(start_m, kParallelize6DTile2DRangeM); + EXPECT_LT(start_n, kParallelize6DTile2DRangeN); + EXPECT_LE(start_m + tile_m, kParallelize6DTile2DRangeM); + EXPECT_LE(start_n + tile_n, kParallelize6DTile2DRangeN); +} + +TEST(Parallelize6DTile2D, SingleThreadPoolAllItemsInBounds) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_6d_tile_2d( + threadpool.get(), + CheckBounds6DTile2D, + nullptr, + kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, kParallelize6DTile2DRangeK, kParallelize6DTile2DRangeL, kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN, + kParallelize6DTile2DTileM, kParallelize6DTile2DTileN, + 0 /* flags */); +} + +TEST(Parallelize6DTile2D, MultiThreadPoolAllItemsInBounds) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_6d_tile_2d( + threadpool.get(), + CheckBounds6DTile2D, + nullptr, + kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, kParallelize6DTile2DRangeK, kParallelize6DTile2DRangeL, kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN, + kParallelize6DTile2DTileM, kParallelize6DTile2DTileN, + 0 /* flags */); +} + +static void CheckTiling6DTile2D(void*, size_t i, size_t j, size_t k, size_t l, size_t start_m, size_t start_n, size_t tile_m, size_t tile_n) { + EXPECT_GT(tile_m, 0); + EXPECT_LE(tile_m, kParallelize6DTile2DTileM); + EXPECT_EQ(start_m % kParallelize6DTile2DTileM, 0); + EXPECT_EQ(tile_m, std::min<size_t>(kParallelize6DTile2DTileM, kParallelize6DTile2DRangeM - start_m)); + + EXPECT_GT(tile_n, 0); + EXPECT_LE(tile_n, kParallelize6DTile2DTileN); + EXPECT_EQ(start_n % kParallelize6DTile2DTileN, 0); + EXPECT_EQ(tile_n, std::min<size_t>(kParallelize6DTile2DTileN, kParallelize6DTile2DRangeN - start_n)); +} + +TEST(Parallelize6DTile2D, SingleThreadPoolUniformTiling) { + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_6d_tile_2d( + threadpool.get(), + CheckTiling6DTile2D, + nullptr, + kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, kParallelize6DTile2DRangeK, kParallelize6DTile2DRangeL, kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN, + kParallelize6DTile2DTileM, kParallelize6DTile2DTileN, + 0 /* flags */); +} + +TEST(Parallelize6DTile2D, MultiThreadPoolUniformTiling) { + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_6d_tile_2d( + threadpool.get(), + CheckTiling6DTile2D, + nullptr, + kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, kParallelize6DTile2DRangeK, kParallelize6DTile2DRangeL, kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN, + kParallelize6DTile2DTileM, kParallelize6DTile2DTileN, + 0 /* flags */); +} + +static void SetTrue6DTile2D(std::atomic_bool* processed_indicators, size_t i, size_t j, size_t k, size_t l, size_t start_m, size_t start_n, size_t tile_m, size_t tile_n) { + for (size_t m = start_m; m < start_m + tile_m; m++) { + for (size_t n = start_n; n < start_n + tile_n; n++) { + const size_t linear_idx = ((((i * kParallelize6DTile2DRangeJ + j) * kParallelize6DTile2DRangeK + k) * kParallelize6DTile2DRangeL + l) * kParallelize6DTile2DRangeM + m) * kParallelize6DTile2DRangeN + n; + processed_indicators[linear_idx].store(true, std::memory_order_relaxed); + } + } +} + +TEST(Parallelize6DTile2D, SingleThreadPoolAllItemsProcessed) { + std::vector<std::atomic_bool> indicators(kParallelize6DTile2DRangeI * kParallelize6DTile2DRangeJ * kParallelize6DTile2DRangeK * kParallelize6DTile2DRangeL * kParallelize6DTile2DRangeM * kParallelize6DTile2DRangeN); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_6d_tile_2d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_6d_tile_2d_t>(SetTrue6DTile2D), + static_cast<void*>(indicators.data()), + kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, kParallelize6DTile2DRangeK, kParallelize6DTile2DRangeL, kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN, + kParallelize6DTile2DTileM, kParallelize6DTile2DTileN, + 0 /* flags */); + + for (size_t i = 0; i < kParallelize6DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize6DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize6DTile2DRangeK; k++) { + for (size_t l = 0; l < kParallelize6DTile2DRangeL; l++) { + for (size_t m = 0; m < kParallelize6DTile2DRangeM; m++) { + for (size_t n = 0; n < kParallelize6DTile2DRangeN; n++) { + const size_t linear_idx = ((((i * kParallelize6DTile2DRangeJ + j) * kParallelize6DTile2DRangeK + k) * kParallelize6DTile2DRangeL + l) * kParallelize6DTile2DRangeM + m) * kParallelize6DTile2DRangeN + n; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") not processed"; + } + } + } + } + } + } +} + +TEST(Parallelize6DTile2D, MultiThreadPoolAllItemsProcessed) { + std::vector<std::atomic_bool> indicators(kParallelize6DTile2DRangeI * kParallelize6DTile2DRangeJ * kParallelize6DTile2DRangeK * kParallelize6DTile2DRangeL * kParallelize6DTile2DRangeM * kParallelize6DTile2DRangeN); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_6d_tile_2d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_6d_tile_2d_t>(SetTrue6DTile2D), + static_cast<void*>(indicators.data()), + kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, kParallelize6DTile2DRangeK, kParallelize6DTile2DRangeL, kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN, + kParallelize6DTile2DTileM, kParallelize6DTile2DTileN, + 0 /* flags */); + + for (size_t i = 0; i < kParallelize6DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize6DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize6DTile2DRangeK; k++) { + for (size_t l = 0; l < kParallelize6DTile2DRangeL; l++) { + for (size_t m = 0; m < kParallelize6DTile2DRangeM; m++) { + for (size_t n = 0; n < kParallelize6DTile2DRangeN; n++) { + const size_t linear_idx = ((((i * kParallelize6DTile2DRangeJ + j) * kParallelize6DTile2DRangeK + k) * kParallelize6DTile2DRangeL + l) * kParallelize6DTile2DRangeM + m) * kParallelize6DTile2DRangeN + n; + EXPECT_TRUE(indicators[linear_idx].load(std::memory_order_relaxed)) + << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") not processed"; + } + } + } + } + } + } +} + +static void Increment6DTile2D(std::atomic_int* processed_counters, size_t i, size_t j, size_t k, size_t l, size_t start_m, size_t start_n, size_t tile_m, size_t tile_n) { + for (size_t m = start_m; m < start_m + tile_m; m++) { + for (size_t n = start_n; n < start_n + tile_n; n++) { + const size_t linear_idx = ((((i * kParallelize6DTile2DRangeJ + j) * kParallelize6DTile2DRangeK + k) * kParallelize6DTile2DRangeL + l) * kParallelize6DTile2DRangeM + m) * kParallelize6DTile2DRangeN + n; + processed_counters[linear_idx].fetch_add(1, std::memory_order_relaxed); + } + } +} + +TEST(Parallelize6DTile2D, SingleThreadPoolEachItemProcessedOnce) { + std::vector<std::atomic_int> counters(kParallelize6DTile2DRangeI * kParallelize6DTile2DRangeJ * kParallelize6DTile2DRangeK * kParallelize6DTile2DRangeL * kParallelize6DTile2DRangeM * kParallelize6DTile2DRangeN); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + pthreadpool_parallelize_6d_tile_2d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_6d_tile_2d_t>(Increment6DTile2D), + static_cast<void*>(counters.data()), + kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, kParallelize6DTile2DRangeK, kParallelize6DTile2DRangeL, kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN, + kParallelize6DTile2DTileM, kParallelize6DTile2DTileN, + 0 /* flags */); + + for (size_t i = 0; i < kParallelize6DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize6DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize6DTile2DRangeK; k++) { + for (size_t l = 0; l < kParallelize6DTile2DRangeL; l++) { + for (size_t m = 0; m < kParallelize6DTile2DRangeM; m++) { + for (size_t n = 0; n < kParallelize6DTile2DRangeN; n++) { + const size_t linear_idx = ((((i * kParallelize6DTile2DRangeJ + j) * kParallelize6DTile2DRangeK + k) * kParallelize6DTile2DRangeL + l) * kParallelize6DTile2DRangeM + m) * kParallelize6DTile2DRangeN + n; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; + } + } + } + } + } + } +} + +TEST(Parallelize6DTile2D, MultiThreadPoolEachItemProcessedOnce) { + std::vector<std::atomic_int> counters(kParallelize6DTile2DRangeI * kParallelize6DTile2DRangeJ * kParallelize6DTile2DRangeK * kParallelize6DTile2DRangeL * kParallelize6DTile2DRangeM * kParallelize6DTile2DRangeN); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_6d_tile_2d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_6d_tile_2d_t>(Increment6DTile2D), + static_cast<void*>(counters.data()), + kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, kParallelize6DTile2DRangeK, kParallelize6DTile2DRangeL, kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN, + kParallelize6DTile2DTileM, kParallelize6DTile2DTileN, + 0 /* flags */); + + for (size_t i = 0; i < kParallelize6DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize6DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize6DTile2DRangeK; k++) { + for (size_t l = 0; l < kParallelize6DTile2DRangeL; l++) { + for (size_t m = 0; m < kParallelize6DTile2DRangeM; m++) { + for (size_t n = 0; n < kParallelize6DTile2DRangeN; n++) { + const size_t linear_idx = ((((i * kParallelize6DTile2DRangeJ + j) * kParallelize6DTile2DRangeK + k) * kParallelize6DTile2DRangeL + l) * kParallelize6DTile2DRangeM + m) * kParallelize6DTile2DRangeN + n; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), 1) + << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times (expected: 1)"; + } + } + } + } + } + } +} + +TEST(Parallelize6DTile2D, SingleThreadPoolEachItemProcessedMultipleTimes) { + std::vector<std::atomic_int> counters(kParallelize6DTile2DRangeI * kParallelize6DTile2DRangeJ * kParallelize6DTile2DRangeK * kParallelize6DTile2DRangeL * kParallelize6DTile2DRangeM * kParallelize6DTile2DRangeN); + + auto_pthreadpool_t threadpool(pthreadpool_create(1), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + for (size_t iteration = 0; iteration < kIncrementIterations6D; iteration++) { + pthreadpool_parallelize_6d_tile_2d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_6d_tile_2d_t>(Increment6DTile2D), + static_cast<void*>(counters.data()), + kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, kParallelize6DTile2DRangeK, kParallelize6DTile2DRangeL, kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN, + kParallelize6DTile2DTileM, kParallelize6DTile2DTileN, + 0 /* flags */); + } + + for (size_t i = 0; i < kParallelize6DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize6DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize6DTile2DRangeK; k++) { + for (size_t l = 0; l < kParallelize6DTile2DRangeL; l++) { + for (size_t m = 0; m < kParallelize6DTile2DRangeM; m++) { + for (size_t n = 0; n < kParallelize6DTile2DRangeN; n++) { + const size_t linear_idx = ((((i * kParallelize6DTile2DRangeJ + j) * kParallelize6DTile2DRangeK + k) * kParallelize6DTile2DRangeL + l) * kParallelize6DTile2DRangeM + m) * kParallelize6DTile2DRangeN + n; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations6D) + << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations6D << ")"; + } + } + } + } + } + } +} + +TEST(Parallelize6DTile2D, MultiThreadPoolEachItemProcessedMultipleTimes) { + std::vector<std::atomic_int> counters(kParallelize6DTile2DRangeI * kParallelize6DTile2DRangeJ * kParallelize6DTile2DRangeK * kParallelize6DTile2DRangeL * kParallelize6DTile2DRangeM * kParallelize6DTile2DRangeN); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + for (size_t iteration = 0; iteration < kIncrementIterations6D; iteration++) { + pthreadpool_parallelize_6d_tile_2d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_6d_tile_2d_t>(Increment6DTile2D), + static_cast<void*>(counters.data()), + kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, kParallelize6DTile2DRangeK, kParallelize6DTile2DRangeL, kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN, + kParallelize6DTile2DTileM, kParallelize6DTile2DTileN, + 0 /* flags */); + } + + for (size_t i = 0; i < kParallelize6DTile2DRangeI; i++) { + for (size_t j = 0; j < kParallelize6DTile2DRangeJ; j++) { + for (size_t k = 0; k < kParallelize6DTile2DRangeK; k++) { + for (size_t l = 0; l < kParallelize6DTile2DRangeL; l++) { + for (size_t m = 0; m < kParallelize6DTile2DRangeM; m++) { + for (size_t n = 0; n < kParallelize6DTile2DRangeN; n++) { + const size_t linear_idx = ((((i * kParallelize6DTile2DRangeJ + j) * kParallelize6DTile2DRangeK + k) * kParallelize6DTile2DRangeL + l) * kParallelize6DTile2DRangeM + m) * kParallelize6DTile2DRangeN + n; + EXPECT_EQ(counters[linear_idx].load(std::memory_order_relaxed), kIncrementIterations6D) + << "Element (" << i << ", " << j << ", " << k << ", " << l << ", " << m << ", " << n << ") was processed " + << counters[linear_idx].load(std::memory_order_relaxed) << " times " + << "(expected: " << kIncrementIterations6D << ")"; + } + } + } + } + } + } +} + +static void WorkImbalance6DTile2D(std::atomic_int* num_processed_items, size_t i, size_t j, size_t k, size_t l, size_t start_m, size_t start_n, size_t tile_m, size_t tile_n) { + num_processed_items->fetch_add(tile_m * tile_n, std::memory_order_relaxed); + if (i == 0 && j == 0 && k == 0 && l == 0 && start_m == 0 && start_n == 0) { + /* Spin-wait until all items are computed */ + while (num_processed_items->load(std::memory_order_relaxed) != kParallelize6DTile2DRangeI * kParallelize6DTile2DRangeJ * kParallelize6DTile2DRangeK * kParallelize6DTile2DRangeL * kParallelize6DTile2DRangeM * kParallelize6DTile2DRangeN) { + std::atomic_thread_fence(std::memory_order_acquire); + } + } +} + +TEST(Parallelize6DTile2D, MultiThreadPoolWorkStealing) { + std::atomic_int num_processed_items = ATOMIC_VAR_INIT(0); + + auto_pthreadpool_t threadpool(pthreadpool_create(0), pthreadpool_destroy); + ASSERT_TRUE(threadpool.get()); + + if (pthreadpool_get_threads_count(threadpool.get()) <= 1) { + GTEST_SKIP(); + } + + pthreadpool_parallelize_6d_tile_2d( + threadpool.get(), + reinterpret_cast<pthreadpool_task_6d_tile_2d_t>(WorkImbalance6DTile2D), + static_cast<void*>(&num_processed_items), + kParallelize6DTile2DRangeI, kParallelize6DTile2DRangeJ, kParallelize6DTile2DRangeK, kParallelize6DTile2DRangeL, kParallelize6DTile2DRangeM, kParallelize6DTile2DRangeN, + kParallelize6DTile2DTileM, kParallelize6DTile2DTileN, + 0 /* flags */); + EXPECT_EQ(num_processed_items.load(std::memory_order_relaxed), kParallelize6DTile2DRangeI * kParallelize6DTile2DRangeJ * kParallelize6DTile2DRangeK * kParallelize6DTile2DRangeL * kParallelize6DTile2DRangeM * kParallelize6DTile2DRangeN); } |