Support 3D and 4D tiled computation

author: Marat Dukhan <marat@fb.com> 2018-10-08 00:19:36 -0700
committer: Marat Dukhan <marat@fb.com> 2018-10-08 00:19:36 -0700
commit: 13da0b4c21d17f94150713366420baaf1b5a46f4 (patch)
tree: 164be7551021064094501d005ab712fc9b4bef11
parent: 3fb19c58b46f3cbc78a27c7b207a6eb7946633c0 (diff)
download: pthreadpool-13da0b4c21d17f94150713366420baaf1b5a46f4.tar.gz
3 files changed, 235 insertions, 1 deletions
diff --git a/include/pthreadpool.h b/include/pthreadpool.h
index fa2bb1e..a99105e 100644
--- a/include/pthreadpool.h
+++ b/include/pthreadpool.h
@@ -9,7 +9,8 @@ typedef void (*pthreadpool_function_1d_t)(void*, size_t);
 typedef void (*pthreadpool_function_1d_tiled_t)(void*, size_t, size_t);
 typedef void (*pthreadpool_function_2d_t)(void*, size_t, size_t);
 typedef void (*pthreadpool_function_2d_tiled_t)(void*, size_t, size_t, size_t, size_t);
-typedef void (*pthreadpool_function_3d_t)(void*, size_t, size_t, size_t);
+typedef void (*pthreadpool_function_3d_tiled_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t); /* called as (argument, index_i, index_j, index_k, tile_i, tile_j, tile_k) */
+typedef void (*pthreadpool_function_4d_tiled_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t, size_t, size_t); /* called as (argument, index_i, index_j, index_k, index_l, tile_i, tile_j, tile_k, tile_l) */
 
 #ifdef __cplusplus
 extern "C" {
@@ -81,6 +82,30 @@ void pthreadpool_compute_2d_tiled(
 	size_t tile_i,
 	size_t tile_j);
 
+void pthreadpool_compute_3d_tiled( /* Calls function once per tile of a 3D range; each call receives the tile's start indices and its extents */
+	pthreadpool_t threadpool, /* when NULL, the pthreads implementation runs every tile on the calling thread */
+	pthreadpool_function_3d_tiled_t function, /* callback invoked for each tile */
+	void* argument, /* forwarded unchanged as the callback's first argument */
+	size_t range_i, /* number of elements along i */
+	size_t range_j, /* number of elements along j */
+	size_t range_k, /* number of elements along k */
+	size_t tile_i, /* maximum tile extent along i; used as a loop step, so presumably must be non-zero - confirm */
+	size_t tile_j, /* maximum tile extent along j */
+	size_t tile_k); /* maximum tile extent along k */
+
+void pthreadpool_compute_4d_tiled( /* Calls function once per tile of a 4D range; each call receives the tile's start indices and its extents */
+	pthreadpool_t threadpool, /* when NULL, the pthreads implementation runs every tile on the calling thread */
+	pthreadpool_function_4d_tiled_t function, /* callback invoked for each tile */
+	void* argument, /* forwarded unchanged as the callback's first argument */
+	size_t range_i, /* number of elements along i */
+	size_t range_j, /* number of elements along j */
+	size_t range_k, /* number of elements along k */
+	size_t range_l, /* number of elements along l */
+	size_t tile_i, /* maximum tile extent along i; used as a loop step, so presumably must be non-zero - confirm */
+	size_t tile_j, /* maximum tile extent along j */
+	size_t tile_k, /* maximum tile extent along k */
+	size_t tile_l); /* maximum tile extent along l */
+
 /**
  * Terminates threads in the thread pool and releases associated resources.
  *
diff --git a/src/threadpool-pthreads.c b/src/threadpool-pthreads.c
index ff0817e..4905819 100644
--- a/src/threadpool-pthreads.c
+++ b/src/threadpool-pthreads.c
@@ -592,6 +592,169 @@ void pthreadpool_compute_2d_tiled(
 	}
 }
 
+struct compute_3d_tiled_context { /* State read by compute_3d_tiled to turn a linear tile index into one callback invocation */
+	pthreadpool_function_3d_tiled_t function; /* user callback invoked once per tile */
+	void* argument; /* passed through as the callback's first argument */
+	struct fxdiv_divisor_size_t tile_range_j; /* divisor initialized from the tile count along j */
+	struct fxdiv_divisor_size_t tile_range_k; /* divisor initialized from the tile count along k */
+	size_t range_i; /* full extents; used to shorten the last tile along each axis */
+	size_t range_j;
+	size_t range_k;
+	size_t tile_i; /* maximum tile extents requested by the caller */
+	size_t tile_j;
+	size_t tile_k;
+};
+
+static void compute_3d_tiled(const struct compute_3d_tiled_context* context, size_t linear_index) { /* Decodes linear_index into one (i, j, k) tile and invokes the user callback on it */
+	const struct fxdiv_divisor_size_t tile_range_k = context->tile_range_k;
+	const struct fxdiv_result_size_t tile_index_ij_k = fxdiv_divide_size_t(linear_index, tile_range_k); /* remainder is used as the k tile index; quotient still combines i and j */
+	const struct fxdiv_divisor_size_t tile_range_j = context->tile_range_j;
+	const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_k.quotient, tile_range_j); /* quotient is used as the i tile index, remainder as the j tile index */
+	const size_t max_tile_i = context->tile_i;
+	const size_t max_tile_j = context->tile_j;
+	const size_t max_tile_k = context->tile_k;
+	const size_t index_i = tile_index_i_j.quotient * max_tile_i; /* tile index scaled to the tile's first element index */
+	const size_t index_j = tile_index_i_j.remainder * max_tile_j;
+	const size_t index_k = tile_index_ij_k.remainder * max_tile_k;
+	const size_t tile_i = min(max_tile_i, context->range_i - index_i); /* presumably min() picks the smaller value, so a trailing tile is shortened to fit the range - confirm */
+	const size_t tile_j = min(max_tile_j, context->range_j - index_j);
+	const size_t tile_k = min(max_tile_k, context->range_k - index_k);
+	context->function(context->argument, index_i, index_j, index_k, tile_i, tile_j, tile_k);
+}
+
+void pthreadpool_compute_3d_tiled( /* Invokes function once per tile of the range_i x range_j x range_k iteration space */
+	pthreadpool_t threadpool,
+	pthreadpool_function_3d_tiled_t function,
+	void* argument,
+	size_t range_i,
+	size_t range_j,
+	size_t range_k,
+	size_t tile_i,
+	size_t tile_j,
+	size_t tile_k)
+{
+	if (threadpool == NULL) {
+		/* No thread pool provided: execute function sequentially on the calling thread */
+		for (size_t i = 0; i < range_i; i += tile_i) { /* NOTE(review): a zero tile size never advances the index; presumably callers must pass non-zero tiles - confirm */
+			for (size_t j = 0; j < range_j; j += tile_j) {
+				for (size_t k = 0; k < range_k; k += tile_k) {
+					function(argument, i, j, k, min(range_i - i, tile_i), min(range_j - j, tile_j), min(range_k - k, tile_k)); /* extents passed are min(remaining elements, tile size) */
+				}
+			}
+		}
+	} else {
+		/* Execute in parallel on the thread pool using linearized index */
+		const size_t tile_range_i = divide_round_up(range_i, tile_i); /* presumably the tile count along i (ceiling division) - confirm against divide_round_up */
+		const size_t tile_range_j = divide_round_up(range_j, tile_j);
+		const size_t tile_range_k = divide_round_up(range_k, tile_k);
+		struct compute_3d_tiled_context context = {
+			.function = function,
+			.argument = argument,
+			.tile_range_j = fxdiv_init_size_t(tile_range_j), /* divisors are prepared once here and reused by compute_3d_tiled for every index */
+			.tile_range_k = fxdiv_init_size_t(tile_range_k),
+			.range_i = range_i,
+			.range_j = range_j,
+			.range_k = range_k,
+			.tile_i = tile_i,
+			.tile_j = tile_j,
+			.tile_k = tile_k
+		};
+		pthreadpool_compute_1d(threadpool,
+			(pthreadpool_function_1d_t) compute_3d_tiled, &context, /* cast because compute_3d_tiled takes a typed context pointer instead of void*; context is a local passed by address */
+			tile_range_i * tile_range_j * tile_range_k); /* 1D range is the total tile count; NOTE(review): product is not checked for size_t overflow */
+	}
+}
+
+struct compute_4d_tiled_context { /* State read by compute_4d_tiled to turn a linear tile index into one callback invocation */
+	pthreadpool_function_4d_tiled_t function; /* user callback invoked once per tile */
+	void* argument; /* passed through as the callback's first argument */
+	struct fxdiv_divisor_size_t tile_range_kl; /* divisor initialized from (tile count along k) * (tile count along l) */
+	struct fxdiv_divisor_size_t tile_range_j; /* divisor initialized from the tile count along j */
+	struct fxdiv_divisor_size_t tile_range_l; /* divisor initialized from the tile count along l */
+	size_t range_i; /* full extents; used to shorten the last tile along each axis */
+	size_t range_j;
+	size_t range_k;
+	size_t range_l;
+	size_t tile_i; /* maximum tile extents requested by the caller */
+	size_t tile_j;
+	size_t tile_k;
+	size_t tile_l;
+};
+
+static void compute_4d_tiled(const struct compute_4d_tiled_context* context, size_t linear_index) { /* Decodes linear_index into one (i, j, k, l) tile and invokes the user callback on it */
+	const struct fxdiv_divisor_size_t tile_range_kl = context->tile_range_kl;
+	const struct fxdiv_result_size_t tile_index_ij_kl = fxdiv_divide_size_t(linear_index, tile_range_kl); /* quotient combines i and j, remainder combines k and l */
+	const struct fxdiv_divisor_size_t tile_range_j = context->tile_range_j;
+	const struct fxdiv_result_size_t tile_index_i_j = fxdiv_divide_size_t(tile_index_ij_kl.quotient, tile_range_j); /* quotient is used as the i tile index, remainder as the j tile index */
+	const struct fxdiv_divisor_size_t tile_range_l = context->tile_range_l;
+	const struct fxdiv_result_size_t tile_index_k_l = fxdiv_divide_size_t(tile_index_ij_kl.remainder, tile_range_l); /* quotient is used as the k tile index, remainder as the l tile index */
+	const size_t max_tile_i = context->tile_i;
+	const size_t max_tile_j = context->tile_j;
+	const size_t max_tile_k = context->tile_k;
+	const size_t max_tile_l = context->tile_l;
+	const size_t index_i = tile_index_i_j.quotient * max_tile_i; /* tile index scaled to the tile's first element index */
+	const size_t index_j = tile_index_i_j.remainder * max_tile_j;
+	const size_t index_k = tile_index_k_l.quotient * max_tile_k;
+	const size_t index_l = tile_index_k_l.remainder * max_tile_l;
+	const size_t tile_i = min(max_tile_i, context->range_i - index_i); /* presumably min() picks the smaller value, so a trailing tile is shortened to fit the range - confirm */
+	const size_t tile_j = min(max_tile_j, context->range_j - index_j);
+	const size_t tile_k = min(max_tile_k, context->range_k - index_k);
+	const size_t tile_l = min(max_tile_l, context->range_l - index_l);
+	context->function(context->argument, index_i, index_j, index_k, index_l, tile_i, tile_j, tile_k, tile_l);
+}
+
+void pthreadpool_compute_4d_tiled( /* Invokes function once per tile of the range_i x range_j x range_k x range_l iteration space */
+	pthreadpool_t threadpool,
+	pthreadpool_function_4d_tiled_t function,
+	void* argument,
+	size_t range_i,
+	size_t range_j,
+	size_t range_k,
+	size_t range_l,
+	size_t tile_i,
+	size_t tile_j,
+	size_t tile_k,
+	size_t tile_l)
+{
+	if (threadpool == NULL) {
+		/* No thread pool provided: execute function sequentially on the calling thread */
+		for (size_t i = 0; i < range_i; i += tile_i) { /* NOTE(review): a zero tile size never advances the index; presumably callers must pass non-zero tiles - confirm */
+			for (size_t j = 0; j < range_j; j += tile_j) {
+				for (size_t k = 0; k < range_k; k += tile_k) {
+					for (size_t l = 0; l < range_l; l += tile_l) {
+						function(argument, i, j, k, l,
+							min(range_i - i, tile_i), min(range_j - j, tile_j), min(range_k - k, tile_k), min(range_l - l, tile_l)); /* extents passed are min(remaining elements, tile size) */
+					}
+				}
+			}
+		}
+	} else {
+		/* Execute in parallel on the thread pool using linearized index */
+		const size_t tile_range_i = divide_round_up(range_i, tile_i); /* presumably the tile count along i (ceiling division) - confirm against divide_round_up */
+		const size_t tile_range_j = divide_round_up(range_j, tile_j);
+		const size_t tile_range_k = divide_round_up(range_k, tile_k);
+		const size_t tile_range_l = divide_round_up(range_l, tile_l);
+		struct compute_4d_tiled_context context = {
+			.function = function,
+			.argument = argument,
+			.tile_range_kl = fxdiv_init_size_t(tile_range_k * tile_range_l), /* combined k-l tile count; compute_4d_tiled divides by it first */
+			.tile_range_j = fxdiv_init_size_t(tile_range_j),
+			.tile_range_l = fxdiv_init_size_t(tile_range_l),
+			.range_i = range_i,
+			.range_j = range_j,
+			.range_k = range_k,
+			.range_l = range_l,
+			.tile_i = tile_i,
+			.tile_j = tile_j,
+			.tile_k = tile_k,
+			.tile_l = tile_l
+		};
+		pthreadpool_compute_1d(threadpool,
+			(pthreadpool_function_1d_t) compute_4d_tiled, &context, /* cast because compute_4d_tiled takes a typed context pointer instead of void*; context is a local passed by address */
+			tile_range_i * tile_range_j * tile_range_k * tile_range_l); /* 1D range is the total tile count; NOTE(review): product is not checked for size_t overflow */
+	}
+}
+
 void pthreadpool_destroy(struct pthreadpool* threadpool) {
 	if (threadpool != NULL) {
 		#if PTHREADPOOL_USE_FUTEX
diff --git a/src/threadpool-shim.c b/src/threadpool-shim.c
index 6a9262f..d4d8498 100644
--- a/src/threadpool-shim.c
+++ b/src/threadpool-shim.c
@@ -69,5 +69,51 @@ void pthreadpool_compute_2d_tiled(
 	}
 }
 
+void pthreadpool_compute_3d_tiled( /* Shim version: visits every tile of the 3D range with plain nested loops */
+	pthreadpool_t threadpool, /* not referenced in this implementation */
+	pthreadpool_function_3d_tiled_t function,
+	void* argument,
+	size_t range_i,
+	size_t range_j,
+	size_t range_k,
+	size_t tile_i,
+	size_t tile_j,
+	size_t tile_k)
+{
+	for (size_t i = 0; i < range_i; i += tile_i) { /* NOTE(review): a zero tile size never advances the index; presumably callers must pass non-zero tiles - confirm */
+		for (size_t j = 0; j < range_j; j += tile_j) {
+			for (size_t k = 0; k < range_k; k += tile_k) { /* k is the innermost (fastest-changing) index */
+				function(argument, i, j, k,
+					min(range_i - i, tile_i), min(range_j - j, tile_j), min(range_k - k, tile_k)); /* extents passed are min(remaining elements, tile size) */
+			}
+		}
+	}
+}
+
+void pthreadpool_compute_4d_tiled( /* Shim version: visits every tile of the 4D range with plain nested loops */
+	pthreadpool_t threadpool, /* not referenced in this implementation */
+	pthreadpool_function_4d_tiled_t function,
+	void* argument,
+	size_t range_i,
+	size_t range_j,
+	size_t range_k,
+	size_t range_l,
+	size_t tile_i,
+	size_t tile_j,
+	size_t tile_k,
+	size_t tile_l)
+{
+	for (size_t i = 0; i < range_i; i += tile_i) { /* NOTE(review): a zero tile size never advances the index; presumably callers must pass non-zero tiles - confirm */
+		for (size_t j = 0; j < range_j; j += tile_j) {
+			for (size_t k = 0; k < range_k; k += tile_k) {
+				for (size_t l = 0; l < range_l; l += tile_l) { /* l is the innermost (fastest-changing) index */
+					function(argument, i, j, k, l,
+						min(range_i - i, tile_i), min(range_j - j, tile_j), min(range_k - k, tile_k), min(range_l - l, tile_l)); /* extents passed are min(remaining elements, tile size) */
+				}
+			}
+		}
+	}
+}
+
 void pthreadpool_destroy(struct pthreadpool* threadpool) {
 }
author	Marat Dukhan <marat@fb.com>	2018-10-08 00:19:36 -0700
committer	Marat Dukhan <marat@fb.com>	2018-10-08 00:19:36 -0700
commit	13da0b4c21d17f94150713366420baaf1b5a46f4 (patch)
tree	164be7551021064094501d005ab712fc9b4bef11
parent	3fb19c58b46f3cbc78a27c7b207a6eb7946633c0 (diff)
download	pthreadpool-13da0b4c21d17f94150713366420baaf1b5a46f4.tar.gz