aboutsummaryrefslogtreecommitdiff
path: root/unsupported/test/cxx11_tensor_executor.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'unsupported/test/cxx11_tensor_executor.cpp')
-rw-r--r--unsupported/test/cxx11_tensor_executor.cpp731
1 files changed, 731 insertions, 0 deletions
diff --git a/unsupported/test/cxx11_tensor_executor.cpp b/unsupported/test/cxx11_tensor_executor.cpp
new file mode 100644
index 000000000..66b06e8ee
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_executor.cpp
@@ -0,0 +1,731 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2018 Eugene Zhulenev <ezhulenev@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_USE_THREADS
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+using Eigen::ColMajor;
+using Eigen::internal::TiledEvaluation;
+
+// A set of tests to verify that different TensorExecutor strategies yield the
+// same results for all the ops that support tiled evaluation.
+
+// Reference assignment path: evaluates `expr` into `dst` on the default device
+// with vectorization and tiled (block) evaluation both switched off.
+// Plain coefficient evaluation is assumed to be well tested and correct, so
+// its output is used as the golden result by the tests below.
+template <typename Dst, typename Expr>
+static void DefaultAssign(Dst& dst, Expr expr) {
+  typedef Eigen::TensorAssignOp<Dst, const Expr> AssignOp;
+  typedef Eigen::internal::TensorExecutor<const AssignOp, DefaultDevice,
+                                          /*Vectorizable=*/false,
+                                          /*Tiling=*/TiledEvaluation::Off>
+      ReferenceExecutor;
+
+  ReferenceExecutor::run(AssignOp(dst, expr), DefaultDevice());
+}
+
+// Evaluates `expr` into `dst` on device `d`, using the executor selected by
+// the explicitly supplied `Vectorizable` and `Tiling` template arguments.
+template <bool Vectorizable, TiledEvaluation Tiling, typename Device,
+          typename Dst, typename Expr>
+static void DeviceAssign(Device& d, Dst& dst, Expr expr) {
+  typedef Eigen::TensorAssignOp<Dst, const Expr> AssignOp;
+  typedef Eigen::internal::TensorExecutor<const AssignOp, Device, Vectorizable,
+                                          Tiling>
+      DeviceExecutor;
+
+  DeviceExecutor::run(AssignOp(dst, expr), d);
+}
+
+// Builds an array of NumDims dimension sizes, each drawn independently with
+// internal::random<int>(min_dim, max_dim).
+template <int NumDims>
+static array<Index, NumDims> RandomDims(int min_dim = 1, int max_dim = 20) {
+  array<Index, NumDims> result;
+  int axis = 0;
+  while (axis < NumDims) {
+    result[axis] = internal::random<int>(min_dim, max_dim);
+    ++axis;
+  }
+  return result;
+}
+
+// Evaluates `dst = src.square()` with the executor selected by the template
+// arguments and checks every coefficient against a directly computed square.
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+          TiledEvaluation Tiling, int Layout>
+static void test_execute_unary_expr(Device d)
+{
+  static constexpr int Options = 0 | Layout;
+  typedef Tensor<T, NumDims, Options, Index> TensorType;
+
+  // The tensor must be large enough to bypass the small tensor block
+  // evaluation optimization.
+  auto dims = RandomDims<NumDims>(50 / NumDims, 100 / NumDims);
+
+  TensorType src(dims);
+  TensorType dst(dims);
+  src.setRandom();
+
+  const auto expr = src.square();
+
+  typedef TensorAssignOp<TensorType, const decltype(expr)> AssignOp;
+  typedef internal::TensorExecutor<const AssignOp, Device, Vectorizable, Tiling>
+      ExecutorType;
+
+  ExecutorType::run(AssignOp(dst, expr), d);
+
+  const Index num_coeffs = dst.dimensions().TotalSize();
+  for (Index idx = 0; idx < num_coeffs; ++idx) {
+    const T expected = src.coeff(idx) * src.coeff(idx);
+    VERIFY_IS_EQUAL(expected, dst.coeff(idx));
+  }
+}
+
+// Evaluates `dst = lhs + rhs` with the executor selected by the template
+// arguments and checks every coefficient against a directly computed sum.
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+          TiledEvaluation Tiling, int Layout>
+static void test_execute_binary_expr(Device d)
+{
+  static constexpr int Options = 0 | Layout;
+  typedef Tensor<T, NumDims, Options, Index> TensorType;
+
+  // The tensors must be large enough to bypass the small tensor block
+  // evaluation optimization.
+  auto dims = RandomDims<NumDims>(50 / NumDims, 100 / NumDims);
+
+  TensorType lhs(dims);
+  TensorType rhs(dims);
+  TensorType dst(dims);
+
+  lhs.setRandom();
+  rhs.setRandom();
+
+  const auto expr = lhs + rhs;
+
+  typedef TensorAssignOp<TensorType, const decltype(expr)> AssignOp;
+  typedef internal::TensorExecutor<const AssignOp, Device, Vectorizable, Tiling>
+      ExecutorType;
+
+  ExecutorType::run(AssignOp(dst, expr), d);
+
+  const Index num_coeffs = dst.dimensions().TotalSize();
+  for (Index idx = 0; idx < num_coeffs; ++idx) {
+    const T expected = lhs.coeff(idx) + rhs.coeff(idx);
+    VERIFY_IS_EQUAL(expected, dst.coeff(idx));
+  }
+}
+
+// Checks that `src.broadcast(...)` evaluated with the configured executor
+// matches the result of a plain tensor assignment of the same expression.
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+          TiledEvaluation Tiling, int Layout>
+static void test_execute_broadcasting(Device d)
+{
+  static constexpr int Options = 0 | Layout;
+  typedef Tensor<T, NumDims, Options, Index> TensorType;
+
+  auto dims = RandomDims<NumDims>(1, 10);
+  TensorType src(dims);
+  src.setRandom();
+
+  const auto broadcasts = RandomDims<NumDims>(1, 7);
+  const auto expr = src.broadcast(broadcasts);
+
+  // Broadcasting through plain assignment is assumed to be tested and
+  // correct, so it provides the reference for the executor under test.
+  TensorType golden;
+  golden = expr;
+
+  // Same expression, this time through the configured tensor executor.
+  TensorType dst(golden.dimensions());
+  DeviceAssign<Vectorizable, Tiling>(d, dst, expr);
+
+  const Index num_coeffs = dst.dimensions().TotalSize();
+  for (Index idx = 0; idx < num_coeffs; ++idx) {
+    VERIFY_IS_EQUAL(dst.coeff(idx), golden.coeff(idx));
+  }
+}
+
+// Checks that reading a chip of `src` (`src.chip<DIM>(offset)`) through the
+// configured executor matches plain assignment of the same expression
+// (`golden = expr`), for each chip dimension smaller than NumDims.
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+ TiledEvaluation Tiling, int Layout>
+static void test_execute_chipping_rvalue(Device d)
+{
+ auto dims = RandomDims<NumDims>(1, 10);
+ Tensor<T, NumDims, Layout, Index> src(dims);
+ src.setRandom();
+
+// Runs the check for a single chip dimension at a random offset. The guard is
+// an ordinary runtime `if`, so the body is still compiled for every CHIP_DIM
+// the macro is expanded with.
+#define TEST_CHIPPING(CHIP_DIM) \
+ if (NumDims > (CHIP_DIM)) { \
+ const auto offset = internal::random<Index>(0, dims[(CHIP_DIM)] - 1); \
+ const auto expr = src.template chip<(CHIP_DIM)>(offset); \
+ \
+ Tensor<T, NumDims - 1, Layout, Index> golden; \
+ golden = expr; \
+ \
+ Tensor<T, NumDims - 1, Layout, Index> dst(golden.dimensions()); \
+ \
+ using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>; \
+ using Executor = internal::TensorExecutor<const Assign, Device, \
+ Vectorizable, Tiling>; \
+ \
+ Executor::run(Assign(dst, expr), d); \
+ \
+ for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { \
+ VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); \
+ } \
+ }
+
+ // Expansions whose CHIP_DIM is not below NumDims do nothing at runtime.
+ TEST_CHIPPING(0)
+ TEST_CHIPPING(1)
+ TEST_CHIPPING(2)
+ TEST_CHIPPING(3)
+ TEST_CHIPPING(4)
+ TEST_CHIPPING(5)
+
+#undef TEST_CHIPPING
+}
+
+// Checks that assigning into a chip of `dst` (`dst.chip<DIM>(offset) = src`)
+// through the configured executor produces the same full tensor as the plain
+// assignment `golden.chip<DIM>(offset) = src`, for each chip dimension smaller
+// than NumDims.
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+ TiledEvaluation Tiling, int Layout>
+static void test_execute_chipping_lvalue(Device d)
+{
+ auto dims = RandomDims<NumDims>(1, 10);
+
+// Runs the check for a single chip dimension at a random offset. `src_dims` is
+// `dims` with entry CHIP_DIM removed. NOTE: despite its name, `random` is
+// zero-filled (setZero), so `golden` and `dst` both start out as all zeros.
+#define TEST_CHIPPING(CHIP_DIM) \
+ if (NumDims > (CHIP_DIM)) { \
+ /* Generate random data that we'll assign to the chipped tensor dim. */ \
+ array<Index, NumDims - 1> src_dims; \
+ for (int i = 0; i < NumDims - 1; ++i) { \
+ int dim = i < (CHIP_DIM) ? i : i + 1; \
+ src_dims[i] = dims[dim]; \
+ } \
+ \
+ Tensor<T, NumDims - 1, Layout, Index> src(src_dims); \
+ src.setRandom(); \
+ \
+ const auto offset = internal::random<Index>(0, dims[(CHIP_DIM)] - 1); \
+ \
+ Tensor<T, NumDims, Layout, Index> random(dims); \
+ random.setZero(); \
+ \
+ Tensor<T, NumDims, Layout, Index> golden(dims); \
+ golden = random; \
+ golden.template chip<(CHIP_DIM)>(offset) = src; \
+ \
+ Tensor<T, NumDims, Layout, Index> dst(dims); \
+ dst = random; \
+ auto expr = dst.template chip<(CHIP_DIM)>(offset); \
+ \
+ using Assign = TensorAssignOp<decltype(expr), const decltype(src)>; \
+ using Executor = internal::TensorExecutor<const Assign, Device, \
+ Vectorizable, Tiling>; \
+ \
+ Executor::run(Assign(expr, src), d); \
+ \
+ for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { \
+ VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); \
+ } \
+ }
+
+ // Expansions whose CHIP_DIM is not below NumDims do nothing at runtime.
+ TEST_CHIPPING(0)
+ TEST_CHIPPING(1)
+ TEST_CHIPPING(2)
+ TEST_CHIPPING(3)
+ TEST_CHIPPING(4)
+ TEST_CHIPPING(5)
+
+#undef TEST_CHIPPING
+}
+
+// Checks that `src.shuffle(perm)` evaluated with the configured executor
+// matches DefaultAssign for every permutation of the NumDims dimensions.
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+          TiledEvaluation Tiling, int Layout>
+static void test_execute_shuffle_rvalue(Device d)
+{
+  static constexpr int Options = 0 | Layout;
+  typedef Tensor<T, NumDims, Options, Index> TensorType;
+
+  auto dims = RandomDims<NumDims>(1, 10);
+  TensorType src(dims);
+  src.setRandom();
+
+  // Start from the identity permutation; std::next_permutation then steps
+  // through every remaining ordering.
+  DSizes<Index, NumDims> shuffle;
+  for (int axis = 0; axis < NumDims; ++axis) shuffle[axis] = axis;
+
+  bool has_next = true;
+  while (has_next) {
+    // Output dimension `axis` takes the size of input dimension shuffle[axis].
+    DSizes<Index, NumDims> shuffled_dims;
+    for (int axis = 0; axis < NumDims; ++axis) {
+      shuffled_dims[axis] = dims[shuffle[axis]];
+    }
+
+    const auto expr = src.shuffle(shuffle);
+
+    // Shuffling on the default device is assumed to be tested and correct,
+    // so it provides the reference for the executor under test.
+    TensorType golden(shuffled_dims);
+    DefaultAssign(golden, expr);
+
+    // Same expression through the configured tensor executor.
+    TensorType dst(shuffled_dims);
+    DeviceAssign<Vectorizable, Tiling>(d, dst, expr);
+
+    const Index num_coeffs = dst.dimensions().TotalSize();
+    for (Index idx = 0; idx < num_coeffs; ++idx) {
+      VERIFY_IS_EQUAL(dst.coeff(idx), golden.coeff(idx));
+    }
+
+    has_next = std::next_permutation(&shuffle[0], &shuffle[0] + NumDims);
+  }
+}
+
+// Checks that assigning `src` into `dst.shuffle(perm)` with the configured
+// executor matches DefaultAssign into `golden.shuffle(perm)`, for every
+// permutation of the NumDims dimensions.
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+          TiledEvaluation Tiling, int Layout>
+static void test_execute_shuffle_lvalue(Device d)
+{
+  static constexpr int Options = 0 | Layout;
+  typedef Tensor<T, NumDims, Options, Index> TensorType;
+
+  auto dims = RandomDims<NumDims>(5, 10);
+  TensorType src(dims);
+  src.setRandom();
+
+  // Start from the identity permutation; std::next_permutation then steps
+  // through every remaining ordering.
+  DSizes<Index, NumDims> shuffle;
+  for (int axis = 0; axis < NumDims; ++axis) shuffle[axis] = axis;
+
+  bool has_next = true;
+  while (has_next) {
+    // Destination dimension shuffle[axis] takes the size of source dimension
+    // `axis`.
+    DSizes<Index, NumDims> shuffled_dims;
+    for (int axis = 0; axis < NumDims; ++axis) {
+      shuffled_dims[shuffle[axis]] = dims[axis];
+    }
+
+    // Shuffling on the default device is assumed to be tested and correct,
+    // so it provides the reference for the executor under test.
+    TensorType golden(shuffled_dims);
+    auto golden_shuffle = golden.shuffle(shuffle);
+    DefaultAssign(golden_shuffle, src);
+
+    // Same assignment through the configured tensor executor.
+    TensorType dst(shuffled_dims);
+    auto dst_shuffle = dst.shuffle(shuffle);
+    DeviceAssign<Vectorizable, Tiling>(d, dst_shuffle, src);
+
+    const Index num_coeffs = dst.dimensions().TotalSize();
+    for (Index idx = 0; idx < num_coeffs; ++idx) {
+      VERIFY_IS_EQUAL(dst.coeff(idx), golden.coeff(idx));
+    }
+
+    has_next = std::next_permutation(&shuffle[0], &shuffle[0] + NumDims);
+  }
+}
+
+// Checks that `src.reshape(...)` evaluated with the configured executor
+// matches the result of constructing a tensor directly from the same
+// expression. The reshaped tensor has NumDims - 1 dimensions: the first two
+// source dimensions are merged and the resulting sizes are placed in a
+// randomly shuffled order.
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+          TiledEvaluation Tiling, int Layout>
+static void test_execute_reshape(Device d)
+{
+  static_assert(NumDims >= 2, "NumDims must be greater or equal than 2");
+
+  static constexpr int ReshapedDims = NumDims - 1;
+  static constexpr int Options = 0 | Layout;
+
+  auto dims = RandomDims<NumDims>(5, 10);
+  Tensor<T, NumDims, Options, Index> src(dims);
+  src.setRandom();
+
+  // Merge the 0th and 1st dimensions, then shuffle the order of the
+  // resulting dimensions.
+  std::vector<Index> shuffle;
+  for (int i = 0; i < ReshapedDims; ++i) shuffle.push_back(i);
+
+  // A default-constructed std::mt19937 always starts from the same fixed
+  // seed, which would produce the same "shuffled" order on every run. Seed
+  // it from the random source used by the rest of this test instead.
+  std::mt19937 rng(static_cast<std::mt19937::result_type>(
+      internal::random<int>(0, 1 << 20)));
+  std::shuffle(shuffle.begin(), shuffle.end(), rng);
+
+  DSizes<Index, ReshapedDims> reshaped_dims;
+  reshaped_dims[shuffle[0]] = dims[0] * dims[1];
+  for (int i = 1; i < ReshapedDims; ++i) reshaped_dims[shuffle[i]] = dims[i + 1];
+
+  // Reference result obtained by constructing a tensor from the expression.
+  Tensor<T, ReshapedDims, Options, Index> golden = src.reshape(reshaped_dims);
+
+  // Now reshape using configured tensor executor.
+  Tensor<T, ReshapedDims, Options, Index> dst(golden.dimensions());
+
+  auto expr = src.reshape(reshaped_dims);
+
+  using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
+  using Executor =
+      internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>;
+
+  Executor::run(Assign(dst, expr), d);
+
+  for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+    VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
+  }
+}
+
+// Checks that reading a random slice of `src` through the configured executor
+// matches the result of constructing a tensor directly from the same slice
+// expression.
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+          TiledEvaluation Tiling, int Layout>
+static void test_execute_slice_rvalue(Device d)
+{
+  static_assert(NumDims >= 2, "NumDims must be greater or equal than 2");
+  static constexpr int Options = 0 | Layout;
+  typedef Tensor<T, NumDims, Options, Index> TensorType;
+
+  auto dims = RandomDims<NumDims>(5, 10);
+  TensorType src(dims);
+  src.setRandom();
+
+  // Draw candidate slice offsets and extents.
+  auto slice_start = DSizes<Index, NumDims>(RandomDims<NumDims>());
+  auto slice_size = DSizes<Index, NumDims>(RandomDims<NumDims>());
+
+  // Clamp them so that start + size stays within the tensor dimensions.
+  for (int axis = 0; axis < NumDims; ++axis) {
+    slice_start[axis] = numext::mini(dims[axis] - 1, slice_start[axis]);
+    slice_size[axis] =
+        numext::mini(slice_size[axis], dims[axis] - slice_start[axis]);
+  }
+
+  // Reference result obtained by constructing a tensor from the expression.
+  TensorType golden = src.slice(slice_start, slice_size);
+
+  // Now slice using the configured tensor executor.
+  TensorType dst(golden.dimensions());
+
+  auto expr = src.slice(slice_start, slice_size);
+  DeviceAssign<Vectorizable, Tiling>(d, dst, expr);
+
+  const Index num_coeffs = dst.dimensions().TotalSize();
+  for (Index idx = 0; idx < num_coeffs; ++idx) {
+    VERIFY_IS_EQUAL(dst.coeff(idx), golden.coeff(idx));
+  }
+}
+
+// Checks that writing a random tensor into a slice of `dst` through the
+// configured executor produces the same full tensor as the plain assignment
+// `golden.slice(...) = slice`.
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+          TiledEvaluation Tiling, int Layout>
+static void test_execute_slice_lvalue(Device d)
+{
+  static_assert(NumDims >= 2, "NumDims must be greater or equal than 2");
+  static constexpr int Options = 0 | Layout;
+  typedef Tensor<T, NumDims, Options, Index> TensorType;
+
+  auto dims = RandomDims<NumDims>(5, 10);
+  TensorType src(dims);
+  src.setRandom();
+
+  // Draw candidate slice offsets and extents.
+  auto slice_start = DSizes<Index, NumDims>(RandomDims<NumDims>(1, 10));
+  auto slice_size = DSizes<Index, NumDims>(RandomDims<NumDims>(1, 10));
+
+  // Clamp them so that start + size stays within the tensor dimensions.
+  for (int axis = 0; axis < NumDims; ++axis) {
+    slice_start[axis] = numext::mini(dims[axis] - 1, slice_start[axis]);
+    slice_size[axis] =
+        numext::mini(slice_size[axis], dims[axis] - slice_start[axis]);
+  }
+
+  // Random payload that gets written into the slice.
+  TensorType slice(slice_size);
+  slice.setRandom();
+
+  // Reference: plain assignment into the slice of a copy of src.
+  TensorType golden = src;
+  golden.slice(slice_start, slice_size) = slice;
+
+  // Same assignment through the configured execution strategy.
+  TensorType dst = src;
+  auto expr = dst.slice(slice_start, slice_size);
+
+  typedef TensorAssignOp<decltype(expr), const TensorType> AssignOp;
+  typedef internal::TensorExecutor<const AssignOp, Device, Vectorizable, Tiling>
+      ExecutorType;
+
+  ExecutorType::run(AssignOp(expr, slice), d);
+
+  const Index num_coeffs = dst.dimensions().TotalSize();
+  for (Index idx = 0; idx < num_coeffs; ++idx) {
+    VERIFY_IS_EQUAL(dst.coeff(idx), golden.coeff(idx));
+  }
+}
+
+// Checks that broadcasting a force-evaluated expression
+// (`src.square().eval().broadcast(...)`) through the configured executor
+// matches the result of a plain tensor assignment of the same expression.
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+          TiledEvaluation Tiling, int Layout>
+static void test_execute_broadcasting_of_forced_eval(Device d)
+{
+  static constexpr int Options = 0 | Layout;
+  typedef Tensor<T, NumDims, Options, Index> TensorType;
+
+  auto dims = RandomDims<NumDims>(1, 10);
+  TensorType src(dims);
+  src.setRandom();
+
+  const auto broadcasts = RandomDims<NumDims>(1, 7);
+  const auto expr = src.square().eval().broadcast(broadcasts);
+
+  // Broadcasting through plain assignment is assumed to be tested and
+  // correct, so it provides the reference for the executor under test.
+  TensorType golden;
+  golden = expr;
+
+  // Same expression, this time through the configured tensor executor.
+  TensorType dst(golden.dimensions());
+
+  typedef TensorAssignOp<TensorType, const decltype(expr)> AssignOp;
+  typedef internal::TensorExecutor<const AssignOp, Device, Vectorizable, Tiling>
+      ExecutorType;
+
+  ExecutorType::run(AssignOp(dst, expr), d);
+
+  const Index num_coeffs = dst.dimensions().TotalSize();
+  for (Index idx = 0; idx < num_coeffs; ++idx) {
+    VERIFY_IS_EQUAL(dst.coeff(idx), golden.coeff(idx));
+  }
+}
+
+// Generator functor used by the generator-op test: maps a coordinate array to
+// the sum over i of (i + 1) * dims[i], with each term converted to T before it
+// is accumulated.
+template <typename T, int NumDims>
+struct DummyGenerator {
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+  T operator()(const array<Index, NumDims>& dims) const {
+    T total = static_cast<T>(0);
+    int axis = 0;
+    while (axis < NumDims) {
+      total += static_cast<T>((axis + 1) * dims[axis]);
+      ++axis;
+    }
+    return total;
+  }
+};
+
+// Checks that `src.generate(DummyGenerator)` evaluated with the configured
+// executor matches the result of a plain tensor assignment of the same
+// expression.
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+          TiledEvaluation Tiling, int Layout>
+static void test_execute_generator_op(Device d)
+{
+  static constexpr int Options = 0 | Layout;
+  typedef Tensor<T, NumDims, Options, Index> TensorType;
+
+  auto dims = RandomDims<NumDims>(20, 30);
+  TensorType src(dims);
+  src.setRandom();
+
+  const auto expr = src.generate(DummyGenerator<T, NumDims>());
+
+  // The generator op through plain assignment is assumed to be tested and
+  // correct, so it provides the reference for the executor under test.
+  TensorType golden;
+  golden = expr;
+
+  // Now run the generator using the configured tensor executor.
+  TensorType dst(golden.dimensions());
+  DeviceAssign<Vectorizable, Tiling>(d, dst, expr);
+
+  const Index num_coeffs = dst.dimensions().TotalSize();
+  for (Index idx = 0; idx < num_coeffs; ++idx) {
+    VERIFY_IS_EQUAL(dst.coeff(idx), golden.coeff(idx));
+  }
+}
+
+// Checks that `src.reverse(...)` evaluated with the configured executor
+// matches the result of a plain tensor assignment of the same expression.
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+          TiledEvaluation Tiling, int Layout>
+static void test_execute_reverse_rvalue(Device d)
+{
+  static constexpr int Options = 0 | Layout;
+  typedef Tensor<T, NumDims, Options, Index> TensorType;
+
+  // The per-dimension upper bound is the NumDims-th root of one million; the
+  // floating point value is converted to int by the RandomDims parameter.
+  auto dims = RandomDims<NumDims>(1, numext::pow(1000000.0, 1.0 / NumDims));
+  TensorType src(dims);
+  src.setRandom();
+
+  // Decide independently at random, per dimension, whether to reverse it.
+  Eigen::array<bool, NumDims> reverse;
+  for (int axis = 0; axis < NumDims; ++axis) {
+    reverse[axis] = internal::random<bool>();
+  }
+
+  const auto expr = src.reverse(reverse);
+
+  // Reversing through plain assignment is assumed to be tested and correct,
+  // so it provides the reference for the executor under test.
+  TensorType golden;
+  golden = expr;
+
+  // Now do the reversing using the configured tensor executor.
+  TensorType dst(golden.dimensions());
+  DeviceAssign<Vectorizable, Tiling>(d, dst, expr);
+
+  const Index num_coeffs = dst.dimensions().TotalSize();
+  for (Index idx = 0; idx < num_coeffs; ++idx) {
+    VERIFY_IS_EQUAL(dst.coeff(idx), golden.coeff(idx));
+  }
+}
+
+// Asynchronous variant of the unary test: starts `dst = src.square()` with
+// TensorAsyncExecutor::runAsync, waits on a barrier that the completion
+// callback notifies, and then checks every coefficient.
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+          TiledEvaluation Tiling, int Layout>
+static void test_async_execute_unary_expr(Device d)
+{
+  static constexpr int Options = 0 | Layout;
+  typedef Tensor<T, NumDims, Options, Index> TensorType;
+
+  // The tensor must be large enough to bypass the small tensor block
+  // evaluation optimization.
+  auto dims = RandomDims<NumDims>(50 / NumDims, 100 / NumDims);
+
+  TensorType src(dims);
+  TensorType dst(dims);
+  src.setRandom();
+
+  const auto expr = src.square();
+
+  // The callback releases the barrier that this thread waits on below.
+  Eigen::Barrier done(1);
+  auto on_done = [&done]() { done.Notify(); };
+
+  typedef TensorAssignOp<TensorType, const decltype(expr)> AssignOp;
+  typedef decltype(on_done) DoneCallback;
+  typedef internal::TensorAsyncExecutor<const AssignOp, Device, DoneCallback,
+                                        Vectorizable, Tiling>
+      AsyncExecutor;
+
+  AsyncExecutor::runAsync(AssignOp(dst, expr), d, on_done);
+  done.Wait();
+
+  const Index num_coeffs = dst.dimensions().TotalSize();
+  for (Index idx = 0; idx < num_coeffs; ++idx) {
+    const T expected = src.coeff(idx) * src.coeff(idx);
+    VERIFY_IS_EQUAL(expected, dst.coeff(idx));
+  }
+}
+
+// Asynchronous variant of the binary test: starts `dst = lhs + rhs` with
+// TensorAsyncExecutor::runAsync, waits on a barrier that the completion
+// callback notifies, and then checks every coefficient.
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+          TiledEvaluation Tiling, int Layout>
+static void test_async_execute_binary_expr(Device d)
+{
+  static constexpr int Options = 0 | Layout;
+  typedef Tensor<T, NumDims, Options, Index> TensorType;
+
+  // The tensors must be large enough to bypass the small tensor block
+  // evaluation optimization.
+  auto dims = RandomDims<NumDims>(50 / NumDims, 100 / NumDims);
+
+  TensorType lhs(dims);
+  TensorType rhs(dims);
+  TensorType dst(dims);
+
+  lhs.setRandom();
+  rhs.setRandom();
+
+  const auto expr = lhs + rhs;
+
+  // The callback releases the barrier that this thread waits on below.
+  Eigen::Barrier done(1);
+  auto on_done = [&done]() { done.Notify(); };
+
+  typedef TensorAssignOp<TensorType, const decltype(expr)> AssignOp;
+  typedef decltype(on_done) DoneCallback;
+  typedef internal::TensorAsyncExecutor<const AssignOp, Device, DoneCallback,
+                                        Vectorizable, Tiling>
+      AsyncExecutor;
+
+  AsyncExecutor::runAsync(AssignOp(dst, expr), d, on_done);
+  done.Wait();
+
+  const Index num_coeffs = dst.dimensions().TotalSize();
+  for (Index idx = 0; idx < num_coeffs; ++idx) {
+    const T expected = lhs.coeff(idx) + rhs.coeff(idx);
+    VERIFY_IS_EQUAL(expected, dst.coeff(idx));
+  }
+}
+
+// VECTORIZABLE(VAL) produces the value passed as the `Vectorizable` template
+// argument of the tests below. When EIGEN_DONT_VECTORIZE is defined, VAL is
+// additionally gated on `!EIGEN_DONT_VECTORIZE`. The expansion and the argument
+// are parenthesized so that the macro behaves as a single operand wherever it
+// is used (e.g. `!VECTORIZABLE(a || b)`).
+#ifdef EIGEN_DONT_VECTORIZE
+#define VECTORIZABLE(VAL) (!EIGEN_DONT_VECTORIZE && (VAL))
+#else
+#define VECTORIZABLE(VAL) (VAL)
+#endif
+
+// Pastes PART onto `CALL_SUBTEST_`, giving the CALL_SUBTEST_<PART> macro name.
+#define CALL_SUBTEST_PART(PART) CALL_SUBTEST_##PART
+
+// Invokes test template NAME<T, NUM_DIMS, ...> as part PART for all 16
+// combinations of:
+//   - device:        DefaultDevice (default_device), ThreadPoolDevice (tp_device)
+//   - vectorization: false, VECTORIZABLE(true)
+//   - tiling:        TiledEvaluation::Off, TiledEvaluation::On
+//   - layout:        ColMajor, RowMajor
+// The expansion refers to variables named `default_device` and `tp_device`,
+// which must be in scope at the point of use.
+#define CALL_SUBTEST_COMBINATIONS(PART, NAME, T, NUM_DIMS) \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::Off, ColMajor>(default_device))); \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::On, ColMajor>(default_device))); \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::Off, ColMajor>(default_device))); \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::On, ColMajor>(default_device))); \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::Off, RowMajor>(default_device))); \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::On, RowMajor>(default_device))); \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::Off, RowMajor>(default_device))); \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::On, RowMajor>(default_device))); \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Off, ColMajor>(tp_device))); \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::On, ColMajor>(tp_device))); \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Off, ColMajor>(tp_device))); \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::On, ColMajor>(tp_device))); \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Off, RowMajor>(tp_device))); \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::On, RowMajor>(tp_device))); \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Off, RowMajor>(tp_device))); \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::On, RowMajor>(tp_device)))
+
+// NOTE: Currently only ThreadPoolDevice supports async expression evaluation.
+// Invokes test template NAME<T, NUM_DIMS, ThreadPoolDevice, ...> as part PART
+// for all 8 combinations of vectorization (false, VECTORIZABLE(true)), tiling
+// (TiledEvaluation::Off, TiledEvaluation::On) and layout (ColMajor, RowMajor).
+// The expansion refers to a variable named `tp_device`, which must be in scope
+// at the point of use.
+#define CALL_ASYNC_SUBTEST_COMBINATIONS(PART, NAME, T, NUM_DIMS) \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Off, ColMajor>(tp_device))); \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::On, ColMajor>(tp_device))); \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Off, ColMajor>(tp_device))); \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::On, ColMajor>(tp_device))); \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Off, RowMajor>(tp_device))); \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::On, RowMajor>(tp_device))); \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Off, RowMajor>(tp_device))); \
+ CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::On, RowMajor>(tp_device)))
+
+// Test entry point: runs each executor test defined in this file on float
+// tensors of the listed ranks, across the device / vectorization / tiling /
+// layout combinations expanded by the *_SUBTEST_COMBINATIONS macros.
+EIGEN_DECLARE_TEST(cxx11_tensor_executor) {
+ Eigen::DefaultDevice default_device;
+ // Default device is unused in ASYNC tests.
+ EIGEN_UNUSED_VARIABLE(default_device);
+
+ // One thread pool, with a thread count drawn via internal::random<int>(20, 24),
+ // backs the ThreadPoolDevice used by every subtest below.
+ const auto num_threads = internal::random<int>(20, 24);
+ Eigen::ThreadPool tp(num_threads);
+ Eigen::ThreadPoolDevice tp_device(&tp, num_threads);
+
+ CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 3);
+ CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 4);
+ CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 5);
+
+ CALL_SUBTEST_COMBINATIONS(2, test_execute_binary_expr, float, 3);
+ CALL_SUBTEST_COMBINATIONS(2, test_execute_binary_expr, float, 4);
+ CALL_SUBTEST_COMBINATIONS(2, test_execute_binary_expr, float, 5);
+
+ CALL_SUBTEST_COMBINATIONS(3, test_execute_broadcasting, float, 3);
+ CALL_SUBTEST_COMBINATIONS(3, test_execute_broadcasting, float, 4);
+ CALL_SUBTEST_COMBINATIONS(3, test_execute_broadcasting, float, 5);
+
+ CALL_SUBTEST_COMBINATIONS(4, test_execute_chipping_rvalue, float, 3);
+ CALL_SUBTEST_COMBINATIONS(4, test_execute_chipping_rvalue, float, 4);
+ CALL_SUBTEST_COMBINATIONS(4, test_execute_chipping_rvalue, float, 5);
+
+ CALL_SUBTEST_COMBINATIONS(5, test_execute_chipping_lvalue, float, 3);
+ CALL_SUBTEST_COMBINATIONS(5, test_execute_chipping_lvalue, float, 4);
+ CALL_SUBTEST_COMBINATIONS(5, test_execute_chipping_lvalue, float, 5);
+
+ CALL_SUBTEST_COMBINATIONS(6, test_execute_shuffle_rvalue, float, 3);
+ CALL_SUBTEST_COMBINATIONS(6, test_execute_shuffle_rvalue, float, 4);
+ CALL_SUBTEST_COMBINATIONS(6, test_execute_shuffle_rvalue, float, 5);
+
+ CALL_SUBTEST_COMBINATIONS(7, test_execute_shuffle_lvalue, float, 3);
+ CALL_SUBTEST_COMBINATIONS(7, test_execute_shuffle_lvalue, float, 4);
+ CALL_SUBTEST_COMBINATIONS(7, test_execute_shuffle_lvalue, float, 5);
+
+ // NOTE: no subtest in this function uses part number 8, although the suffix
+ // list at the bottom of the function includes it.
+ CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 2);
+ CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 3);
+ CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 4);
+ CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 5);
+
+ CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 2);
+ CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 3);
+ CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 4);
+ CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 5);
+
+ CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 2);
+ CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 3);
+ CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 4);
+ CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 5);
+
+ CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 2);
+ CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 3);
+ CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 4);
+ CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 5);
+
+ CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 2);
+ CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 3);
+ CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 4);
+ CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 5);
+
+ CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 1);
+ CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 2);
+ CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 3);
+ CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 4);
+ CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 5);
+
+ // Async variants run on the thread pool device only.
+ CALL_ASYNC_SUBTEST_COMBINATIONS(15, test_async_execute_unary_expr, float, 3);
+ CALL_ASYNC_SUBTEST_COMBINATIONS(15, test_async_execute_unary_expr, float, 4);
+ CALL_ASYNC_SUBTEST_COMBINATIONS(15, test_async_execute_unary_expr, float, 5);
+
+ CALL_ASYNC_SUBTEST_COMBINATIONS(16, test_async_execute_binary_expr, float, 3);
+ CALL_ASYNC_SUBTEST_COMBINATIONS(16, test_async_execute_binary_expr, float, 4);
+ CALL_ASYNC_SUBTEST_COMBINATIONS(16, test_async_execute_binary_expr, float, 5);
+
+ // Force CMake to split this test.
+ // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16
+}