Diffstat (limited to 'unsupported/test/cxx11_tensor_thread_pool.cpp')
-rw-r--r--  unsupported/test/cxx11_tensor_thread_pool.cpp  |  390
1 file changed, 369 insertions(+), 21 deletions(-)
diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp
index 2ef665f30..b772a1d60 100644
--- a/unsupported/test/cxx11_tensor_thread_pool.cpp
+++ b/unsupported/test/cxx11_tensor_thread_pool.cpp
@@ -16,29 +16,72 @@
using Eigen::Tensor;
+class TestAllocator : public Allocator {
+ public:
+ ~TestAllocator() EIGEN_OVERRIDE {}
+ EIGEN_DEVICE_FUNC void* allocate(size_t num_bytes) const EIGEN_OVERRIDE {
+ const_cast<TestAllocator*>(this)->alloc_count_++;
+ return internal::aligned_malloc(num_bytes);
+ }
+ EIGEN_DEVICE_FUNC void deallocate(void* buffer) const EIGEN_OVERRIDE {
+ const_cast<TestAllocator*>(this)->dealloc_count_++;
+ internal::aligned_free(buffer);
+ }
+
+ int alloc_count() const { return alloc_count_; }
+ int dealloc_count() const { return dealloc_count_; }
+
+ private:
+ int alloc_count_ = 0;
+ int dealloc_count_ = 0;
+};
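For context, a minimal sketch of how this counting allocator is meant to be wired in (mirroring test_threadpool_allocate further down; assumes the Eigen Tensor and ThreadPool headers): a ThreadPoolDevice constructed with a non-null Allocator* routes every allocate()/deallocate() call through it.

  Eigen::ThreadPool pool(4);
  TestAllocator counting_allocator;
  Eigen::ThreadPoolDevice dev(&pool, /*num_cores=*/4, &counting_allocator);
  void* buf = dev.allocate(512);  // bumps counting_allocator.alloc_count()
  dev.deallocate(buf);            // bumps counting_allocator.dealloc_count()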
void test_multithread_elementwise()
{
- Tensor<float, 3> in1(2,3,7);
- Tensor<float, 3> in2(2,3,7);
- Tensor<float, 3> out(2,3,7);
+ Tensor<float, 3> in1(200, 30, 70);
+ Tensor<float, 3> in2(200, 30, 70);
+ Tensor<double, 3> out(200, 30, 70);
in1.setRandom();
in2.setRandom();
Eigen::ThreadPool tp(internal::random<int>(3, 11));
Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(3, 11));
- out.device(thread_pool_device) = in1 + in2 * 3.14f;
+ out.device(thread_pool_device) = (in1 + in2 * 3.14f).cast<double>();
- for (int i = 0; i < 2; ++i) {
- for (int j = 0; j < 3; ++j) {
- for (int k = 0; k < 7; ++k) {
- VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f);
+ for (int i = 0; i < 200; ++i) {
+ for (int j = 0; j < 30; ++j) {
+ for (int k = 0; k < 70; ++k) {
+ VERIFY_IS_APPROX(out(i, j, k), static_cast<double>(in1(i, j, k) + in2(i, j, k) * 3.14f));
}
}
}
}
+void test_async_multithread_elementwise()
+{
+ Tensor<float, 3> in1(200, 30, 70);
+ Tensor<float, 3> in2(200, 30, 70);
+ Tensor<double, 3> out(200, 30, 70);
+
+ in1.setRandom();
+ in2.setRandom();
+
+ Eigen::ThreadPool tp(internal::random<int>(3, 11));
+ Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(3, 11));
+
+ Eigen::Barrier b(1);
+ out.device(thread_pool_device, [&b]() { b.Notify(); }) = (in1 + in2 * 3.14f).cast<double>();
+ b.Wait();
+
+ for (int i = 0; i < 200; ++i) {
+ for (int j = 0; j < 30; ++j) {
+ for (int k = 0; k < 70; ++k) {
+ VERIFY_IS_APPROX(out(i, j, k), static_cast<double>(in1(i, j, k) + in2(i, j, k) * 3.14f));
+ }
+ }
+ }
+}
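The done-callback overload of device() used above returns immediately and invokes the callback once the expression has been evaluated; Eigen::Barrier turns that into a blocking wait. A minimal standalone sketch of the same pattern (assuming the Eigen Tensor headers):

  Eigen::ThreadPool pool(4);
  Eigen::ThreadPoolDevice dev(&pool, 4);
  Eigen::Tensor<float, 1> a(1000), b(1000), c(1000);
  a.setRandom();
  b.setRandom();
  Eigen::Barrier done(1);  // expects exactly one Notify()
  c.device(dev, [&done]() { done.Notify(); }) = a + b;
  done.Wait();             // block until the async evaluation completes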
void test_multithread_compound_assignment()
{
@@ -232,6 +275,273 @@ void test_multithread_contraction_agrees_with_singlethread() {
}
}
+// Apply Sqrt to all output elements.
+struct SqrtOutputKernel {
+ template <typename Index, typename Scalar>
+ EIGEN_ALWAYS_INLINE void operator()(
+ const internal::blas_data_mapper<Scalar, Index, ColMajor>& output_mapper,
+ const TensorContractionParams&, Index, Index, Index num_rows,
+ Index num_cols) const {
+ for (int i = 0; i < num_rows; ++i) {
+ for (int j = 0; j < num_cols; ++j) {
+ output_mapper(i, j) = std::sqrt(output_mapper(i, j));
+ }
+ }
+ }
+};
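Any functor with the operator() signature above can serve as an output kernel: the contraction invokes it once per finished output block, and the mapper gives read/write access to that block. A minimal usage sketch under the same assumptions as the tests below (inputs are shifted positive so the sqrt stays real):

  Eigen::ThreadPool pool(2);
  Eigen::ThreadPoolDevice dev(&pool, 2);
  Eigen::Tensor<float, 2> lhs(4, 8), rhs(8, 3), out(4, 3);
  lhs.setRandom();
  rhs.setRandom();
  lhs += lhs.constant(1.0f);  // keep products positive for sqrt
  rhs += rhs.constant(1.0f);
  typedef Eigen::Tensor<float, 1>::DimensionPair DimPair;
  Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}});
  out.device(dev) = lhs.contract(rhs, dims, SqrtOutputKernel());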
+
+template <int DataLayout>
+static void test_multithread_contraction_with_output_kernel() {
+ typedef Tensor<float, 1>::DimensionPair DimPair;
+
+ const int num_threads = internal::random<int>(2, 11);
+ ThreadPool threads(num_threads);
+ Eigen::ThreadPoolDevice device(&threads, num_threads);
+
+ Tensor<float, 4, DataLayout> t_left(30, 50, 8, 31);
+ Tensor<float, 5, DataLayout> t_right(8, 31, 7, 20, 10);
+ Tensor<float, 5, DataLayout> t_result(30, 50, 7, 20, 10);
+
+ t_left.setRandom();
+ t_right.setRandom();
+ // Put trash in t_result to verify contraction clears output memory.
+ t_result.setRandom();
+
+ // Add a little offset so that the results won't be close to zero.
+ t_left += t_left.constant(1.0f);
+ t_right += t_right.constant(1.0f);
+
+ typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
+ MapXf m_left(t_left.data(), 1500, 248);
+ MapXf m_right(t_right.data(), 248, 1400);
+ Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(1500, 1400);
+
+ // this contraction should be equivalent to a single matrix multiplication
+ Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(3, 1)}});
+
+ // compute results by separate methods
+ t_result.device(device) = t_left.contract(t_right, dims, SqrtOutputKernel());
+
+ m_result = m_left * m_right;
+
+ for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
+ VERIFY(&t_result.data()[i] != &m_result.data()[i]);
+ VERIFY_IS_APPROX(t_result.data()[i], std::sqrt(m_result.data()[i]));
+ }
+}
+
+template<int DataLayout>
+void test_async_multithread_contraction_agrees_with_singlethread()
+{
+ int contract_size = internal::random<int>(100, 500);
+
+ Tensor<float, 3, DataLayout> left(internal::random<int>(10, 40),
+ contract_size,
+ internal::random<int>(10, 40));
+
+ Tensor<float, 4, DataLayout> right(
+ internal::random<int>(1, 20), internal::random<int>(1, 20), contract_size,
+ internal::random<int>(1, 20));
+
+ left.setRandom();
+ right.setRandom();
+
+ // add constants to shift values away from 0 for more precision
+ left += left.constant(1.5f);
+ right += right.constant(1.5f);
+
+ typedef Tensor<float, 1>::DimensionPair DimPair;
+ Eigen::array<DimPair, 1> dims({{DimPair(1, 2)}});
+
+ Eigen::ThreadPool tp(internal::random<int>(2, 11));
+ Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(8, 32));
+
+ Tensor<float, 5, DataLayout> st_result;
+ st_result = left.contract(right, dims);
+
+ Tensor<float, 5, DataLayout> tp_result(st_result.dimensions());
+
+ Eigen::Barrier barrier(1);
+ tp_result.device(thread_pool_device, [&barrier]() { barrier.Notify(); }) =
+ left.contract(right, dims);
+ barrier.Wait();
+
+ VERIFY(dimensions_match(st_result.dimensions(), tp_result.dimensions()));
+ for (ptrdiff_t i = 0; i < st_result.size(); i++) {
+ // Skip the check when the absolute difference is tiny: for values close to
+ // zero the relative-error test would fail due to numerical precision issues.
+ if (numext::abs(st_result.data()[i] - tp_result.data()[i]) >= 1e-4f) {
+ VERIFY_IS_APPROX(st_result.data()[i], tp_result.data()[i]);
+ }
+ }
+}
+
+// We are triggering the 'evalShardedByInnerDim' optimization: the output
+// (2x10) is tiny relative to the contraction dimension (10000), so the
+// threaded contraction shards work across the inner (contraction) dimension.
+template <int DataLayout>
+static void test_sharded_by_inner_dim_contraction()
+{
+ typedef Tensor<float, 1>::DimensionPair DimPair;
+
+ const int num_threads = internal::random<int>(4, 16);
+ ThreadPool threads(num_threads);
+ Eigen::ThreadPoolDevice device(&threads, num_threads);
+
+ Tensor<float, 2, DataLayout> t_left(2, 10000);
+ Tensor<float, 2, DataLayout> t_right(10000, 10);
+ Tensor<float, 2, DataLayout> t_result(2, 10);
+
+ t_left.setRandom();
+ t_right.setRandom();
+ // Put trash in t_result to verify contraction clears output memory.
+ t_result.setRandom();
+
+ // Add a little offset so that the results won't be close to zero.
+ t_left += t_left.constant(1.0f);
+ t_right += t_right.constant(1.0f);
+
+ typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
+ MapXf m_left(t_left.data(), 2, 10000);
+ MapXf m_right(t_right.data(), 10000, 10);
+ Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(2, 10);
+
+ // this contraction should be equivalent to a single matrix multiplication
+ Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}});
+
+ // compute results by separate methods
+ t_result.device(device) = t_left.contract(t_right, dims);
+ m_result = m_left * m_right;
+
+ for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
+ VERIFY_IS_APPROX(t_result.data()[i], m_result.data()[i]);
+ }
+}
+
+// We are triggering 'evalShardedByInnerDim' optimization with output kernel.
+template <int DataLayout>
+static void test_sharded_by_inner_dim_contraction_with_output_kernel()
+{
+ typedef Tensor<float, 1>::DimensionPair DimPair;
+
+ const int num_threads = internal::random<int>(4, 16);
+ ThreadPool threads(num_threads);
+ Eigen::ThreadPoolDevice device(&threads, num_threads);
+
+ Tensor<float, 2, DataLayout> t_left(2, 10000);
+ Tensor<float, 2, DataLayout> t_right(10000, 10);
+ Tensor<float, 2, DataLayout> t_result(2, 10);
+
+ t_left.setRandom();
+ t_right.setRandom();
+ // Put trash in t_result to verify contraction clears output memory.
+ t_result.setRandom();
+
+ // Add a little offset so that the results won't be close to zero.
+ t_left += t_left.constant(1.0f);
+ t_right += t_right.constant(1.0f);
+
+ typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
+ MapXf m_left(t_left.data(), 2, 10000);
+ MapXf m_right(t_right.data(), 10000, 10);
+ Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(2, 10);
+
+ // this contraction should be equivalent to a single matrix multiplication
+ Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}});
+
+ // compute results by separate methods
+ t_result.device(device) = t_left.contract(t_right, dims, SqrtOutputKernel());
+ m_result = m_left * m_right;
+
+ for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
+ VERIFY_IS_APPROX(t_result.data()[i], std::sqrt(m_result.data()[i]));
+ }
+}
+
+// We are triggering 'evalShardedByInnerDim' optimization.
+template <int DataLayout>
+static void test_async_sharded_by_inner_dim_contraction()
+{
+ typedef Tensor<float, 1>::DimensionPair DimPair;
+
+ const int num_threads = internal::random<int>(4, 16);
+ ThreadPool threads(num_threads);
+ Eigen::ThreadPoolDevice device(&threads, num_threads);
+
+ Tensor<float, 2, DataLayout> t_left(2, 10000);
+ Tensor<float, 2, DataLayout> t_right(10000, 10);
+ Tensor<float, 2, DataLayout> t_result(2, 10);
+
+ t_left.setRandom();
+ t_right.setRandom();
+ // Put trash in t_result to verify contraction clears output memory.
+ t_result.setRandom();
+
+ // Add a little offset so that the results won't be close to zero.
+ t_left += t_left.constant(1.0f);
+ t_right += t_right.constant(1.0f);
+
+ typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
+ MapXf m_left(t_left.data(), 2, 10000);
+ MapXf m_right(t_right.data(), 10000, 10);
+ Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(2, 10);
+
+ // this contraction should be equivalent to a single matrix multiplication
+ Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}});
+
+ // compute results by separate methods
+ Eigen::Barrier barrier(1);
+ t_result.device(device, [&barrier]() { barrier.Notify(); }) =
+ t_left.contract(t_right, dims);
+ barrier.Wait();
+
+ m_result = m_left * m_right;
+
+ for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
+ VERIFY_IS_APPROX(t_result.data()[i], m_result.data()[i]);
+ }
+}
+
+// We are triggering 'evalShardedByInnerDim' optimization with output kernel.
+template <int DataLayout>
+static void test_async_sharded_by_inner_dim_contraction_with_output_kernel()
+{
+ typedef Tensor<float, 1>::DimensionPair DimPair;
+
+ const int num_threads = internal::random<int>(4, 16);
+ ThreadPool threads(num_threads);
+ Eigen::ThreadPoolDevice device(&threads, num_threads);
+
+ Tensor<float, 2, DataLayout> t_left(2, 10000);
+ Tensor<float, 2, DataLayout> t_right(10000, 10);
+ Tensor<float, 2, DataLayout> t_result(2, 10);
+
+ t_left.setRandom();
+ t_right.setRandom();
+ // Put trash in t_result to verify contraction clears output memory.
+ t_result.setRandom();
+
+ // Add a little offset so that the results won't be close to zero.
+ t_left += t_left.constant(1.0f);
+ t_right += t_right.constant(1.0f);
+
+ typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
+ MapXf m_left(t_left.data(), 2, 10000);
+ MapXf m_right(t_right.data(), 10000, 10);
+ Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(2, 10);
+
+ // this contraction should be equivalent to a single matrix multiplication
+ Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}});
+
+ // compute results by separate methods
+ Eigen::Barrier barrier(1);
+ t_result.device(device, [&barrier]() { barrier.Notify(); }) =
+ t_left.contract(t_right, dims, SqrtOutputKernel());
+ barrier.Wait();
+ m_result = m_left * m_right;
+
+ for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
+ VERIFY_IS_APPROX(t_result.data()[i], std::sqrt(m_result.data()[i]));
+ }
+}
template<int DataLayout>
void test_full_contraction() {
@@ -320,14 +630,14 @@ void test_multithread_random()
}
template<int DataLayout>
-void test_multithread_shuffle()
+void test_multithread_shuffle(Allocator* allocator)
{
Tensor<float, 4, DataLayout> tensor(17,5,7,11);
tensor.setRandom();
const int num_threads = internal::random<int>(2, 11);
ThreadPool threads(num_threads);
- Eigen::ThreadPoolDevice device(&threads, num_threads);
+ Eigen::ThreadPoolDevice device(&threads, num_threads, allocator);
Tensor<float, 4, DataLayout> shuffle(7,5,11,17);
array<ptrdiff_t, 4> shuffles = {{2,1,3,0}};
@@ -344,10 +654,26 @@ void test_multithread_shuffle()
}
}
+void test_threadpool_allocate(TestAllocator* allocator)
+{
+ const int num_threads = internal::random<int>(2, 11);
+ const int num_allocs = internal::random<int>(2, 11);
+ ThreadPool threads(num_threads);
+ Eigen::ThreadPoolDevice device(&threads, num_threads, allocator);
+
+ for (int a = 0; a < num_allocs; ++a) {
+ void* ptr = device.allocate(512);
+ device.deallocate(ptr);
+ }
+ VERIFY(allocator != NULL);
+ VERIFY_IS_EQUAL(allocator->alloc_count(), num_allocs);
+ VERIFY_IS_EQUAL(allocator->dealloc_count(), num_allocs);
+}
-void test_cxx11_tensor_thread_pool()
+EIGEN_DECLARE_TEST(cxx11_tensor_thread_pool)
{
CALL_SUBTEST_1(test_multithread_elementwise());
+ CALL_SUBTEST_1(test_async_multithread_elementwise());
CALL_SUBTEST_1(test_multithread_compound_assignment());
CALL_SUBTEST_2(test_multithread_contraction<ColMajor>());
@@ -355,19 +681,41 @@ void test_cxx11_tensor_thread_pool()
CALL_SUBTEST_3(test_multithread_contraction_agrees_with_singlethread<ColMajor>());
CALL_SUBTEST_3(test_multithread_contraction_agrees_with_singlethread<RowMajor>());
+ CALL_SUBTEST_3(test_multithread_contraction_with_output_kernel<ColMajor>());
+ CALL_SUBTEST_3(test_multithread_contraction_with_output_kernel<RowMajor>());
+
+ CALL_SUBTEST_4(test_async_multithread_contraction_agrees_with_singlethread<ColMajor>());
+ CALL_SUBTEST_4(test_async_multithread_contraction_agrees_with_singlethread<RowMajor>());
+
+ // Test EvalShardedByInnerDimContext parallelization strategy.
+ CALL_SUBTEST_5(test_sharded_by_inner_dim_contraction<ColMajor>());
+ CALL_SUBTEST_5(test_sharded_by_inner_dim_contraction<RowMajor>());
+ CALL_SUBTEST_5(test_sharded_by_inner_dim_contraction_with_output_kernel<ColMajor>());
+ CALL_SUBTEST_5(test_sharded_by_inner_dim_contraction_with_output_kernel<RowMajor>());
+
+ CALL_SUBTEST_6(test_async_sharded_by_inner_dim_contraction<ColMajor>());
+ CALL_SUBTEST_6(test_async_sharded_by_inner_dim_contraction<RowMajor>());
+ CALL_SUBTEST_6(test_async_sharded_by_inner_dim_contraction_with_output_kernel<ColMajor>());
+ CALL_SUBTEST_6(test_async_sharded_by_inner_dim_contraction_with_output_kernel<RowMajor>());
// Exercise various cases that have been problematic in the past.
- CALL_SUBTEST_4(test_contraction_corner_cases<ColMajor>());
- CALL_SUBTEST_4(test_contraction_corner_cases<RowMajor>());
+ CALL_SUBTEST_7(test_contraction_corner_cases<ColMajor>());
+ CALL_SUBTEST_7(test_contraction_corner_cases<RowMajor>());
+
+ CALL_SUBTEST_8(test_full_contraction<ColMajor>());
+ CALL_SUBTEST_8(test_full_contraction<RowMajor>());
+
+ CALL_SUBTEST_9(test_multithreaded_reductions<ColMajor>());
+ CALL_SUBTEST_9(test_multithreaded_reductions<RowMajor>());
- CALL_SUBTEST_4(test_full_contraction<ColMajor>());
- CALL_SUBTEST_4(test_full_contraction<RowMajor>());
+ CALL_SUBTEST_10(test_memcpy());
+ CALL_SUBTEST_10(test_multithread_random());
- CALL_SUBTEST_5(test_multithreaded_reductions<ColMajor>());
- CALL_SUBTEST_5(test_multithreaded_reductions<RowMajor>());
+ TestAllocator test_allocator;
+ CALL_SUBTEST_11(test_multithread_shuffle<ColMajor>(NULL));
+ CALL_SUBTEST_11(test_multithread_shuffle<RowMajor>(&test_allocator));
+ CALL_SUBTEST_11(test_threadpool_allocate(&test_allocator));
- CALL_SUBTEST_6(test_memcpy());
- CALL_SUBTEST_6(test_multithread_random());
- CALL_SUBTEST_6(test_multithread_shuffle<ColMajor>());
- CALL_SUBTEST_6(test_multithread_shuffle<RowMajor>());
+ // Force CMake to split this test.
+ // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8;9;10;11
}