diff options
Diffstat (limited to 'unsupported/test/cxx11_tensor_thread_pool.cpp')
-rw-r--r-- | unsupported/test/cxx11_tensor_thread_pool.cpp | 390 |
1 files changed, 369 insertions, 21 deletions
diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp index 2ef665f30..b772a1d60 100644 --- a/unsupported/test/cxx11_tensor_thread_pool.cpp +++ b/unsupported/test/cxx11_tensor_thread_pool.cpp @@ -16,29 +16,72 @@ using Eigen::Tensor; +class TestAllocator : public Allocator { + public: + ~TestAllocator() EIGEN_OVERRIDE {} + EIGEN_DEVICE_FUNC void* allocate(size_t num_bytes) const EIGEN_OVERRIDE { + const_cast<TestAllocator*>(this)->alloc_count_++; + return internal::aligned_malloc(num_bytes); + } + EIGEN_DEVICE_FUNC void deallocate(void* buffer) const EIGEN_OVERRIDE { + const_cast<TestAllocator*>(this)->dealloc_count_++; + internal::aligned_free(buffer); + } + + int alloc_count() const { return alloc_count_; } + int dealloc_count() const { return dealloc_count_; } + + private: + int alloc_count_ = 0; + int dealloc_count_ = 0; +}; void test_multithread_elementwise() { - Tensor<float, 3> in1(2,3,7); - Tensor<float, 3> in2(2,3,7); - Tensor<float, 3> out(2,3,7); + Tensor<float, 3> in1(200, 30, 70); + Tensor<float, 3> in2(200, 30, 70); + Tensor<double, 3> out(200, 30, 70); in1.setRandom(); in2.setRandom(); Eigen::ThreadPool tp(internal::random<int>(3, 11)); Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(3, 11)); - out.device(thread_pool_device) = in1 + in2 * 3.14f; + out.device(thread_pool_device) = (in1 + in2 * 3.14f).cast<double>(); - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 3; ++j) { - for (int k = 0; k < 7; ++k) { - VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f); + for (int i = 0; i < 200; ++i) { + for (int j = 0; j < 30; ++j) { + for (int k = 0; k < 70; ++k) { + VERIFY_IS_APPROX(out(i, j, k), static_cast<double>(in1(i, j, k) + in2(i, j, k) * 3.14f)); } } } } +void test_async_multithread_elementwise() +{ + Tensor<float, 3> in1(200, 30, 70); + Tensor<float, 3> in2(200, 30, 70); + Tensor<double, 3> out(200, 30, 70); + + in1.setRandom(); + in2.setRandom(); + + 
Eigen::ThreadPool tp(internal::random<int>(3, 11)); + Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(3, 11)); + + Eigen::Barrier b(1); + out.device(thread_pool_device, [&b]() { b.Notify(); }) = (in1 + in2 * 3.14f).cast<double>(); + b.Wait(); + + for (int i = 0; i < 200; ++i) { + for (int j = 0; j < 30; ++j) { + for (int k = 0; k < 70; ++k) { + VERIFY_IS_APPROX(out(i, j, k), static_cast<double>(in1(i, j, k) + in2(i, j, k) * 3.14f)); + } + } + } +} void test_multithread_compound_assignment() { @@ -232,6 +275,273 @@ void test_multithread_contraction_agrees_with_singlethread() { } } +// Apply Sqrt to all output elements. +struct SqrtOutputKernel { + template <typename Index, typename Scalar> + EIGEN_ALWAYS_INLINE void operator()( + const internal::blas_data_mapper<Scalar, Index, ColMajor>& output_mapper, + const TensorContractionParams&, Index, Index, Index num_rows, + Index num_cols) const { + for (int i = 0; i < num_rows; ++i) { + for (int j = 0; j < num_cols; ++j) { + output_mapper(i, j) = std::sqrt(output_mapper(i, j)); + } + } + } +}; + +template <int DataLayout> +static void test_multithread_contraction_with_output_kernel() { + typedef Tensor<float, 1>::DimensionPair DimPair; + + const int num_threads = internal::random<int>(2, 11); + ThreadPool threads(num_threads); + Eigen::ThreadPoolDevice device(&threads, num_threads); + + Tensor<float, 4, DataLayout> t_left(30, 50, 8, 31); + Tensor<float, 5, DataLayout> t_right(8, 31, 7, 20, 10); + Tensor<float, 5, DataLayout> t_result(30, 50, 7, 20, 10); + + t_left.setRandom(); + t_right.setRandom(); + // Put trash in t_result to verify contraction clears output memory. + t_result.setRandom(); + + // Add a little offset so that the results won't be close to zero. 
+ t_left += t_left.constant(1.0f); + t_right += t_right.constant(1.0f); + + typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf; + MapXf m_left(t_left.data(), 1500, 248); + MapXf m_right(t_right.data(), 248, 1400); + Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(1500, 1400); + + // this contraction should be equivalent to a single matrix multiplication + Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(3, 1)}}); + + // compute results by separate methods + t_result.device(device) = t_left.contract(t_right, dims, SqrtOutputKernel()); + + m_result = m_left * m_right; + + for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) { + VERIFY(&t_result.data()[i] != &m_result.data()[i]); + VERIFY_IS_APPROX(t_result.data()[i], std::sqrt(m_result.data()[i])); + } +} + +template<int DataLayout> +void test_async_multithread_contraction_agrees_with_singlethread() +{ + int contract_size = internal::random<int>(100, 500); + + Tensor<float, 3, DataLayout> left(internal::random<int>(10, 40), + contract_size, + internal::random<int>(10, 40)); + + Tensor<float, 4, DataLayout> right( + internal::random<int>(1, 20), internal::random<int>(1, 20), contract_size, + internal::random<int>(1, 20)); + + left.setRandom(); + right.setRandom(); + + // add constants to shift values away from 0 for more precision + left += left.constant(1.5f); + right += right.constant(1.5f); + + typedef Tensor<float, 1>::DimensionPair DimPair; + Eigen::array<DimPair, 1> dims({{DimPair(1, 2)}}); + + Eigen::ThreadPool tp(internal::random<int>(2, 11)); + Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(8, 32)); + + Tensor<float, 5, DataLayout> st_result; + st_result = left.contract(right, dims); + + Tensor<float, 5, DataLayout> tp_result(st_result.dimensions()); + + Eigen::Barrier barrier(1); + tp_result.device(thread_pool_device, [&barrier]() { barrier.Notify(); }) = + left.contract(right, dims); + barrier.Wait(); + + 
VERIFY(dimensions_match(st_result.dimensions(), tp_result.dimensions())); + for (ptrdiff_t i = 0; i < st_result.size(); i++) { + // if both of the values are very small, then do nothing (because the test + // will fail due to numerical precision issues when values are small) + if (numext::abs(st_result.data()[i] - tp_result.data()[i]) >= 1e-4f) { + VERIFY_IS_APPROX(st_result.data()[i], tp_result.data()[i]); + } + } +} + +// We are triggering 'evalShardedByInnerDim' optimization. +template <int DataLayout> +static void test_sharded_by_inner_dim_contraction() +{ + typedef Tensor<float, 1>::DimensionPair DimPair; + + const int num_threads = internal::random<int>(4, 16); + ThreadPool threads(num_threads); + Eigen::ThreadPoolDevice device(&threads, num_threads); + + Tensor<float, 2, DataLayout> t_left(2, 10000); + Tensor<float, 2, DataLayout> t_right(10000, 10); + Tensor<float, 2, DataLayout> t_result(2, 10); + + t_left.setRandom(); + t_right.setRandom(); + // Put trash in t_result to verify contraction clears output memory. + t_result.setRandom(); + + // Add a little offset so that the results won't be close to zero. + t_left += t_left.constant(1.0f); + t_right += t_right.constant(1.0f); + + typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf; + MapXf m_left(t_left.data(), 2, 10000); + MapXf m_right(t_right.data(), 10000, 10); + Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(2, 10); + + // this contraction should be equivalent to a single matrix multiplication + Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}}); + + // compute results by separate methods + t_result.device(device) = t_left.contract(t_right, dims); + m_result = m_left * m_right; + + for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) { + VERIFY_IS_APPROX(t_result.data()[i], m_result.data()[i]); + } +} + +// We are triggering 'evalShardedByInnerDim' optimization with output kernel. 
+template <int DataLayout> +static void test_sharded_by_inner_dim_contraction_with_output_kernel() +{ + typedef Tensor<float, 1>::DimensionPair DimPair; + + const int num_threads = internal::random<int>(4, 16); + ThreadPool threads(num_threads); + Eigen::ThreadPoolDevice device(&threads, num_threads); + + Tensor<float, 2, DataLayout> t_left(2, 10000); + Tensor<float, 2, DataLayout> t_right(10000, 10); + Tensor<float, 2, DataLayout> t_result(2, 10); + + t_left.setRandom(); + t_right.setRandom(); + // Put trash in t_result to verify contraction clears output memory. + t_result.setRandom(); + + // Add a little offset so that the results won't be close to zero. + t_left += t_left.constant(1.0f); + t_right += t_right.constant(1.0f); + + typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf; + MapXf m_left(t_left.data(), 2, 10000); + MapXf m_right(t_right.data(), 10000, 10); + Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(2, 10); + + // this contraction should be equivalent to a single matrix multiplication + Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}}); + + // compute results by separate methods + t_result.device(device) = t_left.contract(t_right, dims, SqrtOutputKernel()); + m_result = m_left * m_right; + + for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) { + VERIFY_IS_APPROX(t_result.data()[i], std::sqrt(m_result.data()[i])); + } +} + +// We are triggering 'evalShardedByInnerDim' optimization. 
+template <int DataLayout> +static void test_async_sharded_by_inner_dim_contraction() +{ + typedef Tensor<float, 1>::DimensionPair DimPair; + + const int num_threads = internal::random<int>(4, 16); + ThreadPool threads(num_threads); + Eigen::ThreadPoolDevice device(&threads, num_threads); + + Tensor<float, 2, DataLayout> t_left(2, 10000); + Tensor<float, 2, DataLayout> t_right(10000, 10); + Tensor<float, 2, DataLayout> t_result(2, 10); + + t_left.setRandom(); + t_right.setRandom(); + // Put trash in t_result to verify contraction clears output memory. + t_result.setRandom(); + + // Add a little offset so that the results won't be close to zero. + t_left += t_left.constant(1.0f); + t_right += t_right.constant(1.0f); + + typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf; + MapXf m_left(t_left.data(), 2, 10000); + MapXf m_right(t_right.data(), 10000, 10); + Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(2, 10); + + // this contraction should be equivalent to a single matrix multiplication + Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}}); + + // compute results by separate methods + Eigen::Barrier barrier(1); + t_result.device(device, [&barrier]() { barrier.Notify(); }) = + t_left.contract(t_right, dims); + barrier.Wait(); + + m_result = m_left * m_right; + + for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) { + VERIFY_IS_APPROX(t_result.data()[i], m_result.data()[i]); + } +} + +// We are triggering 'evalShardedByInnerDim' optimization with output kernel. 
+template <int DataLayout> +static void test_async_sharded_by_inner_dim_contraction_with_output_kernel() +{ + typedef Tensor<float, 1>::DimensionPair DimPair; + + const int num_threads = internal::random<int>(4, 16); + ThreadPool threads(num_threads); + Eigen::ThreadPoolDevice device(&threads, num_threads); + + Tensor<float, 2, DataLayout> t_left(2, 10000); + Tensor<float, 2, DataLayout> t_right(10000, 10); + Tensor<float, 2, DataLayout> t_result(2, 10); + + t_left.setRandom(); + t_right.setRandom(); + // Put trash in t_result to verify contraction clears output memory. + t_result.setRandom(); + + // Add a little offset so that the results won't be close to zero. + t_left += t_left.constant(1.0f); + t_right += t_right.constant(1.0f); + + typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf; + MapXf m_left(t_left.data(), 2, 10000); + MapXf m_right(t_right.data(), 10000, 10); + Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(2, 10); + + // this contraction should be equivalent to a single matrix multiplication + Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}}); + + // compute results by separate methods + Eigen::Barrier barrier(1); + t_result.device(device, [&barrier]() { barrier.Notify(); }) = + t_left.contract(t_right, dims, SqrtOutputKernel()); + barrier.Wait(); + m_result = m_left * m_right; + + for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) { + VERIFY_IS_APPROX(t_result.data()[i], std::sqrt(m_result.data()[i])); + } +} template<int DataLayout> void test_full_contraction() { @@ -320,14 +630,14 @@ void test_multithread_random() } template<int DataLayout> -void test_multithread_shuffle() +void test_multithread_shuffle(Allocator* allocator) { Tensor<float, 4, DataLayout> tensor(17,5,7,11); tensor.setRandom(); const int num_threads = internal::random<int>(2, 11); ThreadPool threads(num_threads); - Eigen::ThreadPoolDevice device(&threads, num_threads); + Eigen::ThreadPoolDevice device(&threads, num_threads, allocator); 
Tensor<float, 4, DataLayout> shuffle(7,5,11,17); array<ptrdiff_t, 4> shuffles = {{2,1,3,0}}; @@ -344,10 +654,26 @@ void test_multithread_shuffle() } } +void test_threadpool_allocate(TestAllocator* allocator) +{ + const int num_threads = internal::random<int>(2, 11); + const int num_allocs = internal::random<int>(2, 11); + ThreadPool threads(num_threads); + Eigen::ThreadPoolDevice device(&threads, num_threads, allocator); + + for (int a = 0; a < num_allocs; ++a) { + void* ptr = device.allocate(512); + device.deallocate(ptr); + } + VERIFY(allocator != NULL); + VERIFY_IS_EQUAL(allocator->alloc_count(), num_allocs); + VERIFY_IS_EQUAL(allocator->dealloc_count(), num_allocs); +} -void test_cxx11_tensor_thread_pool() +EIGEN_DECLARE_TEST(cxx11_tensor_thread_pool) { CALL_SUBTEST_1(test_multithread_elementwise()); + CALL_SUBTEST_1(test_async_multithread_elementwise()); CALL_SUBTEST_1(test_multithread_compound_assignment()); CALL_SUBTEST_2(test_multithread_contraction<ColMajor>()); @@ -355,19 +681,41 @@ void test_cxx11_tensor_thread_pool() CALL_SUBTEST_3(test_multithread_contraction_agrees_with_singlethread<ColMajor>()); CALL_SUBTEST_3(test_multithread_contraction_agrees_with_singlethread<RowMajor>()); + CALL_SUBTEST_3(test_multithread_contraction_with_output_kernel<ColMajor>()); + CALL_SUBTEST_3(test_multithread_contraction_with_output_kernel<RowMajor>()); + + CALL_SUBTEST_4(test_async_multithread_contraction_agrees_with_singlethread<ColMajor>()); + CALL_SUBTEST_4(test_async_multithread_contraction_agrees_with_singlethread<RowMajor>()); + + // Test EvalShardedByInnerDimContext parallelization strategy. 
+ CALL_SUBTEST_5(test_sharded_by_inner_dim_contraction<ColMajor>()); + CALL_SUBTEST_5(test_sharded_by_inner_dim_contraction<RowMajor>()); + CALL_SUBTEST_5(test_sharded_by_inner_dim_contraction_with_output_kernel<ColMajor>()); + CALL_SUBTEST_5(test_sharded_by_inner_dim_contraction_with_output_kernel<RowMajor>()); + + CALL_SUBTEST_6(test_async_sharded_by_inner_dim_contraction<ColMajor>()); + CALL_SUBTEST_6(test_async_sharded_by_inner_dim_contraction<RowMajor>()); + CALL_SUBTEST_6(test_async_sharded_by_inner_dim_contraction_with_output_kernel<ColMajor>()); + CALL_SUBTEST_6(test_async_sharded_by_inner_dim_contraction_with_output_kernel<RowMajor>()); // Exercise various cases that have been problematic in the past. - CALL_SUBTEST_4(test_contraction_corner_cases<ColMajor>()); - CALL_SUBTEST_4(test_contraction_corner_cases<RowMajor>()); + CALL_SUBTEST_7(test_contraction_corner_cases<ColMajor>()); + CALL_SUBTEST_7(test_contraction_corner_cases<RowMajor>()); + + CALL_SUBTEST_8(test_full_contraction<ColMajor>()); + CALL_SUBTEST_8(test_full_contraction<RowMajor>()); + + CALL_SUBTEST_9(test_multithreaded_reductions<ColMajor>()); + CALL_SUBTEST_9(test_multithreaded_reductions<RowMajor>()); - CALL_SUBTEST_4(test_full_contraction<ColMajor>()); - CALL_SUBTEST_4(test_full_contraction<RowMajor>()); + CALL_SUBTEST_10(test_memcpy()); + CALL_SUBTEST_10(test_multithread_random()); - CALL_SUBTEST_5(test_multithreaded_reductions<ColMajor>()); - CALL_SUBTEST_5(test_multithreaded_reductions<RowMajor>()); + TestAllocator test_allocator; + CALL_SUBTEST_11(test_multithread_shuffle<ColMajor>(NULL)); + CALL_SUBTEST_11(test_multithread_shuffle<RowMajor>(&test_allocator)); + CALL_SUBTEST_11(test_threadpool_allocate(&test_allocator)); - CALL_SUBTEST_6(test_memcpy()); - CALL_SUBTEST_6(test_multithread_random()); - CALL_SUBTEST_6(test_multithread_shuffle<ColMajor>()); - CALL_SUBTEST_6(test_multithread_shuffle<RowMajor>()); + // Force CMake to split this test. 
+ // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8;9;10;11 } |