diff options
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h')
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h | 288 |
1 files changed, 288 insertions, 0 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h new file mode 100644 index 000000000..f01d77c0a --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -0,0 +1,288 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H +#define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H + +namespace Eigen { + +/** \class TensorExecutor + * \ingroup CXX11_Tensor_Module + * + * \brief The tensor executor class. + * + * This class is responsible for launch the evaluation of the expression on + * the specified computing device. + */ +namespace internal { + +// Default strategy: the expression is evaluated with a single cpu thread. +template<typename Expression, typename Device, bool Vectorizable> +class TensorExecutor +{ + public: + typedef typename Expression::Index Index; + EIGEN_DEVICE_FUNC + static inline void run(const Expression& expr, const Device& device = Device()) + { + TensorEvaluator<Expression, Device> evaluator(expr, device); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) + { + const Index size = array_prod(evaluator.dimensions()); + for (Index i = 0; i < size; ++i) { + evaluator.evalScalar(i); + } + } + evaluator.cleanup(); + } +}; + + +template<typename Expression> +class TensorExecutor<Expression, DefaultDevice, true> +{ + public: + typedef typename Expression::Index Index; + EIGEN_DEVICE_FUNC + static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice()) + { + TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) + { + const Index size = array_prod(evaluator.dimensions()); + const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size; + // Give the compiler a strong hint to unroll the loop. But don't insist + // on unrolling, because if the function is expensive the compiler should not + // unroll the loop at the expense of inlining. + const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize; + for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) { + for (Index j = 0; j < 4; j++) { + evaluator.evalPacket(i + j * PacketSize); + } + } + const Index VectorizedSize = (size / PacketSize) * PacketSize; + for (Index i = UnrolledSize; i < VectorizedSize; i += PacketSize) { + evaluator.evalPacket(i); + } + for (Index i = VectorizedSize; i < size; ++i) { + evaluator.evalScalar(i); + } + } + evaluator.cleanup(); + } +}; + + + +// Multicore strategy: the index space is partitioned and each partition is executed on a single core +#ifdef EIGEN_USE_THREADS +template <typename Evaluator, typename Index, bool Vectorizable> +struct EvalRange { + static void run(Evaluator* evaluator_in, const Index first, const Index last) { + Evaluator evaluator = *evaluator_in; + eigen_assert(last >= first); + for (Index i = first; i < last; ++i) { + evaluator.evalScalar(i); + } + } + + static Index alignBlockSize(Index size) { + return size; + } +}; + +template <typename Evaluator, typename Index> +struct EvalRange<Evaluator, Index, true> { + static const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size; + + static void run(Evaluator* evaluator_in, const Index first, const Index last) { + Evaluator evaluator = *evaluator_in; + eigen_assert(last >= first); + Index i = first; + if (last - first >= PacketSize) { + eigen_assert(first % PacketSize == 0); + Index last_chunk_offset = last - 4 * PacketSize; + // Give the compiler a strong hint to unroll the loop. But don't insist + // on unrolling, because if the function is expensive the compiler should not + // unroll the loop at the expense of inlining. + for (; i <= last_chunk_offset; i += 4*PacketSize) { + for (Index j = 0; j < 4; j++) { + evaluator.evalPacket(i + j * PacketSize); + } + } + last_chunk_offset = last - PacketSize; + for (; i <= last_chunk_offset; i += PacketSize) { + evaluator.evalPacket(i); + } + } + for (; i < last; ++i) { + evaluator.evalScalar(i); + } + } + + static Index alignBlockSize(Index size) { + // Align block size to packet size and account for unrolling in run above. + if (size >= 16 * PacketSize) { + return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1); + } + // Aligning to 4 * PacketSize would increase block size by more than 25%. + return (size + PacketSize - 1) & ~(PacketSize - 1); + } +}; + +template <typename Expression, bool Vectorizable> +class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> { + public: + typedef typename Expression::Index Index; + static inline void run(const Expression& expr, const ThreadPoolDevice& device) + { + typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator; + Evaluator evaluator(expr, device); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) + { + const Index size = array_prod(evaluator.dimensions()); +#if !defined(EIGEN_USE_SIMPLE_THREAD_POOL) + device.parallelFor(size, evaluator.costPerCoeff(Vectorizable), + EvalRange<Evaluator, Index, Vectorizable>::alignBlockSize, + [&evaluator](Index first, Index last) { + EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, first, last); + }); +#else + size_t num_threads = device.numThreads(); + if (num_threads > 1) { + num_threads = TensorCostModel<ThreadPoolDevice>::numThreads( + size, evaluator.costPerCoeff(Vectorizable), num_threads); + } + if (num_threads == 1) { + EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, 0, size); + } else { + const Index PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1; + Index blocksz = std::ceil<Index>(static_cast<float>(size)/num_threads) + PacketSize - 1; + const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize))); + const Index numblocks = size / blocksize; + + Barrier barrier(numblocks); + for (int i = 0; i < numblocks; ++i) { + device.enqueue_with_barrier( + &barrier, &EvalRange<Evaluator, Index, Vectorizable>::run, + &evaluator, i * blocksize, (i + 1) * blocksize); + } + if (numblocks * blocksize < size) { + EvalRange<Evaluator, Index, Vectorizable>::run( + &evaluator, numblocks * blocksize, size); + } + barrier.Wait(); + } +#endif // defined(!EIGEN_USE_SIMPLE_THREAD_POOL) + } + evaluator.cleanup(); + } +}; +#endif // EIGEN_USE_THREADS + + +// GPU: the evaluation of the expression is offloaded to a GPU. +#if defined(EIGEN_USE_GPU) + +template <typename Expression, bool Vectorizable> +class TensorExecutor<Expression, GpuDevice, Vectorizable> { + public: + typedef typename Expression::Index Index; + static void run(const Expression& expr, const GpuDevice& device); +}; + + +#if defined(__CUDACC__) +template <typename Evaluator, typename Index, bool Vectorizable> +struct EigenMetaKernelEval { + static __device__ EIGEN_ALWAYS_INLINE + void run(Evaluator& eval, Index first, Index last, Index step_size) { + for (Index i = first; i < last; i += step_size) { + eval.evalScalar(i); + } + } +}; + +template <typename Evaluator, typename Index> +struct EigenMetaKernelEval<Evaluator, Index, true> { + static __device__ EIGEN_ALWAYS_INLINE + void run(Evaluator& eval, Index first, Index last, Index step_size) { + const Index PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size; + const Index vectorized_size = (last / PacketSize) * PacketSize; + const Index vectorized_step_size = step_size * PacketSize; + + // Use the vector path + for (Index i = first * PacketSize; i < vectorized_size; + i += vectorized_step_size) { + eval.evalPacket(i); + } + for (Index i = vectorized_size + first; i < last; i += step_size) { + eval.evalScalar(i); + } + } +}; + +template <typename Evaluator, typename Index> +__global__ void +__launch_bounds__(1024) +EigenMetaKernel(Evaluator eval, Index size) { + + const Index first_index = blockIdx.x * blockDim.x + threadIdx.x; + const Index step_size = blockDim.x * gridDim.x; + + const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned; + EigenMetaKernelEval<Evaluator, Index, vectorizable>::run(eval, first_index, size, step_size); +} + +/*static*/ +template <typename Expression, bool Vectorizable> +inline void TensorExecutor<Expression, GpuDevice, Vectorizable>::run( + const Expression& expr, const GpuDevice& device) { + TensorEvaluator<Expression, GpuDevice> evaluator(expr, device); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) { + const int block_size = device.maxCudaThreadsPerBlock(); + const int max_blocks = device.getNumCudaMultiProcessors() * + device.maxCudaThreadsPerMultiProcessor() / block_size; + const Index size = array_prod(evaluator.dimensions()); + // Create a least one block to ensure we won't crash when tensorflow calls with tensors of size 0. + const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1); + + LAUNCH_CUDA_KERNEL( + (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>), + num_blocks, block_size, 0, device, evaluator, size); + } + evaluator.cleanup(); +} + +#endif // __CUDACC__ +#endif // EIGEN_USE_GPU + +// SYCL Executor policy +#ifdef EIGEN_USE_SYCL + +template <typename Expression, bool Vectorizable> +class TensorExecutor<Expression, SyclDevice, Vectorizable> { +public: + static inline void run(const Expression &expr, const SyclDevice &device) { + // call TensorSYCL module + TensorSycl::run(expr, device); + } +}; + +#endif + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H |