diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
new file mode 100644
index 000000000..f01d77c0a
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -0,0 +1,288 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
+#define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
+
+namespace Eigen {
+
+/** \class TensorExecutor
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief The tensor executor class.
+ *
+ * This class is responsible for launching the evaluation of the expression
+ * on the specified computing device.
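+ *
+ * It is rarely invoked directly: assigning an expression to a tensor, or to
+ * a tensor's device() wrapper, routes through it. A minimal sketch (the
+ * sizes and the expression are illustrative only):
+ *
+ * \code
+ * Eigen::Tensor<float, 2> in(64, 64), out(64, 64);
+ * in.setRandom();
+ * out = in * in.constant(2.0f);       // evaluated through TensorExecutor
+ * // out.device(dev) = ...;           // picks the executor matching 'dev'
+ * \endcode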
+ */
+namespace internal {
+
+// Default strategy: the expression is evaluated with a single CPU thread.
+template<typename Expression, typename Device, bool Vectorizable>
+class TensorExecutor
+{
+ public:
+ typedef typename Expression::Index Index;
+ EIGEN_DEVICE_FUNC
+ static inline void run(const Expression& expr, const Device& device = Device())
+ {
+ TensorEvaluator<Expression, Device> evaluator(expr, device);
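+    // evalSubExprsIfNeeded() prepares any sub-expressions that must be
+    // materialized first; it returns false when doing so already produced
+    // the final result, in which case no per-coefficient assignment remains.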
+ const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+ if (needs_assign)
+ {
+ const Index size = array_prod(evaluator.dimensions());
+ for (Index i = 0; i < size; ++i) {
+ evaluator.evalScalar(i);
+ }
+ }
+ evaluator.cleanup();
+ }
+};
+
+
+template<typename Expression>
+class TensorExecutor<Expression, DefaultDevice, true>
+{
+ public:
+ typedef typename Expression::Index Index;
+ EIGEN_DEVICE_FUNC
+ static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice())
+ {
+ TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
+ const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+ if (needs_assign)
+ {
+ const Index size = array_prod(evaluator.dimensions());
+ const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size;
+      // Give the compiler a strong hint to unroll the loop, but don't insist:
+      // if evaluating a packet is expensive, the compiler should prefer
+      // inlining over unrolling.
+ const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize;
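+      // Worked example (numbers illustrative): with PacketSize == 4 and
+      // size == 103, UnrolledSize == (103 / 16) * 16 == 96, so the unrolled
+      // loop covers [0, 96), the vectorized loop [96, 100), and the scalar
+      // loop finishes [100, 103).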
+ for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) {
+ for (Index j = 0; j < 4; j++) {
+ evaluator.evalPacket(i + j * PacketSize);
+ }
+ }
+ const Index VectorizedSize = (size / PacketSize) * PacketSize;
+ for (Index i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
+ evaluator.evalPacket(i);
+ }
+ for (Index i = VectorizedSize; i < size; ++i) {
+ evaluator.evalScalar(i);
+ }
+ }
+ evaluator.cleanup();
+ }
+};
+
+
+
+// Multicore strategy: the index space is partitioned and each partition is executed on a single core.
+#ifdef EIGEN_USE_THREADS
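+// EvalRange evaluates the coefficients in [first, last) on the calling
+// thread (working on a local copy of the evaluator); the ThreadPoolDevice
+// executor below partitions the index space into such ranges and hands each
+// one to a pool thread.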
+template <typename Evaluator, typename Index, bool Vectorizable>
+struct EvalRange {
+ static void run(Evaluator* evaluator_in, const Index first, const Index last) {
+ Evaluator evaluator = *evaluator_in;
+ eigen_assert(last >= first);
+ for (Index i = first; i < last; ++i) {
+ evaluator.evalScalar(i);
+ }
+ }
+
+ static Index alignBlockSize(Index size) {
+ return size;
+ }
+};
+
+template <typename Evaluator, typename Index>
+struct EvalRange<Evaluator, Index, true> {
+ static const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
+
+ static void run(Evaluator* evaluator_in, const Index first, const Index last) {
+ Evaluator evaluator = *evaluator_in;
+ eigen_assert(last >= first);
+ Index i = first;
+ if (last - first >= PacketSize) {
+ eigen_assert(first % PacketSize == 0);
+ Index last_chunk_offset = last - 4 * PacketSize;
+      // Give the compiler a strong hint to unroll the loop, but don't insist:
+      // if evaluating a packet is expensive, the compiler should prefer
+      // inlining over unrolling.
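+      // E.g. (numbers illustrative): with first == 0, last == 35 and
+      // PacketSize == 4, the unrolled loop runs for i == 0 and i == 16,
+      // covering [0, 32); the vectorized loop below is then skipped and the
+      // scalar loop finishes [32, 35).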
+ for (; i <= last_chunk_offset; i += 4*PacketSize) {
+ for (Index j = 0; j < 4; j++) {
+ evaluator.evalPacket(i + j * PacketSize);
+ }
+ }
+ last_chunk_offset = last - PacketSize;
+ for (; i <= last_chunk_offset; i += PacketSize) {
+ evaluator.evalPacket(i);
+ }
+ }
+ for (; i < last; ++i) {
+ evaluator.evalScalar(i);
+ }
+ }
+
+ static Index alignBlockSize(Index size) {
+ // Align block size to packet size and account for unrolling in run above.
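+    // E.g. (numbers illustrative): with PacketSize == 8, a block of 300
+    // coefficients rounds up to 320, a multiple of 4 * PacketSize == 32,
+    // while a block of 100 (below 16 * PacketSize == 128) only rounds up to
+    // the next multiple of PacketSize, 104.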
+ if (size >= 16 * PacketSize) {
+ return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1);
+ }
+    // Below that threshold, aligning to 4 * PacketSize would grow the block
+    // by more than 25%, so only align to PacketSize.
+    return (size + PacketSize - 1) & ~(PacketSize - 1);
+ }
+};
+
+template <typename Expression, bool Vectorizable>
+class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> {
+ public:
+ typedef typename Expression::Index Index;
+ static inline void run(const Expression& expr, const ThreadPoolDevice& device)
+ {
+ typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
+ Evaluator evaluator(expr, device);
+ const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+ if (needs_assign)
+ {
+ const Index size = array_prod(evaluator.dimensions());
+#if !defined(EIGEN_USE_SIMPLE_THREAD_POOL)
+ device.parallelFor(size, evaluator.costPerCoeff(Vectorizable),
+ EvalRange<Evaluator, Index, Vectorizable>::alignBlockSize,
+ [&evaluator](Index first, Index last) {
+ EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, first, last);
+ });
+#else
+ size_t num_threads = device.numThreads();
+ if (num_threads > 1) {
+ num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
+ size, evaluator.costPerCoeff(Vectorizable), num_threads);
+ }
+ if (num_threads == 1) {
+ EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, 0, size);
+ } else {
+ const Index PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
+ Index blocksz = std::ceil<Index>(static_cast<float>(size)/num_threads) + PacketSize - 1;
+ const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
+ const Index numblocks = size / blocksize;
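+        // E.g. (numbers illustrative): size == 1000, num_threads == 4 and
+        // PacketSize == 8 give blocksz == 257, blocksize == 256 and
+        // numblocks == 3; the [768, 1000) remainder is evaluated on the
+        // calling thread below.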
+
+ Barrier barrier(numblocks);
+      for (Index i = 0; i < numblocks; ++i) {
+ device.enqueue_with_barrier(
+ &barrier, &EvalRange<Evaluator, Index, Vectorizable>::run,
+ &evaluator, i * blocksize, (i + 1) * blocksize);
+ }
+ if (numblocks * blocksize < size) {
+ EvalRange<Evaluator, Index, Vectorizable>::run(
+ &evaluator, numblocks * blocksize, size);
+ }
+ barrier.Wait();
+ }
+#endif  // !defined(EIGEN_USE_SIMPLE_THREAD_POOL)
+ }
+ evaluator.cleanup();
+ }
+};
+#endif // EIGEN_USE_THREADS
+
+
+// GPU: the evaluation of the expression is offloaded to a GPU.
+#if defined(EIGEN_USE_GPU)
+
+template <typename Expression, bool Vectorizable>
+class TensorExecutor<Expression, GpuDevice, Vectorizable> {
+ public:
+ typedef typename Expression::Index Index;
+ static void run(const Expression& expr, const GpuDevice& device);
+};
+
+
+#if defined(__CUDACC__)
+template <typename Evaluator, typename Index, bool Vectorizable>
+struct EigenMetaKernelEval {
+ static __device__ EIGEN_ALWAYS_INLINE
+ void run(Evaluator& eval, Index first, Index last, Index step_size) {
+ for (Index i = first; i < last; i += step_size) {
+ eval.evalScalar(i);
+ }
+ }
+};
+
+template <typename Evaluator, typename Index>
+struct EigenMetaKernelEval<Evaluator, Index, true> {
+ static __device__ EIGEN_ALWAYS_INLINE
+ void run(Evaluator& eval, Index first, Index last, Index step_size) {
+ const Index PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
+ const Index vectorized_size = (last / PacketSize) * PacketSize;
+ const Index vectorized_step_size = step_size * PacketSize;
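+    // E.g. (numbers illustrative): with PacketSize == 4, step_size == 2 and
+    // last == 11, vectorized_size == 8: the thread with first == 0 evaluates
+    // the packet at 0, the thread with first == 1 the packet at 4, and the
+    // scalar tail [8, 11) is interleaved across both threads.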
+
+ // Use the vector path
+ for (Index i = first * PacketSize; i < vectorized_size;
+ i += vectorized_step_size) {
+ eval.evalPacket(i);
+ }
+ for (Index i = vectorized_size + first; i < last; i += step_size) {
+ eval.evalScalar(i);
+ }
+ }
+};
+
+template <typename Evaluator, typename Index>
+__global__ void
+__launch_bounds__(1024)
+EigenMetaKernel(Evaluator eval, Index size) {
+
+ const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
+ const Index step_size = blockDim.x * gridDim.x;
+
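+  // Take the packet path only when the evaluator supports packet access and
+  // its data is suitably aligned; otherwise fall back to scalar evaluation.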
+ const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
+ EigenMetaKernelEval<Evaluator, Index, vectorizable>::run(eval, first_index, size, step_size);
+}
+
+/*static*/
+template <typename Expression, bool Vectorizable>
+inline void TensorExecutor<Expression, GpuDevice, Vectorizable>::run(
+ const Expression& expr, const GpuDevice& device) {
+ TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
+ const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+ if (needs_assign) {
+ const int block_size = device.maxCudaThreadsPerBlock();
+ const int max_blocks = device.getNumCudaMultiProcessors() *
+ device.maxCudaThreadsPerMultiProcessor() / block_size;
+ const Index size = array_prod(evaluator.dimensions());
+    // Create at least one block to ensure we won't crash when TensorFlow calls with tensors of size 0.
+ const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);
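+    // E.g. (numbers illustrative): a device with 16 multiprocessors and 2048
+    // threads per multiprocessor at block_size == 1024 yields max_blocks ==
+    // 32; a tensor of 10000 coefficients then launches divup(10000, 1024) ==
+    // 10 blocks, and the grid-stride loop in EigenMetaKernel covers any
+    // elements beyond num_blocks * block_size.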
+
+ LAUNCH_CUDA_KERNEL(
+ (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>),
+ num_blocks, block_size, 0, device, evaluator, size);
+ }
+ evaluator.cleanup();
+}
+
+#endif // __CUDACC__
+#endif // EIGEN_USE_GPU
+
+// SYCL Executor policy
+#ifdef EIGEN_USE_SYCL
+
+template <typename Expression, bool Vectorizable>
+class TensorExecutor<Expression, SyclDevice, Vectorizable> {
+public:
+ static inline void run(const Expression &expr, const SyclDevice &device) {
+    // Delegate the evaluation of the whole expression to the TensorSycl module.
+ TensorSycl::run(expr, device);
+ }
+};
+
+#endif  // EIGEN_USE_SYCL
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H