path: root/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h')
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h  744
1 file changed, 542 insertions(+), 202 deletions(-)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
index 3daecb045..474eba06f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
@@ -11,232 +11,572 @@
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
/*****************************************************************
- * TensorSyclPlaceHolderExpr.h
+ * TensorReductionSycl.h
*
* \brief:
- * This is the specialisation of the placeholder expression based on the
- * operation type
+ * This is the specialization of the reduction operation. A two-phase reduction
+ * approach is used because the GPU has no global synchronization of global memory
+ * across different work-groups/thread blocks. To solve this, two kernels are created:
+ * in the first kernel each work-group/thread block reduces its portion of the data
+ * locally and saves the partial result to global memory. In the second phase (global
+ * reduction) a single work-group/thread block reduces the intermediate results into
+ * one element. Here is an NVIDIA presentation explaining the optimized two-phase
+ * reduction algorithm on the GPU:
+ * https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf
*
-*****************************************************************/
+ *****************************************************************/
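
For illustration only, here is a minimal serial sketch of the two-phase data flow described above (the chunking into "work-groups" and the helper two_phase_sum are assumptions made for this example, not part of the SYCL implementation below):

#include <algorithm>
#include <cstddef>
#include <numeric>
#include <vector>

// Phase 1: each "work-group" reduces its chunk of the input to one partial value,
// stored in an intermediate global buffer. Phase 2: a single "work-group" reduces
// the partial values to the final result.
double two_phase_sum(const std::vector<double>& in, std::size_t num_groups) {
  std::vector<double> partials(num_groups, 0.0);
  const std::size_t chunk = (in.size() + num_groups - 1) / num_groups;
  for (std::size_t g = 0; g < num_groups; ++g) {
    const std::size_t begin = std::min(g * chunk, in.size());
    const std::size_t end = std::min(begin + chunk, in.size());
    partials[g] = std::accumulate(in.begin() + begin, in.begin() + end, 0.0);
  }
  return std::accumulate(partials.begin(), partials.end(), 0.0);
}
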
#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
-
namespace Eigen {
+namespace TensorSycl {
namespace internal {
-template<typename CoeffReturnType, typename KernelName> struct syclGenericBufferReducer{
-template<typename BufferTOut, typename BufferTIn>
-static void run(BufferTOut* bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
- do {
- auto f = [length, local, bufOut, &bufI](cl::sycl::handler& h) mutable {
- cl::sycl::nd_range<1> r{cl::sycl::range<1>{std::max(length, local)},
- cl::sycl::range<1>{std::min(length, local)}};
- /* Two accessors are used: one to the buffer that is being reduced,
- * and a second to local memory, used to store intermediate data. */
- auto aI =
- bufI.template get_access<cl::sycl::access::mode::read_write>(h);
- auto aOut =
- bufOut->template get_access<cl::sycl::access::mode::discard_write>(h);
- cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write,
- cl::sycl::access::target::local>
- scratch(cl::sycl::range<1>(local), h);
-
- /* The parallel_for invocation chosen is the variant with an nd_item
- * parameter, since the code requires barriers for correctness. */
- h.parallel_for<KernelName>(
- r, [aOut, aI, scratch, local, length](cl::sycl::nd_item<1> id) {
- size_t globalid = id.get_global(0);
- size_t localid = id.get_local(0);
- /* All threads collectively read from global memory into local.
- * The barrier ensures all threads' IO is resolved before
- * execution continues (strictly speaking, all threads within
- * a single work-group - there is no co-ordination between
- * work-groups, only work-items). */
- if (globalid < length) {
- scratch[localid] = aI[globalid];
- }
- id.barrier(cl::sycl::access::fence_space::local_space);
-
- /* Apply the reduction operation between the current local
- * id and the one on the other half of the vector. */
- if (globalid < length) {
- int min = (length < local) ? length : local;
- for (size_t offset = min / 2; offset > 0; offset /= 2) {
- if (localid < offset) {
- scratch[localid] += scratch[localid + offset];
- }
- id.barrier(cl::sycl::access::fence_space::local_space);
- }
- /* The final result will be stored in local id 0. */
- if (localid == 0) {
- aI[id.get_group(0)] = scratch[localid];
- if((length<=local) && globalid ==0){
- aOut[globalid]=scratch[localid];
- }
- }
- }
- });
- };
- dev.m_queue.submit(f);
- dev.m_queue.throw_asynchronous();
-
- /* At this point, you could queue::wait_and_throw() to ensure that
- * errors are caught quickly. However, this would likely impact
- * performance negatively. */
- length = length / local;
-
- } while (length > 1);
-
-
-
-}
+template <typename Op, typename CoeffReturnType, typename Index, bool Vectorizable>
+struct OpDefiner {
+ typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, Vectorizable>::PacketReturnType PacketReturnType;
+ typedef Op type;
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Op &op) { return op; }
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator,
+ const Index &) {
+ return accumulator;
+ }
};
-/// For now let's start with a full reducer
-/// Self is useless here because in expression construction we are going to treat reduction as a leafnode.
-/// we want to take reduction child and then build a construction and apply the full reducer function on it. Fullreducre applies the
-/// reduction operation on the child of the reduction. once it is done the reduction is an empty shell and can be thrown away and treated as
-// a leafNode.
-template <typename Self, typename Op, bool Vectorizable>
-struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {
+template <typename CoeffReturnType, typename Index>
+struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, false> {
+ typedef Eigen::internal::SumReducer<CoeffReturnType> type;
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer<CoeffReturnType> &) {
+ return type();
+ }
+
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType finalise_op(const CoeffReturnType &accumulator,
+ const Index &scale) {
+ ::Eigen::internal::scalar_quotient_op<CoeffReturnType> quotient_op;
+ return quotient_op(accumulator, CoeffReturnType(scale));
+ }
+};
+
+template <typename CoeffReturnType, typename Index>
+struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, true> {
+ typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, true>::PacketReturnType PacketReturnType;
+ typedef Eigen::internal::SumReducer<CoeffReturnType> type;
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer<CoeffReturnType> &) {
+ return type();
+ }
+
+ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator,
+ const Index &scale) {
+ return ::Eigen::internal::pdiv(accumulator, ::Eigen::internal::pset1<PacketReturnType>(CoeffReturnType(scale)));
+ }
+};
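
The OpDefiner specializations above rewrite the mean reducer as a sum reducer whose result is divided once at finalisation. A minimal scalar sketch of that idea (mean_via_sum is a hypothetical helper, not Eigen API):

#include <cassert>
#include <vector>

// Mean computed as "sum, then divide once by the element count at finalisation",
// mirroring how OpDefiner maps MeanReducer onto SumReducer plus finalise_op.
float mean_via_sum(const std::vector<float>& v) {
  float acc = 0.f;                            // SumReducer accumulation
  for (float x : v) acc += x;
  return acc / static_cast<float>(v.size());  // finalise_op(accumulator, scale)
}

int main() {
  assert(mean_via_sum({1.f, 2.f, 3.f, 4.f}) == 2.5f);
  return 0;
}
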
+
+template <typename CoeffReturnType, typename OpType, typename InputAccessor, typename OutputAccessor, typename Index,
+ Index local_range>
+struct SecondStepFullReducer {
+ typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+ LocalAccessor;
+ typedef OpDefiner<OpType, CoeffReturnType, Index, true> OpDef;
+ typedef typename OpDef::type Op;
+ LocalAccessor scratch;
+ InputAccessor aI;
+ OutputAccessor outAcc;
+ Op op;
+ SecondStepFullReducer(LocalAccessor scratch_, InputAccessor aI_, OutputAccessor outAcc_, OpType op_)
+ : scratch(scratch_), aI(aI_), outAcc(outAcc_), op(OpDef::get_op(op_)) {}
+
+ void operator()(cl::sycl::nd_item<1> itemID) {
+    // Our empirical research shows that the best performance is achieved when
+    // there is only one element per thread to reduce in the second step; the
+    // second-step reduction time is then almost negligible. Hence, in the second
+    // step the input size is fixed to the local size, so there is only one
+    // element read per thread. The algorithm must be changed if the number of
+    // elements to reduce per thread in the second step is greater than 1;
+    // otherwise, the result will be wrong.
+ const Index localid = itemID.get_local_id(0);
+ auto aInPtr = aI.get_pointer() + localid;
+ auto aOutPtr = outAcc.get_pointer();
+ CoeffReturnType *scratchptr = scratch.get_pointer();
+ CoeffReturnType accumulator = *aInPtr;
+
+ scratchptr[localid] = op.finalize(accumulator);
+ for (Index offset = itemID.get_local_range(0) / 2; offset > 0; offset /= 2) {
+ itemID.barrier(cl::sycl::access::fence_space::local_space);
+ if (localid < offset) {
+ op.reduce(scratchptr[localid + offset], &accumulator);
+ scratchptr[localid] = op.finalize(accumulator);
+ }
+ }
+ if (localid == 0) *aOutPtr = op.finalize(accumulator);
+ }
+};
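
The halving loop above is the standard in-place tree reduction over local memory. A serial sketch of it, assuming (as the comment in the functor requires) that the number of first-phase partials equals the work-group size and is a power of two:

#include <cstddef>
#include <vector>

// Serial emulation of the barrier-separated halving loop: after each pass the
// first `offset` entries of `scratch` hold partial results and entry 0 ends up
// with the final value. `scratch.size()` must be a non-zero power of two.
double tree_reduce(std::vector<double> scratch) {
  for (std::size_t offset = scratch.size() / 2; offset > 0; offset /= 2) {
    for (std::size_t localid = 0; localid < offset; ++localid) {  // work-items below the offset
      scratch[localid] += scratch[localid + offset];
    }
    // on the device, a local barrier separates the passes
  }
  return scratch[0];
}
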
+
+// Full reduction, first phase. In this version vectorization is enabled and the
+// reduction accepts any generic reducer op, e.g. max, min, sum, mean, iamax, iamin.
+template <typename Evaluator, typename OpType, typename Evaluator::Index local_range>
+class FullReductionKernelFunctor {
+ public:
+ typedef typename Evaluator::CoeffReturnType CoeffReturnType;
+ typedef typename Evaluator::Index Index;
+ typedef OpDefiner<OpType, typename Evaluator::CoeffReturnType, Index,
+ (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
+ OpDef;
+
+ typedef typename OpDef::type Op;
+ typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
+ typedef typename Evaluator::PacketReturnType PacketReturnType;
+ typedef
+ typename ::Eigen::internal::conditional<(Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess),
+ PacketReturnType, CoeffReturnType>::type OutType;
+ typedef cl::sycl::accessor<OutType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+ LocalAccessor;
+ LocalAccessor scratch;
+ Evaluator evaluator;
+ EvaluatorPointerType final_output;
+ Index rng;
+ Op op;
+
+ FullReductionKernelFunctor(LocalAccessor scratch_, Evaluator evaluator_, EvaluatorPointerType final_output_,
+ Index rng_, OpType op_)
+ : scratch(scratch_), evaluator(evaluator_), final_output(final_output_), rng(rng_), op(OpDef::get_op(op_)) {}
+
+ void operator()(cl::sycl::nd_item<1> itemID) { compute_reduction(itemID); }
+
+ template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<Vect>::type compute_reduction(
+ const cl::sycl::nd_item<1> &itemID) {
+ auto output_ptr = final_output.get_pointer();
+ Index VectorizedRange = (rng / Evaluator::PacketSize) * Evaluator::PacketSize;
+ Index globalid = itemID.get_global_id(0);
+ Index localid = itemID.get_local_id(0);
+ Index step = Evaluator::PacketSize * itemID.get_global_range(0);
+ Index start = Evaluator::PacketSize * globalid;
+ // vectorizable parts
+ PacketReturnType packetAccumulator = op.template initializePacket<PacketReturnType>();
+ for (Index i = start; i < VectorizedRange; i += step) {
+ op.template reducePacket<PacketReturnType>(evaluator.impl().template packet<Unaligned>(i), &packetAccumulator);
+ }
+ globalid += VectorizedRange;
+ // non vectorizable parts
+ for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) {
+ op.template reducePacket<PacketReturnType>(
+ ::Eigen::TensorSycl::internal::PacketWrapper<PacketReturnType, Evaluator::PacketSize>::convert_to_packet_type(
+ evaluator.impl().coeff(i), op.initialize()),
+ &packetAccumulator);
+ }
+ scratch[localid] = packetAccumulator =
+ OpDef::finalise_op(op.template finalizePacket<PacketReturnType>(packetAccumulator), rng);
+ // reduction parts // Local size is always power of 2
+ EIGEN_UNROLL_LOOP
+ for (Index offset = local_range / 2; offset > 0; offset /= 2) {
+ itemID.barrier(cl::sycl::access::fence_space::local_space);
+ if (localid < offset) {
+ op.template reducePacket<PacketReturnType>(scratch[localid + offset], &packetAccumulator);
+ scratch[localid] = op.template finalizePacket<PacketReturnType>(packetAccumulator);
+ }
+ }
+ if (localid == 0) {
+ output_ptr[itemID.get_group(0)] =
+ op.finalizeBoth(op.initialize(), op.template finalizePacket<PacketReturnType>(packetAccumulator));
+ }
+ }
+
+ template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<!Vect>::type compute_reduction(
+ const cl::sycl::nd_item<1> &itemID) {
+ auto output_ptr = final_output.get_pointer();
+ Index globalid = itemID.get_global_id(0);
+ Index localid = itemID.get_local_id(0);
+ // vectorizable parts
+ CoeffReturnType accumulator = op.initialize();
+ // non vectorizable parts
+ for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) {
+ op.reduce(evaluator.impl().coeff(i), &accumulator);
+ }
+ scratch[localid] = accumulator = OpDef::finalise_op(op.finalize(accumulator), rng);
+
+ // reduction parts. the local size is always power of 2
+ EIGEN_UNROLL_LOOP
+ for (Index offset = local_range / 2; offset > 0; offset /= 2) {
+ itemID.barrier(cl::sycl::access::fence_space::local_space);
+ if (localid < offset) {
+ op.reduce(scratch[localid + offset], &accumulator);
+ scratch[localid] = op.finalize(accumulator);
+ }
+ }
+ if (localid == 0) {
+ output_ptr[itemID.get_group(0)] = op.finalize(accumulator);
+ }
+ }
+};
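
The vectorized path in compute_reduction splits the input into a packet-wide grid-stride loop followed by a scalar tail. A scalar sketch of that split (thread_partial and the default PacketSize of 4 are illustrative assumptions):

#include <cstddef>
#include <vector>

// One "thread" (globalid) out of global_range accumulating its share of `in`:
// full packets over the vectorizable prefix first, then the leftover scalars.
double thread_partial(const std::vector<double>& in, std::size_t globalid,
                      std::size_t global_range, std::size_t PacketSize = 4) {
  const std::size_t n = in.size();
  const std::size_t vectorized = (n / PacketSize) * PacketSize;  // VectorizedRange
  double acc = 0.0;
  for (std::size_t i = PacketSize * globalid; i < vectorized; i += PacketSize * global_range) {
    for (std::size_t k = 0; k < PacketSize; ++k) acc += in[i + k];  // one packet load/reduce
  }
  for (std::size_t i = globalid + vectorized; i < n; i += global_range) {
    acc += in[i];  // scalar tail
  }
  return acc;
}
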
+
+template <typename Evaluator, typename OpType>
+class GenericNondeterministicReducer {
+ public:
+ typedef typename Evaluator::CoeffReturnType CoeffReturnType;
+ typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
+ typedef typename Evaluator::Index Index;
+ typedef OpDefiner<OpType, CoeffReturnType, Index, false> OpDef;
+ typedef typename OpDef::type Op;
+ template <typename Scratch>
+ GenericNondeterministicReducer(Scratch, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType functor_,
+ Index range_, Index num_values_to_reduce_)
+ : evaluator(evaluator_),
+ output_accessor(output_accessor_),
+ functor(OpDef::get_op(functor_)),
+ range(range_),
+ num_values_to_reduce(num_values_to_reduce_) {}
+
+ void operator()(cl::sycl::nd_item<1> itemID) {
+ auto output_accessor_ptr = output_accessor.get_pointer();
+ /// const cast added as a naive solution to solve the qualifier drop error
+ Index globalid = static_cast<Index>(itemID.get_global_linear_id());
+ if (globalid < range) {
+ CoeffReturnType accum = functor.initialize();
+ Eigen::internal::GenericDimReducer<Evaluator::NumReducedDims - 1, Evaluator, Op>::reduce(
+ evaluator, evaluator.firstInput(globalid), functor, &accum);
+ output_accessor_ptr[globalid] = OpDef::finalise_op(functor.finalize(accum), num_values_to_reduce);
+ }
+ }
+
+ private:
+ Evaluator evaluator;
+ EvaluatorPointerType output_accessor;
+ Op functor;
+ Index range;
+ Index num_values_to_reduce;
+};
+
+enum class reduction_dim { inner_most, outer_most };
+// default is preserver
+template <typename Evaluator, typename OpType, typename PannelParameters, reduction_dim rt>
+struct PartialReductionKernel {
+ typedef typename Evaluator::CoeffReturnType CoeffReturnType;
+ typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
+ typedef typename Evaluator::Index Index;
+ typedef OpDefiner<OpType, CoeffReturnType, Index, false> OpDef;
+ typedef typename OpDef::type Op;
+ typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+ ScratchAcc;
+ ScratchAcc scratch;
+ Evaluator evaluator;
+ EvaluatorPointerType output_accessor;
+ Op op;
+ const Index preserve_elements_num_groups;
+ const Index reduce_elements_num_groups;
+ const Index num_coeffs_to_preserve;
+ const Index num_coeffs_to_reduce;
+
+ PartialReductionKernel(ScratchAcc scratch_, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType op_,
+ const Index preserve_elements_num_groups_, const Index reduce_elements_num_groups_,
+ const Index num_coeffs_to_preserve_, const Index num_coeffs_to_reduce_)
+ : scratch(scratch_),
+ evaluator(evaluator_),
+ output_accessor(output_accessor_),
+ op(OpDef::get_op(op_)),
+ preserve_elements_num_groups(preserve_elements_num_groups_),
+ reduce_elements_num_groups(reduce_elements_num_groups_),
+ num_coeffs_to_preserve(num_coeffs_to_preserve_),
+ num_coeffs_to_reduce(num_coeffs_to_reduce_) {}
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void element_wise_reduce(Index globalRId, Index globalPId,
+ CoeffReturnType &accumulator) {
+ if (globalPId >= num_coeffs_to_preserve) {
+ return;
+ }
+ Index global_offset = rt == reduction_dim::outer_most ? globalPId + (globalRId * num_coeffs_to_preserve)
+ : globalRId + (globalPId * num_coeffs_to_reduce);
+ Index localOffset = globalRId;
+
+ const Index per_thread_local_stride = PannelParameters::LocalThreadSizeR * reduce_elements_num_groups;
+ const Index per_thread_global_stride =
+ rt == reduction_dim::outer_most ? num_coeffs_to_preserve * per_thread_local_stride : per_thread_local_stride;
+ for (Index i = globalRId; i < num_coeffs_to_reduce; i += per_thread_local_stride) {
+ op.reduce(evaluator.impl().coeff(global_offset), &accumulator);
+ localOffset += per_thread_local_stride;
+ global_offset += per_thread_global_stride;
+ }
+ }
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
+ const Index linearLocalThreadId = itemID.get_local_id(0);
+ Index pLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId % PannelParameters::LocalThreadSizeP
+ : linearLocalThreadId / PannelParameters::LocalThreadSizeR;
+ Index rLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId / PannelParameters::LocalThreadSizeP
+ : linearLocalThreadId % PannelParameters::LocalThreadSizeR;
+ const Index pGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) % preserve_elements_num_groups
+ : itemID.get_group(0) / reduce_elements_num_groups;
+ const Index rGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) / preserve_elements_num_groups
+ : itemID.get_group(0) % reduce_elements_num_groups;
+
+ Index globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
+ const Index globalRId = rGroupId * PannelParameters::LocalThreadSizeR + rLocalThreadId;
+ auto scratchPtr = scratch.get_pointer().get();
+ auto outPtr =
+ output_accessor.get_pointer() + (reduce_elements_num_groups > 1 ? rGroupId * num_coeffs_to_preserve : 0);
+ CoeffReturnType accumulator = op.initialize();
+
+ element_wise_reduce(globalRId, globalPId, accumulator);
+ accumulator = OpDef::finalise_op(op.finalize(accumulator), num_coeffs_to_reduce);
+ scratchPtr[pLocalThreadId + rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)] =
+ accumulator;
+ if (rt == reduction_dim::inner_most) {
+ pLocalThreadId = linearLocalThreadId % PannelParameters::LocalThreadSizeP;
+ rLocalThreadId = linearLocalThreadId / PannelParameters::LocalThreadSizeP;
+ globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
+ }
+
+ /* Apply the reduction operation between the current local
+ * id and the one on the other half of the vector. */
+ auto out_scratch_ptr =
+ scratchPtr + (pLocalThreadId + (rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)));
+ itemID.barrier(cl::sycl::access::fence_space::local_space);
+ if (rt == reduction_dim::inner_most) {
+ accumulator = *out_scratch_ptr;
+ }
+ // The Local LocalThreadSizeR is always power of 2
+ EIGEN_UNROLL_LOOP
+ for (Index offset = PannelParameters::LocalThreadSizeR >> 1; offset > 0; offset >>= 1) {
+ if (rLocalThreadId < offset) {
+ op.reduce(out_scratch_ptr[(PannelParameters::LocalThreadSizeP + PannelParameters::BC) * offset], &accumulator);
+        // The result has already been divided for the mean reducer in the
+        // previous reduction, so there is no need to divide it again here.
+ *out_scratch_ptr = op.finalize(accumulator);
+ }
+ /* All threads collectively read from global memory into local.
+ * The barrier ensures all threads' IO is resolved before
+ * execution continues (strictly speaking, all threads within
+ * a single work-group - there is no co-ordination between
+ * work-groups, only work-items). */
+ itemID.barrier(cl::sycl::access::fence_space::local_space);
+ }
+
+ if (rLocalThreadId == 0 && (globalPId < num_coeffs_to_preserve)) {
+ outPtr[globalPId] = op.finalize(accumulator);
+ }
+ }
+};
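
The inner_most and outer_most cases above differ only in how the (preserve, reduce) coordinate pair is linearised into a flat offset. A small sketch of the two mappings used by element_wise_reduce (flat_offset is a hypothetical helper; P and R stand for num_coeffs_to_preserve and num_coeffs_to_reduce):

#include <cstddef>

// outer_most: the reduced dimension has the larger stride, so consecutive
// preserved coefficients are contiguous -> offset = p + r * P.
// inner_most: the reduced dimension is contiguous -> offset = r + p * R.
std::size_t flat_offset(bool outer_most, std::size_t p, std::size_t r,
                        std::size_t P, std::size_t R) {
  return outer_most ? p + r * P : r + p * R;
}
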
+
+template <typename OutScalar, typename Index, typename InputAccessor, typename OutputAccessor, typename OpType>
+struct SecondStepPartialReduction {
+ typedef OpDefiner<OpType, OutScalar, Index, false> OpDef;
+ typedef typename OpDef::type Op;
+ typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+ ScratchAccessor;
+ InputAccessor input_accessor;
+ OutputAccessor output_accessor;
+ Op op;
+ const Index num_coeffs_to_preserve;
+ const Index num_coeffs_to_reduce;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE SecondStepPartialReduction(ScratchAccessor, InputAccessor input_accessor_,
+ OutputAccessor output_accessor_, OpType op_,
+ const Index num_coeffs_to_preserve_,
+ const Index num_coeffs_to_reduce_)
+ : input_accessor(input_accessor_),
+ output_accessor(output_accessor_),
+ op(OpDef::get_op(op_)),
+ num_coeffs_to_preserve(num_coeffs_to_preserve_),
+ num_coeffs_to_reduce(num_coeffs_to_reduce_) {}
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
+ const Index globalId = itemID.get_global_id(0);
+
+ if (globalId >= num_coeffs_to_preserve) return;
+
+ auto in_ptr = input_accessor.get_pointer() + globalId;
+
+ OutScalar accumulator = op.initialize();
+    // num_coeffs_to_reduce is not bigger than 256
+ for (Index i = 0; i < num_coeffs_to_reduce; i++) {
+ op.reduce(*in_ptr, &accumulator);
+ in_ptr += num_coeffs_to_preserve;
+ }
+ output_accessor.get_pointer()[globalId] = op.finalize(accumulator);
+ }
+};  // end SecondStepPartialReduction
+
+template <typename Index, Index LTP, Index LTR, bool BC_>
+struct ReductionPannel {
+ static EIGEN_CONSTEXPR Index LocalThreadSizeP = LTP;
+ static EIGEN_CONSTEXPR Index LocalThreadSizeR = LTR;
+ static EIGEN_CONSTEXPR bool BC = BC_;
+};
+
+template <typename Self, typename Op, TensorSycl::internal::reduction_dim rt>
+struct PartialReducerLauncher {
+ typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
typedef typename Self::CoeffReturnType CoeffReturnType;
- static const bool HasOptimizedImplementation = false;
-
- static void run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output) {
- typedef const typename Self::ChildType HostExpr; /// this is the child of reduction
- typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
- auto functors = TensorSycl::internal::extractFunctors(self.impl());
- int red_factor =256; /// initial reduction. If the size is less than red_factor we only creates one thread.
- size_t inputSize =self.impl().dimensions().TotalSize();
- size_t rng = inputSize/red_factor; // the total number of thread initially is half the size of the input
- size_t remaining = inputSize% red_factor;
- if(rng ==0) {
- red_factor=1;
- };
- size_t tileSize =dev.m_queue.get_device(). template get_info<cl::sycl::info::device::max_work_group_size>()/2;
- size_t GRange=std::max((size_t )1, rng);
-
- // convert global range to power of 2 for redecution
- GRange--;
- GRange |= GRange >> 1;
- GRange |= GRange >> 2;
- GRange |= GRange >> 4;
- GRange |= GRange >> 8;
- GRange |= GRange >> 16;
-#if __x86_64__ || __ppc64__ || _WIN64
- GRange |= GRange >> 32;
-#endif
- GRange++;
- size_t outTileSize = tileSize;
- /// if the shared memory is less than the GRange, we set shared_mem size to the TotalSize and in this case one kernel would be created for recursion to reduce all to one.
- if (GRange < outTileSize) outTileSize=GRange;
- // getting final out buffer at the moment the created buffer is true because there is no need for assign
- auto out_buffer =dev.template get_sycl_buffer<typename Eigen::internal::remove_all<CoeffReturnType>::type>(self.dimensions().TotalSize(), output);
- /// creating the shared memory for calculating reduction.
- /// This one is used to collect all the reduced value of shared memory as we dont have global barrier on GPU. Once it is saved we can
- /// recursively apply reduction on it in order to reduce the whole.
- auto temp_global_buffer =cl::sycl::buffer<CoeffReturnType, 1>(cl::sycl::range<1>(GRange));
- typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
- Dims dims= self.xprDims();
- Op functor = reducer;
- dev.m_queue.submit([&](cl::sycl::handler &cgh) {
- // create a tuple of accessors from Evaluator
- auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
- auto tmp_global_accessor = temp_global_buffer. template get_access<cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer>(cgh);
-
- cgh.parallel_for<PlaceHolderExpr>( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(outTileSize)), [=](cl::sycl::nd_item<1> itemID) {
- typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
- auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
- /// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviour
- /// the first behaviour is when it is used as a root to lauch the sub-kernel. The second one is when it is treated as a leafnode to pass the
- /// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator. The former is created here.
- const auto device_self_expr= TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor);
- /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
- /// the device_evaluator is detectable and recognisable on the device.
- auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
- /// const cast added as a naive solution to solve the qualifier drop error
- auto globalid=itemID.get_global_linear_id();
-
- if(globalid<rng)
- tmp_global_accessor.get_pointer()[globalid]=InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, red_factor*globalid, red_factor, const_cast<Op&>(functor));
- else
- tmp_global_accessor.get_pointer()[globalid]=static_cast<CoeffReturnType>(0);
-
- if(remaining!=0 && globalid==0 )
- // this will add the rest of input buffer when the input size is not devidable to red_factor.
- tmp_global_accessor.get_pointer()[globalid]+=InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, red_factor*(rng), remaining, const_cast<Op&>(functor));
- });
- });
- dev.m_queue.throw_asynchronous();
-
-/// This is used to recursively reduce the tmp value to an element of 1;
- syclGenericBufferReducer<CoeffReturnType,HostExpr>::run(out_buffer, temp_global_buffer,dev, GRange, outTileSize);
+ typedef typename Self::Storage Storage;
+ typedef typename Self::Index Index;
+ typedef ReductionPannel<typename Self::Index, EIGEN_SYCL_LOCAL_THREAD_DIM0, EIGEN_SYCL_LOCAL_THREAD_DIM1, true>
+ PannelParameters;
+
+ typedef PartialReductionKernel<Self, Op, PannelParameters, rt> SyclReducerKerneType;
+
+ static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType output,
+ Index num_coeffs_to_reduce, Index num_coeffs_to_preserve) {
+ Index roundUpP = roundUp(num_coeffs_to_preserve, PannelParameters::LocalThreadSizeP);
+
+    // getPowerOfTwo makes sure the local range is a power of 2 and <=
+    // maxSyclThreadPerBlock; this helps us avoid an extra check in the
+    // kernel.
+ static_assert(!((PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR) &
+ (PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR - 1)),
+ "The Local thread size must be a power of 2 for the reduction "
+ "operation");
+
+ EIGEN_CONSTEXPR Index localRange = PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR;
+    // In this step we force the code to be at most a two-step reduction:
+    // our empirical research shows that if each thread reduces at least 64
+    // elements individually, we get better performance. However, this can
+    // change on different platforms. For the inner_most dim reducer, it is
+    // better to have 8 groups in the reduce dimension for sizes > 1024 to
+    // achieve the best performance.
+ const Index reductionPerThread = 64;
+ Index cu = dev.getPowerOfTwo(dev.getNumSyclMultiProcessors(), true);
+ const Index pNumGroups = roundUpP / PannelParameters::LocalThreadSizeP;
+ Index rGroups = (cu + pNumGroups - 1) / pNumGroups;
+ const Index rNumGroups = num_coeffs_to_reduce > reductionPerThread * localRange ? std::min(rGroups, localRange) : 1;
+ const Index globalRange = pNumGroups * rNumGroups * localRange;
+
+ EIGEN_CONSTEXPR Index scratchSize =
+ PannelParameters::LocalThreadSizeR * (PannelParameters::LocalThreadSizeP + PannelParameters::BC);
+ auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
+ if (rNumGroups > 1) {
+ CoeffReturnType *temp_pointer = static_cast<CoeffReturnType *>(
+ dev.allocate_temp(num_coeffs_to_preserve * rNumGroups * sizeof(CoeffReturnType)));
+ EvaluatorPointerType temp_accessor = dev.get(temp_pointer);
+ dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
+ self, temp_accessor, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
+ num_coeffs_to_reduce);
+
+ typedef SecondStepPartialReduction<CoeffReturnType, Index, EvaluatorPointerType, EvaluatorPointerType, Op>
+ SecondStepPartialReductionKernel;
+
+ dev.template unary_kernel_launcher<CoeffReturnType, SecondStepPartialReductionKernel>(
+ temp_accessor, output,
+ cl::sycl::nd_range<1>(cl::sycl::range<1>(pNumGroups * localRange), cl::sycl::range<1>(localRange)), Index(1),
+ reducer, num_coeffs_to_preserve, rNumGroups);
+
+ self.device().deallocate_temp(temp_pointer);
+ } else {
+ dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
+ self, output, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
+ num_coeffs_to_reduce);
+ }
+ return false;
+ }
+};
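
A host-side walk-through of the launch-geometry arithmetic in run() above, as a compilable sketch (partial_reduction_geometry is a hypothetical helper; the numeric example assumes 16x16 local threads and 64 compute units, which may differ per platform):

#include <algorithm>
#include <cstddef>

struct LaunchGeometry {
  std::size_t pNumGroups, rNumGroups, globalRange;
};

// P, R : PannelParameters::LocalThreadSizeP / LocalThreadSizeR (localRange = P * R).
// cu   : number of compute units rounded up to a power of two, as returned by
//        dev.getPowerOfTwo(dev.getNumSyclMultiProcessors(), true).
LaunchGeometry partial_reduction_geometry(std::size_t num_preserve, std::size_t num_reduce,
                                          std::size_t P, std::size_t R, std::size_t cu) {
  const std::size_t localRange = P * R;
  const std::size_t roundUpP = ((num_preserve + P - 1) / P) * P;  // roundUp()
  const std::size_t pNumGroups = roundUpP / P;
  const std::size_t rGroups = (cu + pNumGroups - 1) / pNumGroups;
  const std::size_t reductionPerThread = 64;
  const std::size_t rNumGroups =
      num_reduce > reductionPerThread * localRange ? std::min(rGroups, localRange) : 1;
  return {pNumGroups, rNumGroups, pNumGroups * rNumGroups * localRange};
}
// Example: P = R = 16, cu = 64, num_preserve = 1000, num_reduce = 100000 gives
// pNumGroups = 63, rNumGroups = 2, globalRange = 63 * 2 * 256 = 32256, so a
// second-step kernel is needed to combine the 2 partial results per preserved
// coefficient.
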
+} // namespace internal
+} // namespace TensorSycl
+
+namespace internal {
+
+template <typename Self, typename Op, bool Vectorizable>
+struct FullReducer<Self, Op, Eigen::SyclDevice, Vectorizable> {
+ typedef typename Self::CoeffReturnType CoeffReturnType;
+ typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
+ static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;
+ static EIGEN_CONSTEXPR int PacketSize = Self::PacketAccess ? Self::PacketSize : 1;
+ static void run(const Self &self, Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType data) {
+ typedef typename conditional<Self::PacketAccess, typename Self::PacketReturnType, CoeffReturnType>::type OutType;
+ static_assert(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) &
+ (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)),
+ "The Local thread size must be a power of 2 for the reduction "
+ "operation");
+ EIGEN_CONSTEXPR Index local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1;
+
+ typename Self::Index inputSize = self.impl().dimensions().TotalSize();
+    // In this step we force the code to be at most a two-step reduction:
+    // our empirical research shows that if each thread reduces at least 512
+    // elements individually, we get better performance.
+    const Index reductionPerThread = 2048;
+    Index reductionGroup = dev.getPowerOfTwo(
+        (inputSize + (reductionPerThread * local_range - 1)) / (reductionPerThread * local_range), true);
+    const Index num_work_group = std::min(reductionGroup, local_range);
+ const Index global_range = num_work_group * local_range;
+
+ auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
+ typedef TensorSycl::internal::FullReductionKernelFunctor<Self, Op, local_range> reduction_kernel_t;
+ if (num_work_group > 1) {
+ CoeffReturnType *temp_pointer =
+ static_cast<CoeffReturnType *>(dev.allocate_temp(num_work_group * sizeof(CoeffReturnType)));
+ typename Self::EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer);
+ dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, tmp_global_accessor, thread_range,
+ local_range, inputSize, reducer);
+
+ typedef TensorSycl::internal::SecondStepFullReducer<CoeffReturnType, Op, EvaluatorPointerType,
+ EvaluatorPointerType, Index, local_range>
+ GenericRKernel;
+ dev.template unary_kernel_launcher<CoeffReturnType, GenericRKernel>(
+ tmp_global_accessor, data,
+ cl::sycl::nd_range<1>(cl::sycl::range<1>(num_work_group), cl::sycl::range<1>(num_work_group)), num_work_group,
+ reducer);
+
+ dev.deallocate_temp(temp_pointer);
+ } else {
+ dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, data, thread_range, local_range, inputSize,
+ reducer);
+ }
+ }
+};
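
The choice of num_work_group in run() above can be reproduced with a small stand-alone sketch (next_pow2 stands in for dev.getPowerOfTwo(n, true); the numeric example is illustrative only):

#include <algorithm>
#include <cstddef>

// Stand-in for dev.getPowerOfTwo(n, true): smallest power of two >= n.
std::size_t next_pow2(std::size_t n) {
  std::size_t p = 1;
  while (p < n) p <<= 1;
  return p;
}

// Number of first-phase work-groups chosen by FullReducer::run above; it is
// capped at local_range so that the second step fits into a single work-group
// with one partial result per thread.
std::size_t full_reduction_groups(std::size_t inputSize, std::size_t local_range) {
  const std::size_t reductionPerThread = 2048;
  const std::size_t per_group = reductionPerThread * local_range;
  const std::size_t reductionGroup = next_pow2((inputSize + per_group - 1) / per_group);
  return std::min(reductionGroup, local_range);
}
// Example: inputSize = 1 << 24 and local_range = 256 give
// ceil(2^24 / (2048 * 256)) = 32 work-groups, already a power of two.
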
+// vectorizable inner_most dim preserver
+// col reduction
+template <typename Self, typename Op>
+struct OuterReducer<Self, Op, Eigen::SyclDevice> {
+ static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;
+
+ static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
+ typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
+ typename Self::Index num_coeffs_to_preserve) {
+ return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
+ Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::outer_most>::run(self, reducer, dev, output,
+ num_coeffs_to_reduce,
+ num_coeffs_to_preserve);
}
+};
+// row reduction
+template <typename Self, typename Op>
+struct InnerReducer<Self, Op, Eigen::SyclDevice> {
+ static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;
+ static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
+ typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
+ typename Self::Index num_coeffs_to_preserve) {
+ return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
+ Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::inner_most>::run(self, reducer, dev, output,
+ num_coeffs_to_reduce,
+ num_coeffs_to_preserve);
+ }
};
+// ArgMax uses this kernel for partial reduction.
+// TODO(@mehdi.goli) come up with a better kernel
+// generic partial reduction
template <typename Self, typename Op>
-struct InnerReducer<Self, Op, const Eigen::SyclDevice> {
+struct GenericReducer<Self, Op, Eigen::SyclDevice> {
+ static EIGEN_CONSTEXPR bool HasOptimizedImplementation = false;
+ static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
+ typename Self::EvaluatorPointerType output, typename Self::Index num_values_to_reduce,
+ typename Self::Index num_coeffs_to_preserve) {
+ typename Self::Index range, GRange, tileSize;
+ dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange);
- typedef typename Self::CoeffReturnType CoeffReturnType;
- static const bool HasOptimizedImplementation = false;
-
- static bool run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output, typename Self::Index , typename Self::Index num_coeffs_to_preserve) {
- typedef const typename Self::ChildType HostExpr; /// this is the child of reduction
- typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
- auto functors = TensorSycl::internal::extractFunctors(self.impl());
-
- size_t tileSize =dev.m_queue.get_device(). template get_info<cl::sycl::info::device::max_work_group_size>()/2;
-
- size_t GRange=num_coeffs_to_preserve;
- if (tileSize>GRange) tileSize=GRange;
- else if(GRange>tileSize){
- size_t xMode = GRange % tileSize;
- if (xMode != 0) GRange += (tileSize - xMode);
- }
- // getting final out buffer at the moment the created buffer is true because there is no need for assign
- /// creating the shared memory for calculating reduction.
- /// This one is used to collect all the reduced value of shared memory as we dont have global barrier on GPU. Once it is saved we can
- /// recursively apply reduction on it in order to reduce the whole.
- typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
- Dims dims= self.xprDims();
- Op functor = reducer;
-
- dev.m_queue.submit([&](cl::sycl::handler &cgh) {
- // create a tuple of accessors from Evaluator
- auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
- auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::discard_write>(num_coeffs_to_preserve,cgh, output);
-
- cgh.parallel_for<Self>( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), [=](cl::sycl::nd_item<1> itemID) {
- typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
- auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
- /// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviour
- /// the first behaviour is when it is used as a root to lauch the sub-kernel. The second one is when it is treated as a leafnode to pass the
- /// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator. The former is created here.
- const auto device_self_expr= TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor);
- /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
- /// the device_evaluator is detectable and recognisable on the device.
- typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice> DeiceSelf;
- auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
- /// const cast added as a naive solution to solve the qualifier drop error
- auto globalid=itemID.get_global_linear_id();
- if (globalid< static_cast<size_t>(num_coeffs_to_preserve)) {
- typename DeiceSelf::CoeffReturnType accum = functor.initialize();
- GenericDimReducer<DeiceSelf::NumReducedDims-1, DeiceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(globalid),const_cast<Op&>(functor), &accum);
- functor.finalize(accum);
- output_accessor.get_pointer()[globalid]= accum;
- }
- });
- });
- dev.m_queue.throw_asynchronous();
+ dev.template unary_kernel_launcher<typename Self::CoeffReturnType,
+ TensorSycl::internal::GenericNondeterministicReducer<Self, Op>>(
+ self, output, cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), Index(1),
+ reducer, range, (num_values_to_reduce != 0) ? num_values_to_reduce : static_cast<Index>(1));
return false;
}
};
-} // end namespace internal
+} // namespace internal
} // namespace Eigen
#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP