// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli    Codeplay Software Ltd.
// Ralph Potter  Codeplay Software Ltd.
// Luke Iwanski  Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

/*****************************************************************
 * TensorReductionSycl.h
 *
 * \brief:
 *  This is the specialisation of the reduction operation (full and
 *  partial reductions) for the SYCL backend.
 *
 *****************************************************************/
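
/* A minimal usage sketch of what these specialisations service (illustrative
 * only; the device construction and pointer names below are assumptions, not
 * part of this file):
 *
 *   cl::sycl::gpu_selector s;
 *   Eigen::SyclDevice sycl_device(s);
 *   Eigen::TensorMap<Eigen::Tensor<float, 2>> in(in_ptr, 100, 100);
 *   Eigen::TensorMap<Eigen::Tensor<float, 0>> out(out_ptr);
 *   out.device(sycl_device) = in.sum();  // full reduction -> FullReducer below
 *
 * Partial reductions such as in.sum(reduction_dims) may instead dispatch the
 * InnerReducer specialisation, depending on which dimensions are reduced.
 */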

#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP

namespace Eigen {
namespace internal {

template<typename CoeffReturnType, typename KernelName> struct syclGenericBufferReducer{
template<typename BufferTOut, typename BufferTIn>
static void run(BufferTOut* bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
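  /* Each pass launches one work-group per `local` input elements and writes one
   * partial result per group back into bufI, so the live length shrinks by a
   * factor of `local` on every iteration. As an illustrative example (numbers
   * are not from this file): length == 4096 with local == 256 takes two passes,
   * 4096 -> 16 partial values -> 1 final value copied into bufOut. */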
  do {
          auto f = [length, local, bufOut, &bufI](cl::sycl::handler& h) mutable {
            cl::sycl::nd_range<1> r{cl::sycl::range<1>{std::max(length, local)},
                                    cl::sycl::range<1>{std::min(length, local)}};
            /* Two accessors are used: one to the buffer that is being reduced,
             * and a second to local memory, used to store intermediate data. */
            auto aI =
                bufI.template get_access<cl::sycl::access::mode::read_write>(h);
            auto aOut =
                bufOut->template get_access<cl::sycl::access::mode::discard_write>(h);
            cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write,
                               cl::sycl::access::target::local>
                scratch(cl::sycl::range<1>(local), h);

            /* The parallel_for invocation chosen is the variant with an nd_item
             * parameter, since the code requires barriers for correctness. */
            h.parallel_for<KernelName>(
                r, [aOut, aI, scratch, local, length](cl::sycl::nd_item<1> id) {
                  size_t globalid = id.get_global(0);
                  size_t localid = id.get_local(0);
                  /* All threads collectively read from global memory into local.
                   * The barrier ensures all threads' IO is resolved before
                   * execution continues (strictly speaking, all threads within
                   * a single work-group - there is no co-ordination between
                   * work-groups, only work-items). */
                  if (globalid < length) {
                    scratch[localid] = aI[globalid];
                  }
                  id.barrier(cl::sycl::access::fence_space::local_space);

                  /* Apply the reduction operation between the current local
                   * id and the one on the other half of the vector. */
                  if (globalid < length) {
                    size_t min = (length < local) ? length : local;
                    for (size_t offset = min / 2; offset > 0; offset /= 2) {
                      if (localid < offset) {
                        scratch[localid] += scratch[localid + offset];
                      }
                      id.barrier(cl::sycl::access::fence_space::local_space);
                    }
                    /* The final result will be stored in local id 0. */
                    if (localid == 0) {
                      aI[id.get_group(0)] = scratch[localid];
                      if ((length <= local) && globalid == 0) {
                        aOut[globalid] = scratch[localid];
                      }
                    }
                  }
                });
          };
          dev.m_queue.submit(f);
          dev.m_queue.throw_asynchronous();

          /* At this point, you could queue::wait_and_throw() to ensure that
           * errors are caught quickly. However, this would likely impact
           * performance negatively. */
          length = length / local;

        } while (length > 1);



}

};

/// For now let's start with a full reducer.
/// Self is useless here because during expression construction we treat the reduction as a leaf node.
/// We take the child of the reduction, build an expression from it and apply the full reducer function on it. FullReducer applies the
/// reduction operation to the child of the reduction. Once that is done the reduction node is an empty shell that can be thrown away and treated as
/// a leaf node.
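/// As a worked example of the two-stage strategy below (numbers illustrative only):
/// an input of 1,000,000 coefficients with red_factor == 256 gives rng == 3906 work-items,
/// each pre-reducing 256 consecutive coefficients, with the 64 left-over coefficients folded
/// in by work-item 0. The 3906 partial results are stored in a temporary buffer whose size is
/// rounded up to the power-of-two global range 4096 (unused slots are zero-filled), and that
/// buffer is then collapsed to a single value by syclGenericBufferReducer.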
template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {

  typedef typename Self::CoeffReturnType CoeffReturnType;
  static const bool HasOptimizedImplementation = false;

  static void run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output) {
    typedef const typename Self::ChildType HostExpr; /// this is the child of the reduction
    typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
    auto functors = TensorSycl::internal::extractFunctors(self.impl());
    int red_factor = 256; /// initial reduction factor. If the size is less than red_factor we only create one thread.
    size_t inputSize = self.impl().dimensions().TotalSize();
    size_t rng = inputSize / red_factor; // the initial number of work-items is the input size divided by the reduction factor
    size_t remaining = inputSize % red_factor;
    if (rng == 0) {
      red_factor = 1;
    }
    size_t tileSize = dev.m_queue.get_device().template get_info<cl::sycl::info::device::max_work_group_size>()/2;
    size_t GRange = std::max((size_t)1, rng);

    // convert the global range to the next power of two for the recursive reduction
    GRange--;
    GRange |= GRange >> 1;
    GRange |= GRange >> 2;
    GRange |= GRange >> 4;
    GRange |= GRange >> 8;
    GRange |= GRange >> 16;
#if __x86_64__ || __ppc64__ || _WIN64
    GRange |= GRange >> 32;
#endif
    GRange++;
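    // e.g. an initial GRange of 3906 becomes 4096, the smallest power of two that is >= the original value.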
    size_t outTileSize = tileSize;
    /// if the global range is smaller than the tile size, shrink the tile size to the global range so that a single work-group covers the whole range.
    if (GRange < outTileSize) outTileSize = GRange;
    // get the final output buffer wrapping the user-provided output pointer; no separate assignment expression is needed here.
    auto out_buffer = dev.template get_sycl_buffer<typename Eigen::internal::remove_all<CoeffReturnType>::type>(self.dimensions().TotalSize(), output);
    /// create a temporary global buffer to collect the per-work-group partial results,
    /// since there is no global barrier on the GPU. Once the partial results are stored we can
    /// recursively apply the reduction to this buffer in order to reduce the whole input.
    auto temp_global_buffer = cl::sycl::buffer<CoeffReturnType, 1>(cl::sycl::range<1>(GRange));
    typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
    Dims dims = self.xprDims();
    Op functor = reducer;
    dev.m_queue.submit([&](cl::sycl::handler &cgh) {
      // create a tuple of accessors from Evaluator
      auto tuple_of_accessors =  TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
      auto tmp_global_accessor = temp_global_buffer. template get_access<cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer>(cgh);

      cgh.parallel_for<PlaceHolderExpr>( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(outTileSize)), [=](cl::sycl::nd_item<1> itemID) {
        typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
        auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
        /// the reduction cannot be captured automatically through our device conversion recursion, because a reduction has two behaviours:
        /// the first is when it is used as a root to launch the sub-kernel; the second is when it is treated as a leaf node that passes the
        /// calculated result to its parent kernel. The latter is detected automatically by our device expression generator; the former is created here.
        const auto device_self_expr= TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor);
        /// This is the evaluator for device_self_expr. It is equivalent to the self that was passed to the run function, except that
        /// the device evaluator is detectable and recognisable on the device.
        auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
        /// a const_cast is used below as a simple workaround for the dropped-qualifier error
        auto globalid=itemID.get_global_linear_id();

        if(globalid<rng)
          tmp_global_accessor.get_pointer()[globalid]=InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, red_factor*globalid, red_factor, const_cast<Op&>(functor));
        else
          tmp_global_accessor.get_pointer()[globalid]=static_cast<CoeffReturnType>(0);

        // work-item 0 also folds in the tail of the input when the input size is not divisible by red_factor.
        if (remaining != 0 && globalid == 0)
          tmp_global_accessor.get_pointer()[globalid]+=InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, red_factor*(rng), remaining, const_cast<Op&>(functor));
      });
    });
    dev.m_queue.throw_asynchronous();

    /// This recursively reduces the temporary buffer down to a single element.
    syclGenericBufferReducer<CoeffReturnType, HostExpr>::run(out_buffer, temp_global_buffer, dev, GRange, outTileSize);
  }

};

template <typename Self, typename Op>
struct InnerReducer<Self, Op, const Eigen::SyclDevice> {

  typedef typename Self::CoeffReturnType CoeffReturnType;
  static const bool HasOptimizedImplementation = false;

  static bool run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output, typename Self::Index , typename Self::Index num_coeffs_to_preserve) {
    typedef const typename Self::ChildType HostExpr; /// this is the child of the reduction
    typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
    auto functors = TensorSycl::internal::extractFunctors(self.impl());

    size_t tileSize =dev.m_queue.get_device(). template get_info<cl::sycl::info::device::max_work_group_size>()/2;

    size_t GRange = num_coeffs_to_preserve;
    if (tileSize > GRange) tileSize = GRange;
    else if (GRange > tileSize) {
      size_t xMode = GRange % tileSize;
      if (xMode != 0) GRange += (tileSize - xMode);
    }
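    // As an illustrative example of the padding above (numbers are not from this file): with
    // tileSize == 512 and num_coeffs_to_preserve == 1000, GRange is rounded up from 1000 to 1024
    // so that it is an exact multiple of the work-group size; the kernel below guards the extra
    // work-items with the num_coeffs_to_preserve check.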
    // Each output coefficient is written directly through an accessor to the user-provided output
    // pointer, so no intermediate global buffer is required for this partial reduction.
    typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
    Dims dims= self.xprDims();
    Op functor = reducer;

    dev.m_queue.submit([&](cl::sycl::handler &cgh) {
      // create a tuple of accessors from Evaluator
      auto tuple_of_accessors =  TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
      auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::discard_write>(num_coeffs_to_preserve,cgh, output);
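      // Each work-item computes one preserved output coefficient: it reduces all input
      // coefficients that map to that output index and writes the result through the accessor.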

      cgh.parallel_for<Self>( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), [=](cl::sycl::nd_item<1> itemID) {
        typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
        auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
        /// the reduction cannot be captured automatically through our device conversion recursion, because a reduction has two behaviours:
        /// the first is when it is used as a root to launch the sub-kernel; the second is when it is treated as a leaf node that passes the
        /// calculated result to its parent kernel. The latter is detected automatically by our device expression generator; the former is created here.
        const auto device_self_expr= TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor);
        /// This is the evaluator for device_self_expr. It is equivalent to the self that was passed to the run function, except that
        /// the device evaluator is detectable and recognisable on the device.
        typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice> DeviceSelf;
        auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
        /// a const_cast is used below as a simple workaround for the dropped-qualifier error
        auto globalid = itemID.get_global_linear_id();
        if (globalid < static_cast<size_t>(num_coeffs_to_preserve)) {
          typename DeviceSelf::CoeffReturnType accum = functor.initialize();
          GenericDimReducer<DeviceSelf::NumReducedDims-1, DeviceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(globalid), const_cast<Op&>(functor), &accum);
          functor.finalize(accum);
          output_accessor.get_pointer()[globalid] = accum;
        }
        }
      });
    });
    dev.m_queue.throw_asynchronous();
    return false;
  }
};

}  // end namespace internal
}  // namespace Eigen

#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP