diff options
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h')
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h | 630 |
1 files changed, 490 insertions, 140 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index 834ce07df..3aff7fa01 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -32,44 +32,72 @@ struct TensorEvaluator typedef typename Derived::Scalar CoeffReturnType; typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; typedef typename Derived::Dimensions Dimensions; + typedef Derived XprType; + static const int PacketSize = PacketType<CoeffReturnType, Device>::size; + typedef typename internal::traits<Derived>::template MakePointer<Scalar>::Type TensorPointerType; + typedef StorageMemory<Scalar, Device> Storage; + typedef typename Storage::Type EvaluatorPointerType; // NumDimensions is -1 for variable dim tensors static const int NumCoords = internal::traits<Derived>::NumDimensions > 0 ? internal::traits<Derived>::NumDimensions : 0; enum { - IsAligned = Derived::IsAligned, - PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1), - Layout = Derived::Layout, - CoordAccess = NumCoords > 0, - RawAccess = true + IsAligned = Derived::IsAligned, + PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1), + BlockAccess = internal::is_arithmetic<typename internal::remove_const<Scalar>::type>::value, + PreferBlockAccess = false, + Layout = Derived::Layout, + CoordAccess = NumCoords > 0, + RawAccess = true }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) - : m_data(const_cast<typename internal::traits<Derived>::template MakePointer<Scalar>::Type>(m.data())), m_dims(m.dimensions()), m_device(device), m_impl(m) + typedef typename internal::remove_const<Scalar>::type ScalarNoConst; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor<NumCoords, Index> TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch; + + typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumCoords, + Layout, Index> + TensorBlock; + //===--------------------------------------------------------------------===// + + EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) + : m_data(device.get((const_cast<TensorPointerType>(m.data())))), + m_dims(m.dimensions()), + m_device(device) { } - // Used for accessor extraction in SYCL Managed TensorMap: - const Derived& derived() const { return m_impl; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* dest) { - if (dest) { - m_device.memcpy((void*)dest, m_data, sizeof(Scalar) * m_dims.TotalSize()); + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType dest) { + if (!NumTraits<typename internal::remove_const<Scalar>::type>::RequireInitialization && dest) { + m_device.memcpy((void*)(m_device.get(dest)), m_device.get(m_data), m_dims.TotalSize() * sizeof(Scalar)); return false; } return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } +#ifdef EIGEN_USE_THREADS + template <typename EvalSubExprsCallback> + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType dest, EvalSubExprsCallback done) { + // TODO(ezhulenev): ThreadPoolDevice memcpy is blockign operation. + done(evalSubExprsIfNeeded(dest)); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - eigen_assert(m_data); + eigen_assert(m_data != NULL); return m_data[index]; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { - eigen_assert(m_data); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) { + eigen_assert(m_data != NULL); return m_data[index]; } @@ -79,6 +107,18 @@ struct TensorEvaluator return internal::ploadt<PacketReturnType, LoadMode>(m_data + index); } + // Return a packet starting at `index` where `umask` specifies which elements + // have to be loaded. Type/size of mask depends on PacketReturnType, e.g. for + // Packet16f, `umask` is of type uint16_t and if a bit is 1, corresponding + // float element will be loaded, otherwise 0 will be loaded. + // Function has been templatized to enable Sfinae. + template <typename PacketReturnTypeT> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename internal::enable_if<internal::unpacket_traits<PacketReturnTypeT>::masked_load_available, PacketReturnTypeT>::type + partialPacket(Index index, typename internal::unpacket_traits<PacketReturnTypeT>::mask_t umask) const + { + return internal::ploadu<PacketReturnTypeT>(m_data + index, umask); + } + template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) { @@ -86,7 +126,7 @@ struct TensorEvaluator } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<DenseIndex, NumCoords>& coords) const { - eigen_assert(m_data); + eigen_assert(m_data != NULL); if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { return m_data[m_dims.IndexOfColMajor(coords)]; } else { @@ -94,8 +134,9 @@ struct TensorEvaluator } } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array<DenseIndex, NumCoords>& coords) { - eigen_assert(m_data); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& + coeffRef(const array<DenseIndex, NumCoords>& coords) { + eigen_assert(m_data != NULL); if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { return m_data[m_dims.IndexOfColMajor(coords)]; } else { @@ -105,19 +146,50 @@ struct TensorEvaluator EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, - internal::unpacket_traits<PacketReturnType>::size); + PacketType<CoeffReturnType, Device>::size); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + internal::TensorBlockResourceRequirements getResourceRequirements() const { + return internal::TensorBlockResourceRequirements::any(); } - EIGEN_DEVICE_FUNC typename internal::traits<Derived>::template MakePointer<Scalar>::Type data() const { return m_data; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + assert(m_data != NULL); + return TensorBlock::materialize(m_data, m_dims, desc, scratch); + } + + template<typename TensorBlock> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock( + const TensorBlockDesc& desc, const TensorBlock& block) { + assert(m_data != NULL); + + typedef typename TensorBlock::XprType TensorBlockExpr; + typedef internal::TensorBlockAssignment<Scalar, NumCoords, TensorBlockExpr, + Index> + TensorBlockAssign; - /// required by sycl in order to construct sycl buffer from raw pointer - const Device& device() const{return m_device;} + TensorBlockAssign::Run( + TensorBlockAssign::target(desc.dimensions(), + internal::strides<Layout>(m_dims), m_data, + desc.offset()), + block.expr()); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_data.bind(cgh); + } +#endif protected: - typename internal::traits<Derived>::template MakePointer<Scalar>::Type m_data; + EvaluatorPointerType m_data; Dimensions m_dims; - const Device& m_device; - const Derived& m_impl; + const Device EIGEN_DEVICE_REF m_device; }; namespace { @@ -126,7 +198,7 @@ T loadConstant(const T* address) { return *address; } // Use the texture cache on CUDA devices whenever possible -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 +#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350 template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float loadConstant(const float* address) { return __ldg(address); @@ -140,6 +212,13 @@ Eigen::half loadConstant(const Eigen::half* address) { return Eigen::half(half_impl::raw_uint16_to_half(__ldg(&address->x))); } #endif +#ifdef EIGEN_USE_SYCL +// overload of load constant should be implemented here based on range access +template <cl::sycl::access::mode AcMd, typename T> +T &loadConstant(const Eigen::TensorSycl::internal::RangeAccess<AcMd, T> &address) { + return *address; +} +#endif } @@ -152,40 +231,64 @@ struct TensorEvaluator<const Derived, Device> typedef typename Derived::Scalar CoeffReturnType; typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; typedef typename Derived::Dimensions Dimensions; + typedef const Derived XprType; + typedef typename internal::traits<Derived>::template MakePointer<const Scalar>::Type TensorPointerType; + typedef StorageMemory<const Scalar, Device> Storage; + typedef typename Storage::Type EvaluatorPointerType; + + typedef typename internal::remove_const<Scalar>::type ScalarNoConst; // NumDimensions is -1 for variable dim tensors static const int NumCoords = internal::traits<Derived>::NumDimensions > 0 ? internal::traits<Derived>::NumDimensions : 0; + static const int PacketSize = PacketType<CoeffReturnType, Device>::size; enum { - IsAligned = Derived::IsAligned, - PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1), - Layout = Derived::Layout, - CoordAccess = NumCoords > 0, - RawAccess = true + IsAligned = Derived::IsAligned, + PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1), + BlockAccess = internal::is_arithmetic<ScalarNoConst>::value, + PreferBlockAccess = false, + Layout = Derived::Layout, + CoordAccess = NumCoords > 0, + RawAccess = true }; - // Used for accessor extraction in SYCL Managed TensorMap: - const Derived& derived() const { return m_impl; } + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor<NumCoords, Index> TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch; + + typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumCoords, + Layout, Index> + TensorBlock; + //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) - : m_data(m.data()), m_dims(m.dimensions()), m_device(device), m_impl(m) + EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) + : m_data(device.get(m.data())), m_dims(m.dimensions()), m_device(device) { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { if (!NumTraits<typename internal::remove_const<Scalar>::type>::RequireInitialization && data) { - m_device.memcpy((void*)data, m_data, m_dims.TotalSize() * sizeof(Scalar)); + m_device.memcpy((void*)(m_device.get(data)),m_device.get(m_data), m_dims.TotalSize() * sizeof(Scalar)); return false; } return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } +#ifdef EIGEN_USE_THREADS + template <typename EvalSubExprsCallback> + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType dest, EvalSubExprsCallback done) { + // TODO(ezhulenev): ThreadPoolDevice memcpy is a blockign operation. + done(evalSubExprsIfNeeded(dest)); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - eigen_assert(m_data); + eigen_assert(m_data != NULL); return loadConstant(m_data+index); } @@ -195,8 +298,20 @@ struct TensorEvaluator<const Derived, Device> return internal::ploadt_ro<PacketReturnType, LoadMode>(m_data + index); } + // Return a packet starting at `index` where `umask` specifies which elements + // have to be loaded. Type/size of mask depends on PacketReturnType, e.g. for + // Packet16f, `umask` is of type uint16_t and if a bit is 1, corresponding + // float element will be loaded, otherwise 0 will be loaded. + // Function has been templatized to enable Sfinae. + template <typename PacketReturnTypeT> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + typename internal::enable_if<internal::unpacket_traits<PacketReturnTypeT>::masked_load_available, PacketReturnTypeT>::type + partialPacket(Index index, typename internal::unpacket_traits<PacketReturnTypeT>::mask_t umask) const + { + return internal::ploadu<PacketReturnTypeT>(m_data + index, umask); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<DenseIndex, NumCoords>& coords) const { - eigen_assert(m_data); + eigen_assert(m_data != NULL); const Index index = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_dims.IndexOfColMajor(coords) : m_dims.IndexOfRowMajor(coords); return loadConstant(m_data+index); @@ -204,19 +319,32 @@ struct TensorEvaluator<const Derived, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, - internal::unpacket_traits<PacketReturnType>::size); + PacketType<CoeffReturnType, Device>::size); } - EIGEN_DEVICE_FUNC typename internal::traits<Derived>::template MakePointer<const Scalar>::Type data() const { return m_data; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + internal::TensorBlockResourceRequirements getResourceRequirements() const { + return internal::TensorBlockResourceRequirements::any(); + } - /// added for sycl in order to construct the buffer from the sycl device - const Device& device() const{return m_device;} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + assert(m_data != NULL); + return TensorBlock::materialize(m_data, m_dims, desc, scratch); + } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_data.bind(cgh); + } +#endif protected: - typename internal::traits<Derived>::template MakePointer<const Scalar>::Type m_data; + EvaluatorPointerType m_data; Dimensions m_dims; - const Device& m_device; - const Derived& m_impl; + const Device EIGEN_DEVICE_REF m_device; }; @@ -229,15 +357,6 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device> { typedef TensorCwiseNullaryOp<NullaryOp, ArgType> XprType; - enum { - IsAligned = true, - PacketAccess = internal::functor_traits<NullaryOp>::PacketAccess, - Layout = TensorEvaluator<ArgType, Device>::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : m_functor(op.functor()), m_argImpl(op.nestedExpression(), device), m_wrapper() { } @@ -246,13 +365,42 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device> typedef typename XprType::Scalar Scalar; typedef typename internal::traits<XprType>::Scalar CoeffReturnType; typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; + static const int PacketSize = PacketType<CoeffReturnType, Device>::size; typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions; + typedef StorageMemory<CoeffReturnType, Device> Storage; + typedef typename Storage::Type EvaluatorPointerType; + + enum { + IsAligned = true, + PacketAccess = internal::functor_traits<NullaryOp>::PacketAccess + #ifdef EIGEN_USE_SYCL + && (PacketType<CoeffReturnType, Device>::size >1) + #endif + , + BlockAccess = false, + PreferBlockAccess = false, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { return true; } + +#ifdef EIGEN_USE_THREADS + template <typename EvalSubExprsCallback> + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType, EvalSubExprsCallback done) { + done(true); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { } EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const { @@ -268,16 +416,17 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, - internal::unpacket_traits<PacketReturnType>::size); + PacketType<CoeffReturnType, Device>::size); } - EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } - - /// required by sycl in order to extract the accessor - const TensorEvaluator<ArgType, Device>& impl() const { return m_argImpl; } - /// required by sycl in order to extract the accessor - NullaryOp functor() const { return m_functor; } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_argImpl.bind(cgh); + } +#endif private: const NullaryOp m_functor; @@ -295,32 +444,60 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device> typedef TensorCwiseUnaryOp<UnaryOp, ArgType> XprType; enum { - IsAligned = TensorEvaluator<ArgType, Device>::IsAligned, - PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess & internal::functor_traits<UnaryOp>::PacketAccess, - Layout = TensorEvaluator<ArgType, Device>::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + IsAligned = TensorEvaluator<ArgType, Device>::IsAligned, + PacketAccess = int(TensorEvaluator<ArgType, Device>::PacketAccess) & + int(internal::functor_traits<UnaryOp>::PacketAccess), + BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess, + PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) - : m_functor(op.functor()), + TensorEvaluator(const XprType& op, const Device& device) + : m_device(device), + m_functor(op.functor()), m_argImpl(op.nestedExpression(), device) { } typedef typename XprType::Index Index; typedef typename XprType::Scalar Scalar; + typedef typename internal::remove_const<Scalar>::type ScalarNoConst; typedef typename internal::traits<XprType>::Scalar CoeffReturnType; typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; + static const int PacketSize = PacketType<CoeffReturnType, Device>::size; typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions; + typedef StorageMemory<CoeffReturnType, Device> Storage; + typedef typename Storage::Type EvaluatorPointerType; + static const int NumDims = internal::array_size<Dimensions>::value; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch; + + typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock + ArgTensorBlock; + + typedef internal::TensorCwiseUnaryBlock<UnaryOp, ArgTensorBlock> + TensorBlock; + //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { m_argImpl.evalSubExprsIfNeeded(NULL); return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + +#ifdef EIGEN_USE_THREADS + template <typename EvalSubExprsCallback> + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType, EvalSubExprsCallback done) { + m_argImpl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { m_argImpl.cleanup(); } @@ -341,15 +518,31 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device> TensorOpCost(0, 0, functor_cost, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + internal::TensorBlockResourceRequirements getResourceRequirements() const { + static const double functor_cost = internal::functor_traits<UnaryOp>::Cost; + return m_argImpl.getResourceRequirements().addCostPerCoeff( + {0, 0, functor_cost / PacketSize}); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + return TensorBlock(m_argImpl.block(desc, scratch), m_functor); + } - /// required by sycl in order to extract the accessor - const TensorEvaluator<ArgType, Device> & impl() const { return m_argImpl; } - /// added for sycl in order to construct the buffer from sycl device - UnaryOp functor() const { return m_functor; } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const{ + m_argImpl.bind(cgh); + } +#endif private: + const Device EIGEN_DEVICE_REF m_device; const UnaryOp m_functor; TensorEvaluator<ArgType, Device> m_argImpl; }; @@ -363,16 +556,23 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg typedef TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArgType> XprType; enum { - IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned & TensorEvaluator<RightArgType, Device>::IsAligned, - PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess & - internal::functor_traits<BinaryOp>::PacketAccess, - Layout = TensorEvaluator<LeftArgType, Device>::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + IsAligned = int(TensorEvaluator<LeftArgType, Device>::IsAligned) & + int(TensorEvaluator<RightArgType, Device>::IsAligned), + PacketAccess = int(TensorEvaluator<LeftArgType, Device>::PacketAccess) & + int(TensorEvaluator<RightArgType, Device>::PacketAccess) & + int(internal::functor_traits<BinaryOp>::PacketAccess), + BlockAccess = int(TensorEvaluator<LeftArgType, Device>::BlockAccess) & + int(TensorEvaluator<RightArgType, Device>::BlockAccess), + PreferBlockAccess = int(TensorEvaluator<LeftArgType, Device>::PreferBlockAccess) | + int(TensorEvaluator<RightArgType, Device>::PreferBlockAccess), + Layout = TensorEvaluator<LeftArgType, Device>::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) - : m_functor(op.functor()), + TensorEvaluator(const XprType& op, const Device& device) + : m_device(device), + m_functor(op.functor()), m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device) { @@ -384,8 +584,27 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg typedef typename XprType::Scalar Scalar; typedef typename internal::traits<XprType>::Scalar CoeffReturnType; typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; + static const int PacketSize = PacketType<CoeffReturnType, Device>::size; typedef typename TensorEvaluator<LeftArgType, Device>::Dimensions Dimensions; + typedef StorageMemory<CoeffReturnType, Device> Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static const int NumDims = internal::array_size< + typename TensorEvaluator<LeftArgType, Device>::Dimensions>::value; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch; + + typedef typename TensorEvaluator<const LeftArgType, Device>::TensorBlock + LeftTensorBlock; + typedef typename TensorEvaluator<const RightArgType, Device>::TensorBlock + RightTensorBlock; + + typedef internal::TensorCwiseBinaryBlock<BinaryOp, LeftTensorBlock, + RightTensorBlock> + TensorBlock; + //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { @@ -393,12 +612,25 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg return m_leftImpl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { m_leftImpl.evalSubExprsIfNeeded(NULL); m_rightImpl.evalSubExprsIfNeeded(NULL); return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + +#ifdef EIGEN_USE_THREADS + template <typename EvalSubExprsCallback> + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType, EvalSubExprsCallback done) { + // TODO(ezhulenev): Evaluate two expression in parallel? + m_leftImpl.evalSubExprsIfNeededAsync(nullptr, [this, done](bool) { + m_rightImpl.evalSubExprsIfNeededAsync(nullptr, + [done](bool) { done(true); }); + }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { m_leftImpl.cleanup(); m_rightImpl.cleanup(); } @@ -421,15 +653,34 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg TensorOpCost(0, 0, functor_cost, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } - /// required by sycl in order to extract the accessor - const TensorEvaluator<LeftArgType, Device>& left_impl() const { return m_leftImpl; } - /// required by sycl in order to extract the accessor - const TensorEvaluator<RightArgType, Device>& right_impl() const { return m_rightImpl; } - /// required by sycl in order to extract the accessor - BinaryOp functor() const { return m_functor; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + internal::TensorBlockResourceRequirements getResourceRequirements() const { + static const double functor_cost = internal::functor_traits<BinaryOp>::Cost; + return internal::TensorBlockResourceRequirements::merge( + m_leftImpl.getResourceRequirements(), + m_rightImpl.getResourceRequirements()) + .addCostPerCoeff({0, 0, functor_cost / PacketSize}); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + desc.DropDestinationBuffer(); + return TensorBlock(m_leftImpl.block(desc, scratch), + m_rightImpl.block(desc, scratch), m_functor); + } + + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + #ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_leftImpl.bind(cgh); + m_rightImpl.bind(cgh); + } + #endif private: + const Device EIGEN_DEVICE_REF m_device; const BinaryOp m_functor; TensorEvaluator<LeftArgType, Device> m_leftImpl; TensorEvaluator<RightArgType, Device> m_rightImpl; @@ -444,14 +695,20 @@ struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type, enum { IsAligned = TensorEvaluator<Arg1Type, Device>::IsAligned & TensorEvaluator<Arg2Type, Device>::IsAligned & TensorEvaluator<Arg3Type, Device>::IsAligned, - PacketAccess = TensorEvaluator<Arg1Type, Device>::PacketAccess & TensorEvaluator<Arg2Type, Device>::PacketAccess & TensorEvaluator<Arg3Type, Device>::PacketAccess & - internal::functor_traits<TernaryOp>::PacketAccess, - Layout = TensorEvaluator<Arg1Type, Device>::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + PacketAccess = TensorEvaluator<Arg1Type, Device>::PacketAccess && + TensorEvaluator<Arg2Type, Device>::PacketAccess && + TensorEvaluator<Arg3Type, Device>::PacketAccess && + internal::functor_traits<TernaryOp>::PacketAccess, + BlockAccess = false, + PreferBlockAccess = TensorEvaluator<Arg1Type, Device>::PreferBlockAccess || + TensorEvaluator<Arg2Type, Device>::PreferBlockAccess || + TensorEvaluator<Arg3Type, Device>::PreferBlockAccess, + Layout = TensorEvaluator<Arg1Type, Device>::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + TensorEvaluator(const XprType& op, const Device& device) : m_functor(op.functor()), m_arg1Impl(op.arg1Expression(), device), m_arg2Impl(op.arg2Expression(), device), @@ -479,8 +736,14 @@ struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type, typedef typename XprType::Scalar Scalar; typedef typename internal::traits<XprType>::Scalar CoeffReturnType; typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; + static const int PacketSize = PacketType<CoeffReturnType, Device>::size; typedef typename TensorEvaluator<Arg1Type, Device>::Dimensions Dimensions; + typedef StorageMemory<CoeffReturnType, Device> Storage; + typedef typename Storage::Type EvaluatorPointerType; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockNotImplemented TensorBlock; + //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { @@ -488,13 +751,13 @@ struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type, return m_arg1Impl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { m_arg1Impl.evalSubExprsIfNeeded(NULL); m_arg2Impl.evalSubExprsIfNeeded(NULL); m_arg3Impl.evalSubExprsIfNeeded(NULL); return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_arg1Impl.cleanup(); m_arg2Impl.cleanup(); m_arg3Impl.cleanup(); @@ -521,14 +784,16 @@ struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type, TensorOpCost(0, 0, functor_cost, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } + EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } - /// required by sycl in order to extract the accessor - const TensorEvaluator<Arg1Type, Device> & arg1Impl() const { return m_arg1Impl; } - /// required by sycl in order to extract the accessor - const TensorEvaluator<Arg2Type, Device>& arg2Impl() const { return m_arg2Impl; } - /// required by sycl in order to extract the accessor - const TensorEvaluator<Arg3Type, Device>& arg3Impl() const { return m_arg3Impl; } +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_arg1Impl.bind(cgh); + m_arg2Impl.bind(cgh); + m_arg3Impl.bind(cgh); + } +#endif private: const TernaryOp m_functor; @@ -547,15 +812,23 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType> typedef typename XprType::Scalar Scalar; enum { - IsAligned = TensorEvaluator<ThenArgType, Device>::IsAligned & TensorEvaluator<ElseArgType, Device>::IsAligned, - PacketAccess = TensorEvaluator<ThenArgType, Device>::PacketAccess & TensorEvaluator<ElseArgType, Device>::PacketAccess & - internal::packet_traits<Scalar>::HasBlend, - Layout = TensorEvaluator<IfArgType, Device>::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + IsAligned = TensorEvaluator<ThenArgType, Device>::IsAligned & + TensorEvaluator<ElseArgType, Device>::IsAligned, + PacketAccess = TensorEvaluator<ThenArgType, Device>::PacketAccess & + TensorEvaluator<ElseArgType, Device>::PacketAccess & + PacketType<Scalar, Device>::HasBlend, + BlockAccess = TensorEvaluator<IfArgType, Device>::BlockAccess && + TensorEvaluator<ThenArgType, Device>::BlockAccess && + TensorEvaluator<ElseArgType, Device>::BlockAccess, + PreferBlockAccess = TensorEvaluator<IfArgType, Device>::PreferBlockAccess || + TensorEvaluator<ThenArgType, Device>::PreferBlockAccess || + TensorEvaluator<ElseArgType, Device>::PreferBlockAccess, + Layout = TensorEvaluator<IfArgType, Device>::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + TensorEvaluator(const XprType& op, const Device& device) : m_condImpl(op.ifExpression(), device), m_thenImpl(op.thenExpression(), device), m_elseImpl(op.elseExpression(), device) @@ -569,8 +842,42 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType> typedef typename XprType::Index Index; typedef typename internal::traits<XprType>::Scalar CoeffReturnType; typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; - static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; + static const int PacketSize = PacketType<CoeffReturnType, Device>::size; typedef typename TensorEvaluator<IfArgType, Device>::Dimensions Dimensions; + typedef StorageMemory<CoeffReturnType, Device> Storage; + typedef typename Storage::Type EvaluatorPointerType; + + static const int NumDims = internal::array_size<Dimensions>::value; + + //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// + typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc; + typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch; + + typedef typename TensorEvaluator<const IfArgType, Device>::TensorBlock + IfArgTensorBlock; + typedef typename TensorEvaluator<const ThenArgType, Device>::TensorBlock + ThenArgTensorBlock; + typedef typename TensorEvaluator<const ElseArgType, Device>::TensorBlock + ElseArgTensorBlock; + + struct TensorSelectOpBlockFactory { + template <typename IfArgXprType, typename ThenArgXprType, typename ElseArgXprType> + struct XprType { + typedef TensorSelectOp<const IfArgXprType, const ThenArgXprType, const ElseArgXprType> type; + }; + + template <typename IfArgXprType, typename ThenArgXprType, typename ElseArgXprType> + typename XprType<IfArgXprType, ThenArgXprType, ElseArgXprType>::type expr( + const IfArgXprType& if_expr, const ThenArgXprType& then_expr, const ElseArgXprType& else_expr) const { + return typename XprType<IfArgXprType, ThenArgXprType, ElseArgXprType>::type(if_expr, then_expr, else_expr); + } + }; + + typedef internal::TensorTernaryExprBlock<TensorSelectOpBlockFactory, + IfArgTensorBlock, ThenArgTensorBlock, + ElseArgTensorBlock> + TensorBlock; + //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { @@ -578,13 +885,26 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType> return m_condImpl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { m_condImpl.evalSubExprsIfNeeded(NULL); m_thenImpl.evalSubExprsIfNeeded(NULL); m_elseImpl.evalSubExprsIfNeeded(NULL); return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + +#ifdef EIGEN_USE_THREADS + template <typename EvalSubExprsCallback> + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EvaluatorPointerType, EvalSubExprsCallback done) { + m_condImpl.evalSubExprsIfNeeded(nullptr, [this, done](bool) { + m_thenImpl.evalSubExprsIfNeeded(nullptr, [this, done](bool) { + m_elseImpl.evalSubExprsIfNeeded(nullptr, [done](bool) { done(true); }); + }); + }); + } +#endif // EIGEN_USE_THREADS + + EIGEN_STRONG_INLINE void cleanup() { m_condImpl.cleanup(); m_thenImpl.cleanup(); m_elseImpl.cleanup(); @@ -597,13 +917,15 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType> template<int LoadMode> EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { - internal::Selector<PacketSize> select; - for (Index i = 0; i < PacketSize; ++i) { - select.select[i] = m_condImpl.coeff(index+i); - } - return internal::pblend(select, - m_thenImpl.template packet<LoadMode>(index), - m_elseImpl.template packet<LoadMode>(index)); + internal::Selector<PacketSize> select; + EIGEN_UNROLL_LOOP + for (Index i = 0; i < PacketSize; ++i) { + select.select[i] = m_condImpl.coeff(index+i); + } + return internal::pblend(select, + m_thenImpl.template packet<LoadMode>(index), + m_elseImpl.template packet<LoadMode>(index)); + } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost @@ -613,14 +935,42 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType> .cwiseMax(m_elseImpl.costPerCoeff(vectorized)); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { return NULL; } - /// required by sycl in order to extract the accessor - const TensorEvaluator<IfArgType, Device> & cond_impl() const { return m_condImpl; } - /// required by sycl in order to extract the accessor - const TensorEvaluator<ThenArgType, Device>& then_impl() const { return m_thenImpl; } - /// required by sycl in order to extract the accessor - const TensorEvaluator<ElseArgType, Device>& else_impl() const { return m_elseImpl; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + internal::TensorBlockResourceRequirements getResourceRequirements() const { + auto then_req = m_thenImpl.getResourceRequirements(); + auto else_req = m_elseImpl.getResourceRequirements(); + + auto merged_req = + internal::TensorBlockResourceRequirements::merge(then_req, else_req); + merged_req.cost_per_coeff = + then_req.cost_per_coeff.cwiseMax(else_req.cost_per_coeff); + + return internal::TensorBlockResourceRequirements::merge( + m_condImpl.getResourceRequirements(), merged_req); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock + block(TensorBlockDesc& desc, TensorBlockScratch& scratch, + bool /*root_of_expr_ast*/ = false) const { + // It's unsafe to pass destination buffer to underlying expressions, because + // output might be aliased with one of the inputs. + desc.DropDestinationBuffer(); + + return TensorBlock( + m_condImpl.block(desc, scratch), m_thenImpl.block(desc, scratch), + m_elseImpl.block(desc, scratch), TensorSelectOpBlockFactory()); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; } + +#ifdef EIGEN_USE_SYCL + // binding placeholder accessors to a command group handler for SYCL + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { + m_condImpl.bind(cgh); + m_thenImpl.bind(cgh); + m_elseImpl.bind(cgh); + } +#endif private: TensorEvaluator<IfArgType, Device> m_condImpl; TensorEvaluator<ThenArgType, Device> m_thenImpl; |