Diffstat (limited to 'Eigen/src/Core/products')
23 files changed, 2831 insertions, 2320 deletions
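Editorial note on the first two hunks below: they drop the per-directory CMake install rules and delete CoeffBasedProduct.h, whose coefficient-based (lazy) products were ported to Eigen 3.3's evaluator framework. The deleted file's core trick is a compile-time unrolled dot product (product_coeff_impl recursing on UnrollingIndex). A self-contained sketch of that idea, with illustrative names only, not Eigen code:

    #include <array>
    #include <cstddef>
    #include <iostream>

    // Recursive case: add the (I-1)-th term, mirroring product_coeff_impl's
    // recursion from UnrollingIndex down to the base case.
    template<std::size_t N, std::size_t I = N>
    struct unrolled_dot {
      template<typename T>
      static T run(const std::array<T,N>& a, const std::array<T,N>& b) {
        return unrolled_dot<N, I-1>::run(a, b) + a[I-1]*b[I-1];
      }
    };

    // Base case: an empty inner dimension contributes zero.
    template<std::size_t N>
    struct unrolled_dot<N, 0> {
      template<typename T>
      static T run(const std::array<T,N>&, const std::array<T,N>&) { return T(0); }
    };

    int main() {
      std::array<double,3> a{1,2,3}, b{4,5,6};
      std::cout << unrolled_dot<3>::run(a, b) << "\n"; // 32, fully unrolled at compile time
      return 0;
    }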
diff --git a/Eigen/src/Core/products/CMakeLists.txt b/Eigen/src/Core/products/CMakeLists.txt deleted file mode 100644 index 21fc94ae3..000000000 --- a/Eigen/src/Core/products/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_Core_Product_SRCS "*.h") - -INSTALL(FILES - ${Eigen_Core_Product_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/products COMPONENT Devel - ) diff --git a/Eigen/src/Core/products/CoeffBasedProduct.h b/Eigen/src/Core/products/CoeffBasedProduct.h deleted file mode 100644 index 2a9d65b94..000000000 --- a/Eigen/src/Core/products/CoeffBasedProduct.h +++ /dev/null @@ -1,476 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com> -// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_COEFFBASED_PRODUCT_H -#define EIGEN_COEFFBASED_PRODUCT_H - -namespace Eigen { - -namespace internal { - -/********************************************************************************* -* Coefficient based product implementation. -* It is designed for the following use cases: -* - small fixed sizes -* - lazy products -*********************************************************************************/ - -/* Since the all the dimensions of the product are small, here we can rely - * on the generic Assign mechanism to evaluate the product per coeff (or packet). - * - * Note that here the inner-loops should always be unrolled. - */ - -template<int Traversal, int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar> -struct product_coeff_impl; - -template<int StorageOrder, int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode> -struct product_packet_impl; - -template<typename LhsNested, typename RhsNested, int NestingFlags> -struct traits<CoeffBasedProduct<LhsNested,RhsNested,NestingFlags> > -{ - typedef MatrixXpr XprKind; - typedef typename remove_all<LhsNested>::type _LhsNested; - typedef typename remove_all<RhsNested>::type _RhsNested; - typedef typename scalar_product_traits<typename _LhsNested::Scalar, typename _RhsNested::Scalar>::ReturnType Scalar; - typedef typename promote_storage_type<typename traits<_LhsNested>::StorageKind, - typename traits<_RhsNested>::StorageKind>::ret StorageKind; - typedef typename promote_index_type<typename traits<_LhsNested>::Index, - typename traits<_RhsNested>::Index>::type Index; - - enum { - LhsCoeffReadCost = _LhsNested::CoeffReadCost, - RhsCoeffReadCost = _RhsNested::CoeffReadCost, - LhsFlags = _LhsNested::Flags, - RhsFlags = _RhsNested::Flags, - - RowsAtCompileTime = _LhsNested::RowsAtCompileTime, - ColsAtCompileTime = _RhsNested::ColsAtCompileTime, - InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(_LhsNested::ColsAtCompileTime, _RhsNested::RowsAtCompileTime), - - MaxRowsAtCompileTime = _LhsNested::MaxRowsAtCompileTime, - MaxColsAtCompileTime = _RhsNested::MaxColsAtCompileTime, - - LhsRowMajor = LhsFlags & RowMajorBit, - RhsRowMajor = RhsFlags & RowMajorBit, - - SameType = is_same<typename _LhsNested::Scalar,typename _RhsNested::Scalar>::value, - - CanVectorizeRhs = RhsRowMajor && (RhsFlags & PacketAccessBit) - && (ColsAtCompileTime == Dynamic - || ( (ColsAtCompileTime % packet_traits<Scalar>::size) == 0 - && (RhsFlags&AlignedBit) - ) - ), - - CanVectorizeLhs 
= (!LhsRowMajor) && (LhsFlags & PacketAccessBit) - && (RowsAtCompileTime == Dynamic - || ( (RowsAtCompileTime % packet_traits<Scalar>::size) == 0 - && (LhsFlags&AlignedBit) - ) - ), - - EvalToRowMajor = (MaxRowsAtCompileTime==1&&MaxColsAtCompileTime!=1) ? 1 - : (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0 - : (RhsRowMajor && !CanVectorizeLhs), - - Flags = ((unsigned int)(LhsFlags | RhsFlags) & HereditaryBits & ~RowMajorBit) - | (EvalToRowMajor ? RowMajorBit : 0) - | NestingFlags - | (LhsFlags & RhsFlags & AlignedBit) - // TODO enable vectorization for mixed types - | (SameType && (CanVectorizeLhs || CanVectorizeRhs) ? PacketAccessBit : 0), - - CoeffReadCost = InnerSize == Dynamic ? Dynamic - : InnerSize == 0 ? 0 - : InnerSize * (NumTraits<Scalar>::MulCost + LhsCoeffReadCost + RhsCoeffReadCost) - + (InnerSize - 1) * NumTraits<Scalar>::AddCost, - - /* CanVectorizeInner deserves special explanation. It does not affect the product flags. It is not used outside - * of Product. If the Product itself is not a packet-access expression, there is still a chance that the inner - * loop of the product might be vectorized. This is the meaning of CanVectorizeInner. Since it doesn't affect - * the Flags, it is safe to make this value depend on ActualPacketAccessBit, that doesn't affect the ABI. - */ - CanVectorizeInner = SameType - && LhsRowMajor - && (!RhsRowMajor) - && (LhsFlags & RhsFlags & ActualPacketAccessBit) - && (LhsFlags & RhsFlags & AlignedBit) - && (InnerSize % packet_traits<Scalar>::size == 0) - }; -}; - -} // end namespace internal - -template<typename LhsNested, typename RhsNested, int NestingFlags> -class CoeffBasedProduct - : internal::no_assignment_operator, - public MatrixBase<CoeffBasedProduct<LhsNested, RhsNested, NestingFlags> > -{ - public: - - typedef MatrixBase<CoeffBasedProduct> Base; - EIGEN_DENSE_PUBLIC_INTERFACE(CoeffBasedProduct) - typedef typename Base::PlainObject PlainObject; - - private: - - typedef typename internal::traits<CoeffBasedProduct>::_LhsNested _LhsNested; - typedef typename internal::traits<CoeffBasedProduct>::_RhsNested _RhsNested; - - enum { - PacketSize = internal::packet_traits<Scalar>::size, - InnerSize = internal::traits<CoeffBasedProduct>::InnerSize, - Unroll = CoeffReadCost != Dynamic && CoeffReadCost <= EIGEN_UNROLLING_LIMIT, - CanVectorizeInner = internal::traits<CoeffBasedProduct>::CanVectorizeInner - }; - - typedef internal::product_coeff_impl<CanVectorizeInner ? InnerVectorizedTraversal : DefaultTraversal, - Unroll ? InnerSize : Dynamic, - _LhsNested, _RhsNested, Scalar> ScalarCoeffImpl; - - typedef CoeffBasedProduct<LhsNested,RhsNested,NestByRefBit> LazyCoeffBasedProductType; - - public: - - inline CoeffBasedProduct(const CoeffBasedProduct& other) - : Base(), m_lhs(other.m_lhs), m_rhs(other.m_rhs) - {} - - template<typename Lhs, typename Rhs> - inline CoeffBasedProduct(const Lhs& lhs, const Rhs& rhs) - : m_lhs(lhs), m_rhs(rhs) - { - // we don't allow taking products of matrices of different real types, as that wouldn't be vectorizable. - // We still allow to mix T and complex<T>. 
- EIGEN_STATIC_ASSERT((internal::scalar_product_traits<typename Lhs::RealScalar, typename Rhs::RealScalar>::Defined), - YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY) - eigen_assert(lhs.cols() == rhs.rows() - && "invalid matrix product" - && "if you wanted a coeff-wise or a dot product use the respective explicit functions"); - } - - EIGEN_STRONG_INLINE Index rows() const { return m_lhs.rows(); } - EIGEN_STRONG_INLINE Index cols() const { return m_rhs.cols(); } - - EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const - { - Scalar res; - ScalarCoeffImpl::run(row, col, m_lhs, m_rhs, res); - return res; - } - - /* Allow index-based non-packet access. It is impossible though to allow index-based packed access, - * which is why we don't set the LinearAccessBit. - */ - EIGEN_STRONG_INLINE const Scalar coeff(Index index) const - { - Scalar res; - const Index row = RowsAtCompileTime == 1 ? 0 : index; - const Index col = RowsAtCompileTime == 1 ? index : 0; - ScalarCoeffImpl::run(row, col, m_lhs, m_rhs, res); - return res; - } - - template<int LoadMode> - EIGEN_STRONG_INLINE const PacketScalar packet(Index row, Index col) const - { - PacketScalar res; - internal::product_packet_impl<Flags&RowMajorBit ? RowMajor : ColMajor, - Unroll ? InnerSize : Dynamic, - _LhsNested, _RhsNested, PacketScalar, LoadMode> - ::run(row, col, m_lhs, m_rhs, res); - return res; - } - - // Implicit conversion to the nested type (trigger the evaluation of the product) - EIGEN_STRONG_INLINE operator const PlainObject& () const - { - m_result.lazyAssign(*this); - return m_result; - } - - const _LhsNested& lhs() const { return m_lhs; } - const _RhsNested& rhs() const { return m_rhs; } - - const Diagonal<const LazyCoeffBasedProductType,0> diagonal() const - { return reinterpret_cast<const LazyCoeffBasedProductType&>(*this); } - - template<int DiagonalIndex> - const Diagonal<const LazyCoeffBasedProductType,DiagonalIndex> diagonal() const - { return reinterpret_cast<const LazyCoeffBasedProductType&>(*this); } - - const Diagonal<const LazyCoeffBasedProductType,Dynamic> diagonal(Index index) const - { return reinterpret_cast<const LazyCoeffBasedProductType&>(*this).diagonal(index); } - - protected: - typename internal::add_const_on_value_type<LhsNested>::type m_lhs; - typename internal::add_const_on_value_type<RhsNested>::type m_rhs; - - mutable PlainObject m_result; -}; - -namespace internal { - -// here we need to overload the nested rule for products -// such that the nested type is a const reference to a plain matrix -template<typename Lhs, typename Rhs, int N, typename PlainObject> -struct nested<CoeffBasedProduct<Lhs,Rhs,EvalBeforeNestingBit|EvalBeforeAssigningBit>, N, PlainObject> -{ - typedef PlainObject const& type; -}; - -/*************************************************************************** -* Normal product .coeff() implementation (with meta-unrolling) -***************************************************************************/ - -/************************************** -*** Scalar path - no vectorization *** -**************************************/ - -template<int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar> -struct product_coeff_impl<DefaultTraversal, UnrollingIndex, Lhs, Rhs, RetScalar> -{ - typedef typename Lhs::Index Index; - static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar &res) - { - product_coeff_impl<DefaultTraversal, UnrollingIndex-1, Lhs, Rhs, 
RetScalar>::run(row, col, lhs, rhs, res); - res += lhs.coeff(row, UnrollingIndex-1) * rhs.coeff(UnrollingIndex-1, col); - } -}; - -template<typename Lhs, typename Rhs, typename RetScalar> -struct product_coeff_impl<DefaultTraversal, 1, Lhs, Rhs, RetScalar> -{ - typedef typename Lhs::Index Index; - static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar &res) - { - res = lhs.coeff(row, 0) * rhs.coeff(0, col); - } -}; - -template<typename Lhs, typename Rhs, typename RetScalar> -struct product_coeff_impl<DefaultTraversal, 0, Lhs, Rhs, RetScalar> -{ - typedef typename Lhs::Index Index; - static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, RetScalar &res) - { - res = RetScalar(0); - } -}; - -template<typename Lhs, typename Rhs, typename RetScalar> -struct product_coeff_impl<DefaultTraversal, Dynamic, Lhs, Rhs, RetScalar> -{ - typedef typename Lhs::Index Index; - static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar& res) - { - res = (lhs.row(row).transpose().cwiseProduct( rhs.col(col) )).sum(); - } -}; - -/******************************************* -*** Scalar path with inner vectorization *** -*******************************************/ - -template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet> -struct product_coeff_vectorized_unroller -{ - typedef typename Lhs::Index Index; - enum { PacketSize = packet_traits<typename Lhs::Scalar>::size }; - static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::PacketScalar &pres) - { - product_coeff_vectorized_unroller<UnrollingIndex-PacketSize, Lhs, Rhs, Packet>::run(row, col, lhs, rhs, pres); - pres = padd(pres, pmul( lhs.template packet<Aligned>(row, UnrollingIndex) , rhs.template packet<Aligned>(UnrollingIndex, col) )); - } -}; - -template<typename Lhs, typename Rhs, typename Packet> -struct product_coeff_vectorized_unroller<0, Lhs, Rhs, Packet> -{ - typedef typename Lhs::Index Index; - static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::PacketScalar &pres) - { - pres = pmul(lhs.template packet<Aligned>(row, 0) , rhs.template packet<Aligned>(0, col)); - } -}; - -template<typename Lhs, typename Rhs, typename RetScalar> -struct product_coeff_impl<InnerVectorizedTraversal, 0, Lhs, Rhs, RetScalar> -{ - typedef typename Lhs::Index Index; - static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, RetScalar &res) - { - res = 0; - } -}; - -template<int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar> -struct product_coeff_impl<InnerVectorizedTraversal, UnrollingIndex, Lhs, Rhs, RetScalar> -{ - typedef typename Lhs::PacketScalar Packet; - typedef typename Lhs::Index Index; - enum { PacketSize = packet_traits<typename Lhs::Scalar>::size }; - static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar &res) - { - Packet pres; - product_coeff_vectorized_unroller<UnrollingIndex-PacketSize, Lhs, Rhs, Packet>::run(row, col, lhs, rhs, pres); - res = predux(pres); - } -}; - -template<typename Lhs, typename Rhs, int LhsRows = Lhs::RowsAtCompileTime, int RhsCols = Rhs::ColsAtCompileTime> -struct product_coeff_vectorized_dyn_selector -{ - typedef typename Lhs::Index Index; - static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res) - { - res = 
lhs.row(row).transpose().cwiseProduct(rhs.col(col)).sum(); - } -}; - -// NOTE the 3 following specializations are because taking .col(0) on a vector is a bit slower -// NOTE maybe they are now useless since we have a specialization for Block<Matrix> -template<typename Lhs, typename Rhs, int RhsCols> -struct product_coeff_vectorized_dyn_selector<Lhs,Rhs,1,RhsCols> -{ - typedef typename Lhs::Index Index; - static EIGEN_STRONG_INLINE void run(Index /*row*/, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res) - { - res = lhs.transpose().cwiseProduct(rhs.col(col)).sum(); - } -}; - -template<typename Lhs, typename Rhs, int LhsRows> -struct product_coeff_vectorized_dyn_selector<Lhs,Rhs,LhsRows,1> -{ - typedef typename Lhs::Index Index; - static EIGEN_STRONG_INLINE void run(Index row, Index /*col*/, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res) - { - res = lhs.row(row).transpose().cwiseProduct(rhs).sum(); - } -}; - -template<typename Lhs, typename Rhs> -struct product_coeff_vectorized_dyn_selector<Lhs,Rhs,1,1> -{ - typedef typename Lhs::Index Index; - static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res) - { - res = lhs.transpose().cwiseProduct(rhs).sum(); - } -}; - -template<typename Lhs, typename Rhs, typename RetScalar> -struct product_coeff_impl<InnerVectorizedTraversal, Dynamic, Lhs, Rhs, RetScalar> -{ - typedef typename Lhs::Index Index; - static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res) - { - product_coeff_vectorized_dyn_selector<Lhs,Rhs>::run(row, col, lhs, rhs, res); - } -}; - -/******************* -*** Packet path *** -*******************/ - -template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode> -struct product_packet_impl<RowMajor, UnrollingIndex, Lhs, Rhs, Packet, LoadMode> -{ - typedef typename Lhs::Index Index; - static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res) - { - product_packet_impl<RowMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, res); - res = pmadd(pset1<Packet>(lhs.coeff(row, UnrollingIndex-1)), rhs.template packet<LoadMode>(UnrollingIndex-1, col), res); - } -}; - -template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode> -struct product_packet_impl<ColMajor, UnrollingIndex, Lhs, Rhs, Packet, LoadMode> -{ - typedef typename Lhs::Index Index; - static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res) - { - product_packet_impl<ColMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, res); - res = pmadd(lhs.template packet<LoadMode>(row, UnrollingIndex-1), pset1<Packet>(rhs.coeff(UnrollingIndex-1, col)), res); - } -}; - -template<typename Lhs, typename Rhs, typename Packet, int LoadMode> -struct product_packet_impl<RowMajor, 1, Lhs, Rhs, Packet, LoadMode> -{ - typedef typename Lhs::Index Index; - static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res) - { - res = pmul(pset1<Packet>(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col)); - } -}; - -template<typename Lhs, typename Rhs, typename Packet, int LoadMode> -struct product_packet_impl<ColMajor, 1, Lhs, Rhs, Packet, LoadMode> -{ - typedef typename Lhs::Index Index; - static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res) - { - res = 
pmul(lhs.template packet<LoadMode>(row, 0), pset1<Packet>(rhs.coeff(0, col))); - } -}; - -template<typename Lhs, typename Rhs, typename Packet, int LoadMode> -struct product_packet_impl<RowMajor, 0, Lhs, Rhs, Packet, LoadMode> -{ - typedef typename Lhs::Index Index; - static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Packet &res) - { - res = pset1<Packet>(0); - } -}; - -template<typename Lhs, typename Rhs, typename Packet, int LoadMode> -struct product_packet_impl<ColMajor, 0, Lhs, Rhs, Packet, LoadMode> -{ - typedef typename Lhs::Index Index; - static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Packet &res) - { - res = pset1<Packet>(0); - } -}; - -template<typename Lhs, typename Rhs, typename Packet, int LoadMode> -struct product_packet_impl<RowMajor, Dynamic, Lhs, Rhs, Packet, LoadMode> -{ - typedef typename Lhs::Index Index; - static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet& res) - { - res = pset1<Packet>(0); - for(Index i = 0; i < lhs.cols(); ++i) - res = pmadd(pset1<Packet>(lhs.coeff(row, i)), rhs.template packet<LoadMode>(i, col), res); - } -}; - -template<typename Lhs, typename Rhs, typename Packet, int LoadMode> -struct product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, Packet, LoadMode> -{ - typedef typename Lhs::Index Index; - static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet& res) - { - res = pset1<Packet>(0); - for(Index i = 0; i < lhs.cols(); ++i) - res = pmadd(lhs.template packet<LoadMode>(row, i), pset1<Packet>(rhs.coeff(i, col)), res); - } -}; - -} // end namespace internal - -} // end namespace Eigen - -#endif // EIGEN_COEFFBASED_PRODUCT_H diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index bcdca5b0d..45230bce5 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -10,8 +10,9 @@ #ifndef EIGEN_GENERAL_BLOCK_PANEL_H #define EIGEN_GENERAL_BLOCK_PANEL_H -namespace Eigen { - + +namespace Eigen { + namespace internal { template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false> @@ -24,29 +25,51 @@ inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff return a<=0 ? 
b : a;
}

+#if EIGEN_ARCH_i386_OR_x86_64
+const std::ptrdiff_t defaultL1CacheSize = 32*1024;
+const std::ptrdiff_t defaultL2CacheSize = 256*1024;
+const std::ptrdiff_t defaultL3CacheSize = 2*1024*1024;
+#else
+const std::ptrdiff_t defaultL1CacheSize = 16*1024;
+const std::ptrdiff_t defaultL2CacheSize = 512*1024;
+const std::ptrdiff_t defaultL3CacheSize = 512*1024;
+#endif
+
 /** \internal */
-inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdiff_t* l2=0)
-{
-  static std::ptrdiff_t m_l1CacheSize = 0;
-  static std::ptrdiff_t m_l2CacheSize = 0;
-  if(m_l2CacheSize==0)
-  {
-    m_l1CacheSize = manage_caching_sizes_helper(queryL1CacheSize(),8 * 1024);
-    m_l2CacheSize = manage_caching_sizes_helper(queryTopLevelCacheSize(),1*1024*1024);
+struct CacheSizes {
+  CacheSizes(): m_l1(-1),m_l2(-1),m_l3(-1) {
+    int l1CacheSize, l2CacheSize, l3CacheSize;
+    queryCacheSizes(l1CacheSize, l2CacheSize, l3CacheSize);
+    m_l1 = manage_caching_sizes_helper(l1CacheSize, defaultL1CacheSize);
+    m_l2 = manage_caching_sizes_helper(l2CacheSize, defaultL2CacheSize);
+    m_l3 = manage_caching_sizes_helper(l3CacheSize, defaultL3CacheSize);
   }
-
+
+  std::ptrdiff_t m_l1;
+  std::ptrdiff_t m_l2;
+  std::ptrdiff_t m_l3;
+};
+
+
+/** \internal */
+inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3)
+{
+  static CacheSizes m_cacheSizes;
+
   if(action==SetAction)
   {
     // set the cpu cache size and cache all block sizes from a global cache size in bytes
     eigen_internal_assert(l1!=0 && l2!=0);
-    m_l1CacheSize = *l1;
-    m_l2CacheSize = *l2;
+    m_cacheSizes.m_l1 = *l1;
+    m_cacheSizes.m_l2 = *l2;
+    m_cacheSizes.m_l3 = *l3;
   }
   else if(action==GetAction)
   {
     eigen_internal_assert(l1!=0 && l2!=0);
-    *l1 = m_l1CacheSize;
-    *l2 = m_l2CacheSize;
+    *l1 = m_cacheSizes.m_l1;
+    *l2 = m_cacheSizes.m_l2;
+    *l3 = m_cacheSizes.m_l3;
   }
   else
   {
@@ -54,6 +77,206 @@ inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdi
   }
 }
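Editorial aside: the CacheSizes singleton above backs Eigen's public cache-size accessors. A minimal sketch of overriding the detected sizes from user code, assuming the Eigen 3.3 wrappers setCpuCacheSizes()/l1CacheSize()/l2CacheSize()/l3CacheSize() that are declared alongside this code; the concrete byte values are illustrative only:

    #include <Eigen/Dense>
    #include <iostream>

    int main()
    {
      // Pretend the machine has 32KB L1, 1MB L2, 8MB L3 (assumed values).
      Eigen::setCpuCacheSizes(32*1024, 1024*1024, 8*1024*1024);
      std::cout << "l1=" << Eigen::l1CacheSize()
                << " l2=" << Eigen::l2CacheSize()
                << " l3=" << Eigen::l3CacheSize() << "\n";
      Eigen::MatrixXd A = Eigen::MatrixXd::Random(512, 512), B = A;
      Eigen::MatrixXd C = A * B; // the GEBP blocking now derives kc/mc/nc from the sizes above
      std::cout << C(0,0) << "\n";
      return 0;
    }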
+/* Helper for computeProductBlockingSizes.
+ *
+ * Given a m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar,
+ * this function computes the blocking size parameters along the respective dimensions
+ * for matrix products and related algorithms. The blocking sizes depend on various
+ * parameters:
+ * - the L1 and L2 cache sizes,
+ * - the register level blocking sizes defined by gebp_traits,
+ * - the number of scalars that fit into a packet (when vectorization is enabled).
+ *
+ * \sa setCpuCacheSizes */
+
+template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
+void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index num_threads = 1)
+{
+  typedef gebp_traits<LhsScalar,RhsScalar> Traits;
+
+  // Explanations:
+  // Let's recall that the product algorithms form mc x kc vertical panels A' on the lhs and
+  // kc x nc blocks B' on the rhs. B' has to fit into the L2/L3 cache. Moreover, A' is processed
+  // per mr x kc small horizontal panels, where mr is the blocking size along the m dimension
+  // at the register level. This small horizontal panel has to stay within the L1 cache.
+  std::ptrdiff_t l1, l2, l3;
+  manage_caching_sizes(GetAction, &l1, &l2, &l3);
+
+  if (num_threads > 1) {
+    typedef typename Traits::ResScalar ResScalar;
+    enum {
+      kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
+      ksub = Traits::mr * Traits::nr * sizeof(ResScalar),
+      kr = 8,
+      mr = Traits::mr,
+      nr = Traits::nr
+    };
+    // Increasing k gives us more time to prefetch the content of the "C"
+    // registers. However, once the latency is hidden there is no point in
+    // increasing the value of k, so we'll cap it at 320 (value determined
+    // experimentally).
+    const Index k_cache = (numext::mini<Index>)((l1-ksub)/kdiv, 320);
+    if (k_cache < k) {
+      k = k_cache - (k_cache % kr);
+      eigen_internal_assert(k > 0);
+    }
+
+    const Index n_cache = (l2-l1) / (nr * sizeof(RhsScalar) * k);
+    const Index n_per_thread = numext::div_ceil(n, num_threads);
+    if (n_cache <= n_per_thread) {
+      // Don't exceed the capacity of the l2 cache.
+      eigen_internal_assert(n_cache >= static_cast<Index>(nr));
+      n = n_cache - (n_cache % nr);
+      eigen_internal_assert(n > 0);
+    } else {
+      n = (numext::mini<Index>)(n, (n_per_thread + nr - 1) - ((n_per_thread + nr - 1) % nr));
+    }
+
+    if (l3 > l2) {
+      // l3 is shared between all cores, so we'll give each thread its own chunk of l3.
+      const Index m_cache = (l3-l2) / (sizeof(LhsScalar) * k * num_threads);
+      const Index m_per_thread = numext::div_ceil(m, num_threads);
+      if(m_cache < m_per_thread && m_cache >= static_cast<Index>(mr)) {
+        m = m_cache - (m_cache % mr);
+        eigen_internal_assert(m > 0);
+      } else {
+        m = (numext::mini<Index>)(m, (m_per_thread + mr - 1) - ((m_per_thread + mr - 1) % mr));
+      }
+    }
+  }
+  else {
+    // In unit tests we do not want to use extra large matrices,
+    // so we reduce the cache size to check that the blocking strategy is not flawed
+#ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
+    l1 = 9*1024;
+    l2 = 32*1024;
+    l3 = 512*1024;
+#endif
+
+    // Early return for small problems because the computations below are time-consuming for small problems.
+    // Perhaps it would make more sense to consider k*n*m??
+    // Note that for very tiny problems, this function should be bypassed anyway
+    // because we use the coefficient-based implementation for them.
+    if((numext::maxi)(k,(numext::maxi)(m,n))<48)
+      return;
+
+    typedef typename Traits::ResScalar ResScalar;
+    enum {
+      k_peeling = 8,
+      k_div = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
+      k_sub = Traits::mr * Traits::nr * sizeof(ResScalar)
+    };
+
+    // ---- 1st level of blocking on L1, yields kc ----
+
+    // Blocking on the third dimension (i.e., k) is chosen so that a horizontal panel
+    // of size mr x kc of the lhs plus a vertical panel of kc x nr of the rhs both fit within the L1 cache.
+    // We also include a register-level block of the result (mr x nr).
+    // (In an ideal world only the lhs panel would stay in L1.)
+    // Moreover, kc has to be a multiple of 8 to be compatible with loop peeling, leading to a maximum blocking size of:
+    const Index max_kc = numext::maxi<Index>(((l1-k_sub)/k_div) & (~(k_peeling-1)),1);
+    const Index old_k = k;
+    if(k>max_kc)
+    {
+      // We are really blocking on the third dimension:
+      // -> reduce blocking size to make sure the last block is as large as possible
+      //    while keeping the same number of sweeps over the result.
+      k = (k%max_kc)==0 ? max_kc
+                        : max_kc - k_peeling * ((max_kc-1-(k%max_kc))/(k_peeling*(k/max_kc+1)));
+
+      eigen_internal_assert(((old_k/k) == (old_k/max_kc)) && "the number of sweeps has to remain the same");
+    }
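Editorial aside: a worked instance of the kc reduction just above, under assumed AVX+FMA float traits (mr=24, nr=4, KcFactor=1, 32KB L1); every constant here is an assumption for illustration:

    #include <cassert>
    int main()
    {
      const long mr = 24, nr = 4, sf = 4;          // assumed traits, sizeof(float)
      const long l1 = 32*1024, k_peeling = 8;
      const long k_div = mr*sf + nr*sf;            // 112
      const long k_sub = mr*nr*sf;                 // 384
      const long max_kc = ((l1 - k_sub)/k_div) & ~(k_peeling-1); // 288
      long k = 1000;                               // requested depth
      const long old_k = k;
      if(k > max_kc)
        k = (k%max_kc)==0 ? max_kc
                          : max_kc - k_peeling * ((max_kc-1-(k%max_kc))/(k_peeling*(k/max_kc+1)));
      // kc drops from 288 to 256: same number of sweeps, but a larger last block.
      assert(k == 256 && old_k/k == old_k/max_kc);
      return 0;
    }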
+
+    // ---- 2nd level of blocking on max(L2,L3), yields nc ----
+
+    // TODO find a reliable way to get the actual amount of cache per core to use for 2nd level blocking, that is:
+    //      actual_l2 = max(l2, l3/nb_core_sharing_l3)
+    // The number below is quite conservative: it is better to underestimate the cache size than to overestimate it.
+    // For instance, it corresponds to 6MB of L3 shared among 4 cores.
+    #ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
+    const Index actual_l2 = l3;
+    #else
+    const Index actual_l2 = 1572864; // == 1.5 MB
+    #endif
+
+    // Here, nc is chosen such that a block of kc x nc of the rhs fits within half of L2.
+    // The second half is implicitly reserved to access the result and lhs coefficients.
+    // When k<max_kc, nc can grow arbitrarily. In practice, it seems fruitful
+    // to limit this growth: we bound nc to grow by a factor of 1.5.
+    // However, if the entire lhs block fits within L1, then we are not going to block on the rows at all,
+    // and it becomes fruitful to keep the packed rhs blocks in L1 if there is enough remaining space.
+    Index max_nc;
+    const Index lhs_bytes = m * k * sizeof(LhsScalar);
+    const Index remaining_l1 = l1 - k_sub - lhs_bytes;
+    if(remaining_l1 >= Index(Traits::nr*sizeof(RhsScalar))*k)
+    {
+      // L1 blocking
+      max_nc = remaining_l1 / (k*sizeof(RhsScalar));
+    }
+    else
+    {
+      // L2 blocking
+      max_nc = (3*actual_l2)/(2*2*max_kc*sizeof(RhsScalar));
+    }
+    // WARNING Below, we assume that Traits::nr is a power of two.
+    Index nc = numext::mini<Index>(actual_l2/(2*k*sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1));
+    if(n>nc)
+    {
+      // We are really blocking over the columns:
+      // -> reduce blocking size to make sure the last block is as large as possible
+      //    while keeping the same number of sweeps over the packed lhs.
+      // Here we allow one more sweep if this gives us a perfect match, thus the commented "-1".
+      n = (n%nc)==0 ? nc
+                    : (nc - Traits::nr * ((nc/*-1*/-(n%nc))/(Traits::nr*(n/nc+1))));
+    }
+    else if(old_k==k)
+    {
+      // So far, no blocking at all, i.e., kc==k, and nc==n.
+      // In this case, let's perform a blocking over the rows such that the packed lhs data is kept in cache L1/L2.
+      // TODO: part of this blocking strategy is now implemented within the kernel itself, so the L1-based heuristic here should be obsolete.
+      Index problem_size = k*n*sizeof(LhsScalar);
+      Index actual_lm = actual_l2;
+      Index max_mc = m;
+      if(problem_size<=1024)
+      {
+        // problem is small enough to keep in L1
+        // Let's choose m such that the lhs's block fits in 1/3 of L1
+        actual_lm = l1;
+      }
+      else if(l3!=0 && problem_size<=32768)
+      {
+        // we have both L2 and L3, and problem is small enough to be kept in L2
+        // Let's choose m such that the lhs's block fits in 1/3 of L2
+        actual_lm = l2;
+        max_mc = (numext::mini<Index>)(576,max_mc);
+      }
+      Index mc = (numext::mini<Index>)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc);
+      if (mc > Traits::mr) mc -= mc % Traits::mr;
+      else if (mc==0) return;
+      m = (m%mc)==0 ?
mc + : (mc - Traits::mr * ((mc/*-1*/-(m%mc))/(Traits::mr*(m/mc+1)))); + } + } +} + +template <typename Index> +inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n) +{ +#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES + if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) { + k = numext::mini<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K); + m = numext::mini<Index>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M); + n = numext::mini<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N); + return true; + } +#else + EIGEN_UNUSED_VARIABLE(k) + EIGEN_UNUSED_VARIABLE(m) + EIGEN_UNUSED_VARIABLE(n) +#endif + return false; +} + /** \brief Computes the blocking parameters for a m x k times k x n matrix product * * \param[in,out] k Input: the third dimension of the product. Output: the blocking size along the same dimension. @@ -62,48 +285,30 @@ inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdi * * Given a m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar, * this function computes the blocking size parameters along the respective dimensions - * for matrix products and related algorithms. The blocking sizes depends on various - * parameters: - * - the L1 and L2 cache sizes, - * - the register level blocking sizes defined by gebp_traits, - * - the number of scalars that fit into a packet (when vectorization is enabled). + * for matrix products and related algorithms. + * + * The blocking size parameters may be evaluated: + * - either by a heuristic based on cache sizes; + * - or using fixed prescribed values (for testing purposes). * * \sa setCpuCacheSizes */ -template<typename LhsScalar, typename RhsScalar, int KcFactor, typename SizeType> -void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n) -{ - EIGEN_UNUSED_VARIABLE(n); - // Explanations: - // Let's recall the product algorithms form kc x nc horizontal panels B' on the rhs and - // mc x kc blocks A' on the lhs. A' has to fit into L2 cache. Moreover, B' is processed - // per kc x nr vertical small panels where nr is the blocking size along the n dimension - // at the register level. For vectorization purpose, these small vertical panels are unpacked, - // e.g., each coefficient is replicated to fit a packet. This small vertical panel has to - // stay in L1 cache. - std::ptrdiff_t l1, l2; - typedef gebp_traits<LhsScalar,RhsScalar> Traits; - enum { - kdiv = KcFactor * 2 * Traits::nr - * Traits::RhsProgress * sizeof(RhsScalar), - mr = gebp_traits<LhsScalar,RhsScalar>::mr, - mr_mask = (0xffffffff/mr)*mr - }; - - manage_caching_sizes(GetAction, &l1, &l2); - k = std::min<SizeType>(k, l1/kdiv); - SizeType _m = k>0 ? 
l2/(4 * sizeof(LhsScalar) * k) : 0; - if(_m<m) m = _m & mr_mask; +template<typename LhsScalar, typename RhsScalar, int KcFactor, typename Index> +void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1) +{ + if (!useSpecificBlockingSizes(k, m, n)) { + evaluateProductBlockingSizesHeuristic<LhsScalar, RhsScalar, KcFactor, Index>(k, m, n, num_threads); + } } -template<typename LhsScalar, typename RhsScalar, typename SizeType> -inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n) +template<typename LhsScalar, typename RhsScalar, typename Index> +inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1) { - computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n); + computeProductBlockingSizes<LhsScalar,RhsScalar,1,Index>(k, m, n, num_threads); } -#ifdef EIGEN_HAS_FUSE_CJMADD - #define MADD(CJ,A,B,C,T) C = CJ.pmadd(A,B,C); +#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD + #define CJMADD(CJ,A,B,C,T) C = CJ.pmadd(A,B,C); #else // FIXME (a bit overkill maybe ?) @@ -128,8 +333,8 @@ inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n) gebp_madd_selector<CJ,A,B,C,T>::run(cj,a,b,c,t); } - #define MADD(CJ,A,B,C,T) gebp_madd(CJ,A,B,C,T); -// #define MADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = padd(C,T); + #define CJMADD(CJ,A,B,C,T) gebp_madd(CJ,A,B,C,T); +// #define CJMADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = padd(C,T); #endif /* Vectorization logic @@ -148,7 +353,7 @@ class gebp_traits public: typedef _LhsScalar LhsScalar; typedef _RhsScalar RhsScalar; - typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar; + typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar; enum { ConjLhs = _ConjLhs, @@ -160,16 +365,22 @@ public: NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, - // register block size along the N direction (must be either 2 or 4) - nr = NumberOfRegisters/4, + // register block size along the N direction must be 1 or 4 + nr = 4, // register block size along the M direction (currently, this one cannot be modified) - mr = 2 * LhsPacketSize, + default_mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize, +#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) + // we assume 16 registers + // See bug 992, if the scalar type is not vectorizable but that EIGEN_HAS_SINGLE_INSTRUCTION_MADD is defined, + // then using 3*LhsPacketSize triggers non-implemented paths in syrk. + mr = Vectorizable ? 
3*LhsPacketSize : default_mr, +#else + mr = default_mr, +#endif - WorkSpaceFactor = nr * RhsPacketSize, - LhsProgress = LhsPacketSize, - RhsProgress = RhsPacketSize + RhsProgress = 1 }; typedef typename packet_traits<LhsScalar>::type _LhsPacket; @@ -186,36 +397,67 @@ public: { p = pset1<ResPacket>(ResScalar(0)); } - - EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b) + + EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) + { + pbroadcast4(b, b0, b1, b2, b3); + } + +// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) +// { +// pbroadcast2(b, b0, b1); +// } + + template<typename RhsPacketType> + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const + { + dest = pset1<RhsPacketType>(*b); + } + + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { - for(DenseIndex k=0; k<n; k++) - pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]); + dest = ploadquad<RhsPacket>(b); } - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const + template<typename LhsPacketType> + EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacketType& dest) const { - dest = pload<RhsPacket>(b); + dest = pload<LhsPacketType>(a); } - EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const + template<typename LhsPacketType> + EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const { - dest = pload<LhsPacket>(a); + dest = ploadu<LhsPacketType>(a); } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, AccPacket& tmp) const + template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType> + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, AccPacketType& tmp) const { - tmp = b; tmp = pmul(a,tmp); c = padd(c,tmp); + conj_helper<LhsPacketType,RhsPacketType,ConjLhs,ConjRhs> cj; + // It would be a lot cleaner to call pmadd all the time. Unfortunately if we + // let gcc allocate the register in which to store the result of the pmul + // (in the case where there is no FMA) gcc fails to figure out how to avoid + // spilling register. +#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD + EIGEN_UNUSED_VARIABLE(tmp); + c = cj.pmadd(a,b,c); +#else + tmp = b; tmp = cj.pmul(a,tmp); c = padd(c,tmp); +#endif } EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const { r = pmadd(c,alpha,r); } + + template<typename ResPacketHalf> + EIGEN_STRONG_INLINE void acc(const ResPacketHalf& c, const ResPacketHalf& alpha, ResPacketHalf& r) const + { + r = pmadd(c,alpha,r); + } -protected: -// conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj; -// conj_helper<LhsPacket,RhsPacket,ConjLhs,ConjRhs> pcj; }; template<typename RealScalar, bool _ConjLhs> @@ -224,7 +466,7 @@ class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false> public: typedef std::complex<RealScalar> LhsScalar; typedef RealScalar RhsScalar; - typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar; + typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar; enum { ConjLhs = _ConjLhs, @@ -235,12 +477,16 @@ public: ResPacketSize = Vectorizable ? 
packet_traits<ResScalar>::size : 1, NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, - nr = NumberOfRegisters/4, - mr = 2 * LhsPacketSize, - WorkSpaceFactor = nr*RhsPacketSize, + nr = 4, +#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) + // we assume 16 registers + mr = 3*LhsPacketSize, +#else + mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize, +#endif LhsProgress = LhsPacketSize, - RhsProgress = RhsPacketSize + RhsProgress = 1 }; typedef typename packet_traits<LhsScalar>::type _LhsPacket; @@ -258,15 +504,14 @@ public: p = pset1<ResPacket>(ResScalar(0)); } - EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b) + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { - for(DenseIndex k=0; k<n; k++) - pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]); + dest = pset1<RhsPacket>(*b); } - - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const + + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { - dest = pload<RhsPacket>(b); + dest = pset1<RhsPacket>(*b); } EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const @@ -274,6 +519,21 @@ public: dest = pload<LhsPacket>(a); } + EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const + { + dest = ploadu<LhsPacket>(a); + } + + EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) + { + pbroadcast4(b, b0, b1, b2, b3); + } + +// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) +// { +// pbroadcast2(b, b0, b1); +// } + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const { madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type()); @@ -281,7 +541,12 @@ public: EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const { +#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD + EIGEN_UNUSED_VARIABLE(tmp); + c.v = pmadd(a.v,b,c.v); +#else tmp = b; tmp = pmul(a.v,tmp); c.v = padd(c.v,tmp); +#endif } EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const @@ -298,6 +563,38 @@ protected: conj_helper<ResPacket,ResPacket,ConjLhs,false> cj; }; +template<typename Packet> +struct DoublePacket +{ + Packet first; + Packet second; +}; + +template<typename Packet> +DoublePacket<Packet> padd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b) +{ + DoublePacket<Packet> res; + res.first = padd(a.first, b.first); + res.second = padd(a.second,b.second); + return res; +} + +template<typename Packet> +const DoublePacket<Packet>& predux_downto4(const DoublePacket<Packet> &a) +{ + return a; +} + +template<typename Packet> struct unpacket_traits<DoublePacket<Packet> > { typedef DoublePacket<Packet> half; }; +// template<typename Packet> +// DoublePacket<Packet> pmadd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b) +// { +// DoublePacket<Packet> res; +// res.first = padd(a.first, b.first); +// res.second = padd(a.second,b.second); +// return res; +// } + template<typename RealScalar, bool _ConjLhs, bool _ConjRhs> class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs > { @@ -314,60 +611,80 @@ public: && packet_traits<Scalar>::Vectorizable, RealPacketSize = 
Vectorizable ? packet_traits<RealScalar>::size : 1, ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1, - - nr = 2, - mr = 2 * ResPacketSize, - WorkSpaceFactor = Vectorizable ? 2*nr*RealPacketSize : nr, + LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1, + RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1, + + // FIXME: should depend on NumberOfRegisters + nr = 4, + mr = ResPacketSize, LhsProgress = ResPacketSize, - RhsProgress = Vectorizable ? 2*ResPacketSize : 1 + RhsProgress = 1 }; typedef typename packet_traits<RealScalar>::type RealPacket; typedef typename packet_traits<Scalar>::type ScalarPacket; - struct DoublePacket - { - RealPacket first; - RealPacket second; - }; + typedef DoublePacket<RealPacket> DoublePacketType; typedef typename conditional<Vectorizable,RealPacket, Scalar>::type LhsPacket; - typedef typename conditional<Vectorizable,DoublePacket,Scalar>::type RhsPacket; + typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type RhsPacket; typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type ResPacket; - typedef typename conditional<Vectorizable,DoublePacket,Scalar>::type AccPacket; + typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type AccPacket; EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); } - EIGEN_STRONG_INLINE void initAcc(DoublePacket& p) + EIGEN_STRONG_INLINE void initAcc(DoublePacketType& p) { p.first = pset1<RealPacket>(RealScalar(0)); p.second = pset1<RealPacket>(RealScalar(0)); } - /* Unpack the rhs coeff such that each complex coefficient is spread into - * two packects containing respectively the real and imaginary coefficient - * duplicated as many time as needed: (x+iy) => [x, ..., x] [y, ..., y] - */ - EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const Scalar* rhs, Scalar* b) + // Scalar path + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const { - for(DenseIndex k=0; k<n; k++) - { - if(Vectorizable) - { - pstore1<RealPacket>((RealScalar*)&b[k*ResPacketSize*2+0], real(rhs[k])); - pstore1<RealPacket>((RealScalar*)&b[k*ResPacketSize*2+ResPacketSize], imag(rhs[k])); - } - else - b[k] = rhs[k]; - } + dest = pset1<ResPacket>(*b); } - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const { dest = *b; } - - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket& dest) const + // Vectorized path + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacketType& dest) const + { + dest.first = pset1<RealPacket>(real(*b)); + dest.second = pset1<RealPacket>(imag(*b)); + } + + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const + { + loadRhs(b,dest); + } + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const + { + eigen_internal_assert(unpacket_traits<ScalarPacket>::size<=4); + loadRhs(b,dest); + } + + EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) { - dest.first = pload<RealPacket>((const RealScalar*)b); - dest.second = pload<RealPacket>((const RealScalar*)(b+ResPacketSize)); + // FIXME not sure that's the best way to implement it! + loadRhs(b+0, b0); + loadRhs(b+1, b1); + loadRhs(b+2, b2); + loadRhs(b+3, b3); + } + + // Vectorized path + EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, DoublePacketType& b0, DoublePacketType& b1) + { + // FIXME not sure that's the best way to implement it! 
+ loadRhs(b+0, b0); + loadRhs(b+1, b1); + } + + // Scalar path + EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsScalar& b0, RhsScalar& b1) + { + // FIXME not sure that's the best way to implement it! + loadRhs(b+0, b0); + loadRhs(b+1, b1); } // nothing special here @@ -376,7 +693,12 @@ public: dest = pload<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a)); } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacket& c, RhsPacket& /*tmp*/) const + EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const + { + dest = ploadu<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a)); + } + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacketType& c, RhsPacket& /*tmp*/) const { c.first = padd(pmul(a,b.first), c.first); c.second = padd(pmul(a,b.second),c.second); @@ -389,7 +711,7 @@ public: EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; } - EIGEN_STRONG_INLINE void acc(const DoublePacket& c, const ResPacket& alpha, ResPacket& r) const + EIGEN_STRONG_INLINE void acc(const DoublePacketType& c, const ResPacket& alpha, ResPacket& r) const { // assemble c ResPacket tmp; @@ -440,12 +762,12 @@ public: ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1, NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, + // FIXME: should depend on NumberOfRegisters nr = 4, - mr = 2*ResPacketSize, - WorkSpaceFactor = nr*RhsPacketSize, + mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*ResPacketSize, LhsProgress = ResPacketSize, - RhsProgress = ResPacketSize + RhsProgress = 1 }; typedef typename packet_traits<LhsScalar>::type _LhsPacket; @@ -463,21 +785,38 @@ public: p = pset1<ResPacket>(ResScalar(0)); } - EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b) + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { - for(DenseIndex k=0; k<n; k++) - pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]); + dest = pset1<RhsPacket>(*b); } - - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const + + void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) { - dest = pload<RhsPacket>(b); + pbroadcast4(b, b0, b1, b2, b3); } + +// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) +// { +// // FIXME not sure that's the best way to implement it! 
+// b0 = pload1<RhsPacket>(b+0); +// b1 = pload1<RhsPacket>(b+1); +// } EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const { dest = ploaddup<LhsPacket>(a); } + + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const + { + eigen_internal_assert(unpacket_traits<RhsPacket>::size<=4); + loadRhs(b,dest); + } + + EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const + { + dest = ploaddup<LhsPacket>(a); + } EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const { @@ -486,7 +825,13 @@ public: EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const { +#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD + EIGEN_UNUSED_VARIABLE(tmp); + c.v = pmadd(a,b.v,c.v); +#else tmp = b; tmp.v = pmul(a,tmp.v); c = padd(c,tmp); +#endif + } EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const @@ -510,7 +855,7 @@ protected: * |real |cplx | no vectorization yet, would require to pack A with duplication * |cplx |real | easy vectorization */ -template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs> +template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs> struct gebp_kernel { typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits; @@ -520,6 +865,15 @@ struct gebp_kernel typedef typename Traits::ResPacket ResPacket; typedef typename Traits::AccPacket AccPacket; + typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs> SwappedTraits; + typedef typename SwappedTraits::ResScalar SResScalar; + typedef typename SwappedTraits::LhsPacket SLhsPacket; + typedef typename SwappedTraits::RhsPacket SRhsPacket; + typedef typename SwappedTraits::ResPacket SResPacket; + typedef typename SwappedTraits::AccPacket SAccPacket; + + typedef typename DataMapper::LinearMapper LinearMapper; + enum { Vectorizable = Traits::Vectorizable, LhsProgress = Traits::LhsProgress, @@ -528,571 +882,788 @@ struct gebp_kernel }; EIGEN_DONT_INLINE - void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha, - Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0, RhsScalar* unpackedB=0); + void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, + Index rows, Index depth, Index cols, ResScalar alpha, + Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); }; -template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs> +template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs> EIGEN_DONT_INLINE -void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs> - ::operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha, - Index strideA, Index strideB, Index offsetA, Index offsetB, RhsScalar* unpackedB) +void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,ConjugateRhs> + ::operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, + Index rows, Index depth, Index cols, ResScalar 
alpha,
+                 Index strideA, Index strideB, Index offsetA, Index offsetB)
 {
   Traits traits;
+  SwappedTraits straits;
+
   if(strideA==-1) strideA = depth;
   if(strideB==-1) strideB = depth;
   conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
-//   conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
-  Index packet_cols = (cols/nr) * nr;
-  const Index peeled_mc = (rows/mr)*mr;
-  // FIXME:
-  const Index peeled_mc2 = peeled_mc + (rows-peeled_mc >= LhsProgress ? LhsProgress : 0);
-  const Index peeled_kc = (depth/4)*4;
-
-  if(unpackedB==0)
-    unpackedB = const_cast<RhsScalar*>(blockB - strideB * nr * RhsProgress);
-
-  // loops on each micro vertical panel of rhs (depth x nr)
-  for(Index j2=0; j2<packet_cols; j2+=nr)
+  Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
+  const Index peeled_mc3 = mr>=3*Traits::LhsProgress ? (rows/(3*LhsProgress))*(3*LhsProgress) : 0;
+  const Index peeled_mc2 = mr>=2*Traits::LhsProgress ? peeled_mc3+((rows-peeled_mc3)/(2*LhsProgress))*(2*LhsProgress) : 0;
+  const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? (rows/(1*LhsProgress))*(1*LhsProgress) : 0;
+  enum { pk = 8 }; // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell)
+  const Index peeled_kc = depth & ~(pk-1);
+  const Index prefetch_res_offset = 32/sizeof(ResScalar);
+//   const Index depth2 = depth & ~1;
+
+  //---------- Process 3 * LhsProgress rows at once ----------
+  // This corresponds to 3*LhsProgress x nr register blocks.
+  // Usually this makes sense only with FMA.
+  if(mr>=3*Traits::LhsProgress)
+  {
-      traits.unpackRhs(depth*nr,&blockB[j2*strideB+offsetB*nr],unpackedB);
-
-    // loops on each largest micro horizontal panel of lhs (mr x depth)
-    // => we select a mr x nr micro block of res which is entirely
-    //    stored into mr/packet_size x nr registers.
-    for(Index i=0; i<peeled_mc; i+=mr)
+    // Here, the general idea is to loop on each largest micro horizontal panel of the lhs (3*Traits::LhsProgress x depth)
+    // and on each largest micro vertical panel of the rhs (depth * nr).
+    // The blocking sizes, i.e., 'depth', have been computed so that the micro horizontal panel of the lhs fits in L1.
+    // However, if depth is too small, we can extend the number of rows of these horizontal panels.
+    // The actual number of rows is computed as follows:
+    const Index l1 = defaultL1CacheSize; // in bytes; TODO: l1 should be passed to this function.
+    // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size
+    // suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only guess),
+    // or because we are testing specific blocking sizes.
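Editorial aside: the statement that follows computes actual_panel_rows. A worked instance under assumed AVX float traits (LhsProgress = 8, mr = 24, nr = 4, sizeof(float) = 4) and l1 = 32768 bytes:

    depth = 256:  (32768 - 4*24*4 - 256*4*4) / (256*4*3*8) = 28288/24576 = 1   ->  24 rows
    depth =  32:  (32768 - 4*24*4 -  32*4*4) / ( 32*4*3*8) = 31872/ 3072 = 10  -> 240 rows

That is, the smaller the depth, the more lhs rows each panel sweep covers.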
+ const Index actual_panel_rows = (3*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 3*LhsProgress) )); + for(Index i1=0; i1<peeled_mc3; i1+=actual_panel_rows) { - const LhsScalar* blA = &blockA[i*strideA+offsetA*mr]; - prefetch(&blA[0]); - - // gets res block as register - AccPacket C0, C1, C2, C3, C4, C5, C6, C7; - traits.initAcc(C0); - traits.initAcc(C1); - if(nr==4) traits.initAcc(C2); - if(nr==4) traits.initAcc(C3); - traits.initAcc(C4); - traits.initAcc(C5); - if(nr==4) traits.initAcc(C6); - if(nr==4) traits.initAcc(C7); - - ResScalar* r0 = &res[(j2+0)*resStride + i]; - ResScalar* r1 = r0 + resStride; - ResScalar* r2 = r1 + resStride; - ResScalar* r3 = r2 + resStride; - - prefetch(r0+16); - prefetch(r1+16); - prefetch(r2+16); - prefetch(r3+16); - - // performs "inner" product - // TODO let's check wether the folowing peeled loop could not be - // optimized via optimal prefetching from one loop to the other - const RhsScalar* blB = unpackedB; - for(Index k=0; k<peeled_kc; k+=4) + const Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc3); + for(Index j2=0; j2<packet_cols4; j2+=nr) { - if(nr==2) - { - LhsPacket A0, A1; - RhsPacket B_0; - RhsPacket T0; - -EIGEN_ASM_COMMENT("mybegin2"); - traits.loadLhs(&blA[0*LhsProgress], A0); - traits.loadLhs(&blA[1*LhsProgress], A1); - traits.loadRhs(&blB[0*RhsProgress], B_0); - traits.madd(A0,B_0,C0,T0); - traits.madd(A1,B_0,C4,B_0); - traits.loadRhs(&blB[1*RhsProgress], B_0); - traits.madd(A0,B_0,C1,T0); - traits.madd(A1,B_0,C5,B_0); - - traits.loadLhs(&blA[2*LhsProgress], A0); - traits.loadLhs(&blA[3*LhsProgress], A1); - traits.loadRhs(&blB[2*RhsProgress], B_0); - traits.madd(A0,B_0,C0,T0); - traits.madd(A1,B_0,C4,B_0); - traits.loadRhs(&blB[3*RhsProgress], B_0); - traits.madd(A0,B_0,C1,T0); - traits.madd(A1,B_0,C5,B_0); - - traits.loadLhs(&blA[4*LhsProgress], A0); - traits.loadLhs(&blA[5*LhsProgress], A1); - traits.loadRhs(&blB[4*RhsProgress], B_0); - traits.madd(A0,B_0,C0,T0); - traits.madd(A1,B_0,C4,B_0); - traits.loadRhs(&blB[5*RhsProgress], B_0); - traits.madd(A0,B_0,C1,T0); - traits.madd(A1,B_0,C5,B_0); - - traits.loadLhs(&blA[6*LhsProgress], A0); - traits.loadLhs(&blA[7*LhsProgress], A1); - traits.loadRhs(&blB[6*RhsProgress], B_0); - traits.madd(A0,B_0,C0,T0); - traits.madd(A1,B_0,C4,B_0); - traits.loadRhs(&blB[7*RhsProgress], B_0); - traits.madd(A0,B_0,C1,T0); - traits.madd(A1,B_0,C5,B_0); -EIGEN_ASM_COMMENT("myend"); - } - else + for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress) { -EIGEN_ASM_COMMENT("mybegin4"); - LhsPacket A0, A1; - RhsPacket B_0, B1, B2, B3; - RhsPacket T0; - - traits.loadLhs(&blA[0*LhsProgress], A0); - traits.loadLhs(&blA[1*LhsProgress], A1); - traits.loadRhs(&blB[0*RhsProgress], B_0); - traits.loadRhs(&blB[1*RhsProgress], B1); - - traits.madd(A0,B_0,C0,T0); - traits.loadRhs(&blB[2*RhsProgress], B2); - traits.madd(A1,B_0,C4,B_0); - traits.loadRhs(&blB[3*RhsProgress], B3); - traits.loadRhs(&blB[4*RhsProgress], B_0); - traits.madd(A0,B1,C1,T0); - traits.madd(A1,B1,C5,B1); - traits.loadRhs(&blB[5*RhsProgress], B1); - traits.madd(A0,B2,C2,T0); - traits.madd(A1,B2,C6,B2); - traits.loadRhs(&blB[6*RhsProgress], B2); - traits.madd(A0,B3,C3,T0); - traits.loadLhs(&blA[2*LhsProgress], A0); - traits.madd(A1,B3,C7,B3); - traits.loadLhs(&blA[3*LhsProgress], A1); - traits.loadRhs(&blB[7*RhsProgress], B3); - traits.madd(A0,B_0,C0,T0); - traits.madd(A1,B_0,C4,B_0); - traits.loadRhs(&blB[8*RhsProgress], B_0); - traits.madd(A0,B1,C1,T0); - 
traits.madd(A1,B1,C5,B1); - traits.loadRhs(&blB[9*RhsProgress], B1); - traits.madd(A0,B2,C2,T0); - traits.madd(A1,B2,C6,B2); - traits.loadRhs(&blB[10*RhsProgress], B2); - traits.madd(A0,B3,C3,T0); - traits.loadLhs(&blA[4*LhsProgress], A0); - traits.madd(A1,B3,C7,B3); - traits.loadLhs(&blA[5*LhsProgress], A1); - traits.loadRhs(&blB[11*RhsProgress], B3); - - traits.madd(A0,B_0,C0,T0); - traits.madd(A1,B_0,C4,B_0); - traits.loadRhs(&blB[12*RhsProgress], B_0); - traits.madd(A0,B1,C1,T0); - traits.madd(A1,B1,C5,B1); - traits.loadRhs(&blB[13*RhsProgress], B1); - traits.madd(A0,B2,C2,T0); - traits.madd(A1,B2,C6,B2); - traits.loadRhs(&blB[14*RhsProgress], B2); - traits.madd(A0,B3,C3,T0); - traits.loadLhs(&blA[6*LhsProgress], A0); - traits.madd(A1,B3,C7,B3); - traits.loadLhs(&blA[7*LhsProgress], A1); - traits.loadRhs(&blB[15*RhsProgress], B3); - traits.madd(A0,B_0,C0,T0); - traits.madd(A1,B_0,C4,B_0); - traits.madd(A0,B1,C1,T0); - traits.madd(A1,B1,C5,B1); - traits.madd(A0,B2,C2,T0); - traits.madd(A1,B2,C6,B2); - traits.madd(A0,B3,C3,T0); - traits.madd(A1,B3,C7,B3); - } + + // We selected a 3*Traits::LhsProgress x nr micro block of res which is entirely + // stored into 3 x nr registers. + + const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*LhsProgress)]; + prefetch(&blA[0]); + + // gets res block as register + AccPacket C0, C1, C2, C3, + C4, C5, C6, C7, + C8, C9, C10, C11; + traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3); + traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7); + traits.initAcc(C8); traits.initAcc(C9); traits.initAcc(C10); traits.initAcc(C11); + + LinearMapper r0 = res.getLinearMapper(i, j2 + 0); + LinearMapper r1 = res.getLinearMapper(i, j2 + 1); + LinearMapper r2 = res.getLinearMapper(i, j2 + 2); + LinearMapper r3 = res.getLinearMapper(i, j2 + 3); + + r0.prefetch(0); + r1.prefetch(0); + r2.prefetch(0); + r3.prefetch(0); + + // performs "inner" products + const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; + prefetch(&blB[0]); + LhsPacket A0, A1; - blB += 4*nr*RhsProgress; - blA += 4*mr; - } - // process remaining peeled loop - for(Index k=peeled_kc; k<depth; k++) - { - if(nr==2) + for(Index k=0; k<peeled_kc; k+=pk) { - LhsPacket A0, A1; - RhsPacket B_0; - RhsPacket T0; - - traits.loadLhs(&blA[0*LhsProgress], A0); - traits.loadLhs(&blA[1*LhsProgress], A1); - traits.loadRhs(&blB[0*RhsProgress], B_0); - traits.madd(A0,B_0,C0,T0); - traits.madd(A1,B_0,C4,B_0); - traits.loadRhs(&blB[1*RhsProgress], B_0); - traits.madd(A0,B_0,C1,T0); - traits.madd(A1,B_0,C5,B_0); + EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX4"); + RhsPacket B_0, T0; + LhsPacket A2; + +#define EIGEN_GEBP_ONESTEP(K) \ + do { \ + EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \ + EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \ + internal::prefetch(blA+(3*K+16)*LhsProgress); \ + if (EIGEN_ARCH_ARM) { internal::prefetch(blB+(4*K+16)*RhsProgress); } /* Bug 953 */ \ + traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \ + traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \ + traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \ + traits.loadRhs(blB + (0+4*K)*Traits::RhsProgress, B_0); \ + traits.madd(A0, B_0, C0, T0); \ + traits.madd(A1, B_0, C4, T0); \ + traits.madd(A2, B_0, C8, B_0); \ + traits.loadRhs(blB + (1+4*K)*Traits::RhsProgress, B_0); \ + traits.madd(A0, B_0, C1, T0); \ + traits.madd(A1, B_0, C5, T0); \ + traits.madd(A2, B_0, C9, B_0); \ + traits.loadRhs(blB + (2+4*K)*Traits::RhsProgress, B_0); \ + traits.madd(A0, B_0, C2, T0); \ 
+ traits.madd(A1, B_0, C6, T0); \ + traits.madd(A2, B_0, C10, B_0); \ + traits.loadRhs(blB + (3+4*K)*Traits::RhsProgress, B_0); \ + traits.madd(A0, B_0, C3 , T0); \ + traits.madd(A1, B_0, C7, T0); \ + traits.madd(A2, B_0, C11, B_0); \ + EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \ + } while(false) + + internal::prefetch(blB); + EIGEN_GEBP_ONESTEP(0); + EIGEN_GEBP_ONESTEP(1); + EIGEN_GEBP_ONESTEP(2); + EIGEN_GEBP_ONESTEP(3); + EIGEN_GEBP_ONESTEP(4); + EIGEN_GEBP_ONESTEP(5); + EIGEN_GEBP_ONESTEP(6); + EIGEN_GEBP_ONESTEP(7); + + blB += pk*4*RhsProgress; + blA += pk*3*Traits::LhsProgress; + + EIGEN_ASM_COMMENT("end gebp micro kernel 3pX4"); } - else + // process remaining peeled loop + for(Index k=peeled_kc; k<depth; k++) { - LhsPacket A0, A1; - RhsPacket B_0, B1, B2, B3; - RhsPacket T0; - - traits.loadLhs(&blA[0*LhsProgress], A0); - traits.loadLhs(&blA[1*LhsProgress], A1); - traits.loadRhs(&blB[0*RhsProgress], B_0); - traits.loadRhs(&blB[1*RhsProgress], B1); - - traits.madd(A0,B_0,C0,T0); - traits.loadRhs(&blB[2*RhsProgress], B2); - traits.madd(A1,B_0,C4,B_0); - traits.loadRhs(&blB[3*RhsProgress], B3); - traits.madd(A0,B1,C1,T0); - traits.madd(A1,B1,C5,B1); - traits.madd(A0,B2,C2,T0); - traits.madd(A1,B2,C6,B2); - traits.madd(A0,B3,C3,T0); - traits.madd(A1,B3,C7,B3); + RhsPacket B_0, T0; + LhsPacket A2; + EIGEN_GEBP_ONESTEP(0); + blB += 4*RhsProgress; + blA += 3*Traits::LhsProgress; } - blB += nr*RhsProgress; - blA += mr; - } +#undef EIGEN_GEBP_ONESTEP - if(nr==4) - { - ResPacket R0, R1, R2, R3, R4, R5, R6; + ResPacket R0, R1, R2; ResPacket alphav = pset1<ResPacket>(alpha); - R0 = ploadu<ResPacket>(r0); - R1 = ploadu<ResPacket>(r1); - R2 = ploadu<ResPacket>(r2); - R3 = ploadu<ResPacket>(r3); - R4 = ploadu<ResPacket>(r0 + ResPacketSize); - R5 = ploadu<ResPacket>(r1 + ResPacketSize); - R6 = ploadu<ResPacket>(r2 + ResPacketSize); + R0 = r0.loadPacket(0 * Traits::ResPacketSize); + R1 = r0.loadPacket(1 * Traits::ResPacketSize); + R2 = r0.loadPacket(2 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); - pstoreu(r0, R0); - R0 = ploadu<ResPacket>(r3 + ResPacketSize); - - traits.acc(C1, alphav, R1); - traits.acc(C2, alphav, R2); - traits.acc(C3, alphav, R3); - traits.acc(C4, alphav, R4); - traits.acc(C5, alphav, R5); - traits.acc(C6, alphav, R6); - traits.acc(C7, alphav, R0); - - pstoreu(r1, R1); - pstoreu(r2, R2); - pstoreu(r3, R3); - pstoreu(r0 + ResPacketSize, R4); - pstoreu(r1 + ResPacketSize, R5); - pstoreu(r2 + ResPacketSize, R6); - pstoreu(r3 + ResPacketSize, R0); + traits.acc(C4, alphav, R1); + traits.acc(C8, alphav, R2); + r0.storePacket(0 * Traits::ResPacketSize, R0); + r0.storePacket(1 * Traits::ResPacketSize, R1); + r0.storePacket(2 * Traits::ResPacketSize, R2); + + R0 = r1.loadPacket(0 * Traits::ResPacketSize); + R1 = r1.loadPacket(1 * Traits::ResPacketSize); + R2 = r1.loadPacket(2 * Traits::ResPacketSize); + traits.acc(C1, alphav, R0); + traits.acc(C5, alphav, R1); + traits.acc(C9, alphav, R2); + r1.storePacket(0 * Traits::ResPacketSize, R0); + r1.storePacket(1 * Traits::ResPacketSize, R1); + r1.storePacket(2 * Traits::ResPacketSize, R2); + + R0 = r2.loadPacket(0 * Traits::ResPacketSize); + R1 = r2.loadPacket(1 * Traits::ResPacketSize); + R2 = r2.loadPacket(2 * Traits::ResPacketSize); + traits.acc(C2, alphav, R0); + traits.acc(C6, alphav, R1); + traits.acc(C10, alphav, R2); + r2.storePacket(0 * Traits::ResPacketSize, R0); + r2.storePacket(1 * Traits::ResPacketSize, R1); + r2.storePacket(2 * Traits::ResPacketSize, R2); + + R0 = r3.loadPacket(0 * Traits::ResPacketSize); + 
R1 = r3.loadPacket(1 * Traits::ResPacketSize); + R2 = r3.loadPacket(2 * Traits::ResPacketSize); + traits.acc(C3, alphav, R0); + traits.acc(C7, alphav, R1); + traits.acc(C11, alphav, R2); + r3.storePacket(0 * Traits::ResPacketSize, R0); + r3.storePacket(1 * Traits::ResPacketSize, R1); + r3.storePacket(2 * Traits::ResPacketSize, R2); + } } - else + + // Deal with remaining columns of the rhs + for(Index j2=packet_cols4; j2<cols; j2++) { - ResPacket R0, R1, R4; + for(Index i=i1; i<actual_panel_end; i+=3*LhsProgress) + { + // One column at a time + const LhsScalar* blA = &blockA[i*strideA+offsetA*(3*Traits::LhsProgress)]; + prefetch(&blA[0]); + + // gets res block as register + AccPacket C0, C4, C8; + traits.initAcc(C0); + traits.initAcc(C4); + traits.initAcc(C8); + + LinearMapper r0 = res.getLinearMapper(i, j2); + r0.prefetch(0); + + // performs "inner" products + const RhsScalar* blB = &blockB[j2*strideB+offsetB]; + LhsPacket A0, A1, A2; + + for(Index k=0; k<peeled_kc; k+=pk) + { + EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX1"); + RhsPacket B_0; +#define EIGEN_GEBGP_ONESTEP(K) \ + do { \ + EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \ + EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \ + traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \ + traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \ + traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \ + traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \ + traits.madd(A0, B_0, C0, B_0); \ + traits.madd(A1, B_0, C4, B_0); \ + traits.madd(A2, B_0, C8, B_0); \ + EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \ + } while(false) + + EIGEN_GEBGP_ONESTEP(0); + EIGEN_GEBGP_ONESTEP(1); + EIGEN_GEBGP_ONESTEP(2); + EIGEN_GEBGP_ONESTEP(3); + EIGEN_GEBGP_ONESTEP(4); + EIGEN_GEBGP_ONESTEP(5); + EIGEN_GEBGP_ONESTEP(6); + EIGEN_GEBGP_ONESTEP(7); + + blB += pk*RhsProgress; + blA += pk*3*Traits::LhsProgress; + + EIGEN_ASM_COMMENT("end gebp micro kernel 3pX1"); + } + + // process remaining peeled loop + for(Index k=peeled_kc; k<depth; k++) + { + RhsPacket B_0; + EIGEN_GEBGP_ONESTEP(0); + blB += RhsProgress; + blA += 3*Traits::LhsProgress; + } +#undef EIGEN_GEBGP_ONESTEP + ResPacket R0, R1, R2; ResPacket alphav = pset1<ResPacket>(alpha); - R0 = ploadu<ResPacket>(r0); - R1 = ploadu<ResPacket>(r1); - R4 = ploadu<ResPacket>(r0 + ResPacketSize); + R0 = r0.loadPacket(0 * Traits::ResPacketSize); + R1 = r0.loadPacket(1 * Traits::ResPacketSize); + R2 = r0.loadPacket(2 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); - pstoreu(r0, R0); - R0 = ploadu<ResPacket>(r1 + ResPacketSize); - traits.acc(C1, alphav, R1); - traits.acc(C4, alphav, R4); - traits.acc(C5, alphav, R0); - pstoreu(r1, R1); - pstoreu(r0 + ResPacketSize, R4); - pstoreu(r1 + ResPacketSize, R0); + traits.acc(C4, alphav, R1); + traits.acc(C8, alphav, R2); + r0.storePacket(0 * Traits::ResPacketSize, R0); + r0.storePacket(1 * Traits::ResPacketSize, R1); + r0.storePacket(2 * Traits::ResPacketSize, R2); + } } - } - - if(rows-peeled_mc>=LhsProgress) + } + + //---------- Process 2 * LhsProgress rows at once ---------- + if(mr>=2*Traits::LhsProgress) + { + const Index l1 = defaultL1CacheSize; // in Bytes, TODO, l1 should be passed to this function. + // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size + // suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only guess), + // or because we are testing specific blocking sizes. 
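To see what the panel-height formula just below actually yields, here is a minimal self-contained sketch; the cache and blocking values (32 KiB L1, float scalars, mr = 8, nr = 4, depth = 256, LhsProgress = 4) are illustrative assumptions, not values taken from this patch:

#include <algorithm>
#include <cstdio>
int main() {
  typedef long Index;
  // Assumed parameters, for illustration only.
  const Index l1 = 32*1024;       // L1 data cache size in bytes
  const Index mr = 8, nr = 4;     // register-level blocking sizes
  const Index depth = 256;        // k-dimension of the packed panels
  const Index LhsProgress = 4;    // e.g. one 4-float SIMD packet
  const Index sRes = 4, sLhs = 4, sRhs = 4;  // sizeof(float)
  // Same shape as the kernel's expression: how many 2*LhsProgress-row strips
  // of packed lhs fit in L1 next to one depth x nr rhs panel and the mr x nr
  // result tile; max(1,...) guards against a too-small or mis-detected l1.
  Index actual_panel_rows = (2*LhsProgress) *
      std::max<Index>(1, (l1 - sRes*mr*nr - depth*nr*sRhs)
                           / (depth * sLhs * 2*LhsProgress));
  std::printf("actual_panel_rows = %ld\n", actual_panel_rows);  // 24 here
}

With these numbers the panel spans 24 rows, i.e. three 2*LhsProgress strips of the packed lhs stay L1-resident together with one rhs panel.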
+ Index actual_panel_rows = (2*LhsProgress) * std::max<Index>(1,( (l1 - sizeof(ResScalar)*mr*nr - depth*nr*sizeof(RhsScalar)) / (depth * sizeof(LhsScalar) * 2*LhsProgress) )); + + for(Index i1=peeled_mc3; i1<peeled_mc2; i1+=actual_panel_rows) { - Index i = peeled_mc; - const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress]; - prefetch(&blA[0]); - - // gets res block as register - AccPacket C0, C1, C2, C3; - traits.initAcc(C0); - traits.initAcc(C1); - if(nr==4) traits.initAcc(C2); - if(nr==4) traits.initAcc(C3); - - // performs "inner" product - const RhsScalar* blB = unpackedB; - for(Index k=0; k<peeled_kc; k+=4) + Index actual_panel_end = (std::min)(i1+actual_panel_rows, peeled_mc2); + for(Index j2=0; j2<packet_cols4; j2+=nr) { - if(nr==2) + for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress) { - LhsPacket A0; - RhsPacket B_0, B1; + + // We selected a 2*Traits::LhsProgress x nr micro block of res which is entirely + // stored into 2 x nr registers. + + const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)]; + prefetch(&blA[0]); + + // gets res block as register + AccPacket C0, C1, C2, C3, + C4, C5, C6, C7; + traits.initAcc(C0); traits.initAcc(C1); traits.initAcc(C2); traits.initAcc(C3); + traits.initAcc(C4); traits.initAcc(C5); traits.initAcc(C6); traits.initAcc(C7); + + LinearMapper r0 = res.getLinearMapper(i, j2 + 0); + LinearMapper r1 = res.getLinearMapper(i, j2 + 1); + LinearMapper r2 = res.getLinearMapper(i, j2 + 2); + LinearMapper r3 = res.getLinearMapper(i, j2 + 3); + + r0.prefetch(prefetch_res_offset); + r1.prefetch(prefetch_res_offset); + r2.prefetch(prefetch_res_offset); + r3.prefetch(prefetch_res_offset); + + // performs "inner" products + const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; + prefetch(&blB[0]); + LhsPacket A0, A1; - traits.loadLhs(&blA[0*LhsProgress], A0); - traits.loadRhs(&blB[0*RhsProgress], B_0); - traits.loadRhs(&blB[1*RhsProgress], B1); - traits.madd(A0,B_0,C0,B_0); - traits.loadRhs(&blB[2*RhsProgress], B_0); - traits.madd(A0,B1,C1,B1); - traits.loadLhs(&blA[1*LhsProgress], A0); - traits.loadRhs(&blB[3*RhsProgress], B1); - traits.madd(A0,B_0,C0,B_0); - traits.loadRhs(&blB[4*RhsProgress], B_0); - traits.madd(A0,B1,C1,B1); - traits.loadLhs(&blA[2*LhsProgress], A0); - traits.loadRhs(&blB[5*RhsProgress], B1); - traits.madd(A0,B_0,C0,B_0); - traits.loadRhs(&blB[6*RhsProgress], B_0); - traits.madd(A0,B1,C1,B1); - traits.loadLhs(&blA[3*LhsProgress], A0); - traits.loadRhs(&blB[7*RhsProgress], B1); - traits.madd(A0,B_0,C0,B_0); - traits.madd(A0,B1,C1,B1); - } - else + for(Index k=0; k<peeled_kc; k+=pk) { - LhsPacket A0; - RhsPacket B_0, B1, B2, B3; - - traits.loadLhs(&blA[0*LhsProgress], A0); - traits.loadRhs(&blB[0*RhsProgress], B_0); - traits.loadRhs(&blB[1*RhsProgress], B1); - - traits.madd(A0,B_0,C0,B_0); - traits.loadRhs(&blB[2*RhsProgress], B2); - traits.loadRhs(&blB[3*RhsProgress], B3); - traits.loadRhs(&blB[4*RhsProgress], B_0); - traits.madd(A0,B1,C1,B1); - traits.loadRhs(&blB[5*RhsProgress], B1); - traits.madd(A0,B2,C2,B2); - traits.loadRhs(&blB[6*RhsProgress], B2); - traits.madd(A0,B3,C3,B3); - traits.loadLhs(&blA[1*LhsProgress], A0); - traits.loadRhs(&blB[7*RhsProgress], B3); - traits.madd(A0,B_0,C0,B_0); - traits.loadRhs(&blB[8*RhsProgress], B_0); - traits.madd(A0,B1,C1,B1); - traits.loadRhs(&blB[9*RhsProgress], B1); - traits.madd(A0,B2,C2,B2); - traits.loadRhs(&blB[10*RhsProgress], B2); - traits.madd(A0,B3,C3,B3); - traits.loadLhs(&blA[2*LhsProgress], A0); - traits.loadRhs(&blB[11*RhsProgress], B3); - - 
traits.madd(A0,B_0,C0,B_0); - traits.loadRhs(&blB[12*RhsProgress], B_0); - traits.madd(A0,B1,C1,B1); - traits.loadRhs(&blB[13*RhsProgress], B1); - traits.madd(A0,B2,C2,B2); - traits.loadRhs(&blB[14*RhsProgress], B2); - traits.madd(A0,B3,C3,B3); - - traits.loadLhs(&blA[3*LhsProgress], A0); - traits.loadRhs(&blB[15*RhsProgress], B3); - traits.madd(A0,B_0,C0,B_0); - traits.madd(A0,B1,C1,B1); - traits.madd(A0,B2,C2,B2); - traits.madd(A0,B3,C3,B3); + EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX4"); + RhsPacket B_0, B1, B2, B3, T0; + + #define EIGEN_GEBGP_ONESTEP(K) \ + do { \ + EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \ + EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \ + traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \ + traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \ + traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \ + traits.madd(A0, B_0, C0, T0); \ + traits.madd(A1, B_0, C4, B_0); \ + traits.madd(A0, B1, C1, T0); \ + traits.madd(A1, B1, C5, B1); \ + traits.madd(A0, B2, C2, T0); \ + traits.madd(A1, B2, C6, B2); \ + traits.madd(A0, B3, C3, T0); \ + traits.madd(A1, B3, C7, B3); \ + EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \ + } while(false) + + internal::prefetch(blB+(48+0)); + EIGEN_GEBGP_ONESTEP(0); + EIGEN_GEBGP_ONESTEP(1); + EIGEN_GEBGP_ONESTEP(2); + EIGEN_GEBGP_ONESTEP(3); + internal::prefetch(blB+(48+16)); + EIGEN_GEBGP_ONESTEP(4); + EIGEN_GEBGP_ONESTEP(5); + EIGEN_GEBGP_ONESTEP(6); + EIGEN_GEBGP_ONESTEP(7); + + blB += pk*4*RhsProgress; + blA += pk*(2*Traits::LhsProgress); + + EIGEN_ASM_COMMENT("end gebp micro kernel 2pX4"); } - - blB += nr*4*RhsProgress; - blA += 4*LhsProgress; - } - // process remaining peeled loop - for(Index k=peeled_kc; k<depth; k++) - { - if(nr==2) + // process remaining peeled loop + for(Index k=peeled_kc; k<depth; k++) { - LhsPacket A0; - RhsPacket B_0, B1; - - traits.loadLhs(&blA[0*LhsProgress], A0); - traits.loadRhs(&blB[0*RhsProgress], B_0); - traits.loadRhs(&blB[1*RhsProgress], B1); - traits.madd(A0,B_0,C0,B_0); - traits.madd(A0,B1,C1,B1); + RhsPacket B_0, B1, B2, B3, T0; + EIGEN_GEBGP_ONESTEP(0); + blB += 4*RhsProgress; + blA += 2*Traits::LhsProgress; } - else - { - LhsPacket A0; - RhsPacket B_0, B1, B2, B3; +#undef EIGEN_GEBGP_ONESTEP - traits.loadLhs(&blA[0*LhsProgress], A0); - traits.loadRhs(&blB[0*RhsProgress], B_0); - traits.loadRhs(&blB[1*RhsProgress], B1); - traits.loadRhs(&blB[2*RhsProgress], B2); - traits.loadRhs(&blB[3*RhsProgress], B3); + ResPacket R0, R1, R2, R3; + ResPacket alphav = pset1<ResPacket>(alpha); - traits.madd(A0,B_0,C0,B_0); - traits.madd(A0,B1,C1,B1); - traits.madd(A0,B2,C2,B2); - traits.madd(A0,B3,C3,B3); + R0 = r0.loadPacket(0 * Traits::ResPacketSize); + R1 = r0.loadPacket(1 * Traits::ResPacketSize); + R2 = r1.loadPacket(0 * Traits::ResPacketSize); + R3 = r1.loadPacket(1 * Traits::ResPacketSize); + traits.acc(C0, alphav, R0); + traits.acc(C4, alphav, R1); + traits.acc(C1, alphav, R2); + traits.acc(C5, alphav, R3); + r0.storePacket(0 * Traits::ResPacketSize, R0); + r0.storePacket(1 * Traits::ResPacketSize, R1); + r1.storePacket(0 * Traits::ResPacketSize, R2); + r1.storePacket(1 * Traits::ResPacketSize, R3); + + R0 = r2.loadPacket(0 * Traits::ResPacketSize); + R1 = r2.loadPacket(1 * Traits::ResPacketSize); + R2 = r3.loadPacket(0 * Traits::ResPacketSize); + R3 = r3.loadPacket(1 * Traits::ResPacketSize); + traits.acc(C2, alphav, R0); + traits.acc(C6, alphav, R1); + traits.acc(C3, alphav, R2); + traits.acc(C7, alphav, R3); + r2.storePacket(0 * 
Traits::ResPacketSize, R0); + r2.storePacket(1 * Traits::ResPacketSize, R1); + r3.storePacket(0 * Traits::ResPacketSize, R2); + r3.storePacket(1 * Traits::ResPacketSize, R3); } - - blB += nr*RhsProgress; - blA += LhsProgress; } + + // Deal with remaining columns of the rhs + for(Index j2=packet_cols4; j2<cols; j2++) + { + for(Index i=i1; i<actual_panel_end; i+=2*LhsProgress) + { + // One column at a time + const LhsScalar* blA = &blockA[i*strideA+offsetA*(2*Traits::LhsProgress)]; + prefetch(&blA[0]); - ResPacket R0, R1, R2, R3; - ResPacket alphav = pset1<ResPacket>(alpha); - - ResScalar* r0 = &res[(j2+0)*resStride + i]; - ResScalar* r1 = r0 + resStride; - ResScalar* r2 = r1 + resStride; - ResScalar* r3 = r2 + resStride; + // gets res block as register + AccPacket C0, C4; + traits.initAcc(C0); + traits.initAcc(C4); - R0 = ploadu<ResPacket>(r0); - R1 = ploadu<ResPacket>(r1); - if(nr==4) R2 = ploadu<ResPacket>(r2); - if(nr==4) R3 = ploadu<ResPacket>(r3); + LinearMapper r0 = res.getLinearMapper(i, j2); + r0.prefetch(prefetch_res_offset); - traits.acc(C0, alphav, R0); - traits.acc(C1, alphav, R1); - if(nr==4) traits.acc(C2, alphav, R2); - if(nr==4) traits.acc(C3, alphav, R3); + // performs "inner" products + const RhsScalar* blB = &blockB[j2*strideB+offsetB]; + LhsPacket A0, A1; - pstoreu(r0, R0); - pstoreu(r1, R1); - if(nr==4) pstoreu(r2, R2); - if(nr==4) pstoreu(r3, R3); - } - for(Index i=peeled_mc2; i<rows; i++) - { - const LhsScalar* blA = &blockA[i*strideA+offsetA]; - prefetch(&blA[0]); - - // gets a 1 x nr res block as registers - ResScalar C0(0), C1(0), C2(0), C3(0); - // TODO directly use blockB ??? - const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; - for(Index k=0; k<depth; k++) - { - if(nr==2) + for(Index k=0; k<peeled_kc; k+=pk) { - LhsScalar A0; - RhsScalar B_0, B1; - - A0 = blA[k]; - B_0 = blB[0]; - B1 = blB[1]; - MADD(cj,A0,B_0,C0,B_0); - MADD(cj,A0,B1,C1,B1); + EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX1"); + RhsPacket B_0, B1; + +#define EIGEN_GEBGP_ONESTEP(K) \ + do { \ + EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1"); \ + EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \ + traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \ + traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \ + traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \ + traits.madd(A0, B_0, C0, B1); \ + traits.madd(A1, B_0, C4, B_0); \ + EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \ + } while(false) + + EIGEN_GEBGP_ONESTEP(0); + EIGEN_GEBGP_ONESTEP(1); + EIGEN_GEBGP_ONESTEP(2); + EIGEN_GEBGP_ONESTEP(3); + EIGEN_GEBGP_ONESTEP(4); + EIGEN_GEBGP_ONESTEP(5); + EIGEN_GEBGP_ONESTEP(6); + EIGEN_GEBGP_ONESTEP(7); + + blB += pk*RhsProgress; + blA += pk*2*Traits::LhsProgress; + + EIGEN_ASM_COMMENT("end gebp micro kernel 2pX1"); } - else + + // process remaining peeled loop + for(Index k=peeled_kc; k<depth; k++) { - LhsScalar A0; - RhsScalar B_0, B1, B2, B3; - - A0 = blA[k]; - B_0 = blB[0]; - B1 = blB[1]; - B2 = blB[2]; - B3 = blB[3]; - - MADD(cj,A0,B_0,C0,B_0); - MADD(cj,A0,B1,C1,B1); - MADD(cj,A0,B2,C2,B2); - MADD(cj,A0,B3,C3,B3); + RhsPacket B_0, B1; + EIGEN_GEBGP_ONESTEP(0); + blB += RhsProgress; + blA += 2*Traits::LhsProgress; } +#undef EIGEN_GEBGP_ONESTEP + ResPacket R0, R1; + ResPacket alphav = pset1<ResPacket>(alpha); - blB += nr; + R0 = r0.loadPacket(0 * Traits::ResPacketSize); + R1 = r0.loadPacket(1 * Traits::ResPacketSize); + traits.acc(C0, alphav, R0); + traits.acc(C4, alphav, R1); + r0.storePacket(0 * Traits::ResPacketSize, R0); + r0.storePacket(1 * 
Traits::ResPacketSize, R1); + } } - res[(j2+0)*resStride + i] += alpha*C0; - res[(j2+1)*resStride + i] += alpha*C1; - if(nr==4) res[(j2+2)*resStride + i] += alpha*C2; - if(nr==4) res[(j2+3)*resStride + i] += alpha*C3; } } - // process remaining rhs/res columns one at a time - // => do the same but with nr==1 - for(Index j2=packet_cols; j2<cols; j2++) + //---------- Process 1 * LhsProgress rows at once ---------- + if(mr>=1*Traits::LhsProgress) { - // unpack B - traits.unpackRhs(depth, &blockB[j2*strideB+offsetB], unpackedB); - - for(Index i=0; i<peeled_mc; i+=mr) + // loops on each largest micro horizontal panel of lhs (1*LhsProgress x depth) + for(Index i=peeled_mc2; i<peeled_mc1; i+=1*LhsProgress) { - const LhsScalar* blA = &blockA[i*strideA+offsetA*mr]; - prefetch(&blA[0]); + // loops on each largest micro vertical panel of rhs (depth * nr) + for(Index j2=0; j2<packet_cols4; j2+=nr) + { + // We select a 1*Traits::LhsProgress x nr micro block of res which is entirely + // stored into 1 x nr registers. + + const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)]; + prefetch(&blA[0]); + + // gets res block as register + AccPacket C0, C1, C2, C3; + traits.initAcc(C0); + traits.initAcc(C1); + traits.initAcc(C2); + traits.initAcc(C3); + + LinearMapper r0 = res.getLinearMapper(i, j2 + 0); + LinearMapper r1 = res.getLinearMapper(i, j2 + 1); + LinearMapper r2 = res.getLinearMapper(i, j2 + 2); + LinearMapper r3 = res.getLinearMapper(i, j2 + 3); + + r0.prefetch(prefetch_res_offset); + r1.prefetch(prefetch_res_offset); + r2.prefetch(prefetch_res_offset); + r3.prefetch(prefetch_res_offset); + + // performs "inner" products + const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; + prefetch(&blB[0]); + LhsPacket A0; + + for(Index k=0; k<peeled_kc; k+=pk) + { + EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX4"); + RhsPacket B_0, B1, B2, B3; + +#define EIGEN_GEBGP_ONESTEP(K) \ + do { \ + EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX4"); \ + EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \ + traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \ + traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \ + traits.madd(A0, B_0, C0, B_0); \ + traits.madd(A0, B1, C1, B1); \ + traits.madd(A0, B2, C2, B2); \ + traits.madd(A0, B3, C3, B3); \ + EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX4"); \ + } while(false) + + internal::prefetch(blB+(48+0)); + EIGEN_GEBGP_ONESTEP(0); + EIGEN_GEBGP_ONESTEP(1); + EIGEN_GEBGP_ONESTEP(2); + EIGEN_GEBGP_ONESTEP(3); + internal::prefetch(blB+(48+16)); + EIGEN_GEBGP_ONESTEP(4); + EIGEN_GEBGP_ONESTEP(5); + EIGEN_GEBGP_ONESTEP(6); + EIGEN_GEBGP_ONESTEP(7); + + blB += pk*4*RhsProgress; + blA += pk*1*LhsProgress; + + EIGEN_ASM_COMMENT("end gebp micro kernel 1pX4"); + } + // process remaining peeled loop + for(Index k=peeled_kc; k<depth; k++) + { + RhsPacket B_0, B1, B2, B3; + EIGEN_GEBGP_ONESTEP(0); + blB += 4*RhsProgress; + blA += 1*LhsProgress; + } +#undef EIGEN_GEBGP_ONESTEP - // TODO move the res loads to the stores + ResPacket R0, R1; + ResPacket alphav = pset1<ResPacket>(alpha); - // get res block as registers - AccPacket C0, C4; - traits.initAcc(C0); - traits.initAcc(C4); + R0 = r0.loadPacket(0 * Traits::ResPacketSize); + R1 = r1.loadPacket(0 * Traits::ResPacketSize); + traits.acc(C0, alphav, R0); + traits.acc(C1, alphav, R1); + r0.storePacket(0 * Traits::ResPacketSize, R0); + r1.storePacket(0 * Traits::ResPacketSize, R1); + + R0 = r2.loadPacket(0 * Traits::ResPacketSize); + R1 = r3.loadPacket(0 * Traits::ResPacketSize); 
+ traits.acc(C2, alphav, R0); + traits.acc(C3, alphav, R1); + r2.storePacket(0 * Traits::ResPacketSize, R0); + r3.storePacket(0 * Traits::ResPacketSize, R1); + } - const RhsScalar* blB = unpackedB; - for(Index k=0; k<depth; k++) + // Deal with remaining columns of the rhs + for(Index j2=packet_cols4; j2<cols; j2++) { - LhsPacket A0, A1; - RhsPacket B_0; - RhsPacket T0; - - traits.loadLhs(&blA[0*LhsProgress], A0); - traits.loadLhs(&blA[1*LhsProgress], A1); - traits.loadRhs(&blB[0*RhsProgress], B_0); - traits.madd(A0,B_0,C0,T0); - traits.madd(A1,B_0,C4,B_0); + // One column at a time + const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)]; + prefetch(&blA[0]); - blB += RhsProgress; - blA += 2*LhsProgress; - } - ResPacket R0, R4; - ResPacket alphav = pset1<ResPacket>(alpha); + // gets res block as register + AccPacket C0; + traits.initAcc(C0); - ResScalar* r0 = &res[(j2+0)*resStride + i]; + LinearMapper r0 = res.getLinearMapper(i, j2); - R0 = ploadu<ResPacket>(r0); - R4 = ploadu<ResPacket>(r0+ResPacketSize); + // performs "inner" products + const RhsScalar* blB = &blockB[j2*strideB+offsetB]; + LhsPacket A0; - traits.acc(C0, alphav, R0); - traits.acc(C4, alphav, R4); + for(Index k=0; k<peeled_kc; k+=pk) + { + EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX1"); + RhsPacket B_0; + +#define EIGEN_GEBGP_ONESTEP(K) \ + do { \ + EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX1"); \ + EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \ + traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \ + traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \ + traits.madd(A0, B_0, C0, B_0); \ + EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX1"); \ + } while(false); + + EIGEN_GEBGP_ONESTEP(0); + EIGEN_GEBGP_ONESTEP(1); + EIGEN_GEBGP_ONESTEP(2); + EIGEN_GEBGP_ONESTEP(3); + EIGEN_GEBGP_ONESTEP(4); + EIGEN_GEBGP_ONESTEP(5); + EIGEN_GEBGP_ONESTEP(6); + EIGEN_GEBGP_ONESTEP(7); + + blB += pk*RhsProgress; + blA += pk*1*Traits::LhsProgress; + + EIGEN_ASM_COMMENT("end gebp micro kernel 1pX1"); + } - pstoreu(r0, R0); - pstoreu(r0+ResPacketSize, R4); + // process remaining peeled loop + for(Index k=peeled_kc; k<depth; k++) + { + RhsPacket B_0; + EIGEN_GEBGP_ONESTEP(0); + blB += RhsProgress; + blA += 1*Traits::LhsProgress; + } +#undef EIGEN_GEBGP_ONESTEP + ResPacket R0; + ResPacket alphav = pset1<ResPacket>(alpha); + R0 = r0.loadPacket(0 * Traits::ResPacketSize); + traits.acc(C0, alphav, R0); + r0.storePacket(0 * Traits::ResPacketSize, R0); + } } - if(rows-peeled_mc>=LhsProgress) + } + //---------- Process remaining rows, 1 at once ---------- + if(peeled_mc1<rows) + { + // loop on each panel of the rhs + for(Index j2=0; j2<packet_cols4; j2+=nr) { - Index i = peeled_mc; - const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress]; - prefetch(&blA[0]); - - AccPacket C0; - traits.initAcc(C0); - - const RhsScalar* blB = unpackedB; - for(Index k=0; k<depth; k++) + // loop on each row of the lhs (1*LhsProgress x depth) + for(Index i=peeled_mc1; i<rows; i+=1) { - LhsPacket A0; - RhsPacket B_0; - traits.loadLhs(blA, A0); - traits.loadRhs(blB, B_0); - traits.madd(A0, B_0, C0, B_0); - blB += RhsProgress; - blA += LhsProgress; + const LhsScalar* blA = &blockA[i*strideA+offsetA]; + prefetch(&blA[0]); + const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; + + // The following piece of code won't work for 512-bit registers + // Moreover, if LhsProgress==8 it assumes that there is a half packet of the same size + // as nr (which is currently 4) for the return type.
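Both branches below compute the same 1 x 4 block of the result; as a hedged reference, this is that product written with plain scalars (the function and its parameter names are hypothetical, not Eigen's):

// Scalar model: one packed lhs row times four interleaved packed rhs columns,
// accumulated and then merged into the result as res += alpha * acc.
void one_row_times_four_cols(const float* blA, const float* blB, long depth,
                             float alpha, float* r0, float* r1,
                             float* r2, float* r3) {
  float C0 = 0, C1 = 0, C2 = 0, C3 = 0;
  for (long k = 0; k < depth; ++k) {
    const float a = blA[k];        // one lhs coefficient
    C0 += a * blB[4*k + 0];        // packed rhs interleaves the 4 columns
    C1 += a * blB[4*k + 1];
    C2 += a * blB[4*k + 2];
    C3 += a * blB[4*k + 3];
  }
  *r0 += alpha * C0;  *r1 += alpha * C1;
  *r2 += alpha * C2;  *r3 += alpha * C3;
}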
+ typedef typename unpacket_traits<SResPacket>::half SResPacketHalf; + if ((SwappedTraits::LhsProgress % 4) == 0 && + (SwappedTraits::LhsProgress <= 8) && + (SwappedTraits::LhsProgress!=8 || unpacket_traits<SResPacketHalf>::size==nr)) + { + SAccPacket C0, C1, C2, C3; + straits.initAcc(C0); + straits.initAcc(C1); + straits.initAcc(C2); + straits.initAcc(C3); + + const Index spk = (std::max)(1,SwappedTraits::LhsProgress/4); + const Index endk = (depth/spk)*spk; + const Index endk4 = (depth/(spk*4))*(spk*4); + + Index k=0; + for(; k<endk4; k+=4*spk) + { + SLhsPacket A0,A1; + SRhsPacket B_0,B_1; + + straits.loadLhsUnaligned(blB+0*SwappedTraits::LhsProgress, A0); + straits.loadLhsUnaligned(blB+1*SwappedTraits::LhsProgress, A1); + + straits.loadRhsQuad(blA+0*spk, B_0); + straits.loadRhsQuad(blA+1*spk, B_1); + straits.madd(A0,B_0,C0,B_0); + straits.madd(A1,B_1,C1,B_1); + + straits.loadLhsUnaligned(blB+2*SwappedTraits::LhsProgress, A0); + straits.loadLhsUnaligned(blB+3*SwappedTraits::LhsProgress, A1); + straits.loadRhsQuad(blA+2*spk, B_0); + straits.loadRhsQuad(blA+3*spk, B_1); + straits.madd(A0,B_0,C2,B_0); + straits.madd(A1,B_1,C3,B_1); + + blB += 4*SwappedTraits::LhsProgress; + blA += 4*spk; + } + C0 = padd(padd(C0,C1),padd(C2,C3)); + for(; k<endk; k+=spk) + { + SLhsPacket A0; + SRhsPacket B_0; + + straits.loadLhsUnaligned(blB, A0); + straits.loadRhsQuad(blA, B_0); + straits.madd(A0,B_0,C0,B_0); + + blB += SwappedTraits::LhsProgress; + blA += spk; + } + if(SwappedTraits::LhsProgress==8) + { + // Special case where we have to first reduce the accumulation register C0 + typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SResPacket>::half,SResPacket>::type SResPacketHalf; + typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SLhsPacket>::half,SLhsPacket>::type SLhsPacketHalf; + typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SLhsPacket>::half,SRhsPacket>::type SRhsPacketHalf; + typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SAccPacket>::half,SAccPacket>::type SAccPacketHalf; + + SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2); + SResPacketHalf alphav = pset1<SResPacketHalf>(alpha); + + if(depth-endk>0) + { + // We have to handle the last row of the rhs which corresponds to a half-packet + SLhsPacketHalf a0; + SRhsPacketHalf b0; + straits.loadLhsUnaligned(blB, a0); + straits.loadRhs(blA, b0); + SAccPacketHalf c0 = predux_downto4(C0); + straits.madd(a0,b0,c0,b0); + straits.acc(c0, alphav, R); + } + else + { + straits.acc(predux_downto4(C0), alphav, R); + } + res.scatterPacket(i, j2, R); + } + else + { + SResPacket R = res.template gatherPacket<SResPacket>(i, j2); + SResPacket alphav = pset1<SResPacket>(alpha); + straits.acc(C0, alphav, R); + res.scatterPacket(i, j2, R); + } + } + else // scalar path + { + // get a 1 x 4 res block as registers + ResScalar C0(0), C1(0), C2(0), C3(0); + + for(Index k=0; k<depth; k++) + { + LhsScalar A0; + RhsScalar B_0, B_1; + + A0 = blA[k]; + + B_0 = blB[0]; + B_1 = blB[1]; + CJMADD(cj,A0,B_0,C0, B_0); + CJMADD(cj,A0,B_1,C1, B_1); + + B_0 = blB[2]; + B_1 = blB[3]; + CJMADD(cj,A0,B_0,C2, B_0); + CJMADD(cj,A0,B_1,C3, B_1); + + blB += 4; + } + res(i, j2 + 0) += alpha * C0; + res(i, j2 + 1) += alpha * C1; + res(i, j2 + 2) += alpha * C2; + res(i, j2 + 3) += alpha * C3; + } } - - ResPacket alphav = pset1<ResPacket>(alpha); - ResPacket R0 = ploadu<ResPacket>(&res[(j2+0)*resStride + i]); - traits.acc(C0, alphav, R0); - 
pstoreu(&res[(j2+0)*resStride + i], R0); } - for(Index i=peeled_mc2; i<rows; i++) + // remaining columns + for(Index j2=packet_cols4; j2<cols; j2++) { - const LhsScalar* blA = &blockA[i*strideA+offsetA]; - prefetch(&blA[0]); - - // gets a 1 x 1 res block as registers - ResScalar C0(0); - // FIXME directly use blockB ?? - const RhsScalar* blB = &blockB[j2*strideB+offsetB]; - for(Index k=0; k<depth; k++) + // loop on each row of the lhs (1*LhsProgress x depth) + for(Index i=peeled_mc1; i<rows; i+=1) { - LhsScalar A0 = blA[k]; - RhsScalar B_0 = blB[k]; - MADD(cj, A0, B_0, C0, B_0); + const LhsScalar* blA = &blockA[i*strideA+offsetA]; + prefetch(&blA[0]); + // gets a 1 x 1 res block as registers + ResScalar C0(0); + const RhsScalar* blB = &blockB[j2*strideB+offsetB]; + for(Index k=0; k<depth; k++) + { + LhsScalar A0 = blA[k]; + RhsScalar B_0 = blB[k]; + CJMADD(cj, A0, B_0, C0, B_0); + } + res(i, j2) += alpha * C0; } - res[(j2+0)*resStride + i] += alpha*C0; } } } @@ -1114,81 +1685,193 @@ EIGEN_ASM_COMMENT("mybegin4"); // // 32 33 34 35 ... // 36 36 38 39 ... -template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder, bool Conjugate, bool PanelMode> -struct gemm_pack_lhs +template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode> +struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode> { - EIGEN_DONT_INLINE void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride=0, Index offset=0); + typedef typename DataMapper::LinearMapper LinearMapper; + EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0); }; -template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder, bool Conjugate, bool PanelMode> -EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, StorageOrder, Conjugate, PanelMode> - ::operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride, Index offset) +template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode> +EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode> + ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { typedef typename packet_traits<Scalar>::type Packet; enum { PacketSize = packet_traits<Scalar>::size }; EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS"); - EIGEN_UNUSED_VARIABLE(stride) - EIGEN_UNUSED_VARIABLE(offset) + EIGEN_UNUSED_VARIABLE(stride); + EIGEN_UNUSED_VARIABLE(offset); eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); - eigen_assert( (StorageOrder==RowMajor) || ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) ); + eigen_assert( ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) || (Pack1<=4) ); conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj; - const_blas_data_mapper<Scalar, Index, StorageOrder> lhs(_lhs,lhsStride); Index count = 0; - Index peeled_mc = (rows/Pack1)*Pack1; - for(Index i=0; i<peeled_mc; i+=Pack1) + + const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0; + const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0; + const Index peeled_mc1 = Pack1>=1*PacketSize ? 
(rows/(1*PacketSize))*(1*PacketSize) : 0; + const Index peeled_mc0 = Pack2>=1*PacketSize ? peeled_mc1 + : Pack2>1 ? (rows/Pack2)*Pack2 : 0; + + Index i=0; + + // Pack 3 packets + if(Pack1>=3*PacketSize) { - if(PanelMode) count += Pack1 * offset; + for(; i<peeled_mc3; i+=3*PacketSize) + { + if(PanelMode) count += (3*PacketSize) * offset; - if(StorageOrder==ColMajor) + for(Index k=0; k<depth; k++) + { + Packet A, B, C; + A = lhs.loadPacket(i+0*PacketSize, k); + B = lhs.loadPacket(i+1*PacketSize, k); + C = lhs.loadPacket(i+2*PacketSize, k); + pstore(blockA+count, cj.pconj(A)); count+=PacketSize; + pstore(blockA+count, cj.pconj(B)); count+=PacketSize; + pstore(blockA+count, cj.pconj(C)); count+=PacketSize; + } + if(PanelMode) count += (3*PacketSize) * (stride-offset-depth); + } + } + // Pack 2 packets + if(Pack1>=2*PacketSize) + { + for(; i<peeled_mc2; i+=2*PacketSize) { + if(PanelMode) count += (2*PacketSize) * offset; + for(Index k=0; k<depth; k++) { - Packet A, B, C, D; - if(Pack1>=1*PacketSize) A = ploadu<Packet>(&lhs(i+0*PacketSize, k)); - if(Pack1>=2*PacketSize) B = ploadu<Packet>(&lhs(i+1*PacketSize, k)); - if(Pack1>=3*PacketSize) C = ploadu<Packet>(&lhs(i+2*PacketSize, k)); - if(Pack1>=4*PacketSize) D = ploadu<Packet>(&lhs(i+3*PacketSize, k)); - if(Pack1>=1*PacketSize) { pstore(blockA+count, cj.pconj(A)); count+=PacketSize; } - if(Pack1>=2*PacketSize) { pstore(blockA+count, cj.pconj(B)); count+=PacketSize; } - if(Pack1>=3*PacketSize) { pstore(blockA+count, cj.pconj(C)); count+=PacketSize; } - if(Pack1>=4*PacketSize) { pstore(blockA+count, cj.pconj(D)); count+=PacketSize; } + Packet A, B; + A = lhs.loadPacket(i+0*PacketSize, k); + B = lhs.loadPacket(i+1*PacketSize, k); + pstore(blockA+count, cj.pconj(A)); count+=PacketSize; + pstore(blockA+count, cj.pconj(B)); count+=PacketSize; } + if(PanelMode) count += (2*PacketSize) * (stride-offset-depth); } - else + } + // Pack 1 packets + if(Pack1>=1*PacketSize) + { + for(; i<peeled_mc1; i+=1*PacketSize) { + if(PanelMode) count += (1*PacketSize) * offset; + + for(Index k=0; k<depth; k++) + { + Packet A; + A = lhs.loadPacket(i+0*PacketSize, k); + pstore(blockA+count, cj.pconj(A)); + count+=PacketSize; + } + if(PanelMode) count += (1*PacketSize) * (stride-offset-depth); + } + } + // Pack scalars + if(Pack2<PacketSize && Pack2>1) + { + for(; i<peeled_mc0; i+=Pack2) + { + if(PanelMode) count += Pack2 * offset; + for(Index k=0; k<depth; k++) + for(Index w=0; w<Pack2; w++) + blockA[count++] = cj(lhs(i+w, k)); + + if(PanelMode) count += Pack2 * (stride-offset-depth); + } + } + for(; i<rows; i++) + { + if(PanelMode) count += offset; + for(Index k=0; k<depth; k++) + blockA[count++] = cj(lhs(i, k)); + if(PanelMode) count += (stride-offset-depth); + } +} + +template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode> +struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode> +{ + typedef typename DataMapper::LinearMapper LinearMapper; + EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0); +}; + +template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode> +EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode> + ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) +{ + typedef typename packet_traits<Scalar>::type Packet; + enum { 
PacketSize = packet_traits<Scalar>::size }; + + EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS"); + EIGEN_UNUSED_VARIABLE(stride); + EIGEN_UNUSED_VARIABLE(offset); + eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); + conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj; + Index count = 0; + +// const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0; +// const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0; +// const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0; + + int pack = Pack1; + Index i = 0; + while(pack>0) + { + Index remaining_rows = rows-i; + Index peeled_mc = i+(remaining_rows/pack)*pack; + for(; i<peeled_mc; i+=pack) + { + if(PanelMode) count += pack * offset; + + const Index peeled_k = (depth/PacketSize)*PacketSize; + Index k=0; + if(pack>=PacketSize) + { + for(; k<peeled_k; k+=PacketSize) + { + for (Index m = 0; m < pack; m += PacketSize) + { + PacketBlock<Packet> kernel; + for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = lhs.loadPacket(i+p+m, k); + ptranspose(kernel); + for (int p = 0; p < PacketSize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p])); + } + count += PacketSize*pack; + } + } + for(; k<depth; k++) { - // TODO add a vectorized transpose here Index w=0; - for(; w<Pack1-3; w+=4) + for(; w<pack-3; w+=4) { Scalar a(cj(lhs(i+w+0, k))), - b(cj(lhs(i+w+1, k))), - c(cj(lhs(i+w+2, k))), - d(cj(lhs(i+w+3, k))); + b(cj(lhs(i+w+1, k))), + c(cj(lhs(i+w+2, k))), + d(cj(lhs(i+w+3, k))); blockA[count++] = a; blockA[count++] = b; blockA[count++] = c; blockA[count++] = d; } - if(Pack1%4) - for(;w<Pack1;++w) + if(pack%4) + for(;w<pack;++w) blockA[count++] = cj(lhs(i+w, k)); } + + if(PanelMode) count += pack * (stride-offset-depth); } - if(PanelMode) count += Pack1 * (stride-offset-depth); - } - if(rows-peeled_mc>=Pack2) - { - if(PanelMode) count += Pack2*offset; - for(Index k=0; k<depth; k++) - for(Index w=0; w<Pack2; w++) - blockA[count++] = cj(lhs(peeled_mc+w, k)); - if(PanelMode) count += Pack2 * (stride-offset-depth); - peeled_mc += Pack2; + + pack -= PacketSize; + if(pack<Pack2 && (pack+PacketSize)!=Pack2) + pack = Pack2; } - for(Index i=peeled_mc; i<rows; i++) + + for(; i<rows; i++) { if(PanelMode) count += offset; for(Index k=0; k<depth; k++) @@ -1204,53 +1887,123 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, StorageOrder, // 4 5 6 7 16 17 18 19 25 28 // 8 9 10 11 20 21 22 23 26 29 // . . . . . . . . . . 
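Stripped of packets, conjugation, and panel mode, the column-major rhs packing defined next boils down to the interleaving sketched above; a minimal scalar version, assuming nr == 4 (names are hypothetical):

void pack_rhs_4cols_sketch(float* blockB, const float* rhs, long rhsStride,
                           long depth, long cols) {
  long count = 0;
  const long packet_cols4 = (cols/4)*4;
  for (long j2 = 0; j2 < packet_cols4; j2 += 4) {
    const float* b0 = rhs + (j2+0)*rhsStride;  // four adjacent columns of B
    const float* b1 = rhs + (j2+1)*rhsStride;
    const float* b2 = rhs + (j2+2)*rhsStride;
    const float* b3 = rhs + (j2+3)*rhsStride;
    for (long k = 0; k < depth; ++k) {         // depth-major, columns interleaved
      blockB[count+0] = b0[k];
      blockB[count+1] = b1[k];
      blockB[count+2] = b2[k];
      blockB[count+3] = b3[k];
      count += 4;
    }
  }
  for (long j2 = packet_cols4; j2 < cols; ++j2)  // leftover columns, one at a time
    for (long k = 0; k < depth; ++k)
      blockB[count++] = rhs[j2*rhsStride + k];
}

The kernel can then stream one k-slice of all nr columns through a single linear pointer, which is what the 4-entry steps of blB in the micro kernels above rely on.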
-template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode> -struct gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, PanelMode> +template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode> +struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode> { typedef typename packet_traits<Scalar>::type Packet; + typedef typename DataMapper::LinearMapper LinearMapper; enum { PacketSize = packet_traits<Scalar>::size }; - EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride=0, Index offset=0); + EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0); }; -template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode> -EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, PanelMode> - ::operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride, Index offset) +template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode> +EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode> + ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) { EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR"); - EIGEN_UNUSED_VARIABLE(stride) - EIGEN_UNUSED_VARIABLE(offset) + EIGEN_UNUSED_VARIABLE(stride); + EIGEN_UNUSED_VARIABLE(offset); eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj; - Index packet_cols = (cols/nr) * nr; + Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0; + Index packet_cols4 = nr>=4 ? 
(cols/4) * 4 : 0; Index count = 0; - for(Index j2=0; j2<packet_cols; j2+=nr) + const Index peeled_k = (depth/PacketSize)*PacketSize; +// if(nr>=8) +// { +// for(Index j2=0; j2<packet_cols8; j2+=8) +// { +// // skip what we have before +// if(PanelMode) count += 8 * offset; +// const Scalar* b0 = &rhs[(j2+0)*rhsStride]; +// const Scalar* b1 = &rhs[(j2+1)*rhsStride]; +// const Scalar* b2 = &rhs[(j2+2)*rhsStride]; +// const Scalar* b3 = &rhs[(j2+3)*rhsStride]; +// const Scalar* b4 = &rhs[(j2+4)*rhsStride]; +// const Scalar* b5 = &rhs[(j2+5)*rhsStride]; +// const Scalar* b6 = &rhs[(j2+6)*rhsStride]; +// const Scalar* b7 = &rhs[(j2+7)*rhsStride]; +// Index k=0; +// if(PacketSize==8) // TODO enable vectorized transposition for PacketSize==4 +// { +// for(; k<peeled_k; k+=PacketSize) { +// PacketBlock<Packet> kernel; +// for (int p = 0; p < PacketSize; ++p) { +// kernel.packet[p] = ploadu<Packet>(&rhs[(j2+p)*rhsStride+k]); +// } +// ptranspose(kernel); +// for (int p = 0; p < PacketSize; ++p) { +// pstoreu(blockB+count, cj.pconj(kernel.packet[p])); +// count+=PacketSize; +// } +// } +// } +// for(; k<depth; k++) +// { +// blockB[count+0] = cj(b0[k]); +// blockB[count+1] = cj(b1[k]); +// blockB[count+2] = cj(b2[k]); +// blockB[count+3] = cj(b3[k]); +// blockB[count+4] = cj(b4[k]); +// blockB[count+5] = cj(b5[k]); +// blockB[count+6] = cj(b6[k]); +// blockB[count+7] = cj(b7[k]); +// count += 8; +// } +// // skip what we have after +// if(PanelMode) count += 8 * (stride-offset-depth); +// } +// } + + if(nr>=4) { - // skip what we have before - if(PanelMode) count += nr * offset; - const Scalar* b0 = &rhs[(j2+0)*rhsStride]; - const Scalar* b1 = &rhs[(j2+1)*rhsStride]; - const Scalar* b2 = &rhs[(j2+2)*rhsStride]; - const Scalar* b3 = &rhs[(j2+3)*rhsStride]; - for(Index k=0; k<depth; k++) + for(Index j2=packet_cols8; j2<packet_cols4; j2+=4) { - blockB[count+0] = cj(b0[k]); - blockB[count+1] = cj(b1[k]); - if(nr==4) blockB[count+2] = cj(b2[k]); - if(nr==4) blockB[count+3] = cj(b3[k]); - count += nr; + // skip what we have before + if(PanelMode) count += 4 * offset; + const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0); + const LinearMapper dm1 = rhs.getLinearMapper(0, j2 + 1); + const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2); + const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3); + + Index k=0; + if((PacketSize%4)==0) // TODO enable vectorized transposition for PacketSize==2 ??
+ { + for(; k<peeled_k; k+=PacketSize) { + PacketBlock<Packet,(PacketSize%4)==0?4:PacketSize> kernel; + kernel.packet[0] = dm0.loadPacket(k); + kernel.packet[1%PacketSize] = dm1.loadPacket(k); + kernel.packet[2%PacketSize] = dm2.loadPacket(k); + kernel.packet[3%PacketSize] = dm3.loadPacket(k); + ptranspose(kernel); + pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0])); + pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1%PacketSize])); + pstoreu(blockB+count+2*PacketSize, cj.pconj(kernel.packet[2%PacketSize])); + pstoreu(blockB+count+3*PacketSize, cj.pconj(kernel.packet[3%PacketSize])); + count+=4*PacketSize; + } + } + for(; k<depth; k++) + { + blockB[count+0] = cj(dm0(k)); + blockB[count+1] = cj(dm1(k)); + blockB[count+2] = cj(dm2(k)); + blockB[count+3] = cj(dm3(k)); + count += 4; + } + // skip what we have after + if(PanelMode) count += 4 * (stride-offset-depth); } - // skip what we have after - if(PanelMode) count += nr * (stride-offset-depth); } // copy the remaining columns one at a time (nr==1) - for(Index j2=packet_cols; j2<cols; ++j2) + for(Index j2=packet_cols4; j2<cols; ++j2) { if(PanelMode) count += offset; - const Scalar* b0 = &rhs[(j2+0)*rhsStride]; + const LinearMapper dm0 = rhs.getLinearMapper(0, j2); for(Index k=0; k<depth; k++) { - blockB[count] = cj(b0[k]); + blockB[count] = cj(dm0(k)); count += 1; } if(PanelMode) count += (stride-offset-depth); @@ -1258,48 +2011,93 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, Pan } // this version is optimized for row major matrices -template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode> -struct gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, PanelMode> +template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode> +struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode> { + typedef typename packet_traits<Scalar>::type Packet; + typedef typename DataMapper::LinearMapper LinearMapper; enum { PacketSize = packet_traits<Scalar>::size }; - EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride=0, Index offset=0); + EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0); }; -template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode> -EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, PanelMode> - ::operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride, Index offset) +template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode> +EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode> + ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) { EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR"); - EIGEN_UNUSED_VARIABLE(stride) - EIGEN_UNUSED_VARIABLE(offset) + EIGEN_UNUSED_VARIABLE(stride); + EIGEN_UNUSED_VARIABLE(offset); eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj; - Index packet_cols = (cols/nr) * nr; + Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0; + Index packet_cols4 = nr>=4 ? 
(cols/4) * 4 : 0; Index count = 0; - for(Index j2=0; j2<packet_cols; j2+=nr) + +// if(nr>=8) +// { +// for(Index j2=0; j2<packet_cols8; j2+=8) +// { +// // skip what we have before +// if(PanelMode) count += 8 * offset; +// for(Index k=0; k<depth; k++) +// { +// if (PacketSize==8) { +// Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]); +// pstoreu(blockB+count, cj.pconj(A)); +// } else if (PacketSize==4) { +// Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]); +// Packet B = ploadu<Packet>(&rhs[k*rhsStride + j2 + PacketSize]); +// pstoreu(blockB+count, cj.pconj(A)); +// pstoreu(blockB+count+PacketSize, cj.pconj(B)); +// } else { +// const Scalar* b0 = &rhs[k*rhsStride + j2]; +// blockB[count+0] = cj(b0[0]); +// blockB[count+1] = cj(b0[1]); +// blockB[count+2] = cj(b0[2]); +// blockB[count+3] = cj(b0[3]); +// blockB[count+4] = cj(b0[4]); +// blockB[count+5] = cj(b0[5]); +// blockB[count+6] = cj(b0[6]); +// blockB[count+7] = cj(b0[7]); +// } +// count += 8; +// } +// // skip what we have after +// if(PanelMode) count += 8 * (stride-offset-depth); +// } +// } + if(nr>=4) { - // skip what we have before - if(PanelMode) count += nr * offset; - for(Index k=0; k<depth; k++) + for(Index j2=packet_cols8; j2<packet_cols4; j2+=4) { - const Scalar* b0 = &rhs[k*rhsStride + j2]; - blockB[count+0] = cj(b0[0]); - blockB[count+1] = cj(b0[1]); - if(nr==4) blockB[count+2] = cj(b0[2]); - if(nr==4) blockB[count+3] = cj(b0[3]); - count += nr; + // skip what we have before + if(PanelMode) count += 4 * offset; + for(Index k=0; k<depth; k++) + { + if (PacketSize==4) { + Packet A = rhs.loadPacket(k, j2); + pstoreu(blockB+count, cj.pconj(A)); + count += PacketSize; + } else { + const LinearMapper dm0 = rhs.getLinearMapper(k, j2); + blockB[count+0] = cj(dm0(0)); + blockB[count+1] = cj(dm0(1)); + blockB[count+2] = cj(dm0(2)); + blockB[count+3] = cj(dm0(3)); + count += 4; + } + } + // skip what we have after + if(PanelMode) count += 4 * (stride-offset-depth); } - // skip what we have after - if(PanelMode) count += nr * (stride-offset-depth); } // copy the remaining columns one at a time (nr==1) - for(Index j2=packet_cols; j2<cols; ++j2) + for(Index j2=packet_cols4; j2<cols; ++j2) { if(PanelMode) count += offset; - const Scalar* b0 = &rhs[j2]; for(Index k=0; k<depth; k++) { - blockB[count] = cj(b0[k*rhsStride]); + blockB[count] = cj(rhs(k, j2)); count += 1; } if(PanelMode) count += stride-offset-depth; @@ -1312,8 +2110,8 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, Pan * \sa setCpuCacheSize */ inline std::ptrdiff_t l1CacheSize() { - std::ptrdiff_t l1, l2; - internal::manage_caching_sizes(GetAction, &l1, &l2); + std::ptrdiff_t l1, l2, l3; + internal::manage_caching_sizes(GetAction, &l1, &l2, &l3); return l1; } @@ -1321,19 +2119,29 @@ inline std::ptrdiff_t l1CacheSize() * \sa setCpuCacheSize */ inline std::ptrdiff_t l2CacheSize() { - std::ptrdiff_t l1, l2; - internal::manage_caching_sizes(GetAction, &l1, &l2); + std::ptrdiff_t l1, l2, l3; + internal::manage_caching_sizes(GetAction, &l1, &l2, &l3); return l2; } +/** \returns the currently set level 3 cpu cache size (in bytes) used to estimate the ideal blocking size paramete\ +rs. +* \sa setCpuCacheSize */ +inline std::ptrdiff_t l3CacheSize() +{ + std::ptrdiff_t l1, l2, l3; + internal::manage_caching_sizes(GetAction, &l1, &l2, &l3); + return l3; +} + /** Set the cpu L1 and L2 cache sizes (in bytes). * These values are use to adjust the size of the blocks * for the algorithms working per blocks. 
* * \sa computeProductBlockingSizes */ -inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2) +inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2, std::ptrdiff_t l3) { - internal::manage_caching_sizes(SetAction, &l1, &l2); + internal::manage_caching_sizes(SetAction, &l1, &l2, &l3); } } // end namespace Eigen diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index 3f5ffcf51..6440e1d09 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -10,7 +10,7 @@ #ifndef EIGEN_GENERAL_MATRIX_MATRIX_H #define EIGEN_GENERAL_MATRIX_MATRIX_H -namespace Eigen { +namespace Eigen { namespace internal { @@ -23,7 +23,9 @@ template< typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs> struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor> { - typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar; + typedef gebp_traits<RhsScalar,LhsScalar> Traits; + + typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar; static EIGEN_STRONG_INLINE void run( Index rows, Index cols, Index depth, const LhsScalar* lhs, Index lhsStride, @@ -51,42 +53,44 @@ template< struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor> { -typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar; +typedef gebp_traits<LhsScalar,RhsScalar> Traits; + +typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar; static void run(Index rows, Index cols, Index depth, const LhsScalar* _lhs, Index lhsStride, const RhsScalar* _rhs, Index rhsStride, - ResScalar* res, Index resStride, + ResScalar* _res, Index resStride, ResScalar alpha, level3_blocking<LhsScalar,RhsScalar>& blocking, GemmParallelInfo<Index>* info = 0) { - const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride); - const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride); - - typedef gebp_traits<LhsScalar,RhsScalar> Traits; + typedef const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> LhsMapper; + typedef const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> RhsMapper; + typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper; + LhsMapper lhs(_lhs,lhsStride); + RhsMapper rhs(_rhs,rhsStride); + ResMapper res(_res, resStride); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction - //Index nc = blocking.nc(); // cache block size along the N direction + Index nc = (std::min)(cols,blocking.nc()); // cache block size along the N direction - gemm_pack_lhs<LhsScalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; - gemm_pack_rhs<RhsScalar, Index, Traits::nr, RhsStorageOrder> pack_rhs; - gebp_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp; + gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; + gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs; + gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp; #ifdef EIGEN_HAS_OPENMP if(info) { // this is the parallel version! 
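The parallel branch below coordinates threads through two per-thread counters, sync and users. A minimal sketch of that handshake, using C++11 atomics in place of the volatile-style accesses; PanelInfo and the two helpers are hypothetical stand-ins, not Eigen's GemmParallelInfo:

#include <atomic>
struct PanelInfo {
  std::atomic<long> sync{-1};  // last k for which this thread's packed block is ready
  std::atomic<int>  users{0};  // number of threads still reading the block
};
// Producer side: wait until all previous readers are done, pre-count the
// upcoming readers, repack, then announce availability for step k.
inline void publish(PanelInfo& me, long k, int threads) {
  while (me.users.load() != 0) { /* busy-wait */ }
  me.users.store(threads);
  /* ... pack this thread's sub-block here ... */
  me.sync.store(k);
}
// Consumer side: wait until the producer has published step k, use the
// block, then drop our reference so the producer may repack it.
inline void consume(PanelInfo& other, long k) {
  while (other.sync.load() != k) { /* busy-wait */ }
  /* ... run the gebp kernel against the other thread's block here ... */
  other.users.fetch_sub(1);
}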
- Index tid = omp_get_thread_num(); - Index threads = omp_get_num_threads(); - - std::size_t sizeA = kc*mc; - std::size_t sizeW = kc*Traits::WorkSpaceFactor; - ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, 0); - ei_declare_aligned_stack_constructed_variable(RhsScalar, w, sizeW, 0); - - RhsScalar* blockB = blocking.blockB(); - eigen_internal_assert(blockB!=0); + int tid = omp_get_thread_num(); + int threads = omp_get_num_threads(); + + LhsScalar* blockA = blocking.blockA(); + eigen_internal_assert(blockA!=0); + + std::size_t sizeB = kc*nc; + ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, 0); // For each horizontal panel of the rhs, and corresponding vertical panel of the lhs... for(Index k=0; k<depth; k+=kc) @@ -94,54 +98,56 @@ static void run(Index rows, Index cols, Index depth, const Index actual_kc = (std::min)(k+kc,depth)-k; // => rows of B', and cols of the A' // In order to reduce the chance that a thread has to wait for the other, - // let's start by packing A'. - pack_lhs(blockA, &lhs(0,k), lhsStride, actual_kc, mc); + // let's start by packing B'. + pack_rhs(blockB, rhs.getSubMapper(k,0), actual_kc, nc); - // Pack B_k to B' in a parallel fashion: - // each thread packs the sub block B_k,j to B'_j where j is the thread id. + // Pack A_k to A' in a parallel fashion: + // each thread packs the sub block A_k,i to A'_i where i is the thread id. - // However, before copying to B'_j, we have to make sure that no other thread is still using it, + // However, before copying to A'_i, we have to make sure that no other thread is still using it, // i.e., we test that info[tid].users equals 0. // Then, we set info[tid].users to the number of threads to mark that all other threads are going to use it. while(info[tid].users!=0) {} info[tid].users += threads; - pack_rhs(blockB+info[tid].rhs_start*actual_kc, &rhs(k,info[tid].rhs_start), rhsStride, actual_kc, info[tid].rhs_length); + pack_lhs(blockA+info[tid].lhs_start*actual_kc, lhs.getSubMapper(info[tid].lhs_start,k), actual_kc, info[tid].lhs_length); - // Notify the other threads that the part B'_j is ready to go. + // Notify the other threads that the part A'_i is ready to go. info[tid].sync = k; - // Computes C_i += A' * B' per B'_j - for(Index shift=0; shift<threads; ++shift) + // Computes C_i += A' * B' per A'_i + for(int shift=0; shift<threads; ++shift) { - Index j = (tid+shift)%threads; + int i = (tid+shift)%threads; - // At this point we have to make sure that B'_j has been updated by the thread j, + // At this point we have to make sure that A'_i has been updated by the thread i, // we use testAndSetOrdered to mimic a volatile access. // However, no need to wait for the B' part which has been updated by the current thread! 
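As a small sanity check on the (tid+shift)%threads loop above: each thread visits the per-thread panels starting with its own, so at any given shift the threads touch distinct panels. An illustrative toy with 4 threads:

#include <cstdio>
int main() {
  const int threads = 4;
  for (int tid = 0; tid < threads; ++tid) {
    std::printf("thread %d visits:", tid);
    for (int shift = 0; shift < threads; ++shift)
      std::printf(" %d", (tid + shift) % threads);  // i = (tid+shift)%threads
    std::printf("\n");  // thread 0: 0 1 2 3, thread 1: 1 2 3 0, ...
  }
}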
- if(shift>0) - while(info[j].sync!=k) {} + if (shift>0) { + while(info[i].sync!=k) { + } + } - gebp(res+info[j].rhs_start*resStride, resStride, blockA, blockB+info[j].rhs_start*actual_kc, mc, actual_kc, info[j].rhs_length, alpha, -1,-1,0,0, w); + gebp(res.getSubMapper(info[i].lhs_start, 0), blockA+info[i].lhs_start*actual_kc, blockB, info[i].lhs_length, actual_kc, nc, alpha); } - // Then keep going as usual with the remaining A' - for(Index i=mc; i<rows; i+=mc) + // Then keep going as usual with the remaining B' + for(Index j=nc; j<cols; j+=nc) { - const Index actual_mc = (std::min)(i+mc,rows)-i; + const Index actual_nc = (std::min)(j+nc,cols)-j; - // pack A_i,k to A' - pack_lhs(blockA, &lhs(i,k), lhsStride, actual_kc, actual_mc); + // pack B_k,j to B' + pack_rhs(blockB, rhs.getSubMapper(k,j), actual_kc, actual_nc); - // C_i += A' * B' - gebp(res+i, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1,-1,0,0, w); + // C_j += A' * B' + gebp(res.getSubMapper(0, j), blockA, blockB, rows, actual_kc, actual_nc, alpha); } - // Release all the sub blocks B'_j of B' for the current thread, + // Release all the sub blocks A'_i of A' for the current thread, // i.e., we simply decrement the number of users by 1 - for(Index j=0; j<threads; ++j) + for(Index i=0; i<threads; ++i) #pragma omp atomic - --(info[j].users); + info[i].users -= 1; } } else @@ -151,38 +157,42 @@ static void run(Index rows, Index cols, Index depth, // this is the sequential version! std::size_t sizeA = kc*mc; - std::size_t sizeB = kc*cols; - std::size_t sizeW = kc*Traits::WorkSpaceFactor; + std::size_t sizeB = kc*nc; ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, blocking.blockA()); ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB()); - ei_declare_aligned_stack_constructed_variable(RhsScalar, blockW, sizeW, blocking.blockW()); + + const bool pack_rhs_once = mc!=rows && kc==depth && nc==cols; // For each horizontal panel of the rhs, and corresponding panel of the lhs... - // (==GEMM_VAR1) - for(Index k2=0; k2<depth; k2+=kc) + for(Index i2=0; i2<rows; i2+=mc) { - const Index actual_kc = (std::min)(k2+kc,depth)-k2; + const Index actual_mc = (std::min)(i2+mc,rows)-i2; - // OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs. - // => Pack rhs's panel into a sequential chunk of memory (L2 caching) - // Note that this panel will be read as many times as the number of blocks in the lhs's - // vertical panel which is, in practice, a very low number. - pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, cols); - - // For each mc x kc block of the lhs's vertical panel... - // (==GEPP_VAR1) - for(Index i2=0; i2<rows; i2+=mc) + for(Index k2=0; k2<depth; k2+=kc) { - const Index actual_mc = (std::min)(i2+mc,rows)-i2; - - // We pack the lhs's block into a sequential chunk of memory (L1 caching) - // Note that this block will be read a very high number of times, which is equal to the number of - // micro vertical panel of the large rhs's panel (e.g., cols/4 times). - pack_lhs(blockA, &lhs(i2,k2), lhsStride, actual_kc, actual_mc); - - // Everything is packed, we can now call the block * panel kernel: - gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0, blockW); + const Index actual_kc = (std::min)(k2+kc,depth)-k2; + + // OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs. 
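Before the packing details spelled out next, it helps to see the loop structure this hunk introduces: the sequential path now iterates i2 over rows, then k2 over depth, then j2 over cols, so the packed lhs block stays cache-resident while rhs blocks stream past it. Stripped of packing, mappers and the gebp micro-kernel, the shape is roughly the following; this is a plain column-major sketch under invented names, not Eigen code.

  #include <algorithm>

  // Blocked GEMM skeleton: C (rows x cols) += A (rows x depth) * B (depth x cols),
  // all column-major; mc/kc/nc play the role of blocking.mc()/kc()/nc().
  void blocked_gemm(int rows, int cols, int depth,
                    const double* A, const double* B, double* C,
                    int mc, int kc, int nc)
  {
    for (int i2 = 0; i2 < rows; i2 += mc)        // lhs row panel (kept L2/L3 resident)
      for (int k2 = 0; k2 < depth; k2 += kc)     // depth slice
        for (int j2 = 0; j2 < cols; j2 += nc)    // rhs column block (L2 resident)
          for (int j = j2; j < std::min(j2 + nc, cols); ++j)
            for (int k = k2; k < std::min(k2 + kc, depth); ++k)
              for (int i = i2; i < std::min(i2 + mc, rows); ++i)
                C[i + j*rows] += A[i + k*rows] * B[k + j*depth];
  }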
+ // => Pack lhs's panel into a sequential chunk of memory (L2/L3 caching) + // Note that this panel will be read as many times as the number of blocks in the rhs's + // horizontal panel which is, in practice, a very low number. + pack_lhs(blockA, lhs.getSubMapper(i2,k2), actual_kc, actual_mc); + + // For each kc x nc block of the rhs's horizontal panel... + for(Index j2=0; j2<cols; j2+=nc) + { + const Index actual_nc = (std::min)(j2+nc,cols)-j2; + + // We pack the rhs's block into a sequential chunk of memory (L2 caching) + // Note that this block will be read a very high number of times, which is equal to the number of + // micro horizontal panels of the large lhs's panel (e.g., rows/12 times). + if((!pack_rhs_once) || i2==0) + pack_rhs(blockB, rhs.getSubMapper(k2,j2), actual_kc, actual_nc); + + // Everything is packed, we can now call the panel * block kernel: + gebp(res.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, alpha); + } } } } @@ -191,26 +201,21 @@ static void run(Index rows, Index cols, Index depth, }; /********************************************************************************* -* Specialization of GeneralProduct<> for "large" GEMM, i.e., +* Specialization of generic_product_impl for "large" GEMM, i.e., * implementation of the high level wrapper to general_matrix_matrix_product **********************************************************************************/ -template<typename Lhs, typename Rhs> -struct traits<GeneralProduct<Lhs,Rhs,GemmProduct> > - : traits<ProductBase<GeneralProduct<Lhs,Rhs,GemmProduct>, Lhs, Rhs> > -{}; - template<typename Scalar, typename Index, typename Gemm, typename Lhs, typename Rhs, typename Dest, typename BlockingType> struct gemm_functor { - gemm_functor(const Lhs& lhs, const Rhs& rhs, Dest& dest, const Scalar& actualAlpha, - BlockingType& blocking) + gemm_functor(const Lhs& lhs, const Rhs& rhs, Dest& dest, const Scalar& actualAlpha, BlockingType& blocking) : m_lhs(lhs), m_rhs(rhs), m_dest(dest), m_actualAlpha(actualAlpha), m_blocking(blocking) {} - void initParallelSession() const + void initParallelSession(Index num_threads) const { - m_blocking.allocateB(); + m_blocking.initParallel(m_lhs.rows(), m_rhs.cols(), m_lhs.cols(), num_threads); + m_blocking.allocateA(); } void operator() (Index row, Index rows, Index col=0, Index cols=-1, GemmParallelInfo<Index>* info=0) const @@ -219,12 +224,14 @@ struct gemm_functor cols = m_rhs.cols(); Gemm::run(rows, cols, m_lhs.cols(), - /*(const Scalar*)*/&m_lhs.coeffRef(row,0), m_lhs.outerStride(), - /*(const Scalar*)*/&m_rhs.coeffRef(0,col), m_rhs.outerStride(), + &m_lhs.coeffRef(row,0), m_lhs.outerStride(), + &m_rhs.coeffRef(0,col), m_rhs.outerStride(), (Scalar*)&(m_dest.coeffRef(row,col)), m_dest.outerStride(), m_actualAlpha, m_blocking, info); } + typedef typename Gemm::Traits Traits; + protected: const Lhs& m_lhs; const Rhs& m_rhs; @@ -245,29 +252,27 @@ class level3_blocking protected: LhsScalar* m_blockA; RhsScalar* m_blockB; - RhsScalar* m_blockW; - DenseIndex m_mc; - DenseIndex m_nc; - DenseIndex m_kc; + Index m_mc; + Index m_nc; + Index m_kc; public: level3_blocking() - : m_blockA(0), m_blockB(0), m_blockW(0), m_mc(0), m_nc(0), m_kc(0) + : m_blockA(0), m_blockB(0), m_mc(0), m_nc(0), m_kc(0) {} - inline DenseIndex mc() const { return m_mc; } - inline DenseIndex nc() const { return m_nc; } - inline DenseIndex kc() const { return m_kc; } + inline Index mc() const { return m_mc; } + inline Index nc() const { return m_nc; } + inline Index kc() const { return m_kc; } inline
LhsScalar* blockA() { return m_blockA; } inline RhsScalar* blockB() { return m_blockB; } - inline RhsScalar* blockW() { return m_blockW; } }; template<int StorageOrder, typename _LhsScalar, typename _RhsScalar, int MaxRows, int MaxCols, int MaxDepth, int KcFactor> -class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, MaxDepth, KcFactor, true> +class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, MaxDepth, KcFactor, true /* == FiniteAtCompileTime */> : public level3_blocking< typename conditional<StorageOrder==RowMajor,_RhsScalar,_LhsScalar>::type, typename conditional<StorageOrder==RowMajor,_LhsScalar,_RhsScalar>::type> @@ -282,29 +287,38 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M typedef gebp_traits<LhsScalar,RhsScalar> Traits; enum { SizeA = ActualRows * MaxDepth, - SizeB = ActualCols * MaxDepth, - SizeW = MaxDepth * Traits::WorkSpaceFactor + SizeB = ActualCols * MaxDepth }; - EIGEN_ALIGN16 LhsScalar m_staticA[SizeA]; - EIGEN_ALIGN16 RhsScalar m_staticB[SizeB]; - EIGEN_ALIGN16 RhsScalar m_staticW[SizeW]; +#if EIGEN_MAX_STATIC_ALIGN_BYTES >= EIGEN_DEFAULT_ALIGN_BYTES + EIGEN_ALIGN_MAX LhsScalar m_staticA[SizeA]; + EIGEN_ALIGN_MAX RhsScalar m_staticB[SizeB]; +#else + EIGEN_ALIGN_MAX char m_staticA[SizeA * sizeof(LhsScalar) + EIGEN_DEFAULT_ALIGN_BYTES-1]; + EIGEN_ALIGN_MAX char m_staticB[SizeB * sizeof(RhsScalar) + EIGEN_DEFAULT_ALIGN_BYTES-1]; +#endif public: - gemm_blocking_space(DenseIndex /*rows*/, DenseIndex /*cols*/, DenseIndex /*depth*/) + gemm_blocking_space(Index /*rows*/, Index /*cols*/, Index /*depth*/, Index /*num_threads*/, bool /*full_rows = false*/) { this->m_mc = ActualRows; this->m_nc = ActualCols; this->m_kc = MaxDepth; +#if EIGEN_MAX_STATIC_ALIGN_BYTES >= EIGEN_DEFAULT_ALIGN_BYTES this->m_blockA = m_staticA; this->m_blockB = m_staticB; - this->m_blockW = m_staticW; +#else + this->m_blockA = reinterpret_cast<LhsScalar*>((internal::UIntPtr(m_staticA) + (EIGEN_DEFAULT_ALIGN_BYTES-1)) & ~std::size_t(EIGEN_DEFAULT_ALIGN_BYTES-1)); + this->m_blockB = reinterpret_cast<RhsScalar*>((internal::UIntPtr(m_staticB) + (EIGEN_DEFAULT_ALIGN_BYTES-1)) & ~std::size_t(EIGEN_DEFAULT_ALIGN_BYTES-1)); +#endif } + void initParallel(Index, Index, Index, Index) + {} + inline void allocateA() {} inline void allocateB() {} - inline void allocateW() {} inline void allocateAll() {} }; @@ -321,22 +335,42 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M typedef typename conditional<Transpose,_LhsScalar,_RhsScalar>::type RhsScalar; typedef gebp_traits<LhsScalar,RhsScalar> Traits; - DenseIndex m_sizeA; - DenseIndex m_sizeB; - DenseIndex m_sizeW; + Index m_sizeA; + Index m_sizeB; public: - gemm_blocking_space(DenseIndex rows, DenseIndex cols, DenseIndex depth) + gemm_blocking_space(Index rows, Index cols, Index depth, Index num_threads, bool l3_blocking) { this->m_mc = Transpose ? cols : rows; this->m_nc = Transpose ? 
rows : cols; this->m_kc = depth; - computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, this->m_mc, this->m_nc); + if(l3_blocking) + { + computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, this->m_mc, this->m_nc, num_threads); + } + else // no l3 blocking + { + Index n = this->m_nc; + computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, this->m_mc, n, num_threads); + } + + m_sizeA = this->m_mc * this->m_kc; + m_sizeB = this->m_kc * this->m_nc; + } + + void initParallel(Index rows, Index cols, Index depth, Index num_threads) + { + this->m_mc = Transpose ? cols : rows; + this->m_nc = Transpose ? rows : cols; + this->m_kc = depth; + + eigen_internal_assert(this->m_blockA==0 && this->m_blockB==0); + Index m = this->m_mc; + computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, m, this->m_nc, num_threads); m_sizeA = this->m_mc * this->m_kc; m_sizeB = this->m_kc * this->m_nc; - m_sizeW = this->m_kc*Traits::WorkSpaceFactor; } void allocateA() @@ -351,77 +385,108 @@ class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, M this->m_blockB = aligned_new<RhsScalar>(m_sizeB); } - void allocateW() - { - if(this->m_blockW==0) - this->m_blockW = aligned_new<RhsScalar>(m_sizeW); - } - void allocateAll() { allocateA(); allocateB(); - allocateW(); } ~gemm_blocking_space() { aligned_delete(this->m_blockA, m_sizeA); aligned_delete(this->m_blockB, m_sizeB); - aligned_delete(this->m_blockW, m_sizeW); } }; } // end namespace internal +namespace internal { + template<typename Lhs, typename Rhs> -class GeneralProduct<Lhs, Rhs, GemmProduct> - : public ProductBase<GeneralProduct<Lhs,Rhs,GemmProduct>, Lhs, Rhs> +struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct> + : generic_product_impl_base<Lhs,Rhs,generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct> > { - enum { - MaxDepthAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED(Lhs::MaxColsAtCompileTime,Rhs::MaxRowsAtCompileTime) - }; - public: - EIGEN_PRODUCT_PUBLIC_INTERFACE(GeneralProduct) - - typedef typename Lhs::Scalar LhsScalar; - typedef typename Rhs::Scalar RhsScalar; - typedef Scalar ResScalar; + typedef typename Product<Lhs,Rhs>::Scalar Scalar; + typedef typename Lhs::Scalar LhsScalar; + typedef typename Rhs::Scalar RhsScalar; - GeneralProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) - { - typedef internal::scalar_product_op<LhsScalar,RhsScalar> BinOp; - EIGEN_CHECK_BINARY_COMPATIBILIY(BinOp,LhsScalar,RhsScalar); - } + typedef internal::blas_traits<Lhs> LhsBlasTraits; + typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType; + typedef typename internal::remove_all<ActualLhsType>::type ActualLhsTypeCleaned; - template<typename Dest> void scaleAndAddTo(Dest& dst, const Scalar& alpha) const - { - eigen_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols()); + typedef internal::blas_traits<Rhs> RhsBlasTraits; + typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType; + typedef typename internal::remove_all<ActualRhsType>::type ActualRhsTypeCleaned; - typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(m_lhs); - typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(m_rhs); + enum { + MaxDepthAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED(Lhs::MaxColsAtCompileTime,Rhs::MaxRowsAtCompileTime) + }; - Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs) - * RhsBlasTraits::extractScalarFactor(m_rhs); + typedef 
generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode> lazyproduct; - typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,LhsScalar,RhsScalar, - Dest::MaxRowsAtCompileTime,Dest::MaxColsAtCompileTime,MaxDepthAtCompileTime> BlockingType; + template<typename Dst> + static void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + { + if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0) + lazyproduct::evalTo(dst, lhs, rhs); + else + { + dst.setZero(); + scaleAndAddTo(dst, lhs, rhs, Scalar(1)); + } + } - typedef internal::gemm_functor< - Scalar, Index, - internal::general_matrix_matrix_product< - Index, - LhsScalar, (_ActualLhsType::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(LhsBlasTraits::NeedToConjugate), - RhsScalar, (_ActualRhsType::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(RhsBlasTraits::NeedToConjugate), - (Dest::Flags&RowMajorBit) ? RowMajor : ColMajor>, - _ActualLhsType, _ActualRhsType, Dest, BlockingType> GemmFunctor; + template<typename Dst> + static void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + { + if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0) + lazyproduct::addTo(dst, lhs, rhs); + else + scaleAndAddTo(dst,lhs, rhs, Scalar(1)); + } - BlockingType blocking(dst.rows(), dst.cols(), lhs.cols()); + template<typename Dst> + static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + { + if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0) + lazyproduct::subTo(dst, lhs, rhs); + else + scaleAndAddTo(dst, lhs, rhs, Scalar(-1)); + } - internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)>(GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), this->rows(), this->cols(), Dest::Flags&RowMajorBit); - } + template<typename Dest> + static void scaleAndAddTo(Dest& dst, const Lhs& a_lhs, const Rhs& a_rhs, const Scalar& alpha) + { + eigen_assert(dst.rows()==a_lhs.rows() && dst.cols()==a_rhs.cols()); + if(a_lhs.cols()==0 || a_lhs.rows()==0 || a_rhs.cols()==0) + return; + + typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(a_lhs); + typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(a_rhs); + + Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs) + * RhsBlasTraits::extractScalarFactor(a_rhs); + + typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,LhsScalar,RhsScalar, + Dest::MaxRowsAtCompileTime,Dest::MaxColsAtCompileTime,MaxDepthAtCompileTime> BlockingType; + + typedef internal::gemm_functor< + Scalar, Index, + internal::general_matrix_matrix_product< + Index, + LhsScalar, (ActualLhsTypeCleaned::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(LhsBlasTraits::NeedToConjugate), + RhsScalar, (ActualRhsTypeCleaned::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(RhsBlasTraits::NeedToConjugate), + (Dest::Flags&RowMajorBit) ? 
RowMajor : ColMajor>, + ActualLhsTypeCleaned, ActualRhsTypeCleaned, Dest, BlockingType> GemmFunctor; + + BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), 1, true); + internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)> + (GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), a_lhs.rows(), a_rhs.cols(), a_lhs.cols(), Dest::Flags&RowMajorBit); + } }; +} // end namespace internal + } // end namespace Eigen #endif // EIGEN_GENERAL_MATRIX_MATRIX_H diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h index 5c3763909..7122efa60 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h @@ -20,7 +20,7 @@ namespace internal { /********************************************************************** * This file implements a general A * B product while * evaluating only one triangular part of the product. -* This is more general version of self adjoint product (C += A A^T) +* This is a more general version of self adjoint product (C += A A^T) * as the level 3 SYRK Blas routine. **********************************************************************/ @@ -40,15 +40,16 @@ template <typename Index, typename LhsScalar, int LhsStorageOrder, bool Conjugat typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs, int UpLo, int Version> struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor,UpLo,Version> { - typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar; + typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar; static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* lhs, Index lhsStride, - const RhsScalar* rhs, Index rhsStride, ResScalar* res, Index resStride, const ResScalar& alpha) + const RhsScalar* rhs, Index rhsStride, ResScalar* res, Index resStride, + const ResScalar& alpha, level3_blocking<RhsScalar,LhsScalar>& blocking) { general_matrix_matrix_triangular_product<Index, RhsScalar, RhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateRhs, LhsScalar, LhsStorageOrder==RowMajor ? 
ColMajor : RowMajor, ConjugateLhs, ColMajor, UpLo==Lower?Upper:Lower> - ::run(size,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha); + ::run(size,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha,blocking); } }; @@ -56,32 +57,36 @@ template <typename Index, typename LhsScalar, int LhsStorageOrder, bool Conjugat typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs, int UpLo, int Version> struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor,UpLo,Version> { - typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar; + typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar; static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* _lhs, Index lhsStride, - const RhsScalar* _rhs, Index rhsStride, ResScalar* res, Index resStride, const ResScalar& alpha) + const RhsScalar* _rhs, Index rhsStride, ResScalar* _res, Index resStride, + const ResScalar& alpha, level3_blocking<LhsScalar,RhsScalar>& blocking) { - const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride); - const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride); - typedef gebp_traits<LhsScalar,RhsScalar> Traits; - Index kc = depth; // cache block size along the K direction - Index mc = size; // cache block size along the M direction - Index nc = size; // cache block size along the N direction - computeProductBlockingSizes<LhsScalar,RhsScalar>(kc, mc, nc); + typedef const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> LhsMapper; + typedef const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> RhsMapper; + typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper; + LhsMapper lhs(_lhs,lhsStride); + RhsMapper rhs(_rhs,rhsStride); + ResMapper res(_res, resStride); + + Index kc = blocking.kc(); + Index mc = (std::min)(size,blocking.mc()); + // !!! 
mc must be a multiple of nr: if(mc > Traits::nr) mc = (mc/Traits::nr)*Traits::nr; - std::size_t sizeW = kc*Traits::WorkSpaceFactor; - std::size_t sizeB = sizeW + kc*size; - ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, kc*mc, 0); - ei_declare_aligned_stack_constructed_variable(RhsScalar, allocatedBlockB, sizeB, 0); - RhsScalar* blockB = allocatedBlockB + sizeW; - - gemm_pack_lhs<LhsScalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; - gemm_pack_rhs<RhsScalar, Index, Traits::nr, RhsStorageOrder> pack_rhs; - gebp_kernel <LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp; + std::size_t sizeA = kc*mc; + std::size_t sizeB = kc*size; + + ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, blocking.blockA()); + ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB()); + + gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; + gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs; + gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp; tribb_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs, UpLo> sybb; for(Index k2=0; k2<depth; k2+=kc) @@ -89,29 +94,30 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder, const Index actual_kc = (std::min)(k2+kc,depth)-k2; // note that the actual rhs is the transpose/adjoint of mat - pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, size); + pack_rhs(blockB, rhs.getSubMapper(k2,0), actual_kc, size); for(Index i2=0; i2<size; i2+=mc) { const Index actual_mc = (std::min)(i2+mc,size)-i2; - pack_lhs(blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc); + pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc); // the selected actual_mc * size panel of res is split into three different parts: // 1 - before the diagonal => processed with gebp or skipped // 2 - the actual_mc x actual_mc symmetric block => processed with a special kernel // 3 - after the diagonal => processed with gebp or skipped if (UpLo==Lower) - gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, (std::min)(size,i2), alpha, - -1, -1, 0, 0, allocatedBlockB); + gebp(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, + (std::min)(size,i2), alpha, -1, -1, 0, 0); + - sybb(res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha, allocatedBlockB); + sybb(_res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha); if (UpLo==Upper) { Index j2 = i2+actual_mc; - gebp(res+resStride*j2+i2, resStride, blockA, blockB+actual_kc*j2, actual_mc, actual_kc, (std::max)(Index(0), size-j2), alpha, - -1, -1, 0, 0, allocatedBlockB); + gebp(res.getSubMapper(i2, j2), blockA, blockB+actual_kc*j2, actual_mc, + actual_kc, (std::max)(Index(0), size-j2), alpha, -1, -1, 0, 0); } } } @@ -132,14 +138,17 @@ struct tribb_kernel { typedef gebp_traits<LhsScalar,RhsScalar,ConjLhs,ConjRhs> Traits; typedef typename Traits::ResScalar ResScalar; - + enum { - BlockSize = EIGEN_PLAIN_ENUM_MAX(mr,nr) + BlockSize = meta_least_common_multiple<EIGEN_PLAIN_ENUM_MAX(mr,nr),EIGEN_PLAIN_ENUM_MIN(mr,nr)>::ret }; - void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha, RhsScalar* workspace) + void operator()(ResScalar* _res, Index
resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha) { - gebp_kernel<LhsScalar, RhsScalar, Index, mr, nr, ConjLhs, ConjRhs> gebp_kernel; - Matrix<ResScalar,BlockSize,BlockSize,ColMajor> buffer; + typedef blas_data_mapper<ResScalar, Index, ColMajor> ResMapper; + ResMapper res(_res, resStride); + gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, mr, nr, ConjLhs, ConjRhs> gebp_kernel; + + Matrix<ResScalar,BlockSize,BlockSize,ColMajor> buffer((internal::constructor_without_unaligned_array_assert())); // let's process the block per panel of actual_mc x BlockSize, // again, each is split into three parts, etc. @@ -149,20 +158,20 @@ struct tribb_kernel const RhsScalar* actual_b = blockB+j*depth; if(UpLo==Upper) - gebp_kernel(res+j*resStride, resStride, blockA, actual_b, j, depth, actualBlockSize, alpha, - -1, -1, 0, 0, workspace); + gebp_kernel(res.getSubMapper(0, j), blockA, actual_b, j, depth, actualBlockSize, alpha, + -1, -1, 0, 0); // selfadjoint micro block { Index i = j; buffer.setZero(); // 1 - apply the kernel on the temporary buffer - gebp_kernel(buffer.data(), BlockSize, blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha, - -1, -1, 0, 0, workspace); + gebp_kernel(ResMapper(buffer.data(), BlockSize), blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha, + -1, -1, 0, 0); // 2 - triangular accumulation for(Index j1=0; j1<actualBlockSize; ++j1) { - ResScalar* r = res + (j+j1)*resStride + i; + ResScalar* r = &res(i, j + j1); for(Index i1=UpLo==Lower ? j1 : 0; UpLo==Lower ? i1<actualBlockSize : i1<=j1; ++i1) r[i1] += buffer(i1,j1); @@ -172,8 +181,8 @@ struct tribb_kernel if(UpLo==Lower) { Index i = j+actualBlockSize; - gebp_kernel(res+j*resStride+i, resStride, blockA+depth*i, actual_b, size-i, depth, actualBlockSize, alpha, - -1, -1, 0, 0, workspace); + gebp_kernel(res.getSubMapper(i, j), blockA+depth*i, actual_b, size-i, + depth, actualBlockSize, alpha, -1, -1, 0, 0); } } } @@ -190,10 +199,9 @@ struct general_product_to_triangular_selector; template<typename MatrixType, typename ProductType, int UpLo> struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,true> { - static void run(MatrixType& mat, const ProductType& prod, const typename MatrixType::Scalar& alpha) + static void run(MatrixType& mat, const ProductType& prod, const typename MatrixType::Scalar& alpha, bool beta) { typedef typename MatrixType::Scalar Scalar; - typedef typename MatrixType::Index Index; typedef typename internal::remove_all<typename ProductType::LhsNested>::type Lhs; typedef internal::blas_traits<Lhs> LhsBlasTraits; @@ -209,6 +217,9 @@ struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,true> Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs().derived()) * RhsBlasTraits::extractScalarFactor(prod.rhs().derived()); + if(!beta) + mat.template triangularView<UpLo>().setZero(); + enum { StorageOrder = (internal::traits<MatrixType>::Flags&RowMajorBit) ? 
RowMajor : ColMajor, UseLhsDirectly = _ActualLhs::InnerStrideAtCompileTime==1, @@ -236,10 +247,8 @@ struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,true> template<typename MatrixType, typename ProductType, int UpLo> struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,false> { - static void run(MatrixType& mat, const ProductType& prod, const typename MatrixType::Scalar& alpha) + static void run(MatrixType& mat, const ProductType& prod, const typename MatrixType::Scalar& alpha, bool beta) { - typedef typename MatrixType::Index Index; - typedef typename internal::remove_all<typename ProductType::LhsNested>::type Lhs; typedef internal::blas_traits<Lhs> LhsBlasTraits; typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhs; @@ -254,23 +263,42 @@ struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,false> typename ProductType::Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs().derived()) * RhsBlasTraits::extractScalarFactor(prod.rhs().derived()); + if(!beta) + mat.template triangularView<UpLo>().setZero(); + + enum { + IsRowMajor = (internal::traits<MatrixType>::Flags&RowMajorBit) ? 1 : 0, + LhsIsRowMajor = _ActualLhs::Flags&RowMajorBit ? 1 : 0, + RhsIsRowMajor = _ActualRhs::Flags&RowMajorBit ? 1 : 0 + }; + + Index size = mat.cols(); + Index depth = actualLhs.cols(); + + typedef internal::gemm_blocking_space<IsRowMajor ? RowMajor : ColMajor,typename Lhs::Scalar,typename Rhs::Scalar, + MatrixType::MaxColsAtCompileTime, MatrixType::MaxColsAtCompileTime, _ActualRhs::MaxColsAtCompileTime> BlockingType; + + BlockingType blocking(size, size, depth, 1, false); + internal::general_matrix_matrix_triangular_product<Index, - typename Lhs::Scalar, _ActualLhs::Flags&RowMajorBit ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate, - typename Rhs::Scalar, _ActualRhs::Flags&RowMajorBit ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate, - MatrixType::Flags&RowMajorBit ? RowMajor : ColMajor, UpLo> - ::run(mat.cols(), actualLhs.cols(), + typename Lhs::Scalar, LhsIsRowMajor ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate, + typename Rhs::Scalar, RhsIsRowMajor ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate, + IsRowMajor ? 
RowMajor : ColMajor, UpLo> + ::run(size, depth, &actualLhs.coeffRef(0,0), actualLhs.outerStride(), &actualRhs.coeffRef(0,0), actualRhs.outerStride(), - mat.data(), mat.outerStride(), actualAlpha); + mat.data(), mat.outerStride(), actualAlpha, blocking); } }; template<typename MatrixType, unsigned int UpLo> -template<typename ProductDerived, typename _Lhs, typename _Rhs> -TriangularView<MatrixType,UpLo>& TriangularView<MatrixType,UpLo>::assignProduct(const ProductBase<ProductDerived, _Lhs,_Rhs>& prod, const Scalar& alpha) +template<typename ProductType> +TriangularView<MatrixType,UpLo>& TriangularViewImpl<MatrixType,UpLo,Dense>::_assignProduct(const ProductType& prod, const Scalar& alpha, bool beta) { - general_product_to_triangular_selector<MatrixType, ProductDerived, UpLo, (_Lhs::ColsAtCompileTime==1) || (_Rhs::RowsAtCompileTime==1)>::run(m_matrix.const_cast_derived(), prod.derived(), alpha); + eigen_assert(derived().nestedExpression().rows() == prod.rows() && derived().cols() == prod.cols()); + + general_product_to_triangular_selector<MatrixType, ProductType, UpLo, internal::traits<ProductType>::InnerSize==1>::run(derived().nestedExpression().const_cast_derived(), prod, alpha, beta); - return *this; + return derived(); } } // end namespace Eigen diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_MKL.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h index 3deed068e..5b7c15cca 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_MKL.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h @@ -25,15 +25,15 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ******************************************************************************** - * Content : Eigen bindings to Intel(R) MKL + * Content : Eigen bindings to BLAS F77 * Level 3 BLAS SYRK/HERK implementation. 
******************************************************************************** */ -#ifndef EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_MKL_H -#define EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_MKL_H +#ifndef EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_BLAS_H +#define EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_BLAS_H -namespace Eigen { +namespace Eigen { namespace internal { @@ -44,34 +44,35 @@ struct general_matrix_matrix_rankupdate : // try to go to BLAS specialization -#define EIGEN_MKL_RANKUPDATE_SPECIALIZE(Scalar) \ +#define EIGEN_BLAS_RANKUPDATE_SPECIALIZE(Scalar) \ template <typename Index, int LhsStorageOrder, bool ConjugateLhs, \ int RhsStorageOrder, bool ConjugateRhs, int UpLo> \ struct general_matrix_matrix_triangular_product<Index,Scalar,LhsStorageOrder,ConjugateLhs, \ Scalar,RhsStorageOrder,ConjugateRhs,ColMajor,UpLo,Specialized> { \ static EIGEN_STRONG_INLINE void run(Index size, Index depth,const Scalar* lhs, Index lhsStride, \ - const Scalar* rhs, Index rhsStride, Scalar* res, Index resStride, Scalar alpha) \ + const Scalar* rhs, Index rhsStride, Scalar* res, Index resStride, Scalar alpha, level3_blocking<Scalar, Scalar>& blocking) \ { \ if (lhs==rhs) { \ general_matrix_matrix_rankupdate<Index,Scalar,LhsStorageOrder,ConjugateLhs,ColMajor,UpLo> \ - ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha); \ + ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha,blocking); \ } else { \ general_matrix_matrix_triangular_product<Index, \ Scalar, LhsStorageOrder, ConjugateLhs, \ Scalar, RhsStorageOrder, ConjugateRhs, \ ColMajor, UpLo, BuiltIn> \ - ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha); \ + ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha,blocking); \ } \ } \ }; -EIGEN_MKL_RANKUPDATE_SPECIALIZE(double) -//EIGEN_MKL_RANKUPDATE_SPECIALIZE(dcomplex) -EIGEN_MKL_RANKUPDATE_SPECIALIZE(float) -//EIGEN_MKL_RANKUPDATE_SPECIALIZE(scomplex) +EIGEN_BLAS_RANKUPDATE_SPECIALIZE(double) +EIGEN_BLAS_RANKUPDATE_SPECIALIZE(float) +// TODO handle complex cases +// EIGEN_BLAS_RANKUPDATE_SPECIALIZE(dcomplex) +// EIGEN_BLAS_RANKUPDATE_SPECIALIZE(scomplex) // SYRK for float/double -#define EIGEN_MKL_RANKUPDATE_R(EIGTYPE, MKLTYPE, MKLFUNC) \ +#define EIGEN_BLAS_RANKUPDATE_R(EIGTYPE, BLASTYPE, BLASFUNC) \ template <typename Index, int AStorageOrder, bool ConjugateA, int UpLo> \ struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,ColMajor,UpLo> { \ enum { \ @@ -80,23 +81,19 @@ struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,C conjA = ((AStorageOrder==ColMajor) && ConjugateA) ? 1 : 0 \ }; \ static EIGEN_STRONG_INLINE void run(Index size, Index depth,const EIGTYPE* lhs, Index lhsStride, \ - const EIGTYPE* rhs, Index rhsStride, EIGTYPE* res, Index resStride, EIGTYPE alpha) \ + const EIGTYPE* /*rhs*/, Index /*rhsStride*/, EIGTYPE* res, Index resStride, EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \ { \ /* typedef Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> MatrixRhs;*/ \ \ - MKL_INT lda=lhsStride, ldc=resStride, n=size, k=depth; \ - char uplo=(IsLower) ? 'L' : 'U', trans=(AStorageOrder==RowMajor) ? 
'T':'N'; \ - MKLTYPE alpha_, beta_; \ -\ -/* Set alpha_ & beta_ */ \ - assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); \ - assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(beta_, EIGTYPE(1)); \ - MKLFUNC(&uplo, &trans, &n, &k, &alpha_, lhs, &lda, &beta_, res, &ldc); \ + BlasIndex lda=convert_index<BlasIndex>(lhsStride), ldc=convert_index<BlasIndex>(resStride), n=convert_index<BlasIndex>(size), k=convert_index<BlasIndex>(depth); \ + char uplo=((IsLower) ? 'L' : 'U'), trans=((AStorageOrder==RowMajor) ? 'T':'N'); \ + EIGTYPE beta(1); \ + BLASFUNC(&uplo, &trans, &n, &k, &numext::real_ref(alpha), lhs, &lda, &numext::real_ref(beta), res, &ldc); \ } \ }; // HERK for complex data -#define EIGEN_MKL_RANKUPDATE_C(EIGTYPE, MKLTYPE, RTYPE, MKLFUNC) \ +#define EIGEN_BLAS_RANKUPDATE_C(EIGTYPE, BLASTYPE, RTYPE, BLASFUNC) \ template <typename Index, int AStorageOrder, bool ConjugateA, int UpLo> \ struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,ColMajor,UpLo> { \ enum { \ @@ -105,18 +102,15 @@ struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,C conjA = (((AStorageOrder==ColMajor) && ConjugateA) || ((AStorageOrder==RowMajor) && !ConjugateA)) ? 1 : 0 \ }; \ static EIGEN_STRONG_INLINE void run(Index size, Index depth,const EIGTYPE* lhs, Index lhsStride, \ - const EIGTYPE* rhs, Index rhsStride, EIGTYPE* res, Index resStride, EIGTYPE alpha) \ + const EIGTYPE* /*rhs*/, Index /*rhsStride*/, EIGTYPE* res, Index resStride, EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \ { \ typedef Matrix<EIGTYPE, Dynamic, Dynamic, AStorageOrder> MatrixType; \ \ - MKL_INT lda=lhsStride, ldc=resStride, n=size, k=depth; \ - char uplo=(IsLower) ? 'L' : 'U', trans=(AStorageOrder==RowMajor) ? 'C':'N'; \ + BlasIndex lda=convert_index<BlasIndex>(lhsStride), ldc=convert_index<BlasIndex>(resStride), n=convert_index<BlasIndex>(size), k=convert_index<BlasIndex>(depth); \ + char uplo=((IsLower) ? 'L' : 'U'), trans=((AStorageOrder==RowMajor) ? 
'C':'N'); \ RTYPE alpha_, beta_; \ const EIGTYPE* a_ptr; \ \ -/* Set alpha_ & beta_ */ \ -/* assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); */\ -/* assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(beta_, EIGTYPE(1));*/ \ alpha_ = alpha.real(); \ beta_ = 1.0; \ /* Copy with conjugation in some cases*/ \ @@ -127,20 +121,21 @@ struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,C lda = a.outerStride(); \ a_ptr = a.data(); \ } else a_ptr=lhs; \ - MKLFUNC(&uplo, &trans, &n, &k, &alpha_, (MKLTYPE*)a_ptr, &lda, &beta_, (MKLTYPE*)res, &ldc); \ + BLASFUNC(&uplo, &trans, &n, &k, &alpha_, (BLASTYPE*)a_ptr, &lda, &beta_, (BLASTYPE*)res, &ldc); \ } \ }; -EIGEN_MKL_RANKUPDATE_R(double, double, dsyrk) -EIGEN_MKL_RANKUPDATE_R(float, float, ssyrk) +EIGEN_BLAS_RANKUPDATE_R(double, double, dsyrk_) +EIGEN_BLAS_RANKUPDATE_R(float, float, ssyrk_) -//EIGEN_MKL_RANKUPDATE_C(dcomplex, MKL_Complex16, double, zherk) -//EIGEN_MKL_RANKUPDATE_C(scomplex, MKL_Complex8, double, cherk) +// TODO handle complex cases +// EIGEN_BLAS_RANKUPDATE_C(dcomplex, double, double, zherk_) +// EIGEN_BLAS_RANKUPDATE_C(scomplex, float, float, cherk_) } // end namespace internal } // end namespace Eigen -#endif // EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_MKL_H +#endif // EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_BLAS_H diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix_MKL.h b/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h index 060af328e..7a3bdbf20 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix_MKL.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h @@ -25,13 +25,13 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ******************************************************************************** - * Content : Eigen bindings to Intel(R) MKL + * Content : Eigen bindings to BLAS F77 * General matrix-matrix product functionality based on ?GEMM. ******************************************************************************** */ -#ifndef EIGEN_GENERAL_MATRIX_MATRIX_MKL_H -#define EIGEN_GENERAL_MATRIX_MATRIX_MKL_H +#ifndef EIGEN_GENERAL_MATRIX_MATRIX_BLAS_H +#define EIGEN_GENERAL_MATRIX_MATRIX_BLAS_H namespace Eigen { @@ -46,13 +46,15 @@ namespace internal { // gemm specialization -#define GEMM_SPECIALIZATION(EIGTYPE, EIGPREFIX, MKLTYPE, MKLPREFIX) \ +#define GEMM_SPECIALIZATION(EIGTYPE, EIGPREFIX, BLASTYPE, BLASPREFIX) \ template< \ typename Index, \ int LhsStorageOrder, bool ConjugateLhs, \ int RhsStorageOrder, bool ConjugateRhs> \ struct general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor> \ { \ +typedef gebp_traits<EIGTYPE,EIGTYPE> Traits; \ +\ static void run(Index rows, Index cols, Index depth, \ const EIGTYPE* _lhs, Index lhsStride, \ const EIGTYPE* _rhs, Index rhsStride, \ @@ -64,55 +66,50 @@ static void run(Index rows, Index cols, Index depth, \ using std::conj; \ \ char transa, transb; \ - MKL_INT m, n, k, lda, ldb, ldc; \ + BlasIndex m, n, k, lda, ldb, ldc; \ const EIGTYPE *a, *b; \ - MKLTYPE alpha_, beta_; \ + EIGTYPE beta(1); \ MatrixX##EIGPREFIX a_tmp, b_tmp; \ - EIGTYPE myone(1);\ \ /* Set transpose options */ \ transa = (LhsStorageOrder==RowMajor) ? ((ConjugateLhs) ? 'C' : 'T') : 'N'; \ transb = (RhsStorageOrder==RowMajor) ? ((ConjugateRhs) ?
'C' : 'T') : 'N'; \ \ /* Set m, n, k */ \ - m = (MKL_INT)rows; \ - n = (MKL_INT)cols; \ - k = (MKL_INT)depth; \ -\ -/* Set alpha_ & beta_ */ \ - assign_scalar_eig2mkl(alpha_, alpha); \ - assign_scalar_eig2mkl(beta_, myone); \ + m = convert_index<BlasIndex>(rows); \ + n = convert_index<BlasIndex>(cols); \ + k = convert_index<BlasIndex>(depth); \ \ /* Set lda, ldb, ldc */ \ - lda = (MKL_INT)lhsStride; \ - ldb = (MKL_INT)rhsStride; \ - ldc = (MKL_INT)resStride; \ + lda = convert_index<BlasIndex>(lhsStride); \ + ldb = convert_index<BlasIndex>(rhsStride); \ + ldc = convert_index<BlasIndex>(resStride); \ \ /* Set a, b, c */ \ if ((LhsStorageOrder==ColMajor) && (ConjugateLhs)) { \ Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,m,k,OuterStride<>(lhsStride)); \ a_tmp = lhs.conjugate(); \ a = a_tmp.data(); \ - lda = a_tmp.outerStride(); \ + lda = convert_index<BlasIndex>(a_tmp.outerStride()); \ } else a = _lhs; \ \ if ((RhsStorageOrder==ColMajor) && (ConjugateRhs)) { \ Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,k,n,OuterStride<>(rhsStride)); \ b_tmp = rhs.conjugate(); \ b = b_tmp.data(); \ - ldb = b_tmp.outerStride(); \ + ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \ } else b = _rhs; \ \ - MKLPREFIX##gemm(&transa, &transb, &m, &n, &k, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &beta_, (MKLTYPE*)res, &ldc); \ + BLASPREFIX##gemm_(&transa, &transb, &m, &n, &k, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ }}; -GEMM_SPECIALIZATION(double, d, double, d) -GEMM_SPECIALIZATION(float, f, float, s) -GEMM_SPECIALIZATION(dcomplex, cd, MKL_Complex16, z) -GEMM_SPECIALIZATION(scomplex, cf, MKL_Complex8, c) +GEMM_SPECIALIZATION(double, d, double, d) +GEMM_SPECIALIZATION(float, f, float, s) +GEMM_SPECIALIZATION(dcomplex, cd, double, z) +GEMM_SPECIALIZATION(scomplex, cf, float, c) } // end namespace internal } // end namespace Eigen -#endif // EIGEN_GENERAL_MATRIX_MATRIX_MKL_H +#endif // EIGEN_GENERAL_MATRIX_MATRIX_BLAS_H diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h index 09387703e..3c1a7fc40 100644 --- a/Eigen/src/Core/products/GeneralMatrixVector.h +++ b/Eigen/src/Core/products/GeneralMatrixVector.h @@ -10,7 +10,7 @@ #ifndef EIGEN_GENERAL_MATRIX_VECTOR_H #define EIGEN_GENERAL_MATRIX_VECTOR_H -namespace Eigen { +namespace Eigen { namespace internal { @@ -26,11 +26,39 @@ namespace internal { * |real |cplx |real | alpha is converted to a cplx when calling the run function, no vectorization * |cplx |real |cplx | invalid, the caller has to do tmp: = A * B; C += alpha*tmp * |cplx |real |real | optimal case, vectorization possible via real-cplx mul + * + * Accesses to the matrix coefficients follow the following logic: + * + * - if all columns have the same alignment then + * - if the columns have the same alignment as the result vector, then easy!
(-> AllAligned case) + * - otherwise perform unaligned loads only (-> NoneAligned case) + * - otherwise + * - if even columns have the same alignment then + * // odd columns are guaranteed to have the same alignment too + * - if even or odd columns have the same alignment as the result, then + * // for a register size of 2 scalars, this is guaranteed to be the case (e.g., SSE with double) + * - perform half aligned and half unaligned loads (-> EvenAligned case) + * - otherwise perform unaligned loads only (-> NoneAligned case) + * - otherwise, if the register size is 4 scalars (e.g., SSE with float) then + * - one out of 4 consecutive columns is guaranteed to be aligned with the result vector, + * perform simple aligned loads for this column and aligned loads plus re-alignment for the others. (-> FirstAligned case) + * // this re-alignment is done by the palign function implemented for SSE in Eigen/src/Core/arch/SSE/PacketMath.h + * - otherwise, + * // if we get here, this means the register size is greater than 4 (e.g., AVX with floats), + * // we currently fall back to the NoneAligned case + * + * The same reasoning applies to the transposed case. + * + * The last case (PacketSize>4) could probably be improved by generalizing the FirstAligned case, but since we do not support AVX yet... + * One might also wonder why in the EvenAligned case we perform unaligned loads instead of using the aligned-loads plus re-alignment + * strategy as in the FirstAligned case. The reason is that we observed that unaligned loads on an 8 byte boundary are not too slow + * compared to unaligned loads on a 4 byte boundary. + * */ -template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version> -struct general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version> +template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version> +struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version> { -typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar; + typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar; enum { Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable @@ -50,31 +78,35 @@ typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket; EIGEN_DONT_INLINE static void run( Index rows, Index cols, - const LhsScalar* lhs, Index lhsStride, - const RhsScalar* rhs, Index rhsIncr, - ResScalar* res, Index resIncr, RhsScalar alpha); + const LhsMapper& lhs, + const RhsMapper& rhs, + ResScalar* res, Index resIncr, + RhsScalar alpha); }; -template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version> -EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>::run( +template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version> +EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run( Index rows, Index cols, - const LhsScalar* lhs, Index lhsStride, - const RhsScalar* rhs, Index rhsIncr, - ResScalar* res, Index resIncr, RhsScalar alpha) + const LhsMapper& lhs, + const
RhsMapper& rhs, + ResScalar* res, Index resIncr, + RhsScalar alpha) { - EIGEN_UNUSED_VARIABLE(resIncr) + EIGEN_UNUSED_VARIABLE(resIncr); eigen_internal_assert(resIncr==1); #ifdef _EIGEN_ACCUMULATE_PACKETS #error _EIGEN_ACCUMULATE_PACKETS has already been defined #endif - #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) \ + #define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) \ pstore(&res[j], \ padd(pload<ResPacket>(&res[j]), \ padd( \ - padd(pcj.pmul(EIGEN_CAT(ploa , A0)<LhsPacket>(&lhs0[j]), ptmp0), \ - pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs1[j]), ptmp1)), \ - padd(pcj.pmul(EIGEN_CAT(ploa , A2)<LhsPacket>(&lhs2[j]), ptmp2), \ - pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs3[j]), ptmp3)) ))) + padd(pcj.pmul(lhs0.template load<LhsPacket, Alignment0>(j), ptmp0), \ + pcj.pmul(lhs1.template load<LhsPacket, Alignment13>(j), ptmp1)), \ + padd(pcj.pmul(lhs2.template load<LhsPacket, Alignment2>(j), ptmp2), \ + pcj.pmul(lhs3.template load<LhsPacket, Alignment13>(j), ptmp3)) ))) + + typedef typename LhsMapper::VectorMapper LhsScalars; conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj; conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj; @@ -88,10 +120,12 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co const Index ResPacketAlignedMask = ResPacketSize-1; // const Index PeelAlignedMask = ResPacketSize*peels-1; const Index size = rows; - + + const Index lhsStride = lhs.stride(); + // How many coeffs of the result do we have to skip to be aligned. // Here we assume data are at least aligned on the base scalar type. - Index alignedStart = internal::first_aligned(res,size); + Index alignedStart = internal::first_default_aligned(res,size); Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0; const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1; @@ -101,19 +135,26 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co : FirstAligned; // we cannot assume the first element is aligned because of sub-matrices - const Index lhsAlignmentOffset = internal::first_aligned(lhs,size); + const Index lhsAlignmentOffset = lhs.firstAligned(size); // find how many columns do we have to skip to be aligned with the result (if possible) Index skipColumns = 0; // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats) - if( (size_t(lhs)%sizeof(LhsScalar)) || (size_t(res)%sizeof(ResScalar)) ) + if( (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == size) || (UIntPtr(res)%sizeof(ResScalar)) ) { alignedSize = 0; alignedStart = 0; + alignmentPattern = NoneAligned; + } + else if(LhsPacketSize > 4) + { + // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4. + // Currently, it seems to be better to perform unaligned loads anyway + alignmentPattern = NoneAligned; } else if (LhsPacketSize>1) { - eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize); + // eigen_internal_assert(size_t(firstLhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize); while (skipColumns<LhsPacketSize && alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%LhsPacketSize)) @@ -130,10 +171,10 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co // note that the skipped columns are processed later.
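The firstAligned logic used above differs from the old first_aligned in one notable way: it can return a negative offset to signal that the mapper's data can never be aligned, which is exactly what the (lhsAlignmentOffset < 0) test catches. Below is a stand-alone model of such an offset computation; it is a sketch under the assumption that Align is a multiple of sizeof(Scalar), and first_aligned_offset is an invented name, not the Eigen implementation.

  #include <cstddef>
  #include <cstdint>

  // Number of elements to skip so that ptr+offset is Align-byte aligned,
  // or -1 when the pointer is not even scalar-aligned and no offset can fix it.
  template <typename Scalar, std::size_t Align>
  long first_aligned_offset(const Scalar* ptr)
  {
    std::uintptr_t p = reinterpret_cast<std::uintptr_t>(ptr);
    if (p % sizeof(Scalar) != 0)
      return -1; // misaligned at the scalar level: alignment is unreachable
    return static_cast<long>(((Align - p % Align) % Align) / sizeof(Scalar));
  }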
} - eigen_internal_assert( (alignmentPattern==NoneAligned) + /* eigen_internal_assert( (alignmentPattern==NoneAligned) || (skipColumns + columnsAtOnce >= cols) || LhsPacketSize > size - || (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0); + || (size_t(firstLhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);*/ } else if(Vectorizable) { @@ -142,20 +183,20 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co alignmentPattern = AllAligned; } - Index offset1 = (FirstAligned && alignmentStep==1?3:1); - Index offset3 = (FirstAligned && alignmentStep==1?1:3); + const Index offset1 = (FirstAligned && alignmentStep==1)?3:1; + const Index offset3 = (FirstAligned && alignmentStep==1)?1:3; Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns; for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce) { - RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[i*rhsIncr]), - ptmp1 = pset1<RhsPacket>(alpha*rhs[(i+offset1)*rhsIncr]), - ptmp2 = pset1<RhsPacket>(alpha*rhs[(i+2)*rhsIncr]), - ptmp3 = pset1<RhsPacket>(alpha*rhs[(i+offset3)*rhsIncr]); + RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs(i, 0)), + ptmp1 = pset1<RhsPacket>(alpha*rhs(i+offset1, 0)), + ptmp2 = pset1<RhsPacket>(alpha*rhs(i+2, 0)), + ptmp3 = pset1<RhsPacket>(alpha*rhs(i+offset3, 0)); // this helps a lot generating better binary code - const LhsScalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride, - *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride; + const LhsScalars lhs0 = lhs.getVectorMapper(0, i+0), lhs1 = lhs.getVectorMapper(0, i+offset1), + lhs2 = lhs.getVectorMapper(0, i+2), lhs3 = lhs.getVectorMapper(0, i+offset3); if (Vectorizable) { @@ -163,10 +204,10 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co // process initial unaligned coeffs for (Index j=0; j<alignedStart; ++j) { - res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]); - res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]); - res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]); - res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]); + res[j] = cj.pmadd(lhs0(j), pfirst(ptmp0), res[j]); + res[j] = cj.pmadd(lhs1(j), pfirst(ptmp1), res[j]); + res[j] = cj.pmadd(lhs2(j), pfirst(ptmp2), res[j]); + res[j] = cj.pmadd(lhs3(j), pfirst(ptmp3), res[j]); } if (alignedSize>alignedStart) @@ -175,11 +216,11 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co { case AllAligned: for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize) - _EIGEN_ACCUMULATE_PACKETS(d,d,d); + _EIGEN_ACCUMULATE_PACKETS(Aligned,Aligned,Aligned); break; case EvenAligned: for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize) - _EIGEN_ACCUMULATE_PACKETS(d,du,d); + _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Aligned); break; case FirstAligned: { @@ -189,28 +230,28 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co LhsPacket A00, A01, A02, A03, A10, A11, A12, A13; ResPacket T0, T1; - A01 = pload<LhsPacket>(&lhs1[alignedStart-1]); - A02 = pload<LhsPacket>(&lhs2[alignedStart-2]); - A03 = pload<LhsPacket>(&lhs3[alignedStart-3]); + A01 = lhs1.template load<LhsPacket, Aligned>(alignedStart-1); + A02 = lhs2.template load<LhsPacket, Aligned>(alignedStart-2); + A03 = lhs3.template load<LhsPacket, Aligned>(alignedStart-3); for (; j<peeledSize; j+=peels*ResPacketSize) { - A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]); palign<1>(A01,A11); - A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]); palign<2>(A02,A12); 
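The peeled loop being rewritten here leans entirely on palign<Offset>, so a scalar model of that primitive may help. The sketch below (4 lanes, invented name palign_model) mirrors what the SSE implementation referenced earlier computes: keep the last 4-Offset lanes of the first packet and append the first Offset lanes of the second.

  #include <array>

  template <int Offset>
  void palign_model(std::array<float,4>& first, const std::array<float,4>& second)
  {
    std::array<float,4> r;
    for (int i = 0; i < 4; ++i) // lanes Offset..Offset+3 of the 8-lane pair [first|second]
      r[i] = (i + Offset < 4) ? first[i + Offset] : second[i + Offset - 4];
    first = r;
  }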
- A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]); palign<3>(A03,A13); + A11 = lhs1.template load<LhsPacket, Aligned>(j-1+LhsPacketSize); palign<1>(A01,A11); + A12 = lhs2.template load<LhsPacket, Aligned>(j-2+LhsPacketSize); palign<2>(A02,A12); + A13 = lhs3.template load<LhsPacket, Aligned>(j-3+LhsPacketSize); palign<3>(A03,A13); - A00 = pload<LhsPacket>(&lhs0[j]); - A10 = pload<LhsPacket>(&lhs0[j+LhsPacketSize]); + A00 = lhs0.template load<LhsPacket, Aligned>(j); + A10 = lhs0.template load<LhsPacket, Aligned>(j+LhsPacketSize); T0 = pcj.pmadd(A00, ptmp0, pload<ResPacket>(&res[j])); T1 = pcj.pmadd(A10, ptmp0, pload<ResPacket>(&res[j+ResPacketSize])); T0 = pcj.pmadd(A01, ptmp1, T0); - A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]); palign<1>(A11,A01); + A01 = lhs1.template load<LhsPacket, Aligned>(j-1+2*LhsPacketSize); palign<1>(A11,A01); T0 = pcj.pmadd(A02, ptmp2, T0); - A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]); palign<2>(A12,A02); + A02 = lhs2.template load<LhsPacket, Aligned>(j-2+2*LhsPacketSize); palign<2>(A12,A02); T0 = pcj.pmadd(A03, ptmp3, T0); pstore(&res[j],T0); - A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]); palign<3>(A13,A03); + A03 = lhs3.template load<LhsPacket, Aligned>(j-3+2*LhsPacketSize); palign<3>(A13,A03); T1 = pcj.pmadd(A11, ptmp1, T1); T1 = pcj.pmadd(A12, ptmp2, T1); T1 = pcj.pmadd(A13, ptmp3, T1); @@ -218,12 +259,12 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co } } for (; j<alignedSize; j+=ResPacketSize) - _EIGEN_ACCUMULATE_PACKETS(d,du,du); + _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Unaligned); break; } default: for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize) - _EIGEN_ACCUMULATE_PACKETS(du,du,du); + _EIGEN_ACCUMULATE_PACKETS(Unaligned,Unaligned,Unaligned); break; } } @@ -232,10 +273,10 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co /* process remaining coeffs (or all if there is no explicit vectorization) */ for (Index j=alignedSize; j<size; ++j) { - res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]); - res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]); - res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]); - res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]); + res[j] = cj.pmadd(lhs0(j), pfirst(ptmp0), res[j]); + res[j] = cj.pmadd(lhs1(j), pfirst(ptmp1), res[j]); + res[j] = cj.pmadd(lhs2(j), pfirst(ptmp2), res[j]); + res[j] = cj.pmadd(lhs3(j), pfirst(ptmp3), res[j]); } } @@ -246,27 +287,27 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co { for (Index k=start; k<end; ++k) { - RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[k*rhsIncr]); - const LhsScalar* lhs0 = lhs + k*lhsStride; + RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs(k, 0)); + const LhsScalars lhs0 = lhs.getVectorMapper(0, k); if (Vectorizable) { /* explicit vectorization */ // process first unaligned result's coeffs for (Index j=0; j<alignedStart; ++j) - res[j] += cj.pmul(lhs0[j], pfirst(ptmp0)); + res[j] += cj.pmul(lhs0(j), pfirst(ptmp0)); // process aligned result's coeffs - if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0) + if (lhs0.template aligned<LhsPacket>(alignedStart)) for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize) - pstore(&res[i], pcj.pmadd(pload<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i]))); + pstore(&res[i], pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(i), ptmp0, pload<ResPacket>(&res[i]))); else for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize) - pstore(&res[i], pcj.pmadd(ploadu<LhsPacket>(&lhs0[i]), ptmp0, 
pload<ResPacket>(&res[i]))); + pstore(&res[i], pcj.pmadd(lhs0.template load<LhsPacket, Unaligned>(i), ptmp0, pload<ResPacket>(&res[i]))); } // process remaining scalars (or all if no explicit vectorization) for (Index i=alignedSize; i<size; ++i) - res[i] += cj.pmul(lhs0[i], pfirst(ptmp0)); + res[i] += cj.pmul(lhs0(i), pfirst(ptmp0)); } if (skipColumns) { @@ -290,10 +331,10 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co * - alpha is always a complex (or converted to a complex) * - no vectorization */ -template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version> -struct general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version> +template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version> +struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version> { -typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar; +typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar; enum { Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable @@ -310,73 +351,84 @@ typedef typename packet_traits<ResScalar>::type _ResPacket; typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket; typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket; typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket; - + EIGEN_DONT_INLINE static void run( Index rows, Index cols, - const LhsScalar* lhs, Index lhsStride, - const RhsScalar* rhs, Index rhsIncr, - ResScalar* res, Index resIncr, + const LhsMapper& lhs, + const RhsMapper& rhs, + ResScalar* res, Index resIncr, ResScalar alpha); }; -template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version> -EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>::run( +template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version> +EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run( Index rows, Index cols, - const LhsScalar* lhs, Index lhsStride, - const RhsScalar* rhs, Index rhsIncr, + const LhsMapper& lhs, + const RhsMapper& rhs, ResScalar* res, Index resIncr, ResScalar alpha) { - EIGEN_UNUSED_VARIABLE(rhsIncr); - eigen_internal_assert(rhsIncr==1); + eigen_internal_assert(rhs.stride()==1); + #ifdef _EIGEN_ACCUMULATE_PACKETS #error _EIGEN_ACCUMULATE_PACKETS has already been defined #endif - #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) {\ - RhsPacket b = pload<RhsPacket>(&rhs[j]); \ - ptmp0 = pcj.pmadd(EIGEN_CAT(ploa,A0) <LhsPacket>(&lhs0[j]), b, ptmp0); \ - ptmp1 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs1[j]), b, ptmp1); \ - ptmp2 = pcj.pmadd(EIGEN_CAT(ploa,A2) <LhsPacket>(&lhs2[j]), b, ptmp2); \ - ptmp3 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs3[j]), b, ptmp3); } + #define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) {\ + RhsPacket b = rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0); \ + ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Alignment0>(j), b, ptmp0); \ + ptmp1 = 
pcj.pmadd(lhs1.template load<LhsPacket, Alignment13>(j), b, ptmp1); \ + ptmp2 = pcj.pmadd(lhs2.template load<LhsPacket, Alignment2>(j), b, ptmp2); \ + ptmp3 = pcj.pmadd(lhs3.template load<LhsPacket, Alignment13>(j), b, ptmp3); } conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj; conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj; + typedef typename LhsMapper::VectorMapper LhsScalars; + enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 }; const Index rowsAtOnce = 4; const Index peels = 2; const Index RhsPacketAlignedMask = RhsPacketSize-1; const Index LhsPacketAlignedMask = LhsPacketSize-1; -// const Index PeelAlignedMask = RhsPacketSize*peels-1; const Index depth = cols; + const Index lhsStride = lhs.stride(); // How many coeffs of the result do we have to skip to be aligned. // Here we assume data are at least aligned on the base scalar type // if that's not the case then vectorization is discarded, see below. - Index alignedStart = internal::first_aligned(rhs, depth); + Index alignedStart = rhs.firstAligned(depth); Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0; const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1; const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0; Index alignmentPattern = alignmentStep==0 ? AllAligned - : alignmentStep==(LhsPacketSize/2) ? EvenAligned - : FirstAligned; + : alignmentStep==(LhsPacketSize/2) ? EvenAligned + : FirstAligned; // we cannot assume the first element is aligned because of sub-matrices - const Index lhsAlignmentOffset = internal::first_aligned(lhs,depth); + const Index lhsAlignmentOffset = lhs.firstAligned(depth); + const Index rhsAlignmentOffset = rhs.firstAligned(rows); // find how many rows do we have to skip to be aligned with rhs (if possible) Index skipRows = 0; // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats) - if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || (size_t(lhs)%sizeof(LhsScalar)) || (size_t(rhs)%sizeof(RhsScalar)) ) + if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || + (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == depth) || + (rhsAlignmentOffset < 0) || (rhsAlignmentOffset == rows) ) { alignedSize = 0; alignedStart = 0; + alignmentPattern = NoneAligned; + } + else if(LhsPacketSize > 4) + { + // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4. + alignmentPattern = NoneAligned; } else if (LhsPacketSize>1) { - eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth<LhsPacketSize); + // eigen_internal_assert(size_t(firstLhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth<LhsPacketSize); while (skipRows<LhsPacketSize && alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%LhsPacketSize)) @@ -392,11 +444,11 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co skipRows = (std::min)(skipRows,Index(rows)); // note that the skiped columns are processed later. 
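The skipRows/alignmentPattern bookkeeping above can be hard to follow in diff form. Here is a small self-contained sketch of the two computations it performs, under the assumption of a power-of-two packet size; the function and enum names are mine, chosen to mirror firstAligned() and the AllAligned/EvenAligned/FirstAligned/NoneAligned cases, not Eigen's exact API.

#include <cstddef>
#include <cstdint>
#include <cstdio>

enum Pattern { AllAligned, EvenAligned, FirstAligned, NoneAligned };

// Number of scalars to skip from `ptr` before hitting a packet boundary.
// A negative result models the "cannot be aligned at all" case that the
// patch checks via (lhsAlignmentOffset < 0).
template <typename Scalar>
std::ptrdiff_t first_aligned(const Scalar* ptr, std::ptrdiff_t size, int packetSize) {
  std::uintptr_t addr = reinterpret_cast<std::uintptr_t>(ptr);
  if (addr % sizeof(Scalar)) return -1;  // not even scalar-aligned
  std::ptrdiff_t skip = (packetSize - (addr / sizeof(Scalar)) % packetSize) % packetSize;
  return skip < size ? skip : size;
}

Pattern classify(std::ptrdiff_t stride, int packetSize) {
  // alignmentStep: how far the aligned offset drifts from one panel to the next.
  std::ptrdiff_t step = (packetSize - stride % packetSize) % packetSize;
  if (step == 0)              return AllAligned;   // every panel is aligned
  if (step == packetSize / 2) return EvenAligned;  // every other panel is
  return FirstAligned;                             // only the first one is
}

int main() {
  alignas(16) float buf[64];
  std::printf("skip=%td pattern=%d\n",
              first_aligned(buf + 1, 63, 4), classify(7, 4)); // skip=3, FirstAligned
  return 0;
}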
} - eigen_internal_assert( alignmentPattern==NoneAligned + /* eigen_internal_assert( alignmentPattern==NoneAligned || LhsPacketSize==1 || (skipRows + rowsAtOnce >= rows) || LhsPacketSize > depth - || (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0); + || (size_t(firstLhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);*/ } else if(Vectorizable) { @@ -405,18 +457,19 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co alignmentPattern = AllAligned; } - Index offset1 = (FirstAligned && alignmentStep==1?3:1); - Index offset3 = (FirstAligned && alignmentStep==1?1:3); + const Index offset1 = (FirstAligned && alignmentStep==1)?3:1; + const Index offset3 = (FirstAligned && alignmentStep==1)?1:3; Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows; for (Index i=skipRows; i<rowBound; i+=rowsAtOnce) { - EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0); + // FIXME: what is the purpose of this EIGEN_ALIGN_DEFAULT ?? + EIGEN_ALIGN_MAX ResScalar tmp0 = ResScalar(0); ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0); // this helps the compiler generating good binary code - const LhsScalar *lhs0 = lhs + i*lhsStride, *lhs1 = lhs + (i+offset1)*lhsStride, - *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride; + const LhsScalars lhs0 = lhs.getVectorMapper(i+0, 0), lhs1 = lhs.getVectorMapper(i+offset1, 0), + lhs2 = lhs.getVectorMapper(i+2, 0), lhs3 = lhs.getVectorMapper(i+offset3, 0); if (Vectorizable) { @@ -428,9 +481,9 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co // FIXME this loop get vectorized by the compiler ! for (Index j=0; j<alignedStart; ++j) { - RhsScalar b = rhs[j]; - tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b); - tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b); + RhsScalar b = rhs(j, 0); + tmp0 += cj.pmul(lhs0(j),b); tmp1 += cj.pmul(lhs1(j),b); + tmp2 += cj.pmul(lhs2(j),b); tmp3 += cj.pmul(lhs3(j),b); } if (alignedSize>alignedStart) @@ -439,11 +492,11 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co { case AllAligned: for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize) - _EIGEN_ACCUMULATE_PACKETS(d,d,d); + _EIGEN_ACCUMULATE_PACKETS(Aligned,Aligned,Aligned); break; case EvenAligned: for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize) - _EIGEN_ACCUMULATE_PACKETS(d,du,d); + _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Aligned); break; case FirstAligned: { @@ -457,39 +510,39 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co * than basic unaligned loads. 
*/ LhsPacket A01, A02, A03, A11, A12, A13; - A01 = pload<LhsPacket>(&lhs1[alignedStart-1]); - A02 = pload<LhsPacket>(&lhs2[alignedStart-2]); - A03 = pload<LhsPacket>(&lhs3[alignedStart-3]); + A01 = lhs1.template load<LhsPacket, Aligned>(alignedStart-1); + A02 = lhs2.template load<LhsPacket, Aligned>(alignedStart-2); + A03 = lhs3.template load<LhsPacket, Aligned>(alignedStart-3); for (; j<peeledSize; j+=peels*RhsPacketSize) { - RhsPacket b = pload<RhsPacket>(&rhs[j]); - A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]); palign<1>(A01,A11); - A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]); palign<2>(A02,A12); - A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]); palign<3>(A03,A13); + RhsPacket b = rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0); + A11 = lhs1.template load<LhsPacket, Aligned>(j-1+LhsPacketSize); palign<1>(A01,A11); + A12 = lhs2.template load<LhsPacket, Aligned>(j-2+LhsPacketSize); palign<2>(A02,A12); + A13 = lhs3.template load<LhsPacket, Aligned>(j-3+LhsPacketSize); palign<3>(A03,A13); - ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), b, ptmp0); + ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j), b, ptmp0); ptmp1 = pcj.pmadd(A01, b, ptmp1); - A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]); palign<1>(A11,A01); + A01 = lhs1.template load<LhsPacket, Aligned>(j-1+2*LhsPacketSize); palign<1>(A11,A01); ptmp2 = pcj.pmadd(A02, b, ptmp2); - A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]); palign<2>(A12,A02); + A02 = lhs2.template load<LhsPacket, Aligned>(j-2+2*LhsPacketSize); palign<2>(A12,A02); ptmp3 = pcj.pmadd(A03, b, ptmp3); - A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]); palign<3>(A13,A03); + A03 = lhs3.template load<LhsPacket, Aligned>(j-3+2*LhsPacketSize); palign<3>(A13,A03); - b = pload<RhsPacket>(&rhs[j+RhsPacketSize]); - ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j+LhsPacketSize]), b, ptmp0); + b = rhs.getVectorMapper(j+RhsPacketSize, 0).template load<RhsPacket, Aligned>(0); + ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j+LhsPacketSize), b, ptmp0); ptmp1 = pcj.pmadd(A11, b, ptmp1); ptmp2 = pcj.pmadd(A12, b, ptmp2); ptmp3 = pcj.pmadd(A13, b, ptmp3); } } for (; j<alignedSize; j+=RhsPacketSize) - _EIGEN_ACCUMULATE_PACKETS(d,du,du); + _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Unaligned); break; } default: for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize) - _EIGEN_ACCUMULATE_PACKETS(du,du,du); + _EIGEN_ACCUMULATE_PACKETS(Unaligned,Unaligned,Unaligned); break; } tmp0 += predux(ptmp0); @@ -503,9 +556,9 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co // FIXME this loop get vectorized by the compiler ! for (Index j=alignedSize; j<depth; ++j) { - RhsScalar b = rhs[j]; - tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b); - tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b); + RhsScalar b = rhs(j, 0); + tmp0 += cj.pmul(lhs0(j),b); tmp1 += cj.pmul(lhs1(j),b); + tmp2 += cj.pmul(lhs2(j),b); tmp3 += cj.pmul(lhs3(j),b); } res[i*resIncr] += alpha*tmp0; res[(i+offset1)*resIncr] += alpha*tmp1; @@ -520,30 +573,30 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,Co { for (Index i=start; i<end; ++i) { - EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0); + EIGEN_ALIGN_MAX ResScalar tmp0 = ResScalar(0); ResPacket ptmp0 = pset1<ResPacket>(tmp0); - const LhsScalar* lhs0 = lhs + i*lhsStride; + const LhsScalars lhs0 = lhs.getVectorMapper(i, 0); // process first unaligned result's coeffs // FIXME this loop get vectorized by the compiler ! 
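Stripped of the packet and peeling machinery, the row-major kernel above computes four dot products per pass so that each rhs load feeds four accumulators. A plain scalar sketch of that blocking, with a unit result increment assumed for brevity:

#include <cstdio>
#include <vector>

void gemv_row_major(int rows, int depth,
                    const std::vector<double>& lhs,  // rows x depth, row-major
                    const std::vector<double>& rhs,  // depth entries
                    std::vector<double>& res, double alpha) {
  int rowBound = (rows / 4) * 4;
  for (int i = 0; i < rowBound; i += 4) {
    double t0 = 0, t1 = 0, t2 = 0, t3 = 0;
    const double *l0 = &lhs[(i + 0) * depth], *l1 = &lhs[(i + 1) * depth],
                 *l2 = &lhs[(i + 2) * depth], *l3 = &lhs[(i + 3) * depth];
    for (int j = 0; j < depth; ++j) {
      double b = rhs[j];                      // one rhs load shared by 4 rows
      t0 += l0[j] * b; t1 += l1[j] * b;
      t2 += l2[j] * b; t3 += l3[j] * b;
    }
    res[i] += alpha * t0; res[i + 1] += alpha * t1;
    res[i + 2] += alpha * t2; res[i + 3] += alpha * t3;
  }
  for (int i = rowBound; i < rows; ++i) {     // leftover rows, one at a time
    double t0 = 0;
    for (int j = 0; j < depth; ++j) t0 += lhs[i * depth + j] * rhs[j];
    res[i] += alpha * t0;
  }
}

int main() {
  int rows = 5, depth = 3;
  std::vector<double> lhs(rows * depth, 1.0), rhs(depth, 2.0), res(rows, 0.0);
  gemv_row_major(rows, depth, lhs, rhs, res, 0.5);
  std::printf("%g\n", res[0]); // 3 coefficients * 1 * 2 * 0.5 = 3
  return 0;
}

The real kernel additionally folds conjugation through cj/pcj, keeps the four accumulators in SIMD registers, and reduces them with predux, but the loop structure is the one shown here.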
for (Index j=0; j<alignedStart; ++j) - tmp0 += cj.pmul(lhs0[j], rhs[j]); + tmp0 += cj.pmul(lhs0(j), rhs(j, 0)); if (alignedSize>alignedStart) { // process aligned rhs coeffs - if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0) + if (lhs0.template aligned<LhsPacket>(alignedStart)) for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize) - ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0); + ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j), rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0), ptmp0); else for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize) - ptmp0 = pcj.pmadd(ploadu<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0); + ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Unaligned>(j), rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0), ptmp0); tmp0 += predux(ptmp0); } // process remaining scalars // FIXME this loop get vectorized by the compiler ! for (Index j=alignedSize; j<depth; ++j) - tmp0 += cj.pmul(lhs0[j], rhs[j]); + tmp0 += cj.pmul(lhs0(j), rhs(j, 0)); res[i*resIncr] += alpha*tmp0; } if (skipRows) diff --git a/Eigen/src/Core/products/GeneralMatrixVector_MKL.h b/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h index 1cb9fe6b5..e3a5d5892 100644 --- a/Eigen/src/Core/products/GeneralMatrixVector_MKL.h +++ b/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h @@ -25,13 +25,13 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ******************************************************************************** - * Content : Eigen bindings to Intel(R) MKL + * Content : Eigen bindings to BLAS F77 * General matrix-vector product functionality based on ?GEMV. ******************************************************************************** */ -#ifndef EIGEN_GENERAL_MATRIX_VECTOR_MKL_H -#define EIGEN_GENERAL_MATRIX_VECTOR_MKL_H +#ifndef EIGEN_GENERAL_MATRIX_VECTOR_BLAS_H +#define EIGEN_GENERAL_MATRIX_VECTOR_BLAS_H namespace Eigen { @@ -46,47 +46,46 @@ namespace internal { // gemv specialization -template<typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs> -struct general_matrix_vector_product_gemv : - general_matrix_vector_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,ConjugateRhs,BuiltIn> {}; +template<typename Index, typename LhsScalar, int StorageOrder, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs> +struct general_matrix_vector_product_gemv; -#define EIGEN_MKL_GEMV_SPECIALIZE(Scalar) \ +#define EIGEN_BLAS_GEMV_SPECIALIZE(Scalar) \ template<typename Index, bool ConjugateLhs, bool ConjugateRhs> \ -struct general_matrix_vector_product<Index,Scalar,ColMajor,ConjugateLhs,Scalar,ConjugateRhs,Specialized> { \ +struct general_matrix_vector_product<Index,Scalar,const_blas_data_mapper<Scalar,Index,ColMajor>,ColMajor,ConjugateLhs,Scalar,const_blas_data_mapper<Scalar,Index,RowMajor>,ConjugateRhs,Specialized> { \ static void run( \ Index rows, Index cols, \ - const Scalar* lhs, Index lhsStride, \ - const Scalar* rhs, Index rhsIncr, \ + const const_blas_data_mapper<Scalar,Index,ColMajor> &lhs, \ + const const_blas_data_mapper<Scalar,Index,RowMajor> &rhs, \ Scalar* res, Index resIncr, Scalar alpha) \ { \ if (ConjugateLhs) { \ - general_matrix_vector_product<Index,Scalar,ColMajor,ConjugateLhs,Scalar,ConjugateRhs,BuiltIn>::run( \ - rows, cols, lhs, lhsStride, rhs, rhsIncr, res, resIncr, alpha); \ + 
general_matrix_vector_product<Index,Scalar,const_blas_data_mapper<Scalar,Index,ColMajor>,ColMajor,ConjugateLhs,Scalar,const_blas_data_mapper<Scalar,Index,RowMajor>,ConjugateRhs,BuiltIn>::run( \ + rows, cols, lhs, rhs, res, resIncr, alpha); \ } else { \ general_matrix_vector_product_gemv<Index,Scalar,ColMajor,ConjugateLhs,Scalar,ConjugateRhs>::run( \ - rows, cols, lhs, lhsStride, rhs, rhsIncr, res, resIncr, alpha); \ + rows, cols, lhs.data(), lhs.stride(), rhs.data(), rhs.stride(), res, resIncr, alpha); \ } \ } \ }; \ template<typename Index, bool ConjugateLhs, bool ConjugateRhs> \ -struct general_matrix_vector_product<Index,Scalar,RowMajor,ConjugateLhs,Scalar,ConjugateRhs,Specialized> { \ +struct general_matrix_vector_product<Index,Scalar,const_blas_data_mapper<Scalar,Index,RowMajor>,RowMajor,ConjugateLhs,Scalar,const_blas_data_mapper<Scalar,Index,ColMajor>,ConjugateRhs,Specialized> { \ static void run( \ Index rows, Index cols, \ - const Scalar* lhs, Index lhsStride, \ - const Scalar* rhs, Index rhsIncr, \ + const const_blas_data_mapper<Scalar,Index,RowMajor> &lhs, \ + const const_blas_data_mapper<Scalar,Index,ColMajor> &rhs, \ Scalar* res, Index resIncr, Scalar alpha) \ { \ general_matrix_vector_product_gemv<Index,Scalar,RowMajor,ConjugateLhs,Scalar,ConjugateRhs>::run( \ - rows, cols, lhs, lhsStride, rhs, rhsIncr, res, resIncr, alpha); \ + rows, cols, lhs.data(), lhs.stride(), rhs.data(), rhs.stride(), res, resIncr, alpha); \ } \ }; \ -EIGEN_MKL_GEMV_SPECIALIZE(double) -EIGEN_MKL_GEMV_SPECIALIZE(float) -EIGEN_MKL_GEMV_SPECIALIZE(dcomplex) -EIGEN_MKL_GEMV_SPECIALIZE(scomplex) +EIGEN_BLAS_GEMV_SPECIALIZE(double) +EIGEN_BLAS_GEMV_SPECIALIZE(float) +EIGEN_BLAS_GEMV_SPECIALIZE(dcomplex) +EIGEN_BLAS_GEMV_SPECIALIZE(scomplex) -#define EIGEN_MKL_GEMV_SPECIALIZATION(EIGTYPE,MKLTYPE,MKLPREFIX) \ +#define EIGEN_BLAS_GEMV_SPECIALIZATION(EIGTYPE,BLASTYPE,BLASPREFIX) \ template<typename Index, int LhsStorageOrder, bool ConjugateLhs, bool ConjugateRhs> \ struct general_matrix_vector_product_gemv<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,ConjugateRhs> \ { \ @@ -98,16 +97,15 @@ static void run( \ const EIGTYPE* rhs, Index rhsIncr, \ EIGTYPE* res, Index resIncr, EIGTYPE alpha) \ { \ - MKL_INT m=rows, n=cols, lda=lhsStride, incx=rhsIncr, incy=resIncr; \ - MKLTYPE alpha_, beta_; \ - const EIGTYPE *x_ptr, myone(1); \ + BlasIndex m=convert_index<BlasIndex>(rows), n=convert_index<BlasIndex>(cols), \ + lda=convert_index<BlasIndex>(lhsStride), incx=convert_index<BlasIndex>(rhsIncr), incy=convert_index<BlasIndex>(resIncr); \ + const EIGTYPE beta(1); \ + const EIGTYPE *x_ptr; \ char trans=(LhsStorageOrder==ColMajor) ? 'N' : (ConjugateLhs) ? 
'C' : 'T'; \ if (LhsStorageOrder==RowMajor) { \ - m=cols; \ - n=rows; \ + m = convert_index<BlasIndex>(cols); \ + n = convert_index<BlasIndex>(rows); \ }\ - assign_scalar_eig2mkl(alpha_, alpha); \ - assign_scalar_eig2mkl(beta_, myone); \ GEMVVector x_tmp; \ if (ConjugateRhs) { \ Map<const GEMVVector, 0, InnerStride<> > map_x(rhs,cols,1,InnerStride<>(incx)); \ @@ -115,17 +113,17 @@ static void run( \ x_ptr=x_tmp.data(); \ incx=1; \ } else x_ptr=rhs; \ - MKLPREFIX##gemv(&trans, &m, &n, &alpha_, (const MKLTYPE*)lhs, &lda, (const MKLTYPE*)x_ptr, &incx, &beta_, (MKLTYPE*)res, &incy); \ + BLASPREFIX##gemv_(&trans, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, &numext::real_ref(beta), (BLASTYPE*)res, &incy); \ }\ }; -EIGEN_MKL_GEMV_SPECIALIZATION(double, double, d) -EIGEN_MKL_GEMV_SPECIALIZATION(float, float, s) -EIGEN_MKL_GEMV_SPECIALIZATION(dcomplex, MKL_Complex16, z) -EIGEN_MKL_GEMV_SPECIALIZATION(scomplex, MKL_Complex8, c) +EIGEN_BLAS_GEMV_SPECIALIZATION(double, double, d) +EIGEN_BLAS_GEMV_SPECIALIZATION(float, float, s) +EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, double, z) +EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, float, c) } // end namespase internal } // end namespace Eigen -#endif // EIGEN_GENERAL_MATRIX_VECTOR_MKL_H +#endif // EIGEN_GENERAL_MATRIX_VECTOR_BLAS_H diff --git a/Eigen/src/Core/products/Parallelizer.h b/Eigen/src/Core/products/Parallelizer.h index 6937ee332..c2f084c82 100644 --- a/Eigen/src/Core/products/Parallelizer.h +++ b/Eigen/src/Core/products/Parallelizer.h @@ -10,7 +10,7 @@ #ifndef EIGEN_PARALLELIZER_H #define EIGEN_PARALLELIZER_H -namespace Eigen { +namespace Eigen { namespace internal { @@ -49,8 +49,8 @@ inline void initParallel() { int nbt; internal::manage_multi_threading(GetAction, &nbt); - std::ptrdiff_t l1, l2; - internal::manage_caching_sizes(GetAction, &l1, &l2); + std::ptrdiff_t l1, l2, l3; + internal::manage_caching_sizes(GetAction, &l1, &l2, &l3); } /** \returns the max number of threads reserved for Eigen @@ -73,17 +73,17 @@ namespace internal { template<typename Index> struct GemmParallelInfo { - GemmParallelInfo() : sync(-1), users(0), rhs_start(0), rhs_length(0) {} + GemmParallelInfo() : sync(-1), users(0), lhs_start(0), lhs_length(0) {} - int volatile sync; + Index volatile sync; int volatile users; - Index rhs_start; - Index rhs_length; + Index lhs_start; + Index lhs_length; }; template<bool Condition, typename Functor, typename Index> -void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpose) +void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth, bool transpose) { // TODO when EIGEN_USE_BLAS is defined, // we should still enable OMP for other scalar types @@ -92,6 +92,7 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpos // the matrix product when multithreading is enabled. This is a temporary // fix to support row-major destination matrices. This whole // parallelizer mechanism has to be redisigned anyway. + EIGEN_UNUSED_VARIABLE(depth); EIGEN_UNUSED_VARIABLE(transpose); func(0,rows, 0,cols); #else @@ -102,56 +103,56 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpos // - we are not already in a parallel code // - the sizes are large enough - // 1- are we already in a parallel session? - // FIXME omp_get_num_threads()>1 only works for openmp, what if the user does not use openmp? 
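The parallelizer rework below replaces the old size/32 rule with two caps: one derived from the number of nr-wide column panels, and one from the total amount of work, so tiny products stay single-threaded. A standalone sketch of that heuristic, with nbThreads standing in for Eigen::nbThreads() and kMinTaskSize being the FIXME'd constant from the patch:

#include <algorithm>
#include <cstdio>

long gemm_threads(long rows, long cols, long depth,
                  long nr /* kernel width in columns */,
                  long nbThreads, bool transpose) {
  // Cap 1: at most one nr-wide panel per thread.
  long size = transpose ? rows : cols;
  long pb_max_threads = std::max(1L, size / nr);

  // Cap 2: at least kMinTaskSize multiply-adds of work per thread.
  double work = double(rows) * double(cols) * double(depth);
  double kMinTaskSize = 50000;
  pb_max_threads = std::max(1L, std::min(pb_max_threads, long(work / kMinTaskSize)));

  return std::min(nbThreads, pb_max_threads);
}

int main() {
  std::printf("%ld\n", gemm_threads(64, 64, 64, 4, 8, false));       // small product: 5
  std::printf("%ld\n", gemm_threads(2048, 2048, 2048, 4, 8, false)); // large product: 8
  return 0;
}

Note that the patch also moves the per-thread partitioning from rhs columns to lhs rows (GemmParallelInfo gains lhs_start/lhs_length), rounding each thread's row block down to a multiple of Functor::Traits::mr so every thread feeds the packing kernel whole micro-panels.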
- if((!Condition) || (omp_get_num_threads()>1)) - return func(0,rows, 0,cols); + // compute the maximal number of threads from the size of the product: + // This first heuristic takes into account that the product kernel is fully optimized when working with nr columns at once. + Index size = transpose ? rows : cols; + Index pb_max_threads = std::max<Index>(1,size / Functor::Traits::nr); - Index size = transpose ? cols : rows; + // compute the maximal number of threads from the total amount of work: + double work = static_cast<double>(rows) * static_cast<double>(cols) * + static_cast<double>(depth); + double kMinTaskSize = 50000; // FIXME improve this heuristic. + pb_max_threads = std::max<Index>(1, std::min<Index>(pb_max_threads, work / kMinTaskSize)); - // 2- compute the maximal number of threads from the size of the product: - // FIXME this has to be fine tuned - Index max_threads = std::max<Index>(1,size / 32); + // compute the number of threads we are going to use + Index threads = std::min<Index>(nbThreads(), pb_max_threads); - // 3 - compute the number of threads we are going to use - Index threads = std::min<Index>(nbThreads(), max_threads); - - if(threads==1) + // if multi-threading is explicitely disabled, not useful, or if we already are in a parallel session, + // then abort multi-threading + // FIXME omp_get_num_threads()>1 only works for openmp, what if the user does not use openmp? + if((!Condition) || (threads==1) || (omp_get_num_threads()>1)) return func(0,rows, 0,cols); Eigen::initParallel(); - func.initParallelSession(); + func.initParallelSession(threads); if(transpose) std::swap(rows,cols); - GemmParallelInfo<Index>* info = new GemmParallelInfo<Index>[threads]; + ei_declare_aligned_stack_constructed_variable(GemmParallelInfo<Index>,info,threads,0); #pragma omp parallel num_threads(threads) { Index i = omp_get_thread_num(); // Note that the actual number of threads might be lower than the number of request ones. Index actual_threads = omp_get_num_threads(); - + Index blockCols = (cols / actual_threads) & ~Index(0x3); - Index blockRows = (rows / actual_threads) & ~Index(0x7); - + Index blockRows = (rows / actual_threads); + blockRows = (blockRows/Functor::Traits::mr)*Functor::Traits::mr; + Index r0 = i*blockRows; Index actualBlockRows = (i+1==actual_threads) ? rows-r0 : blockRows; Index c0 = i*blockCols; Index actualBlockCols = (i+1==actual_threads) ? 
cols-c0 : blockCols; - info[i].rhs_start = c0; - info[i].rhs_length = actualBlockCols; + info[i].lhs_start = r0; + info[i].lhs_length = actualBlockRows; - if(transpose) - func(0, cols, r0, actualBlockRows, info); - else - func(r0, actualBlockRows, 0,cols, info); + if(transpose) func(c0, actualBlockCols, 0, rows, info); + else func(0, rows, c0, actualBlockCols, info); } - - delete[] info; #endif } diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h index 99cf9e0ae..da6f82abc 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h @@ -15,7 +15,7 @@ namespace Eigen { namespace internal { // pack a selfadjoint block diagonal for use with the gebp_kernel -template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder> +template<typename Scalar, typename Index, int Pack1, int Pack2_dummy, int StorageOrder> struct symm_pack_lhs { template<int BlockRows> inline @@ -45,25 +45,32 @@ struct symm_pack_lhs } void operator()(Scalar* blockA, const Scalar* _lhs, Index lhsStride, Index cols, Index rows) { + enum { PacketSize = packet_traits<Scalar>::size }; const_blas_data_mapper<Scalar,Index,StorageOrder> lhs(_lhs,lhsStride); Index count = 0; - Index peeled_mc = (rows/Pack1)*Pack1; - for(Index i=0; i<peeled_mc; i+=Pack1) - { - pack<Pack1>(blockA, lhs, cols, i, count); - } - - if(rows-peeled_mc>=Pack2) - { - pack<Pack2>(blockA, lhs, cols, peeled_mc, count); - peeled_mc += Pack2; - } + //Index peeled_mc3 = (rows/Pack1)*Pack1; + + const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0; + const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0; + const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0; + + if(Pack1>=3*PacketSize) + for(Index i=0; i<peeled_mc3; i+=3*PacketSize) + pack<3*PacketSize>(blockA, lhs, cols, i, count); + + if(Pack1>=2*PacketSize) + for(Index i=peeled_mc3; i<peeled_mc2; i+=2*PacketSize) + pack<2*PacketSize>(blockA, lhs, cols, i, count); + + if(Pack1>=1*PacketSize) + for(Index i=peeled_mc2; i<peeled_mc1; i+=1*PacketSize) + pack<1*PacketSize>(blockA, lhs, cols, i, count); // do the same with mr==1 - for(Index i=peeled_mc; i<rows; i++) + for(Index i=peeled_mc1; i<rows; i++) { for(Index k=0; k<i; k++) - blockA[count++] = lhs(i, k); // normal + blockA[count++] = lhs(i, k); // normal blockA[count++] = numext::real(lhs(i, i)); // real (diagonal) @@ -82,7 +89,8 @@ struct symm_pack_rhs Index end_k = k2 + rows; Index count = 0; const_blas_data_mapper<Scalar,Index,StorageOrder> rhs(_rhs,rhsStride); - Index packet_cols = (cols/nr)*nr; + Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0; + Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0; // first part: normal case for(Index j2=0; j2<k2; j2+=nr) @@ -91,79 +99,151 @@ struct symm_pack_rhs { blockB[count+0] = rhs(k,j2+0); blockB[count+1] = rhs(k,j2+1); - if (nr==4) + if (nr>=4) { blockB[count+2] = rhs(k,j2+2); blockB[count+3] = rhs(k,j2+3); } + if (nr>=8) + { + blockB[count+4] = rhs(k,j2+4); + blockB[count+5] = rhs(k,j2+5); + blockB[count+6] = rhs(k,j2+6); + blockB[count+7] = rhs(k,j2+7); + } count += nr; } } // second part: diagonal block - for(Index j2=k2; j2<(std::min)(k2+rows,packet_cols); j2+=nr) + Index end8 = nr>=8 ? 
(std::min)(k2+rows,packet_cols8) : k2; + if(nr>=8) { - // again we can split vertically in three different parts (transpose, symmetric, normal) - // transpose - for(Index k=k2; k<j2; k++) + for(Index j2=k2; j2<end8; j2+=8) { - blockB[count+0] = numext::conj(rhs(j2+0,k)); - blockB[count+1] = numext::conj(rhs(j2+1,k)); - if (nr==4) + // again we can split vertically in three different parts (transpose, symmetric, normal) + // transpose + for(Index k=k2; k<j2; k++) { + blockB[count+0] = numext::conj(rhs(j2+0,k)); + blockB[count+1] = numext::conj(rhs(j2+1,k)); blockB[count+2] = numext::conj(rhs(j2+2,k)); blockB[count+3] = numext::conj(rhs(j2+3,k)); + blockB[count+4] = numext::conj(rhs(j2+4,k)); + blockB[count+5] = numext::conj(rhs(j2+5,k)); + blockB[count+6] = numext::conj(rhs(j2+6,k)); + blockB[count+7] = numext::conj(rhs(j2+7,k)); + count += 8; } - count += nr; - } - // symmetric - Index h = 0; - for(Index k=j2; k<j2+nr; k++) - { - // normal - for (Index w=0 ; w<h; ++w) - blockB[count+w] = rhs(k,j2+w); + // symmetric + Index h = 0; + for(Index k=j2; k<j2+8; k++) + { + // normal + for (Index w=0 ; w<h; ++w) + blockB[count+w] = rhs(k,j2+w); - blockB[count+h] = numext::real(rhs(k,k)); + blockB[count+h] = numext::real(rhs(k,k)); - // transpose - for (Index w=h+1 ; w<nr; ++w) - blockB[count+w] = numext::conj(rhs(j2+w,k)); - count += nr; - ++h; + // transpose + for (Index w=h+1 ; w<8; ++w) + blockB[count+w] = numext::conj(rhs(j2+w,k)); + count += 8; + ++h; + } + // normal + for(Index k=j2+8; k<end_k; k++) + { + blockB[count+0] = rhs(k,j2+0); + blockB[count+1] = rhs(k,j2+1); + blockB[count+2] = rhs(k,j2+2); + blockB[count+3] = rhs(k,j2+3); + blockB[count+4] = rhs(k,j2+4); + blockB[count+5] = rhs(k,j2+5); + blockB[count+6] = rhs(k,j2+6); + blockB[count+7] = rhs(k,j2+7); + count += 8; + } } - // normal - for(Index k=j2+nr; k<end_k; k++) + } + if(nr>=4) + { + for(Index j2=end8; j2<(std::min)(k2+rows,packet_cols4); j2+=4) { - blockB[count+0] = rhs(k,j2+0); - blockB[count+1] = rhs(k,j2+1); - if (nr==4) + // again we can split vertically in three different parts (transpose, symmetric, normal) + // transpose + for(Index k=k2; k<j2; k++) + { + blockB[count+0] = numext::conj(rhs(j2+0,k)); + blockB[count+1] = numext::conj(rhs(j2+1,k)); + blockB[count+2] = numext::conj(rhs(j2+2,k)); + blockB[count+3] = numext::conj(rhs(j2+3,k)); + count += 4; + } + // symmetric + Index h = 0; + for(Index k=j2; k<j2+4; k++) { + // normal + for (Index w=0 ; w<h; ++w) + blockB[count+w] = rhs(k,j2+w); + + blockB[count+h] = numext::real(rhs(k,k)); + + // transpose + for (Index w=h+1 ; w<4; ++w) + blockB[count+w] = numext::conj(rhs(j2+w,k)); + count += 4; + ++h; + } + // normal + for(Index k=j2+4; k<end_k; k++) + { + blockB[count+0] = rhs(k,j2+0); + blockB[count+1] = rhs(k,j2+1); blockB[count+2] = rhs(k,j2+2); blockB[count+3] = rhs(k,j2+3); + count += 4; } - count += nr; } } // third part: transposed - for(Index j2=k2+rows; j2<packet_cols; j2+=nr) + if(nr>=8) { - for(Index k=k2; k<end_k; k++) + for(Index j2=k2+rows; j2<packet_cols8; j2+=8) { - blockB[count+0] = numext::conj(rhs(j2+0,k)); - blockB[count+1] = numext::conj(rhs(j2+1,k)); - if (nr==4) + for(Index k=k2; k<end_k; k++) { + blockB[count+0] = numext::conj(rhs(j2+0,k)); + blockB[count+1] = numext::conj(rhs(j2+1,k)); blockB[count+2] = numext::conj(rhs(j2+2,k)); blockB[count+3] = numext::conj(rhs(j2+3,k)); + blockB[count+4] = numext::conj(rhs(j2+4,k)); + blockB[count+5] = numext::conj(rhs(j2+5,k)); + blockB[count+6] = numext::conj(rhs(j2+6,k)); + blockB[count+7] = 
numext::conj(rhs(j2+7,k)); + count += 8; + } + } + } + if(nr>=4) + { + for(Index j2=(std::max)(packet_cols8,k2+rows); j2<packet_cols4; j2+=4) + { + for(Index k=k2; k<end_k; k++) + { + blockB[count+0] = numext::conj(rhs(j2+0,k)); + blockB[count+1] = numext::conj(rhs(j2+1,k)); + blockB[count+2] = numext::conj(rhs(j2+2,k)); + blockB[count+3] = numext::conj(rhs(j2+3,k)); + count += 4; } - count += nr; } } // copy the remaining columns one at a time (=> the same with nr==1) - for(Index j2=packet_cols; j2<cols; ++j2) + for(Index j2=packet_cols4; j2<cols; ++j2) { // transpose Index half = (std::min)(end_k,j2); @@ -211,7 +291,7 @@ struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,LhsSelfAdjoint,Co const Scalar* lhs, Index lhsStride, const Scalar* rhs, Index rhsStride, Scalar* res, Index resStride, - const Scalar& alpha) + const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking) { product_selfadjoint_matrix<Scalar, Index, EIGEN_LOGICAL_XOR(RhsSelfAdjoint,RhsStorageOrder==RowMajor) ? ColMajor : RowMajor, @@ -219,7 +299,7 @@ struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,LhsSelfAdjoint,Co EIGEN_LOGICAL_XOR(LhsSelfAdjoint,LhsStorageOrder==RowMajor) ? ColMajor : RowMajor, LhsSelfAdjoint, NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(LhsSelfAdjoint,ConjugateLhs), ColMajor> - ::run(cols, rows, rhs, rhsStride, lhs, lhsStride, res, resStride, alpha); + ::run(cols, rows, rhs, rhsStride, lhs, lhsStride, res, resStride, alpha, blocking); } }; @@ -234,7 +314,7 @@ struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,ConjugateLhs const Scalar* _lhs, Index lhsStride, const Scalar* _rhs, Index rhsStride, Scalar* res, Index resStride, - const Scalar& alpha); + const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking); }; template <typename Scalar, typename Index, @@ -244,33 +324,35 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t Index rows, Index cols, const Scalar* _lhs, Index lhsStride, const Scalar* _rhs, Index rhsStride, - Scalar* res, Index resStride, - const Scalar& alpha) + Scalar* _res, Index resStride, + const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking) { Index size = rows; - const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride); - const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride); - typedef gebp_traits<Scalar,Scalar> Traits; - Index kc = size; // cache block size along the K direction - Index mc = rows; // cache block size along the M direction - Index nc = cols; // cache block size along the N direction - computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc); - // kc must smaller than mc + typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper; + typedef const_blas_data_mapper<Scalar, Index, (LhsStorageOrder == RowMajor) ? 
ColMajor : RowMajor> LhsTransposeMapper; + typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper; + typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper; + LhsMapper lhs(_lhs,lhsStride); + LhsTransposeMapper lhs_transpose(_lhs,lhsStride); + RhsMapper rhs(_rhs,rhsStride); + ResMapper res(_res, resStride); + + Index kc = blocking.kc(); // cache block size along the K direction + Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction + // kc must be smaller than mc kc = (std::min)(kc,mc); + std::size_t sizeA = kc*mc; + std::size_t sizeB = kc*cols; + ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA()); + ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB()); - std::size_t sizeW = kc*Traits::WorkSpaceFactor; - std::size_t sizeB = sizeW + kc*cols; - ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0); - ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0); - Scalar* blockB = allocatedBlockB + sizeW; - - gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; + gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; symm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; - gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs; - gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed; + gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs; + gemm_pack_lhs<Scalar, Index, LhsTransposeMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed; for(Index k2=0; k2<size; k2+=kc) { @@ -279,7 +361,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t // we have selected one row panel of rhs and one column panel of lhs // pack rhs's panel into a sequential chunk of memory // and expand each coeff to a constant packet for further reuse - pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, cols); + pack_rhs(blockB, rhs.getSubMapper(k2,0), actual_kc, cols); // the select lhs's panel has to be split in three different parts: // 1 - the transposed panel above the diagonal block => transposed packed copy @@ -289,9 +371,9 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t { const Index actual_mc = (std::min)(i2+mc,k2)-i2; // transposed packed copy - pack_lhs_transposed(blockA, &lhs(k2, i2), lhsStride, actual_kc, actual_mc); + pack_lhs_transposed(blockA, lhs_transpose.getSubMapper(i2, k2), actual_kc, actual_mc); - gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha); + gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha); } // the block diagonal { @@ -299,16 +381,16 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t // symmetric packed copy pack_lhs(blockA, &lhs(k2,k2), lhsStride, actual_kc, actual_mc); - gebp_kernel(res+k2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha); + gebp_kernel(res.getSubMapper(k2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha); } for(Index i2=k2+kc; i2<size; i2+=mc) { const Index actual_mc = (std::min)(i2+mc,size)-i2; - gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder,false>() - (blockA, &lhs(i2, 
k2), lhsStride, actual_kc, actual_mc); + gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder,false>() + (blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc); - gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha); + gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha); } } } @@ -325,7 +407,7 @@ struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,ConjugateLh const Scalar* _lhs, Index lhsStride, const Scalar* _rhs, Index rhsStride, Scalar* res, Index resStride, - const Scalar& alpha); + const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking); }; template <typename Scalar, typename Index, @@ -335,27 +417,27 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f Index rows, Index cols, const Scalar* _lhs, Index lhsStride, const Scalar* _rhs, Index rhsStride, - Scalar* res, Index resStride, - const Scalar& alpha) + Scalar* _res, Index resStride, + const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking) { Index size = cols; - const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride); - typedef gebp_traits<Scalar,Scalar> Traits; - Index kc = size; // cache block size along the K direction - Index mc = rows; // cache block size along the M direction - Index nc = cols; // cache block size along the N direction - computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc); - std::size_t sizeW = kc*Traits::WorkSpaceFactor; - std::size_t sizeB = sizeW + kc*cols; - ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0); - ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0); - Scalar* blockB = allocatedBlockB + sizeW; - - gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; - gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; + typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper; + typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper; + LhsMapper lhs(_lhs,lhsStride); + ResMapper res(_res,resStride); + + Index kc = blocking.kc(); // cache block size along the K direction + Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction + std::size_t sizeA = kc*mc; + std::size_t sizeB = kc*cols; + ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA()); + ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB()); + + gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; + gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; symm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs; for(Index k2=0; k2<size; k2+=kc) @@ -368,9 +450,9 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f for(Index i2=0; i2<rows; i2+=mc) { const Index actual_mc = (std::min)(i2+mc,rows)-i2; - pack_lhs(blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc); + pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc); - gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha); + gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha); } } } @@ -382,55 +464,58 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f 
***************************************************************************/ namespace internal { + template<typename Lhs, int LhsMode, typename Rhs, int RhsMode> -struct traits<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false> > - : traits<ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false>, Lhs, Rhs> > -{}; -} - -template<typename Lhs, int LhsMode, typename Rhs, int RhsMode> -struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false> - : public ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false>, Lhs, Rhs > +struct selfadjoint_product_impl<Lhs,LhsMode,false,Rhs,RhsMode,false> { - EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix) - - SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {} - + typedef typename Product<Lhs,Rhs>::Scalar Scalar; + + typedef internal::blas_traits<Lhs> LhsBlasTraits; + typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType; + typedef internal::blas_traits<Rhs> RhsBlasTraits; + typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType; + enum { LhsIsUpper = (LhsMode&(Upper|Lower))==Upper, LhsIsSelfAdjoint = (LhsMode&SelfAdjoint)==SelfAdjoint, RhsIsUpper = (RhsMode&(Upper|Lower))==Upper, RhsIsSelfAdjoint = (RhsMode&SelfAdjoint)==SelfAdjoint }; - - template<typename Dest> void scaleAndAddTo(Dest& dst, const Scalar& alpha) const + + template<typename Dest> + static void run(Dest &dst, const Lhs &a_lhs, const Rhs &a_rhs, const Scalar& alpha) { - eigen_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols()); + eigen_assert(dst.rows()==a_lhs.rows() && dst.cols()==a_rhs.cols()); - typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(m_lhs); - typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(m_rhs); + typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(a_lhs); + typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(a_rhs); - Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs) - * RhsBlasTraits::extractScalarFactor(m_rhs); + Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs) + * RhsBlasTraits::extractScalarFactor(a_rhs); + + typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,Scalar,Scalar, + Lhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxColsAtCompileTime,1> BlockingType; + + BlockingType blocking(lhs.rows(), rhs.cols(), lhs.cols(), 1, false); internal::product_selfadjoint_matrix<Scalar, Index, - EIGEN_LOGICAL_XOR(LhsIsUpper, - internal::traits<Lhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, LhsIsSelfAdjoint, + EIGEN_LOGICAL_XOR(LhsIsUpper,internal::traits<Lhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, LhsIsSelfAdjoint, NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(LhsIsUpper,bool(LhsBlasTraits::NeedToConjugate)), - EIGEN_LOGICAL_XOR(RhsIsUpper, - internal::traits<Rhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, RhsIsSelfAdjoint, + EIGEN_LOGICAL_XOR(RhsIsUpper,internal::traits<Rhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, RhsIsSelfAdjoint, NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(RhsIsUpper,bool(RhsBlasTraits::NeedToConjugate)), internal::traits<Dest>::Flags&RowMajorBit ? 
RowMajor : ColMajor> ::run( lhs.rows(), rhs.cols(), // sizes - &lhs.coeffRef(0,0), lhs.outerStride(), // lhs info - &rhs.coeffRef(0,0), rhs.outerStride(), // rhs info + &lhs.coeffRef(0,0), lhs.outerStride(), // lhs info + &rhs.coeffRef(0,0), rhs.outerStride(), // rhs info &dst.coeffRef(0,0), dst.outerStride(), // result info - actualAlpha // alpha + actualAlpha, blocking // alpha ); } }; +} // end namespace internal + } // end namespace Eigen #endif // EIGEN_SELFADJOINT_MATRIX_MATRIX_H diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix_MKL.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h index dfa687fef..a45238d69 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixMatrix_MKL.h +++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h @@ -25,13 +25,13 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // ******************************************************************************** - * Content : Eigen bindings to Intel(R) MKL + * Content : Eigen bindings to BLAS F77 * Self adjoint matrix * matrix product functionality based on ?SYMM/?HEMM. ******************************************************************************** */ -#ifndef EIGEN_SELFADJOINT_MATRIX_MATRIX_MKL_H -#define EIGEN_SELFADJOINT_MATRIX_MATRIX_MKL_H +#ifndef EIGEN_SELFADJOINT_MATRIX_MATRIX_BLAS_H +#define EIGEN_SELFADJOINT_MATRIX_MATRIX_BLAS_H namespace Eigen { @@ -40,7 +40,7 @@ namespace internal { /* Optimized selfadjoint matrix * matrix (?SYMM/?HEMM) product */ -#define EIGEN_MKL_SYMM_L(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \ +#define EIGEN_BLAS_SYMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ template <typename Index, \ int LhsStorageOrder, bool ConjugateLhs, \ int RhsStorageOrder, bool ConjugateRhs> \ @@ -52,28 +52,23 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLh const EIGTYPE* _lhs, Index lhsStride, \ const EIGTYPE* _rhs, Index rhsStride, \ EIGTYPE* res, Index resStride, \ - EIGTYPE alpha) \ + EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \ { \ char side='L', uplo='L'; \ - MKL_INT m, n, lda, ldb, ldc; \ + BlasIndex m, n, lda, ldb, ldc; \ const EIGTYPE *a, *b; \ - MKLTYPE alpha_, beta_; \ + EIGTYPE beta(1); \ MatrixX##EIGPREFIX b_tmp; \ - EIGTYPE myone(1);\ \ /* Set transpose options */ \ /* Set m, n, k */ \ - m = (MKL_INT)rows; \ - n = (MKL_INT)cols; \ -\ -/* Set alpha_ & beta_ */ \ - assign_scalar_eig2mkl(alpha_, alpha); \ - assign_scalar_eig2mkl(beta_, myone); \ + m = convert_index<BlasIndex>(rows); \ + n = convert_index<BlasIndex>(cols); \ \ /* Set lda, ldb, ldc */ \ - lda = (MKL_INT)lhsStride; \ - ldb = (MKL_INT)rhsStride; \ - ldc = (MKL_INT)resStride; \ + lda = convert_index<BlasIndex>(lhsStride); \ + ldb = convert_index<BlasIndex>(rhsStride); \ + ldc = convert_index<BlasIndex>(resStride); \ \ /* Set a, b, c */ \ if (LhsStorageOrder==RowMajor) uplo='U'; \ @@ -83,16 +78,16 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLh Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,n,m,OuterStride<>(rhsStride)); \ b_tmp = rhs.adjoint(); \ b = b_tmp.data(); \ - ldb = b_tmp.outerStride(); \ + ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \ } else b = _rhs; \ \ - MKLPREFIX##symm(&side, &uplo, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &beta_, (MKLTYPE*)res, &ldc); \ + BLASPREFIX##symm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ \ } \ }; -#define 
EIGEN_MKL_HEMM_L(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \ +#define EIGEN_BLAS_HEMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ template <typename Index, \ int LhsStorageOrder, bool ConjugateLhs, \ int RhsStorageOrder, bool ConjugateRhs> \ @@ -103,36 +98,31 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLh const EIGTYPE* _lhs, Index lhsStride, \ const EIGTYPE* _rhs, Index rhsStride, \ EIGTYPE* res, Index resStride, \ - EIGTYPE alpha) \ + EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \ { \ char side='L', uplo='L'; \ - MKL_INT m, n, lda, ldb, ldc; \ + BlasIndex m, n, lda, ldb, ldc; \ const EIGTYPE *a, *b; \ - MKLTYPE alpha_, beta_; \ + EIGTYPE beta(1); \ MatrixX##EIGPREFIX b_tmp; \ Matrix<EIGTYPE, Dynamic, Dynamic, LhsStorageOrder> a_tmp; \ - EIGTYPE myone(1); \ \ /* Set transpose options */ \ /* Set m, n, k */ \ - m = (MKL_INT)rows; \ - n = (MKL_INT)cols; \ -\ -/* Set alpha_ & beta_ */ \ - assign_scalar_eig2mkl(alpha_, alpha); \ - assign_scalar_eig2mkl(beta_, myone); \ + m = convert_index<BlasIndex>(rows); \ + n = convert_index<BlasIndex>(cols); \ \ /* Set lda, ldb, ldc */ \ - lda = (MKL_INT)lhsStride; \ - ldb = (MKL_INT)rhsStride; \ - ldc = (MKL_INT)resStride; \ + lda = convert_index<BlasIndex>(lhsStride); \ + ldb = convert_index<BlasIndex>(rhsStride); \ + ldc = convert_index<BlasIndex>(resStride); \ \ /* Set a, b, c */ \ if (((LhsStorageOrder==ColMajor) && ConjugateLhs) || ((LhsStorageOrder==RowMajor) && (!ConjugateLhs))) { \ Map<const Matrix<EIGTYPE, Dynamic, Dynamic, LhsStorageOrder>, 0, OuterStride<> > lhs(_lhs,m,m,OuterStride<>(lhsStride)); \ a_tmp = lhs.conjugate(); \ a = a_tmp.data(); \ - lda = a_tmp.outerStride(); \ + lda = convert_index<BlasIndex>(a_tmp.outerStride()); \ } else a = _lhs; \ if (LhsStorageOrder==RowMajor) uplo='U'; \ \ @@ -151,23 +141,23 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLh b_tmp = rhs.transpose(); \ } \ b = b_tmp.data(); \ - ldb = b_tmp.outerStride(); \ + ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \ } \ \ - MKLPREFIX##hemm(&side, &uplo, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &beta_, (MKLTYPE*)res, &ldc); \ + BLASPREFIX##hemm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ \ } \ }; -EIGEN_MKL_SYMM_L(double, double, d, d) -EIGEN_MKL_SYMM_L(float, float, f, s) -EIGEN_MKL_HEMM_L(dcomplex, MKL_Complex16, cd, z) -EIGEN_MKL_HEMM_L(scomplex, MKL_Complex8, cf, c) +EIGEN_BLAS_SYMM_L(double, double, d, d) +EIGEN_BLAS_SYMM_L(float, float, f, s) +EIGEN_BLAS_HEMM_L(dcomplex, double, cd, z) +EIGEN_BLAS_HEMM_L(scomplex, float, cf, c) /* Optimized matrix * selfadjoint matrix (?SYMM/?HEMM) product */ -#define EIGEN_MKL_SYMM_R(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \ +#define EIGEN_BLAS_SYMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ template <typename Index, \ int LhsStorageOrder, bool ConjugateLhs, \ int RhsStorageOrder, bool ConjugateRhs> \ @@ -179,27 +169,22 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateL const EIGTYPE* _lhs, Index lhsStride, \ const EIGTYPE* _rhs, Index rhsStride, \ EIGTYPE* res, Index resStride, \ - EIGTYPE alpha) \ + EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \ { \ char side='R', uplo='L'; \ - MKL_INT m, n, lda, ldb, ldc; \ + BlasIndex m, n, lda, ldb, ldc; \ const EIGTYPE *a, *b; \ - MKLTYPE alpha_, beta_; \ + EIGTYPE beta(1); \ MatrixX##EIGPREFIX b_tmp; \ - 
EIGTYPE myone(1);\ \ /* Set m, n, k */ \ - m = (MKL_INT)rows; \ - n = (MKL_INT)cols; \ -\ -/* Set alpha_ & beta_ */ \ - assign_scalar_eig2mkl(alpha_, alpha); \ - assign_scalar_eig2mkl(beta_, myone); \ + m = convert_index<BlasIndex>(rows); \ + n = convert_index<BlasIndex>(cols); \ \ /* Set lda, ldb, ldc */ \ - lda = (MKL_INT)rhsStride; \ - ldb = (MKL_INT)lhsStride; \ - ldc = (MKL_INT)resStride; \ + lda = convert_index<BlasIndex>(rhsStride); \ + ldb = convert_index<BlasIndex>(lhsStride); \ + ldc = convert_index<BlasIndex>(resStride); \ \ /* Set a, b, c */ \ if (RhsStorageOrder==RowMajor) uplo='U'; \ @@ -209,16 +194,16 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateL Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,n,m,OuterStride<>(rhsStride)); \ b_tmp = lhs.adjoint(); \ b = b_tmp.data(); \ - ldb = b_tmp.outerStride(); \ + ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \ } else b = _lhs; \ \ - MKLPREFIX##symm(&side, &uplo, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &beta_, (MKLTYPE*)res, &ldc); \ + BLASPREFIX##symm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ \ } \ }; -#define EIGEN_MKL_HEMM_R(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \ +#define EIGEN_BLAS_HEMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ template <typename Index, \ int LhsStorageOrder, bool ConjugateLhs, \ int RhsStorageOrder, bool ConjugateRhs> \ @@ -229,35 +214,30 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateL const EIGTYPE* _lhs, Index lhsStride, \ const EIGTYPE* _rhs, Index rhsStride, \ EIGTYPE* res, Index resStride, \ - EIGTYPE alpha) \ + EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \ { \ char side='R', uplo='L'; \ - MKL_INT m, n, lda, ldb, ldc; \ + BlasIndex m, n, lda, ldb, ldc; \ const EIGTYPE *a, *b; \ - MKLTYPE alpha_, beta_; \ + EIGTYPE beta(1); \ MatrixX##EIGPREFIX b_tmp; \ Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> a_tmp; \ - EIGTYPE myone(1); \ \ /* Set m, n, k */ \ - m = (MKL_INT)rows; \ - n = (MKL_INT)cols; \ -\ -/* Set alpha_ & beta_ */ \ - assign_scalar_eig2mkl(alpha_, alpha); \ - assign_scalar_eig2mkl(beta_, myone); \ + m = convert_index<BlasIndex>(rows); \ + n = convert_index<BlasIndex>(cols); \ \ /* Set lda, ldb, ldc */ \ - lda = (MKL_INT)rhsStride; \ - ldb = (MKL_INT)lhsStride; \ - ldc = (MKL_INT)resStride; \ + lda = convert_index<BlasIndex>(rhsStride); \ + ldb = convert_index<BlasIndex>(lhsStride); \ + ldc = convert_index<BlasIndex>(resStride); \ \ /* Set a, b, c */ \ if (((RhsStorageOrder==ColMajor) && ConjugateRhs) || ((RhsStorageOrder==RowMajor) && (!ConjugateRhs))) { \ Map<const Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder>, 0, OuterStride<> > rhs(_rhs,n,n,OuterStride<>(rhsStride)); \ a_tmp = rhs.conjugate(); \ a = a_tmp.data(); \ - lda = a_tmp.outerStride(); \ + lda = convert_index<BlasIndex>(a_tmp.outerStride()); \ } else a = _rhs; \ if (RhsStorageOrder==RowMajor) uplo='U'; \ \ @@ -276,20 +256,20 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateL b_tmp = lhs.transpose(); \ } \ b = b_tmp.data(); \ - ldb = b_tmp.outerStride(); \ + ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \ } \ \ - MKLPREFIX##hemm(&side, &uplo, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &beta_, (MKLTYPE*)res, &ldc); \ + BLASPREFIX##hemm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const 
BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ } \ }; -EIGEN_MKL_SYMM_R(double, double, d, d) -EIGEN_MKL_SYMM_R(float, float, f, s) -EIGEN_MKL_HEMM_R(dcomplex, MKL_Complex16, cd, z) -EIGEN_MKL_HEMM_R(scomplex, MKL_Complex8, cf, c) +EIGEN_BLAS_SYMM_R(double, double, d, d) +EIGEN_BLAS_SYMM_R(float, float, f, s) +EIGEN_BLAS_HEMM_R(dcomplex, double, cd, z) +EIGEN_BLAS_HEMM_R(scomplex, float, cf, c) } // end namespace internal } // end namespace Eigen -#endif // EIGEN_SELFADJOINT_MATRIX_MATRIX_MKL_H +#endif // EIGEN_SELFADJOINT_MATRIX_MATRIX_BLAS_H diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector.h b/Eigen/src/Core/products/SelfadjointMatrixVector.h index f698f67f9..3fd180e6c 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixVector.h +++ b/Eigen/src/Core/products/SelfadjointMatrixVector.h @@ -30,7 +30,7 @@ struct selfadjoint_matrix_vector_product static EIGEN_DONT_INLINE void run( Index size, const Scalar* lhs, Index lhsStride, - const Scalar* _rhs, Index rhsIncr, + const Scalar* rhs, Scalar* res, Scalar alpha); }; @@ -39,11 +39,12 @@ template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool Conju EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,Version>::run( Index size, const Scalar* lhs, Index lhsStride, - const Scalar* _rhs, Index rhsIncr, + const Scalar* rhs, Scalar* res, Scalar alpha) { typedef typename packet_traits<Scalar>::type Packet; + typedef typename NumTraits<Scalar>::Real RealScalar; const Index PacketSize = sizeof(Packet)/sizeof(Scalar); enum { @@ -54,23 +55,13 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd conj_helper<Scalar,Scalar,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, IsRowMajor), ConjugateRhs> cj0; conj_helper<Scalar,Scalar,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, !IsRowMajor), ConjugateRhs> cj1; - conj_helper<Scalar,Scalar,NumTraits<Scalar>::IsComplex, ConjugateRhs> cjd; + conj_helper<RealScalar,Scalar,false, ConjugateRhs> cjd; conj_helper<Packet,Packet,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, IsRowMajor), ConjugateRhs> pcj0; conj_helper<Packet,Packet,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, !IsRowMajor), ConjugateRhs> pcj1; Scalar cjAlpha = ConjugateRhs ? numext::conj(alpha) : alpha; - // FIXME this copy is now handled outside product_selfadjoint_vector, so it could probably be removed. - // if the rhs is not sequentially stored in memory we copy it to a temporary buffer, - // this is because we need to extract packets - ei_declare_aligned_stack_constructed_variable(Scalar,rhs,size,rhsIncr==1 ? const_cast<Scalar*>(_rhs) : 0); - if (rhsIncr!=1) - { - const Scalar* it = _rhs; - for (Index i=0; i<size; ++i, it+=rhsIncr) - rhs[i] = *it; - } Index bound = (std::max)(Index(0),size-8) & 0xfffffffe; if (FirstTriangular) @@ -92,12 +83,11 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd Scalar t3(0); Packet ptmp3 = pset1<Packet>(t3); - size_t starti = FirstTriangular ? 0 : j+2; - size_t endi = FirstTriangular ? j : size; - size_t alignedStart = (starti) + internal::first_aligned(&res[starti], endi-starti); - size_t alignedEnd = alignedStart + ((endi-alignedStart)/(PacketSize))*(PacketSize); + Index starti = FirstTriangular ? 0 : j+2; + Index endi = FirstTriangular ? 
j : size; + Index alignedStart = (starti) + internal::first_default_aligned(&res[starti], endi-starti); + Index alignedEnd = alignedStart + ((endi-alignedStart)/(PacketSize))*(PacketSize); - // TODO make sure this product is a real * complex and that the rhs is properly conjugated if needed res[j] += cjd.pmul(numext::real(A0[j]), t0); res[j+1] += cjd.pmul(numext::real(A1[j+1]), t1); if(FirstTriangular) @@ -111,11 +101,11 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd t2 += cj1.pmul(A0[j+1], rhs[j+1]); } - for (size_t i=starti; i<alignedStart; ++i) + for (Index i=starti; i<alignedStart; ++i) { - res[i] += t0 * A0[i] + t1 * A1[i]; - t2 += numext::conj(A0[i]) * rhs[i]; - t3 += numext::conj(A1[i]) * rhs[i]; + res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i],t1); + t2 += cj1.pmul(A0[i], rhs[i]); + t3 += cj1.pmul(A1[i], rhs[i]); } // Yes this an optimization for gcc 4.3 and 4.4 (=> huge speed up) // gcc 4.2 does this optimization automatically. @@ -123,7 +113,7 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd const Scalar* EIGEN_RESTRICT a1It = A1 + alignedStart; const Scalar* EIGEN_RESTRICT rhsIt = rhs + alignedStart; Scalar* EIGEN_RESTRICT resIt = res + alignedStart; - for (size_t i=alignedStart; i<alignedEnd; i+=PacketSize) + for (Index i=alignedStart; i<alignedEnd; i+=PacketSize) { Packet A0i = ploadu<Packet>(a0It); a0It += PacketSize; Packet A1i = ploadu<Packet>(a1It); a1It += PacketSize; @@ -135,7 +125,7 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd ptmp3 = pcj1.pmadd(A1i, Bi, ptmp3); pstore(resIt,Xi); resIt += PacketSize; } - for (size_t i=alignedEnd; i<endi; i++) + for (Index i=alignedEnd; i<endi; i++) { res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i],t1); t2 += cj1.pmul(A0[i], rhs[i]); @@ -151,7 +141,6 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd Scalar t1 = cjAlpha * rhs[j]; Scalar t2(0); - // TODO make sure this product is a real * complex and that the rhs is properly conjugated if needed res[j] += cjd.pmul(numext::real(A0[j]), t1); for (Index i=FirstTriangular ? 0 : j+1; i<(FirstTriangular ? 
j : size); i++) { @@ -169,45 +158,44 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd ***************************************************************************/ namespace internal { -template<typename Lhs, int LhsMode, typename Rhs> -struct traits<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true> > - : traits<ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>, Lhs, Rhs> > -{}; -} template<typename Lhs, int LhsMode, typename Rhs> -struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true> - : public ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>, Lhs, Rhs > +struct selfadjoint_product_impl<Lhs,LhsMode,false,Rhs,0,true> { - EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix) - - enum { - LhsUpLo = LhsMode&(Upper|Lower) - }; - - SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {} - - template<typename Dest> void scaleAndAddTo(Dest& dest, const Scalar& alpha) const + typedef typename Product<Lhs,Rhs>::Scalar Scalar; + + typedef internal::blas_traits<Lhs> LhsBlasTraits; + typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType; + typedef typename internal::remove_all<ActualLhsType>::type ActualLhsTypeCleaned; + + typedef internal::blas_traits<Rhs> RhsBlasTraits; + typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType; + typedef typename internal::remove_all<ActualRhsType>::type ActualRhsTypeCleaned; + + enum { LhsUpLo = LhsMode&(Upper|Lower) }; + + template<typename Dest> + static void run(Dest& dest, const Lhs &a_lhs, const Rhs &a_rhs, const Scalar& alpha) { typedef typename Dest::Scalar ResScalar; - typedef typename Base::RhsScalar RhsScalar; - typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest; + typedef typename Rhs::Scalar RhsScalar; + typedef Map<Matrix<ResScalar,Dynamic,1>, EIGEN_PLAIN_ENUM_MIN(AlignedMax,internal::packet_traits<ResScalar>::size)> MappedDest; - eigen_assert(dest.rows()==m_lhs.rows() && dest.cols()==m_rhs.cols()); + eigen_assert(dest.rows()==a_lhs.rows() && dest.cols()==a_rhs.cols()); - typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(m_lhs); - typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(m_rhs); + typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(a_lhs); + typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(a_rhs); - Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs) - * RhsBlasTraits::extractScalarFactor(m_rhs); + Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs) + * RhsBlasTraits::extractScalarFactor(a_rhs); enum { EvalToDest = (Dest::InnerStrideAtCompileTime==1), - UseRhs = (_ActualRhsType::InnerStrideAtCompileTime==1) + UseRhs = (ActualRhsTypeCleaned::InnerStrideAtCompileTime==1) }; internal::gemv_static_vector_if<ResScalar,Dest::SizeAtCompileTime,Dest::MaxSizeAtCompileTime,!EvalToDest> static_dest; - internal::gemv_static_vector_if<RhsScalar,_ActualRhsType::SizeAtCompileTime,_ActualRhsType::MaxSizeAtCompileTime,!UseRhs> static_rhs; + internal::gemv_static_vector_if<RhsScalar,ActualRhsTypeCleaned::SizeAtCompileTime,ActualRhsTypeCleaned::MaxSizeAtCompileTime,!UseRhs> static_rhs; ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(), EvalToDest ? 
dest.data() : static_dest.data()); @@ -218,7 +206,7 @@ struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true> if(!EvalToDest) { #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN - int size = dest.size(); + Index size = dest.size(); EIGEN_DENSE_STORAGE_CTOR_PLUGIN #endif MappedDest(actualDestPtr, dest.size()) = dest; @@ -227,18 +215,19 @@ struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true> if(!UseRhs) { #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN - int size = rhs.size(); + Index size = rhs.size(); EIGEN_DENSE_STORAGE_CTOR_PLUGIN #endif - Map<typename _ActualRhsType::PlainObject>(actualRhsPtr, rhs.size()) = rhs; + Map<typename ActualRhsTypeCleaned::PlainObject>(actualRhsPtr, rhs.size()) = rhs; } - internal::selfadjoint_matrix_vector_product<Scalar, Index, (internal::traits<_ActualLhsType>::Flags&RowMajorBit) ? RowMajor : ColMajor, int(LhsUpLo), bool(LhsBlasTraits::NeedToConjugate), bool(RhsBlasTraits::NeedToConjugate)>::run + internal::selfadjoint_matrix_vector_product<Scalar, Index, (internal::traits<ActualLhsTypeCleaned>::Flags&RowMajorBit) ? RowMajor : ColMajor, + int(LhsUpLo), bool(LhsBlasTraits::NeedToConjugate), bool(RhsBlasTraits::NeedToConjugate)>::run ( lhs.rows(), // size &lhs.coeffRef(0,0), lhs.outerStride(), // lhs info - actualRhsPtr, 1, // rhs info + actualRhsPtr, // rhs info actualDestPtr, // result info actualAlpha // scale factor ); @@ -248,34 +237,24 @@ struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true> } }; -namespace internal { -template<typename Lhs, typename Rhs, int RhsMode> -struct traits<SelfadjointProductMatrix<Lhs,0,true,Rhs,RhsMode,false> > - : traits<ProductBase<SelfadjointProductMatrix<Lhs,0,true,Rhs,RhsMode,false>, Lhs, Rhs> > -{}; -} - template<typename Lhs, typename Rhs, int RhsMode> -struct SelfadjointProductMatrix<Lhs,0,true,Rhs,RhsMode,false> - : public ProductBase<SelfadjointProductMatrix<Lhs,0,true,Rhs,RhsMode,false>, Lhs, Rhs > +struct selfadjoint_product_impl<Lhs,0,true,Rhs,RhsMode,false> { - EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix) + typedef typename Product<Lhs,Rhs>::Scalar Scalar; + enum { RhsUpLo = RhsMode&(Upper|Lower) }; - enum { - RhsUpLo = RhsMode&(Upper|Lower) - }; - - SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {} - - template<typename Dest> void scaleAndAddTo(Dest& dest, const Scalar& alpha) const + template<typename Dest> + static void run(Dest& dest, const Lhs &a_lhs, const Rhs &a_rhs, const Scalar& alpha) { // let's simply transpose the product Transpose<Dest> destT(dest); - SelfadjointProductMatrix<Transpose<const Rhs>, int(RhsUpLo)==Upper ? Lower : Upper, false, - Transpose<const Lhs>, 0, true>(m_rhs.transpose(), m_lhs.transpose()).scaleAndAddTo(destT, alpha); + selfadjoint_product_impl<Transpose<const Rhs>, int(RhsUpLo)==Upper ? Lower : Upper, false, + Transpose<const Lhs>, 0, true>::run(destT, a_rhs.transpose(), a_lhs.transpose(), alpha); } }; +} // end namespace internal + } // end namespace Eigen #endif // EIGEN_SELFADJOINT_MATRIX_VECTOR_H diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h b/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h index 86684b66d..38f23accf 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h +++ b/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h @@ -25,13 +25,13 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
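A note on the convert_index<BlasIndex>(...) calls that replace the old (MKL_INT) casts throughout these hunks: Eigen's default Index is a 64-bit type, while a Fortran77 BLAS typically expects 32-bit integers, so the conversion has to be a checked narrowing rather than a silent cast. A minimal sketch of what such a helper can look like (the name checked_index_cast is hypothetical; Eigen's actual internal::convert_index may differ in details):

  #include <cassert>

  // Hypothetical stand-in for a checked Index -> BLAS-integer conversion:
  // narrow the value and assert that it survives the round trip instead
  // of silently wrapping on overflow.
  template <typename To, typename From>
  To checked_index_cast(From value)
  {
    assert(value == static_cast<From>(static_cast<To>(value)) &&
           "index too large for the BLAS integer type");
    return static_cast<To>(value);
  }

  // Usage, mirroring the pattern in the hunks above:
  //   BlasIndex lda = checked_index_cast<BlasIndex>(lhsStride);
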
******************************************************************************** - * Content : Eigen bindings to Intel(R) MKL + * Content : Eigen bindings to BLAS F77 * Selfadjoint matrix-vector product functionality based on ?SYMV/HEMV. ******************************************************************************** */ -#ifndef EIGEN_SELFADJOINT_MATRIX_VECTOR_MKL_H -#define EIGEN_SELFADJOINT_MATRIX_VECTOR_MKL_H +#ifndef EIGEN_SELFADJOINT_MATRIX_VECTOR_BLAS_H +#define EIGEN_SELFADJOINT_MATRIX_VECTOR_BLAS_H namespace Eigen { @@ -47,31 +47,31 @@ template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool Conju struct selfadjoint_matrix_vector_product_symv : selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,BuiltIn> {}; -#define EIGEN_MKL_SYMV_SPECIALIZE(Scalar) \ +#define EIGEN_BLAS_SYMV_SPECIALIZE(Scalar) \ template<typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs> \ struct selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,Specialized> { \ static void run( \ Index size, const Scalar* lhs, Index lhsStride, \ - const Scalar* _rhs, Index rhsIncr, Scalar* res, Scalar alpha) { \ + const Scalar* _rhs, Scalar* res, Scalar alpha) { \ enum {\ IsColMajor = StorageOrder==ColMajor \ }; \ if (IsColMajor == ConjugateLhs) {\ selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,BuiltIn>::run( \ - size, lhs, lhsStride, _rhs, rhsIncr, res, alpha); \ + size, lhs, lhsStride, _rhs, res, alpha); \ } else {\ selfadjoint_matrix_vector_product_symv<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs>::run( \ - size, lhs, lhsStride, _rhs, rhsIncr, res, alpha); \ + size, lhs, lhsStride, _rhs, res, alpha); \ }\ } \ }; \ -EIGEN_MKL_SYMV_SPECIALIZE(double) -EIGEN_MKL_SYMV_SPECIALIZE(float) -EIGEN_MKL_SYMV_SPECIALIZE(dcomplex) -EIGEN_MKL_SYMV_SPECIALIZE(scomplex) +EIGEN_BLAS_SYMV_SPECIALIZE(double) +EIGEN_BLAS_SYMV_SPECIALIZE(float) +EIGEN_BLAS_SYMV_SPECIALIZE(dcomplex) +EIGEN_BLAS_SYMV_SPECIALIZE(scomplex) -#define EIGEN_MKL_SYMV_SPECIALIZATION(EIGTYPE,MKLTYPE,MKLFUNC) \ +#define EIGEN_BLAS_SYMV_SPECIALIZATION(EIGTYPE,BLASTYPE,BLASFUNC) \ template<typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs> \ struct selfadjoint_matrix_vector_product_symv<EIGTYPE,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs> \ { \ @@ -79,36 +79,33 @@ typedef Matrix<EIGTYPE,Dynamic,1,ColMajor> SYMVVector;\ \ static void run( \ Index size, const EIGTYPE* lhs, Index lhsStride, \ -const EIGTYPE* _rhs, Index rhsIncr, EIGTYPE* res, EIGTYPE alpha) \ +const EIGTYPE* _rhs, EIGTYPE* res, EIGTYPE alpha) \ { \ enum {\ IsRowMajor = StorageOrder==RowMajor ? 1 : 0, \ IsLower = UpLo == Lower ? 1 : 0 \ }; \ - MKL_INT n=size, lda=lhsStride, incx=rhsIncr, incy=1; \ - MKLTYPE alpha_, beta_; \ - const EIGTYPE *x_ptr, myone(1); \ + BlasIndex n=convert_index<BlasIndex>(size), lda=convert_index<BlasIndex>(lhsStride), incx=1, incy=1; \ + EIGTYPE beta(1); \ + const EIGTYPE *x_ptr; \ char uplo=(IsRowMajor) ? (IsLower ? 'U' : 'L') : (IsLower ? 
'L' : 'U'); \ - assign_scalar_eig2mkl(alpha_, alpha); \ - assign_scalar_eig2mkl(beta_, myone); \ SYMVVector x_tmp; \ if (ConjugateRhs) { \ - Map<const SYMVVector, 0, InnerStride<> > map_x(_rhs,size,1,InnerStride<>(incx)); \ + Map<const SYMVVector, 0 > map_x(_rhs,size,1); \ x_tmp=map_x.conjugate(); \ x_ptr=x_tmp.data(); \ - incx=1; \ } else x_ptr=_rhs; \ - MKLFUNC(&uplo, &n, &alpha_, (const MKLTYPE*)lhs, &lda, (const MKLTYPE*)x_ptr, &incx, &beta_, (MKLTYPE*)res, &incy); \ + BLASFUNC(&uplo, &n, &numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, &numext::real_ref(beta), (BLASTYPE*)res, &incy); \ }\ }; -EIGEN_MKL_SYMV_SPECIALIZATION(double, double, dsymv) -EIGEN_MKL_SYMV_SPECIALIZATION(float, float, ssymv) -EIGEN_MKL_SYMV_SPECIALIZATION(dcomplex, MKL_Complex16, zhemv) -EIGEN_MKL_SYMV_SPECIALIZATION(scomplex, MKL_Complex8, chemv) +EIGEN_BLAS_SYMV_SPECIALIZATION(double, double, dsymv_) +EIGEN_BLAS_SYMV_SPECIALIZATION(float, float, ssymv_) +EIGEN_BLAS_SYMV_SPECIALIZATION(dcomplex, double, zhemv_) +EIGEN_BLAS_SYMV_SPECIALIZATION(scomplex, float, chemv_) } // end namespace internal } // end namespace Eigen -#endif // EIGEN_SELFADJOINT_MATRIX_VECTOR_MKL_H +#endif // EIGEN_SELFADJOINT_MATRIX_VECTOR_BLAS_H diff --git a/Eigen/src/Core/products/SelfadjointProduct.h b/Eigen/src/Core/products/SelfadjointProduct.h index 6ca4ae6c0..f038d686f 100644 --- a/Eigen/src/Core/products/SelfadjointProduct.h +++ b/Eigen/src/Core/products/SelfadjointProduct.h @@ -53,7 +53,6 @@ struct selfadjoint_product_selector<MatrixType,OtherType,UpLo,true> static void run(MatrixType& mat, const OtherType& other, const typename MatrixType::Scalar& alpha) { typedef typename MatrixType::Scalar Scalar; - typedef typename MatrixType::Index Index; typedef internal::blas_traits<OtherType> OtherBlasTraits; typedef typename OtherBlasTraits::DirectLinearAccessType ActualOtherType; typedef typename internal::remove_all<ActualOtherType>::type _ActualOtherType; @@ -86,7 +85,6 @@ struct selfadjoint_product_selector<MatrixType,OtherType,UpLo,false> static void run(MatrixType& mat, const OtherType& other, const typename MatrixType::Scalar& alpha) { typedef typename MatrixType::Scalar Scalar; - typedef typename MatrixType::Index Index; typedef internal::blas_traits<OtherType> OtherBlasTraits; typedef typename OtherBlasTraits::DirectLinearAccessType ActualOtherType; typedef typename internal::remove_all<ActualOtherType>::type _ActualOtherType; @@ -94,15 +92,27 @@ struct selfadjoint_product_selector<MatrixType,OtherType,UpLo,false> Scalar actualAlpha = alpha * OtherBlasTraits::extractScalarFactor(other.derived()); - enum { IsRowMajor = (internal::traits<MatrixType>::Flags&RowMajorBit) ? 1 : 0 }; + enum { + IsRowMajor = (internal::traits<MatrixType>::Flags&RowMajorBit) ? 1 : 0, + OtherIsRowMajor = _ActualOtherType::Flags&RowMajorBit ? 1 : 0 + }; + + Index size = mat.cols(); + Index depth = actualOther.cols(); + + typedef internal::gemm_blocking_space<IsRowMajor ? RowMajor : ColMajor,Scalar,Scalar, + MatrixType::MaxColsAtCompileTime, MatrixType::MaxColsAtCompileTime, _ActualOtherType::MaxColsAtCompileTime> BlockingType; + + BlockingType blocking(size, size, depth, 1, false); + internal::general_matrix_matrix_triangular_product<Index, - Scalar, _ActualOtherType::Flags&RowMajorBit ? RowMajor : ColMajor, OtherBlasTraits::NeedToConjugate && NumTraits<Scalar>::IsComplex, - Scalar, _ActualOtherType::Flags&RowMajorBit ? 
ColMajor : RowMajor, (!OtherBlasTraits::NeedToConjugate) && NumTraits<Scalar>::IsComplex, - MatrixType::Flags&RowMajorBit ? RowMajor : ColMajor, UpLo> - ::run(mat.cols(), actualOther.cols(), + Scalar, OtherIsRowMajor ? RowMajor : ColMajor, OtherBlasTraits::NeedToConjugate && NumTraits<Scalar>::IsComplex, + Scalar, OtherIsRowMajor ? ColMajor : RowMajor, (!OtherBlasTraits::NeedToConjugate) && NumTraits<Scalar>::IsComplex, + IsRowMajor ? RowMajor : ColMajor, UpLo> + ::run(size, depth, &actualOther.coeffRef(0,0), actualOther.outerStride(), &actualOther.coeffRef(0,0), actualOther.outerStride(), - mat.data(), mat.outerStride(), actualAlpha); + mat.data(), mat.outerStride(), actualAlpha, blocking); } }; diff --git a/Eigen/src/Core/products/SelfadjointRank2Update.h b/Eigen/src/Core/products/SelfadjointRank2Update.h index 8594a97ce..2ae364111 100644 --- a/Eigen/src/Core/products/SelfadjointRank2Update.h +++ b/Eigen/src/Core/products/SelfadjointRank2Update.h @@ -79,11 +79,11 @@ SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo> if (IsRowMajor) actualAlpha = numext::conj(actualAlpha); - internal::selfadjoint_rank2_update_selector<Scalar, Index, - typename internal::remove_all<typename internal::conj_expr_if<IsRowMajor ^ UBlasTraits::NeedToConjugate,_ActualUType>::type>::type, - typename internal::remove_all<typename internal::conj_expr_if<IsRowMajor ^ VBlasTraits::NeedToConjugate,_ActualVType>::type>::type, + typedef typename internal::remove_all<typename internal::conj_expr_if<IsRowMajor ^ UBlasTraits::NeedToConjugate,_ActualUType>::type>::type UType; + typedef typename internal::remove_all<typename internal::conj_expr_if<IsRowMajor ^ VBlasTraits::NeedToConjugate,_ActualVType>::type>::type VType; + internal::selfadjoint_rank2_update_selector<Scalar, Index, UType, VType, (IsRowMajor ? int(UpLo==Upper ? Lower : Upper) : UpLo)> - ::run(_expression().const_cast_derived().data(),_expression().outerStride(),actualU,actualV,actualAlpha); + ::run(_expression().const_cast_derived().data(),_expression().outerStride(),UType(actualU),VType(actualV),actualAlpha); return *this; } diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix.h b/Eigen/src/Core/products/TriangularMatrixMatrix.h index 8110507b5..6ec5a8a0b 100644 --- a/Eigen/src/Core/products/TriangularMatrixMatrix.h +++ b/Eigen/src/Core/products/TriangularMatrixMatrix.h @@ -108,7 +108,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true, Index _rows, Index _cols, Index _depth, const Scalar* _lhs, Index lhsStride, const Scalar* _rhs, Index rhsStride, - Scalar* res, Index resStride, + Scalar* _res, Index resStride, const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking) { // strip zeros @@ -117,30 +117,36 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true, Index depth = IsLower ? 
diagSize : _depth; Index cols = _cols; - const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride); - const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride); + typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper; + typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper; + typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper; + LhsMapper lhs(_lhs,lhsStride); + RhsMapper rhs(_rhs,rhsStride); + ResMapper res(_res, resStride); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction + // The small panel size must not be larger than blocking size. + // Usually this should never be the case because SmallPanelWidth^2 is very small + // compared to L2 cache size, but let's be safe: + Index panelWidth = (std::min)(Index(SmallPanelWidth),(std::min)(kc,mc)); std::size_t sizeA = kc*mc; std::size_t sizeB = kc*cols; - std::size_t sizeW = kc*Traits::WorkSpaceFactor; ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA()); ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB()); - ei_declare_aligned_stack_constructed_variable(Scalar, blockW, sizeW, blocking.blockW()); - Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,LhsStorageOrder> triangularBuffer; + Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,LhsStorageOrder> triangularBuffer((internal::constructor_without_unaligned_array_assert())); triangularBuffer.setZero(); if((Mode&ZeroDiag)==ZeroDiag) triangularBuffer.diagonal().setZero(); else triangularBuffer.diagonal().setOnes(); - gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; - gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; - gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs; + gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; + gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; + gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs; for(Index k2=IsLower ? depth : 0; IsLower ? k2>0 : k2<depth; @@ -156,7 +162,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true, k2 = k2+actual_kc-kc; } - pack_rhs(blockB, &rhs(actual_k2,0), rhsStride, actual_kc, cols); + pack_rhs(blockB, rhs.getSubMapper(actual_k2,0), actual_kc, cols); // the selected lhs's panel has to be split in three different parts: // 1 - the part which is zero => skip it @@ -167,9 +173,9 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true, if(IsLower || actual_k2<rows) { // for each small vertical panels of lhs - for (Index k1=0; k1<actual_kc; k1+=SmallPanelWidth) + for (Index k1=0; k1<actual_kc; k1+=panelWidth) { - Index actualPanelWidth = std::min<Index>(actual_kc-k1, SmallPanelWidth); + Index actualPanelWidth = std::min<Index>(actual_kc-k1, panelWidth); Index lengthTarget = IsLower ? actual_kc-k1-actualPanelWidth : k1; Index startBlock = actual_k2+k1; Index blockBOffset = k1; @@ -184,20 +190,22 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true, for (Index i=IsLower ? k+1 : 0; IsLower ? 
i<actualPanelWidth : i<k; ++i) triangularBuffer.coeffRef(i,k) = lhs(startBlock+i,startBlock+k); } - pack_lhs(blockA, triangularBuffer.data(), triangularBuffer.outerStride(), actualPanelWidth, actualPanelWidth); + pack_lhs(blockA, LhsMapper(triangularBuffer.data(), triangularBuffer.outerStride()), actualPanelWidth, actualPanelWidth); - gebp_kernel(res+startBlock, resStride, blockA, blockB, actualPanelWidth, actualPanelWidth, cols, alpha, - actualPanelWidth, actual_kc, 0, blockBOffset, blockW); + gebp_kernel(res.getSubMapper(startBlock, 0), blockA, blockB, + actualPanelWidth, actualPanelWidth, cols, alpha, + actualPanelWidth, actual_kc, 0, blockBOffset); // GEBP with remaining micro panel if (lengthTarget>0) { Index startTarget = IsLower ? actual_k2+k1+actualPanelWidth : actual_k2; - pack_lhs(blockA, &lhs(startTarget,startBlock), lhsStride, actualPanelWidth, lengthTarget); + pack_lhs(blockA, lhs.getSubMapper(startTarget,startBlock), actualPanelWidth, lengthTarget); - gebp_kernel(res+startTarget, resStride, blockA, blockB, lengthTarget, actualPanelWidth, cols, alpha, - actualPanelWidth, actual_kc, 0, blockBOffset, blockW); + gebp_kernel(res.getSubMapper(startTarget, 0), blockA, blockB, + lengthTarget, actualPanelWidth, cols, alpha, + actualPanelWidth, actual_kc, 0, blockBOffset); } } } @@ -208,10 +216,11 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true, for(Index i2=start; i2<end; i2+=mc) { const Index actual_mc = (std::min)(i2+mc,end)-i2; - gemm_pack_lhs<Scalar, Index, Traits::mr,Traits::LhsProgress, LhsStorageOrder,false>() - (blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc); + gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr,Traits::LhsProgress, LhsStorageOrder,false>() + (blockA, lhs.getSubMapper(i2, actual_k2), actual_kc, actual_mc); - gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0, blockW); + gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, + actual_kc, cols, alpha, -1, -1, 0, 0); } } } @@ -249,40 +258,43 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false, Index _rows, Index _cols, Index _depth, const Scalar* _lhs, Index lhsStride, const Scalar* _rhs, Index rhsStride, - Scalar* res, Index resStride, + Scalar* _res, Index resStride, const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking) { + const Index PacketBytes = packet_traits<Scalar>::size*sizeof(Scalar); // strip zeros Index diagSize = (std::min)(_cols,_depth); Index rows = _rows; Index depth = IsLower ? _depth : diagSize; Index cols = IsLower ? 
diagSize : _cols; - const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride); - const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride); + typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper; + typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper; + typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper; + LhsMapper lhs(_lhs,lhsStride); + RhsMapper rhs(_rhs,rhsStride); + ResMapper res(_res, resStride); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction std::size_t sizeA = kc*mc; - std::size_t sizeB = kc*cols; - std::size_t sizeW = kc*Traits::WorkSpaceFactor; + std::size_t sizeB = kc*cols+EIGEN_MAX_ALIGN_BYTES/sizeof(Scalar); ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA()); ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB()); - ei_declare_aligned_stack_constructed_variable(Scalar, blockW, sizeW, blocking.blockW()); - Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,RhsStorageOrder> triangularBuffer; + Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,RhsStorageOrder> triangularBuffer((internal::constructor_without_unaligned_array_assert())); triangularBuffer.setZero(); if((Mode&ZeroDiag)==ZeroDiag) triangularBuffer.diagonal().setZero(); else triangularBuffer.diagonal().setOnes(); - gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; - gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; - gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs; - gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel; + gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; + gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; + gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs; + gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel; for(Index k2=IsLower ? 0 : depth; IsLower ? k2<depth : k2>0; @@ -304,8 +316,9 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false, Index ts = (IsLower && actual_k2>=cols) ? 0 : actual_kc; Scalar* geb = blockB+ts*ts; + geb = geb + internal::first_aligned<PacketBytes>(geb,PacketBytes/sizeof(Scalar)); - pack_rhs(geb, &rhs(actual_k2,IsLower ? 0 : k2), rhsStride, actual_kc, rs); + pack_rhs(geb, rhs.getSubMapper(actual_k2,IsLower ? 0 : k2), actual_kc, rs); // pack the triangular part of the rhs padding the unrolled blocks with zeros if(ts>0) @@ -318,7 +331,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false, Index panelLength = IsLower ? 
actual_kc-j2-actualPanelWidth : j2; // general part pack_rhs_panel(blockB+j2*actual_kc, - &rhs(actual_k2+panelOffset, actual_j2), rhsStride, + rhs.getSubMapper(actual_k2+panelOffset, actual_j2), panelLength, actualPanelWidth, actual_kc, panelOffset); @@ -332,7 +345,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false, } pack_rhs_panel(blockB+j2*actual_kc, - triangularBuffer.data(), triangularBuffer.outerStride(), + RhsMapper(triangularBuffer.data(), triangularBuffer.outerStride()), actualPanelWidth, actualPanelWidth, actual_kc, j2); } @@ -341,7 +354,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false, for (Index i2=0; i2<rows; i2+=mc) { const Index actual_mc = (std::min)(mc,rows-i2); - pack_lhs(blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc); + pack_lhs(blockA, lhs.getSubMapper(i2, actual_k2), actual_kc, actual_mc); // triangular kernel if(ts>0) @@ -352,19 +365,18 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false, Index panelLength = IsLower ? actual_kc-j2 : j2+actualPanelWidth; Index blockOffset = IsLower ? j2 : 0; - gebp_kernel(res+i2+(actual_k2+j2)*resStride, resStride, + gebp_kernel(res.getSubMapper(i2, actual_k2 + j2), blockA, blockB+j2*actual_kc, actual_mc, panelLength, actualPanelWidth, alpha, actual_kc, actual_kc, // strides - blockOffset, blockOffset,// offsets - blockW); // workspace + blockOffset, blockOffset);// offsets } } - gebp_kernel(res+i2+(IsLower ? 0 : k2)*resStride, resStride, + gebp_kernel(res.getSubMapper(i2, IsLower ? 0 : k2), blockA, geb, actual_mc, actual_kc, rs, alpha, - -1, -1, 0, 0, blockW); + -1, -1, 0, 0); } } } @@ -373,28 +385,28 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false, * Wrapper to product_triangular_matrix_matrix ***************************************************************************/ -template<int Mode, bool LhsIsTriangular, typename Lhs, typename Rhs> -struct traits<TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,false> > - : traits<ProductBase<TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,false>, Lhs, Rhs> > -{}; - } // end namespace internal +namespace internal { template<int Mode, bool LhsIsTriangular, typename Lhs, typename Rhs> -struct TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,false> - : public ProductBase<TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,false>, Lhs, Rhs > +struct triangular_product_impl<Mode,LhsIsTriangular,Lhs,false,Rhs,false> { - EIGEN_PRODUCT_PUBLIC_INTERFACE(TriangularProduct) - - TriangularProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {} - - template<typename Dest> void scaleAndAddTo(Dest& dst, const Scalar& alpha) const + template<typename Dest> static void run(Dest& dst, const Lhs &a_lhs, const Rhs &a_rhs, const typename Dest::Scalar& alpha) { - typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(m_lhs); - typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(m_rhs); + typedef typename Dest::Scalar Scalar; + + typedef internal::blas_traits<Lhs> LhsBlasTraits; + typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType; + typedef typename internal::remove_all<ActualLhsType>::type ActualLhsTypeCleaned; + typedef internal::blas_traits<Rhs> RhsBlasTraits; + typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType; + typedef typename internal::remove_all<ActualRhsType>::type ActualRhsTypeCleaned; + + typename 
internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(a_lhs); + typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(a_rhs); - Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs) - * RhsBlasTraits::extractScalarFactor(m_rhs); + Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs) + * RhsBlasTraits::extractScalarFactor(a_rhs); typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,Scalar,Scalar, Lhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxColsAtCompileTime,4> BlockingType; @@ -405,23 +417,25 @@ struct TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,false> Index stripedDepth = LhsIsTriangular ? ((!IsLower) ? lhs.cols() : (std::min)(lhs.cols(),lhs.rows())) : ((IsLower) ? rhs.rows() : (std::min)(rhs.rows(),rhs.cols())); - BlockingType blocking(stripedRows, stripedCols, stripedDepth); + BlockingType blocking(stripedRows, stripedCols, stripedDepth, 1, false); internal::product_triangular_matrix_matrix<Scalar, Index, Mode, LhsIsTriangular, - (internal::traits<_ActualLhsType>::Flags&RowMajorBit) ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate, - (internal::traits<_ActualRhsType>::Flags&RowMajorBit) ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate, + (internal::traits<ActualLhsTypeCleaned>::Flags&RowMajorBit) ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate, + (internal::traits<ActualRhsTypeCleaned>::Flags&RowMajorBit) ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate, (internal::traits<Dest >::Flags&RowMajorBit) ? RowMajor : ColMajor> ::run( stripedRows, stripedCols, stripedDepth, // sizes - &lhs.coeffRef(0,0), lhs.outerStride(), // lhs info - &rhs.coeffRef(0,0), rhs.outerStride(), // rhs info + &lhs.coeffRef(0,0), lhs.outerStride(), // lhs info + &rhs.coeffRef(0,0), rhs.outerStride(), // rhs info &dst.coeffRef(0,0), dst.outerStride(), // result info actualAlpha, blocking ); } }; +} // end namespace internal + } // end namespace Eigen #endif // EIGEN_TRIANGULAR_MATRIX_MATRIX_H diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h b/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h index ba41a1c99..aecded6bb 100644 --- a/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h +++ b/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h @@ -25,13 +25,13 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ******************************************************************************** - * Content : Eigen bindings to Intel(R) MKL + * Content : Eigen bindings to BLAS F77 * Triangular matrix * matrix product functionality based on ?TRMM. 
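The ?TRMM bindings that follow work around an in-place BLAS convention: xTRMM computes B := alpha*op(A)*B, overwriting B, and accumulates nothing. That is why each macro copies the dense operand into b_tmp, runs TRMM on the copy, and only then adds the product into res. A sketch of the raw Fortran77 interface being wrapped, for the double case (declaration assumed to match a standard F77 BLAS; the integer width depends on the BLAS build, hence BlasIndex in these hunks):

  extern "C" void dtrmm_(const char* side, const char* uplo,
                         const char* transa, const char* diag,
                         const int* m, const int* n, const double* alpha,
                         const double* a, const int* lda,
                         double* b, const int* ldb);

  // With side='L', a is an m-by-m triangular factor and the m-by-n
  // column-major matrix b is overwritten with alpha * op(a) * b;
  // nothing is accumulated, hence the copy-then-add dance in the
  // EIGEN_BLAS_TRMM_* macros.
  void trmm_in_place(int m, int n, double alpha,
                     const double* a, int lda, double* b, int ldb)
  {
    const char side = 'L', uplo = 'L', transa = 'N', diag = 'N';
    dtrmm_(&side, &uplo, &transa, &diag, &m, &n, &alpha, a, &lda, b, &ldb);
  }
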
******************************************************************************** */ -#ifndef EIGEN_TRIANGULAR_MATRIX_MATRIX_MKL_H -#define EIGEN_TRIANGULAR_MATRIX_MATRIX_MKL_H +#ifndef EIGEN_TRIANGULAR_MATRIX_MATRIX_BLAS_H +#define EIGEN_TRIANGULAR_MATRIX_MATRIX_BLAS_H namespace Eigen { @@ -50,7 +50,7 @@ struct product_triangular_matrix_matrix_trmm : // try to go to BLAS specialization -#define EIGEN_MKL_TRMM_SPECIALIZE(Scalar, LhsIsTriangular) \ +#define EIGEN_BLAS_TRMM_SPECIALIZE(Scalar, LhsIsTriangular) \ template <typename Index, int Mode, \ int LhsStorageOrder, bool ConjugateLhs, \ int RhsStorageOrder, bool ConjugateRhs> \ @@ -65,17 +65,17 @@ struct product_triangular_matrix_matrix<Scalar,Index, Mode, LhsIsTriangular, \ } \ }; -EIGEN_MKL_TRMM_SPECIALIZE(double, true) -EIGEN_MKL_TRMM_SPECIALIZE(double, false) -EIGEN_MKL_TRMM_SPECIALIZE(dcomplex, true) -EIGEN_MKL_TRMM_SPECIALIZE(dcomplex, false) -EIGEN_MKL_TRMM_SPECIALIZE(float, true) -EIGEN_MKL_TRMM_SPECIALIZE(float, false) -EIGEN_MKL_TRMM_SPECIALIZE(scomplex, true) -EIGEN_MKL_TRMM_SPECIALIZE(scomplex, false) +EIGEN_BLAS_TRMM_SPECIALIZE(double, true) +EIGEN_BLAS_TRMM_SPECIALIZE(double, false) +EIGEN_BLAS_TRMM_SPECIALIZE(dcomplex, true) +EIGEN_BLAS_TRMM_SPECIALIZE(dcomplex, false) +EIGEN_BLAS_TRMM_SPECIALIZE(float, true) +EIGEN_BLAS_TRMM_SPECIALIZE(float, false) +EIGEN_BLAS_TRMM_SPECIALIZE(scomplex, true) +EIGEN_BLAS_TRMM_SPECIALIZE(scomplex, false) // implements col-major += alpha * op(triangular) * op(general) -#define EIGEN_MKL_TRMM_L(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \ +#define EIGEN_BLAS_TRMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ template <typename Index, int Mode, \ int LhsStorageOrder, bool ConjugateLhs, \ int RhsStorageOrder, bool ConjugateRhs> \ @@ -106,13 +106,14 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \ typedef Matrix<EIGTYPE, Dynamic, Dynamic, LhsStorageOrder> MatrixLhs; \ typedef Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> MatrixRhs; \ \ -/* Non-square case - doesn't fit to MKL ?TRMM. Fall to default triangular product or call MKL ?GEMM*/ \ +/* Non-square case - doesn't fit to BLAS ?TRMM. 
Fall to default triangular product or call BLAS ?GEMM*/ \ if (rows != depth) { \ \ - int nthr = mkl_domain_get_max_threads(MKL_BLAS); \ + /* FIXME handle mkl_domain_get_max_threads */ \ + /*int nthr = mkl_domain_get_max_threads(EIGEN_BLAS_DOMAIN_BLAS);*/ int nthr = 1;\ \ if (((nthr==1) && (((std::max)(rows,depth)-diagSize)/(double)diagSize < 0.5))) { \ - /* Most likely no benefit to call TRMM or GEMM from MKL*/ \ + /* Most likely no benefit to call TRMM or GEMM from BLAS */ \ product_triangular_matrix_matrix<EIGTYPE,Index,Mode,true, \ LhsStorageOrder,ConjugateLhs, RhsStorageOrder, ConjugateRhs, ColMajor, BuiltIn>::run( \ _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking); \ @@ -121,27 +122,23 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \ /* Make sense to call GEMM */ \ Map<const MatrixLhs, 0, OuterStride<> > lhsMap(_lhs,rows,depth,OuterStride<>(lhsStride)); \ MatrixLhs aa_tmp=lhsMap.template triangularView<Mode>(); \ - MKL_INT aStride = aa_tmp.outerStride(); \ - gemm_blocking_space<ColMajor,EIGTYPE,EIGTYPE,Dynamic,Dynamic,Dynamic> gemm_blocking(_rows,_cols,_depth); \ + BlasIndex aStride = convert_index<BlasIndex>(aa_tmp.outerStride()); \ + gemm_blocking_space<ColMajor,EIGTYPE,EIGTYPE,Dynamic,Dynamic,Dynamic> gemm_blocking(_rows,_cols,_depth, 1, true); \ general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor>::run( \ rows, cols, depth, aa_tmp.data(), aStride, _rhs, rhsStride, res, resStride, alpha, gemm_blocking, 0); \ \ - /*std::cout << "TRMM_L: A is not square! Go to MKL GEMM implementation! " << nthr<<" \n";*/ \ + /*std::cout << "TRMM_L: A is not square! Go to BLAS GEMM implementation! " << nthr<<" \n";*/ \ } \ return; \ } \ char side = 'L', transa, uplo, diag = 'N'; \ EIGTYPE *b; \ const EIGTYPE *a; \ - MKL_INT m, n, lda, ldb; \ - MKLTYPE alpha_; \ -\ -/* Set alpha_*/ \ - assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); \ + BlasIndex m, n, lda, ldb; \ \ /* Set m, n */ \ - m = (MKL_INT)diagSize; \ - n = (MKL_INT)cols; \ + m = convert_index<BlasIndex>(diagSize); \ + n = convert_index<BlasIndex>(cols); \ \ /* Set trans */ \ transa = (LhsStorageOrder==RowMajor) ? ((ConjugateLhs) ? 'C' : 'T') : 'N'; \ @@ -152,7 +149,7 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \ \ if (ConjugateRhs) b_tmp = rhs.conjugate(); else b_tmp = rhs; \ b = b_tmp.data(); \ - ldb = b_tmp.outerStride(); \ + ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \ \ /* Set uplo */ \ uplo = IsLower ? 'L' : 'U'; \ @@ -168,14 +165,14 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \ else if (IsUnitDiag) \ a_tmp.diagonal().setOnes();\ a = a_tmp.data(); \ - lda = a_tmp.outerStride(); \ + lda = convert_index<BlasIndex>(a_tmp.outerStride()); \ } else { \ a = _lhs; \ - lda = lhsStride; \ + lda = convert_index<BlasIndex>(lhsStride); \ } \ - /*std::cout << "TRMM_L: A is square! Go to MKL TRMM implementation! \n";*/ \ + /*std::cout << "TRMM_L: A is square! Go to BLAS TRMM implementation! 
\n";*/ \ /* call ?trmm*/ \ - MKLPREFIX##trmm(&side, &uplo, &transa, &diag, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (MKLTYPE*)b, &ldb); \ + BLASPREFIX##trmm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)b, &ldb); \ \ /* Add op(a_triangular)*b into res*/ \ Map<MatrixX##EIGPREFIX, 0, OuterStride<> > res_tmp(res,rows,cols,OuterStride<>(resStride)); \ @@ -183,13 +180,13 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \ } \ }; -EIGEN_MKL_TRMM_L(double, double, d, d) -EIGEN_MKL_TRMM_L(dcomplex, MKL_Complex16, cd, z) -EIGEN_MKL_TRMM_L(float, float, f, s) -EIGEN_MKL_TRMM_L(scomplex, MKL_Complex8, cf, c) +EIGEN_BLAS_TRMM_L(double, double, d, d) +EIGEN_BLAS_TRMM_L(dcomplex, double, cd, z) +EIGEN_BLAS_TRMM_L(float, float, f, s) +EIGEN_BLAS_TRMM_L(scomplex, float, cf, c) // implements col-major += alpha * op(general) * op(triangular) -#define EIGEN_MKL_TRMM_R(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \ +#define EIGEN_BLAS_TRMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ template <typename Index, int Mode, \ int LhsStorageOrder, bool ConjugateLhs, \ int RhsStorageOrder, bool ConjugateRhs> \ @@ -220,13 +217,13 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \ typedef Matrix<EIGTYPE, Dynamic, Dynamic, LhsStorageOrder> MatrixLhs; \ typedef Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> MatrixRhs; \ \ -/* Non-square case - doesn't fit to MKL ?TRMM. Fall to default triangular product or call MKL ?GEMM*/ \ +/* Non-square case - doesn't fit to BLAS ?TRMM. Fall to default triangular product or call BLAS ?GEMM*/ \ if (cols != depth) { \ \ - int nthr = mkl_domain_get_max_threads(MKL_BLAS); \ + int nthr = 1 /*mkl_domain_get_max_threads(EIGEN_BLAS_DOMAIN_BLAS)*/; \ \ if ((nthr==1) && (((std::max)(cols,depth)-diagSize)/(double)diagSize < 0.5)) { \ - /* Most likely no benefit to call TRMM or GEMM from MKL*/ \ + /* Most likely no benefit to call TRMM or GEMM from BLAS*/ \ product_triangular_matrix_matrix<EIGTYPE,Index,Mode,false, \ LhsStorageOrder,ConjugateLhs, RhsStorageOrder, ConjugateRhs, ColMajor, BuiltIn>::run( \ _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking); \ @@ -235,27 +232,23 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \ /* Make sense to call GEMM */ \ Map<const MatrixRhs, 0, OuterStride<> > rhsMap(_rhs,depth,cols, OuterStride<>(rhsStride)); \ MatrixRhs aa_tmp=rhsMap.template triangularView<Mode>(); \ - MKL_INT aStride = aa_tmp.outerStride(); \ - gemm_blocking_space<ColMajor,EIGTYPE,EIGTYPE,Dynamic,Dynamic,Dynamic> gemm_blocking(_rows,_cols,_depth); \ + BlasIndex aStride = convert_index<BlasIndex>(aa_tmp.outerStride()); \ + gemm_blocking_space<ColMajor,EIGTYPE,EIGTYPE,Dynamic,Dynamic,Dynamic> gemm_blocking(_rows,_cols,_depth, 1, true); \ general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor>::run( \ rows, cols, depth, _lhs, lhsStride, aa_tmp.data(), aStride, res, resStride, alpha, gemm_blocking, 0); \ \ - /*std::cout << "TRMM_R: A is not square! Go to MKL GEMM implementation! " << nthr<<" \n";*/ \ + /*std::cout << "TRMM_R: A is not square! Go to BLAS GEMM implementation! 
" << nthr<<" \n";*/ \ } \ return; \ } \ char side = 'R', transa, uplo, diag = 'N'; \ EIGTYPE *b; \ const EIGTYPE *a; \ - MKL_INT m, n, lda, ldb; \ - MKLTYPE alpha_; \ -\ -/* Set alpha_*/ \ - assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); \ + BlasIndex m, n, lda, ldb; \ \ /* Set m, n */ \ - m = (MKL_INT)rows; \ - n = (MKL_INT)diagSize; \ + m = convert_index<BlasIndex>(rows); \ + n = convert_index<BlasIndex>(diagSize); \ \ /* Set trans */ \ transa = (RhsStorageOrder==RowMajor) ? ((ConjugateRhs) ? 'C' : 'T') : 'N'; \ @@ -266,7 +259,7 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \ \ if (ConjugateLhs) b_tmp = lhs.conjugate(); else b_tmp = lhs; \ b = b_tmp.data(); \ - ldb = b_tmp.outerStride(); \ + ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \ \ /* Set uplo */ \ uplo = IsLower ? 'L' : 'U'; \ @@ -282,14 +275,14 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \ else if (IsUnitDiag) \ a_tmp.diagonal().setOnes();\ a = a_tmp.data(); \ - lda = a_tmp.outerStride(); \ + lda = convert_index<BlasIndex>(a_tmp.outerStride()); \ } else { \ a = _rhs; \ - lda = rhsStride; \ + lda = convert_index<BlasIndex>(rhsStride); \ } \ - /*std::cout << "TRMM_R: A is square! Go to MKL TRMM implementation! \n";*/ \ + /*std::cout << "TRMM_R: A is square! Go to BLAS TRMM implementation! \n";*/ \ /* call ?trmm*/ \ - MKLPREFIX##trmm(&side, &uplo, &transa, &diag, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (MKLTYPE*)b, &ldb); \ + BLASPREFIX##trmm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)b, &ldb); \ \ /* Add op(a_triangular)*b into res*/ \ Map<MatrixX##EIGPREFIX, 0, OuterStride<> > res_tmp(res,rows,cols,OuterStride<>(resStride)); \ @@ -297,13 +290,13 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \ } \ }; -EIGEN_MKL_TRMM_R(double, double, d, d) -EIGEN_MKL_TRMM_R(dcomplex, MKL_Complex16, cd, z) -EIGEN_MKL_TRMM_R(float, float, f, s) -EIGEN_MKL_TRMM_R(scomplex, MKL_Complex8, cf, c) +EIGEN_BLAS_TRMM_R(double, double, d, d) +EIGEN_BLAS_TRMM_R(dcomplex, double, cd, z) +EIGEN_BLAS_TRMM_R(float, float, f, s) +EIGEN_BLAS_TRMM_R(scomplex, float, cf, c) } // end namespace internal } // end namespace Eigen -#endif // EIGEN_TRIANGULAR_MATRIX_MATRIX_MKL_H +#endif // EIGEN_TRIANGULAR_MATRIX_MATRIX_BLAS_H diff --git a/Eigen/src/Core/products/TriangularMatrixVector.h b/Eigen/src/Core/products/TriangularMatrixVector.h index 6117d5a82..4b292e74d 100644 --- a/Eigen/src/Core/products/TriangularMatrixVector.h +++ b/Eigen/src/Core/products/TriangularMatrixVector.h @@ -10,7 +10,7 @@ #ifndef EIGEN_TRIANGULARMATRIXVECTOR_H #define EIGEN_TRIANGULARMATRIXVECTOR_H -namespace Eigen { +namespace Eigen { namespace internal { @@ -20,20 +20,20 @@ struct triangular_matrix_vector_product; template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs, int Version> struct triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,ColMajor,Version> { - typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar; + typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar; enum { IsLower = ((Mode&Lower)==Lower), HasUnitDiag = (Mode & UnitDiag)==UnitDiag, HasZeroDiag = (Mode & ZeroDiag)==ZeroDiag }; static EIGEN_DONT_INLINE void run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride, - const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const ResScalar& alpha); + const 
RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const RhsScalar& alpha); }; template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs, int Version> EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,ColMajor,Version> ::run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride, - const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const ResScalar& alpha) + const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const RhsScalar& alpha) { static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH; Index size = (std::min)(_rows,_cols); @@ -43,7 +43,7 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,Con typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,ColMajor>, 0, OuterStride<> > LhsMap; const LhsMap lhs(_lhs,rows,cols,OuterStride<>(lhsStride)); typename conj_expr_if<ConjLhs,LhsMap>::type cjLhs(lhs); - + typedef Map<const Matrix<RhsScalar,Dynamic,1>, 0, InnerStride<> > RhsMap; const RhsMap rhs(_rhs,cols,InnerStride<>(rhsIncr)); typename conj_expr_if<ConjRhs,RhsMap>::type cjRhs(rhs); @@ -51,6 +51,9 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,Con typedef Map<Matrix<ResScalar,Dynamic,1> > ResMap; ResMap res(_res,rows); + typedef const_blas_data_mapper<LhsScalar,Index,ColMajor> LhsMapper; + typedef const_blas_data_mapper<RhsScalar,Index,RowMajor> RhsMapper; + for (Index pi=0; pi<size; pi+=PanelWidth) { Index actualPanelWidth = (std::min)(PanelWidth, size-pi); @@ -68,19 +71,19 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,Con if (r>0) { Index s = IsLower ? pi+actualPanelWidth : 0; - general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjLhs,RhsScalar,ConjRhs,BuiltIn>::run( + general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjLhs,RhsScalar,RhsMapper,ConjRhs,BuiltIn>::run( r, actualPanelWidth, - &lhs.coeffRef(s,pi), lhsStride, - &rhs.coeffRef(pi), rhsIncr, + LhsMapper(&lhs.coeffRef(s,pi), lhsStride), + RhsMapper(&rhs.coeffRef(pi), rhsIncr), &res.coeffRef(s), resIncr, alpha); } } if((!IsLower) && cols>size) { - general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjLhs,RhsScalar,ConjRhs>::run( + general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjLhs,RhsScalar,RhsMapper,ConjRhs>::run( rows, cols-size, - &lhs.coeffRef(0,size), lhsStride, - &rhs.coeffRef(size), rhsIncr, + LhsMapper(&lhs.coeffRef(0,size), lhsStride), + RhsMapper(&rhs.coeffRef(size), rhsIncr), _res, resIncr, alpha); } } @@ -88,7 +91,7 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,Con template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs,int Version> struct triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,RowMajor,Version> { - typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar; + typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar; enum { IsLower = ((Mode&Lower)==Lower), HasUnitDiag = (Mode & UnitDiag)==UnitDiag, @@ -118,7 +121,10 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,Con typedef Map<Matrix<ResScalar,Dynamic,1>, 0, InnerStride<> > ResMap; ResMap res(_res,rows,InnerStride<>(resIncr)); - + + typedef const_blas_data_mapper<LhsScalar,Index,RowMajor> LhsMapper; + typedef const_blas_data_mapper<RhsScalar,Index,RowMajor> RhsMapper; 
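The recurring change in this file, as in TriangularMatrixMatrix.h above, is that the vector and packing kernels now receive pointer-plus-stride bundles (const_blas_data_mapper / blas_data_mapper) instead of separate raw pointer and stride arguments; see the LhsMapper(&lhs.coeffRef(s,pi), lhsStride) calls in the hunks around this point. A simplified sketch of the idea (the real mappers also provide packet loads and alignment handling):

  // Minimal column-major data mapper: bundles the base pointer and the
  // outer stride so a kernel can index in 2-D and re-base cheaply.
  template <typename Scalar, typename Index>
  class SimpleColMajorMapper {
  public:
    SimpleColMajorMapper(const Scalar* data, Index stride)
      : m_data(data), m_stride(stride) {}

    // Element access in column-major layout.
    const Scalar& operator()(Index i, Index j) const
    { return m_data[i + j * m_stride]; }

    // View re-based at (i, j) -- replaces the old idiom of passing
    // &mat(i, j) together with the stride as two loose arguments.
    SimpleColMajorMapper getSubMapper(Index i, Index j) const
    { return SimpleColMajorMapper(&operator()(i, j), m_stride); }

  private:
    const Scalar* m_data;
    Index m_stride;
  };

Passing one object instead of two keeps pointer and stride consistent across sub-blocks, which is what calls like rhs.getSubMapper(actual_k2, 0) rely on in the matrix-matrix hunks above.
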
+ for (Index pi=0; pi<diagSize; pi+=PanelWidth) { Index actualPanelWidth = (std::min)(PanelWidth, diagSize-pi); @@ -136,19 +142,19 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,Con if (r>0) { Index s = IsLower ? 0 : pi + actualPanelWidth; - general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjLhs,RhsScalar,ConjRhs,BuiltIn>::run( + general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjLhs,RhsScalar,RhsMapper,ConjRhs,BuiltIn>::run( actualPanelWidth, r, - &lhs.coeffRef(pi,s), lhsStride, - &rhs.coeffRef(s), rhsIncr, + LhsMapper(&lhs.coeffRef(pi,s), lhsStride), + RhsMapper(&rhs.coeffRef(s), rhsIncr), &res.coeffRef(pi), resIncr, alpha); } } if(IsLower && rows>diagSize) { - general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjLhs,RhsScalar,ConjRhs>::run( + general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjLhs,RhsScalar,RhsMapper,ConjRhs>::run( rows-diagSize, cols, - &lhs.coeffRef(diagSize,0), lhsStride, - &rhs.coeffRef(0), rhsIncr, + LhsMapper(&lhs.coeffRef(diagSize,0), lhsStride), + RhsMapper(&rhs.coeffRef(0), rhsIncr), &res.coeffRef(diagSize), resIncr, alpha); } } @@ -157,83 +163,66 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,Con * Wrapper to product_triangular_vector ***************************************************************************/ -template<int Mode, bool LhsIsTriangular, typename Lhs, typename Rhs> -struct traits<TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,true> > - : traits<ProductBase<TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,true>, Lhs, Rhs> > -{}; - -template<int Mode, bool LhsIsTriangular, typename Lhs, typename Rhs> -struct traits<TriangularProduct<Mode,LhsIsTriangular,Lhs,true,Rhs,false> > - : traits<ProductBase<TriangularProduct<Mode,LhsIsTriangular,Lhs,true,Rhs,false>, Lhs, Rhs> > -{}; - - -template<int StorageOrder> +template<int Mode,int StorageOrder> struct trmv_selector; } // end namespace internal +namespace internal { + template<int Mode, typename Lhs, typename Rhs> -struct TriangularProduct<Mode,true,Lhs,false,Rhs,true> - : public ProductBase<TriangularProduct<Mode,true,Lhs,false,Rhs,true>, Lhs, Rhs > +struct triangular_product_impl<Mode,true,Lhs,false,Rhs,true> { - EIGEN_PRODUCT_PUBLIC_INTERFACE(TriangularProduct) - - TriangularProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {} - - template<typename Dest> void scaleAndAddTo(Dest& dst, const Scalar& alpha) const + template<typename Dest> static void run(Dest& dst, const Lhs &lhs, const Rhs &rhs, const typename Dest::Scalar& alpha) { - eigen_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols()); + eigen_assert(dst.rows()==lhs.rows() && dst.cols()==rhs.cols()); - internal::trmv_selector<(int(internal::traits<Lhs>::Flags)&RowMajorBit) ? RowMajor : ColMajor>::run(*this, dst, alpha); + internal::trmv_selector<Mode,(int(internal::traits<Lhs>::Flags)&RowMajorBit) ? 
       RowMajor : ColMajor>::run(lhs, rhs, dst, alpha);
   }
 };
 
 template<int Mode, typename Lhs, typename Rhs>
-struct TriangularProduct<Mode,false,Lhs,true,Rhs,false>
-  : public ProductBase<TriangularProduct<Mode,false,Lhs,true,Rhs,false>, Lhs, Rhs >
+struct triangular_product_impl<Mode,false,Lhs,true,Rhs,false>
 {
-  EIGEN_PRODUCT_PUBLIC_INTERFACE(TriangularProduct)
-
-  TriangularProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}
-
-  template<typename Dest> void scaleAndAddTo(Dest& dst, const Scalar& alpha) const
+  template<typename Dest> static void run(Dest& dst, const Lhs &lhs, const Rhs &rhs, const typename Dest::Scalar& alpha)
   {
-    eigen_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols());
+    eigen_assert(dst.rows()==lhs.rows() && dst.cols()==rhs.cols());
 
-    typedef TriangularProduct<(Mode & (UnitDiag|ZeroDiag)) | ((Mode & Lower) ? Upper : Lower),true,Transpose<const Rhs>,false,Transpose<const Lhs>,true> TriangularProductTranspose;
     Transpose<Dest> dstT(dst);
-    internal::trmv_selector<(int(internal::traits<Rhs>::Flags)&RowMajorBit) ? ColMajor : RowMajor>::run(
-      TriangularProductTranspose(m_rhs.transpose(),m_lhs.transpose()), dstT, alpha);
+    internal::trmv_selector<(Mode & (UnitDiag|ZeroDiag)) | ((Mode & Lower) ? Upper : Lower),
+                            (int(internal::traits<Rhs>::Flags)&RowMajorBit) ? ColMajor : RowMajor>
+      ::run(rhs.transpose(),lhs.transpose(), dstT, alpha);
   }
 };
 
+} // end namespace internal
+
 namespace internal {
 
 // TODO: find a way to factorize this piece of code with gemv_selector since the logic is exactly the same.
 
-template<> struct trmv_selector<ColMajor>
+template<int Mode> struct trmv_selector<Mode,ColMajor>
 {
-  template<int Mode, typename Lhs, typename Rhs, typename Dest>
-  static void run(const TriangularProduct<Mode,true,Lhs,false,Rhs,true>& prod, Dest& dest, const typename TriangularProduct<Mode,true,Lhs,false,Rhs,true>::Scalar& alpha)
+  template<typename Lhs, typename Rhs, typename Dest>
+  static void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha)
   {
-    typedef TriangularProduct<Mode,true,Lhs,false,Rhs,true> ProductType;
-    typedef typename ProductType::Index Index;
-    typedef typename ProductType::LhsScalar LhsScalar;
-    typedef typename ProductType::RhsScalar RhsScalar;
-    typedef typename ProductType::Scalar ResScalar;
-    typedef typename ProductType::RealScalar RealScalar;
-    typedef typename ProductType::ActualLhsType ActualLhsType;
-    typedef typename ProductType::ActualRhsType ActualRhsType;
-    typedef typename ProductType::LhsBlasTraits LhsBlasTraits;
-    typedef typename ProductType::RhsBlasTraits RhsBlasTraits;
-    typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest;
-
-    typename internal::add_const_on_value_type<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(prod.lhs());
-    typename internal::add_const_on_value_type<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(prod.rhs());
-
-    ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs())
-                                  * RhsBlasTraits::extractScalarFactor(prod.rhs());
+    typedef typename Lhs::Scalar      LhsScalar;
+    typedef typename Rhs::Scalar      RhsScalar;
+    typedef typename Dest::Scalar     ResScalar;
+    typedef typename Dest::RealScalar RealScalar;
+
+    typedef internal::blas_traits<Lhs> LhsBlasTraits;
+    typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
+    typedef internal::blas_traits<Rhs> RhsBlasTraits;
+    typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
+
+    typedef Map<Matrix<ResScalar,Dynamic,1>, EIGEN_PLAIN_ENUM_MIN(AlignedMax,internal::packet_traits<ResScalar>::size)> MappedDest;
+
+    typename internal::add_const_on_value_type<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(lhs);
+    typename internal::add_const_on_value_type<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(rhs);
+
+    ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(lhs)
+                                  * RhsBlasTraits::extractScalarFactor(rhs);
 
     enum {
       // FIXME find a way to allow an inner stride on the result if packet_traits<Scalar>::size==1
@@ -247,7 +236,7 @@ template<> struct trmv_selector<ColMajor>
 
     bool alphaIsCompatible = (!ComplexByReal) || (numext::imag(actualAlpha)==RealScalar(0));
     bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible;
-    
+
     RhsScalar compatibleAlpha = get_factor<ResScalar,RhsScalar>::run(actualAlpha);
 
     ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(),
@@ -267,7 +256,7 @@ template<> struct trmv_selector<ColMajor>
       else
         MappedDest(actualDestPtr, dest.size()) = dest;
     }
-    
+
     internal::triangular_matrix_vector_product
       <Index,Mode,
        LhsScalar, LhsBlasTraits::NeedToConjugate,
@@ -288,33 +277,32 @@ template<> struct trmv_selector<ColMajor>
   }
 };
 
-template<> struct trmv_selector<RowMajor>
+template<int Mode> struct trmv_selector<Mode,RowMajor>
 {
-  template<int Mode, typename Lhs, typename Rhs, typename Dest>
-  static void run(const TriangularProduct<Mode,true,Lhs,false,Rhs,true>& prod, Dest& dest, const typename TriangularProduct<Mode,true,Lhs,false,Rhs,true>::Scalar& alpha)
+  template<typename Lhs, typename Rhs, typename Dest>
+  static void run(const Lhs &lhs, const Rhs &rhs, Dest& dest, const typename Dest::Scalar& alpha)
   {
-    typedef TriangularProduct<Mode,true,Lhs,false,Rhs,true> ProductType;
-    typedef typename ProductType::LhsScalar LhsScalar;
-    typedef typename ProductType::RhsScalar RhsScalar;
-    typedef typename ProductType::Scalar ResScalar;
-    typedef typename ProductType::Index Index;
-    typedef typename ProductType::ActualLhsType ActualLhsType;
-    typedef typename ProductType::ActualRhsType ActualRhsType;
-    typedef typename ProductType::_ActualRhsType _ActualRhsType;
-    typedef typename ProductType::LhsBlasTraits LhsBlasTraits;
-    typedef typename ProductType::RhsBlasTraits RhsBlasTraits;
-
-    typename add_const<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(prod.lhs());
-    typename add_const<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(prod.rhs());
-
-    ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs())
-                                  * RhsBlasTraits::extractScalarFactor(prod.rhs());
+    typedef typename Lhs::Scalar      LhsScalar;
+    typedef typename Rhs::Scalar      RhsScalar;
+    typedef typename Dest::Scalar     ResScalar;
+
+    typedef internal::blas_traits<Lhs> LhsBlasTraits;
+    typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
+    typedef internal::blas_traits<Rhs> RhsBlasTraits;
+    typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
+    typedef typename internal::remove_all<ActualRhsType>::type ActualRhsTypeCleaned;
+
+    typename add_const<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(lhs);
+    typename add_const<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(rhs);
+
+    ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(lhs)
+                                  * RhsBlasTraits::extractScalarFactor(rhs);
 
     enum {
-      DirectlyUseRhs = _ActualRhsType::InnerStrideAtCompileTime==1
+      DirectlyUseRhs = ActualRhsTypeCleaned::InnerStrideAtCompileTime==1
     };
 
-    gemv_static_vector_if<RhsScalar,_ActualRhsType::SizeAtCompileTime,_ActualRhsType::MaxSizeAtCompileTime,!DirectlyUseRhs> static_rhs;
+    gemv_static_vector_if<RhsScalar,ActualRhsTypeCleaned::SizeAtCompileTime,ActualRhsTypeCleaned::MaxSizeAtCompileTime,!DirectlyUseRhs> static_rhs;
 
     ei_declare_aligned_stack_constructed_variable(RhsScalar,actualRhsPtr,actualRhs.size(),
         DirectlyUseRhs ? const_cast<RhsScalar*>(actualRhs.data()) : static_rhs.data());
 
@@ -322,12 +310,12 @@ template<> struct trmv_selector<RowMajor>
     if(!DirectlyUseRhs)
     {
       #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
-      int size = actualRhs.size();
+      Index size = actualRhs.size();
       EIGEN_DENSE_STORAGE_CTOR_PLUGIN
       #endif
-      Map<typename _ActualRhsType::PlainObject>(actualRhsPtr, actualRhs.size()) = actualRhs;
+      Map<typename ActualRhsTypeCleaned::PlainObject>(actualRhsPtr, actualRhs.size()) = actualRhs;
     }
-    
+
     internal::triangular_matrix_vector_product
       <Index,Mode,
        LhsScalar, LhsBlasTraits::NeedToConjugate,
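For reference, the code path refactored above serves products of the form dst = alpha * mat.triangularView<Mode>() * vec, and the run() method shown earlier handles the transposed case by flipping Lower/Upper and swapping the operands so the opposite-storage-order selector can be reused. A minimal standalone sketch of that identity using only Eigen's public API (illustration, not part of the patch):

    #include <Eigen/Dense>
    #include <iostream>

    int main()
    {
      Eigen::MatrixXd A = Eigen::MatrixXd::Random(4,4);
      Eigen::VectorXd x = Eigen::VectorXd::Random(4);

      // Triangular matrix * vector: dispatches to trmv_selector<Mode,StorageOrder>.
      Eigen::VectorXd y = A.triangularView<Eigen::Lower>() * x;

      // The identity exploited above: (L*x)^T = x^T * L^T, so the problem can be
      // handed to the other storage-order kernel with Lower and Upper swapped.
      Eigen::RowVectorXd yT = x.transpose() * A.transpose().triangularView<Eigen::Upper>();
      std::cout << (y.transpose() - yT).norm() << "\n"; // ~0 up to rounding
    }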
diff --git a/Eigen/src/Core/products/TriangularMatrixVector_MKL.h b/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h
index 09f110da7..07bf26ce5 100644
--- a/Eigen/src/Core/products/TriangularMatrixVector_MKL.h
+++ b/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h
@@ -25,13 +25,13 @@
    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
  ********************************************************************************
- *   Content : Eigen bindings to Intel(R) MKL
+ *   Content : Eigen bindings to BLAS F77
  *   Triangular matrix-vector product functionality based on ?TRMV.
  ********************************************************************************
 */
 
-#ifndef EIGEN_TRIANGULAR_MATRIX_VECTOR_MKL_H
-#define EIGEN_TRIANGULAR_MATRIX_VECTOR_MKL_H
+#ifndef EIGEN_TRIANGULAR_MATRIX_VECTOR_BLAS_H
+#define EIGEN_TRIANGULAR_MATRIX_VECTOR_BLAS_H
 
 namespace Eigen {
 
@@ -47,7 +47,7 @@ template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename Rh
 struct triangular_matrix_vector_product_trmv :
   triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,StorageOrder,BuiltIn> {};
 
-#define EIGEN_MKL_TRMV_SPECIALIZE(Scalar) \
+#define EIGEN_BLAS_TRMV_SPECIALIZE(Scalar) \
 template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \
 struct triangular_matrix_vector_product<Index,Mode,Scalar,ConjLhs,Scalar,ConjRhs,ColMajor,Specialized> { \
  static void run(Index _rows, Index _cols, const Scalar* _lhs, Index lhsStride, \
@@ -65,13 +65,13 @@ struct triangular_matrix_vector_product<Index,Mode,Scalar,ConjLhs,Scalar,ConjRhs
  } \
 };
 
-EIGEN_MKL_TRMV_SPECIALIZE(double)
-EIGEN_MKL_TRMV_SPECIALIZE(float)
-EIGEN_MKL_TRMV_SPECIALIZE(dcomplex)
-EIGEN_MKL_TRMV_SPECIALIZE(scomplex)
+EIGEN_BLAS_TRMV_SPECIALIZE(double)
+EIGEN_BLAS_TRMV_SPECIALIZE(float)
+EIGEN_BLAS_TRMV_SPECIALIZE(dcomplex)
+EIGEN_BLAS_TRMV_SPECIALIZE(scomplex)
 
 // implements col-major: res += alpha * op(triangular) * vector
-#define EIGEN_MKL_TRMV_CM(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \
+#define EIGEN_BLAS_TRMV_CM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
 template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \
 struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,ColMajor> { \
  enum { \
@@ -105,17 +105,15 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,
    /* Square part handling */\
 \
    char trans, uplo, diag; \
-   MKL_INT m, n, lda, incx, incy; \
+   BlasIndex m, n, lda, incx, incy; \
    EIGTYPE const *a; \
-   MKLTYPE alpha_, beta_; \
-   assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); \
-   assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(beta_, EIGTYPE(1)); \
+   EIGTYPE beta(1); \
 \
    /* Set m, n */ \
-   n = (MKL_INT)size; \
-   lda = lhsStride; \
+   n = convert_index<BlasIndex>(size); \
+   lda = convert_index<BlasIndex>(lhsStride); \
    incx = 1; \
-   incy = resIncr; \
+   incy = convert_index<BlasIndex>(resIncr); \
 \
    /* Set uplo, trans and diag*/ \
    trans = 'N'; \
@@ -123,40 +121,39 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,
    diag = IsUnitDiag ? 'U' : 'N'; \
 \
    /* call ?TRMV*/ \
-   MKLPREFIX##trmv(&uplo, &trans, &diag, &n, (const MKLTYPE*)_lhs, &lda, (MKLTYPE*)x, &incx); \
+   BLASPREFIX##trmv_(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \
 \
    /* Add op(a_tr)rhs into res*/ \
-   MKLPREFIX##axpy(&n, &alpha_,(const MKLTYPE*)x, &incx, (MKLTYPE*)_res, &incy); \
-/* Non-square case - doesn't fit to MKL ?TRMV. Fall to default triangular product*/ \
+   BLASPREFIX##axpy_(&n, &numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \
+/* Non-square case - doesn't fit to BLAS ?TRMV. Fall to default triangular product*/ \
    if (size<(std::max)(rows,cols)) { \
-     typedef Matrix<EIGTYPE, Dynamic, Dynamic> MatrixLhs; \
      if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
      x = x_tmp.data(); \
      if (size<rows) { \
        y = _res + size*resIncr; \
        a = _lhs + size; \
-       m = rows-size; \
-       n = size; \
+       m = convert_index<BlasIndex>(rows-size); \
+       n = convert_index<BlasIndex>(size); \
      } \
      else { \
        x += size; \
        y = _res; \
        a = _lhs + size*lda; \
-       m = size; \
-       n = cols-size; \
+       m = convert_index<BlasIndex>(size); \
+       n = convert_index<BlasIndex>(cols-size); \
      } \
-     MKLPREFIX##gemv(&trans, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)x, &incx, &beta_, (MKLTYPE*)y, &incy); \
+     BLASPREFIX##gemv_(&trans, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, &numext::real_ref(beta), (BLASTYPE*)y, &incy); \
    } \
  } \
 };
 
-EIGEN_MKL_TRMV_CM(double, double, d, d)
-EIGEN_MKL_TRMV_CM(dcomplex, MKL_Complex16, cd, z)
-EIGEN_MKL_TRMV_CM(float, float, f, s)
-EIGEN_MKL_TRMV_CM(scomplex, MKL_Complex8, cf, c)
+EIGEN_BLAS_TRMV_CM(double, double, d, d)
+EIGEN_BLAS_TRMV_CM(dcomplex, double, cd, z)
+EIGEN_BLAS_TRMV_CM(float, float, f, s)
+EIGEN_BLAS_TRMV_CM(scomplex, float, cf, c)
 
 // implements row-major: res += alpha * op(triangular) * vector
-#define EIGEN_MKL_TRMV_RM(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \
+#define EIGEN_BLAS_TRMV_RM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
 template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \
 struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,RowMajor> { \
  enum { \
@@ -190,17 +187,15 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,
    /* Square part handling */\
 \
    char trans, uplo, diag; \
-   MKL_INT m, n, lda, incx, incy; \
+   BlasIndex m, n, lda, incx, incy; \
    EIGTYPE const *a; \
-   MKLTYPE alpha_, beta_; \
-   assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); \
-   assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(beta_, EIGTYPE(1)); \
+   EIGTYPE beta(1); \
 \
    /* Set m, n */ \
-   n = (MKL_INT)size; \
-   lda = lhsStride; \
+   n = convert_index<BlasIndex>(size); \
+   lda = convert_index<BlasIndex>(lhsStride); \
    incx = 1; \
-   incy = resIncr; \
+   incy = convert_index<BlasIndex>(resIncr); \
 \
    /* Set uplo, trans and diag*/ \
    trans = ConjLhs ? 'C' : 'T'; \
@@ -208,40 +203,39 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,
    diag = IsUnitDiag ? 'U' : 'N'; \
 \
    /* call ?TRMV*/ \
-   MKLPREFIX##trmv(&uplo, &trans, &diag, &n, (const MKLTYPE*)_lhs, &lda, (MKLTYPE*)x, &incx); \
+   BLASPREFIX##trmv_(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \
 \
    /* Add op(a_tr)rhs into res*/ \
-   MKLPREFIX##axpy(&n, &alpha_,(const MKLTYPE*)x, &incx, (MKLTYPE*)_res, &incy); \
-/* Non-square case - doesn't fit to MKL ?TRMV. Fall to default triangular product*/ \
+   BLASPREFIX##axpy_(&n, &numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \
+/* Non-square case - doesn't fit to BLAS ?TRMV. Fall to default triangular product*/ \
    if (size<(std::max)(rows,cols)) { \
-     typedef Matrix<EIGTYPE, Dynamic, Dynamic> MatrixLhs; \
     if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
      x = x_tmp.data(); \
      if (size<rows) { \
        y = _res + size*resIncr; \
        a = _lhs + size*lda; \
-       m = rows-size; \
-       n = size; \
+       m = convert_index<BlasIndex>(rows-size); \
+       n = convert_index<BlasIndex>(size); \
      } \
      else { \
        x += size; \
        y = _res; \
        a = _lhs + size; \
-       m = size; \
-       n = cols-size; \
+       m = convert_index<BlasIndex>(size); \
+       n = convert_index<BlasIndex>(cols-size); \
      } \
-     MKLPREFIX##gemv(&trans, &n, &m, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)x, &incx, &beta_, (MKLTYPE*)y, &incy); \
+     BLASPREFIX##gemv_(&trans, &n, &m, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, &numext::real_ref(beta), (BLASTYPE*)y, &incy); \
    } \
  } \
 };
 
-EIGEN_MKL_TRMV_RM(double, double, d, d)
-EIGEN_MKL_TRMV_RM(dcomplex, MKL_Complex16, cd, z)
-EIGEN_MKL_TRMV_RM(float, float, f, s)
-EIGEN_MKL_TRMV_RM(scomplex, MKL_Complex8, cf, c)
+EIGEN_BLAS_TRMV_RM(double, double, d, d)
+EIGEN_BLAS_TRMV_RM(dcomplex, double, cd, z)
+EIGEN_BLAS_TRMV_RM(float, float, f, s)
+EIGEN_BLAS_TRMV_RM(scomplex, float, cf, c)
 
 } // end namespace internal
 
 } // end namespace Eigen
 
-#endif // EIGEN_TRIANGULAR_MATRIX_VECTOR_MKL_H
+#endif // EIGEN_TRIANGULAR_MATRIX_VECTOR_BLAS_H
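The macros above generate thin wrappers around Fortran-77 BLAS entry points; the rename from ?trmv/?axpy to ?trmv_/?axpy_ reflects the usual F77 symbol convention: a trailing underscore and every argument passed by pointer. A standalone sketch of such a call against the reference BLAS, assuming the conventional dtrmv_ binding (link with -lblas; not Eigen code):

    // Declares and calls the F77 double-precision triangular matrix-vector product.
    extern "C" void dtrmv_(const char* uplo, const char* trans, const char* diag,
                           const int* n, const double* a, const int* lda,
                           double* x, const int* incx);

    int main()
    {
      // 3x3 column-major lower-triangular matrix A; the upper part is ignored.
      double A[9] = {1, 2, 3,   0, 4, 5,   0, 0, 6};
      double x[3] = {1, 1, 1};
      char uplo = 'L', trans = 'N', diag = 'N';
      int n = 3, lda = 3, incx = 1;
      dtrmv_(&uplo, &trans, &diag, &n, A, &lda, x, &incx);  // x <- A*x
      // x is now {1, 6, 14}
    }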
diff --git a/Eigen/src/Core/products/TriangularSolverMatrix.h b/Eigen/src/Core/products/TriangularSolverMatrix.h
index f103eae72..223c38b86 100644
--- a/Eigen/src/Core/products/TriangularSolverMatrix.h
+++ b/Eigen/src/Core/products/TriangularSolverMatrix.h
@@ -52,10 +52,14 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
     level3_blocking<Scalar,Scalar>& blocking)
   {
     Index cols = otherSize;
-    const_blas_data_mapper<Scalar, Index, TriStorageOrder> tri(_tri,triStride);
-    blas_data_mapper<Scalar, Index, ColMajor> other(_other,otherStride);
+
+    typedef const_blas_data_mapper<Scalar, Index, TriStorageOrder> TriMapper;
+    typedef blas_data_mapper<Scalar, Index, ColMajor> OtherMapper;
+    TriMapper tri(_tri, triStride);
+    OtherMapper other(_other, otherStride);
 
     typedef gebp_traits<Scalar,Scalar> Traits;
+
     enum {
       SmallPanelWidth   = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr),
       IsLower = (Mode&Lower) == Lower
@@ -66,22 +70,20 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
 
     std::size_t sizeA = kc*mc;
     std::size_t sizeB = kc*cols;
-    std::size_t sizeW = kc*Traits::WorkSpaceFactor;
 
     ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
     ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
-    ei_declare_aligned_stack_constructed_variable(Scalar, blockW, sizeW, blocking.blockW());
 
     conj_if<Conjugate> conj;
-    gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, Conjugate, false> gebp_kernel;
-    gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, TriStorageOrder> pack_lhs;
-    gemm_pack_rhs<Scalar, Index, Traits::nr, ColMajor, false, true> pack_rhs;
+    gebp_kernel<Scalar, Scalar, Index, OtherMapper, Traits::mr, Traits::nr, Conjugate, false> gebp_kernel;
+    gemm_pack_lhs<Scalar, Index, TriMapper, Traits::mr, Traits::LhsProgress, TriStorageOrder> pack_lhs;
+    gemm_pack_rhs<Scalar, Index, OtherMapper, Traits::nr, ColMajor, false, true> pack_rhs;
 
     // the goal here is to subdivise the Rhs panels such that we keep some cache
     // coherence when accessing the rhs elements
-    std::ptrdiff_t l1, l2;
-    manage_caching_sizes(GetAction, &l1, &l2);
-    Index subcols = cols>0 ? l2/(4 * sizeof(Scalar) * otherStride) : 0;
+    std::ptrdiff_t l1, l2, l3;
+    manage_caching_sizes(GetAction, &l1, &l2, &l3);
+    Index subcols = cols>0 ? l2/(4 * sizeof(Scalar) * std::max<Index>(otherStride,size)) : 0;
     subcols = std::max<Index>((subcols/Traits::nr)*Traits::nr, Traits::nr);
 
     for(Index k2=IsLower ? 0 : size;
@@ -115,8 +117,9 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
           {
             // TODO write a small kernel handling this (can be shared with trsv)
             Index i  = IsLower ? k2+k1+k : k2-k1-k-1;
-            Index s  = IsLower ? k2+k1 : i+1;
             Index rs = actualPanelWidth - k - 1; // remaining size
+            Index s  = TriStorageOrder==RowMajor ? (IsLower ? k2+k1 : i+1)
+                                                 :  IsLower ? i+1 : i-rs;
             Scalar a = (Mode & UnitDiag) ? Scalar(1) : Scalar(1)/conj(tri(i,i));
 
             for (Index j=j2; j<j2+actual_cols; ++j)
@@ -133,7 +136,6 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
               }
               else
               {
-                Index s = IsLower ? i+1 : i-rs;
                 Scalar b = (other(i,j) *= a);
                 Scalar* r = &other(s,j);
                 const Scalar* l = &tri(s,i);
@@ -148,17 +150,17 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
           Index blockBOffset = IsLower ? k1 : lengthTarget;
 
           // update the respective rows of B from other
-          pack_rhs(blockB+actual_kc*j2, &other(startBlock,j2), otherStride, actualPanelWidth, actual_cols, actual_kc, blockBOffset);
+          pack_rhs(blockB+actual_kc*j2, other.getSubMapper(startBlock,j2), actualPanelWidth, actual_cols, actual_kc, blockBOffset);
 
           // GEBP
           if (lengthTarget>0)
          {
             Index startTarget  = IsLower ? k2+k1+actualPanelWidth : k2-actual_kc;
 
-            pack_lhs(blockA, &tri(startTarget,startBlock), triStride, actualPanelWidth, lengthTarget);
+            pack_lhs(blockA, tri.getSubMapper(startTarget,startBlock), actualPanelWidth, lengthTarget);
 
-            gebp_kernel(&other(startTarget,j2), otherStride, blockA, blockB+actual_kc*j2, lengthTarget, actualPanelWidth, actual_cols, Scalar(-1),
-                        actualPanelWidth, actual_kc, 0, blockBOffset, blockW);
+            gebp_kernel(other.getSubMapper(startTarget,j2), blockA, blockB+actual_kc*j2, lengthTarget, actualPanelWidth, actual_cols, Scalar(-1),
+                        actualPanelWidth, actual_kc, 0, blockBOffset);
           }
         }
       }
@@ -172,16 +174,16 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
         const Index actual_mc = (std::min)(mc,end-i2);
         if (actual_mc>0)
         {
-          pack_lhs(blockA, &tri(i2, IsLower ? k2 : k2-kc), triStride, actual_kc, actual_mc);
+          pack_lhs(blockA, tri.getSubMapper(i2, IsLower ? k2 : k2-kc), actual_kc, actual_mc);
 
-          gebp_kernel(_other+i2, otherStride, blockA, blockB, actual_mc, actual_kc, cols, Scalar(-1), -1, -1, 0, 0, blockW);
+          gebp_kernel(other.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, Scalar(-1), -1, -1, 0, 0);
        }
       }
     }
   }
 }
 
-/* Optimized triangular solver with multiple left hand sides and the trinagular matrix on the right
+/* Optimized triangular solver with multiple left hand sides and the triangular matrix on the right
  */
 template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder>
 struct triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor>
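The hunks above port the left-sided blocked solver (op(triangular) * X = B) to the new data-mapper interface; the hunks that follow do the same for the right-sided variant declared just above. For reference, a minimal sketch of the public-API call that ends up in the left-sided kernel (standalone, not part of the patch):

    #include <Eigen/Dense>
    #include <cassert>

    int main()
    {
      Eigen::MatrixXd A = Eigen::MatrixXd::Random(6,6);
      A.diagonal().array() += 4;               // keep the triangular solve well conditioned
      Eigen::MatrixXd B = Eigen::MatrixXd::Random(6,3);

      Eigen::MatrixXd X = B;
      A.triangularView<Eigen::Lower>().solveInPlace(X);   // X <- L^{-1} * B

      assert((A.triangularView<Eigen::Lower>() * X - B).norm() < 1e-10);
    }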
@@ -200,8 +202,12 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
     level3_blocking<Scalar,Scalar>& blocking)
   {
     Index rows = otherSize;
-    const_blas_data_mapper<Scalar, Index, TriStorageOrder> rhs(_tri,triStride);
-    blas_data_mapper<Scalar, Index, ColMajor> lhs(_other,otherStride);
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+
+    typedef blas_data_mapper<Scalar, Index, ColMajor> LhsMapper;
+    typedef const_blas_data_mapper<Scalar, Index, TriStorageOrder> RhsMapper;
+    LhsMapper lhs(_other, otherStride);
+    RhsMapper rhs(_tri, triStride);
 
     typedef gebp_traits<Scalar,Scalar> Traits;
     enum {
@@ -215,17 +221,15 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
 
     std::size_t sizeA = kc*mc;
     std::size_t sizeB = kc*size;
-    std::size_t sizeW = kc*Traits::WorkSpaceFactor;
 
     ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
     ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
-    ei_declare_aligned_stack_constructed_variable(Scalar, blockW, sizeW, blocking.blockW());
 
     conj_if<Conjugate> conj;
-    gebp_kernel<Scalar,Scalar, Index, Traits::mr, Traits::nr, false, Conjugate> gebp_kernel;
-    gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
-    gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel;
-    gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, ColMajor, false, true> pack_lhs_panel;
+    gebp_kernel<Scalar, Scalar, Index, LhsMapper, Traits::mr, Traits::nr, false, Conjugate> gebp_kernel;
+    gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
+    gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr, RhsStorageOrder,false,true> pack_rhs_panel;
+    gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, ColMajor, false, true> pack_lhs_panel;
 
     for(Index k2=IsLower ? size : 0;
         IsLower ? k2>0 : k2<size;
@@ -238,7 +242,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
       Index rs = IsLower ? actual_k2 : size - actual_k2 - actual_kc;
       Scalar* geb = blockB+actual_kc*actual_kc;
 
-      if (rs>0) pack_rhs(geb, &rhs(actual_k2,startPanel), triStride, actual_kc, rs);
+      if (rs>0) pack_rhs(geb, rhs.getSubMapper(actual_k2,startPanel), actual_kc, rs);
 
       // triangular packing (we only pack the panels off the diagonal,
       // neglecting the blocks overlapping the diagonal
@@ -252,7 +256,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
 
         if (panelLength>0)
           pack_rhs_panel(blockB+j2*actual_kc,
-                         &rhs(actual_k2+panelOffset, actual_j2), triStride,
+                         rhs.getSubMapper(actual_k2+panelOffset, actual_j2),
                          panelLength, actualPanelWidth,
                          actual_kc, panelOffset);
       }
@@ -280,13 +284,12 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
             // GEBP
             if(panelLength>0)
             {
-              gebp_kernel(&lhs(i2,absolute_j2), otherStride,
+              gebp_kernel(lhs.getSubMapper(i2,absolute_j2),
                           blockA, blockB+j2*actual_kc,
                           actual_mc, panelLength, actualPanelWidth,
                           Scalar(-1),
                           actual_kc, actual_kc, // strides
-                          panelOffset, panelOffset, // offsets
-                          blockW);  // workspace
+                          panelOffset, panelOffset); // offsets
             }
 
             // unblocked triangular solve
@@ -302,22 +305,25 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
                 for (Index i=0; i<actual_mc; ++i)
                   r[i] -= a[i] * b;
               }
-              Scalar b = (Mode & UnitDiag) ? Scalar(1) : Scalar(1)/conj(rhs(j,j));
-              for (Index i=0; i<actual_mc; ++i)
-                r[i] *= b;
+              if((Mode & UnitDiag)==0)
+              {
+                Scalar inv_rjj = RealScalar(1)/conj(rhs(j,j));
+                for (Index i=0; i<actual_mc; ++i)
+                  r[i] *= inv_rjj;
+              }
             }
 
             // pack the just computed part of lhs to A
-            pack_lhs_panel(blockA, _other+absolute_j2*otherStride+i2, otherStride,
+            pack_lhs_panel(blockA, LhsMapper(_other+absolute_j2*otherStride+i2, otherStride),
                            actualPanelWidth, actual_mc,
                            actual_kc, j2);
           }
         }
 
         if (rs>0)
-          gebp_kernel(_other+i2+startPanel*otherStride, otherStride, blockA, geb,
+          gebp_kernel(lhs.getSubMapper(i2, startPanel), blockA, geb,
                       actual_mc, actual_kc, rs, Scalar(-1),
-                      -1, -1, 0, 0, blockW);
+                      -1, -1, 0, 0);
       }
     }
   }
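The right-sided variant computes X = B * T^{-1}, i.e. it solves X * T = B, packing the triangular factor as the "rhs" of the blocked product. A sketch of the equivalent public-API call (assumptions: well-conditioned random data; not part of the patch):

    #include <Eigen/Dense>
    #include <cassert>

    int main()
    {
      Eigen::MatrixXd T = Eigen::MatrixXd::Random(5,5);
      T.diagonal().array() += 5;               // keep the solve well conditioned
      Eigen::MatrixXd B = Eigen::MatrixXd::Random(3,5);

      // Right-sided solve: hits triangular_solve_matrix<...,OnTheRight,...>.
      Eigen::MatrixXd X = T.triangularView<Eigen::Upper>().solve<Eigen::OnTheRight>(B);

      assert((X * T.triangularView<Eigen::Upper>() - B).norm() < 1e-8);
    }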
diff --git a/Eigen/src/Core/products/TriangularSolverMatrix_MKL.h b/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h
index 6a0bb8339..88c0fb794 100644
--- a/Eigen/src/Core/products/TriangularSolverMatrix_MKL.h
+++ b/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h
@@ -25,20 +25,20 @@
    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
  ********************************************************************************
- *   Content : Eigen bindings to Intel(R) MKL
+ *   Content : Eigen bindings to BLAS F77
  *   Triangular matrix * matrix product functionality based on ?TRMM.
  ********************************************************************************
 */
 
-#ifndef EIGEN_TRIANGULAR_SOLVER_MATRIX_MKL_H
-#define EIGEN_TRIANGULAR_SOLVER_MATRIX_MKL_H
+#ifndef EIGEN_TRIANGULAR_SOLVER_MATRIX_BLAS_H
+#define EIGEN_TRIANGULAR_SOLVER_MATRIX_BLAS_H
 
 namespace Eigen {
 
 namespace internal {
 
 // implements LeftSide op(triangular)^-1 * general
-#define EIGEN_MKL_TRSM_L(EIGTYPE, MKLTYPE, MKLPREFIX) \
+#define EIGEN_BLAS_TRSM_L(EIGTYPE, BLASTYPE, BLASPREFIX) \
 template <typename Index, int Mode, bool Conjugate, int TriStorageOrder> \
 struct triangular_solve_matrix<EIGTYPE,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor> \
 { \
@@ -53,13 +53,11 @@ struct triangular_solve_matrix<EIGTYPE,Index,OnTheLeft,Mode,Conjugate,TriStorage
       const EIGTYPE* _tri, Index triStride, \
       EIGTYPE* _other, Index otherStride, level3_blocking<EIGTYPE,EIGTYPE>& /*blocking*/) \
   { \
-   MKL_INT m = size, n = otherSize, lda, ldb; \
+   BlasIndex m = convert_index<BlasIndex>(size), n = convert_index<BlasIndex>(otherSize), lda, ldb; \
    char side = 'L', uplo, diag='N', transa; \
    /* Set alpha_ */ \
-   MKLTYPE alpha; \
-   EIGTYPE myone(1); \
-   assign_scalar_eig2mkl(alpha, myone); \
-   ldb = otherStride;\
+   EIGTYPE alpha(1); \
+   ldb = convert_index<BlasIndex>(otherStride);\
 \
    const EIGTYPE *a; \
 /* Set trans */ \
@@ -75,25 +73,25 @@ struct triangular_solve_matrix<EIGTYPE,Index,OnTheLeft,Mode,Conjugate,TriStorage
    if (conjA) { \
      a_tmp = tri.conjugate(); \
      a = a_tmp.data(); \
-     lda = a_tmp.outerStride(); \
+     lda = convert_index<BlasIndex>(a_tmp.outerStride()); \
    } else { \
      a = _tri; \
-     lda = triStride; \
+     lda = convert_index<BlasIndex>(triStride); \
    } \
    if (IsUnitDiag) diag='U'; \
 /* call ?trsm*/ \
-   MKLPREFIX##trsm(&side, &uplo, &transa, &diag, &m, &n, &alpha, (const MKLTYPE*)a, &lda, (MKLTYPE*)_other, &ldb); \
+   BLASPREFIX##trsm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)_other, &ldb); \
  } \
 };
 
-EIGEN_MKL_TRSM_L(double, double, d)
-EIGEN_MKL_TRSM_L(dcomplex, MKL_Complex16, z)
-EIGEN_MKL_TRSM_L(float, float, s)
-EIGEN_MKL_TRSM_L(scomplex, MKL_Complex8, c)
+EIGEN_BLAS_TRSM_L(double, double, d)
+EIGEN_BLAS_TRSM_L(dcomplex, double, z)
+EIGEN_BLAS_TRSM_L(float, float, s)
+EIGEN_BLAS_TRSM_L(scomplex, float, c)
 
 // implements RightSide general * op(triangular)^-1
-#define EIGEN_MKL_TRSM_R(EIGTYPE, MKLTYPE, MKLPREFIX) \
+#define EIGEN_BLAS_TRSM_R(EIGTYPE, BLASTYPE, BLASPREFIX) \
 template <typename Index, int Mode, bool Conjugate, int TriStorageOrder> \
 struct triangular_solve_matrix<EIGTYPE,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor> \
 { \
@@ -108,13 +106,11 @@ struct triangular_solve_matrix<EIGTYPE,Index,OnTheRight,Mode,Conjugate,TriStorag
       const EIGTYPE* _tri, Index triStride, \
       EIGTYPE* _other, Index otherStride, level3_blocking<EIGTYPE,EIGTYPE>& /*blocking*/) \
   { \
-   MKL_INT m = otherSize, n = size, lda, ldb; \
+   BlasIndex m = convert_index<BlasIndex>(otherSize), n = convert_index<BlasIndex>(size), lda, ldb; \
    char side = 'R', uplo, diag='N', transa; \
    /* Set alpha_ */ \
-   MKLTYPE alpha; \
-   EIGTYPE myone(1); \
-   assign_scalar_eig2mkl(alpha, myone); \
-   ldb = otherStride;\
+   EIGTYPE alpha(1); \
+   ldb = convert_index<BlasIndex>(otherStride);\
 \
    const EIGTYPE *a; \
 /* Set trans */ \
@@ -130,26 +126,26 @@ struct triangular_solve_matrix<EIGTYPE,Index,OnTheRight,Mode,Conjugate,TriStorag
    if (conjA) { \
      a_tmp = tri.conjugate(); \
      a = a_tmp.data(); \
-     lda = a_tmp.outerStride(); \
+     lda = convert_index<BlasIndex>(a_tmp.outerStride()); \
    } else { \
      a = _tri; \
-     lda = triStride; \
+     lda = convert_index<BlasIndex>(triStride); \
    } \
    if (IsUnitDiag) diag='U'; \
 /* call ?trsm*/ \
-   MKLPREFIX##trsm(&side, &uplo, &transa, &diag, &m, &n, &alpha, (const MKLTYPE*)a, &lda, (MKLTYPE*)_other, &ldb); \
+   BLASPREFIX##trsm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)_other, &ldb); \
    /*std::cout << "TRMS_L specialization!\n";*/ \
  } \
 };
 
-EIGEN_MKL_TRSM_R(double, double, d)
-EIGEN_MKL_TRSM_R(dcomplex, MKL_Complex16, z)
-EIGEN_MKL_TRSM_R(float, float, s)
-EIGEN_MKL_TRSM_R(scomplex, MKL_Complex8, c)
+EIGEN_BLAS_TRSM_R(double, double, d)
+EIGEN_BLAS_TRSM_R(dcomplex, double, z)
+EIGEN_BLAS_TRSM_R(float, float, s)
+EIGEN_BLAS_TRSM_R(scomplex, float, c)
 
 } // end namespace internal
 
 } // end namespace Eigen
 
-#endif // EIGEN_TRIANGULAR_SOLVER_MATRIX_MKL_H
+#endif // EIGEN_TRIANGULAR_SOLVER_MATRIX_BLAS_H
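As with the TRMV bindings, these macros now forward to the F77 ?trsm entry points. A standalone sketch of the raw call, assuming the conventional dtrsm_ symbol (link with -lblas; not Eigen code):

    // F77 TRSM solves op(A)*X = alpha*B (or X*op(A) = alpha*B) in place of B.
    extern "C" void dtrsm_(const char* side, const char* uplo, const char* transa,
                           const char* diag, const int* m, const int* n,
                           const double* alpha, const double* a, const int* lda,
                           double* b, const int* ldb);

    int main()
    {
      // Column-major 2x2 lower-triangular A = [[2,0],[1,4]], B is one column.
      double A[4] = {2, 1,  0, 4};
      double B[2] = {2, 9};
      char side = 'L', uplo = 'L', transa = 'N', diag = 'N';
      int m = 2, n = 1, lda = 2, ldb = 2;
      double alpha = 1.0;
      dtrsm_(&side, &uplo, &transa, &diag, &m, &n, &alpha, A, &lda, B, &ldb);
      // B now holds X with A*X = B: x0 = 1, x1 = (9 - 1*1)/4 = 2
    }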
diff --git a/Eigen/src/Core/products/TriangularSolverVector.h b/Eigen/src/Core/products/TriangularSolverVector.h
index ce4d10088..b994759b2 100644
--- a/Eigen/src/Core/products/TriangularSolverVector.h
+++ b/Eigen/src/Core/products/TriangularSolverVector.h
@@ -10,7 +10,7 @@
 #ifndef EIGEN_TRIANGULAR_SOLVER_VECTOR_H
 #define EIGEN_TRIANGULAR_SOLVER_VECTOR_H
 
-namespace Eigen { 
+namespace Eigen {
 
 namespace internal {
 
@@ -25,7 +25,7 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheRight, Mode, Co
       >::run(size, _lhs, lhsStride, rhs);
   }
 };
-    
+
 // forward and backward substitution, row-major, rhs is a vector
 template<typename LhsScalar, typename RhsScalar, typename Index, int Mode, bool Conjugate>
 struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Conjugate, RowMajor>
@@ -37,6 +37,10 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
   {
     typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,RowMajor>, 0, OuterStride<> > LhsMap;
     const LhsMap lhs(_lhs,size,size,OuterStride<>(lhsStride));
+
+    typedef const_blas_data_mapper<LhsScalar,Index,RowMajor> LhsMapper;
+    typedef const_blas_data_mapper<RhsScalar,Index,ColMajor> RhsMapper;
+
     typename internal::conditional<
                           Conjugate,
                           const CwiseUnaryOp<typename internal::scalar_conjugate_op<LhsScalar>,LhsMap>,
@@ -58,10 +62,10 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
           Index startRow = IsLower ? pi : pi-actualPanelWidth;
           Index startCol = IsLower ? 0 : pi;
 
-          general_matrix_vector_product<Index,LhsScalar,RowMajor,Conjugate,RhsScalar,false>::run(
+          general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,Conjugate,RhsScalar,RhsMapper,false>::run(
             actualPanelWidth, r,
-            &lhs.coeffRef(startRow,startCol), lhsStride,
-            rhs + startCol, 1,
+            LhsMapper(&lhs.coeffRef(startRow,startCol), lhsStride),
+            RhsMapper(rhs + startCol, 1),
             rhs + startRow, 1,
             RhsScalar(-1));
         }
@@ -72,7 +76,7 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
           Index s = IsLower ? pi : i+1;
           if (k>0)
             rhs[i] -= (cjLhs.row(i).segment(s,k).transpose().cwiseProduct(Map<const Matrix<RhsScalar,Dynamic,1> >(rhs+s,k))).sum();
-          
+
           if(!(Mode & UnitDiag))
             rhs[i] /= cjLhs(i,i);
         }
@@ -91,6 +95,8 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
   {
     typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,ColMajor>, 0, OuterStride<> > LhsMap;
     const LhsMap lhs(_lhs,size,size,OuterStride<>(lhsStride));
+    typedef const_blas_data_mapper<LhsScalar,Index,ColMajor> LhsMapper;
+    typedef const_blas_data_mapper<RhsScalar,Index,ColMajor> RhsMapper;
     typename internal::conditional<Conjugate,
                                    const CwiseUnaryOp<typename internal::scalar_conjugate_op<LhsScalar>,LhsMap>,
                                    const LhsMap&
@@ -122,10 +128,10 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
         // let's directly call the low level product function because:
         // 1 - it is faster to compile
         // 2 - it is slightly faster at runtime
-        general_matrix_vector_product<Index,LhsScalar,ColMajor,Conjugate,RhsScalar,false>::run(
+        general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,Conjugate,RhsScalar,RhsMapper,false>::run(
           r, actualPanelWidth,
-          &lhs.coeffRef(endBlock,startBlock), lhsStride,
-          rhs+startBlock, 1,
+          LhsMapper(&lhs.coeffRef(endBlock,startBlock), lhsStride),
+          RhsMapper(rhs+startBlock, 1),
           rhs+endBlock, 1,
           RhsScalar(-1));
       }
     }
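The scalar loop underneath these panel updates is ordinary forward (or backward) substitution; the kernel merely blocks the traversal and delegates the off-diagonal panels to general_matrix_vector_product, as shown above. A plain C++ sketch of the unblocked lower-triangular, column-major case (illustration only, not the actual kernel):

    #include <cstddef>

    // Overwrites rhs with L^{-1} * rhs for a lower-triangular, non-unit-diagonal
    // matrix L stored column-major with leading dimension lda.
    void lower_solve_in_place(const double* L, std::size_t lda,
                              double* rhs, std::size_t n)
    {
      for (std::size_t i = 0; i < n; ++i) {
        double acc = rhs[i];
        for (std::size_t j = 0; j < i; ++j)
          acc -= L[i + j*lda] * rhs[j];   // column-major access of L(i,j)
        rhs[i] = acc / L[i + i*lda];      // divide by the diagonal entry
      }
    }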