Diffstat (limited to 'Eigen/src/Core/products')
21 files changed, 1977 insertions, 1367 deletions
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 45230bce5..f35b760c1 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -15,7 +15,13 @@ namespace Eigen { namespace internal { -template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false> +enum GEBPPacketSizeType { + GEBPPacketFull = 0, + GEBPPacketHalf, + GEBPPacketQuarter +}; + +template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false, int Arch=Architecture::Target, int _PacketSize=GEBPPacketFull> class gebp_traits; @@ -25,16 +31,42 @@ inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff return a<=0 ? b : a; } +#if defined(EIGEN_DEFAULT_L1_CACHE_SIZE) +#define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) EIGEN_DEFAULT_L1_CACHE_SIZE +#else +#define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) val +#endif // defined(EIGEN_DEFAULT_L1_CACHE_SIZE) + +#if defined(EIGEN_DEFAULT_L2_CACHE_SIZE) +#define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) EIGEN_DEFAULT_L2_CACHE_SIZE +#else +#define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) val +#endif // defined(EIGEN_DEFAULT_L2_CACHE_SIZE) + +#if defined(EIGEN_DEFAULT_L3_CACHE_SIZE) +#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) EIGEN_DEFAULT_L3_CACHE_SIZE +#else +#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) val +#endif // defined(EIGEN_DEFAULT_L3_CACHE_SIZE) + #if EIGEN_ARCH_i386_OR_x86_64 -const std::ptrdiff_t defaultL1CacheSize = 32*1024; -const std::ptrdiff_t defaultL2CacheSize = 256*1024; -const std::ptrdiff_t defaultL3CacheSize = 2*1024*1024; +const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(32*1024); +const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(256*1024); +const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(2*1024*1024); +#elif EIGEN_ARCH_PPC +const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(64*1024); +const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512*1024); +const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(4*1024*1024); #else -const std::ptrdiff_t defaultL1CacheSize = 16*1024; -const std::ptrdiff_t defaultL2CacheSize = 512*1024; -const std::ptrdiff_t defaultL3CacheSize = 512*1024; +const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(16*1024); +const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512*1024); +const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(512*1024); #endif +#undef EIGEN_SET_DEFAULT_L1_CACHE_SIZE +#undef EIGEN_SET_DEFAULT_L2_CACHE_SIZE +#undef EIGEN_SET_DEFAULT_L3_CACHE_SIZE + /** \internal */ struct CacheSizes { CacheSizes(): m_l1(-1),m_l2(-1),m_l3(-1) { @@ -50,7 +82,6 @@ struct CacheSizes { std::ptrdiff_t m_l3; }; - /** \internal */ inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3) { @@ -101,6 +132,16 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n // at the register level. This small horizontal panel has to stay within L1 cache. std::ptrdiff_t l1, l2, l3; manage_caching_sizes(GetAction, &l1, &l2, &l3); + #ifdef EIGEN_VECTORIZE_AVX512 + // We need to find a rationale for that, but without this adjustment, + // performance with AVX512 is pretty bad, like -20% slower. 
+ // One reason is that with increasing packet-size, the blocking size k + // has to become pretty small if we want that 1 lhs panel fit within L1. + // For instance, with the 3pX4 kernel and double, the size of the lhs+rhs panels are: + // k*(3*64 + 4*8) Bytes, with l1=32kBytes, and k%8=0, we have k=144. + // This is quite small for a good reuse of the accumulation registers. + l1 *= 4; + #endif if (num_threads > 1) { typedef typename Traits::ResScalar ResScalar; @@ -115,7 +156,8 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n // registers. However once the latency is hidden there is no point in // increasing the value of k, so we'll cap it at 320 (value determined // experimentally). - const Index k_cache = (numext::mini<Index>)((l1-ksub)/kdiv, 320); + // To avoid that k vanishes, we make k_cache at least as big as kr + const Index k_cache = numext::maxi<Index>(kr, (numext::mini<Index>)((l1-ksub)/kdiv, 320)); if (k_cache < k) { k = k_cache - (k_cache % kr); eigen_internal_assert(k > 0); @@ -307,35 +349,60 @@ inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_ computeProductBlockingSizes<LhsScalar,RhsScalar,1,Index>(k, m, n, num_threads); } -#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD - #define CJMADD(CJ,A,B,C,T) C = CJ.pmadd(A,B,C); -#else - - // FIXME (a bit overkill maybe ?) - - template<typename CJ, typename A, typename B, typename C, typename T> struct gebp_madd_selector { - EIGEN_ALWAYS_INLINE static void run(const CJ& cj, A& a, B& b, C& c, T& /*t*/) - { - c = cj.pmadd(a,b,c); - } - }; - - template<typename CJ, typename T> struct gebp_madd_selector<CJ,T,T,T,T> { - EIGEN_ALWAYS_INLINE static void run(const CJ& cj, T& a, T& b, T& c, T& t) - { - t = b; t = cj.pmul(a,t); c = padd(c,t); - } - }; +template <typename RhsPacket, typename RhsPacketx4, int registers_taken> +struct RhsPanelHelper { + private: + static const int remaining_registers = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS - registers_taken; + public: + typedef typename conditional<remaining_registers>=4, RhsPacketx4, RhsPacket>::type type; +}; - template<typename CJ, typename A, typename B, typename C, typename T> - EIGEN_STRONG_INLINE void gebp_madd(const CJ& cj, A& a, B& b, C& c, T& t) - { - gebp_madd_selector<CJ,A,B,C,T>::run(cj,a,b,c,t); - } +template <typename Packet> +struct QuadPacket +{ + Packet B_0, B1, B2, B3; + const Packet& get(const FixedInt<0>&) const { return B_0; } + const Packet& get(const FixedInt<1>&) const { return B1; } + const Packet& get(const FixedInt<2>&) const { return B2; } + const Packet& get(const FixedInt<3>&) const { return B3; } +}; - #define CJMADD(CJ,A,B,C,T) gebp_madd(CJ,A,B,C,T); -// #define CJMADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = padd(C,T); -#endif +template <int N, typename T1, typename T2, typename T3> +struct packet_conditional { typedef T3 type; }; + +template <typename T1, typename T2, typename T3> +struct packet_conditional<GEBPPacketFull, T1, T2, T3> { typedef T1 type; }; + +template <typename T1, typename T2, typename T3> +struct packet_conditional<GEBPPacketHalf, T1, T2, T3> { typedef T2 type; }; + +#define PACKET_DECL_COND_PREFIX(prefix, name, packet_size) \ + typedef typename packet_conditional<packet_size, \ + typename packet_traits<name ## Scalar>::type, \ + typename packet_traits<name ## Scalar>::half, \ + typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \ + prefix ## name ## Packet + +#define PACKET_DECL_COND(name, packet_size) \ + typedef typename 
packet_conditional<packet_size, \ + typename packet_traits<name ## Scalar>::type, \ + typename packet_traits<name ## Scalar>::half, \ + typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \ + name ## Packet + +#define PACKET_DECL_COND_SCALAR_PREFIX(prefix, packet_size) \ + typedef typename packet_conditional<packet_size, \ + typename packet_traits<Scalar>::type, \ + typename packet_traits<Scalar>::half, \ + typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type \ + prefix ## ScalarPacket + +#define PACKET_DECL_COND_SCALAR(packet_size) \ + typedef typename packet_conditional<packet_size, \ + typename packet_traits<Scalar>::type, \ + typename packet_traits<Scalar>::half, \ + typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type \ + ScalarPacket /* Vectorization logic * real*real: unpack rhs to constant packets, ... @@ -347,7 +414,7 @@ inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_ * cplx*real : unpack rhs to constant packets, ... * real*cplx : load lhs as (a0,a0,a1,a1), and mul as usual */ -template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs> +template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs, int Arch, int _PacketSize> class gebp_traits { public: @@ -355,13 +422,17 @@ public: typedef _RhsScalar RhsScalar; typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar; + PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Res, _PacketSize); + enum { ConjLhs = _ConjLhs, ConjRhs = _ConjRhs, - Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable, - LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1, - RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1, - ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1, + Vectorizable = unpacket_traits<_LhsPacket>::vectorizable && unpacket_traits<_RhsPacket>::vectorizable, + LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1, + RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1, + ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1, NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, @@ -370,10 +441,12 @@ public: // register block size along the M direction (currently, this one cannot be modified) default_mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize, -#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) - // we assume 16 registers +#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) \ + && ((!EIGEN_COMP_MSVC) || (EIGEN_COMP_MSVC>=1914)) + // we assume 16 registers or more // See bug 992, if the scalar type is not vectorizable but that EIGEN_HAS_SINGLE_INSTRUCTION_MADD is defined, // then using 3*LhsPacketSize triggers non-implemented paths in syrk. + // Bug 1515: MSVC prior to v19.14 yields to register spilling. mr = Vectorizable ? 
3*LhsPacketSize : default_mr, #else mr = default_mr, @@ -383,37 +456,41 @@ public: RhsProgress = 1 }; - typedef typename packet_traits<LhsScalar>::type _LhsPacket; - typedef typename packet_traits<RhsScalar>::type _RhsPacket; - typedef typename packet_traits<ResScalar>::type _ResPacket; typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket; typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket; typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket; + typedef LhsPacket LhsPacket4Packing; + typedef QuadPacket<RhsPacket> RhsPacketx4; typedef ResPacket AccPacket; EIGEN_STRONG_INLINE void initAcc(AccPacket& p) { p = pset1<ResPacket>(ResScalar(0)); } - - EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) - { - pbroadcast4(b, b0, b1, b2, b3); - } - -// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) -// { -// pbroadcast2(b, b0, b1); -// } - + template<typename RhsPacketType> EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const { dest = pset1<RhsPacketType>(*b); } - + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const + { + pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3); + } + + template<typename RhsPacketType> + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const + { + loadRhs(b, dest); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const + { + } + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { dest = ploadquad<RhsPacket>(b); @@ -431,8 +508,8 @@ public: dest = ploadu<LhsPacketType>(a); } - template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType> - EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, AccPacketType& tmp) const + template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType> + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const { conj_helper<LhsPacketType,RhsPacketType,ConjLhs,ConjRhs> cj; // It would be a lot cleaner to call pmadd all the time. 
Unfortunately if we @@ -447,6 +524,12 @@ public: #endif } + template<typename LhsPacketType, typename AccPacketType, typename LaneIdType> + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const + { + madd(a, b.get(lane), c, tmp, lane); + } + EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const { r = pmadd(c,alpha,r); @@ -460,21 +543,25 @@ public: }; -template<typename RealScalar, bool _ConjLhs> -class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false> +template<typename RealScalar, bool _ConjLhs, int Arch, int _PacketSize> +class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false, Arch, _PacketSize> { public: typedef std::complex<RealScalar> LhsScalar; typedef RealScalar RhsScalar; typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar; + PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Res, _PacketSize); + enum { ConjLhs = _ConjLhs, ConjRhs = false, - Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable, - LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1, - RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1, - ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1, + Vectorizable = unpacket_traits<_LhsPacket>::vectorizable && unpacket_traits<_RhsPacket>::vectorizable, + LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1, + RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1, + ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1, NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, nr = 4, @@ -489,13 +576,12 @@ public: RhsProgress = 1 }; - typedef typename packet_traits<LhsScalar>::type _LhsPacket; - typedef typename packet_traits<RhsScalar>::type _RhsPacket; - typedef typename packet_traits<ResScalar>::type _ResPacket; - typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket; typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket; typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket; + typedef LhsPacket LhsPacket4Packing; + + typedef QuadPacket<RhsPacket> RhsPacketx4; typedef ResPacket AccPacket; @@ -504,42 +590,64 @@ public: p = pset1<ResPacket>(ResScalar(0)); } - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const + template<typename RhsPacketType> + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const { - dest = pset1<RhsPacket>(*b); + dest = pset1<RhsPacketType>(*b); + } + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const + { + pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3); } + + template<typename RhsPacketType> + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const + { + loadRhs(b, dest); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const + {} EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { - dest = pset1<RhsPacket>(*b); + loadRhsQuad_impl(b,dest, typename conditional<RhsPacketSize==16,true_type,false_type>::type()); } - EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const + EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const true_type&) const { - dest = pload<LhsPacket>(a); + // 
FIXME we can do better! + // what we want here is a ploadheight + RhsScalar tmp[4] = {b[0],b[0],b[1],b[1]}; + dest = ploadquad<RhsPacket>(tmp); } - EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const + EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const false_type&) const { - dest = ploadu<LhsPacket>(a); + eigen_internal_assert(RhsPacketSize<=8); + dest = pset1<RhsPacket>(*b); } - EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) + EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const { - pbroadcast4(b, b0, b1, b2, b3); + dest = pload<LhsPacket>(a); } - -// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) -// { -// pbroadcast2(b, b0, b1); -// } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const + template<typename LhsPacketType> + EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const + { + dest = ploadu<LhsPacketType>(a); + } + + template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType> + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const { madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type()); } - EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const + template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType> + EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const true_type&) const { #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD EIGEN_UNUSED_VARIABLE(tmp); @@ -554,13 +662,20 @@ public: c += a * b; } - EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const + template<typename LhsPacketType, typename AccPacketType, typename LaneIdType> + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const { + madd(a, b.get(lane), c, tmp, lane); + } + + template <typename ResPacketType, typename AccPacketType> + EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const + { + conj_helper<ResPacketType,ResPacketType,ConjLhs,false> cj; r = cj.pmadd(c,alpha,r); } protected: - conj_helper<ResPacket,ResPacket,ConjLhs,false> cj; }; template<typename Packet> @@ -579,13 +694,57 @@ DoublePacket<Packet> padd(const DoublePacket<Packet> &a, const DoublePacket<Pack return res; } +// note that for DoublePacket<RealPacket> the "4" in "downto4" +// corresponds to the number of complexes, so it means "8" +// in terms of real coefficients.
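
To make the comment above concrete: a DoublePacket keeps the real and imaginary parts of a group of complex coefficients in two separate real packets, so halving a DoublePacket halves the number of complex coefficients it represents (hence "4" complexes means 8 real coefficients). Here is a minimal scalar sketch of these semantics, assuming the half-reduction folds the upper half of a packet onto the lower half as predux_half_dowto4 does for plain packets; all names are illustrative stand-ins, not Eigen API:

    #include <array>

    // Scalar stand-in for a real SIMD packet of N coefficients.
    template<int N> using RealPacketModel = std::array<float, N>;

    // Stand-in for DoublePacket: N complex values split into real/imag planes.
    template<int N> struct DoublePacketModel {
      RealPacketModel<N> first;   // re(0) .. re(N-1)
      RealPacketModel<N> second;  // im(0) .. im(N-1)
    };

    // Half-reduction: add the upper half of the packet onto the lower half.
    template<int N>
    RealPacketModel<N/2> predux_half_model(const RealPacketModel<N>& a) {
      RealPacketModel<N/2> r{};
      for (int i = 0; i < N/2; ++i) r[i] = a[i] + a[i + N/2];
      return r;
    }

    // Applied plane-wise to a DoublePacket: going from N to N/2 reals per
    // plane drops from N to N/2 complexes overall, which is why the "4" in
    // "downto4" counts complex coefficients (8 real ones).
    template<int N>
    DoublePacketModel<N/2> predux_half_model(const DoublePacketModel<N>& a) {
      return { predux_half_model(a.first), predux_half_model(a.second) };
    }

The overloads added below implement exactly this split: DoublePackets over packets of at most 8 coefficients pass through unchanged, while 16-wide packets are folded once.
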
+ template<typename Packet> -const DoublePacket<Packet>& predux_downto4(const DoublePacket<Packet> &a) +const DoublePacket<Packet>& +predux_half_dowto4(const DoublePacket<Packet> &a, + typename enable_if<unpacket_traits<Packet>::size<=8>::type* = 0) { return a; } -template<typename Packet> struct unpacket_traits<DoublePacket<Packet> > { typedef DoublePacket<Packet> half; }; +template<typename Packet> +DoublePacket<typename unpacket_traits<Packet>::half> +predux_half_dowto4(const DoublePacket<Packet> &a, + typename enable_if<unpacket_traits<Packet>::size==16>::type* = 0) +{ + // yes, that's pretty hackish :( + DoublePacket<typename unpacket_traits<Packet>::half> res; + typedef std::complex<typename unpacket_traits<Packet>::type> Cplx; + typedef typename packet_traits<Cplx>::type CplxPacket; + res.first = predux_half_dowto4(CplxPacket(a.first)).v; + res.second = predux_half_dowto4(CplxPacket(a.second)).v; + return res; +} + +// same here, "quad" actually means "8" in terms of real coefficients +template<typename Scalar, typename RealPacket> +void loadQuadToDoublePacket(const Scalar* b, DoublePacket<RealPacket>& dest, + typename enable_if<unpacket_traits<RealPacket>::size<=8>::type* = 0) +{ + dest.first = pset1<RealPacket>(numext::real(*b)); + dest.second = pset1<RealPacket>(numext::imag(*b)); +} + +template<typename Scalar, typename RealPacket> +void loadQuadToDoublePacket(const Scalar* b, DoublePacket<RealPacket>& dest, + typename enable_if<unpacket_traits<RealPacket>::size==16>::type* = 0) +{ + // yes, that's pretty hackish too :( + typedef typename NumTraits<Scalar>::Real RealScalar; + RealScalar r[4] = {numext::real(b[0]), numext::real(b[0]), numext::real(b[1]), numext::real(b[1])}; + RealScalar i[4] = {numext::imag(b[0]), numext::imag(b[0]), numext::imag(b[1]), numext::imag(b[1])}; + dest.first = ploadquad<RealPacket>(r); + dest.second = ploadquad<RealPacket>(i); +} + + +template<typename Packet> struct unpacket_traits<DoublePacket<Packet> > { + typedef DoublePacket<typename unpacket_traits<Packet>::half> half; +}; // template<typename Packet> // DoublePacket<Packet> pmadd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b) // { @@ -595,8 +754,8 @@ template<typename Packet> struct unpacket_traits<DoublePacket<Packet> > { typede // return res; // } -template<typename RealScalar, bool _ConjLhs, bool _ConjRhs> -class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs > +template<typename RealScalar, bool _ConjLhs, bool _ConjRhs, int Arch, int _PacketSize> +class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs, Arch, _PacketSize > { public: typedef std::complex<RealScalar> Scalar; @@ -604,15 +763,21 @@ public: typedef std::complex<RealScalar> RhsScalar; typedef std::complex<RealScalar> ResScalar; + PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Res, _PacketSize); + PACKET_DECL_COND(Real, _PacketSize); + PACKET_DECL_COND_SCALAR(_PacketSize); + enum { ConjLhs = _ConjLhs, ConjRhs = _ConjRhs, - Vectorizable = packet_traits<RealScalar>::Vectorizable - && packet_traits<Scalar>::Vectorizable, - RealPacketSize = Vectorizable ? packet_traits<RealScalar>::size : 1, - ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1, - LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1, - RhsPacketSize = Vectorizable ? 
packet_traits<RhsScalar>::size : 1, + Vectorizable = unpacket_traits<RealPacket>::vectorizable + && unpacket_traits<ScalarPacket>::vectorizable, + ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1, + LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1, + RhsPacketSize = Vectorizable ? unpacket_traits<RhsScalar>::size : 1, + RealPacketSize = Vectorizable ? unpacket_traits<RealPacket>::size : 1, // FIXME: should depend on NumberOfRegisters nr = 4, @@ -622,14 +787,16 @@ public: RhsProgress = 1 }; - typedef typename packet_traits<RealScalar>::type RealPacket; - typedef typename packet_traits<Scalar>::type ScalarPacket; - typedef DoublePacket<RealPacket> DoublePacketType; + typedef DoublePacket<RealPacket> DoublePacketType; + typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type LhsPacket4Packing; typedef typename conditional<Vectorizable,RealPacket, Scalar>::type LhsPacket; typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type RhsPacket; typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type ResPacket; typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type AccPacket; + + // this actually holds 8 packets! + typedef QuadPacket<RhsPacket> RhsPacketx4; EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); } @@ -640,51 +807,49 @@ public: } // Scalar path - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ScalarPacket& dest) const { - dest = pset1<ResPacket>(*b); + dest = pset1<ScalarPacket>(*b); } // Vectorized path - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacketType& dest) const + template<typename RealPacketType> + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket<RealPacketType>& dest) const { - dest.first = pset1<RealPacket>(real(*b)); - dest.second = pset1<RealPacket>(imag(*b)); + dest.first = pset1<RealPacketType>(numext::real(*b)); + dest.second = pset1<RealPacketType>(numext::imag(*b)); } - - EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { - loadRhs(b,dest); + loadRhs(b, dest.B_0); + loadRhs(b + 1, dest.B1); + loadRhs(b + 2, dest.B2); + loadRhs(b + 3, dest.B3); } - EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const + + // Scalar path + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, ScalarPacket& dest) const { - eigen_internal_assert(unpacket_traits<ScalarPacket>::size<=4); - loadRhs(b,dest); + loadRhs(b, dest); } - - EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) + + // Vectorized path + template<typename RealPacketType> + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, DoublePacket<RealPacketType>& dest) const { - // FIXME not sure that's the best way to implement it! - loadRhs(b+0, b0); - loadRhs(b+1, b1); - loadRhs(b+2, b2); - loadRhs(b+3, b3); + loadRhs(b, dest); } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {} - // Vectorized path - EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, DoublePacketType& b0, DoublePacketType& b1) + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const { - // FIXME not sure that's the best way to implement it!
- loadRhs(b+0, b0); - loadRhs(b+1, b1); + loadRhs(b,dest); } - - // Scalar path - EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsScalar& b0, RhsScalar& b1) + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const { - // FIXME not sure that's the best way to implement it! - loadRhs(b+0, b0); - loadRhs(b+1, b1); + loadQuadToDoublePacket(b,dest); } // nothing special here @@ -693,47 +858,59 @@ public: dest = pload<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a)); } - EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const + template<typename LhsPacketType> + EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const { - dest = ploadu<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a)); + dest = ploadu<LhsPacketType>((const typename unpacket_traits<LhsPacketType>::type*)(a)); } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacketType& c, RhsPacket& /*tmp*/) const + template<typename LhsPacketType, typename RhsPacketType, typename ResPacketType, typename TmpType, typename LaneIdType> + EIGEN_STRONG_INLINE + typename enable_if<!is_same<RhsPacketType,RhsPacketx4>::value>::type + madd(const LhsPacketType& a, const RhsPacketType& b, DoublePacket<ResPacketType>& c, TmpType& /*tmp*/, const LaneIdType&) const { c.first = padd(pmul(a,b.first), c.first); c.second = padd(pmul(a,b.second),c.second); } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/) const + template<typename LaneIdType> + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/, const LaneIdType&) const { c = cj.pmadd(a,b,c); } + + template<typename LhsPacketType, typename AccPacketType, typename LaneIdType> + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const + { + madd(a, b.get(lane), c, tmp, lane); + } EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; } - EIGEN_STRONG_INLINE void acc(const DoublePacketType& c, const ResPacket& alpha, ResPacket& r) const + template<typename RealPacketType, typename ResPacketType> + EIGEN_STRONG_INLINE void acc(const DoublePacket<RealPacketType>& c, const ResPacketType& alpha, ResPacketType& r) const { // assemble c - ResPacket tmp; + ResPacketType tmp; if((!ConjLhs)&&(!ConjRhs)) { - tmp = pcplxflip(pconj(ResPacket(c.second))); - tmp = padd(ResPacket(c.first),tmp); + tmp = pcplxflip(pconj(ResPacketType(c.second))); + tmp = padd(ResPacketType(c.first),tmp); } else if((!ConjLhs)&&(ConjRhs)) { - tmp = pconj(pcplxflip(ResPacket(c.second))); - tmp = padd(ResPacket(c.first),tmp); + tmp = pconj(pcplxflip(ResPacketType(c.second))); + tmp = padd(ResPacketType(c.first),tmp); } else if((ConjLhs)&&(!ConjRhs)) { - tmp = pcplxflip(ResPacket(c.second)); - tmp = padd(pconj(ResPacket(c.first)),tmp); + tmp = pcplxflip(ResPacketType(c.second)); + tmp = padd(pconj(ResPacketType(c.first)),tmp); } else if((ConjLhs)&&(ConjRhs)) { - tmp = pcplxflip(ResPacket(c.second)); - tmp = psub(pconj(ResPacket(c.first)),tmp); + tmp = pcplxflip(ResPacketType(c.second)); + tmp = psub(pconj(ResPacketType(c.first)),tmp); } r = pmadd(tmp,alpha,r); @@ -743,8 +920,8 @@ protected: conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj; }; -template<typename RealScalar, bool _ConjRhs> -class gebp_traits<RealScalar, std::complex<RealScalar>, false, 
_ConjRhs > +template<typename RealScalar, bool _ConjRhs, int Arch, int _PacketSize> +class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs, Arch, _PacketSize > { public: typedef std::complex<RealScalar> Scalar; @@ -752,14 +929,25 @@ public: typedef Scalar RhsScalar; typedef Scalar ResScalar; + PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Res, _PacketSize); + PACKET_DECL_COND_PREFIX(_, Real, _PacketSize); + PACKET_DECL_COND_SCALAR_PREFIX(_, _PacketSize); + +#undef PACKET_DECL_COND_SCALAR_PREFIX +#undef PACKET_DECL_COND_PREFIX +#undef PACKET_DECL_COND_SCALAR +#undef PACKET_DECL_COND + enum { ConjLhs = false, ConjRhs = _ConjRhs, - Vectorizable = packet_traits<RealScalar>::Vectorizable - && packet_traits<Scalar>::Vectorizable, - LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1, - RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1, - ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1, + Vectorizable = unpacket_traits<_RealPacket>::vectorizable + && unpacket_traits<_ScalarPacket>::vectorizable, + LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1, + RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1, + ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1, NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, // FIXME: should depend on NumberOfRegisters @@ -770,14 +958,11 @@ public: RhsProgress = 1 }; - typedef typename packet_traits<LhsScalar>::type _LhsPacket; - typedef typename packet_traits<RhsScalar>::type _RhsPacket; - typedef typename packet_traits<ResScalar>::type _ResPacket; - typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket; typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket; typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket; - + typedef LhsPacket LhsPacket4Packing; + typedef QuadPacket<RhsPacket> RhsPacketx4; typedef ResPacket AccPacket; EIGEN_STRONG_INLINE void initAcc(AccPacket& p) @@ -785,22 +970,25 @@ public: p = pset1<ResPacket>(ResScalar(0)); } - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const + template<typename RhsPacketType> + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const { - dest = pset1<RhsPacket>(*b); + dest = pset1<RhsPacketType>(*b); } - - void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { - pbroadcast4(b, b0, b1, b2, b3); + pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3); } - -// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) -// { -// // FIXME not sure that's the best way to implement it! 
-// b0 = pload1<RhsPacket>(b+0); -// b1 = pload1<RhsPacket>(b+1); -// } + + template<typename RhsPacketType> + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const + { + loadRhs(b, dest); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const + {} EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const { @@ -809,21 +997,23 @@ public: EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { - eigen_internal_assert(unpacket_traits<RhsPacket>::size<=4); - loadRhs(b,dest); + dest = ploadquad<RhsPacket>(b); } - EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const + template<typename LhsPacketType> + EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const { - dest = ploaddup<LhsPacket>(a); + dest = ploaddup<LhsPacketType>(a); } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const + template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType> + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const { madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type()); } - EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const + template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType> + EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const true_type&) const { #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD EIGEN_UNUSED_VARIABLE(tmp); @@ -839,16 +1029,24 @@ public: c += a * b; } - EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const + template<typename LhsPacketType, typename AccPacketType, typename LaneIdType> + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const + { + madd(a, b.get(lane), c, tmp, lane); + } + + template <typename ResPacketType, typename AccPacketType> + EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const { + conj_helper<ResPacketType,ResPacketType,false,ConjRhs> cj; r = cj.pmadd(alpha,c,r); } protected: - conj_helper<ResPacket,ResPacket,false,ConjRhs> cj; + }; -/* optimized GEneral packed Block * packed Panel product kernel +/* optimized General packed Block * packed Panel product kernel * * Mixing type logic: C += A * B * | A | B | comments @@ -858,26 +1056,47 @@ protected: template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs> struct gebp_kernel { - typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits; + typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits; + typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target,GEBPPacketHalf> HalfTraits; + typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target,GEBPPacketQuarter> QuarterTraits; + typedef typename Traits::ResScalar ResScalar; typedef typename Traits::LhsPacket LhsPacket; typedef typename Traits::RhsPacket RhsPacket; typedef typename Traits::ResPacket ResPacket; typedef typename Traits::AccPacket AccPacket; + typedef 
typename Traits::RhsPacketx4 RhsPacketx4; + + typedef typename RhsPanelHelper<RhsPacket, RhsPacketx4, 15>::type RhsPanel15; + + typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits; - typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs> SwappedTraits; typedef typename SwappedTraits::ResScalar SResScalar; typedef typename SwappedTraits::LhsPacket SLhsPacket; typedef typename SwappedTraits::RhsPacket SRhsPacket; typedef typename SwappedTraits::ResPacket SResPacket; typedef typename SwappedTraits::AccPacket SAccPacket; + typedef typename HalfTraits::LhsPacket LhsPacketHalf; + typedef typename HalfTraits::RhsPacket RhsPacketHalf; + typedef typename HalfTraits::ResPacket ResPacketHalf; + typedef typename HalfTraits::AccPacket AccPacketHalf; + + typedef typename QuarterTraits::LhsPacket LhsPacketQuarter; + typedef typename QuarterTraits::RhsPacket RhsPacketQuarter; + typedef typename QuarterTraits::ResPacket ResPacketQuarter; + typedef typename QuarterTraits::AccPacket AccPacketQuarter; + typedef typename DataMapper::LinearMapper LinearMapper; enum { Vectorizable = Traits::Vectorizable, LhsProgress = Traits::LhsProgress, + LhsProgressHalf = HalfTraits::LhsProgress, + LhsProgressQuarter = QuarterTraits::LhsProgress, RhsProgress = Traits::RhsProgress, + RhsProgressHalf = HalfTraits::RhsProgress, + RhsProgressQuarter = QuarterTraits::RhsProgress, ResPacketSize = Traits::ResPacketSize }; @@ -887,6 +1106,299 @@ struct gebp_kernel Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); }; +template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs, +int SwappedLhsProgress = gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target>::LhsProgress> +struct last_row_process_16_packets +{ + typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits; + typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits; + + typedef typename Traits::ResScalar ResScalar; + typedef typename SwappedTraits::LhsPacket SLhsPacket; + typedef typename SwappedTraits::RhsPacket SRhsPacket; + typedef typename SwappedTraits::ResPacket SResPacket; + typedef typename SwappedTraits::AccPacket SAccPacket; + + EIGEN_STRONG_INLINE void operator()(const DataMapper& res, SwappedTraits &straits, const LhsScalar* blA, + const RhsScalar* blB, Index depth, const Index endk, Index i, Index j2, + ResScalar alpha, SAccPacket &C0) + { + EIGEN_UNUSED_VARIABLE(res); + EIGEN_UNUSED_VARIABLE(straits); + EIGEN_UNUSED_VARIABLE(blA); + EIGEN_UNUSED_VARIABLE(blB); + EIGEN_UNUSED_VARIABLE(depth); + EIGEN_UNUSED_VARIABLE(endk); + EIGEN_UNUSED_VARIABLE(i); + EIGEN_UNUSED_VARIABLE(j2); + EIGEN_UNUSED_VARIABLE(alpha); + EIGEN_UNUSED_VARIABLE(C0); + } +}; + + +template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs> +struct last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs, 16> { + typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits; + typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits; + + typedef typename Traits::ResScalar ResScalar; + typedef typename SwappedTraits::LhsPacket SLhsPacket; + typedef typename SwappedTraits::RhsPacket SRhsPacket; + typedef typename 
SwappedTraits::ResPacket SResPacket; + typedef typename SwappedTraits::AccPacket SAccPacket; + + EIGEN_STRONG_INLINE void operator()(const DataMapper& res, SwappedTraits &straits, const LhsScalar* blA, + const RhsScalar* blB, Index depth, const Index endk, Index i, Index j2, + ResScalar alpha, SAccPacket &C0) + { + typedef typename unpacket_traits<typename unpacket_traits<SResPacket>::half>::half SResPacketQuarter; + typedef typename unpacket_traits<typename unpacket_traits<SLhsPacket>::half>::half SLhsPacketQuarter; + typedef typename unpacket_traits<typename unpacket_traits<SRhsPacket>::half>::half SRhsPacketQuarter; + typedef typename unpacket_traits<typename unpacket_traits<SAccPacket>::half>::half SAccPacketQuarter; + + SResPacketQuarter R = res.template gatherPacket<SResPacketQuarter>(i, j2); + SResPacketQuarter alphav = pset1<SResPacketQuarter>(alpha); + + if (depth - endk > 0) + { + // We have to handle the last row(s) of the rhs, which + // correspond to a half-packet + SAccPacketQuarter c0 = predux_half_dowto4(predux_half_dowto4(C0)); + + for (Index kk = endk; kk < depth; kk++) + { + SLhsPacketQuarter a0; + SRhsPacketQuarter b0; + straits.loadLhsUnaligned(blB, a0); + straits.loadRhs(blA, b0); + straits.madd(a0,b0,c0,b0, fix<0>); + blB += SwappedTraits::LhsProgress/4; + blA += 1; + } + straits.acc(c0, alphav, R); + } + else + { + straits.acc(predux_half_dowto4(predux_half_dowto4(C0)), alphav, R); + } + res.scatterPacket(i, j2, R); + } +}; + +template<int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar, typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits, typename LinearMapper, typename DataMapper> +struct lhs_process_one_packet +{ + typedef typename GEBPTraits::RhsPacketx4 RhsPacketx4; + + EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacketx4 *rhs_panel, RhsPacket *T0, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3) + { + EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4"); + EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); + traits.loadLhs(&blA[(0+1*K)*LhsProgress], *A0); + traits.loadRhs(&blB[(0+4*K)*RhsProgress], *rhs_panel); + traits.madd(*A0, *rhs_panel, *C0, *T0, fix<0>); + traits.madd(*A0, *rhs_panel, *C1, *T0, fix<1>); + traits.madd(*A0, *rhs_panel, *C2, *T0, fix<2>); + traits.madd(*A0, *rhs_panel, *C3, *T0, fix<3>); + #if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE) + __asm__ ("" : "+x,m" (*A0)); + #endif + EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4"); + } + + EIGEN_STRONG_INLINE void operator()( + const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, ResScalar alpha, + Index peelStart, Index peelEnd, Index strideA, Index strideB, Index offsetA, Index offsetB, + int prefetch_res_offset, Index peeled_kc, Index pk, Index cols, Index depth, Index packet_cols4) + { + GEBPTraits traits; + + // loops on each largest micro horizontal panel of lhs + // (LhsProgress x depth) + for(Index i=peelStart; i<peelEnd; i+=LhsProgress) + { + // loops on each largest micro vertical panel of rhs (depth * nr) + for(Index j2=0; j2<packet_cols4; j2+=nr) + { + // We select a LhsProgress x nr micro block of res + // which is entirely stored into 1 x nr registers. 
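
The shape of the kernel below is easier to see in scalar form: one lhs value (a full packet of LhsProgress rows in the real code) is multiplied by nr=4 broadcast rhs coefficients at each depth step, and the four partial sums live in registers for the whole k loop before being scaled by alpha and accumulated into the result. A minimal, self-contained sketch with plain floats standing in for packets (illustrative only, not the Eigen API):

    #include <cstddef>

    // 1 x 4 GEBP micro-kernel model: accumulate res(i, j..j+3) over the
    // depth dimension entirely in registers.
    void micro_kernel_1x4(const float* A,   // packed lhs: A[k] = lhs(i, k)
                          const float* B,   // packed rhs: B[4*k + c] = rhs(k, j + c)
                          float*       C,   // result row: C[c] = res(i, j + c)
                          std::ptrdiff_t depth, float alpha) {
      float c0 = 0, c1 = 0, c2 = 0, c3 = 0;  // the AccPacket C0..C3 below
      for (std::ptrdiff_t k = 0; k < depth; ++k) {
        const float a0 = A[k];               // loadLhs
        c0 += a0 * B[4 * k + 0];             // madd(..., fix<0>)
        c1 += a0 * B[4 * k + 1];             // madd(..., fix<1>)
        c2 += a0 * B[4 * k + 2];             // madd(..., fix<2>)
        c3 += a0 * B[4 * k + 3];             // madd(..., fix<3>)
      }
      C[0] += alpha * c0;  C[1] += alpha * c1;  // acc(): scale by alpha and
      C[2] += alpha * c2;  C[3] += alpha * c3;  // add into the result block
    }

The C*/D* accumulator doubling and the pk=8 unrolling in the actual code below are refinements of this loop that help hide FMA latency; the arithmetic is the same.
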
+ + const LhsScalar* blA = &blockA[i*strideA+offsetA*(LhsProgress)]; + prefetch(&blA[0]); + + // gets res block as register + AccPacket C0, C1, C2, C3; + traits.initAcc(C0); + traits.initAcc(C1); + traits.initAcc(C2); + traits.initAcc(C3); + // To improve instruction pipelining, let's double the accumulation registers: + // even k will accumulate in C*, while odd k will accumulate in D*. + // This trick is crucial to get good performance with FMA, otherwise it is + // actually faster to perform separated MUL+ADD because of a naturally + // better instruction-level parallelism. + AccPacket D0, D1, D2, D3; + traits.initAcc(D0); + traits.initAcc(D1); + traits.initAcc(D2); + traits.initAcc(D3); + + LinearMapper r0 = res.getLinearMapper(i, j2 + 0); + LinearMapper r1 = res.getLinearMapper(i, j2 + 1); + LinearMapper r2 = res.getLinearMapper(i, j2 + 2); + LinearMapper r3 = res.getLinearMapper(i, j2 + 3); + + r0.prefetch(prefetch_res_offset); + r1.prefetch(prefetch_res_offset); + r2.prefetch(prefetch_res_offset); + r3.prefetch(prefetch_res_offset); + + // performs "inner" products + const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; + prefetch(&blB[0]); + LhsPacket A0, A1; + + for(Index k=0; k<peeled_kc; k+=pk) + { + EIGEN_ASM_COMMENT("begin gebp micro kernel 1/half/quarterX4"); + RhsPacketx4 rhs_panel; + RhsPacket T0; + + internal::prefetch(blB+(48+0)); + peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3); + peeled_kc_onestep(1, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3); + peeled_kc_onestep(2, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3); + peeled_kc_onestep(3, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3); + internal::prefetch(blB+(48+16)); + peeled_kc_onestep(4, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3); + peeled_kc_onestep(5, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3); + peeled_kc_onestep(6, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3); + peeled_kc_onestep(7, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3); + + blB += pk*4*RhsProgress; + blA += pk*LhsProgress; + + EIGEN_ASM_COMMENT("end gebp micro kernel 1/half/quarterX4"); + } + C0 = padd(C0,D0); + C1 = padd(C1,D1); + C2 = padd(C2,D2); + C3 = padd(C3,D3); + + // process remaining peeled loop + for(Index k=peeled_kc; k<depth; k++) + { + RhsPacketx4 rhs_panel; + RhsPacket T0; + peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3); + blB += 4*RhsProgress; + blA += LhsProgress; + } + + ResPacket R0, R1; + ResPacket alphav = pset1<ResPacket>(alpha); + + R0 = r0.template loadPacket<ResPacket>(0); + R1 = r1.template loadPacket<ResPacket>(0); + traits.acc(C0, alphav, R0); + traits.acc(C1, alphav, R1); + r0.storePacket(0, R0); + r1.storePacket(0, R1); + + R0 = r2.template loadPacket<ResPacket>(0); + R1 = r3.template loadPacket<ResPacket>(0); + traits.acc(C2, alphav, R0); + traits.acc(C3, alphav, R1); + r2.storePacket(0, R0); + r3.storePacket(0, R1); + } + + // Deal with remaining columns of the rhs + for(Index j2=packet_cols4; j2<cols; j2++) + { + // One column at a time + const LhsScalar* blA = &blockA[i*strideA+offsetA*(LhsProgress)]; + prefetch(&blA[0]); + + // gets res block as register + AccPacket C0; + traits.initAcc(C0); + + LinearMapper r0 = res.getLinearMapper(i, j2); + + // performs "inner" products + const RhsScalar* blB = &blockB[j2*strideB+offsetB]; + LhsPacket A0; + + for(Index k= 0; k<peeled_kc; k+=pk) + { + EIGEN_ASM_COMMENT("begin gebp micro kernel 1/half/quarterX1");
+ RhsPacket B_0; + +#define EIGEN_GEBGP_ONESTEP(K) \ + do { \ + EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1/half/quarterX1"); \ + EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \ + /* FIXME: why unaligned???? */ \ + traits.loadLhsUnaligned(&blA[(0+1*K)*LhsProgress], A0); \ + traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \ + traits.madd(A0, B_0, C0, B_0, fix<0>); \ + EIGEN_ASM_COMMENT("end step of gebp micro kernel 1/half/quarterX1"); \ + } while(false); + + EIGEN_GEBGP_ONESTEP(0); + EIGEN_GEBGP_ONESTEP(1); + EIGEN_GEBGP_ONESTEP(2); + EIGEN_GEBGP_ONESTEP(3); + EIGEN_GEBGP_ONESTEP(4); + EIGEN_GEBGP_ONESTEP(5); + EIGEN_GEBGP_ONESTEP(6); + EIGEN_GEBGP_ONESTEP(7); + + blB += pk*RhsProgress; + blA += pk*LhsProgress; + + EIGEN_ASM_COMMENT("end gebp micro kernel 1/half/quarterX1"); + } + + // process remaining peeled loop + for(Index k=peeled_kc; k<depth; k++) + { + RhsPacket B_0; + EIGEN_GEBGP_ONESTEP(0); + blB += RhsProgress; + blA += LhsProgress; + } +#undef EIGEN_GEBGP_ONESTEP + ResPacket R0; + ResPacket alphav = pset1<ResPacket>(alpha); + R0 = r0.template loadPacket<ResPacket>(0); + traits.acc(C0, alphav, R0); + r0.storePacket(0, R0); + } + } + } +}; + +template<int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar, typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits, typename LinearMapper, typename DataMapper> +struct lhs_process_fraction_of_packet : lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket, RhsPacket, ResPacket, GEBPTraits, LinearMapper, DataMapper> +{ + +EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacket *B_0, RhsPacket *B1, RhsPacket *B2, RhsPacket *B3, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3) + { + EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4"); + EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); + traits.loadLhsUnaligned(&blA[(0+1*K)*(LhsProgress)], *A0); + traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], *B_0, *B1, *B2, *B3); + traits.madd(*A0, *B_0, *C0, *B_0); + traits.madd(*A0, *B1, *C1, *B1); + traits.madd(*A0, *B2, *C2, *B2); + traits.madd(*A0, *B3, *C3, *B3); + EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4"); + } +}; + template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs> EIGEN_DONT_INLINE void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,ConjugateRhs> @@ -903,10 +1415,12 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0; const Index peeled_mc3 = mr>=3*Traits::LhsProgress ? (rows/(3*LhsProgress))*(3*LhsProgress) : 0; const Index peeled_mc2 = mr>=2*Traits::LhsProgress ? peeled_mc3+((rows-peeled_mc3)/(2*LhsProgress))*(2*LhsProgress) : 0; - const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? (rows/(1*LhsProgress))*(1*LhsProgress) : 0; + const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? peeled_mc2+((rows-peeled_mc2)/(1*LhsProgress))*(1*LhsProgress) : 0; + const Index peeled_mc_half = mr>=LhsProgressHalf ? peeled_mc1+((rows-peeled_mc1)/(LhsProgressHalf))*(LhsProgressHalf) : 0; + const Index peeled_mc_quarter = mr>=LhsProgressQuarter ? 
peeled_mc_half+((rows-peeled_mc_half)/(LhsProgressQuarter))*(LhsProgressQuarter) : 0; enum { pk = 8 }; // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell) const Index peeled_kc = depth & ~(pk-1); - const Index prefetch_res_offset = 32/sizeof(ResScalar); + const int prefetch_res_offset = 32/sizeof(ResScalar); // const Index depth2 = depth & ~1; //---------- Process 3 * LhsProgress rows at once ---------- @@ -964,36 +1478,48 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga for(Index k=0; k<peeled_kc; k+=pk) { EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX4"); - RhsPacket B_0, T0; + // 15 registers are taken (12 for acc, 2 for lhs). + RhsPanel15 rhs_panel; + RhsPacket T0; LhsPacket A2; - -#define EIGEN_GEBP_ONESTEP(K) \ - do { \ - EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \ + #if EIGEN_COMP_GNUC_STRICT && EIGEN_ARCH_ARM64 && defined(EIGEN_VECTORIZE_NEON) && !(EIGEN_GNUC_AT_LEAST(9,0)) + // see http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1633 + // without this workaround A0, A1, and A2 are loaded in the same register, + // which is not good for pipelining + #define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND __asm__ ("" : "+w,m" (A0), "+w,m" (A1), "+w,m" (A2)); + #else + #define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND + #endif +#define EIGEN_GEBP_ONESTEP(K) \ + do { \ + EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \ - internal::prefetch(blA+(3*K+16)*LhsProgress); \ - if (EIGEN_ARCH_ARM) { internal::prefetch(blB+(4*K+16)*RhsProgress); } /* Bug 953 */ \ - traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \ - traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \ - traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \ - traits.loadRhs(blB + (0+4*K)*Traits::RhsProgress, B_0); \ - traits.madd(A0, B_0, C0, T0); \ - traits.madd(A1, B_0, C4, T0); \ - traits.madd(A2, B_0, C8, B_0); \ - traits.loadRhs(blB + (1+4*K)*Traits::RhsProgress, B_0); \ - traits.madd(A0, B_0, C1, T0); \ - traits.madd(A1, B_0, C5, T0); \ - traits.madd(A2, B_0, C9, B_0); \ - traits.loadRhs(blB + (2+4*K)*Traits::RhsProgress, B_0); \ - traits.madd(A0, B_0, C2, T0); \ - traits.madd(A1, B_0, C6, T0); \ - traits.madd(A2, B_0, C10, B_0); \ - traits.loadRhs(blB + (3+4*K)*Traits::RhsProgress, B_0); \ - traits.madd(A0, B_0, C3 , T0); \ - traits.madd(A1, B_0, C7, T0); \ - traits.madd(A2, B_0, C11, B_0); \ - EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \ - } while(false) + internal::prefetch(blA + (3 * K + 16) * LhsProgress); \ + if (EIGEN_ARCH_ARM || EIGEN_ARCH_MIPS) { \ + internal::prefetch(blB + (4 * K + 16) * RhsProgress); \ + } /* Bug 953 */ \ + traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \ + traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \ + traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \ + EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND \ + traits.loadRhs(blB + (0+4*K) * Traits::RhsProgress, rhs_panel); \ + traits.madd(A0, rhs_panel, C0, T0, fix<0>); \ + traits.madd(A1, rhs_panel, C4, T0, fix<0>); \ + traits.madd(A2, rhs_panel, C8, T0, fix<0>); \ + traits.updateRhs(blB + (1+4*K) * Traits::RhsProgress, rhs_panel); \ + traits.madd(A0, rhs_panel, C1, T0, fix<1>); \ + traits.madd(A1, rhs_panel, C5, T0, fix<1>); \ + traits.madd(A2, rhs_panel, C9, T0, fix<1>); \ + traits.updateRhs(blB + (2+4*K) * Traits::RhsProgress, rhs_panel); \ + traits.madd(A0, rhs_panel, C2, T0, fix<2>); \ + traits.madd(A1, rhs_panel, C6, T0, fix<2>); \ + traits.madd(A2, 
rhs_panel, C10, T0, fix<2>); \ + traits.updateRhs(blB + (3+4*K) * Traits::RhsProgress, rhs_panel); \ + traits.madd(A0, rhs_panel, C3, T0, fix<3>); \ + traits.madd(A1, rhs_panel, C7, T0, fix<3>); \ + traits.madd(A2, rhs_panel, C11, T0, fix<3>); \ + EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \ + } while (false) internal::prefetch(blB); EIGEN_GEBP_ONESTEP(0); @@ -1013,7 +1539,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga // process remaining peeled loop for(Index k=peeled_kc; k<depth; k++) { - RhsPacket B_0, T0; + RhsPanel15 rhs_panel; + RhsPacket T0; LhsPacket A2; EIGEN_GEBP_ONESTEP(0); blB += 4*RhsProgress; @@ -1025,9 +1552,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga ResPacket R0, R1, R2; ResPacket alphav = pset1<ResPacket>(alpha); - R0 = r0.loadPacket(0 * Traits::ResPacketSize); - R1 = r0.loadPacket(1 * Traits::ResPacketSize); - R2 = r0.loadPacket(2 * Traits::ResPacketSize); + R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize); + R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize); + R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); traits.acc(C4, alphav, R1); traits.acc(C8, alphav, R2); @@ -1035,9 +1562,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga r0.storePacket(1 * Traits::ResPacketSize, R1); r0.storePacket(2 * Traits::ResPacketSize, R2); - R0 = r1.loadPacket(0 * Traits::ResPacketSize); - R1 = r1.loadPacket(1 * Traits::ResPacketSize); - R2 = r1.loadPacket(2 * Traits::ResPacketSize); + R0 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize); + R1 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize); + R2 = r1.template loadPacket<ResPacket>(2 * Traits::ResPacketSize); traits.acc(C1, alphav, R0); traits.acc(C5, alphav, R1); traits.acc(C9, alphav, R2); @@ -1045,9 +1572,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga r1.storePacket(1 * Traits::ResPacketSize, R1); r1.storePacket(2 * Traits::ResPacketSize, R2); - R0 = r2.loadPacket(0 * Traits::ResPacketSize); - R1 = r2.loadPacket(1 * Traits::ResPacketSize); - R2 = r2.loadPacket(2 * Traits::ResPacketSize); + R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize); + R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize); + R2 = r2.template loadPacket<ResPacket>(2 * Traits::ResPacketSize); traits.acc(C2, alphav, R0); traits.acc(C6, alphav, R1); traits.acc(C10, alphav, R2); @@ -1055,9 +1582,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga r2.storePacket(1 * Traits::ResPacketSize, R1); r2.storePacket(2 * Traits::ResPacketSize, R2); - R0 = r3.loadPacket(0 * Traits::ResPacketSize); - R1 = r3.loadPacket(1 * Traits::ResPacketSize); - R2 = r3.loadPacket(2 * Traits::ResPacketSize); + R0 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize); + R1 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize); + R2 = r3.template loadPacket<ResPacket>(2 * Traits::ResPacketSize); traits.acc(C3, alphav, R0); traits.acc(C7, alphav, R1); traits.acc(C11, alphav, R2); @@ -1093,20 +1620,20 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga { EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX1"); RhsPacket B_0; -#define EIGEN_GEBGP_ONESTEP(K) \ - do { \ - EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \ +#define EIGEN_GEBGP_ONESTEP(K) \ + do { \ + EIGEN_ASM_COMMENT("begin step of gebp micro 
kernel 3pX1"); \ EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \ - traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \ - traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \ - traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \ - traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \ - traits.madd(A0, B_0, C0, B_0); \ - traits.madd(A1, B_0, C4, B_0); \ - traits.madd(A2, B_0, C8, B_0); \ - EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \ - } while(false) - + traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \ + traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \ + traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \ + traits.loadRhs(&blB[(0 + K) * RhsProgress], B_0); \ + traits.madd(A0, B_0, C0, B_0, fix<0>); \ + traits.madd(A1, B_0, C4, B_0, fix<0>); \ + traits.madd(A2, B_0, C8, B_0, fix<0>); \ + EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \ + } while (false) + EIGEN_GEBGP_ONESTEP(0); EIGEN_GEBGP_ONESTEP(1); EIGEN_GEBGP_ONESTEP(2); @@ -1116,8 +1643,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga EIGEN_GEBGP_ONESTEP(6); EIGEN_GEBGP_ONESTEP(7); - blB += pk*RhsProgress; - blA += pk*3*Traits::LhsProgress; + blB += int(pk) * int(RhsProgress); + blA += int(pk) * 3 * int(Traits::LhsProgress); EIGEN_ASM_COMMENT("end gebp micro kernel 3pX1"); } @@ -1134,9 +1661,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga ResPacket R0, R1, R2; ResPacket alphav = pset1<ResPacket>(alpha); - R0 = r0.loadPacket(0 * Traits::ResPacketSize); - R1 = r0.loadPacket(1 * Traits::ResPacketSize); - R2 = r0.loadPacket(2 * Traits::ResPacketSize); + R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize); + R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize); + R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); traits.acc(C4, alphav, R1); traits.acc(C8, alphav, R2); @@ -1195,26 +1722,34 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga for(Index k=0; k<peeled_kc; k+=pk) { EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX4"); - RhsPacket B_0, B1, B2, B3, T0; + RhsPacketx4 rhs_panel; + RhsPacket T0; + + // NOTE: the begin/end asm comments below work around bug 935! 
+ // but they are not enough for gcc>=6 without FMA (bug 1637) + #if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE) + #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND __asm__ ("" : [a0] "+x,m" (A0),[a1] "+x,m" (A1)); + #else + #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND + #endif +#define EIGEN_GEBGP_ONESTEP(K) \ + do { \ + EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \ + traits.loadLhs(&blA[(0 + 2 * K) * LhsProgress], A0); \ + traits.loadLhs(&blA[(1 + 2 * K) * LhsProgress], A1); \ + traits.loadRhs(&blB[(0 + 4 * K) * RhsProgress], rhs_panel); \ + traits.madd(A0, rhs_panel, C0, T0, fix<0>); \ + traits.madd(A1, rhs_panel, C4, T0, fix<0>); \ + traits.madd(A0, rhs_panel, C1, T0, fix<1>); \ + traits.madd(A1, rhs_panel, C5, T0, fix<1>); \ + traits.madd(A0, rhs_panel, C2, T0, fix<2>); \ + traits.madd(A1, rhs_panel, C6, T0, fix<2>); \ + traits.madd(A0, rhs_panel, C3, T0, fix<3>); \ + traits.madd(A1, rhs_panel, C7, T0, fix<3>); \ + EIGEN_GEBP_2PX4_SPILLING_WORKAROUND \ + EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \ + } while (false) - #define EIGEN_GEBGP_ONESTEP(K) \ - do { \ - EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \ - EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \ - traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \ - traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \ - traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \ - traits.madd(A0, B_0, C0, T0); \ - traits.madd(A1, B_0, C4, B_0); \ - traits.madd(A0, B1, C1, T0); \ - traits.madd(A1, B1, C5, B1); \ - traits.madd(A0, B2, C2, T0); \ - traits.madd(A1, B2, C6, B2); \ - traits.madd(A0, B3, C3, T0); \ - traits.madd(A1, B3, C7, B3); \ - EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \ - } while(false) - internal::prefetch(blB+(48+0)); EIGEN_GEBGP_ONESTEP(0); EIGEN_GEBGP_ONESTEP(1); @@ -1234,7 +1769,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga // process remaining peeled loop for(Index k=peeled_kc; k<depth; k++) { - RhsPacket B_0, B1, B2, B3, T0; + RhsPacketx4 rhs_panel; + RhsPacket T0; EIGEN_GEBGP_ONESTEP(0); blB += 4*RhsProgress; blA += 2*Traits::LhsProgress; @@ -1244,10 +1780,10 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga ResPacket R0, R1, R2, R3; ResPacket alphav = pset1<ResPacket>(alpha); - R0 = r0.loadPacket(0 * Traits::ResPacketSize); - R1 = r0.loadPacket(1 * Traits::ResPacketSize); - R2 = r1.loadPacket(0 * Traits::ResPacketSize); - R3 = r1.loadPacket(1 * Traits::ResPacketSize); + R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize); + R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize); + R2 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize); + R3 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); traits.acc(C4, alphav, R1); traits.acc(C1, alphav, R2); @@ -1257,10 +1793,10 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga r1.storePacket(0 * Traits::ResPacketSize, R2); r1.storePacket(1 * Traits::ResPacketSize, R3); - R0 = r2.loadPacket(0 * Traits::ResPacketSize); - R1 = r2.loadPacket(1 * Traits::ResPacketSize); - R2 = r3.loadPacket(0 * Traits::ResPacketSize); - R3 = r3.loadPacket(1 * Traits::ResPacketSize); + R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize); + R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize); + R2 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize); + R3 = r3.template loadPacket<ResPacket>(1 * 
Traits::ResPacketSize); traits.acc(C2, alphav, R0); traits.acc(C6, alphav, R1); traits.acc(C3, alphav, R2); @@ -1305,8 +1841,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \ traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \ traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \ - traits.madd(A0, B_0, C0, B1); \ - traits.madd(A1, B_0, C4, B_0); \ + traits.madd(A0, B_0, C0, B1, fix<0>); \ + traits.madd(A1, B_0, C4, B_0, fix<0>); \ EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \ } while(false) @@ -1319,8 +1855,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga EIGEN_GEBGP_ONESTEP(6); EIGEN_GEBGP_ONESTEP(7); - blB += pk*RhsProgress; - blA += pk*2*Traits::LhsProgress; + blB += int(pk) * int(RhsProgress); + blA += int(pk) * 2 * int(Traits::LhsProgress); EIGEN_ASM_COMMENT("end gebp micro kernel 2pX1"); } @@ -1337,8 +1873,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga ResPacket R0, R1; ResPacket alphav = pset1<ResPacket>(alpha); - R0 = r0.loadPacket(0 * Traits::ResPacketSize); - R1 = r0.loadPacket(1 * Traits::ResPacketSize); + R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize); + R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); traits.acc(C4, alphav, R1); r0.storePacket(0 * Traits::ResPacketSize, R0); @@ -1350,186 +1886,43 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga //---------- Process 1 * LhsProgress rows at once ---------- if(mr>=1*Traits::LhsProgress) { - // loops on each largest micro horizontal panel of lhs (1*LhsProgress x depth) - for(Index i=peeled_mc2; i<peeled_mc1; i+=1*LhsProgress) - { - // loops on each largest micro vertical panel of rhs (depth * nr) - for(Index j2=0; j2<packet_cols4; j2+=nr) - { - // We select a 1*Traits::LhsProgress x nr micro block of res which is entirely - // stored into 1 x nr registers. 
- - const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)]; - prefetch(&blA[0]); - - // gets res block as register - AccPacket C0, C1, C2, C3; - traits.initAcc(C0); - traits.initAcc(C1); - traits.initAcc(C2); - traits.initAcc(C3); - - LinearMapper r0 = res.getLinearMapper(i, j2 + 0); - LinearMapper r1 = res.getLinearMapper(i, j2 + 1); - LinearMapper r2 = res.getLinearMapper(i, j2 + 2); - LinearMapper r3 = res.getLinearMapper(i, j2 + 3); - - r0.prefetch(prefetch_res_offset); - r1.prefetch(prefetch_res_offset); - r2.prefetch(prefetch_res_offset); - r3.prefetch(prefetch_res_offset); - - // performs "inner" products - const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; - prefetch(&blB[0]); - LhsPacket A0; - - for(Index k=0; k<peeled_kc; k+=pk) - { - EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX4"); - RhsPacket B_0, B1, B2, B3; - -#define EIGEN_GEBGP_ONESTEP(K) \ - do { \ - EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX4"); \ - EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \ - traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \ - traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \ - traits.madd(A0, B_0, C0, B_0); \ - traits.madd(A0, B1, C1, B1); \ - traits.madd(A0, B2, C2, B2); \ - traits.madd(A0, B3, C3, B3); \ - EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX4"); \ - } while(false) - - internal::prefetch(blB+(48+0)); - EIGEN_GEBGP_ONESTEP(0); - EIGEN_GEBGP_ONESTEP(1); - EIGEN_GEBGP_ONESTEP(2); - EIGEN_GEBGP_ONESTEP(3); - internal::prefetch(blB+(48+16)); - EIGEN_GEBGP_ONESTEP(4); - EIGEN_GEBGP_ONESTEP(5); - EIGEN_GEBGP_ONESTEP(6); - EIGEN_GEBGP_ONESTEP(7); - - blB += pk*4*RhsProgress; - blA += pk*1*LhsProgress; - - EIGEN_ASM_COMMENT("end gebp micro kernel 1pX4"); - } - // process remaining peeled loop - for(Index k=peeled_kc; k<depth; k++) - { - RhsPacket B_0, B1, B2, B3; - EIGEN_GEBGP_ONESTEP(0); - blB += 4*RhsProgress; - blA += 1*LhsProgress; - } -#undef EIGEN_GEBGP_ONESTEP - - ResPacket R0, R1; - ResPacket alphav = pset1<ResPacket>(alpha); - - R0 = r0.loadPacket(0 * Traits::ResPacketSize); - R1 = r1.loadPacket(0 * Traits::ResPacketSize); - traits.acc(C0, alphav, R0); - traits.acc(C1, alphav, R1); - r0.storePacket(0 * Traits::ResPacketSize, R0); - r1.storePacket(0 * Traits::ResPacketSize, R1); - - R0 = r2.loadPacket(0 * Traits::ResPacketSize); - R1 = r3.loadPacket(0 * Traits::ResPacketSize); - traits.acc(C2, alphav, R0); - traits.acc(C3, alphav, R1); - r2.storePacket(0 * Traits::ResPacketSize, R0); - r3.storePacket(0 * Traits::ResPacketSize, R1); - } - - // Deal with remaining columns of the rhs - for(Index j2=packet_cols4; j2<cols; j2++) - { - // One column at a time - const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)]; - prefetch(&blA[0]); - - // gets res block as register - AccPacket C0; - traits.initAcc(C0); - - LinearMapper r0 = res.getLinearMapper(i, j2); - - // performs "inner" products - const RhsScalar* blB = &blockB[j2*strideB+offsetB]; - LhsPacket A0; - - for(Index k=0; k<peeled_kc; k+=pk) - { - EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX1"); - RhsPacket B_0; - -#define EIGEN_GEBGP_ONESTEP(K) \ - do { \ - EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX1"); \ - EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \ - traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0); \ - traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \ - traits.madd(A0, B_0, C0, B_0); \ - EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX1"); \ - } while(false); - - EIGEN_GEBGP_ONESTEP(0); - 
EIGEN_GEBGP_ONESTEP(1); - EIGEN_GEBGP_ONESTEP(2); - EIGEN_GEBGP_ONESTEP(3); - EIGEN_GEBGP_ONESTEP(4); - EIGEN_GEBGP_ONESTEP(5); - EIGEN_GEBGP_ONESTEP(6); - EIGEN_GEBGP_ONESTEP(7); - - blB += pk*RhsProgress; - blA += pk*1*Traits::LhsProgress; - - EIGEN_ASM_COMMENT("end gebp micro kernel 1pX1"); - } - - // process remaining peeled loop - for(Index k=peeled_kc; k<depth; k++) - { - RhsPacket B_0; - EIGEN_GEBGP_ONESTEP(0); - blB += RhsProgress; - blA += 1*Traits::LhsProgress; - } -#undef EIGEN_GEBGP_ONESTEP - ResPacket R0; - ResPacket alphav = pset1<ResPacket>(alpha); - R0 = r0.loadPacket(0 * Traits::ResPacketSize); - traits.acc(C0, alphav, R0); - r0.storePacket(0 * Traits::ResPacketSize, R0); - } - } + lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket, RhsPacket, ResPacket, Traits, LinearMapper, DataMapper> p; + p(res, blockA, blockB, alpha, peeled_mc2, peeled_mc1, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4); + } + //---------- Process LhsProgressHalf rows at once ---------- + if((LhsProgressHalf < LhsProgress) && mr>=LhsProgressHalf) + { + lhs_process_fraction_of_packet<nr, LhsProgressHalf, RhsProgressHalf, LhsScalar, RhsScalar, ResScalar, AccPacketHalf, LhsPacketHalf, RhsPacketHalf, ResPacketHalf, HalfTraits, LinearMapper, DataMapper> p; + p(res, blockA, blockB, alpha, peeled_mc1, peeled_mc_half, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4); + } + //---------- Process LhsProgressQuarter rows at once ---------- + if((LhsProgressQuarter < LhsProgressHalf) && mr>=LhsProgressQuarter) + { + lhs_process_fraction_of_packet<nr, LhsProgressQuarter, RhsProgressQuarter, LhsScalar, RhsScalar, ResScalar, AccPacketQuarter, LhsPacketQuarter, RhsPacketQuarter, ResPacketQuarter, QuarterTraits, LinearMapper, DataMapper> p; + p(res, blockA, blockB, alpha, peeled_mc_half, peeled_mc_quarter, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4); } //---------- Process remaining rows, 1 at once ---------- - if(peeled_mc1<rows) + if(peeled_mc_quarter<rows) { // loop on each panel of the rhs for(Index j2=0; j2<packet_cols4; j2+=nr) { // loop on each row of the lhs (1*LhsProgress x depth) - for(Index i=peeled_mc1; i<rows; i+=1) + for(Index i=peeled_mc_quarter; i<rows; i+=1) { const LhsScalar* blA = &blockA[i*strideA+offsetA]; prefetch(&blA[0]); const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr]; - // The following piece of code wont work for 512 bit registers - // Moreover, if LhsProgress==8 it assumes that there is a half packet of the same size - // as nr (which is currently 4) for the return type. - typedef typename unpacket_traits<SResPacket>::half SResPacketHalf; + // If LhsProgress is 8 or 16, it assumes that there is a + // half or quarter packet, respectively, of the same size as + // nr (which is currently 4) for the return type. 
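The size checks just below lean on a trait recursion that is easy to miss: unpacket_traits<P>::half names the next-smaller packet type, chaining ::half twice yields the quarter packet, and when nothing smaller exists ::half points back at the packet itself, so the derived sizes stop shrinking and the comparisons against nr come out false. A reduced, self-contained model of the pattern (the trait bodies are illustrative stand-ins, not Eigen's real definitions, which live in the per-architecture PacketMath.h headers):

// Stand-in for Eigen's unpacket_traits, modelling the x86 float packets.
struct Packet4f {}; struct Packet8f {}; struct Packet16f {};

template <typename Packet> struct unpacket_traits_model;
template <> struct unpacket_traits_model<Packet16f> { typedef Packet8f half; enum { size = 16 }; };
template <> struct unpacket_traits_model<Packet8f>  { typedef Packet4f half; enum { size = 8 }; };
// Nothing smaller exists: ::half is the type itself and the recursion bottoms out.
template <> struct unpacket_traits_model<Packet4f>  { typedef Packet4f half; enum { size = 4 }; };

// "Half of half" is the quarter packet, exactly the chaining used below.
typedef unpacket_traits_model<unpacket_traits_model<Packet16f>::half>::half QuarterP;
static_assert(unpacket_traits_model<QuarterP>::size == 4, "quarter of 16 floats is 4 floats");

With AVX512 floats the chain resolves to 16/8/4 lanes, which is why SResPacketHalfSize and SResPacketQuarterSize can be tested against nr (currently 4) to decide whether a narrower tail path is available.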
+ const int SResPacketHalfSize = unpacket_traits<typename unpacket_traits<SResPacket>::half>::size; + const int SResPacketQuarterSize = unpacket_traits<typename unpacket_traits<typename unpacket_traits<SResPacket>::half>::half>::size; if ((SwappedTraits::LhsProgress % 4) == 0 && - (SwappedTraits::LhsProgress <= 8) && - (SwappedTraits::LhsProgress!=8 || unpacket_traits<SResPacketHalf>::size==nr)) + (SwappedTraits::LhsProgress<=16) && + (SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr) && + (SwappedTraits::LhsProgress!=16 || SResPacketQuarterSize==nr)) { SAccPacket C0, C1, C2, C3; straits.initAcc(C0); @@ -1552,15 +1945,15 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga straits.loadRhsQuad(blA+0*spk, B_0); straits.loadRhsQuad(blA+1*spk, B_1); - straits.madd(A0,B_0,C0,B_0); - straits.madd(A1,B_1,C1,B_1); + straits.madd(A0,B_0,C0,B_0, fix<0>); + straits.madd(A1,B_1,C1,B_1, fix<0>); straits.loadLhsUnaligned(blB+2*SwappedTraits::LhsProgress, A0); straits.loadLhsUnaligned(blB+3*SwappedTraits::LhsProgress, A1); straits.loadRhsQuad(blA+2*spk, B_0); straits.loadRhsQuad(blA+3*spk, B_1); - straits.madd(A0,B_0,C2,B_0); - straits.madd(A1,B_1,C3,B_1); + straits.madd(A0,B_0,C2,B_0, fix<0>); + straits.madd(A1,B_1,C3,B_1, fix<0>); blB += 4*SwappedTraits::LhsProgress; blA += 4*spk; @@ -1573,7 +1966,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga straits.loadLhsUnaligned(blB, A0); straits.loadRhsQuad(blA, B_0); - straits.madd(A0,B_0,C0,B_0); + straits.madd(A0,B_0,C0,B_0, fix<0>); blB += SwappedTraits::LhsProgress; blA += spk; @@ -1583,7 +1976,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga // Special case where we have to first reduce the accumulation register C0 typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SResPacket>::half,SResPacket>::type SResPacketHalf; typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SLhsPacket>::half,SLhsPacket>::type SLhsPacketHalf; - typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SLhsPacket>::half,SRhsPacket>::type SRhsPacketHalf; + typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SRhsPacket>::half,SRhsPacket>::type SRhsPacketHalf; typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SAccPacket>::half,SAccPacket>::type SAccPacketHalf; SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2); @@ -1596,16 +1989,25 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga SRhsPacketHalf b0; straits.loadLhsUnaligned(blB, a0); straits.loadRhs(blA, b0); - SAccPacketHalf c0 = predux_downto4(C0); - straits.madd(a0,b0,c0,b0); + SAccPacketHalf c0 = predux_half_dowto4(C0); + straits.madd(a0,b0,c0,b0, fix<0>); straits.acc(c0, alphav, R); } else { - straits.acc(predux_downto4(C0), alphav, R); + straits.acc(predux_half_dowto4(C0), alphav, R); } res.scatterPacket(i, j2, R); } + else if (SwappedTraits::LhsProgress==16) + { + // Special case where we have to first reduce the + // accumulation register C0. 
We specialize the block in + // template form, so that LhsProgress < 16 paths don't + // fail to compile + last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> p; + p(res, straits, blA, blB, depth, endk, i, j2,alpha, C0); + } else { SResPacket R = res.template gatherPacket<SResPacket>(i, j2); @@ -1628,14 +2030,14 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga B_0 = blB[0]; B_1 = blB[1]; - CJMADD(cj,A0,B_0,C0, B_0); - CJMADD(cj,A0,B_1,C1, B_1); - + C0 = cj.pmadd(A0,B_0,C0); + C1 = cj.pmadd(A0,B_1,C1); + B_0 = blB[2]; B_1 = blB[3]; - CJMADD(cj,A0,B_0,C2, B_0); - CJMADD(cj,A0,B_1,C3, B_1); - + C2 = cj.pmadd(A0,B_0,C2); + C3 = cj.pmadd(A0,B_1,C3); + blB += 4; } res(i, j2 + 0) += alpha * C0; @@ -1649,7 +2051,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga for(Index j2=packet_cols4; j2<cols; j2++) { // loop on each row of the lhs (1*LhsProgress x depth) - for(Index i=peeled_mc1; i<rows; i+=1) + for(Index i=peeled_mc_quarter; i<rows; i+=1) { const LhsScalar* blA = &blockA[i*strideA+offsetA]; prefetch(&blA[0]); @@ -1660,7 +2062,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga { LhsScalar A0 = blA[k]; RhsScalar B_0 = blB[k]; - CJMADD(cj, A0, B_0, C0, B_0); + C0 = cj.pmadd(A0, B_0, C0); } res(i, j2) += alpha * C0; } @@ -1669,8 +2071,6 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga } -#undef CJMADD - // pack a block of the lhs // The traversal is as follows (mr==4): // 0 4 8 12 ... // 1 5 9 13 ... // 2 6 10 14 ... // 3 7 11 15 ... // // 16 20 24 28 ... // 17 21 25 29 ... // 18 22 26 30 ... // 19 23 27 31 ... // // 32 33 34 35 ... // 36 37 38 39 ... -template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode> -struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode> +template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode> +struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode> { typedef typename DataMapper::LinearMapper LinearMapper; EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0); }; -template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode> -EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode> +template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode> +EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode> ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { - typedef typename packet_traits<Scalar>::type Packet; - enum { PacketSize = packet_traits<Scalar>::size }; + typedef typename unpacket_traits<Packet>::half HalfPacket; + typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket; + enum { PacketSize = unpacket_traits<Packet>::size, + HalfPacketSize = unpacket_traits<HalfPacket>::size, + QuarterPacketSize = unpacket_traits<QuarterPacket>::size, + HasHalf = (int)HalfPacketSize < (int)PacketSize, + HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize};
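To make the traversal diagram above concrete, here is a small self-contained sketch (a toy, not Eigen code) that reproduces the packing order for a 6x2 lhs with a panel width of 4: full panels are walked column-major inside the panel, then the leftover rows are emitted one at a time.

#include <cstdio>

// Toy reproduction of the mr==4 traversal shown in the comment above:
// prints the (row,col) visit order in which lhs coefficients are packed.
int main() {
  const int rows = 6, depth = 2, pack = 4;
  int i = 0;
  for (; i + pack <= rows; i += pack)   // full 4-row panels
    for (int k = 0; k < depth; ++k)     // column-major within the panel
      for (int w = 0; w < pack; ++w)
        std::printf("(%d,%d) ", i + w, k);
  for (; i < rows; ++i)                 // remaining rows, scalar path
    for (int k = 0; k < depth; ++k)
      std::printf("(%d,%d) ", i, k);
  std::printf("\n");  // (0,0) (1,0) (2,0) (3,0) (0,1) (1,1) (2,1) (3,1) (4,0) (4,1) (5,0) (5,1)
  return 0;
}

The first eight visits fill blockA positions 0..7 column-major, matching the numbered panel in the diagram; the trailing rows are laid out row-by-row, like the 32 33 34 35 block.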
PACK LHS"); EIGEN_UNUSED_VARIABLE(stride); @@ -1709,9 +2114,12 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Co const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0; const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0; - const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0; - const Index peeled_mc0 = Pack2>=1*PacketSize ? peeled_mc1 - : Pack2>1 ? (rows/Pack2)*Pack2 : 0; + const Index peeled_mc1 = Pack1>=1*PacketSize ? peeled_mc2+((rows-peeled_mc2)/(1*PacketSize))*(1*PacketSize) : 0; + const Index peeled_mc_half = Pack1>=HalfPacketSize ? peeled_mc1+((rows-peeled_mc1)/(HalfPacketSize))*(HalfPacketSize) : 0; + const Index peeled_mc_quarter = Pack1>=QuarterPacketSize ? (rows/(QuarterPacketSize))*(QuarterPacketSize) : 0; + const Index last_lhs_progress = rows > peeled_mc_quarter ? (rows - peeled_mc_quarter) & ~1 : 0; + const Index peeled_mc0 = Pack2>=PacketSize ? peeled_mc_quarter + : Pack2>1 && last_lhs_progress ? (rows/last_lhs_progress)*last_lhs_progress : 0; Index i=0; @@ -1725,9 +2133,9 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Co for(Index k=0; k<depth; k++) { Packet A, B, C; - A = lhs.loadPacket(i+0*PacketSize, k); - B = lhs.loadPacket(i+1*PacketSize, k); - C = lhs.loadPacket(i+2*PacketSize, k); + A = lhs.template loadPacket<Packet>(i+0*PacketSize, k); + B = lhs.template loadPacket<Packet>(i+1*PacketSize, k); + C = lhs.template loadPacket<Packet>(i+2*PacketSize, k); pstore(blockA+count, cj.pconj(A)); count+=PacketSize; pstore(blockA+count, cj.pconj(B)); count+=PacketSize; pstore(blockA+count, cj.pconj(C)); count+=PacketSize; @@ -1745,8 +2153,8 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Co for(Index k=0; k<depth; k++) { Packet A, B; - A = lhs.loadPacket(i+0*PacketSize, k); - B = lhs.loadPacket(i+1*PacketSize, k); + A = lhs.template loadPacket<Packet>(i+0*PacketSize, k); + B = lhs.template loadPacket<Packet>(i+1*PacketSize, k); pstore(blockA+count, cj.pconj(A)); count+=PacketSize; pstore(blockA+count, cj.pconj(B)); count+=PacketSize; } @@ -1763,27 +2171,67 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Co for(Index k=0; k<depth; k++) { Packet A; - A = lhs.loadPacket(i+0*PacketSize, k); + A = lhs.template loadPacket<Packet>(i+0*PacketSize, k); pstore(blockA+count, cj.pconj(A)); count+=PacketSize; } if(PanelMode) count += (1*PacketSize) * (stride-offset-depth); } } - // Pack scalars + // Pack half packets + if(HasHalf && Pack1>=HalfPacketSize) + { + for(; i<peeled_mc_half; i+=HalfPacketSize) + { + if(PanelMode) count += (HalfPacketSize) * offset; + + for(Index k=0; k<depth; k++) + { + HalfPacket A; + A = lhs.template loadPacket<HalfPacket>(i+0*(HalfPacketSize), k); + pstoreu(blockA+count, cj.pconj(A)); + count+=HalfPacketSize; + } + if(PanelMode) count += (HalfPacketSize) * (stride-offset-depth); + } + } + // Pack quarter packets + if(HasQuarter && Pack1>=QuarterPacketSize) + { + for(; i<peeled_mc_quarter; i+=QuarterPacketSize) + { + if(PanelMode) count += (QuarterPacketSize) * offset; + + for(Index k=0; k<depth; k++) + { + QuarterPacket A; + A = lhs.template loadPacket<QuarterPacket>(i+0*(QuarterPacketSize), k); + pstoreu(blockA+count, cj.pconj(A)); + count+=QuarterPacketSize; + } + if(PanelMode) count += (QuarterPacketSize) * (stride-offset-depth); + } + } + // Pack2 may be *smaller* than PacketSize—that happens for + 
// products like real * complex, where we have to go half the + // progress on the lhs in order to duplicate those operands to + // address both real & imaginary parts on the rhs. This portion will + // pack those half ones until they match the number expected on the + // last peeling loop at this point (for the rhs). if(Pack2<PacketSize && Pack2>1) { - for(; i<peeled_mc0; i+=Pack2) + for(; i<peeled_mc0; i+=last_lhs_progress) { - if(PanelMode) count += Pack2 * offset; + if(PanelMode) count += last_lhs_progress * offset; for(Index k=0; k<depth; k++) - for(Index w=0; w<Pack2; w++) + for(Index w=0; w<last_lhs_progress; w++) blockA[count++] = cj(lhs(i+w, k)); - if(PanelMode) count += Pack2 * (stride-offset-depth); + if(PanelMode) count += last_lhs_progress * (stride-offset-depth); } } + // Pack scalars for(; i<rows; i++) { if(PanelMode) count += offset; @@ -1793,19 +2241,24 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Co } } -template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode> -struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode> +template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode> +struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode> { typedef typename DataMapper::LinearMapper LinearMapper; EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0); }; -template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode> -EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode> +template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode> +EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode> ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { - typedef typename packet_traits<Scalar>::type Packet; - enum { PacketSize = packet_traits<Scalar>::size }; + typedef typename unpacket_traits<Packet>::half HalfPacket; + typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket; + enum { PacketSize = unpacket_traits<Packet>::size, + HalfPacketSize = unpacket_traits<HalfPacket>::size, + QuarterPacketSize = unpacket_traits<QuarterPacket>::size, + HasHalf = (int)HalfPacketSize < (int)PacketSize, + HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize}; EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS"); EIGEN_UNUSED_VARIABLE(stride); @@ -1813,37 +2266,51 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Ro eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj; Index count = 0; + bool gone_half = false, gone_quarter = false, gone_last = false; -// const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0; -// const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0; -// const Index peeled_mc1 = Pack1>=1*PacketSize ? 
(rows/(1*PacketSize))*(1*PacketSize) : 0; - - int pack = Pack1; Index i = 0; + int pack = Pack1; + int psize = PacketSize; while(pack>0) { Index remaining_rows = rows-i; - Index peeled_mc = i+(remaining_rows/pack)*pack; + Index peeled_mc = gone_last ? Pack2>1 ? (rows/pack)*pack : 0 : i+(remaining_rows/pack)*pack; + Index starting_pos = i; for(; i<peeled_mc; i+=pack) { if(PanelMode) count += pack * offset; - const Index peeled_k = (depth/PacketSize)*PacketSize; Index k=0; - if(pack>=PacketSize) + if(pack>=psize && psize >= QuarterPacketSize) { - for(; k<peeled_k; k+=PacketSize) + const Index peeled_k = (depth/psize)*psize; + for(; k<peeled_k; k+=psize) { - for (Index m = 0; m < pack; m += PacketSize) + for (Index m = 0; m < pack; m += psize) { - PacketBlock<Packet> kernel; - for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = lhs.loadPacket(i+p+m, k); - ptranspose(kernel); - for (int p = 0; p < PacketSize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p])); + if (psize == PacketSize) { + PacketBlock<Packet> kernel; + for (int p = 0; p < psize; ++p) kernel.packet[p] = lhs.template loadPacket<Packet>(i+p+m, k); + ptranspose(kernel); + for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p])); + } else if (HasHalf && psize == HalfPacketSize) { + gone_half = true; + PacketBlock<HalfPacket> kernel_half; + for (int p = 0; p < psize; ++p) kernel_half.packet[p] = lhs.template loadPacket<HalfPacket>(i+p+m, k); + ptranspose(kernel_half); + for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_half.packet[p])); + } else if (HasQuarter && psize == QuarterPacketSize) { + gone_quarter = true; + PacketBlock<QuarterPacket> kernel_quarter; + for (int p = 0; p < psize; ++p) kernel_quarter.packet[p] = lhs.template loadPacket<QuarterPacket>(i+p+m, k); + ptranspose(kernel_quarter); + for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_quarter.packet[p])); + } } - count += PacketSize*pack; + count += psize*pack; } } + for(; k<depth; k++) { Index w=0; @@ -1866,9 +2333,28 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Ro if(PanelMode) count += pack * (stride-offset-depth); } - pack -= PacketSize; - if(pack<Pack2 && (pack+PacketSize)!=Pack2) - pack = Pack2; + pack -= psize; + Index left = rows - i; + if (pack <= 0) { + if (!gone_last && + (starting_pos == i || left >= psize/2 || left >= psize/4) && + ((psize/2 == HalfPacketSize && HasHalf && !gone_half) || + (psize/2 == QuarterPacketSize && HasQuarter && !gone_quarter))) { + psize /= 2; + pack = psize; + continue; + } + // Pack2 may be *smaller* than PacketSize—that happens for + // products like real * complex, where we have to go half the + // progress on the lhs in order to duplicate those operands to + // address both real & imaginary parts on the rhs. This portion will + // pack those half ones until they match the number expected on the + // last peeling loop at this point (for the rhs). 
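One detail of the comment above is worth spelling out: both packers round their tail step down to an even count by clearing the lowest bit (last_lhs_progress = (rows - peeled_mc_quarter) & ~1 in the column-major packer earlier, psize = pack = left & ~1 just below), so the final sweep always advances lhs rows in pairs, matching the duplicated real/imaginary rhs layout described above. A minimal check of the bit trick, with hypothetical values:

#include <cassert>

int main() {
  int left = 7;                        // leftover rows, hypothetical
  int even_step = left & ~1;           // clear bit 0: 7 -> 6
  assert(even_step == 6);
  assert((5 & ~1) == 4);
  assert((1 & ~1) == 0);               // a lone row falls through to the scalar loop
  return 0;
}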
+ if (Pack2 < PacketSize && !gone_last) { + gone_last = true; + psize = pack = left & ~1; + } + } } for(; i<rows; i++) @@ -1924,7 +2410,7 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Co // const Scalar* b6 = &rhs[(j2+6)*rhsStride]; // const Scalar* b7 = &rhs[(j2+7)*rhsStride]; // Index k=0; -// if(PacketSize==8) // TODO enbale vectorized transposition for PacketSize==4 +// if(PacketSize==8) // TODO enable vectorized transposition for PacketSize==4 // { // for(; k<peeled_k; k+=PacketSize) { // PacketBlock<Packet> kernel; @@ -1971,10 +2457,10 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Co { for(; k<peeled_k; k+=PacketSize) { PacketBlock<Packet,(PacketSize%4)==0?4:PacketSize> kernel; - kernel.packet[0] = dm0.loadPacket(k); - kernel.packet[1%PacketSize] = dm1.loadPacket(k); - kernel.packet[2%PacketSize] = dm2.loadPacket(k); - kernel.packet[3%PacketSize] = dm3.loadPacket(k); + kernel.packet[0 ] = dm0.template loadPacket<Packet>(k); + kernel.packet[1%PacketSize] = dm1.template loadPacket<Packet>(k); + kernel.packet[2%PacketSize] = dm2.template loadPacket<Packet>(k); + kernel.packet[3%PacketSize] = dm3.template loadPacket<Packet>(k); ptranspose(kernel); pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0])); pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1%PacketSize])); @@ -2015,94 +2501,104 @@ template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conj struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode> { typedef typename packet_traits<Scalar>::type Packet; + typedef typename unpacket_traits<Packet>::half HalfPacket; + typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket; typedef typename DataMapper::LinearMapper LinearMapper; - enum { PacketSize = packet_traits<Scalar>::size }; - EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0); -}; - -template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode> -EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode> - ::operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) -{ - EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR"); - EIGEN_UNUSED_VARIABLE(stride); - EIGEN_UNUSED_VARIABLE(offset); - eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); - conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj; - Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0; - Index packet_cols4 = nr>=4 ? 
(cols/4) * 4 : 0; - Index count = 0; - -// if(nr>=8) -// { -// for(Index j2=0; j2<packet_cols8; j2+=8) -// { -// // skip what we have before -// if(PanelMode) count += 8 * offset; -// for(Index k=0; k<depth; k++) -// { -// if (PacketSize==8) { -// Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]); -// pstoreu(blockB+count, cj.pconj(A)); -// } else if (PacketSize==4) { -// Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]); -// Packet B = ploadu<Packet>(&rhs[k*rhsStride + j2 + PacketSize]); -// pstoreu(blockB+count, cj.pconj(A)); -// pstoreu(blockB+count+PacketSize, cj.pconj(B)); -// } else { -// const Scalar* b0 = &rhs[k*rhsStride + j2]; -// blockB[count+0] = cj(b0[0]); -// blockB[count+1] = cj(b0[1]); -// blockB[count+2] = cj(b0[2]); -// blockB[count+3] = cj(b0[3]); -// blockB[count+4] = cj(b0[4]); -// blockB[count+5] = cj(b0[5]); -// blockB[count+6] = cj(b0[6]); -// blockB[count+7] = cj(b0[7]); -// } -// count += 8; -// } -// // skip what we have after -// if(PanelMode) count += 8 * (stride-offset-depth); -// } -// } - if(nr>=4) + enum { PacketSize = packet_traits<Scalar>::size, + HalfPacketSize = unpacket_traits<HalfPacket>::size, + QuarterPacketSize = unpacket_traits<QuarterPacket>::size}; + EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0) { - for(Index j2=packet_cols8; j2<packet_cols4; j2+=4) + EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR"); + EIGEN_UNUSED_VARIABLE(stride); + EIGEN_UNUSED_VARIABLE(offset); + eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride)); + const bool HasHalf = (int)HalfPacketSize < (int)PacketSize; + const bool HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize; + conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj; + Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0; + Index packet_cols4 = nr>=4 ? 
(cols/4) * 4 : 0; + Index count = 0; + + // if(nr>=8) + // { + // for(Index j2=0; j2<packet_cols8; j2+=8) + // { + // // skip what we have before + // if(PanelMode) count += 8 * offset; + // for(Index k=0; k<depth; k++) + // { + // if (PacketSize==8) { + // Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]); + // pstoreu(blockB+count, cj.pconj(A)); + // } else if (PacketSize==4) { + // Packet A = ploadu<Packet>(&rhs[k*rhsStride + j2]); + // Packet B = ploadu<Packet>(&rhs[k*rhsStride + j2 + PacketSize]); + // pstoreu(blockB+count, cj.pconj(A)); + // pstoreu(blockB+count+PacketSize, cj.pconj(B)); + // } else { + // const Scalar* b0 = &rhs[k*rhsStride + j2]; + // blockB[count+0] = cj(b0[0]); + // blockB[count+1] = cj(b0[1]); + // blockB[count+2] = cj(b0[2]); + // blockB[count+3] = cj(b0[3]); + // blockB[count+4] = cj(b0[4]); + // blockB[count+5] = cj(b0[5]); + // blockB[count+6] = cj(b0[6]); + // blockB[count+7] = cj(b0[7]); + // } + // count += 8; + // } + // // skip what we have after + // if(PanelMode) count += 8 * (stride-offset-depth); + // } + // } + if(nr>=4) { - // skip what we have before - if(PanelMode) count += 4 * offset; - for(Index k=0; k<depth; k++) + for(Index j2=packet_cols8; j2<packet_cols4; j2+=4) { - if (PacketSize==4) { - Packet A = rhs.loadPacket(k, j2); - pstoreu(blockB+count, cj.pconj(A)); - count += PacketSize; - } else { - const LinearMapper dm0 = rhs.getLinearMapper(k, j2); - blockB[count+0] = cj(dm0(0)); - blockB[count+1] = cj(dm0(1)); - blockB[count+2] = cj(dm0(2)); - blockB[count+3] = cj(dm0(3)); - count += 4; + // skip what we have before + if(PanelMode) count += 4 * offset; + for(Index k=0; k<depth; k++) + { + if (PacketSize==4) { + Packet A = rhs.template loadPacket<Packet>(k, j2); + pstoreu(blockB+count, cj.pconj(A)); + count += PacketSize; + } else if (HasHalf && HalfPacketSize==4) { + HalfPacket A = rhs.template loadPacket<HalfPacket>(k, j2); + pstoreu(blockB+count, cj.pconj(A)); + count += HalfPacketSize; + } else if (HasQuarter && QuarterPacketSize==4) { + QuarterPacket A = rhs.template loadPacket<QuarterPacket>(k, j2); + pstoreu(blockB+count, cj.pconj(A)); + count += QuarterPacketSize; + } else { + const LinearMapper dm0 = rhs.getLinearMapper(k, j2); + blockB[count+0] = cj(dm0(0)); + blockB[count+1] = cj(dm0(1)); + blockB[count+2] = cj(dm0(2)); + blockB[count+3] = cj(dm0(3)); + count += 4; + } } + // skip what we have after + if(PanelMode) count += 4 * (stride-offset-depth); } - // skip what we have after - if(PanelMode) count += 4 * (stride-offset-depth); } - } - // copy the remaining columns one at a time (nr==1) - for(Index j2=packet_cols4; j2<cols; ++j2) - { - if(PanelMode) count += offset; - for(Index k=0; k<depth; k++) + // copy the remaining columns one at a time (nr==1) + for(Index j2=packet_cols4; j2<cols; ++j2) { - blockB[count] = cj(rhs(k, j2)); - count += 1; + if(PanelMode) count += offset; + for(Index k=0; k<depth; k++) + { + blockB[count] = cj(rhs(k, j2)); + count += 1; + } + if(PanelMode) count += stride-offset-depth; } - if(PanelMode) count += stride-offset-depth; } -} +}; } // end namespace internal diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index 41cfb0e03..caa65fccc 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -20,8 +20,9 @@ template<typename _LhsScalar, typename _RhsScalar> class level3_blocking; template< typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, - typename RhsScalar, 
int RhsStorageOrder, bool ConjugateRhs> -struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor> + typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs, + int ResInnerStride> +struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor,ResInnerStride> { typedef gebp_traits<RhsScalar,LhsScalar> Traits; @@ -30,7 +31,7 @@ struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLh Index rows, Index cols, Index depth, const LhsScalar* lhs, Index lhsStride, const RhsScalar* rhs, Index rhsStride, - ResScalar* res, Index resStride, + ResScalar* res, Index resIncr, Index resStride, ResScalar alpha, level3_blocking<RhsScalar,LhsScalar>& blocking, GemmParallelInfo<Index>* info = 0) @@ -39,8 +40,8 @@ struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLh general_matrix_matrix_product<Index, RhsScalar, RhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateRhs, LhsScalar, LhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateLhs, - ColMajor> - ::run(cols,rows,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha,blocking,info); + ColMajor,ResInnerStride> + ::run(cols,rows,depth,rhs,rhsStride,lhs,lhsStride,res,resIncr,resStride,alpha,blocking,info); } }; @@ -49,8 +50,9 @@ struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLh template< typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, - typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs> -struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor> + typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs, + int ResInnerStride> +struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride> { typedef gebp_traits<LhsScalar,RhsScalar> Traits; @@ -59,23 +61,23 @@ typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScala static void run(Index rows, Index cols, Index depth, const LhsScalar* _lhs, Index lhsStride, const RhsScalar* _rhs, Index rhsStride, - ResScalar* _res, Index resStride, + ResScalar* _res, Index resIncr, Index resStride, ResScalar alpha, level3_blocking<LhsScalar,RhsScalar>& blocking, GemmParallelInfo<Index>* info = 0) { typedef const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> LhsMapper; typedef const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> RhsMapper; - typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper; - LhsMapper lhs(_lhs,lhsStride); - RhsMapper rhs(_rhs,rhsStride); - ResMapper res(_res, resStride); + typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor,Unaligned,ResInnerStride> ResMapper; + LhsMapper lhs(_lhs, lhsStride); + RhsMapper rhs(_rhs, rhsStride); + ResMapper res(_res, resStride, resIncr); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction Index nc = (std::min)(cols,blocking.nc()); // cache block size along the N direction - gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; + gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder> pack_lhs; gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, 
RhsStorageOrder> pack_rhs; gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp; @@ -108,7 +110,7 @@ static void run(Index rows, Index cols, Index depth, // i.e., we test that info[tid].users equals 0. // Then, we set info[tid].users to the number of threads to mark that all other threads are going to use it. while(info[tid].users!=0) {} - info[tid].users += threads; + info[tid].users = threads; pack_lhs(blockA+info[tid].lhs_start*actual_kc, lhs.getSubMapper(info[tid].lhs_start,k), actual_kc, info[tid].lhs_length); @@ -146,6 +148,9 @@ static void run(Index rows, Index cols, Index depth, // Release all the sub blocks A'_i of A' for the current thread, // i.e., we simply decrement the number of users by 1 for(Index i=0; i<threads; ++i) +#if !EIGEN_HAS_CXX11_ATOMIC + #pragma omp atomic +#endif info[i].users -= 1; } } @@ -225,7 +230,7 @@ struct gemm_functor Gemm::run(rows, cols, m_lhs.cols(), &m_lhs.coeffRef(row,0), m_lhs.outerStride(), &m_rhs.coeffRef(0,col), m_rhs.outerStride(), - (Scalar*)&(m_dest.coeffRef(row,col)), m_dest.outerStride(), + (Scalar*)&(m_dest.coeffRef(row,col)), m_dest.innerStride(), m_dest.outerStride(), m_actualAlpha, m_blocking, info); } @@ -426,8 +431,14 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct> template<typename Dst> static void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { - if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0) - lazyproduct::evalTo(dst, lhs, rhs); + // See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=404 for a discussion and helper program + // to determine the following heuristic. + // EIGEN_GEMM_TO_COEFFBASED_THRESHOLD is typically defined to 20 in GeneralProduct.h, + // unless it has been specialized by the user or for a given architecture. + // Note that the condition rhs.rows()>0 was required because lazy product is (was?) not happy with empty inputs. + // I'm not sure it is still required. 
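Since the cutoff is now the macro EIGEN_GEMM_TO_COEFFBASED_THRESHOLD rather than a hard-coded 20, it can be retuned per project. A minimal sketch, assuming (as the comment above suggests) that a user definition made before including any Eigen header takes precedence:

// Hypothetical tuning: raise the GEMM-to-coefficient-based cutoff to 32
// for a workload dominated by small products. Must precede Eigen includes.
#define EIGEN_GEMM_TO_COEFFBASED_THRESHOLD 32
#include <Eigen/Dense>

int main() {
  Eigen::MatrixXf a(8, 8), b(8, 8);
  a.setRandom(); b.setRandom();
  // rhs.rows() + dst.rows() + dst.cols() = 24 < 32, so this product takes
  // the lazy coefficient-based path instead of the blocked GEMM kernel.
  Eigen::MatrixXf c = a * b;
  return c.size() == 64 ? 0 : 1;
}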
+ if((rhs.rows()+dst.rows()+dst.cols())<EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows()>0) + lazyproduct::eval_dynamic(dst, lhs, rhs, internal::assign_op<typename Dst::Scalar,Scalar>()); else { dst.setZero(); @@ -438,8 +449,8 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct> template<typename Dst> static void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { - if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0) - lazyproduct::addTo(dst, lhs, rhs); + if((rhs.rows()+dst.rows()+dst.cols())<EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows()>0) + lazyproduct::eval_dynamic(dst, lhs, rhs, internal::add_assign_op<typename Dst::Scalar,Scalar>()); else scaleAndAddTo(dst,lhs, rhs, Scalar(1)); } @@ -447,8 +458,8 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct> template<typename Dst> static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { - if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0) - lazyproduct::subTo(dst, lhs, rhs); + if((rhs.rows()+dst.rows()+dst.cols())<EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows()>0) + lazyproduct::eval_dynamic(dst, lhs, rhs, internal::sub_assign_op<typename Dst::Scalar,Scalar>()); else scaleAndAddTo(dst, lhs, rhs, Scalar(-1)); } @@ -460,11 +471,25 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct> if(a_lhs.cols()==0 || a_lhs.rows()==0 || a_rhs.cols()==0) return; + if (dst.cols() == 1) + { + // Fallback to GEMV if either the lhs or rhs is a runtime vector + typename Dest::ColXpr dst_vec(dst.col(0)); + return internal::generic_product_impl<Lhs,typename Rhs::ConstColXpr,DenseShape,DenseShape,GemvProduct> + ::scaleAndAddTo(dst_vec, a_lhs, a_rhs.col(0), alpha); + } + else if (dst.rows() == 1) + { + // Fallback to GEMV if either the lhs or rhs is a runtime vector + typename Dest::RowXpr dst_vec(dst.row(0)); + return internal::generic_product_impl<typename Lhs::ConstRowXpr,Rhs,DenseShape,DenseShape,GemvProduct> + ::scaleAndAddTo(dst_vec, a_lhs.row(0), a_rhs, alpha); + } + typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(a_lhs); typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(a_rhs); - Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs) - * RhsBlasTraits::extractScalarFactor(a_rhs); + Scalar actualAlpha = combine_scalar_factors(alpha, a_lhs, a_rhs); typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,LhsScalar,RhsScalar, Dest::MaxRowsAtCompileTime,Dest::MaxColsAtCompileTime,MaxDepthAtCompileTime> BlockingType; @@ -475,7 +500,8 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct> Index, LhsScalar, (ActualLhsTypeCleaned::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(LhsBlasTraits::NeedToConjugate), RhsScalar, (ActualRhsTypeCleaned::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(RhsBlasTraits::NeedToConjugate), - (Dest::Flags&RowMajorBit) ? RowMajor : ColMajor>, + (Dest::Flags&RowMajorBit) ? 
RowMajor : ColMajor, + Dest::InnerStrideAtCompileTime>, ActualLhsTypeCleaned, ActualRhsTypeCleaned, Dest, BlockingType> GemmFunctor; BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), 1, true); diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h index e844e37d1..6ba0d9bdb 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h @@ -25,51 +25,54 @@ namespace internal { **********************************************************************/ // forward declarations (defined at the end of this file) -template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjLhs, bool ConjRhs, int UpLo> +template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjLhs, bool ConjRhs, int ResInnerStride, int UpLo> struct tribb_kernel; /* Optimized matrix-matrix product evaluating only one triangular half */ template <typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs, - int ResStorageOrder, int UpLo, int Version = Specialized> + int ResStorageOrder, int ResInnerStride, int UpLo, int Version = Specialized> struct general_matrix_matrix_triangular_product; // as usual if the result is row major => we transpose the product template <typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, - typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs, int UpLo, int Version> -struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor,UpLo,Version> + typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs, + int ResInnerStride, int UpLo, int Version> +struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor,ResInnerStride,UpLo,Version> { typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar; static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* lhs, Index lhsStride, - const RhsScalar* rhs, Index rhsStride, ResScalar* res, Index resStride, + const RhsScalar* rhs, Index rhsStride, ResScalar* res, Index resIncr, Index resStride, const ResScalar& alpha, level3_blocking<RhsScalar,LhsScalar>& blocking) { general_matrix_matrix_triangular_product<Index, RhsScalar, RhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateRhs, LhsScalar, LhsStorageOrder==RowMajor ? 
                                  ColMajor : RowMajor, ConjugateLhs,
-                                 ColMajor, UpLo==Lower?Upper:Lower>
-      ::run(size,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha,blocking);
+                                 ColMajor, ResInnerStride, UpLo==Lower?Upper:Lower>
+      ::run(size,depth,rhs,rhsStride,lhs,lhsStride,res,resIncr,resStride,alpha,blocking);
   }
 };
 
 template <typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
-          typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs, int UpLo, int Version>
-struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor,UpLo,Version>
+          typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs,
+          int ResInnerStride, int UpLo, int Version>
+struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride,UpLo,Version>
 {
   typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
   static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* _lhs, Index lhsStride,
-                                      const RhsScalar* _rhs, Index rhsStride, ResScalar* _res, Index resStride,
+                                      const RhsScalar* _rhs, Index rhsStride,
+                                      ResScalar* _res, Index resIncr, Index resStride,
                                       const ResScalar& alpha, level3_blocking<LhsScalar,RhsScalar>& blocking)
   {
     typedef gebp_traits<LhsScalar,RhsScalar> Traits;
 
     typedef const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> LhsMapper;
     typedef const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> RhsMapper;
-    typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
+    typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
     LhsMapper lhs(_lhs,lhsStride);
     RhsMapper rhs(_rhs,rhsStride);
-    ResMapper res(_res, resStride);
+    ResMapper res(_res, resStride, resIncr);
 
     Index kc = blocking.kc();
     Index mc = (std::min)(size,blocking.mc());
@@ -84,10 +87,10 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
     ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, blocking.blockA());
     ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB());
 
-    gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+    gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder> pack_lhs;
     gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
     gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
-    tribb_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs, UpLo> sybb;
+    tribb_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs, ResInnerStride, UpLo> sybb;
 
     for(Index k2=0; k2<depth; k2+=kc)
     {
@@ -110,8 +113,7 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
         gebp(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, (std::min)(size,i2), alpha, -1, -1, 0, 0);
-
-        sybb(_res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha);
+        sybb(_res+resStride*i2 + resIncr*i2, resIncr, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha);
 
         if (UpLo==Upper)
         {
@@ -133,7 +135,7 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
 // while the triangular block overlapping the diagonal is evaluated into a
 // small temporary buffer which is then accumulated into the result using a
 // triangular traversal.
-template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjLhs, bool ConjRhs, int UpLo>
+template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjLhs, bool ConjRhs, int ResInnerStride, int UpLo>
 struct tribb_kernel
 {
   typedef gebp_traits<LhsScalar,RhsScalar,ConjLhs,ConjRhs> Traits;
@@ -142,11 +144,13 @@ struct tribb_kernel
   enum {
     BlockSize  = meta_least_common_multiple<EIGEN_PLAIN_ENUM_MAX(mr,nr),EIGEN_PLAIN_ENUM_MIN(mr,nr)>::ret
   };
-  void operator()(ResScalar* _res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha)
+  void operator()(ResScalar* _res, Index resIncr, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha)
   {
-    typedef blas_data_mapper<ResScalar, Index, ColMajor> ResMapper;
-    ResMapper res(_res, resStride);
-    gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, mr, nr, ConjLhs, ConjRhs> gebp_kernel;
+    typedef blas_data_mapper<ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
+    typedef blas_data_mapper<ResScalar, Index, ColMajor, Unaligned> BufferMapper;
+    ResMapper res(_res, resStride, resIncr);
+    gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, mr, nr, ConjLhs, ConjRhs> gebp_kernel1;
+    gebp_kernel<LhsScalar, RhsScalar, Index, BufferMapper, mr, nr, ConjLhs, ConjRhs> gebp_kernel2;
 
     Matrix<ResScalar,BlockSize,BlockSize,ColMajor> buffer((internal::constructor_without_unaligned_array_assert()));
 
@@ -158,31 +162,32 @@ struct tribb_kernel
       const RhsScalar* actual_b = blockB+j*depth;
 
       if(UpLo==Upper)
-        gebp_kernel(res.getSubMapper(0, j), blockA, actual_b, j, depth, actualBlockSize, alpha,
-                    -1, -1, 0, 0);
-
+        gebp_kernel1(res.getSubMapper(0, j), blockA, actual_b, j, depth, actualBlockSize, alpha,
+                     -1, -1, 0, 0);
+
       // selfadjoint micro block
       {
         Index i = j;
         buffer.setZero();
         // 1 - apply the kernel on the temporary buffer
-        gebp_kernel(ResMapper(buffer.data(), BlockSize), blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha,
-                    -1, -1, 0, 0);
+        gebp_kernel2(BufferMapper(buffer.data(), BlockSize), blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha,
+                     -1, -1, 0, 0);
+
         // 2 - triangular accumulation
         for(Index j1=0; j1<actualBlockSize; ++j1)
         {
-          ResScalar* r = &res(i, j + j1);
+          typename ResMapper::LinearMapper r = res.getLinearMapper(i,j+j1);
           for(Index i1=UpLo==Lower ? j1 : 0;
               UpLo==Lower ? i1<actualBlockSize : i1<=j1; ++i1)
-            r[i1] += buffer(i1,j1);
+            r(i1) += buffer(i1,j1);
         }
       }
 
       if(UpLo==Lower)
       {
         Index i = j+actualBlockSize;
-        gebp_kernel(res.getSubMapper(i, j), blockA+depth*i, actual_b, size-i,
-                    depth, actualBlockSize, alpha, -1, -1, 0, 0);
+        gebp_kernel1(res.getSubMapper(i, j), blockA+depth*i, actual_b, size-i,
+                     depth, actualBlockSize, alpha, -1, -1, 0, 0);
       }
     }
   }
@@ -286,23 +291,24 @@ struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,false>
     internal::general_matrix_matrix_triangular_product<Index,
       typename Lhs::Scalar, LhsIsRowMajor ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate,
       typename Rhs::Scalar, RhsIsRowMajor ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate,
-      IsRowMajor ? RowMajor : ColMajor, UpLo&(Lower|Upper)>
+      IsRowMajor ? RowMajor : ColMajor, MatrixType::InnerStrideAtCompileTime, UpLo&(Lower|Upper)>
       ::run(size, depth,
             &actualLhs.coeffRef(SkipDiag&&(UpLo&Lower)==Lower ? 1 : 0,0), actualLhs.outerStride(),
             &actualRhs.coeffRef(0,SkipDiag&&(UpLo&Upper)==Upper ? 1 : 0), actualRhs.outerStride(),
-            mat.data() + (SkipDiag ? (bool(IsRowMajor) != ((UpLo&Lower)==Lower) ? 1 : mat.outerStride() ) : 0), mat.outerStride(), actualAlpha, blocking);
+            mat.data() + (SkipDiag ? (bool(IsRowMajor) != ((UpLo&Lower)==Lower) ? mat.innerStride() : mat.outerStride() ) : 0),
+            mat.innerStride(), mat.outerStride(), actualAlpha, blocking);
   }
 };
 
 template<typename MatrixType, unsigned int UpLo>
 template<typename ProductType>
-TriangularView<MatrixType,UpLo>& TriangularViewImpl<MatrixType,UpLo,Dense>::_assignProduct(const ProductType& prod, const Scalar& alpha, bool beta)
+EIGEN_DEVICE_FUNC TriangularView<MatrixType,UpLo>& TriangularViewImpl<MatrixType,UpLo,Dense>::_assignProduct(const ProductType& prod, const Scalar& alpha, bool beta)
 {
   EIGEN_STATIC_ASSERT((UpLo&UnitDiag)==0, WRITING_TO_TRIANGULAR_PART_WITH_UNIT_DIAGONAL_IS_NOT_SUPPORTED);
   eigen_assert(derived().nestedExpression().rows() == prod.rows() && derived().cols() == prod.cols());
-  
+
   general_product_to_triangular_selector<MatrixType, ProductType, UpLo, internal::traits<ProductType>::InnerSize==1>::run(derived().nestedExpression().const_cast_derived(), prod, alpha, beta);
-  
+
   return derived();
 }
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h
index 41e18ff07..9a650ec23 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h
@@ -37,10 +37,10 @@ namespace Eigen {
 
 namespace internal {
 
-template <typename Index, typename Scalar, int AStorageOrder, bool ConjugateA, int ResStorageOrder, int UpLo>
+template <typename Index, typename Scalar, int AStorageOrder, bool ConjugateA, int ResStorageOrder, int UpLo>
 struct general_matrix_matrix_rankupdate :
        general_matrix_matrix_triangular_product<
-         Index,Scalar,AStorageOrder,ConjugateA,Scalar,AStorageOrder,ConjugateA,ResStorageOrder,UpLo,BuiltIn> {};
+         Index,Scalar,AStorageOrder,ConjugateA,Scalar,AStorageOrder,ConjugateA,ResStorageOrder,1,UpLo,BuiltIn> {};
 
 // try to go to BLAS specialization
@@ -48,19 +48,19 @@ struct general_matrix_matrix_rankupdate :
 template <typename Index, int LhsStorageOrder, bool ConjugateLhs, \
           int RhsStorageOrder, bool ConjugateRhs, int  UpLo> \
 struct general_matrix_matrix_triangular_product<Index,Scalar,LhsStorageOrder,ConjugateLhs, \
-               Scalar,RhsStorageOrder,ConjugateRhs,ColMajor,UpLo,Specialized> { \
+               Scalar,RhsStorageOrder,ConjugateRhs,ColMajor,1,UpLo,Specialized> { \
   static EIGEN_STRONG_INLINE void run(Index size, Index depth,const Scalar* lhs, Index lhsStride, \
-                          const Scalar* rhs, Index rhsStride, Scalar* res, Index resStride, Scalar alpha, level3_blocking<Scalar, Scalar>& blocking) \
+                          const Scalar* rhs, Index rhsStride, Scalar* res, Index resIncr, Index resStride, Scalar alpha, level3_blocking<Scalar, Scalar>& blocking) \
   { \
-    if ( lhs==rhs && ((UpLo&(Lower|Upper)==UpLo)) ) { \
+    if ( lhs==rhs && ((UpLo&(Lower|Upper))==UpLo) ) { \
       general_matrix_matrix_rankupdate<Index,Scalar,LhsStorageOrder,ConjugateLhs,ColMajor,UpLo> \
       ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha,blocking); \
     } else { \
       general_matrix_matrix_triangular_product<Index, \
         Scalar, LhsStorageOrder, ConjugateLhs, \
         Scalar, RhsStorageOrder, ConjugateRhs, \
-        ColMajor, UpLo, BuiltIn> \
-      ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha,blocking); \
+        ColMajor, 1, UpLo, BuiltIn> \
+      ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resIncr,resStride,alpha,blocking); \
     } \
   } \
 };
@@ -88,7 +88,7 @@ struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,C
     BlasIndex lda=convert_index<BlasIndex>(lhsStride), ldc=convert_index<BlasIndex>(resStride), n=convert_index<BlasIndex>(size), k=convert_index<BlasIndex>(depth); \
     char uplo=((IsLower) ? 'L' : 'U'), trans=((AStorageOrder==RowMajor) ? 'T':'N'); \
     EIGTYPE beta(1); \
-    BLASFUNC(&uplo, &trans, &n, &k, &numext::real_ref(alpha), lhs, &lda, &numext::real_ref(beta), res, &ldc); \
+    BLASFUNC(&uplo, &trans, &n, &k, (const BLASTYPE*)&numext::real_ref(alpha), lhs, &lda, (const BLASTYPE*)&numext::real_ref(beta), res, &ldc); \
   } \
 };
@@ -125,9 +125,13 @@ struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,C
   } \
 };
 
-
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_RANKUPDATE_R(double, double, dsyrk)
+EIGEN_BLAS_RANKUPDATE_R(float,  float,  ssyrk)
+#else
 EIGEN_BLAS_RANKUPDATE_R(double, double, dsyrk_)
 EIGEN_BLAS_RANKUPDATE_R(float,  float,  ssyrk_)
+#endif
 
 // TODO handle complex cases
 // EIGEN_BLAS_RANKUPDATE_C(dcomplex, double, double, zherk_)
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h b/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h
index 7a3bdbf20..71abf4013 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h
@@ -46,25 +46,27 @@ namespace internal {
 
 // gemm specialization
 
-#define GEMM_SPECIALIZATION(EIGTYPE, EIGPREFIX, BLASTYPE, BLASPREFIX) \
+#define GEMM_SPECIALIZATION(EIGTYPE, EIGPREFIX, BLASTYPE, BLASFUNC) \
 template< \
   typename Index, \
   int LhsStorageOrder, bool ConjugateLhs, \
   int RhsStorageOrder, bool ConjugateRhs> \
-struct general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor> \
+struct general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor,1> \
 { \
 typedef gebp_traits<EIGTYPE,EIGTYPE> Traits; \
 \
 static void run(Index rows, Index cols, Index depth, \
   const EIGTYPE* _lhs, Index lhsStride, \
   const EIGTYPE* _rhs, Index rhsStride, \
-  EIGTYPE* res, Index resStride, \
+  EIGTYPE* res, Index resIncr, Index resStride, \
   EIGTYPE alpha, \
   level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/, \
   GemmParallelInfo<Index>* /*info = 0*/) \
 { \
   using std::conj; \
 \
+  EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \
+  eigen_assert(resIncr == 1); \
   char transa, transb; \
   BlasIndex m, n, k, lda, ldb, ldc; \
   const EIGTYPE *a, *b; \
@@ -100,13 +102,20 @@ static void run(Index rows, Index cols, Index depth, \
     ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
   } else b = _rhs; \
 \
-  BLASPREFIX##gemm_(&transa, &transb, &m, &n, &k, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
+  BLASFUNC(&transa, &transb, &m, &n, &k, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
 }};
 
-GEMM_SPECIALIZATION(double,   d,  double, d)
-GEMM_SPECIALIZATION(float,    f,  float,  s)
-GEMM_SPECIALIZATION(dcomplex, cd, double, z)
-GEMM_SPECIALIZATION(scomplex, cf, float,  c)
+#ifdef EIGEN_USE_MKL
+GEMM_SPECIALIZATION(double,   d,  double,        dgemm)
+GEMM_SPECIALIZATION(float,    f,  float,         sgemm)
+GEMM_SPECIALIZATION(dcomplex, cd, MKL_Complex16, zgemm)
+GEMM_SPECIALIZATION(scomplex, cf, MKL_Complex8,  cgemm)
+#else
+GEMM_SPECIALIZATION(double,   d,  double, dgemm_)
+GEMM_SPECIALIZATION(float,    f,  float,  sgemm_)
+GEMM_SPECIALIZATION(dcomplex, cd, double, zgemm_)
+GEMM_SPECIALIZATION(scomplex, cf, float,  cgemm_)
+#endif
 
 } // end namespace internal
diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h
index 3c1a7fc40..dfb6aebce 100644
--- a/Eigen/src/Core/products/GeneralMatrixVector.h
+++ b/Eigen/src/Core/products/GeneralMatrixVector.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -14,11 +14,57 @@ namespace Eigen {
 
 namespace internal {
 
+enum GEMVPacketSizeType {
+  GEMVPacketFull = 0,
+  GEMVPacketHalf,
+  GEMVPacketQuarter
+};
+
+template <int N, typename T1, typename T2, typename T3>
+struct gemv_packet_cond { typedef T3 type; };
+
+template <typename T1, typename T2, typename T3>
+struct gemv_packet_cond<GEMVPacketFull, T1, T2, T3> { typedef T1 type; };
+
+template <typename T1, typename T2, typename T3>
+struct gemv_packet_cond<GEMVPacketHalf, T1, T2, T3> { typedef T2 type; };
+
+template<typename LhsScalar, typename RhsScalar, int _PacketSize=GEMVPacketFull>
+class gemv_traits
+{
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+
+#define PACKET_DECL_COND_PREFIX(prefix, name, packet_size)                        \
+  typedef typename gemv_packet_cond<packet_size,                                  \
+                                    typename packet_traits<name ## Scalar>::type, \
+                                    typename packet_traits<name ## Scalar>::half, \
+                                    typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
+  prefix ## name ## Packet
+
+  PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
+  PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
+  PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
+#undef PACKET_DECL_COND_PREFIX
+
+public:
+  enum {
+    Vectorizable = unpacket_traits<_LhsPacket>::vectorizable &&
+                   unpacket_traits<_RhsPacket>::vectorizable &&
+                   int(unpacket_traits<_LhsPacket>::size)==int(unpacket_traits<_RhsPacket>::size),
+    LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
+    RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
+    ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1
+  };
+
+  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
+  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
+  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
+};
+
+
 /* Optimized col-major matrix * vector product:
- * This algorithm processes 4 columns at onces that allows to both reduce
- * the number of load/stores of the result by a factor 4 and to reduce
- * the instruction dependency. Moreover, we know that all bands have the
- * same alignment pattern.
+ * This algorithm processes the matrix in vertical panels,
+ * which are then processed horizontally in chunks of 8*PacketSize x 1 vertical segments.
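[Editorial aside: the new comment above is terse, so here is a minimal scalar sketch of the traversal it describes — plain C++, not Eigen's packetized kernel, with one scalar accumulator standing in for an 8*PacketSize segment. All names are illustrative.]

#include <algorithm>
#include <cstddef>

// Sketch: y += alpha * A * x, A column-major with column stride lda.
// Columns are cut into panels of block_cols; within a panel each chunk of the
// result is accumulated in a register and written back to y exactly once.
void gemv_colmajor_sketch(const double* A, std::ptrdiff_t lda,
                          const double* x, double* y,
                          std::ptrdiff_t rows, std::ptrdiff_t cols, double alpha)
{
  const std::ptrdiff_t block_cols = 16;  // plays the role of the heuristic in the kernel
  for (std::ptrdiff_t j2 = 0; j2 < cols; j2 += block_cols) {
    const std::ptrdiff_t jend = std::min(j2 + block_cols, cols);
    for (std::ptrdiff_t i = 0; i < rows; ++i) {
      double c = 0;                           // register accumulator (a ResPacket in Eigen)
      for (std::ptrdiff_t j = j2; j < jend; ++j)
        c += A[i + j*lda] * x[j];
      y[i] += alpha * c;                      // single read-modify-write of the result
    }
  }
}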
 *
 * Mixing type logic: C += alpha * A * B
 *  |  A  |  B  |alpha| comments
 *  |real |cplx |cplx | no vectorization
 *  |real |cplx |real | alpha is converted to a cplx when calling the gemv function, no vectorization
 *  |cplx |real |cplx | invalid, the caller has to do tmp: = A * B; C += alpha*tmp
 *  |cplx |real |real | optimal case, vectorization possible via real-cplx mul
 *
- * Accesses to the matrix coefficients follow the following logic:
- *
- * - if all columns have the same alignment then
- *   - if the columns have the same alignment as the result vector, then easy! (-> AllAligned case)
- *   - otherwise perform unaligned loads only (-> NoneAligned case)
- * - otherwise
- *   - if even columns have the same alignment then
- *     // odd columns are guaranteed to have the same alignment too
- *     - if even or odd columns have the same alignment as the result, then
- *       // for a register size of 2 scalars, this is guarantee to be the case (e.g., SSE with double)
- *       - perform half aligned and half unaligned loads (-> EvenAligned case)
- *       - otherwise perform unaligned loads only (-> NoneAligned case)
- *   - otherwise, if the register size is 4 scalars (e.g., SSE with float) then
- *     - one over 4 consecutive columns is guaranteed to be aligned with the result vector,
- *       perform simple aligned loads for this column and aligned loads plus re-alignment for the other. (-> FirstAligned case)
- *       // this re-alignment is done by the palign function implemented for SSE in Eigen/src/Core/arch/SSE/PacketMath.h
- *     - otherwise,
- *       // if we get here, this means the register size is greater than 4 (e.g., AVX with floats),
- *       // we currently fall back to the NoneAligned case
- *
  * The same reasoning applies for the transposed case.
- *
- * The last case (PacketSize>4) could probably be improved by generalizing the FirstAligned case, but since we do not support AVX yet...
- * One might also wonder why in the EvenAligned case we perform unaligned loads instead of using the aligned-loads plus re-alignment
- * strategy as in the FirstAligned case. The reason is that we observed that unaligned loads on a 8 byte boundary are not too slow
- * compared to unaligned loads on a 4 byte boundary.
- *
  */
 template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
 struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
 {
+  typedef gemv_traits<LhsScalar,RhsScalar> Traits;
+  typedef gemv_traits<LhsScalar,RhsScalar,GEMVPacketHalf> HalfTraits;
+  typedef gemv_traits<LhsScalar,RhsScalar,GEMVPacketQuarter> QuarterTraits;
+
   typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
 
-enum {
-  Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
-              && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
-  LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
-  RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
-  ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
-};
+  typedef typename Traits::LhsPacket LhsPacket;
+  typedef typename Traits::RhsPacket RhsPacket;
+  typedef typename Traits::ResPacket ResPacket;
 
-typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
-typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
-typedef typename packet_traits<ResScalar>::type  _ResPacket;
+  typedef typename HalfTraits::LhsPacket LhsPacketHalf;
+  typedef typename HalfTraits::RhsPacket RhsPacketHalf;
+  typedef typename HalfTraits::ResPacket ResPacketHalf;
 
-typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
-typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
-typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
+  typedef typename QuarterTraits::LhsPacket LhsPacketQuarter;
+  typedef typename QuarterTraits::RhsPacket RhsPacketQuarter;
+  typedef typename QuarterTraits::ResPacket ResPacketQuarter;
 
-EIGEN_DONT_INLINE static void run(
+EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(
   Index rows, Index cols,
   const LhsMapper& lhs,
   const RhsMapper& rhs,
@@ -85,244 +105,187 @@ EIGEN_DONT_INLINE static void run(
 };
 
 template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
-EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
+EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
   Index rows, Index cols,
-  const LhsMapper& lhs,
+  const LhsMapper& alhs,
   const RhsMapper& rhs,
   ResScalar* res, Index resIncr,
   RhsScalar alpha)
 {
   EIGEN_UNUSED_VARIABLE(resIncr);
   eigen_internal_assert(resIncr==1);
-  #ifdef _EIGEN_ACCUMULATE_PACKETS
-  #error _EIGEN_ACCUMULATE_PACKETS has already been defined
-  #endif
-  #define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) \
-    pstore(&res[j], \
-           padd(pload<ResPacket>(&res[j]), \
-                padd( \
-                  padd(pcj.pmul(lhs0.template load<LhsPacket, Alignment0>(j),    ptmp0), \
-                       pcj.pmul(lhs1.template load<LhsPacket, Alignment13>(j),   ptmp1)), \
-                  padd(pcj.pmul(lhs2.template load<LhsPacket, Alignment2>(j),    ptmp2), \
-                       pcj.pmul(lhs3.template load<LhsPacket, Alignment13>(j),   ptmp3)) )))
-
-  typedef typename LhsMapper::VectorMapper LhsScalars;
+
+  // The following copy tells the compiler that lhs's attributes are not modified outside this function.
+  // This helps GCC to generate proper code.
+  LhsMapper lhs(alhs);
 
   conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
   conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
-  if(ConjugateRhs)
-    alpha = numext::conj(alpha);
-
-  enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned };
-  const Index columnsAtOnce = 4;
-  const Index peels = 2;
-  const Index LhsPacketAlignedMask = LhsPacketSize-1;
-  const Index ResPacketAlignedMask = ResPacketSize-1;
-//  const Index PeelAlignedMask = ResPacketSize*peels-1;
-  const Index size = rows;
+  conj_helper<LhsPacketHalf,RhsPacketHalf,ConjugateLhs,ConjugateRhs> pcj_half;
+  conj_helper<LhsPacketQuarter,RhsPacketQuarter,ConjugateLhs,ConjugateRhs> pcj_quarter;
 
   const Index lhsStride = lhs.stride();
-
-  // How many coeffs of the result do we have to skip to be aligned.
-  // Here we assume data are at least aligned on the base scalar type.
-  Index alignedStart = internal::first_default_aligned(res,size);
-  Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0;
-  const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;
-
-  const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
-  Index alignmentPattern = alignmentStep==0 ? AllAligned
-                         : alignmentStep==(LhsPacketSize/2) ? EvenAligned
-                         : FirstAligned;
-
-  // we cannot assume the first element is aligned because of sub-matrices
-  const Index lhsAlignmentOffset = lhs.firstAligned(size);
-
-  // find how many columns do we have to skip to be aligned with the result (if possible)
-  Index skipColumns = 0;
-  // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
-  if( (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == size) || (UIntPtr(res)%sizeof(ResScalar)) )
-  {
-    alignedSize = 0;
-    alignedStart = 0;
-    alignmentPattern = NoneAligned;
-  }
-  else if(LhsPacketSize > 4)
-  {
-    // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4.
-    // Currently, it seems to be better to perform unaligned loads anyway
-    alignmentPattern = NoneAligned;
-  }
-  else if (LhsPacketSize>1)
+  // TODO: for padded aligned inputs, we could enable aligned reads
+  enum { LhsAlignment = Unaligned,
+         ResPacketSize = Traits::ResPacketSize,
+         ResPacketSizeHalf = HalfTraits::ResPacketSize,
+         ResPacketSizeQuarter = QuarterTraits::ResPacketSize,
+         LhsPacketSize = Traits::LhsPacketSize,
+         HasHalf = (int)ResPacketSizeHalf < (int)ResPacketSize,
+         HasQuarter = (int)ResPacketSizeQuarter < (int)ResPacketSizeHalf
+  };
+
+  const Index n8 = rows-8*ResPacketSize+1;
+  const Index n4 = rows-4*ResPacketSize+1;
+  const Index n3 = rows-3*ResPacketSize+1;
+  const Index n2 = rows-2*ResPacketSize+1;
+  const Index n1 = rows-1*ResPacketSize+1;
+  const Index n_half = rows-1*ResPacketSizeHalf+1;
+  const Index n_quarter = rows-1*ResPacketSizeQuarter+1;
+
+  // TODO: improve the following heuristic:
+  const Index block_cols = cols<128 ? cols : (lhsStride*sizeof(LhsScalar)<32000?16:4);
+  ResPacket palpha = pset1<ResPacket>(alpha);
+  ResPacketHalf palpha_half = pset1<ResPacketHalf>(alpha);
+  ResPacketQuarter palpha_quarter = pset1<ResPacketQuarter>(alpha);
+
+  for(Index j2=0; j2<cols; j2+=block_cols)
   {
-    // eigen_internal_assert(size_t(firstLhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize);
-
-    while (skipColumns<LhsPacketSize &&
-           alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%LhsPacketSize))
-      ++skipColumns;
-    if (skipColumns==LhsPacketSize)
+    Index jend = numext::mini(j2+block_cols,cols);
+    Index i=0;
+    for(; i<n8; i+=ResPacketSize*8)
     {
-      // nothing can be aligned, no need to skip any column
-      alignmentPattern = NoneAligned;
-      skipColumns = 0;
+      ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
+                c1 = pset1<ResPacket>(ResScalar(0)),
+                c2 = pset1<ResPacket>(ResScalar(0)),
+                c3 = pset1<ResPacket>(ResScalar(0)),
+                c4 = pset1<ResPacket>(ResScalar(0)),
+                c5 = pset1<ResPacket>(ResScalar(0)),
+                c6 = pset1<ResPacket>(ResScalar(0)),
+                c7 = pset1<ResPacket>(ResScalar(0));
+
+      for(Index j=j2; j<jend; j+=1)
+      {
+        RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
+        c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
+        c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
+        c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*2,j),b0,c2);
+        c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*3,j),b0,c3);
+        c4 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*4,j),b0,c4);
+        c5 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*5,j),b0,c5);
+        c6 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*6,j),b0,c6);
+        c7 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*7,j),b0,c7);
+      }
+      pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
+      pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
+      pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu<ResPacket>(res+i+ResPacketSize*2)));
+      pstoreu(res+i+ResPacketSize*3, pmadd(c3,palpha,ploadu<ResPacket>(res+i+ResPacketSize*3)));
+      pstoreu(res+i+ResPacketSize*4, pmadd(c4,palpha,ploadu<ResPacket>(res+i+ResPacketSize*4)));
+      pstoreu(res+i+ResPacketSize*5, pmadd(c5,palpha,ploadu<ResPacket>(res+i+ResPacketSize*5)));
+      pstoreu(res+i+ResPacketSize*6, pmadd(c6,palpha,ploadu<ResPacket>(res+i+ResPacketSize*6)));
+      pstoreu(res+i+ResPacketSize*7, pmadd(c7,palpha,ploadu<ResPacket>(res+i+ResPacketSize*7)));
     }
-    else
+    if(i<n4)
     {
-      skipColumns = (std::min)(skipColumns,cols);
-      // note that the skipped columns are processed later.
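[Editorial aside: the bounds above follow a fixed convention — n8 = rows-8*ResPacketSize+1 makes "i < n8" equivalent to "one more full 8-packet chunk fits", and the Half/Quarter machinery then narrows the vector width for the tail before falling back to scalars. A small self-contained check of both ideas; the packet sizes are illustrative assumptions (e.g. floats with AVX512).]

#include <cassert>
#include <cstdio>

int main()
{
  const int Full = 16, Half = 8, Quarter = 4;          // illustrative packet sizes
  const bool HasHalf = Half < Full, HasQuarter = Quarter < Half;
  const int rows = 23;
  // "i < rows - 8*P + 1" is exactly "i + 8*P <= rows", i.e. a full chunk still fits:
  for (int i = 0; i < rows; ++i)
    assert((i < rows - 8*Quarter + 1) == (i + 8*Quarter <= rows));
  // Tail narrowing: full packets first, then at most one half and one quarter packet.
  int i = 0;
  while (i + Full <= rows)               i += Full;
  if (HasHalf    && i + Half    <= rows) i += Half;
  if (HasQuarter && i + Quarter <= rows) i += Quarter;
  std::printf("%d of %d rows vectorized, %d left for scalars\n", i, rows, rows - i);
  return 0;
}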
-    }
-    /*    eigen_internal_assert(  (alignmentPattern==NoneAligned)
-                      || (skipColumns + columnsAtOnce >= cols)
-                      || LhsPacketSize > size
-                      || (size_t(firstLhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);*/
-  }
-  else if(Vectorizable)
-  {
-    alignedStart = 0;
-    alignedSize = size;
-    alignmentPattern = AllAligned;
-  }
-
-  const Index offset1 = (FirstAligned && alignmentStep==1)?3:1;
-  const Index offset3 = (FirstAligned && alignmentStep==1)?1:3;
+      ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
+                c1 = pset1<ResPacket>(ResScalar(0)),
+                c2 = pset1<ResPacket>(ResScalar(0)),
+                c3 = pset1<ResPacket>(ResScalar(0));
 
-  Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
-  for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce)
-  {
-    RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs(i, 0)),
-              ptmp1 = pset1<RhsPacket>(alpha*rhs(i+offset1, 0)),
-              ptmp2 = pset1<RhsPacket>(alpha*rhs(i+2, 0)),
-              ptmp3 = pset1<RhsPacket>(alpha*rhs(i+offset3, 0));
+      for(Index j=j2; j<jend; j+=1)
+      {
+        RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
+        c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
+        c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
+        c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*2,j),b0,c2);
+        c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*3,j),b0,c3);
+      }
+      pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
+      pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
+      pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu<ResPacket>(res+i+ResPacketSize*2)));
+      pstoreu(res+i+ResPacketSize*3, pmadd(c3,palpha,ploadu<ResPacket>(res+i+ResPacketSize*3)));
 
-    // this helps a lot generating better binary code
-    const LhsScalars lhs0 = lhs.getVectorMapper(0, i+0),   lhs1 = lhs.getVectorMapper(0, i+offset1),
-                     lhs2 = lhs.getVectorMapper(0, i+2),   lhs3 = lhs.getVectorMapper(0, i+offset3);
+      i+=ResPacketSize*4;
+    }
+    if(i<n3)
+    {
+      ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
+                c1 = pset1<ResPacket>(ResScalar(0)),
+                c2 = pset1<ResPacket>(ResScalar(0));
 
-    if (Vectorizable)
+      for(Index j=j2; j<jend; j+=1)
+      {
+        RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
+        c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
+        c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
+        c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*2,j),b0,c2);
+      }
+      pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
+      pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
+      pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu<ResPacket>(res+i+ResPacketSize*2)));
 
+      i+=ResPacketSize*3;
+    }
+    if(i<n2)
     {
-      /* explicit vectorization */
-      // process initial unaligned coeffs
-      for (Index j=0; j<alignedStart; ++j)
+      ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
+                c1 = pset1<ResPacket>(ResScalar(0));
+
+      for(Index j=j2; j<jend; j+=1)
       {
-        res[j] = cj.pmadd(lhs0(j), pfirst(ptmp0), res[j]);
-        res[j] = cj.pmadd(lhs1(j), pfirst(ptmp1), res[j]);
-        res[j] = cj.pmadd(lhs2(j), pfirst(ptmp2), res[j]);
-        res[j] = cj.pmadd(lhs3(j), pfirst(ptmp3), res[j]);
+        RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
+        c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
+        c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
       }
-
-      if (alignedSize>alignedStart)
+      pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
+      pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
+      i+=ResPacketSize*2;
+    }
+    if(i<n1)
+    {
+      ResPacket c0 = pset1<ResPacket>(ResScalar(0));
+      for(Index j=j2; j<jend; j+=1)
       {
-        switch(alignmentPattern)
-        {
-          case AllAligned:
-            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
-              _EIGEN_ACCUMULATE_PACKETS(Aligned,Aligned,Aligned);
-            break;
-          case EvenAligned:
-            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
-              _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Aligned);
-            break;
-          case FirstAligned:
-          {
-            Index j = alignedStart;
-            if(peels>1)
-            {
-              LhsPacket A00, A01, A02, A03, A10, A11, A12, A13;
-              ResPacket T0, T1;
-
-              A01 = lhs1.template load<LhsPacket, Aligned>(alignedStart-1);
-              A02 = lhs2.template load<LhsPacket, Aligned>(alignedStart-2);
-              A03 = lhs3.template load<LhsPacket, Aligned>(alignedStart-3);
-
-              for (; j<peeledSize; j+=peels*ResPacketSize)
-              {
-                A11 = lhs1.template load<LhsPacket, Aligned>(j-1+LhsPacketSize);  palign<1>(A01,A11);
-                A12 = lhs2.template load<LhsPacket, Aligned>(j-2+LhsPacketSize);  palign<2>(A02,A12);
-                A13 = lhs3.template load<LhsPacket, Aligned>(j-3+LhsPacketSize);  palign<3>(A03,A13);
-
-                A00 = lhs0.template load<LhsPacket, Aligned>(j);
-                A10 = lhs0.template load<LhsPacket, Aligned>(j+LhsPacketSize);
-                T0  = pcj.pmadd(A00, ptmp0, pload<ResPacket>(&res[j]));
-                T1  = pcj.pmadd(A10, ptmp0, pload<ResPacket>(&res[j+ResPacketSize]));
-
-                T0  = pcj.pmadd(A01, ptmp1, T0);
-                A01 = lhs1.template load<LhsPacket, Aligned>(j-1+2*LhsPacketSize);  palign<1>(A11,A01);
-                T0  = pcj.pmadd(A02, ptmp2, T0);
-                A02 = lhs2.template load<LhsPacket, Aligned>(j-2+2*LhsPacketSize);  palign<2>(A12,A02);
-                T0  = pcj.pmadd(A03, ptmp3, T0);
-                pstore(&res[j],T0);
-                A03 = lhs3.template load<LhsPacket, Aligned>(j-3+2*LhsPacketSize);  palign<3>(A13,A03);
-                T1  = pcj.pmadd(A11, ptmp1, T1);
-                T1  = pcj.pmadd(A12, ptmp2, T1);
-                T1  = pcj.pmadd(A13, ptmp3, T1);
-                pstore(&res[j+ResPacketSize],T1);
-              }
-            }
-            for (; j<alignedSize; j+=ResPacketSize)
-              _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Unaligned);
-            break;
-          }
-          default:
-            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
-              _EIGEN_ACCUMULATE_PACKETS(Unaligned,Unaligned,Unaligned);
-            break;
-        }
+        RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
+        c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
       }
-    } // end explicit vectorization
-
-    /* process remaining coeffs (or all if there is no explicit vectorization) */
-    for (Index j=alignedSize; j<size; ++j)
+      pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
+      i+=ResPacketSize;
+    }
+    if(HasHalf && i<n_half)
     {
-      res[j] = cj.pmadd(lhs0(j), pfirst(ptmp0), res[j]);
-      res[j] = cj.pmadd(lhs1(j), pfirst(ptmp1), res[j]);
-      res[j] = cj.pmadd(lhs2(j), pfirst(ptmp2), res[j]);
-      res[j] = cj.pmadd(lhs3(j), pfirst(ptmp3), res[j]);
+      ResPacketHalf c0 = pset1<ResPacketHalf>(ResScalar(0));
+      for(Index j=j2; j<jend; j+=1)
+      {
+        RhsPacketHalf b0 = pset1<RhsPacketHalf>(rhs(j,0));
+        c0 = pcj_half.pmadd(lhs.template load<LhsPacketHalf,LhsAlignment>(i+0,j),b0,c0);
+      }
+      pstoreu(res+i+ResPacketSizeHalf*0, pmadd(c0,palpha_half,ploadu<ResPacketHalf>(res+i+ResPacketSizeHalf*0)));
+      i+=ResPacketSizeHalf;
     }
-  }
-
-  // process remaining first and last columns (at most columnsAtOnce-1)
-  Index end = cols;
-  Index start = columnBound;
-  do
-  {
-    for (Index k=start; k<end; ++k)
+    if(HasQuarter && i<n_quarter)
     {
-      RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs(k, 0));
-      const LhsScalars lhs0 = lhs.getVectorMapper(0, k);
-
-      if (Vectorizable)
+      ResPacketQuarter c0 = pset1<ResPacketQuarter>(ResScalar(0));
+      for(Index j=j2; j<jend; j+=1)
       {
-        /* explicit vectorization */
-        // process first unaligned result's coeffs
-        for (Index j=0; j<alignedStart; ++j)
-          res[j] += cj.pmul(lhs0(j), pfirst(ptmp0));
-        // process aligned result's coeffs
-        if (lhs0.template aligned<LhsPacket>(alignedStart))
-          for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
-            pstore(&res[i], pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(i), ptmp0, pload<ResPacket>(&res[i])));
-        else
-          for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
-            pstore(&res[i], pcj.pmadd(lhs0.template load<LhsPacket, Unaligned>(i), ptmp0, pload<ResPacket>(&res[i])));
+        RhsPacketQuarter b0 = pset1<RhsPacketQuarter>(rhs(j,0));
+        c0 = pcj_quarter.pmadd(lhs.template load<LhsPacketQuarter,LhsAlignment>(i+0,j),b0,c0);
       }
-
-      // process remaining scalars (or all if no explicit vectorization)
-      for (Index i=alignedSize; i<size; ++i)
-        res[i] += cj.pmul(lhs0(i), pfirst(ptmp0));
+      pstoreu(res+i+ResPacketSizeQuarter*0, pmadd(c0,palpha_quarter,ploadu<ResPacketQuarter>(res+i+ResPacketSizeQuarter*0)));
+      i+=ResPacketSizeQuarter;
     }
-    if (skipColumns)
+    for(;i<rows;++i)
     {
-      start = 0;
-      end = skipColumns;
-      skipColumns = 0;
+      ResScalar c0(0);
+      for(Index j=j2; j<jend; j+=1)
+        c0 += cj.pmul(lhs(i,j), rhs(j,0));
+      res[i] += alpha*c0;
     }
-    else
-      break;
-  } while(Vectorizable);
-  #undef _EIGEN_ACCUMULATE_PACKETS
+  }
 }
 
 /* Optimized row-major matrix * vector product:
- * This algorithm processes 4 rows at onces that allows to both reduce
+ * This algorithm processes 4 rows at once, which allows us to both reduce
  * the number of load/stores of the result by a factor 4 and to reduce
  * the instruction dependency. Moreover, we know that all bands have the
  * same alignment pattern.
@@ -334,25 +297,25 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,C
 template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
 struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
 {
-typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
-
-enum {
-  Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
-              && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
-  LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
-  RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
-  ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
-};
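[Editorial aside: both the column-major and row-major kernels now begin with "LhsMapper lhs(alhs);", as the in-code comment explains. Copying the mapper into a local turns its members into values the compiler can prove are loop-invariant. A minimal illustration of the idiom with a hypothetical mapper type, not Eigen's actual blas_data_mapper:]

#include <cstddef>

struct Mapper {                // hypothetical stand-in for a blas-style data mapper
  const double* data;
  std::ptrdiff_t stride;
};

double sum_column(const Mapper& amap, std::ptrdiff_t rows, std::ptrdiff_t col)
{
  Mapper map(amap);            // local copy: data/stride cannot be modified elsewhere
  double s = 0;
  for (std::ptrdiff_t i = 0; i < rows; ++i)
    s += map.data[i + col * map.stride];   // stride need not be re-loaded per iteration
  return s;
}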
+  typedef gemv_traits<LhsScalar,RhsScalar> Traits;
+  typedef gemv_traits<LhsScalar,RhsScalar,GEMVPacketHalf> HalfTraits;
+  typedef gemv_traits<LhsScalar,RhsScalar,GEMVPacketQuarter> QuarterTraits;
+
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+
+  typedef typename Traits::LhsPacket LhsPacket;
+  typedef typename Traits::RhsPacket RhsPacket;
+  typedef typename Traits::ResPacket ResPacket;
 
-typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
-typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
-typedef typename packet_traits<ResScalar>::type  _ResPacket;
+  typedef typename HalfTraits::LhsPacket LhsPacketHalf;
+  typedef typename HalfTraits::RhsPacket RhsPacketHalf;
+  typedef typename HalfTraits::ResPacket ResPacketHalf;
 
-typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
-typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
-typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
+  typedef typename QuarterTraits::LhsPacket LhsPacketQuarter;
+  typedef typename QuarterTraits::RhsPacket RhsPacketQuarter;
+  typedef typename QuarterTraits::ResPacket ResPacketQuarter;
 
-EIGEN_DONT_INLINE static void run(
+EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(
   Index rows, Index cols,
   const LhsMapper& lhs,
   const RhsMapper& rhs,
@@ -361,255 +324,191 @@ EIGEN_DONT_INLINE static void run(
 };
 
 template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
-EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
+EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
   Index rows, Index cols,
-  const LhsMapper& lhs,
+  const LhsMapper& alhs,
   const RhsMapper& rhs,
   ResScalar* res, Index resIncr,
   ResScalar alpha)
 {
-  eigen_internal_assert(rhs.stride()==1);
-
-  #ifdef _EIGEN_ACCUMULATE_PACKETS
-  #error _EIGEN_ACCUMULATE_PACKETS has already been defined
-  #endif
-
-  #define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) {\
-    RhsPacket b = rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0); \
-    ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Alignment0>(j), b, ptmp0); \
-    ptmp1 = pcj.pmadd(lhs1.template load<LhsPacket, Alignment13>(j), b, ptmp1); \
-    ptmp2 = pcj.pmadd(lhs2.template load<LhsPacket, Alignment2>(j), b, ptmp2); \
-    ptmp3 = pcj.pmadd(lhs3.template load<LhsPacket, Alignment13>(j), b, ptmp3); }
+  // The following copy tells the compiler that lhs's attributes are not modified outside this function.
+  // This helps GCC to generate proper code.
+  LhsMapper lhs(alhs);
 
+  eigen_internal_assert(rhs.stride()==1);
   conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
   conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
-
-  typedef typename LhsMapper::VectorMapper LhsScalars;
-
-  enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 };
-  const Index rowsAtOnce = 4;
-  const Index peels = 2;
-  const Index RhsPacketAlignedMask = RhsPacketSize-1;
-  const Index LhsPacketAlignedMask = LhsPacketSize-1;
-  const Index depth = cols;
-  const Index lhsStride = lhs.stride();
-
-  // How many coeffs of the result do we have to skip to be aligned.
-  // Here we assume data are at least aligned on the base scalar type
-  // if that's not the case then vectorization is discarded, see below.
-  Index alignedStart = rhs.firstAligned(depth);
-  Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0;
-  const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;
-
-  const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
-  Index alignmentPattern = alignmentStep==0 ? AllAligned
-                         : alignmentStep==(LhsPacketSize/2) ? EvenAligned
-                         : FirstAligned;
-
-  // we cannot assume the first element is aligned because of sub-matrices
-  const Index lhsAlignmentOffset = lhs.firstAligned(depth);
-  const Index rhsAlignmentOffset = rhs.firstAligned(rows);
-
-  // find how many rows do we have to skip to be aligned with rhs (if possible)
-  Index skipRows = 0;
-  // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
-  if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) ||
-      (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == depth) ||
-      (rhsAlignmentOffset < 0) || (rhsAlignmentOffset == rows) )
-  {
-    alignedSize = 0;
-    alignedStart = 0;
-    alignmentPattern = NoneAligned;
-  }
-  else if(LhsPacketSize > 4)
-  {
-    // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4.
-    alignmentPattern = NoneAligned;
-  }
-  else if (LhsPacketSize>1)
+  conj_helper<LhsPacketHalf,RhsPacketHalf,ConjugateLhs,ConjugateRhs> pcj_half;
+  conj_helper<LhsPacketQuarter,RhsPacketQuarter,ConjugateLhs,ConjugateRhs> pcj_quarter;
+
+  // TODO: fine tune the following heuristic. The rationale is that if the matrix is very large,
+  // processing 8 rows at once might be counterproductive wrt cache.
+  const Index n8 = lhs.stride()*sizeof(LhsScalar)>32000 ? 0 : rows-7;
+  const Index n4 = rows-3;
+  const Index n2 = rows-1;
+
+  // TODO: for padded aligned inputs, we could enable aligned reads
+  enum { LhsAlignment = Unaligned,
+         ResPacketSize = Traits::ResPacketSize,
+         ResPacketSizeHalf = HalfTraits::ResPacketSize,
+         ResPacketSizeQuarter = QuarterTraits::ResPacketSize,
+         LhsPacketSize = Traits::LhsPacketSize,
+         LhsPacketSizeHalf = HalfTraits::LhsPacketSize,
+         LhsPacketSizeQuarter = QuarterTraits::LhsPacketSize,
+         HasHalf = (int)ResPacketSizeHalf < (int)ResPacketSize,
+         HasQuarter = (int)ResPacketSizeQuarter < (int)ResPacketSizeHalf
+  };
+
+  Index i=0;
+  for(; i<n8; i+=8)
   {
-    // eigen_internal_assert(size_t(firstLhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth<LhsPacketSize);
-
-    while (skipRows<LhsPacketSize &&
-           alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%LhsPacketSize))
-      ++skipRows;
-    if (skipRows==LhsPacketSize)
+    ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
+              c1 = pset1<ResPacket>(ResScalar(0)),
+              c2 = pset1<ResPacket>(ResScalar(0)),
+              c3 = pset1<ResPacket>(ResScalar(0)),
+              c4 = pset1<ResPacket>(ResScalar(0)),
+              c5 = pset1<ResPacket>(ResScalar(0)),
+              c6 = pset1<ResPacket>(ResScalar(0)),
+              c7 = pset1<ResPacket>(ResScalar(0));
+
+    Index j=0;
+    for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
     {
-      // nothing can be aligned, no need to skip any column
-      alignmentPattern = NoneAligned;
-      skipRows = 0;
+      RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j,0);
+
+      c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
+      c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+1,j),b0,c1);
+      c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+2,j),b0,c2);
+      c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+3,j),b0,c3);
+      c4 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+4,j),b0,c4);
+      c5 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+5,j),b0,c5);
+      c6 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+6,j),b0,c6);
+      c7 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+7,j),b0,c7);
     }
-    else
+    ResScalar cc0 = predux(c0);
+    ResScalar cc1 = predux(c1);
+    ResScalar cc2 = predux(c2);
+    ResScalar cc3 = predux(c3);
+    ResScalar cc4 = predux(c4);
+    ResScalar cc5 = predux(c5);
+    ResScalar cc6 = predux(c6);
+    ResScalar cc7 = predux(c7);
+    for(; j<cols; ++j)
     {
-      skipRows = (std::min)(skipRows,Index(rows));
-      // note that the skipped rows are processed later.
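[Editorial aside: the row-major kernel above keeps one packet accumulator per row, reduces it horizontally only once the vectorized part of the row is done, and finishes the remainder with scalars. A plain-C++ stand-in for that per-row pattern; P plays the role of LhsPacketSize and the explicit sum plays the role of predux().]

#include <cstddef>

double row_dot_sketch(const double* row, const double* x, std::ptrdiff_t cols)
{
  const std::ptrdiff_t P = 4;             // stand-in for LhsPacketSize
  double c[4] = {0, 0, 0, 0};             // stand-in for a ResPacket accumulator
  std::ptrdiff_t j = 0;
  for (; j + P <= cols; j += P)           // vectorized part of the row
    for (std::ptrdiff_t k = 0; k < P; ++k)
      c[k] += row[j + k] * x[j + k];
  double cc = c[0] + c[1] + c[2] + c[3];  // horizontal reduction, what predux() does
  for (; j < cols; ++j)                   // scalar tail
    cc += row[j] * x[j];
  return cc;                              // caller then does res[i*resIncr] += alpha*cc
}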
      RhsScalar b0 = rhs(j,0);

      cc0 += cj.pmul(lhs(i+0,j), b0);
      cc1 += cj.pmul(lhs(i+1,j), b0);
      cc2 += cj.pmul(lhs(i+2,j), b0);
      cc3 += cj.pmul(lhs(i+3,j), b0);
      cc4 += cj.pmul(lhs(i+4,j), b0);
      cc5 += cj.pmul(lhs(i+5,j), b0);
      cc6 += cj.pmul(lhs(i+6,j), b0);
      cc7 += cj.pmul(lhs(i+7,j), b0);
    }
    res[(i+0)*resIncr] += alpha*cc0;
    res[(i+1)*resIncr] += alpha*cc1;
    res[(i+2)*resIncr] += alpha*cc2;
    res[(i+3)*resIncr] += alpha*cc3;
    res[(i+4)*resIncr] += alpha*cc4;
    res[(i+5)*resIncr] += alpha*cc5;
    res[(i+6)*resIncr] += alpha*cc6;
    res[(i+7)*resIncr] += alpha*cc7;
  }
  for(; i<n4; i+=4)
  {
    ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
              c1 = pset1<ResPacket>(ResScalar(0)),
              c2 = pset1<ResPacket>(ResScalar(0)),
              c3 = pset1<ResPacket>(ResScalar(0));

    Index j=0;
    for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
    {
      RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j,0);

      c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
      c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+1,j),b0,c1);
      c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+2,j),b0,c2);
      c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+3,j),b0,c3);
    }
    ResScalar cc0 = predux(c0);
    ResScalar cc1 = predux(c1);
    ResScalar cc2 = predux(c2);
    ResScalar cc3 = predux(c3);
    for(; j<cols; ++j)
    {
      RhsScalar b0 = rhs(j,0);

      cc0 += cj.pmul(lhs(i+0,j), b0);
      cc1 += cj.pmul(lhs(i+1,j), b0);
      cc2 += cj.pmul(lhs(i+2,j), b0);
      cc3 += cj.pmul(lhs(i+3,j), b0);
    }
    res[(i+0)*resIncr] += alpha*cc0;
    res[(i+1)*resIncr] += alpha*cc1;
    res[(i+2)*resIncr] += alpha*cc2;
    res[(i+3)*resIncr] += alpha*cc3;
  }
  for(; i<n2; i+=2)
  {
    ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
              c1 = pset1<ResPacket>(ResScalar(0));

    Index j=0;
    for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
    {
      RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j,0);

      c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
      c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+1,j),b0,c1);
    }
    ResScalar cc0 = predux(c0);
    ResScalar cc1 = predux(c1);
    for(; j<cols; ++j)
    {
      RhsScalar b0 = rhs(j,0);

      cc0 += cj.pmul(lhs(i+0,j), b0);
      cc1 += cj.pmul(lhs(i+1,j), b0);
    }
    res[(i+0)*resIncr] += alpha*cc0;
    res[(i+1)*resIncr] += alpha*cc1;
  }
  for(; i<rows; ++i)
  {
    ResPacket c0 = pset1<ResPacket>(ResScalar(0));
    ResPacketHalf c0_h = pset1<ResPacketHalf>(ResScalar(0));
    ResPacketQuarter c0_q = pset1<ResPacketQuarter>(ResScalar(0));
    Index j=0;
    for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
    {
      RhsPacket b0 = rhs.template load<RhsPacket,Unaligned>(j,0);
      c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i,j),b0,c0);
    }
    ResScalar cc0 = predux(c0);
    if (HasHalf) {
      for(; j+LhsPacketSizeHalf<=cols; j+=LhsPacketSizeHalf)
      {
        RhsPacketHalf b0 = rhs.template load<RhsPacketHalf,Unaligned>(j,0);
        c0_h = pcj_half.pmadd(lhs.template load<LhsPacketHalf,LhsAlignment>(i,j),b0,c0_h);
      }
      cc0 += predux(c0_h);
    }
    if (HasQuarter) {
      for(; j+LhsPacketSizeQuarter<=cols; j+=LhsPacketSizeQuarter)
      {
        RhsPacketQuarter b0 = rhs.template load<RhsPacketQuarter,Unaligned>(j,0);
        c0_q = pcj_quarter.pmadd(lhs.template load<LhsPacketQuarter,LhsAlignment>(i,j),b0,c0_q);
      }
      cc0 += predux(c0_q);
    }
    for(; j<cols; ++j)
    {
      cc0 += cj.pmul(lhs(i,j), rhs(j,0));
    }
    res[i*resIncr] += alpha*cc0;
  }
}

} // end namespace internal
diff --git a/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h b/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h
index e3a5d5892..6e36c2b3c 100644
--- a/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h
+++ b/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h
@@ -85,7 +85,7 @@ EIGEN_BLAS_GEMV_SPECIALIZE(float)
 EIGEN_BLAS_GEMV_SPECIALIZE(dcomplex)
 EIGEN_BLAS_GEMV_SPECIALIZE(scomplex)
 
-#define EIGEN_BLAS_GEMV_SPECIALIZATION(EIGTYPE,BLASTYPE,BLASPREFIX) \
+#define EIGEN_BLAS_GEMV_SPECIALIZATION(EIGTYPE,BLASTYPE,BLASFUNC) \
 template<typename Index, int LhsStorageOrder, bool ConjugateLhs, bool ConjugateRhs> \
 struct general_matrix_vector_product_gemv<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,ConjugateRhs> \
 { \
@@ -113,14 +113,21 @@ static void run( \
     x_ptr=x_tmp.data(); \
     incx=1; \
   } else x_ptr=rhs; \
-  BLASPREFIX##gemv_(&trans, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, &numext::real_ref(beta), (BLASTYPE*)res, &incy); \
+  BLASFUNC(&trans, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &incy); \
 }\
 };
 
-EIGEN_BLAS_GEMV_SPECIALIZATION(double,   double, d)
-EIGEN_BLAS_GEMV_SPECIALIZATION(float,    float,  s)
-EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, double, z)
-EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, float,  c)
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_GEMV_SPECIALIZATION(double,   double,        dgemv)
+EIGEN_BLAS_GEMV_SPECIALIZATION(float,    float,         sgemv)
+EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, MKL_Complex16, zgemv)
+EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, MKL_Complex8 , cgemv)
+#else
+EIGEN_BLAS_GEMV_SPECIALIZATION(double,   double, dgemv_)
+EIGEN_BLAS_GEMV_SPECIALIZATION(float,    float,  sgemv_)
+EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, double, zgemv_)
+EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, float,  cgemv_)
+#endif
 
 } // end namespace internal
diff --git a/Eigen/src/Core/products/Parallelizer.h b/Eigen/src/Core/products/Parallelizer.h
index c0ddc0c06..8f91879e4 100644
--- a/Eigen/src/Core/products/Parallelizer.h
+++ b/Eigen/src/Core/products/Parallelizer.h
@@ -10,7 +10,9 @@
 #ifndef EIGEN_PARALLELIZER_H
 #define EIGEN_PARALLELIZER_H
 
+#if EIGEN_HAS_CXX11_ATOMIC
 #include <atomic>
+#endif
 
 namespace Eigen {
 
@@ -19,7 +21,8 @@ namespace internal {
 /** \internal */
 inline void manage_multi_threading(Action action, int* v)
 {
-  static EIGEN_UNUSED int m_maxThreads = -1;
+  static int m_maxThreads = -1;
+  EIGEN_UNUSED_VARIABLE(m_maxThreads)
 
   if(action==SetAction)
   {
@@ -77,8 +80,17 @@ template<typename Index> struct GemmParallelInfo
 {
   GemmParallelInfo() : sync(-1), users(0), lhs_start(0), lhs_length(0) {}
 
+  // volatile is not enough on all architectures (see bug 1572)
+  // to guarantee that when thread A says to thread B that it is
+  // done with packing a block, then all writes have been really
+  // carried out... C++11 memory model+atomic guarantees this.
+#if EIGEN_HAS_CXX11_ATOMIC
   std::atomic<Index> sync;
   std::atomic<int> users;
+#else
+  Index volatile sync;
+  int volatile users;
+#endif
 
   Index lhs_start;
   Index lhs_length;
@@ -89,11 +101,14 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth,
 {
   // TODO when EIGEN_USE_BLAS is defined,
   // we should still enable OMP for other scalar types
-#if !(defined (EIGEN_HAS_OPENMP)) || defined (EIGEN_USE_BLAS)
+  // Without C++11, we have to disable GEMM's parallelization on
+  // non x86 architectures because there volatile is not enough for our purpose.
+  // See bug 1572.
+#if (! defined(EIGEN_HAS_OPENMP)) || defined(EIGEN_USE_BLAS) || ((!EIGEN_HAS_CXX11_ATOMIC) && !(EIGEN_ARCH_i386_OR_x86_64))
   // FIXME the transpose variable is only needed to properly split
   // the matrix product when multithreading is enabled. This is a temporary
   // fix to support row-major destination matrices. This whole
-  // parallelizer mechanism has to be redisigned anyway.
+  // parallelizer mechanism has to be redesigned anyway.
  EIGEN_UNUSED_VARIABLE(depth);
  EIGEN_UNUSED_VARIABLE(transpose);
  func(0,rows, 0,cols);
@@ -114,12 +129,12 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth,
  double work = static_cast<double>(rows) * static_cast<double>(cols) *
      static_cast<double>(depth);
  double kMinTaskSize = 50000;  // FIXME improve this heuristic.
-  pb_max_threads = std::max<Index>(1, std::min<Index>(pb_max_threads, work / kMinTaskSize));
+  pb_max_threads = std::max<Index>(1, std::min<Index>(pb_max_threads, static_cast<Index>( work / kMinTaskSize ) ));
 
  // compute the number of threads we are going to use
  Index threads = std::min<Index>(nbThreads(), pb_max_threads);
 
-  // if multi-threading is explicitely disabled, not useful, or if we already are in a parallel session,
+  // if multi-threading is explicitly disabled, not useful, or if we already are in a parallel session,
  // then abort multi-threading
  // FIXME omp_get_num_threads()>1 only works for openmp, what if the user does not use openmp?
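[Editorial aside: the std::atomic switch above (bug 1572) is about publication, not tearing — a plain or volatile store of sync does not order the preceding writes, so another thread could observe "block done" before the packed data itself. A minimal sketch of the release/acquire handshake the struct relies on; the payload and names are placeholders, not Eigen code.]

#include <atomic>
#include <thread>

std::atomic<int> sync(-1);
int packed[64];                               // placeholder for a packed block

void packer(int k)
{
  for (int& v : packed) v = k;                // write the whole block first...
  sync.store(k, std::memory_order_release);   // ...then publish it
}

void consumer(int k)
{
  while (sync.load(std::memory_order_acquire) != k) {}  // spin until block k is published
  // every write to packed[] made before the release store is now visible here
}

int main()
{
  std::thread a(packer, 1), b(consumer, 1);
  a.join(); b.join();
  return 0;
}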
  if((!Condition) || (threads==1) || (omp_get_num_threads()>1))
diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
index da6f82abc..33ecf10f6 100644
--- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
+++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
@@ -45,14 +45,23 @@ struct symm_pack_lhs
   }
   void operator()(Scalar* blockA, const Scalar* _lhs, Index lhsStride, Index cols, Index rows)
   {
-    enum { PacketSize = packet_traits<Scalar>::size };
+    typedef typename unpacket_traits<typename packet_traits<Scalar>::type>::half HalfPacket;
+    typedef typename unpacket_traits<typename unpacket_traits<typename packet_traits<Scalar>::type>::half>::half QuarterPacket;
+    enum { PacketSize = packet_traits<Scalar>::size,
+           HalfPacketSize = unpacket_traits<HalfPacket>::size,
+           QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
+           HasHalf = (int)HalfPacketSize < (int)PacketSize,
+           HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize};
+
     const_blas_data_mapper<Scalar,Index,StorageOrder> lhs(_lhs,lhsStride);
     Index count = 0;
     //Index peeled_mc3 = (rows/Pack1)*Pack1;
     const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
     const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
-    const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
+    const Index peeled_mc1 = Pack1>=1*PacketSize ? peeled_mc2+((rows-peeled_mc2)/(1*PacketSize))*(1*PacketSize) : 0;
+    const Index peeled_mc_half = Pack1>=HalfPacketSize ? peeled_mc1+((rows-peeled_mc1)/(HalfPacketSize))*(HalfPacketSize) : 0;
+    const Index peeled_mc_quarter = Pack1>=QuarterPacketSize ? peeled_mc_half+((rows-peeled_mc_half)/(QuarterPacketSize))*(QuarterPacketSize) : 0;
 
     if(Pack1>=3*PacketSize)
       for(Index i=0; i<peeled_mc3; i+=3*PacketSize)
@@ -66,8 +75,16 @@ struct symm_pack_lhs
       for(Index i=peeled_mc2; i<peeled_mc1; i+=1*PacketSize)
         pack<1*PacketSize>(blockA, lhs, cols, i, count);
 
+    if(HasHalf && Pack1>=HalfPacketSize)
+      for(Index i=peeled_mc1; i<peeled_mc_half; i+=HalfPacketSize)
+        pack<HalfPacketSize>(blockA, lhs, cols, i, count);
+
+    if(HasQuarter && Pack1>=QuarterPacketSize)
+      for(Index i=peeled_mc_half; i<peeled_mc_quarter; i+=QuarterPacketSize)
+        pack<QuarterPacketSize>(blockA, lhs, cols, i, count);
+
     // do the same with mr==1
-    for(Index i=peeled_mc1; i<rows; i++)
+    for(Index i=peeled_mc_quarter; i<rows; i++)
     {
       for(Index k=0; k<i; k++)
         blockA[count++] = lhs(i, k);              // normal
@@ -277,20 +294,21 @@ struct symm_pack_rhs
 template <typename Scalar, typename Index,
           int LhsStorageOrder, bool LhsSelfAdjoint, bool ConjugateLhs,
           int RhsStorageOrder, bool RhsSelfAdjoint, bool ConjugateRhs,
-          int ResStorageOrder>
+          int ResStorageOrder, int ResInnerStride>
 struct product_selfadjoint_matrix;
 
 template <typename Scalar, typename Index,
           int LhsStorageOrder, bool LhsSelfAdjoint, bool ConjugateLhs,
-          int RhsStorageOrder, bool RhsSelfAdjoint, bool ConjugateRhs>
-struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,LhsSelfAdjoint,ConjugateLhs, RhsStorageOrder,RhsSelfAdjoint,ConjugateRhs,RowMajor>
+          int RhsStorageOrder, bool RhsSelfAdjoint, bool ConjugateRhs,
+          int ResInnerStride>
+struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,LhsSelfAdjoint,ConjugateLhs, RhsStorageOrder,RhsSelfAdjoint,ConjugateRhs,RowMajor,ResInnerStride>
 {
 
   static EIGEN_STRONG_INLINE void run(
     Index rows, Index cols,
     const Scalar* lhs, Index lhsStride,
    const Scalar* rhs, Index rhsStride,
-    Scalar* res,       Index resStride,
+    Scalar* res,       Index resIncr, Index resStride,
    const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
  {
    product_selfadjoint_matrix<Scalar, Index,
@@ -298,33 +316,35 @@ struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,LhsSelfAdjoint,Co
      RhsSelfAdjoint, NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(RhsSelfAdjoint,ConjugateRhs),
      EIGEN_LOGICAL_XOR(LhsSelfAdjoint,LhsStorageOrder==RowMajor) ? ColMajor : RowMajor,
      LhsSelfAdjoint, NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(LhsSelfAdjoint,ConjugateLhs),
-      ColMajor>
-      ::run(cols, rows,  rhs, rhsStride,  lhs, lhsStride,  res, resStride,  alpha, blocking);
+      ColMajor,ResInnerStride>
+      ::run(cols, rows,  rhs, rhsStride,  lhs, lhsStride,  res, resIncr, resStride,  alpha, blocking);
  }
};

template <typename Scalar, typename Index,
          int LhsStorageOrder, bool ConjugateLhs,
-          int RhsStorageOrder, bool ConjugateRhs>
-struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,ConjugateLhs, RhsStorageOrder,false,ConjugateRhs,ColMajor>
+          int RhsStorageOrder, bool ConjugateRhs,
+          int ResInnerStride>
+struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,ConjugateLhs, RhsStorageOrder,false,ConjugateRhs,ColMajor,ResInnerStride>
{

  static EIGEN_DONT_INLINE void run(
    Index rows, Index cols,
    const Scalar* _lhs, Index lhsStride,
    const Scalar* _rhs, Index rhsStride,
-    Scalar* res,        Index resStride,
+    Scalar* res,        Index resIncr, Index resStride,
    const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking);
};

template <typename Scalar, typename Index,
          int LhsStorageOrder, bool ConjugateLhs,
-          int RhsStorageOrder, bool ConjugateRhs>
-EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,ConjugateLhs, RhsStorageOrder,false,ConjugateRhs,ColMajor>::run(
+          int RhsStorageOrder, bool ConjugateRhs,
+          int ResInnerStride>
+EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,ConjugateLhs, RhsStorageOrder,false,ConjugateRhs,ColMajor,ResInnerStride>::run(
    Index rows, Index cols,
    const Scalar* _lhs, Index lhsStride,
    const Scalar* _rhs, Index rhsStride,
-    Scalar* _res,        Index resStride,
+    Scalar* _res,        Index resIncr, Index resStride,
    const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
  {
    Index size = rows;
@@ -334,11 +354,11 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
    typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
    typedef const_blas_data_mapper<Scalar, Index, (LhsStorageOrder == RowMajor) ? ColMajor : RowMajor> LhsTransposeMapper;
    typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
-    typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
+    typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
    LhsMapper lhs(_lhs,lhsStride);
    LhsTransposeMapper lhs_transpose(_lhs,lhsStride);
    RhsMapper rhs(_rhs,rhsStride);
-    ResMapper res(_res, resStride);
+    ResMapper res(_res, resStride, resIncr);

    Index kc = blocking.kc();                   // cache block size along the K direction
    Index mc = (std::min)(rows,blocking.mc());  // cache block size along the M direction
@@ -352,7 +372,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
    gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
    symm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
    gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs;
-    gemm_pack_lhs<Scalar, Index, LhsTransposeMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed;
+    gemm_pack_lhs<Scalar, Index, LhsTransposeMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed;

    for(Index k2=0; k2<size; k2+=kc)
    {
@@ -387,7 +407,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
      for(Index i2=k2+kc; i2<size; i2+=mc)
      {
        const Index actual_mc = (std::min)(i2+mc,size)-i2;
-        gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder,false>()
+        gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder,false>()
          (blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);

        gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
@@ -398,26 +418,28 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
// matrix * selfadjoint product
template <typename Scalar, typename Index,
          int LhsStorageOrder, bool ConjugateLhs,
-          int RhsStorageOrder, bool ConjugateRhs>
-struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,ConjugateLhs, RhsStorageOrder,true,ConjugateRhs,ColMajor>
+          int RhsStorageOrder, bool ConjugateRhs,
+          int ResInnerStride>
+struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,ConjugateLhs, RhsStorageOrder,true,ConjugateRhs,ColMajor,ResInnerStride>
{

  static EIGEN_DONT_INLINE void run(
    Index rows, Index cols,
    const Scalar* _lhs, Index lhsStride,
    const Scalar* _rhs, Index rhsStride,
-    Scalar* res,        Index resStride,
+    Scalar* res,        Index resIncr, Index resStride,
    const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking);
};

template <typename Scalar, typename Index,
          int LhsStorageOrder, bool ConjugateLhs,
-          int RhsStorageOrder, bool ConjugateRhs>
-EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,ConjugateLhs, RhsStorageOrder,true,ConjugateRhs,ColMajor>::run(
+          int RhsStorageOrder, bool ConjugateRhs,
+          int ResInnerStride>
+EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,ConjugateLhs, RhsStorageOrder,true,ConjugateRhs,ColMajor,ResInnerStride>::run(
    Index rows, Index cols,
    const Scalar* _lhs, Index lhsStride,
    const Scalar* _rhs, Index rhsStride,
-    Scalar* _res,        Index resStride,
+    Scalar* _res,        Index resIncr, Index resStride,
    const
Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking) { Index size = cols; @@ -425,9 +447,9 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f typedef gebp_traits<Scalar,Scalar> Traits; typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper; - typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper; + typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper; LhsMapper lhs(_lhs,lhsStride); - ResMapper res(_res,resStride); + ResMapper res(_res,resStride, resIncr); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction @@ -437,7 +459,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB()); gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; - gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; + gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder> pack_lhs; symm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs; for(Index k2=0; k2<size; k2+=kc) @@ -503,12 +525,13 @@ struct selfadjoint_product_impl<Lhs,LhsMode,false,Rhs,RhsMode,false> NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(LhsIsUpper,bool(LhsBlasTraits::NeedToConjugate)), EIGEN_LOGICAL_XOR(RhsIsUpper,internal::traits<Rhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, RhsIsSelfAdjoint, NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(RhsIsUpper,bool(RhsBlasTraits::NeedToConjugate)), - internal::traits<Dest>::Flags&RowMajorBit ? RowMajor : ColMajor> + internal::traits<Dest>::Flags&RowMajorBit ? 
RowMajor : ColMajor,
+      Dest::InnerStrideAtCompileTime>
   ::run(
     lhs.rows(), rhs.cols(),                 // sizes
     &lhs.coeffRef(0,0), lhs.outerStride(),  // lhs info
     &rhs.coeffRef(0,0), rhs.outerStride(),  // rhs info
-    &dst.coeffRef(0,0), dst.outerStride(),                    // result info
+    &dst.coeffRef(0,0), dst.innerStride(), dst.outerStride(), // result info
     actualAlpha, blocking                   // alpha
   );
 }
diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h
index a45238d69..61396dbdf 100644
--- a/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h
+++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h
@@ -40,20 +40,22 @@ namespace internal {

 /* Optimized selfadjoint matrix * matrix (?SYMM/?HEMM) product */

-#define EIGEN_BLAS_SYMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
+#define EIGEN_BLAS_SYMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
 template <typename Index, \
           int LhsStorageOrder, bool ConjugateLhs, \
           int RhsStorageOrder, bool ConjugateRhs> \
-struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLhs,RhsStorageOrder,false,ConjugateRhs,ColMajor> \
+struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLhs,RhsStorageOrder,false,ConjugateRhs,ColMajor,1> \
 {\
\
   static void run( \
     Index rows, Index cols, \
     const EIGTYPE* _lhs, Index lhsStride, \
     const EIGTYPE* _rhs, Index rhsStride, \
-    EIGTYPE* res, Index resStride, \
+    EIGTYPE* res, Index resIncr, Index resStride, \
     EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \
   { \
+    EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \
+    eigen_assert(resIncr == 1); \
     char side='L', uplo='L'; \
     BlasIndex m, n, lda, ldb, ldc; \
     const EIGTYPE *a, *b; \
@@ -81,25 +83,27 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLh
       ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
     } else b = _rhs; \
\
-    BLASPREFIX##symm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
+    BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
\
   } \
 };

-#define EIGEN_BLAS_HEMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
+#define EIGEN_BLAS_HEMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
 template <typename Index, \
           int LhsStorageOrder, bool ConjugateLhs, \
           int RhsStorageOrder, bool ConjugateRhs> \
-struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLhs,RhsStorageOrder,false,ConjugateRhs,ColMajor> \
+struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLhs,RhsStorageOrder,false,ConjugateRhs,ColMajor,1> \
 {\
   static void run( \
     Index rows, Index cols, \
     const EIGTYPE* _lhs, Index lhsStride, \
     const EIGTYPE* _rhs, Index rhsStride, \
-    EIGTYPE* res, Index resStride, \
+    EIGTYPE* res, Index resIncr, Index resStride, \
     EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \
   { \
+    EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \
+    eigen_assert(resIncr == 1); \
     char side='L', uplo='L'; \
     BlasIndex m, n, lda, ldb, ldc; \
     const EIGTYPE *a, *b; \
@@ -144,33 +148,41 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLh
       ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \
     } \
\
-    BLASPREFIX##hemm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \
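A note on the resIncr parameter now threaded through these kernels: blas_data_mapper addresses a column-major result as res[i*resIncr + j*resStride], so resIncr is the spacing between consecutive rows within a column and resStride the spacing between columns. The toy mapper below is a simplified stand-in (not Eigen's actual class) that mimics this addressing; it also makes clear why the BLAS specializations above assert resIncr == 1: the Fortran ?SYMM/?HEMM interface only accepts a leading dimension ldc (an outer stride) and always assumes unit spacing within a column.

#include <cassert>
#include <cstddef>

// Simplified stand-in for Eigen's blas_data_mapper (column-major case).
template <typename Scalar>
struct ToyResMapper {
  Scalar* data;
  std::ptrdiff_t inner;  // resIncr:   spacing between rows within a column
  std::ptrdiff_t outer;  // resStride: spacing between columns
  Scalar& operator()(std::ptrdiff_t i, std::ptrdiff_t j) const {
    return data[i * inner + j * outer];
  }
};

template <typename Scalar>
void blas_path_guard(Scalar* res, std::ptrdiff_t resIncr, std::ptrdiff_t resStride) {
  // A BLAS xSYMM/xHEMM call only receives ldc (= resStride); it cannot
  // express a non-unit inner stride, hence the eigen_assert in the macros.
  assert(resIncr == 1);
  (void)res; (void)resStride;
}

+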
BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ \ } \ }; -EIGEN_BLAS_SYMM_L(double, double, d, d) -EIGEN_BLAS_SYMM_L(float, float, f, s) -EIGEN_BLAS_HEMM_L(dcomplex, double, cd, z) -EIGEN_BLAS_HEMM_L(scomplex, float, cf, c) - +#ifdef EIGEN_USE_MKL +EIGEN_BLAS_SYMM_L(double, double, d, dsymm) +EIGEN_BLAS_SYMM_L(float, float, f, ssymm) +EIGEN_BLAS_HEMM_L(dcomplex, MKL_Complex16, cd, zhemm) +EIGEN_BLAS_HEMM_L(scomplex, MKL_Complex8, cf, chemm) +#else +EIGEN_BLAS_SYMM_L(double, double, d, dsymm_) +EIGEN_BLAS_SYMM_L(float, float, f, ssymm_) +EIGEN_BLAS_HEMM_L(dcomplex, double, cd, zhemm_) +EIGEN_BLAS_HEMM_L(scomplex, float, cf, chemm_) +#endif /* Optimized matrix * selfadjoint matrix (?SYMM/?HEMM) product */ -#define EIGEN_BLAS_SYMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ +#define EIGEN_BLAS_SYMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \ template <typename Index, \ int LhsStorageOrder, bool ConjugateLhs, \ int RhsStorageOrder, bool ConjugateRhs> \ -struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateLhs,RhsStorageOrder,true,ConjugateRhs,ColMajor> \ +struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateLhs,RhsStorageOrder,true,ConjugateRhs,ColMajor,1> \ {\ \ static void run( \ Index rows, Index cols, \ const EIGTYPE* _lhs, Index lhsStride, \ const EIGTYPE* _rhs, Index rhsStride, \ - EIGTYPE* res, Index resStride, \ + EIGTYPE* res, Index resIncr, Index resStride, \ EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \ { \ + EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \ + eigen_assert(resIncr == 1); \ char side='R', uplo='L'; \ BlasIndex m, n, lda, ldb, ldc; \ const EIGTYPE *a, *b; \ @@ -197,25 +209,27 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateL ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \ } else b = _lhs; \ \ - BLASPREFIX##symm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ + BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ \ } \ }; -#define EIGEN_BLAS_HEMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ +#define EIGEN_BLAS_HEMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \ template <typename Index, \ int LhsStorageOrder, bool ConjugateLhs, \ int RhsStorageOrder, bool ConjugateRhs> \ -struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateLhs,RhsStorageOrder,true,ConjugateRhs,ColMajor> \ +struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateLhs,RhsStorageOrder,true,ConjugateRhs,ColMajor,1> \ {\ static void run( \ Index rows, Index cols, \ const EIGTYPE* _lhs, Index lhsStride, \ const EIGTYPE* _rhs, Index rhsStride, \ - EIGTYPE* res, Index resStride, \ + EIGTYPE* res, Index resIncr, Index resStride, \ EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \ { \ + EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \ + eigen_assert(resIncr == 1); \ char side='R', uplo='L'; \ BlasIndex m, n, lda, ldb, ldc; \ const EIGTYPE *a, *b; \ @@ -259,15 +273,21 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateL ldb = convert_index<BlasIndex>(b_tmp.outerStride()); \ } \ \ - BLASPREFIX##hemm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), 
(const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ + BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ } \ }; -EIGEN_BLAS_SYMM_R(double, double, d, d) -EIGEN_BLAS_SYMM_R(float, float, f, s) -EIGEN_BLAS_HEMM_R(dcomplex, double, cd, z) -EIGEN_BLAS_HEMM_R(scomplex, float, cf, c) - +#ifdef EIGEN_USE_MKL +EIGEN_BLAS_SYMM_R(double, double, d, dsymm) +EIGEN_BLAS_SYMM_R(float, float, f, ssymm) +EIGEN_BLAS_HEMM_R(dcomplex, MKL_Complex16, cd, zhemm) +EIGEN_BLAS_HEMM_R(scomplex, MKL_Complex8, cf, chemm) +#else +EIGEN_BLAS_SYMM_R(double, double, d, dsymm_) +EIGEN_BLAS_SYMM_R(float, float, f, ssymm_) +EIGEN_BLAS_HEMM_R(dcomplex, double, cd, zhemm_) +EIGEN_BLAS_HEMM_R(scomplex, float, cf, chemm_) +#endif } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector.h b/Eigen/src/Core/products/SelfadjointMatrixVector.h index 3fd180e6c..d38fd72b2 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixVector.h +++ b/Eigen/src/Core/products/SelfadjointMatrixVector.h @@ -15,7 +15,7 @@ namespace Eigen { namespace internal { /* Optimized selfadjoint matrix * vector product: - * This algorithm processes 2 columns at onces that allows to both reduce + * This algorithm processes 2 columns at once that allows to both reduce * the number of load/stores of the result by a factor 2 and to reduce * the instruction dependency. */ @@ -27,7 +27,8 @@ template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool Conju struct selfadjoint_matrix_vector_product { -static EIGEN_DONT_INLINE void run( +static EIGEN_DONT_INLINE EIGEN_DEVICE_FUNC +void run( Index size, const Scalar* lhs, Index lhsStride, const Scalar* rhs, @@ -36,7 +37,8 @@ static EIGEN_DONT_INLINE void run( }; template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs, int Version> -EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,Version>::run( +EIGEN_DONT_INLINE EIGEN_DEVICE_FUNC +void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,Version>::run( Index size, const Scalar* lhs, Index lhsStride, const Scalar* rhs, @@ -62,8 +64,7 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd Scalar cjAlpha = ConjugateRhs ? 
numext::conj(alpha) : alpha; - - Index bound = (std::max)(Index(0),size-8) & 0xfffffffe; + Index bound = numext::maxi(Index(0), size-8) & 0xfffffffe; if (FirstTriangular) bound = size - bound; @@ -175,7 +176,8 @@ struct selfadjoint_product_impl<Lhs,LhsMode,false,Rhs,0,true> enum { LhsUpLo = LhsMode&(Upper|Lower) }; template<typename Dest> - static void run(Dest& dest, const Lhs &a_lhs, const Rhs &a_rhs, const Scalar& alpha) + static EIGEN_DEVICE_FUNC + void run(Dest& dest, const Lhs &a_lhs, const Rhs &a_rhs, const Scalar& alpha) { typedef typename Dest::Scalar ResScalar; typedef typename Rhs::Scalar RhsScalar; diff --git a/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h b/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h index 38f23accf..1238345e3 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h +++ b/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h @@ -95,14 +95,21 @@ const EIGTYPE* _rhs, EIGTYPE* res, EIGTYPE alpha) \ x_tmp=map_x.conjugate(); \ x_ptr=x_tmp.data(); \ } else x_ptr=_rhs; \ - BLASFUNC(&uplo, &n, &numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, &numext::real_ref(beta), (BLASTYPE*)res, &incy); \ + BLASFUNC(&uplo, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &incy); \ }\ }; +#ifdef EIGEN_USE_MKL +EIGEN_BLAS_SYMV_SPECIALIZATION(double, double, dsymv) +EIGEN_BLAS_SYMV_SPECIALIZATION(float, float, ssymv) +EIGEN_BLAS_SYMV_SPECIALIZATION(dcomplex, MKL_Complex16, zhemv) +EIGEN_BLAS_SYMV_SPECIALIZATION(scomplex, MKL_Complex8, chemv) +#else EIGEN_BLAS_SYMV_SPECIALIZATION(double, double, dsymv_) EIGEN_BLAS_SYMV_SPECIALIZATION(float, float, ssymv_) EIGEN_BLAS_SYMV_SPECIALIZATION(dcomplex, double, zhemv_) EIGEN_BLAS_SYMV_SPECIALIZATION(scomplex, float, chemv_) +#endif } // end namespace internal diff --git a/Eigen/src/Core/products/SelfadjointProduct.h b/Eigen/src/Core/products/SelfadjointProduct.h index f038d686f..a21be8050 100644 --- a/Eigen/src/Core/products/SelfadjointProduct.h +++ b/Eigen/src/Core/products/SelfadjointProduct.h @@ -109,10 +109,10 @@ struct selfadjoint_product_selector<MatrixType,OtherType,UpLo,false> internal::general_matrix_matrix_triangular_product<Index, Scalar, OtherIsRowMajor ? RowMajor : ColMajor, OtherBlasTraits::NeedToConjugate && NumTraits<Scalar>::IsComplex, Scalar, OtherIsRowMajor ? ColMajor : RowMajor, (!OtherBlasTraits::NeedToConjugate) && NumTraits<Scalar>::IsComplex, - IsRowMajor ? RowMajor : ColMajor, UpLo> + IsRowMajor ? 
RowMajor : ColMajor, MatrixType::InnerStrideAtCompileTime, UpLo> ::run(size, depth, - &actualOther.coeffRef(0,0), actualOther.outerStride(), &actualOther.coeffRef(0,0), actualOther.outerStride(), - mat.data(), mat.outerStride(), actualAlpha, blocking); + actualOther.data(), actualOther.outerStride(), actualOther.data(), actualOther.outerStride(), + mat.data(), mat.innerStride(), mat.outerStride(), actualAlpha, blocking); } }; @@ -120,7 +120,7 @@ struct selfadjoint_product_selector<MatrixType,OtherType,UpLo,false> template<typename MatrixType, unsigned int UpLo> template<typename DerivedU> -SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo> +EIGEN_DEVICE_FUNC SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo> ::rankUpdate(const MatrixBase<DerivedU>& u, const Scalar& alpha) { selfadjoint_product_selector<MatrixType,DerivedU,UpLo>::run(_expression().const_cast_derived(), u.derived(), alpha); diff --git a/Eigen/src/Core/products/SelfadjointRank2Update.h b/Eigen/src/Core/products/SelfadjointRank2Update.h index 2ae364111..f752a0bf0 100644 --- a/Eigen/src/Core/products/SelfadjointRank2Update.h +++ b/Eigen/src/Core/products/SelfadjointRank2Update.h @@ -24,7 +24,8 @@ struct selfadjoint_rank2_update_selector; template<typename Scalar, typename Index, typename UType, typename VType> struct selfadjoint_rank2_update_selector<Scalar,Index,UType,VType,Lower> { - static void run(Scalar* mat, Index stride, const UType& u, const VType& v, const Scalar& alpha) + static EIGEN_DEVICE_FUNC + void run(Scalar* mat, Index stride, const UType& u, const VType& v, const Scalar& alpha) { const Index size = u.size(); for (Index i=0; i<size; ++i) @@ -57,7 +58,7 @@ template<bool Cond, typename T> struct conj_expr_if template<typename MatrixType, unsigned int UpLo> template<typename DerivedU, typename DerivedV> -SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo> +EIGEN_DEVICE_FUNC SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo> ::rankUpdate(const MatrixBase<DerivedU>& u, const MatrixBase<DerivedV>& v, const Scalar& alpha) { typedef internal::blas_traits<DerivedU> UBlasTraits; @@ -79,8 +80,8 @@ SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo> if (IsRowMajor) actualAlpha = numext::conj(actualAlpha); - typedef typename internal::remove_all<typename internal::conj_expr_if<IsRowMajor ^ UBlasTraits::NeedToConjugate,_ActualUType>::type>::type UType; - typedef typename internal::remove_all<typename internal::conj_expr_if<IsRowMajor ^ VBlasTraits::NeedToConjugate,_ActualVType>::type>::type VType; + typedef typename internal::remove_all<typename internal::conj_expr_if<int(IsRowMajor) ^ int(UBlasTraits::NeedToConjugate), _ActualUType>::type>::type UType; + typedef typename internal::remove_all<typename internal::conj_expr_if<int(IsRowMajor) ^ int(VBlasTraits::NeedToConjugate), _ActualVType>::type>::type VType; internal::selfadjoint_rank2_update_selector<Scalar, Index, UType, VType, (IsRowMajor ? int(UpLo==Upper ? 
Lower : Upper) : UpLo)> ::run(_expression().const_cast_derived().data(),_expression().outerStride(),UType(actualU),VType(actualV),actualAlpha); diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix.h b/Eigen/src/Core/products/TriangularMatrixMatrix.h index 6ec5a8a0b..f0c60507a 100644 --- a/Eigen/src/Core/products/TriangularMatrixMatrix.h +++ b/Eigen/src/Core/products/TriangularMatrixMatrix.h @@ -45,22 +45,24 @@ template <typename Scalar, typename Index, int Mode, bool LhsIsTriangular, int LhsStorageOrder, bool ConjugateLhs, int RhsStorageOrder, bool ConjugateRhs, - int ResStorageOrder, int Version = Specialized> + int ResStorageOrder, int ResInnerStride, + int Version = Specialized> struct product_triangular_matrix_matrix; template <typename Scalar, typename Index, int Mode, bool LhsIsTriangular, int LhsStorageOrder, bool ConjugateLhs, - int RhsStorageOrder, bool ConjugateRhs, int Version> + int RhsStorageOrder, bool ConjugateRhs, + int ResInnerStride, int Version> struct product_triangular_matrix_matrix<Scalar,Index,Mode,LhsIsTriangular, LhsStorageOrder,ConjugateLhs, - RhsStorageOrder,ConjugateRhs,RowMajor,Version> + RhsStorageOrder,ConjugateRhs,RowMajor,ResInnerStride,Version> { static EIGEN_STRONG_INLINE void run( Index rows, Index cols, Index depth, const Scalar* lhs, Index lhsStride, const Scalar* rhs, Index rhsStride, - Scalar* res, Index resStride, + Scalar* res, Index resIncr, Index resStride, const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking) { product_triangular_matrix_matrix<Scalar, Index, @@ -70,18 +72,19 @@ struct product_triangular_matrix_matrix<Scalar,Index,Mode,LhsIsTriangular, ConjugateRhs, LhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateLhs, - ColMajor> - ::run(cols, rows, depth, rhs, rhsStride, lhs, lhsStride, res, resStride, alpha, blocking); + ColMajor, ResInnerStride> + ::run(cols, rows, depth, rhs, rhsStride, lhs, lhsStride, res, resIncr, resStride, alpha, blocking); } }; // implements col-major += alpha * op(triangular) * op(general) template <typename Scalar, typename Index, int Mode, int LhsStorageOrder, bool ConjugateLhs, - int RhsStorageOrder, bool ConjugateRhs, int Version> + int RhsStorageOrder, bool ConjugateRhs, + int ResInnerStride, int Version> struct product_triangular_matrix_matrix<Scalar,Index,Mode,true, LhsStorageOrder,ConjugateLhs, - RhsStorageOrder,ConjugateRhs,ColMajor,Version> + RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride,Version> { typedef gebp_traits<Scalar,Scalar> Traits; @@ -95,20 +98,21 @@ struct product_triangular_matrix_matrix<Scalar,Index,Mode,true, Index _rows, Index _cols, Index _depth, const Scalar* _lhs, Index lhsStride, const Scalar* _rhs, Index rhsStride, - Scalar* res, Index resStride, + Scalar* res, Index resIncr, Index resStride, const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking); }; template <typename Scalar, typename Index, int Mode, int LhsStorageOrder, bool ConjugateLhs, - int RhsStorageOrder, bool ConjugateRhs, int Version> + int RhsStorageOrder, bool ConjugateRhs, + int ResInnerStride, int Version> EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true, LhsStorageOrder,ConjugateLhs, - RhsStorageOrder,ConjugateRhs,ColMajor,Version>::run( + RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride,Version>::run( Index _rows, Index _cols, Index _depth, const Scalar* _lhs, Index lhsStride, const Scalar* _rhs, Index rhsStride, - Scalar* _res, Index resStride, + Scalar* _res, Index resIncr, Index resStride, const Scalar& alpha, level3_blocking<Scalar,Scalar>& 
blocking)
 {
   // strip zeros
@@ -119,10 +123,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
   typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
   typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
-  typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
+  typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
   LhsMapper lhs(_lhs,lhsStride);
   RhsMapper rhs(_rhs,rhsStride);
-  ResMapper res(_res, resStride);
+  ResMapper res(_res, resStride, resIncr);

   Index kc = blocking.kc();                   // cache block size along the K direction
   Index mc = (std::min)(rows,blocking.mc());  // cache block size along the M direction
@@ -137,7 +141,13 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
   ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
   ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());

-  Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,LhsStorageOrder> triangularBuffer((internal::constructor_without_unaligned_array_assert()));
+  // To work around an "error: member reference base type 'Matrix<...>
+  // (Eigen::internal::constructor_without_unaligned_array_assert (*)())' is
+  // not a structure or union" compilation error in nvcc (tested V8.0.61),
+  // create a dummy internal::constructor_without_unaligned_array_assert
+  // object to pass to the Matrix constructor.
+  internal::constructor_without_unaligned_array_assert a;
+  Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,LhsStorageOrder> triangularBuffer(a);
   triangularBuffer.setZero();
   if((Mode&ZeroDiag)==ZeroDiag)
     triangularBuffer.diagonal().setZero();
@@ -145,7 +155,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
     triangularBuffer.diagonal().setOnes();

   gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
-  gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+  gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder> pack_lhs;
   gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs;
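The nvcc workaround in the hunk above is, at heart, an instance of C++'s "most vexing parse". A minimal, hypothetical reproduction (Tag and Buffer are placeholder names, not Eigen types): a declaration whose initializer is a parenthesized temporary of an empty tag type can be read as a function declaration. Standard C++ treats the doubled parentheses originally used here as disambiguating, but the cited nvcc release apparently still mis-parsed the construct, so the patch names the tag object first and passes it by value.

struct Tag {};  // stands in for internal::constructor_without_unaligned_array_assert

struct Buffer {
  explicit Buffer(Tag) {}
  void setZero() {}
};

void demo() {
  // Vexing-parse territory: 'Buffer b0(Tag());' declares a function b0
  // taking a function returning Tag. The doubled parentheses in
  //   Buffer b1((Tag()));
  // are the standard disambiguation, yet the nvcc version named above
  // still rejected the later member access. The workaround in the patch:
  Tag a;
  Buffer b2(a);
  b2.setZero();  // unambiguous: b2 is an object
}

   for(Index k2=IsLower ?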
depth : 0; @@ -216,7 +226,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true, for(Index i2=start; i2<end; i2+=mc) { const Index actual_mc = (std::min)(i2+mc,end)-i2; - gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr,Traits::LhsProgress, LhsStorageOrder,false>() + gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr,Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder,false>() (blockA, lhs.getSubMapper(i2, actual_k2), actual_kc, actual_mc); gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, @@ -229,10 +239,11 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true, // implements col-major += alpha * op(general) * op(triangular) template <typename Scalar, typename Index, int Mode, int LhsStorageOrder, bool ConjugateLhs, - int RhsStorageOrder, bool ConjugateRhs, int Version> + int RhsStorageOrder, bool ConjugateRhs, + int ResInnerStride, int Version> struct product_triangular_matrix_matrix<Scalar,Index,Mode,false, LhsStorageOrder,ConjugateLhs, - RhsStorageOrder,ConjugateRhs,ColMajor,Version> + RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride,Version> { typedef gebp_traits<Scalar,Scalar> Traits; enum { @@ -245,20 +256,21 @@ struct product_triangular_matrix_matrix<Scalar,Index,Mode,false, Index _rows, Index _cols, Index _depth, const Scalar* _lhs, Index lhsStride, const Scalar* _rhs, Index rhsStride, - Scalar* res, Index resStride, + Scalar* res, Index resIncr, Index resStride, const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking); }; template <typename Scalar, typename Index, int Mode, int LhsStorageOrder, bool ConjugateLhs, - int RhsStorageOrder, bool ConjugateRhs, int Version> + int RhsStorageOrder, bool ConjugateRhs, + int ResInnerStride, int Version> EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false, LhsStorageOrder,ConjugateLhs, - RhsStorageOrder,ConjugateRhs,ColMajor,Version>::run( + RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride,Version>::run( Index _rows, Index _cols, Index _depth, const Scalar* _lhs, Index lhsStride, const Scalar* _rhs, Index rhsStride, - Scalar* _res, Index resStride, + Scalar* _res, Index resIncr, Index resStride, const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking) { const Index PacketBytes = packet_traits<Scalar>::size*sizeof(Scalar); @@ -270,10 +282,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false, typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper; typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper; - typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper; + typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper; LhsMapper lhs(_lhs,lhsStride); RhsMapper rhs(_rhs,rhsStride); - ResMapper res(_res, resStride); + ResMapper res(_res, resStride, resIncr); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction @@ -284,7 +296,8 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false, ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA()); ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB()); - Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,RhsStorageOrder> triangularBuffer((internal::constructor_without_unaligned_array_assert())); + 
internal::constructor_without_unaligned_array_assert a; + Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,RhsStorageOrder> triangularBuffer(a); triangularBuffer.setZero(); if((Mode&ZeroDiag)==ZeroDiag) triangularBuffer.diagonal().setZero(); @@ -292,7 +305,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false, triangularBuffer.diagonal().setOnes(); gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel; - gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs; + gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder> pack_lhs; gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs; gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel; @@ -393,7 +406,9 @@ struct triangular_product_impl<Mode,LhsIsTriangular,Lhs,false,Rhs,false> { template<typename Dest> static void run(Dest& dst, const Lhs &a_lhs, const Rhs &a_rhs, const typename Dest::Scalar& alpha) { - typedef typename Dest::Scalar Scalar; + typedef typename Lhs::Scalar LhsScalar; + typedef typename Rhs::Scalar RhsScalar; + typedef typename Dest::Scalar Scalar; typedef internal::blas_traits<Lhs> LhsBlasTraits; typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType; @@ -405,8 +420,9 @@ struct triangular_product_impl<Mode,LhsIsTriangular,Lhs,false,Rhs,false> typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(a_lhs); typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(a_rhs); - Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs) - * RhsBlasTraits::extractScalarFactor(a_rhs); + LhsScalar lhs_alpha = LhsBlasTraits::extractScalarFactor(a_lhs); + RhsScalar rhs_alpha = RhsBlasTraits::extractScalarFactor(a_rhs); + Scalar actualAlpha = alpha * lhs_alpha * rhs_alpha; typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,Scalar,Scalar, Lhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxColsAtCompileTime,4> BlockingType; @@ -423,14 +439,29 @@ struct triangular_product_impl<Mode,LhsIsTriangular,Lhs,false,Rhs,false> Mode, LhsIsTriangular, (internal::traits<ActualLhsTypeCleaned>::Flags&RowMajorBit) ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate, (internal::traits<ActualRhsTypeCleaned>::Flags&RowMajorBit) ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate, - (internal::traits<Dest >::Flags&RowMajorBit) ? RowMajor : ColMajor> + (internal::traits<Dest >::Flags&RowMajorBit) ? 
RowMajor : ColMajor, Dest::InnerStrideAtCompileTime>
   ::run(
     stripedRows, stripedCols, stripedDepth,   // sizes
     &lhs.coeffRef(0,0), lhs.outerStride(),    // lhs info
     &rhs.coeffRef(0,0), rhs.outerStride(),    // rhs info
-    &dst.coeffRef(0,0), dst.outerStride(),                    // result info
+    &dst.coeffRef(0,0), dst.innerStride(), dst.outerStride(), // result info
     actualAlpha, blocking
   );
+
+  // Apply correction if the diagonal is unit and a scalar factor was nested:
+  if ((Mode&UnitDiag)==UnitDiag)
+  {
+    if (LhsIsTriangular && lhs_alpha!=LhsScalar(1))
+    {
+      Index diagSize = (std::min)(lhs.rows(),lhs.cols());
+      dst.topRows(diagSize) -= ((lhs_alpha-LhsScalar(1))*a_rhs).topRows(diagSize);
+    }
+    else if ((!LhsIsTriangular) && rhs_alpha!=RhsScalar(1))
+    {
+      Index diagSize = (std::min)(rhs.rows(),rhs.cols());
+      dst.leftCols(diagSize) -= (rhs_alpha-RhsScalar(1))*a_lhs.leftCols(diagSize);
+    }
+  }
 }
};
diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h b/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h
index aecded6bb..a98d12e4a 100644
--- a/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h
+++ b/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h
@@ -46,7 +46,7 @@ template <typename Scalar, typename Index,
 struct product_triangular_matrix_matrix_trmm :
        product_triangular_matrix_matrix<Scalar,Index,Mode,
          LhsIsTriangular,LhsStorageOrder,ConjugateLhs,
-         RhsStorageOrder, ConjugateRhs, ResStorageOrder, BuiltIn> {};
+         RhsStorageOrder, ConjugateRhs, ResStorageOrder, 1, BuiltIn> {};

 // try to go to BLAS specialization
@@ -55,13 +55,15 @@ template <typename Index, int Mode, \
           int LhsStorageOrder, bool ConjugateLhs, \
           int RhsStorageOrder, bool ConjugateRhs> \
 struct product_triangular_matrix_matrix<Scalar,Index, Mode, LhsIsTriangular, \
-         LhsStorageOrder,ConjugateLhs, RhsStorageOrder,ConjugateRhs,ColMajor,Specialized> { \
+         LhsStorageOrder,ConjugateLhs, RhsStorageOrder,ConjugateRhs,ColMajor,1,Specialized> { \
   static inline void run(Index _rows, Index _cols, Index _depth, const Scalar* _lhs, Index lhsStride,\
-    const Scalar* _rhs, Index rhsStride, Scalar* res, Index resStride, Scalar alpha, level3_blocking<Scalar,Scalar>& blocking) { \
+    const Scalar* _rhs, Index rhsStride, Scalar* res, Index resIncr, Index resStride, Scalar alpha, level3_blocking<Scalar,Scalar>& blocking) { \
+    EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \
+    eigen_assert(resIncr == 1); \
     product_triangular_matrix_matrix_trmm<Scalar,Index,Mode, \
       LhsIsTriangular,LhsStorageOrder,ConjugateLhs, \
       RhsStorageOrder, ConjugateRhs, ColMajor>::run( \
-      _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking); \
+      _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking); \
   } \
 };

@@ -75,7 +77,7 @@ EIGEN_BLAS_TRMM_SPECIALIZE(scomplex, true)
 EIGEN_BLAS_TRMM_SPECIALIZE(scomplex, false)

 // implements col-major += alpha * op(triangular) * op(general)
-#define EIGEN_BLAS_TRMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \
+#define EIGEN_BLAS_TRMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \
 template <typename Index, int Mode, \
           int LhsStorageOrder, bool ConjugateLhs, \
           int RhsStorageOrder, bool ConjugateRhs> \
@@ -115,8 +117,8 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \
     if (((nthr==1) && (((std::max)(rows,depth)-diagSize)/(double)diagSize < 0.5))) { \
     /* Most likely no benefit to call TRMM or GEMM from BLAS */ \
       product_triangular_matrix_matrix<EIGTYPE,Index,Mode,true, \
-          LhsStorageOrder,ConjugateLhs, RhsStorageOrder, ConjugateRhs, ColMajor, BuiltIn>::run( \
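The correction block just added deserves a short derivation. For dst = (s*L).triangularView<UnitLower>() * B, the blas traits extract the factor s, and the kernel evaluates s*(L1*B), where L1 is L with its diagonal forced to ones. Writing L1 = L0 + I, with L0 strictly triangular, the kernel thus returns (s*L0 + s*I)*B, whereas the desired result is (s*L0 + I)*B: the diagonal contribution is overscaled by exactly (s-1)*B, which is what the topRows/leftCols subtractions above remove. A small self-check through the public API (illustrative, not part of the patch):

#include <Eigen/Dense>
#include <iostream>

int main() {
  using Eigen::MatrixXd;
  MatrixXd L = MatrixXd::Random(4, 4), B = MatrixXd::Random(4, 3);
  double s = 2.5;

  // Scalar factor nested inside a unit-diagonal triangular view:
  MatrixXd dst = (s * L).triangularView<Eigen::UnitLower>() * B;

  // Reference: strictly lower part of s*L, diagonal forced to one.
  MatrixXd M = s * L;
  M.triangularView<Eigen::StrictlyUpper>().setZero();
  M.diagonal().setOnes();
  std::cout << (dst - M * B).norm() << "\n";  // ~0 with the correction applied
}

-      _rows,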
_cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking); \ + LhsStorageOrder,ConjugateLhs, RhsStorageOrder, ConjugateRhs, ColMajor, 1, BuiltIn>::run( \ + _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, 1, resStride, alpha, blocking); \ /*std::cout << "TRMM_L: A is not square! Go to Eigen TRMM implementation!\n";*/ \ } else { \ /* Make sense to call GEMM */ \ @@ -124,8 +126,8 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \ MatrixLhs aa_tmp=lhsMap.template triangularView<Mode>(); \ BlasIndex aStride = convert_index<BlasIndex>(aa_tmp.outerStride()); \ gemm_blocking_space<ColMajor,EIGTYPE,EIGTYPE,Dynamic,Dynamic,Dynamic> gemm_blocking(_rows,_cols,_depth, 1, true); \ - general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor>::run( \ - rows, cols, depth, aa_tmp.data(), aStride, _rhs, rhsStride, res, resStride, alpha, gemm_blocking, 0); \ + general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor,1>::run( \ + rows, cols, depth, aa_tmp.data(), aStride, _rhs, rhsStride, res, 1, resStride, alpha, gemm_blocking, 0); \ \ /*std::cout << "TRMM_L: A is not square! Go to BLAS GEMM implementation! " << nthr<<" \n";*/ \ } \ @@ -172,7 +174,7 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \ } \ /*std::cout << "TRMM_L: A is square! Go to BLAS TRMM implementation! \n";*/ \ /* call ?trmm*/ \ - BLASPREFIX##trmm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)b, &ldb); \ + BLASFUNC(&side, &uplo, &transa, &diag, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)b, &ldb); \ \ /* Add op(a_triangular)*b into res*/ \ Map<MatrixX##EIGPREFIX, 0, OuterStride<> > res_tmp(res,rows,cols,OuterStride<>(resStride)); \ @@ -180,13 +182,20 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \ } \ }; -EIGEN_BLAS_TRMM_L(double, double, d, d) -EIGEN_BLAS_TRMM_L(dcomplex, double, cd, z) -EIGEN_BLAS_TRMM_L(float, float, f, s) -EIGEN_BLAS_TRMM_L(scomplex, float, cf, c) +#ifdef EIGEN_USE_MKL +EIGEN_BLAS_TRMM_L(double, double, d, dtrmm) +EIGEN_BLAS_TRMM_L(dcomplex, MKL_Complex16, cd, ztrmm) +EIGEN_BLAS_TRMM_L(float, float, f, strmm) +EIGEN_BLAS_TRMM_L(scomplex, MKL_Complex8, cf, ctrmm) +#else +EIGEN_BLAS_TRMM_L(double, double, d, dtrmm_) +EIGEN_BLAS_TRMM_L(dcomplex, double, cd, ztrmm_) +EIGEN_BLAS_TRMM_L(float, float, f, strmm_) +EIGEN_BLAS_TRMM_L(scomplex, float, cf, ctrmm_) +#endif // implements col-major += alpha * op(general) * op(triangular) -#define EIGEN_BLAS_TRMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ +#define EIGEN_BLAS_TRMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \ template <typename Index, int Mode, \ int LhsStorageOrder, bool ConjugateLhs, \ int RhsStorageOrder, bool ConjugateRhs> \ @@ -225,8 +234,8 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \ if ((nthr==1) && (((std::max)(cols,depth)-diagSize)/(double)diagSize < 0.5)) { \ /* Most likely no benefit to call TRMM or GEMM from BLAS*/ \ product_triangular_matrix_matrix<EIGTYPE,Index,Mode,false, \ - LhsStorageOrder,ConjugateLhs, RhsStorageOrder, ConjugateRhs, ColMajor, BuiltIn>::run( \ - _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking); \ + LhsStorageOrder,ConjugateLhs, RhsStorageOrder, ConjugateRhs, ColMajor, 1, BuiltIn>::run( \ + _rows, _cols, _depth, _lhs, lhsStride, _rhs, 
rhsStride, res, 1, resStride, alpha, blocking); \ /*std::cout << "TRMM_R: A is not square! Go to Eigen TRMM implementation!\n";*/ \ } else { \ /* Make sense to call GEMM */ \ @@ -234,8 +243,8 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \ MatrixRhs aa_tmp=rhsMap.template triangularView<Mode>(); \ BlasIndex aStride = convert_index<BlasIndex>(aa_tmp.outerStride()); \ gemm_blocking_space<ColMajor,EIGTYPE,EIGTYPE,Dynamic,Dynamic,Dynamic> gemm_blocking(_rows,_cols,_depth, 1, true); \ - general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor>::run( \ - rows, cols, depth, _lhs, lhsStride, aa_tmp.data(), aStride, res, resStride, alpha, gemm_blocking, 0); \ + general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor,1>::run( \ + rows, cols, depth, _lhs, lhsStride, aa_tmp.data(), aStride, res, 1, resStride, alpha, gemm_blocking, 0); \ \ /*std::cout << "TRMM_R: A is not square! Go to BLAS GEMM implementation! " << nthr<<" \n";*/ \ } \ @@ -282,7 +291,7 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \ } \ /*std::cout << "TRMM_R: A is square! Go to BLAS TRMM implementation! \n";*/ \ /* call ?trmm*/ \ - BLASPREFIX##trmm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)b, &ldb); \ + BLASFUNC(&side, &uplo, &transa, &diag, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)b, &ldb); \ \ /* Add op(a_triangular)*b into res*/ \ Map<MatrixX##EIGPREFIX, 0, OuterStride<> > res_tmp(res,rows,cols,OuterStride<>(resStride)); \ @@ -290,11 +299,17 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \ } \ }; -EIGEN_BLAS_TRMM_R(double, double, d, d) -EIGEN_BLAS_TRMM_R(dcomplex, double, cd, z) -EIGEN_BLAS_TRMM_R(float, float, f, s) -EIGEN_BLAS_TRMM_R(scomplex, float, cf, c) - +#ifdef EIGEN_USE_MKL +EIGEN_BLAS_TRMM_R(double, double, d, dtrmm) +EIGEN_BLAS_TRMM_R(dcomplex, MKL_Complex16, cd, ztrmm) +EIGEN_BLAS_TRMM_R(float, float, f, strmm) +EIGEN_BLAS_TRMM_R(scomplex, MKL_Complex8, cf, ctrmm) +#else +EIGEN_BLAS_TRMM_R(double, double, d, dtrmm_) +EIGEN_BLAS_TRMM_R(dcomplex, double, cd, ztrmm_) +EIGEN_BLAS_TRMM_R(float, float, f, strmm_) +EIGEN_BLAS_TRMM_R(scomplex, float, cf, ctrmm_) +#endif } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/products/TriangularMatrixVector.h b/Eigen/src/Core/products/TriangularMatrixVector.h index 4b292e74d..76bfa159c 100644 --- a/Eigen/src/Core/products/TriangularMatrixVector.h +++ b/Eigen/src/Core/products/TriangularMatrixVector.h @@ -221,8 +221,9 @@ template<int Mode> struct trmv_selector<Mode,ColMajor> typename internal::add_const_on_value_type<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(lhs); typename internal::add_const_on_value_type<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(rhs); - ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(lhs) - * RhsBlasTraits::extractScalarFactor(rhs); + LhsScalar lhs_alpha = LhsBlasTraits::extractScalarFactor(lhs); + RhsScalar rhs_alpha = RhsBlasTraits::extractScalarFactor(rhs); + ResScalar actualAlpha = alpha * lhs_alpha * rhs_alpha; enum { // FIXME find a way to allow an inner stride on the result if packet_traits<Scalar>::size==1 @@ -274,6 +275,12 @@ template<int Mode> struct trmv_selector<Mode,ColMajor> else dest = MappedDest(actualDestPtr, dest.size()); } + + if ( ((Mode&UnitDiag)==UnitDiag) 
&& (lhs_alpha!=LhsScalar(1)) ) + { + Index diagSize = (std::min)(lhs.rows(),lhs.cols()); + dest.head(diagSize) -= (lhs_alpha-LhsScalar(1))*rhs.head(diagSize); + } } }; @@ -295,8 +302,9 @@ template<int Mode> struct trmv_selector<Mode,RowMajor> typename add_const<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(lhs); typename add_const<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(rhs); - ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(lhs) - * RhsBlasTraits::extractScalarFactor(rhs); + LhsScalar lhs_alpha = LhsBlasTraits::extractScalarFactor(lhs); + RhsScalar rhs_alpha = RhsBlasTraits::extractScalarFactor(rhs); + ResScalar actualAlpha = alpha * lhs_alpha * rhs_alpha; enum { DirectlyUseRhs = ActualRhsTypeCleaned::InnerStrideAtCompileTime==1 @@ -326,6 +334,12 @@ template<int Mode> struct trmv_selector<Mode,RowMajor> actualRhsPtr,1, dest.data(),dest.innerStride(), actualAlpha); + + if ( ((Mode&UnitDiag)==UnitDiag) && (lhs_alpha!=LhsScalar(1)) ) + { + Index diagSize = (std::min)(lhs.rows(),lhs.cols()); + dest.head(diagSize) -= (lhs_alpha-LhsScalar(1))*rhs.head(diagSize); + } } }; diff --git a/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h b/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h index 07bf26ce5..3d47a2b94 100644 --- a/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +++ b/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h @@ -71,7 +71,7 @@ EIGEN_BLAS_TRMV_SPECIALIZE(dcomplex) EIGEN_BLAS_TRMV_SPECIALIZE(scomplex) // implements col-major: res += alpha * op(triangular) * vector -#define EIGEN_BLAS_TRMV_CM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ +#define EIGEN_BLAS_TRMV_CM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX, BLASPOSTFIX) \ template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,ColMajor> { \ enum { \ @@ -121,10 +121,10 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE, diag = IsUnitDiag ? 'U' : 'N'; \ \ /* call ?TRMV*/ \ - BLASPREFIX##trmv_(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \ + BLASPREFIX##trmv##BLASPOSTFIX(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \ \ /* Add op(a_tr)rhs into res*/ \ - BLASPREFIX##axpy_(&n, &numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \ + BLASPREFIX##axpy##BLASPOSTFIX(&n, (const BLASTYPE*)&numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \ /* Non-square case - doesn't fit to BLAS ?TRMV. 
Fall to default triangular product*/ \ if (size<(std::max)(rows,cols)) { \ if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \ @@ -142,18 +142,25 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE, m = convert_index<BlasIndex>(size); \ n = convert_index<BlasIndex>(cols-size); \ } \ - BLASPREFIX##gemv_(&trans, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, &numext::real_ref(beta), (BLASTYPE*)y, &incy); \ + BLASPREFIX##gemv##BLASPOSTFIX(&trans, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)y, &incy); \ } \ } \ }; -EIGEN_BLAS_TRMV_CM(double, double, d, d) -EIGEN_BLAS_TRMV_CM(dcomplex, double, cd, z) -EIGEN_BLAS_TRMV_CM(float, float, f, s) -EIGEN_BLAS_TRMV_CM(scomplex, float, cf, c) +#ifdef EIGEN_USE_MKL +EIGEN_BLAS_TRMV_CM(double, double, d, d,) +EIGEN_BLAS_TRMV_CM(dcomplex, MKL_Complex16, cd, z,) +EIGEN_BLAS_TRMV_CM(float, float, f, s,) +EIGEN_BLAS_TRMV_CM(scomplex, MKL_Complex8, cf, c,) +#else +EIGEN_BLAS_TRMV_CM(double, double, d, d, _) +EIGEN_BLAS_TRMV_CM(dcomplex, double, cd, z, _) +EIGEN_BLAS_TRMV_CM(float, float, f, s, _) +EIGEN_BLAS_TRMV_CM(scomplex, float, cf, c, _) +#endif // implements row-major: res += alpha * op(triangular) * vector -#define EIGEN_BLAS_TRMV_RM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ +#define EIGEN_BLAS_TRMV_RM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX, BLASPOSTFIX) \ template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,RowMajor> { \ enum { \ @@ -203,10 +210,10 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE, diag = IsUnitDiag ? 'U' : 'N'; \ \ /* call ?TRMV*/ \ - BLASPREFIX##trmv_(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \ + BLASPREFIX##trmv##BLASPOSTFIX(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \ \ /* Add op(a_tr)rhs into res*/ \ - BLASPREFIX##axpy_(&n, &numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \ + BLASPREFIX##axpy##BLASPOSTFIX(&n, (const BLASTYPE*)&numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \ /* Non-square case - doesn't fit to BLAS ?TRMV. 
Fall to default triangular product*/ \
   if (size<(std::max)(rows,cols)) { \
     if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
@@ -224,15 +231,22 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,
       m = convert_index<BlasIndex>(size); \
       n = convert_index<BlasIndex>(cols-size); \
     } \
-    BLASPREFIX##gemv_(&trans, &n, &m, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, &numext::real_ref(beta), (BLASTYPE*)y, &incy); \
+    BLASPREFIX##gemv##BLASPOSTFIX(&trans, &n, &m, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)y, &incy); \
   } \
  } \
};

-EIGEN_BLAS_TRMV_RM(double, double, d, d)
-EIGEN_BLAS_TRMV_RM(dcomplex, double, cd, z)
-EIGEN_BLAS_TRMV_RM(float, float, f, s)
-EIGEN_BLAS_TRMV_RM(scomplex, float, cf, c)
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_TRMV_RM(double, double, d, d,)
+EIGEN_BLAS_TRMV_RM(dcomplex, MKL_Complex16, cd, z,)
+EIGEN_BLAS_TRMV_RM(float, float, f, s,)
+EIGEN_BLAS_TRMV_RM(scomplex, MKL_Complex8, cf, c,)
+#else
+EIGEN_BLAS_TRMV_RM(double, double, d, d,_)
+EIGEN_BLAS_TRMV_RM(dcomplex, double, cd, z,_)
+EIGEN_BLAS_TRMV_RM(float, float, f, s,_)
+EIGEN_BLAS_TRMV_RM(scomplex, float, cf, c,_)
+#endif

} // end namespace internal
diff --git a/Eigen/src/Core/products/TriangularSolverMatrix.h b/Eigen/src/Core/products/TriangularSolverMatrix.h
index 223c38b86..6d879ba00 100644
--- a/Eigen/src/Core/products/TriangularSolverMatrix.h
+++ b/Eigen/src/Core/products/TriangularSolverMatrix.h
@@ -15,48 +15,48 @@ namespace Eigen {
 namespace internal {

 // if the rhs is row major, let's transpose the product
-template <typename Scalar, typename Index, int Side, int Mode, bool Conjugate, int TriStorageOrder>
-struct triangular_solve_matrix<Scalar,Index,Side,Mode,Conjugate,TriStorageOrder,RowMajor>
+template <typename Scalar, typename Index, int Side, int Mode, bool Conjugate, int TriStorageOrder, int OtherInnerStride>
+struct triangular_solve_matrix<Scalar,Index,Side,Mode,Conjugate,TriStorageOrder,RowMajor,OtherInnerStride>
 {
   static void run(
     Index size, Index cols,
     const Scalar* tri, Index triStride,
-    Scalar* _other, Index otherStride,
+    Scalar* _other, Index otherIncr, Index otherStride,
     level3_blocking<Scalar,Scalar>& blocking)
   {
     triangular_solve_matrix<
       Scalar, Index, Side==OnTheLeft?OnTheRight:OnTheLeft,
       (Mode&UnitDiag) | ((Mode&Upper) ? Lower : Upper),
       NumTraits<Scalar>::IsComplex && Conjugate,
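The OtherInnerStride parameter introduced below follows the same convention as the product kernels: the right-hand-side entry (i,j) of the solve lives at _other[i*otherIncr + j*otherStride]. This is also why the scalar loops later in this file switch from raw pointer arithmetic (r[i3]) to LinearMapper calls (r(i3)): incrementing a raw pointer by one hard-codes a unit inner stride, while the mapper scales the index by the increment. A minimal sketch of the idea, with invented names:

#include <cstddef>

// Toy linear mapper: walks one column of a matrix whose consecutive rows
// are 'inc' scalars apart in memory (inc == 1 behaves like a raw pointer).
template <typename Scalar>
struct ToyLinearMapper {
  Scalar* p;
  std::ptrdiff_t inc;
  Scalar& operator()(std::ptrdiff_t i) const { return p[i * inc]; }
};

// r -= b*l over n entries: the shape of the substitution loops in this file.
template <typename Scalar>
void rank1_update_column(ToyLinearMapper<Scalar> r,
                         ToyLinearMapper<const Scalar> l,
                         Scalar b, std::ptrdiff_t n) {
  for (std::ptrdiff_t i = 0; i < n; ++i)
    r(i) -= b * l(i);  // correct for any inner stride, unlike r[i] -= b*l[i]
}

      TriStorageOrder==RowMajor ?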
ColMajor : RowMajor, ColMajor, OtherInnerStride> + ::run(size, cols, tri, triStride, _other, otherIncr, otherStride, blocking); } }; /* Optimized triangular solver with multiple right hand side and the triangular matrix on the left */ -template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder> -struct triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor> +template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder,int OtherInnerStride> +struct triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor,OtherInnerStride> { static EIGEN_DONT_INLINE void run( Index size, Index otherSize, const Scalar* _tri, Index triStride, - Scalar* _other, Index otherStride, + Scalar* _other, Index otherIncr, Index otherStride, level3_blocking<Scalar,Scalar>& blocking); }; -template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder> -EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor>::run( +template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder, int OtherInnerStride> +EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor,OtherInnerStride>::run( Index size, Index otherSize, const Scalar* _tri, Index triStride, - Scalar* _other, Index otherStride, + Scalar* _other, Index otherIncr, Index otherStride, level3_blocking<Scalar,Scalar>& blocking) { Index cols = otherSize; typedef const_blas_data_mapper<Scalar, Index, TriStorageOrder> TriMapper; - typedef blas_data_mapper<Scalar, Index, ColMajor> OtherMapper; + typedef blas_data_mapper<Scalar, Index, ColMajor, Unaligned, OtherInnerStride> OtherMapper; TriMapper tri(_tri, triStride); - OtherMapper other(_other, otherStride); + OtherMapper other(_other, otherStride, otherIncr); typedef gebp_traits<Scalar,Scalar> Traits; @@ -76,7 +76,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju conj_if<Conjugate> conj; gebp_kernel<Scalar, Scalar, Index, OtherMapper, Traits::mr, Traits::nr, Conjugate, false> gebp_kernel; - gemm_pack_lhs<Scalar, Index, TriMapper, Traits::mr, Traits::LhsProgress, TriStorageOrder> pack_lhs; + gemm_pack_lhs<Scalar, Index, TriMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, TriStorageOrder> pack_lhs; gemm_pack_rhs<Scalar, Index, OtherMapper, Traits::nr, ColMajor, false, true> pack_rhs; // the goal here is to subdivise the Rhs panels such that we keep some cache @@ -128,19 +128,21 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju { Scalar b(0); const Scalar* l = &tri(i,s); - Scalar* r = &other(s,j); + typename OtherMapper::LinearMapper r = other.getLinearMapper(s,j); for (Index i3=0; i3<k; ++i3) - b += conj(l[i3]) * r[i3]; + b += conj(l[i3]) * r(i3); other(i,j) = (other(i,j) - b)*a; } else { - Scalar b = (other(i,j) *= a); - Scalar* r = &other(s,j); - const Scalar* l = &tri(s,i); + Scalar& otherij = other(i,j); + otherij *= a; + Scalar b = otherij; + typename OtherMapper::LinearMapper r = other.getLinearMapper(s,j); + typename TriMapper::LinearMapper l = tri.getLinearMapper(s,i); for (Index i3=0;i3<rs;++i3) - r[i3] -= b * conj(l[i3]); + r(i3) -= b * conj(l(i3)); } } } @@ -185,28 +187,28 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju /* Optimized triangular solver with multiple left hand sides and the triangular matrix on the 
@@ -185,28 +187,28 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
 /* Optimized triangular solver with multiple left hand sides and the triangular matrix on the right
  */
-template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder>
-struct triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor>
+template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder, int OtherInnerStride>
+struct triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor,OtherInnerStride>
 {
   static EIGEN_DONT_INLINE void run(
     Index size, Index otherSize,
     const Scalar* _tri, Index triStride,
-    Scalar* _other, Index otherStride,
+    Scalar* _other, Index otherIncr, Index otherStride,
     level3_blocking<Scalar,Scalar>& blocking);
 };
-template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder>
-EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor>::run(
+template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder, int OtherInnerStride>
+EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor,OtherInnerStride>::run(
     Index size, Index otherSize,
     const Scalar* _tri, Index triStride,
-    Scalar* _other, Index otherStride,
+    Scalar* _other, Index otherIncr, Index otherStride,
     level3_blocking<Scalar,Scalar>& blocking)
   {
     Index rows = otherSize;
     typedef typename NumTraits<Scalar>::Real RealScalar;
 
-    typedef blas_data_mapper<Scalar, Index, ColMajor> LhsMapper;
+    typedef blas_data_mapper<Scalar, Index, ColMajor, Unaligned, OtherInnerStride> LhsMapper;
     typedef const_blas_data_mapper<Scalar, Index, TriStorageOrder> RhsMapper;
-    LhsMapper lhs(_other, otherStride);
+    LhsMapper lhs(_other, otherStride, otherIncr);
     RhsMapper rhs(_tri, triStride);
 
     typedef gebp_traits<Scalar,Scalar> Traits;
@@ -229,7 +231,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
     gebp_kernel<Scalar, Scalar, Index, LhsMapper, Traits::mr, Traits::nr, false, Conjugate> gebp_kernel;
     gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
     gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr, RhsStorageOrder,false,true> pack_rhs_panel;
-    gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, ColMajor, false, true> pack_lhs_panel;
+    gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, ColMajor, false, true> pack_lhs_panel;
 
     for(Index k2=IsLower ? size : 0;
         IsLower ? k2>0 : k2<size;
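For orientation, the OnTheRight specialization declared above solves X*T = B in place of B, sweeping the columns of X and delegating the bulk rank updates to gebp panels. Stripped of all blocking and packing, the column-oriented back substitution it implements looks roughly like this for an upper-triangular T; this is a simplified scalar sketch under column-major storage, not the kernel itself:

#include <cstddef>

// Scalar version of the OnTheRight solve X*U = B, U upper triangular,
// everything column-major; X overwrites B. The real kernel performs the
// same updates on cache-sized panels via gebp_kernel.
void solve_xu_eq_b(std::size_t rows, std::size_t n,
                   const double* U, double* X /* holds B on entry */) {
  for (std::size_t j = 0; j < n; ++j) {
    // x_j := (b_j - sum_{k<j} x_k * U(k,j)) / U(j,j)
    for (std::size_t k = 0; k < j; ++k) {
      const double u_kj = U[k + j * n];
      for (std::size_t i = 0; i < rows; ++i)
        X[i + j * rows] -= X[i + k * rows] * u_kj;
    }
    const double inv_ujj = 1.0 / U[j + j * n];
    for (std::size_t i = 0; i < rows; ++i)
      X[i + j * rows] *= inv_ujj;   // one reciprocal, then cheap multiplies
  }
}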
@@ -297,24 +299,24 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
             {
               Index j = IsLower ? absolute_j2+actualPanelWidth-k-1 : absolute_j2+k;
 
-              Scalar* r = &lhs(i2,j);
+              typename LhsMapper::LinearMapper r = lhs.getLinearMapper(i2,j);
               for (Index k3=0; k3<k; ++k3)
               {
                 Scalar b = conj(rhs(IsLower ? j+1+k3 : absolute_j2+k3,j));
-                Scalar* a = &lhs(i2,IsLower ? j+1+k3 : absolute_j2+k3);
+                typename LhsMapper::LinearMapper a = lhs.getLinearMapper(i2,IsLower ? j+1+k3 : absolute_j2+k3);
                 for (Index i=0; i<actual_mc; ++i)
-                  r[i] -= a[i] * b;
+                  r(i) -= a(i) * b;
               }
               if((Mode & UnitDiag)==0)
               {
                 Scalar inv_rjj = RealScalar(1)/conj(rhs(j,j));
                 for (Index i=0; i<actual_mc; ++i)
-                  r[i] *= inv_rjj;
+                  r(i) *= inv_rjj;
               }
             }
 
             // pack the just computed part of lhs to A
-            pack_lhs_panel(blockA, LhsMapper(_other+absolute_j2*otherStride+i2, otherStride),
+            pack_lhs_panel(blockA, lhs.getSubMapper(i2,absolute_j2),
                            actualPanelWidth, actual_mc,
                            actual_kc, j2);
           }
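The last hunk above replaces hand-written pointer arithmetic (_other + absolute_j2*otherStride + i2) with lhs.getSubMapper(i2, absolute_j2). Beyond readability, this is what keeps the packing step correct once an inner increment is allowed: the offset of element (i,j) is no longer simply i + j*stride. A sketch of the addressing, with a hypothetical Mapper mirroring (but not reproducing) blas_data_mapper:

#include <cstddef>

// Hypothetical mirror of the mapper's addressing: with an inner increment,
// element (i,j) lives at data[i*incr + j*stride], so a sub-mapper must
// scale the row offset too -- `data + j*stride + i` only works for incr==1.
template <typename Scalar>
struct Mapper {
  Scalar* data;
  std::ptrdiff_t stride, incr;
  Scalar& operator()(std::ptrdiff_t i, std::ptrdiff_t j) const {
    return data[i * incr + j * stride];
  }
  Mapper getSubMapper(std::ptrdiff_t i, std::ptrdiff_t j) const {
    return Mapper{data + i * incr + j * stride, stride, incr};
  }
};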
diff --git a/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h b/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h
index 88c0fb794..621194ce6 100644
--- a/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h
+++ b/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h
@@ -38,9 +38,9 @@ namespace Eigen {
 namespace internal {
 
 // implements LeftSide op(triangular)^-1 * general
-#define EIGEN_BLAS_TRSM_L(EIGTYPE, BLASTYPE, BLASPREFIX) \
+#define EIGEN_BLAS_TRSM_L(EIGTYPE, BLASTYPE, BLASFUNC) \
 template <typename Index, int Mode, bool Conjugate, int TriStorageOrder> \
-struct triangular_solve_matrix<EIGTYPE,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor> \
+struct triangular_solve_matrix<EIGTYPE,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor,1> \
 { \
   enum { \
     IsLower = (Mode&Lower) == Lower, \
@@ -51,8 +51,10 @@ struct triangular_solve_matrix<EIGTYPE,Index,OnTheLeft,Mode,Conjugate,TriStorage
   static void run( \
       Index size, Index otherSize, \
       const EIGTYPE* _tri, Index triStride, \
-      EIGTYPE* _other, Index otherStride, level3_blocking<EIGTYPE,EIGTYPE>& /*blocking*/) \
+      EIGTYPE* _other, Index otherIncr, Index otherStride, level3_blocking<EIGTYPE,EIGTYPE>& /*blocking*/) \
   { \
+   EIGEN_ONLY_USED_FOR_DEBUG(otherIncr); \
+   eigen_assert(otherIncr == 1); \
    BlasIndex m = convert_index<BlasIndex>(size), n = convert_index<BlasIndex>(otherSize), lda, ldb; \
    char side = 'L', uplo, diag='N', transa; \
    /* Set alpha_ */ \
@@ -80,20 +82,26 @@ struct triangular_solve_matrix<EIGTYPE,Index,OnTheLeft,Mode,Conjugate,TriStorage
   } \
   if (IsUnitDiag) diag='U'; \
/* call ?trsm*/ \
-   BLASPREFIX##trsm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)_other, &ldb); \
+   BLASFUNC(&side, &uplo, &transa, &diag, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)_other, &ldb); \
 } \
};
 
-EIGEN_BLAS_TRSM_L(double, double, d)
-EIGEN_BLAS_TRSM_L(dcomplex, double, z)
-EIGEN_BLAS_TRSM_L(float, float, s)
-EIGEN_BLAS_TRSM_L(scomplex, float, c)
-
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_TRSM_L(double, double, dtrsm)
+EIGEN_BLAS_TRSM_L(dcomplex, MKL_Complex16, ztrsm)
+EIGEN_BLAS_TRSM_L(float, float, strsm)
+EIGEN_BLAS_TRSM_L(scomplex, MKL_Complex8, ctrsm)
+#else
+EIGEN_BLAS_TRSM_L(double, double, dtrsm_)
+EIGEN_BLAS_TRSM_L(dcomplex, double, ztrsm_)
+EIGEN_BLAS_TRSM_L(float, float, strsm_)
+EIGEN_BLAS_TRSM_L(scomplex, float, ctrsm_)
+#endif
 
 // implements RightSide general * op(triangular)^-1
-#define EIGEN_BLAS_TRSM_R(EIGTYPE, BLASTYPE, BLASPREFIX) \
+#define EIGEN_BLAS_TRSM_R(EIGTYPE, BLASTYPE, BLASFUNC) \
 template <typename Index, int Mode, bool Conjugate, int TriStorageOrder> \
-struct triangular_solve_matrix<EIGTYPE,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor> \
+struct triangular_solve_matrix<EIGTYPE,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor,1> \
 { \
   enum { \
     IsLower = (Mode&Lower) == Lower, \
@@ -104,8 +112,10 @@ struct triangular_solve_matrix<EIGTYPE,Index,OnTheRight,Mode,Conjugate,TriStorag
   static void run( \
       Index size, Index otherSize, \
       const EIGTYPE* _tri, Index triStride, \
-      EIGTYPE* _other, Index otherStride, level3_blocking<EIGTYPE,EIGTYPE>& /*blocking*/) \
+      EIGTYPE* _other, Index otherIncr, Index otherStride, level3_blocking<EIGTYPE,EIGTYPE>& /*blocking*/) \
   { \
+   EIGEN_ONLY_USED_FOR_DEBUG(otherIncr); \
+   eigen_assert(otherIncr == 1); \
    BlasIndex m = convert_index<BlasIndex>(otherSize), n = convert_index<BlasIndex>(size), lda, ldb; \
    char side = 'R', uplo, diag='N', transa; \
    /* Set alpha_ */ \
@@ -133,16 +143,22 @@ struct triangular_solve_matrix<EIGTYPE,Index,OnTheRight,Mode,Conjugate,TriStorag
   } \
   if (IsUnitDiag) diag='U'; \
/* call ?trsm*/ \
-   BLASPREFIX##trsm_(&side, &uplo, &transa, &diag, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)_other, &ldb); \
+   BLASFUNC(&side, &uplo, &transa, &diag, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (BLASTYPE*)_other, &ldb); \
   /*std::cout << "TRMS_L specialization!\n";*/ \
 } \
};
 
-EIGEN_BLAS_TRSM_R(double, double, d)
-EIGEN_BLAS_TRSM_R(dcomplex, double, z)
-EIGEN_BLAS_TRSM_R(float, float, s)
-EIGEN_BLAS_TRSM_R(scomplex, float, c)
-
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_TRSM_R(double, double, dtrsm)
+EIGEN_BLAS_TRSM_R(dcomplex, MKL_Complex16, ztrsm)
+EIGEN_BLAS_TRSM_R(float, float, strsm)
+EIGEN_BLAS_TRSM_R(scomplex, MKL_Complex8, ctrsm)
+#else
+EIGEN_BLAS_TRSM_R(double, double, dtrsm_)
+EIGEN_BLAS_TRSM_R(dcomplex, double, ztrsm_)
+EIGEN_BLAS_TRSM_R(float, float, strsm_)
+EIGEN_BLAS_TRSM_R(scomplex, float, ctrsm_)
+#endif
 
 } // end namespace internal
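The two macros above now receive the complete BLAS symbol (dtrsm_ vs. MKL's typed dtrsm) instead of pasting a one-letter prefix, and the new ColMajor,1 template argument keeps this BLAS fast path reserved for a unit inner stride. For double precision the expansion boils down to a call like the one below; the dtrsm_ prototype follows the reference (netlib) Fortran calling convention and is stated here as an assumption about the linked BLAS:

extern "C" void dtrsm_(const char* side, const char* uplo, const char* transa,
                       const char* diag, const int* m, const int* n,
                       const double* alpha, const double* a, const int* lda,
                       double* b, const int* ldb);

// Solve L * X = alpha * B for X (left side, lower triangular, no
// transpose, non-unit diagonal), overwriting B; every argument is passed
// by pointer, as in the expanded EIGEN_BLAS_TRSM_L body.
void trsm_left_lower(int m, int n, const double* L, int lda,
                     double* B, int ldb) {
  const char side = 'L', uplo = 'L', transa = 'N', diag = 'N';
  const double alpha = 1.0;
  dtrsm_(&side, &uplo, &transa, &diag, &m, &n, &alpha, L, &lda, B, &ldb);
}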
diff --git a/Eigen/src/Core/products/TriangularSolverVector.h b/Eigen/src/Core/products/TriangularSolverVector.h
index b994759b2..647317016 100644
--- a/Eigen/src/Core/products/TriangularSolverVector.h
+++ b/Eigen/src/Core/products/TriangularSolverVector.h
@@ -58,7 +58,7 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
     {
       // let's directly call the low level product function because:
       // 1 - it is faster to compile
-      // 2 - it is slighlty faster at runtime
+      // 2 - it is slightly faster at runtime
       Index startRow = IsLower ? pi : pi-actualPanelWidth;
       Index startCol = IsLower ? 0 : pi;
 
@@ -77,7 +77,7 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
         if (k>0)
           rhs[i] -= (cjLhs.row(i).segment(s,k).transpose().cwiseProduct(Map<const Matrix<RhsScalar,Dynamic,1> >(rhs+s,k))).sum();
 
-        if(!(Mode & UnitDiag))
+        if((!(Mode & UnitDiag)) && numext::not_equal_strict(rhs[i],RhsScalar(0)))
          rhs[i] /= cjLhs(i,i);
       }
     }
@@ -114,20 +114,23 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
       for(Index k=0; k<actualPanelWidth; ++k)
       {
         Index i = IsLower ? pi+k : pi-k-1;
-        if(!(Mode & UnitDiag))
-          rhs[i] /= cjLhs.coeff(i,i);
-
-        Index r = actualPanelWidth - k - 1; // remaining size
-        Index s = IsLower ? i+1 : i-r;
-        if (r>0)
-          Map<Matrix<RhsScalar,Dynamic,1> >(rhs+s,r) -= rhs[i] * cjLhs.col(i).segment(s,r);
+        if(numext::not_equal_strict(rhs[i],RhsScalar(0)))
+        {
+          if(!(Mode & UnitDiag))
+            rhs[i] /= cjLhs.coeff(i,i);
+
+          Index r = actualPanelWidth - k - 1; // remaining size
+          Index s = IsLower ? i+1 : i-r;
+          if (r>0)
+            Map<Matrix<RhsScalar,Dynamic,1> >(rhs+s,r) -= rhs[i] * cjLhs.col(i).segment(s,r);
+        }
       }
       Index r = IsLower ? size - endBlock : startBlock; // remaining size
       if (r > 0)
       {
         // let's directly call the low level product function because:
         // 1 - it is faster to compile
-        // 2 - it is slighlty faster at runtime
+        // 2 - it is slightly faster at runtime
         general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,Conjugate,RhsScalar,RhsMapper,false>::run(
             r, actualPanelWidth,
             LhsMapper(&lhs.coeffRef(endBlock,startBlock), lhsStride),
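Both triangular_solve_vector hunks add an exact-zero guard via numext::not_equal_strict: an entry that is exactly zero contributes nothing to later updates, so the division and the column update can be skipped. That saves work for right-hand sides with many structural zeros and, presumably, also keeps 0/0 from manufacturing NaNs when a diagonal entry happens to be zero. A scalar sketch of the guarded forward substitution, for illustration only:

#include <cstddef>

// Forward substitution on a column-major lower-triangular n-by-n L with
// the same exact-zero guard as above: when rhs[i] == 0 the division and
// the update of the rows below would be no-ops, so both are skipped.
void lower_solve_skip_zeros(const double* L, std::size_t n, double* rhs) {
  for (std::size_t i = 0; i < n; ++i) {
    if (rhs[i] != 0.0) {                    // cf. numext::not_equal_strict
      rhs[i] /= L[i + i * n];               // divide by the diagonal
      for (std::size_t k = i + 1; k < n; ++k)
        rhs[k] -= rhs[i] * L[k + i * n];    // propagate to later rows
    }
  }
}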