aboutsummaryrefslogtreecommitdiff
path: root/Eigen/src/Core/products/GeneralBlockPanelKernel.h
diff options
context:
space:
mode:
Diffstat (limited to 'Eigen/src/Core/products/GeneralBlockPanelKernel.h')
-rw-r--r--Eigen/src/Core/products/GeneralBlockPanelKernel.h302
1 files changed, 162 insertions, 140 deletions
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index 5eb03c98c..bcdca5b0d 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -69,8 +69,8 @@ inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdi
* - the number of scalars that fit into a packet (when vectorization is enabled).
*
* \sa setCpuCacheSizes */
-template<typename LhsScalar, typename RhsScalar, int KcFactor>
-void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, std::ptrdiff_t& n)
+template<typename LhsScalar, typename RhsScalar, int KcFactor, typename SizeType>
+void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n)
{
EIGEN_UNUSED_VARIABLE(n);
// Explanations:
@@ -91,13 +91,13 @@ void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, std::ptrd
};
manage_caching_sizes(GetAction, &l1, &l2);
- k = std::min<std::ptrdiff_t>(k, l1/kdiv);
- std::ptrdiff_t _m = k>0 ? l2/(4 * sizeof(LhsScalar) * k) : 0;
+ k = std::min<SizeType>(k, l1/kdiv);
+ SizeType _m = k>0 ? l2/(4 * sizeof(LhsScalar) * k) : 0;
if(_m<m) m = _m & mr_mask;
}
-template<typename LhsScalar, typename RhsScalar>
-inline void computeProductBlockingSizes(std::ptrdiff_t& k, std::ptrdiff_t& m, std::ptrdiff_t& n)
+template<typename LhsScalar, typename RhsScalar, typename SizeType>
+inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n)
{
computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n);
}
@@ -527,9 +527,16 @@ struct gebp_kernel
ResPacketSize = Traits::ResPacketSize
};
- EIGEN_DONT_INLINE EIGEN_FLATTEN_ATTRIB
+ EIGEN_DONT_INLINE
void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha,
- Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0, RhsScalar* unpackedB = 0)
+ Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0, RhsScalar* unpackedB=0);
+};
+
+template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+EIGEN_DONT_INLINE
+void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
+ ::operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha,
+ Index strideA, Index strideB, Index offsetA, Index offsetB, RhsScalar* unpackedB)
{
Traits traits;
@@ -1089,7 +1096,7 @@ EIGEN_ASM_COMMENT("mybegin4");
}
}
}
-};
+
#undef CJMADD
@@ -1110,80 +1117,85 @@ EIGEN_ASM_COMMENT("mybegin4");
template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder, bool Conjugate, bool PanelMode>
struct gemm_pack_lhs
{
- EIGEN_DONT_INLINE void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows,
- Index stride=0, Index offset=0)
+ EIGEN_DONT_INLINE void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride=0, Index offset=0);
+};
+
+template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder, bool Conjugate, bool PanelMode>
+EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, StorageOrder, Conjugate, PanelMode>
+ ::operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride, Index offset)
+{
+ typedef typename packet_traits<Scalar>::type Packet;
+ enum { PacketSize = packet_traits<Scalar>::size };
+
+ EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
+ EIGEN_UNUSED_VARIABLE(stride)
+ EIGEN_UNUSED_VARIABLE(offset)
+ eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
+ eigen_assert( (StorageOrder==RowMajor) || ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) );
+ conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
+ const_blas_data_mapper<Scalar, Index, StorageOrder> lhs(_lhs,lhsStride);
+ Index count = 0;
+ Index peeled_mc = (rows/Pack1)*Pack1;
+ for(Index i=0; i<peeled_mc; i+=Pack1)
{
- typedef typename packet_traits<Scalar>::type Packet;
- enum { PacketSize = packet_traits<Scalar>::size };
-
- EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
- eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
- eigen_assert( (StorageOrder==RowMajor) || ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) );
- conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
- const_blas_data_mapper<Scalar, Index, StorageOrder> lhs(_lhs,lhsStride);
- Index count = 0;
- Index peeled_mc = (rows/Pack1)*Pack1;
- for(Index i=0; i<peeled_mc; i+=Pack1)
- {
- if(PanelMode) count += Pack1 * offset;
+ if(PanelMode) count += Pack1 * offset;
- if(StorageOrder==ColMajor)
+ if(StorageOrder==ColMajor)
+ {
+ for(Index k=0; k<depth; k++)
{
- for(Index k=0; k<depth; k++)
- {
- Packet A, B, C, D;
- if(Pack1>=1*PacketSize) A = ploadu<Packet>(&lhs(i+0*PacketSize, k));
- if(Pack1>=2*PacketSize) B = ploadu<Packet>(&lhs(i+1*PacketSize, k));
- if(Pack1>=3*PacketSize) C = ploadu<Packet>(&lhs(i+2*PacketSize, k));
- if(Pack1>=4*PacketSize) D = ploadu<Packet>(&lhs(i+3*PacketSize, k));
- if(Pack1>=1*PacketSize) { pstore(blockA+count, cj.pconj(A)); count+=PacketSize; }
- if(Pack1>=2*PacketSize) { pstore(blockA+count, cj.pconj(B)); count+=PacketSize; }
- if(Pack1>=3*PacketSize) { pstore(blockA+count, cj.pconj(C)); count+=PacketSize; }
- if(Pack1>=4*PacketSize) { pstore(blockA+count, cj.pconj(D)); count+=PacketSize; }
- }
+ Packet A, B, C, D;
+ if(Pack1>=1*PacketSize) A = ploadu<Packet>(&lhs(i+0*PacketSize, k));
+ if(Pack1>=2*PacketSize) B = ploadu<Packet>(&lhs(i+1*PacketSize, k));
+ if(Pack1>=3*PacketSize) C = ploadu<Packet>(&lhs(i+2*PacketSize, k));
+ if(Pack1>=4*PacketSize) D = ploadu<Packet>(&lhs(i+3*PacketSize, k));
+ if(Pack1>=1*PacketSize) { pstore(blockA+count, cj.pconj(A)); count+=PacketSize; }
+ if(Pack1>=2*PacketSize) { pstore(blockA+count, cj.pconj(B)); count+=PacketSize; }
+ if(Pack1>=3*PacketSize) { pstore(blockA+count, cj.pconj(C)); count+=PacketSize; }
+ if(Pack1>=4*PacketSize) { pstore(blockA+count, cj.pconj(D)); count+=PacketSize; }
}
- else
+ }
+ else
+ {
+ for(Index k=0; k<depth; k++)
{
- for(Index k=0; k<depth; k++)
+ // TODO add a vectorized transpose here
+ Index w=0;
+ for(; w<Pack1-3; w+=4)
{
- // TODO add a vectorized transpose here
- Index w=0;
- for(; w<Pack1-3; w+=4)
- {
- Scalar a(cj(lhs(i+w+0, k))),
- b(cj(lhs(i+w+1, k))),
- c(cj(lhs(i+w+2, k))),
- d(cj(lhs(i+w+3, k)));
- blockA[count++] = a;
- blockA[count++] = b;
- blockA[count++] = c;
- blockA[count++] = d;
- }
- if(Pack1%4)
- for(;w<Pack1;++w)
- blockA[count++] = cj(lhs(i+w, k));
+ Scalar a(cj(lhs(i+w+0, k))),
+ b(cj(lhs(i+w+1, k))),
+ c(cj(lhs(i+w+2, k))),
+ d(cj(lhs(i+w+3, k)));
+ blockA[count++] = a;
+ blockA[count++] = b;
+ blockA[count++] = c;
+ blockA[count++] = d;
}
+ if(Pack1%4)
+ for(;w<Pack1;++w)
+ blockA[count++] = cj(lhs(i+w, k));
}
- if(PanelMode) count += Pack1 * (stride-offset-depth);
- }
- if(rows-peeled_mc>=Pack2)
- {
- if(PanelMode) count += Pack2*offset;
- for(Index k=0; k<depth; k++)
- for(Index w=0; w<Pack2; w++)
- blockA[count++] = cj(lhs(peeled_mc+w, k));
- if(PanelMode) count += Pack2 * (stride-offset-depth);
- peeled_mc += Pack2;
- }
- for(Index i=peeled_mc; i<rows; i++)
- {
- if(PanelMode) count += offset;
- for(Index k=0; k<depth; k++)
- blockA[count++] = cj(lhs(i, k));
- if(PanelMode) count += (stride-offset-depth);
}
+ if(PanelMode) count += Pack1 * (stride-offset-depth);
}
-};
+ if(rows-peeled_mc>=Pack2)
+ {
+ if(PanelMode) count += Pack2*offset;
+ for(Index k=0; k<depth; k++)
+ for(Index w=0; w<Pack2; w++)
+ blockA[count++] = cj(lhs(peeled_mc+w, k));
+ if(PanelMode) count += Pack2 * (stride-offset-depth);
+ peeled_mc += Pack2;
+ }
+ for(Index i=peeled_mc; i<rows; i++)
+ {
+ if(PanelMode) count += offset;
+ for(Index k=0; k<depth; k++)
+ blockA[count++] = cj(lhs(i, k));
+ if(PanelMode) count += (stride-offset-depth);
+ }
+}
// copy a complete panel of the rhs
// this version is optimized for column major matrices
@@ -1197,92 +1209,102 @@ struct gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, PanelMode>
{
typedef typename packet_traits<Scalar>::type Packet;
enum { PacketSize = packet_traits<Scalar>::size };
- EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols,
- Index stride=0, Index offset=0)
+ EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride=0, Index offset=0);
+};
+
+template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
+EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, PanelMode>
+ ::operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride, Index offset)
+{
+ EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR");
+ EIGEN_UNUSED_VARIABLE(stride)
+ EIGEN_UNUSED_VARIABLE(offset)
+ eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
+ conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
+ Index packet_cols = (cols/nr) * nr;
+ Index count = 0;
+ for(Index j2=0; j2<packet_cols; j2+=nr)
{
- EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR");
- eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
- conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
- Index packet_cols = (cols/nr) * nr;
- Index count = 0;
- for(Index j2=0; j2<packet_cols; j2+=nr)
+ // skip what we have before
+ if(PanelMode) count += nr * offset;
+ const Scalar* b0 = &rhs[(j2+0)*rhsStride];
+ const Scalar* b1 = &rhs[(j2+1)*rhsStride];
+ const Scalar* b2 = &rhs[(j2+2)*rhsStride];
+ const Scalar* b3 = &rhs[(j2+3)*rhsStride];
+ for(Index k=0; k<depth; k++)
{
- // skip what we have before
- if(PanelMode) count += nr * offset;
- const Scalar* b0 = &rhs[(j2+0)*rhsStride];
- const Scalar* b1 = &rhs[(j2+1)*rhsStride];
- const Scalar* b2 = &rhs[(j2+2)*rhsStride];
- const Scalar* b3 = &rhs[(j2+3)*rhsStride];
- for(Index k=0; k<depth; k++)
- {
- blockB[count+0] = cj(b0[k]);
- blockB[count+1] = cj(b1[k]);
- if(nr==4) blockB[count+2] = cj(b2[k]);
- if(nr==4) blockB[count+3] = cj(b3[k]);
- count += nr;
- }
- // skip what we have after
- if(PanelMode) count += nr * (stride-offset-depth);
+ blockB[count+0] = cj(b0[k]);
+ blockB[count+1] = cj(b1[k]);
+ if(nr==4) blockB[count+2] = cj(b2[k]);
+ if(nr==4) blockB[count+3] = cj(b3[k]);
+ count += nr;
}
+ // skip what we have after
+ if(PanelMode) count += nr * (stride-offset-depth);
+ }
- // copy the remaining columns one at a time (nr==1)
- for(Index j2=packet_cols; j2<cols; ++j2)
+ // copy the remaining columns one at a time (nr==1)
+ for(Index j2=packet_cols; j2<cols; ++j2)
+ {
+ if(PanelMode) count += offset;
+ const Scalar* b0 = &rhs[(j2+0)*rhsStride];
+ for(Index k=0; k<depth; k++)
{
- if(PanelMode) count += offset;
- const Scalar* b0 = &rhs[(j2+0)*rhsStride];
- for(Index k=0; k<depth; k++)
- {
- blockB[count] = cj(b0[k]);
- count += 1;
- }
- if(PanelMode) count += (stride-offset-depth);
+ blockB[count] = cj(b0[k]);
+ count += 1;
}
+ if(PanelMode) count += (stride-offset-depth);
}
-};
+}
// this version is optimized for row major matrices
template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
struct gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, PanelMode>
{
enum { PacketSize = packet_traits<Scalar>::size };
- EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols,
- Index stride=0, Index offset=0)
+ EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride=0, Index offset=0);
+};
+
+template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
+EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, PanelMode>
+ ::operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride, Index offset)
+{
+ EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
+ EIGEN_UNUSED_VARIABLE(stride)
+ EIGEN_UNUSED_VARIABLE(offset)
+ eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
+ conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
+ Index packet_cols = (cols/nr) * nr;
+ Index count = 0;
+ for(Index j2=0; j2<packet_cols; j2+=nr)
{
- EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
- eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
- conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
- Index packet_cols = (cols/nr) * nr;
- Index count = 0;
- for(Index j2=0; j2<packet_cols; j2+=nr)
+ // skip what we have before
+ if(PanelMode) count += nr * offset;
+ for(Index k=0; k<depth; k++)
{
- // skip what we have before
- if(PanelMode) count += nr * offset;
- for(Index k=0; k<depth; k++)
- {
- const Scalar* b0 = &rhs[k*rhsStride + j2];
- blockB[count+0] = cj(b0[0]);
- blockB[count+1] = cj(b0[1]);
- if(nr==4) blockB[count+2] = cj(b0[2]);
- if(nr==4) blockB[count+3] = cj(b0[3]);
- count += nr;
- }
- // skip what we have after
- if(PanelMode) count += nr * (stride-offset-depth);
+ const Scalar* b0 = &rhs[k*rhsStride + j2];
+ blockB[count+0] = cj(b0[0]);
+ blockB[count+1] = cj(b0[1]);
+ if(nr==4) blockB[count+2] = cj(b0[2]);
+ if(nr==4) blockB[count+3] = cj(b0[3]);
+ count += nr;
}
- // copy the remaining columns one at a time (nr==1)
- for(Index j2=packet_cols; j2<cols; ++j2)
+ // skip what we have after
+ if(PanelMode) count += nr * (stride-offset-depth);
+ }
+ // copy the remaining columns one at a time (nr==1)
+ for(Index j2=packet_cols; j2<cols; ++j2)
+ {
+ if(PanelMode) count += offset;
+ const Scalar* b0 = &rhs[j2];
+ for(Index k=0; k<depth; k++)
{
- if(PanelMode) count += offset;
- const Scalar* b0 = &rhs[j2];
- for(Index k=0; k<depth; k++)
- {
- blockB[count] = cj(b0[k*rhsStride]);
- count += 1;
- }
- if(PanelMode) count += stride-offset-depth;
+ blockB[count] = cj(b0[k*rhsStride]);
+ count += 1;
}
+ if(PanelMode) count += stride-offset-depth;
}
-};
+}
} // end namespace internal