Diffstat (limited to 'Eigen/src/LU')
-rw-r--r--  Eigen/src/LU/Determinant.h        54
-rw-r--r--  Eigen/src/LU/FullPivLU.h          72
-rw-r--r--  Eigen/src/LU/InverseImpl.h        35
-rw-r--r--  Eigen/src/LU/PartialPivLU.h      129
-rw-r--r--  Eigen/src/LU/arch/InverseSize4.h 351
-rw-r--r--  Eigen/src/LU/arch/Inverse_SSE.h  338
6 files changed, 512 insertions, 467 deletions
diff --git a/Eigen/src/LU/Determinant.h b/Eigen/src/LU/Determinant.h
index d6a3c1e5a..3a41e6fcb 100644
--- a/Eigen/src/LU/Determinant.h
+++ b/Eigen/src/LU/Determinant.h
@@ -15,6 +15,7 @@ namespace Eigen {
namespace internal {
template<typename Derived>
+EIGEN_DEVICE_FUNC
inline const typename Derived::Scalar bruteforce_det3_helper
(const MatrixBase<Derived>& matrix, int a, int b, int c)
{
@@ -22,14 +23,6 @@ inline const typename Derived::Scalar bruteforce_det3_helper
* (matrix.coeff(1,b) * matrix.coeff(2,c) - matrix.coeff(1,c) * matrix.coeff(2,b));
}
-template<typename Derived>
-const typename Derived::Scalar bruteforce_det4_helper
-(const MatrixBase<Derived>& matrix, int j, int k, int m, int n)
-{
- return (matrix.coeff(j,0) * matrix.coeff(k,1) - matrix.coeff(k,0) * matrix.coeff(j,1))
- * (matrix.coeff(m,2) * matrix.coeff(n,3) - matrix.coeff(n,2) * matrix.coeff(m,3));
-}
-
template<typename Derived,
int DeterminantType = Derived::RowsAtCompileTime
> struct determinant_impl
@@ -44,7 +37,8 @@ template<typename Derived,
template<typename Derived> struct determinant_impl<Derived, 1>
{
- static inline typename traits<Derived>::Scalar run(const Derived& m)
+ static inline EIGEN_DEVICE_FUNC
+ typename traits<Derived>::Scalar run(const Derived& m)
{
return m.coeff(0,0);
}
@@ -52,7 +46,8 @@ template<typename Derived> struct determinant_impl<Derived, 1>
template<typename Derived> struct determinant_impl<Derived, 2>
{
- static inline typename traits<Derived>::Scalar run(const Derived& m)
+ static inline EIGEN_DEVICE_FUNC
+ typename traits<Derived>::Scalar run(const Derived& m)
{
return m.coeff(0,0) * m.coeff(1,1) - m.coeff(1,0) * m.coeff(0,1);
}
@@ -60,7 +55,8 @@ template<typename Derived> struct determinant_impl<Derived, 2>
template<typename Derived> struct determinant_impl<Derived, 3>
{
- static inline typename traits<Derived>::Scalar run(const Derived& m)
+ static inline EIGEN_DEVICE_FUNC
+ typename traits<Derived>::Scalar run(const Derived& m)
{
return bruteforce_det3_helper(m,0,1,2)
- bruteforce_det3_helper(m,1,0,2)
@@ -70,15 +66,34 @@ template<typename Derived> struct determinant_impl<Derived, 3>
template<typename Derived> struct determinant_impl<Derived, 4>
{
- static typename traits<Derived>::Scalar run(const Derived& m)
+ typedef typename traits<Derived>::Scalar Scalar;
+ static EIGEN_DEVICE_FUNC
+ Scalar run(const Derived& m)
+ {
+ Scalar d2_01 = det2(m, 0, 1);
+ Scalar d2_02 = det2(m, 0, 2);
+ Scalar d2_03 = det2(m, 0, 3);
+ Scalar d2_12 = det2(m, 1, 2);
+ Scalar d2_13 = det2(m, 1, 3);
+ Scalar d2_23 = det2(m, 2, 3);
+ Scalar d3_0 = det3(m, 1,d2_23, 2,d2_13, 3,d2_12);
+ Scalar d3_1 = det3(m, 0,d2_23, 2,d2_03, 3,d2_02);
+ Scalar d3_2 = det3(m, 0,d2_13, 1,d2_03, 3,d2_01);
+ Scalar d3_3 = det3(m, 0,d2_12, 1,d2_02, 2,d2_01);
+ return internal::pmadd(-m(0,3),d3_0, m(1,3)*d3_1) +
+ internal::pmadd(-m(2,3),d3_2, m(3,3)*d3_3);
+ }
+protected:
+ static EIGEN_DEVICE_FUNC
+ Scalar det2(const Derived& m, Index i0, Index i1)
+ {
+ return m(i0,0) * m(i1,1) - m(i1,0) * m(i0,1);
+ }
+
+ static EIGEN_DEVICE_FUNC
+ Scalar det3(const Derived& m, Index i0, const Scalar& d0, Index i1, const Scalar& d1, Index i2, const Scalar& d2)
{
- // trick by Martin Costabel to compute 4x4 det with only 30 muls
- return bruteforce_det4_helper(m,0,1,2,3)
- - bruteforce_det4_helper(m,0,2,1,3)
- + bruteforce_det4_helper(m,0,3,1,2)
- + bruteforce_det4_helper(m,1,2,0,3)
- - bruteforce_det4_helper(m,1,3,0,2)
- + bruteforce_det4_helper(m,2,3,0,1);
+ return internal::pmadd(m(i0,2), d0, internal::pmadd(-m(i1,2), d1, m(i2,2)*d2));
}
};
@@ -89,6 +104,7 @@ template<typename Derived> struct determinant_impl<Derived, 4>
* \returns the determinant of this matrix
*/
template<typename Derived>
+EIGEN_DEVICE_FUNC
inline typename internal::traits<Derived>::Scalar MatrixBase<Derived>::determinant() const
{
eigen_assert(rows() == cols());
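
A plain-scalar sketch of the new expansion may help readers not fluent in the pmadd-heavy style above: the 4x4 determinant is assembled from the six 2x2 minors of the first two columns, which feed four 3x3 cofactors, followed by a Laplace expansion along the last column. The names below are illustrative only, not Eigen API.

    // Scalar equivalent of determinant_impl<Derived,4>::run() above.
    double det2(const double m[4][4], int i0, int i1) {
      // 2x2 minor taken from columns 0,1 and rows i0,i1
      return m[i0][0] * m[i1][1] - m[i1][0] * m[i0][1];
    }
    double det3(const double m[4][4], int i0, double d0, int i1, double d1, int i2, double d2) {
      // 3x3 cofactor built from column 2 and three precomputed 2x2 minors
      return m[i0][2] * d0 - m[i1][2] * d1 + m[i2][2] * d2;
    }
    double det4(const double m[4][4]) {
      double d2_01 = det2(m,0,1), d2_02 = det2(m,0,2), d2_03 = det2(m,0,3);
      double d2_12 = det2(m,1,2), d2_13 = det2(m,1,3), d2_23 = det2(m,2,3);
      double d3_0 = det3(m, 1,d2_23, 2,d2_13, 3,d2_12);
      double d3_1 = det3(m, 0,d2_23, 2,d2_03, 3,d2_02);
      double d3_2 = det3(m, 0,d2_13, 1,d2_03, 3,d2_01);
      double d3_3 = det3(m, 0,d2_12, 1,d2_02, 2,d2_01);
      // Laplace expansion along column 3; pmadd in the real code fuses these.
      return -m[0][3]*d3_0 + m[1][3]*d3_1 - m[2][3]*d3_2 + m[3][3]*d3_3;
    }
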
diff --git a/Eigen/src/LU/FullPivLU.h b/Eigen/src/LU/FullPivLU.h
index 03b6af706..ba1749fa6 100644
--- a/Eigen/src/LU/FullPivLU.h
+++ b/Eigen/src/LU/FullPivLU.h
@@ -18,6 +18,7 @@ template<typename _MatrixType> struct traits<FullPivLU<_MatrixType> >
{
typedef MatrixXpr XprKind;
typedef SolverStorage StorageKind;
+ typedef int StorageIndex;
enum { Flags = 0 };
};
@@ -48,12 +49,12 @@ template<typename _MatrixType> struct traits<FullPivLU<_MatrixType> >
* The data of the LU decomposition can be directly accessed through the methods matrixLU(),
* permutationP(), permutationQ().
*
- * As an exemple, here is how the original matrix can be retrieved:
+ * As an example, here is how the original matrix can be retrieved:
* \include class_FullPivLU.cpp
* Output: \verbinclude class_FullPivLU.out
*
* This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
- *
+ *
* \sa MatrixBase::fullPivLu(), MatrixBase::determinant(), MatrixBase::inverse()
*/
template<typename _MatrixType> class FullPivLU
@@ -62,9 +63,9 @@ template<typename _MatrixType> class FullPivLU
public:
typedef _MatrixType MatrixType;
typedef SolverBase<FullPivLU> Base;
+ friend class SolverBase<FullPivLU>;
EIGEN_GENERIC_PUBLIC_INTERFACE(FullPivLU)
- // FIXME StorageIndex defined in EIGEN_GENERIC_PUBLIC_INTERFACE should be int
enum {
MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
@@ -218,6 +219,7 @@ template<typename _MatrixType> class FullPivLU
return internal::image_retval<FullPivLU>(*this, originalMatrix);
}
+ #ifdef EIGEN_PARSED_BY_DOXYGEN
/** \return a solution x to the equation Ax=b, where A is the matrix of which
* *this is the LU decomposition.
*
@@ -237,14 +239,10 @@ template<typename _MatrixType> class FullPivLU
*
* \sa TriangularView::solve(), kernel(), inverse()
*/
- // FIXME this is a copy-paste of the base-class member to add the isInitialized assertion.
template<typename Rhs>
inline const Solve<FullPivLU, Rhs>
- solve(const MatrixBase<Rhs>& b) const
- {
- eigen_assert(m_isInitialized && "LU is not initialized.");
- return Solve<FullPivLU, Rhs>(*this, b.derived());
- }
+ solve(const MatrixBase<Rhs>& b) const;
+ #endif
/** \returns an estimate of the reciprocal condition number of the matrix of which \c *this is
the LU decomposition.
@@ -320,7 +318,7 @@ template<typename _MatrixType> class FullPivLU
return m_usePrescribedThreshold ? m_prescribedThreshold
// this formula comes from experimenting (see "LU precision tuning" thread on the list)
// and turns out to be identical to Higham's formula used already in LDLt.
- : NumTraits<Scalar>::epsilon() * m_lu.diagonalSize();
+ : NumTraits<Scalar>::epsilon() * RealScalar(m_lu.diagonalSize());
}
/** \returns the rank of the matrix of which *this is the LU decomposition.
@@ -406,16 +404,16 @@ template<typename _MatrixType> class FullPivLU
MatrixType reconstructedMatrix() const;
- EIGEN_DEVICE_FUNC inline Index rows() const { return m_lu.rows(); }
- EIGEN_DEVICE_FUNC inline Index cols() const { return m_lu.cols(); }
+ EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+ inline Index rows() const EIGEN_NOEXCEPT { return m_lu.rows(); }
+ EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+ inline Index cols() const EIGEN_NOEXCEPT { return m_lu.cols(); }
#ifndef EIGEN_PARSED_BY_DOXYGEN
template<typename RhsType, typename DstType>
- EIGEN_DEVICE_FUNC
void _solve_impl(const RhsType &rhs, DstType &dst) const;
template<bool Conjugate, typename RhsType, typename DstType>
- EIGEN_DEVICE_FUNC
void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const;
#endif
@@ -531,8 +529,8 @@ void FullPivLU<MatrixType>::computeInPlace()
m_nonzero_pivots = k;
for(Index i = k; i < size; ++i)
{
- m_rowsTranspositions.coeffRef(i) = i;
- m_colsTranspositions.coeffRef(i) = i;
+ m_rowsTranspositions.coeffRef(i) = internal::convert_index<StorageIndex>(i);
+ m_colsTranspositions.coeffRef(i) = internal::convert_index<StorageIndex>(i);
}
break;
}
@@ -543,8 +541,8 @@ void FullPivLU<MatrixType>::computeInPlace()
// Now that we've found the pivot, we need to apply the row/col swaps to
// bring it to the location (k,k).
- m_rowsTranspositions.coeffRef(k) = row_of_biggest_in_corner;
- m_colsTranspositions.coeffRef(k) = col_of_biggest_in_corner;
+ m_rowsTranspositions.coeffRef(k) = internal::convert_index<StorageIndex>(row_of_biggest_in_corner);
+ m_colsTranspositions.coeffRef(k) = internal::convert_index<StorageIndex>(col_of_biggest_in_corner);
if(k != row_of_biggest_in_corner) {
m_lu.row(k).swap(m_lu.row(row_of_biggest_in_corner));
++number_of_transpositions;
@@ -757,7 +755,6 @@ void FullPivLU<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const
const Index rows = this->rows(),
cols = this->cols(),
nonzero_pivots = this->rank();
- eigen_assert(rhs.rows() == rows);
const Index smalldim = (std::min)(rows, cols);
if(nonzero_pivots == 0)
@@ -807,7 +804,6 @@ void FullPivLU<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType
const Index rows = this->rows(), cols = this->cols(),
nonzero_pivots = this->rank();
- eigen_assert(rhs.rows() == cols);
const Index smalldim = (std::min)(rows, cols);
if(nonzero_pivots == 0)
@@ -821,29 +817,19 @@ void FullPivLU<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType
// Step 1
c = permutationQ().inverse() * rhs;
- if (Conjugate) {
- // Step 2
- m_lu.topLeftCorner(nonzero_pivots, nonzero_pivots)
- .template triangularView<Upper>()
- .adjoint()
- .solveInPlace(c.topRows(nonzero_pivots));
- // Step 3
- m_lu.topLeftCorner(smalldim, smalldim)
- .template triangularView<UnitLower>()
- .adjoint()
- .solveInPlace(c.topRows(smalldim));
- } else {
- // Step 2
- m_lu.topLeftCorner(nonzero_pivots, nonzero_pivots)
- .template triangularView<Upper>()
- .transpose()
- .solveInPlace(c.topRows(nonzero_pivots));
- // Step 3
- m_lu.topLeftCorner(smalldim, smalldim)
- .template triangularView<UnitLower>()
- .transpose()
- .solveInPlace(c.topRows(smalldim));
- }
+ // Step 2
+ m_lu.topLeftCorner(nonzero_pivots, nonzero_pivots)
+ .template triangularView<Upper>()
+ .transpose()
+ .template conjugateIf<Conjugate>()
+ .solveInPlace(c.topRows(nonzero_pivots));
+
+ // Step 3
+ m_lu.topLeftCorner(smalldim, smalldim)
+ .template triangularView<UnitLower>()
+ .transpose()
+ .template conjugateIf<Conjugate>()
+ .solveInPlace(c.topRows(smalldim));
// Step 4
PermutationPType invp = permutationP().inverse().eval();
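
The two branches removed above differed only in whether each triangular solve went through .transpose() or .adjoint(). The conjugateIf<bool>() member introduced alongside this change folds that choice into the expression: conjugateIf<false>() is a no-op, conjugateIf<true>() conjugates, so transpose plus conjugateIf yields either the plain transpose or the adjoint. A minimal sketch of the pattern (assuming an Eigen version that ships conjugateIf; the function name is illustrative):

    #include <Eigen/Dense>
    using namespace Eigen;

    // One code path instead of an if/else duplicating the solve call:
    // Conjugate==false -> transpose-solve, Conjugate==true -> adjoint-solve.
    template <bool Conjugate>
    void upper_transposed_solve(const MatrixXcd& lu, VectorXcd& c) {
      lu.triangularView<Upper>()
        .transpose()
        .template conjugateIf<Conjugate>()
        .solveInPlace(c);
    }
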
diff --git a/Eigen/src/LU/InverseImpl.h b/Eigen/src/LU/InverseImpl.h
index 018f99b58..a40cefa9e 100644
--- a/Eigen/src/LU/InverseImpl.h
+++ b/Eigen/src/LU/InverseImpl.h
@@ -77,10 +77,11 @@ inline void compute_inverse_size2_helper(
const MatrixType& matrix, const typename ResultType::Scalar& invdet,
ResultType& result)
{
+ typename ResultType::Scalar temp = matrix.coeff(0,0);
result.coeffRef(0,0) = matrix.coeff(1,1) * invdet;
result.coeffRef(1,0) = -matrix.coeff(1,0) * invdet;
result.coeffRef(0,1) = -matrix.coeff(0,1) * invdet;
- result.coeffRef(1,1) = matrix.coeff(0,0) * invdet;
+ result.coeffRef(1,1) = temp * invdet;
}
template<typename MatrixType, typename ResultType>
@@ -143,13 +144,18 @@ inline void compute_inverse_size3_helper(
const Matrix<typename ResultType::Scalar,3,1>& cofactors_col0,
ResultType& result)
{
- result.row(0) = cofactors_col0 * invdet;
- result.coeffRef(1,0) = cofactor_3x3<MatrixType,0,1>(matrix) * invdet;
- result.coeffRef(1,1) = cofactor_3x3<MatrixType,1,1>(matrix) * invdet;
+ // Compute cofactors in a way that avoids aliasing issues.
+ typedef typename ResultType::Scalar Scalar;
+ const Scalar c01 = cofactor_3x3<MatrixType,0,1>(matrix) * invdet;
+ const Scalar c11 = cofactor_3x3<MatrixType,1,1>(matrix) * invdet;
+ const Scalar c02 = cofactor_3x3<MatrixType,0,2>(matrix) * invdet;
result.coeffRef(1,2) = cofactor_3x3<MatrixType,2,1>(matrix) * invdet;
- result.coeffRef(2,0) = cofactor_3x3<MatrixType,0,2>(matrix) * invdet;
result.coeffRef(2,1) = cofactor_3x3<MatrixType,1,2>(matrix) * invdet;
result.coeffRef(2,2) = cofactor_3x3<MatrixType,2,2>(matrix) * invdet;
+ result.coeffRef(1,0) = c01;
+ result.coeffRef(1,1) = c11;
+ result.coeffRef(2,0) = c02;
+ result.row(0) = cofactors_col0 * invdet;
}
template<typename MatrixType, typename ResultType>
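
The reordering above exists so that result may alias matrix (in-place inverse): every cofactor that still reads matrix is computed into locals before the first write into result, and row 0, which depends only on the precomputed cofactors_col0, is written last. The same idea drives the temp in compute_inverse_size2_helper. A scalar illustration of the hazard, with illustrative names:

    // In-place 2x2 inverse, scalar version. Without saving m[0][0] first,
    // the final write would read a value already overwritten by the first.
    void inverse2x2_inplace(double m[2][2], double invdet) {
      double m00 = m[0][0];            // save before it is clobbered
      m[0][0] =  m[1][1] * invdet;
      m[1][0] = -m[1][0] * invdet;
      m[0][1] = -m[0][1] * invdet;
      m[1][1] =  m00 * invdet;
    }
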
@@ -181,14 +187,13 @@ struct compute_inverse_and_det_with_check<MatrixType, ResultType, 3>
bool& invertible
)
{
- using std::abs;
typedef typename ResultType::Scalar Scalar;
Matrix<Scalar,3,1> cofactors_col0;
cofactors_col0.coeffRef(0) = cofactor_3x3<MatrixType,0,0>(matrix);
cofactors_col0.coeffRef(1) = cofactor_3x3<MatrixType,1,0>(matrix);
cofactors_col0.coeffRef(2) = cofactor_3x3<MatrixType,2,0>(matrix);
determinant = (cofactors_col0.cwiseProduct(matrix.col(0))).sum();
- invertible = abs(determinant) > absDeterminantThreshold;
+ invertible = Eigen::numext::abs(determinant) > absDeterminantThreshold;
if(!invertible) return;
const Scalar invdet = Scalar(1) / determinant;
compute_inverse_size3_helper(matrix, invdet, cofactors_col0, inverse);
@@ -273,7 +278,13 @@ struct compute_inverse_and_det_with_check<MatrixType, ResultType, 4>
using std::abs;
determinant = matrix.determinant();
invertible = abs(determinant) > absDeterminantThreshold;
- if(invertible) compute_inverse<MatrixType, ResultType>::run(matrix, inverse);
+ if(invertible && extract_data(matrix) != extract_data(inverse)) {
+ compute_inverse<MatrixType, ResultType>::run(matrix, inverse);
+ }
+ else if(invertible) {
+ MatrixType matrix_t = matrix;
+ compute_inverse<MatrixType, ResultType>::run(matrix_t, inverse);
+ }
}
};
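
With the extract_data comparison above, the 4x4 path detects aliasing between input and output and falls back to inverting an explicit copy (the copy noted in the documentation changes further down). A hedged usage sketch:

    #include <Eigen/Dense>
    #include <iostream>
    using namespace Eigen;

    int main() {
      // Well-conditioned test matrix.
      Matrix4d m = Matrix4d::Random() + 5.0 * Matrix4d::Identity();
      double det;
      bool invertible;
      // In-place: the output argument aliases the input; the 4x4 branch
      // above copies the matrix first, so the result is still correct.
      m.computeInverseAndDetWithCheck(m, det, invertible);
      std::cout << "invertible: " << invertible << " det: " << det << "\n";
    }
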
@@ -290,6 +301,7 @@ template<typename DstXprType, typename XprType>
struct Assignment<DstXprType, Inverse<XprType>, internal::assign_op<typename DstXprType::Scalar,typename XprType::Scalar>, Dense2Dense>
{
typedef Inverse<XprType> SrcXprType;
+ EIGEN_DEVICE_FUNC
static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename XprType::Scalar> &)
{
Index dstRows = src.rows();
@@ -332,6 +344,7 @@ struct Assignment<DstXprType, Inverse<XprType>, internal::assign_op<typename Dst
* \sa computeInverseAndDetWithCheck()
*/
template<typename Derived>
+EIGEN_DEVICE_FUNC
inline const Inverse<Derived> MatrixBase<Derived>::inverse() const
{
EIGEN_STATIC_ASSERT(!NumTraits<Scalar>::IsInteger,THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES)
@@ -345,6 +358,8 @@ inline const Inverse<Derived> MatrixBase<Derived>::inverse() const
*
* This is only for fixed-size square matrices of size up to 4x4.
*
+ * Note that this will trigger a copy of the input matrix when computing the inverse in place.
+ *
* \param inverse Reference to the matrix in which to store the inverse.
* \param determinant Reference to the variable in which to store the determinant.
* \param invertible Reference to the bool variable in which to store whether the matrix is invertible.
@@ -385,6 +400,8 @@ inline void MatrixBase<Derived>::computeInverseAndDetWithCheck(
*
* This is only for fixed-size square matrices of size up to 4x4.
*
+ * Note that this will trigger a copy of the input matrix when computing the inverse in place.
+ *
* \param inverse Reference to the matrix in which to store the inverse.
* \param invertible Reference to the bool variable in which to store whether the matrix is invertible.
* \param absDeterminantThreshold Optional parameter controlling the invertibility check.
@@ -404,7 +421,7 @@ inline void MatrixBase<Derived>::computeInverseWithCheck(
const RealScalar& absDeterminantThreshold
) const
{
- RealScalar determinant;
+ Scalar determinant;
// i'd love to put some static assertions there, but SFINAE means that they have no effect...
eigen_assert(rows() == cols());
computeInverseAndDetWithCheck(inverse,determinant,invertible,absDeterminantThreshold);
diff --git a/Eigen/src/LU/PartialPivLU.h b/Eigen/src/LU/PartialPivLU.h
index d43961887..34aed7249 100644
--- a/Eigen/src/LU/PartialPivLU.h
+++ b/Eigen/src/LU/PartialPivLU.h
@@ -19,6 +19,7 @@ template<typename _MatrixType> struct traits<PartialPivLU<_MatrixType> >
{
typedef MatrixXpr XprKind;
typedef SolverStorage StorageKind;
+ typedef int StorageIndex;
typedef traits<_MatrixType> BaseTraits;
enum {
Flags = BaseTraits::Flags & RowMajorBit,
@@ -69,7 +70,7 @@ struct enable_if_ref<Ref<T>,Derived> {
* The data of the LU decomposition can be directly accessed through the methods matrixLU(), permutationP().
*
* This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
- *
+ *
* \sa MatrixBase::partialPivLu(), MatrixBase::determinant(), MatrixBase::inverse(), MatrixBase::computeInverse(), class FullPivLU
*/
template<typename _MatrixType> class PartialPivLU
@@ -79,8 +80,9 @@ template<typename _MatrixType> class PartialPivLU
typedef _MatrixType MatrixType;
typedef SolverBase<PartialPivLU> Base;
+ friend class SolverBase<PartialPivLU>;
+
EIGEN_GENERIC_PUBLIC_INTERFACE(PartialPivLU)
- // FIXME StorageIndex defined in EIGEN_GENERIC_PUBLIC_INTERFACE should be int
enum {
MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
@@ -152,6 +154,7 @@ template<typename _MatrixType> class PartialPivLU
return m_p;
}
+ #ifdef EIGEN_PARSED_BY_DOXYGEN
/** This method returns the solution x to the equation Ax=b, where A is the matrix of which
* *this is the LU decomposition.
*
@@ -169,14 +172,10 @@ template<typename _MatrixType> class PartialPivLU
*
* \sa TriangularView::solve(), inverse(), computeInverse()
*/
- // FIXME this is a copy-paste of the base-class member to add the isInitialized assertion.
template<typename Rhs>
inline const Solve<PartialPivLU, Rhs>
- solve(const MatrixBase<Rhs>& b) const
- {
- eigen_assert(m_isInitialized && "PartialPivLU is not initialized.");
- return Solve<PartialPivLU, Rhs>(*this, b.derived());
- }
+ solve(const MatrixBase<Rhs>& b) const;
+ #endif
/** \returns an estimate of the reciprocal condition number of the matrix of which \c *this is
the LU decomposition.
@@ -217,8 +216,8 @@ template<typename _MatrixType> class PartialPivLU
MatrixType reconstructedMatrix() const;
- inline Index rows() const { return m_lu.rows(); }
- inline Index cols() const { return m_lu.cols(); }
+ EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_lu.rows(); }
+ EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_lu.cols(); }
#ifndef EIGEN_PARSED_BY_DOXYGEN
template<typename RhsType, typename DstType>
@@ -231,8 +230,6 @@ template<typename _MatrixType> class PartialPivLU
* Step 3: replace c by the solution x to Ux = c.
*/
- eigen_assert(rhs.rows() == m_lu.rows());
-
// Step 1
dst = permutationP() * rhs;
@@ -246,26 +243,21 @@ template<typename _MatrixType> class PartialPivLU
template<bool Conjugate, typename RhsType, typename DstType>
EIGEN_DEVICE_FUNC
void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const {
- /* The decomposition PA = LU can be rewritten as A = P^{-1} L U.
+ /* The decomposition PA = LU can be rewritten as A^T = U^T L^T P.
* So we proceed as follows:
- * Step 1: compute c = Pb.
- * Step 2: replace c by the solution x to Lx = c.
- * Step 3: replace c by the solution x to Ux = c.
+ * Step 1: compute c as the solution to U^T c = b.
+ * Step 2: replace c by the solution x to L^T x = c.
+ * Step 3: update c = P^{-1} c.
*/
eigen_assert(rhs.rows() == m_lu.cols());
- if (Conjugate) {
- // Step 1
- dst = m_lu.template triangularView<Upper>().adjoint().solve(rhs);
- // Step 2
- m_lu.template triangularView<UnitLower>().adjoint().solveInPlace(dst);
- } else {
- // Step 1
- dst = m_lu.template triangularView<Upper>().transpose().solve(rhs);
- // Step 2
- m_lu.template triangularView<UnitLower>().transpose().solveInPlace(dst);
- }
+ // Step 1
+ dst = m_lu.template triangularView<Upper>().transpose()
+ .template conjugateIf<Conjugate>().solve(rhs);
+ // Step 2
+ m_lu.template triangularView<UnitLower>().transpose()
+ .template conjugateIf<Conjugate>().solveInPlace(dst);
// Step 3
dst = permutationP().transpose() * dst;
}
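
For reference, the algebra behind the three steps (P is a permutation, so P^{-1} = P^T); a sketch in LaTeX:

    % From PA = LU:  A = P^{-1} L U, hence A^T = U^T L^T P.
    % Solving A^T x = b decomposes into two triangular solves and a permutation:
    \begin{align*}
      U^T L^T P x = b
        &\;\Rightarrow\; \text{Step 1: solve } U^T c_1 = b \\
        &\;\Rightarrow\; \text{Step 2: solve } L^T c_2 = c_1 \\
        &\;\Rightarrow\; \text{Step 3: } x = P^{-1} c_2 = P^T c_2
    \end{align*}
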
@@ -339,17 +331,18 @@ PartialPivLU<MatrixType>::PartialPivLU(EigenBase<InputType>& matrix)
namespace internal {
/** \internal This is the blocked version of fullpivlu_unblocked() */
-template<typename Scalar, int StorageOrder, typename PivIndex>
+template<typename Scalar, int StorageOrder, typename PivIndex, int SizeAtCompileTime=Dynamic>
struct partial_lu_impl
{
- // FIXME add a stride to Map, so that the following mapping becomes easier,
- // another option would be to create an expression being able to automatically
- // warp any Map, Matrix, and Block expressions as a unique type, but since that's exactly
- // a Map + stride, why not adding a stride to Map, and convenient ctors from a Matrix,
- // and Block.
- typedef Map<Matrix<Scalar, Dynamic, Dynamic, StorageOrder> > MapLU;
- typedef Block<MapLU, Dynamic, Dynamic> MatrixType;
- typedef Block<MatrixType,Dynamic,Dynamic> BlockType;
+ static const int UnBlockedBound = 16;
+ static const bool UnBlockedAtCompileTime = SizeAtCompileTime!=Dynamic && SizeAtCompileTime<=UnBlockedBound;
+ static const int ActualSizeAtCompileTime = UnBlockedAtCompileTime ? SizeAtCompileTime : Dynamic;
+ // Remaining rows and columns at compile-time:
+ static const int RRows = SizeAtCompileTime==2 ? 1 : Dynamic;
+ static const int RCols = SizeAtCompileTime==2 ? 1 : Dynamic;
+ typedef Matrix<Scalar, ActualSizeAtCompileTime, ActualSizeAtCompileTime, StorageOrder> MatrixType;
+ typedef Ref<MatrixType> MatrixTypeRef;
+ typedef Ref<Matrix<Scalar, Dynamic, Dynamic, StorageOrder> > BlockType;
typedef typename MatrixType::RealScalar RealScalar;
/** \internal performs the LU decomposition in-place of the matrix \a lu
@@ -362,19 +355,22 @@ struct partial_lu_impl
*
* \returns The index of the first pivot which is exactly zero if any, or a negative number otherwise.
*/
- static Index unblocked_lu(MatrixType& lu, PivIndex* row_transpositions, PivIndex& nb_transpositions)
+ static Index unblocked_lu(MatrixTypeRef& lu, PivIndex* row_transpositions, PivIndex& nb_transpositions)
{
typedef scalar_score_coeff_op<Scalar> Scoring;
typedef typename Scoring::result_type Score;
const Index rows = lu.rows();
const Index cols = lu.cols();
const Index size = (std::min)(rows,cols);
+ // For small compile-time matrices it is worth processing the last row separately:
+ // speedup: +100% for 2x2, +10% for others.
+ const Index endk = UnBlockedAtCompileTime ? size-1 : size;
nb_transpositions = 0;
Index first_zero_pivot = -1;
- for(Index k = 0; k < size; ++k)
+ for(Index k = 0; k < endk; ++k)
{
- Index rrows = rows-k-1;
- Index rcols = cols-k-1;
+ int rrows = internal::convert_index<int>(rows-k-1);
+ int rcols = internal::convert_index<int>(cols-k-1);
Index row_of_biggest_in_col;
Score biggest_in_corner
@@ -391,9 +387,7 @@ struct partial_lu_impl
++nb_transpositions;
}
- // FIXME shall we introduce a safe quotient expression in cas 1/lu.coeff(k,k)
- // overflow but not the actual quotient?
- lu.col(k).tail(rrows) /= lu.coeff(k,k);
+ lu.col(k).tail(fix<RRows>(rrows)) /= lu.coeff(k,k);
}
else if(first_zero_pivot==-1)
{
@@ -403,8 +397,18 @@ struct partial_lu_impl
}
if(k<rows-1)
- lu.bottomRightCorner(rrows,rcols).noalias() -= lu.col(k).tail(rrows) * lu.row(k).tail(rcols);
+ lu.bottomRightCorner(fix<RRows>(rrows),fix<RCols>(rcols)).noalias() -= lu.col(k).tail(fix<RRows>(rrows)) * lu.row(k).tail(fix<RCols>(rcols));
+ }
+
+ // special handling of the last entry
+ if(UnBlockedAtCompileTime)
+ {
+ Index k = endk;
+ row_transpositions[k] = PivIndex(k);
+ if (Scoring()(lu(k, k)) == Score(0) && first_zero_pivot == -1)
+ first_zero_pivot = k;
}
+
return first_zero_pivot;
}
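
The UnBlockedAtCompileTime special case pays off because the last pivot needs no elimination: for a compile-time 2x2 matrix the main loop then runs exactly once. A plain scalar sketch of that unrolled 2x2 partial-pivot LU (illustrative, not Eigen code):

    #include <algorithm>
    #include <cmath>

    // k = 0 is the only loop iteration; the last row (k = 1) only needs the
    // pivot/zero check, which is the special-case branch above.
    int lu2x2(double m[2][2], int row_transpositions[2]) {
      int first_zero_pivot = -1;
      int p = (std::abs(m[1][0]) > std::abs(m[0][0])) ? 1 : 0;
      row_transpositions[0] = p;
      if (p == 1) { std::swap(m[0][0], m[1][0]); std::swap(m[0][1], m[1][1]); }
      if (m[0][0] != 0.0) {
        m[1][0] /= m[0][0];             // L factor
        m[1][1] -= m[1][0] * m[0][1];   // Schur update of the 1x1 remainder
      } else {
        first_zero_pivot = 0;
      }
      row_transpositions[1] = 1;
      if (m[1][1] == 0.0 && first_zero_pivot == -1) first_zero_pivot = 1;
      return first_zero_pivot;
    }
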
@@ -420,18 +424,17 @@ struct partial_lu_impl
* \returns The index of the first pivot which is exactly zero if any, or a negative number otherwise.
*
* \note This very low level interface using pointers, etc. is to:
- * 1 - reduce the number of instanciations to the strict minimum
- * 2 - avoid infinite recursion of the instanciations with Block<Block<Block<...> > >
+ * 1 - reduce the number of instantiations to the strict minimum
+ * 2 - avoid infinite recursion of the instantiations with Block<Block<Block<...> > >
*/
static Index blocked_lu(Index rows, Index cols, Scalar* lu_data, Index luStride, PivIndex* row_transpositions, PivIndex& nb_transpositions, Index maxBlockSize=256)
{
- MapLU lu1(lu_data,StorageOrder==RowMajor?rows:luStride,StorageOrder==RowMajor?luStride:cols);
- MatrixType lu(lu1,0,0,rows,cols);
+ MatrixTypeRef lu = MatrixType::Map(lu_data,rows, cols, OuterStride<>(luStride));
const Index size = (std::min)(rows,cols);
// if the matrix is too small, no blocking:
- if(size<=16)
+ if(UnBlockedAtCompileTime || size<=UnBlockedBound)
{
return unblocked_lu(lu, row_transpositions, nb_transpositions);
}
@@ -457,12 +460,12 @@ struct partial_lu_impl
// A00 | A01 | A02
// lu = A_0 | A_1 | A_2 = A10 | A11 | A12
// A20 | A21 | A22
- BlockType A_0(lu,0,0,rows,k);
- BlockType A_2(lu,0,k+bs,rows,tsize);
- BlockType A11(lu,k,k,bs,bs);
- BlockType A12(lu,k,k+bs,bs,tsize);
- BlockType A21(lu,k+bs,k,trows,bs);
- BlockType A22(lu,k+bs,k+bs,trows,tsize);
+ BlockType A_0 = lu.block(0,0,rows,k);
+ BlockType A_2 = lu.block(0,k+bs,rows,tsize);
+ BlockType A11 = lu.block(k,k,bs,bs);
+ BlockType A12 = lu.block(k,k+bs,bs,tsize);
+ BlockType A21 = lu.block(k+bs,k,trows,bs);
+ BlockType A22 = lu.block(k+bs,k+bs,trows,tsize);
PivIndex nb_transpositions_in_panel;
// recursively call the blocked LU algorithm on [A11^T A21^T]^T
@@ -501,11 +504,18 @@ struct partial_lu_impl
template<typename MatrixType, typename TranspositionType>
void partial_lu_inplace(MatrixType& lu, TranspositionType& row_transpositions, typename TranspositionType::StorageIndex& nb_transpositions)
{
+ // Special-case of zero matrix.
+ if (lu.rows() == 0 || lu.cols() == 0) {
+ nb_transpositions = 0;
+ return;
+ }
eigen_assert(lu.cols() == row_transpositions.size());
- eigen_assert((&row_transpositions.coeffRef(1)-&row_transpositions.coeffRef(0)) == 1);
+ eigen_assert(row_transpositions.size() < 2 || (&row_transpositions.coeffRef(1)-&row_transpositions.coeffRef(0)) == 1);
partial_lu_impl
- <typename MatrixType::Scalar, MatrixType::Flags&RowMajorBit?RowMajor:ColMajor, typename TranspositionType::StorageIndex>
+ < typename MatrixType::Scalar, MatrixType::Flags&RowMajorBit?RowMajor:ColMajor,
+ typename TranspositionType::StorageIndex,
+ EIGEN_SIZE_MIN_PREFER_FIXED(MatrixType::RowsAtCompileTime,MatrixType::ColsAtCompileTime)>
::blocked_lu(lu.rows(), lu.cols(), &lu.coeffRef(0,0), lu.outerStride(), &row_transpositions.coeffRef(0), nb_transpositions);
}
@@ -519,7 +529,10 @@ void PartialPivLU<MatrixType>::compute()
// the row permutation is stored as int indices, so just to be sure:
eigen_assert(m_lu.rows()<NumTraits<int>::highest());
- m_l1_norm = m_lu.cwiseAbs().colwise().sum().maxCoeff();
+ if(m_lu.cols()>0)
+ m_l1_norm = m_lu.cwiseAbs().colwise().sum().maxCoeff();
+ else
+ m_l1_norm = RealScalar(0);
eigen_assert(m_lu.rows() == m_lu.cols() && "PartialPivLU is only for square (and moreover invertible) matrices");
const Index size = m_lu.rows();
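
With the compile-time size now forwarded through partial_lu_inplace, fixed-size decompositions of dimension 16 or less skip the blocked code path entirely. A hedged usage sketch:

    #include <Eigen/Dense>
    #include <iostream>
    using namespace Eigen;

    int main() {
      // Matrix4d: RowsAtCompileTime == ColsAtCompileTime == 4 <= 16, so
      // partial_lu_inplace instantiates the fixed-size unblocked path.
      Matrix4d A = Matrix4d::Random() + 5.0 * Matrix4d::Identity();
      Vector4d b = Vector4d::Ones();
      PartialPivLU<Matrix4d> lu(A);
      Vector4d x = lu.solve(b);
      std::cout << "residual: " << (A * x - b).norm() << "\n";
    }
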
diff --git a/Eigen/src/LU/arch/InverseSize4.h b/Eigen/src/LU/arch/InverseSize4.h
new file mode 100644
index 000000000..a232ffc0a
--- /dev/null
+++ b/Eigen/src/LU/arch/InverseSize4.h
@@ -0,0 +1,351 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2001 Intel Corporation
+// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+//
+// The algorithm below is a reimplementation of the former src/LU/Inverse_SSE.h using PacketMath.
+// inv(M) = M#/|M|, where inv(M), M# and |M| denote the inverse of M,
+// adjugate of M and determinant of M respectively. M# is computed block-wise
+// using specific formulae. For proof, see:
+// https://lxjk.github.io/2017/09/03/Fast-4x4-Matrix-Inverse-with-SSE-SIMD-Explained.html
+// Variable names are adopted from src/LU/Inverse_SSE.h.
+//
+// The SSE code for the 4x4 float and double matrix inverse in the former (deprecated) src/LU/Inverse_SSE.h
+// comes from the following Intel's library:
+// http://software.intel.com/en-us/articles/optimized-matrix-library-for-use-with-the-intel-pentiumr-4-processors-sse2-instructions/
+//
+// Here is the respective copyright and license statement:
+//
+// Copyright (c) 2001 Intel Corporation.
+//
+// Permition is granted to use, copy, distribute and prepare derivative works
+// of this library for any purpose and without fee, provided, that the above
+// copyright notice and this statement appear in all copies.
+// Intel makes no representations about the suitability of this software for
+// any purpose, and specifically disclaims all warranties.
+// See LEGAL.TXT for all the legal information.
+//
+// TODO: Unify implementations of different data types (i.e. float and double).
+#ifndef EIGEN_INVERSE_SIZE_4_H
+#define EIGEN_INVERSE_SIZE_4_H
+
+namespace Eigen
+{
+namespace internal
+{
+template <typename MatrixType, typename ResultType>
+struct compute_inverse_size4<Architecture::Target, float, MatrixType, ResultType>
+{
+ enum
+ {
+ MatrixAlignment = traits<MatrixType>::Alignment,
+ ResultAlignment = traits<ResultType>::Alignment,
+ StorageOrdersMatch = (MatrixType::Flags & RowMajorBit) == (ResultType::Flags & RowMajorBit)
+ };
+ typedef typename conditional<(MatrixType::Flags & LinearAccessBit), MatrixType const &, typename MatrixType::PlainObject>::type ActualMatrixType;
+
+ static void run(const MatrixType &mat, ResultType &result)
+ {
+ ActualMatrixType matrix(mat);
+
+ const float* data = matrix.data();
+ const Index stride = matrix.innerStride();
+ Packet4f _L1 = ploadt<Packet4f,MatrixAlignment>(data);
+ Packet4f _L2 = ploadt<Packet4f,MatrixAlignment>(data + stride*4);
+ Packet4f _L3 = ploadt<Packet4f,MatrixAlignment>(data + stride*8);
+ Packet4f _L4 = ploadt<Packet4f,MatrixAlignment>(data + stride*12);
+
+ // Four 2x2 sub-matrices of the input matrix
+ // input = [[A, B],
+ // [C, D]]
+ Packet4f A, B, C, D;
+
+ if (!StorageOrdersMatch)
+ {
+ A = vec4f_unpacklo(_L1, _L2);
+ B = vec4f_unpacklo(_L3, _L4);
+ C = vec4f_unpackhi(_L1, _L2);
+ D = vec4f_unpackhi(_L3, _L4);
+ }
+ else
+ {
+ A = vec4f_movelh(_L1, _L2);
+ B = vec4f_movehl(_L2, _L1);
+ C = vec4f_movelh(_L3, _L4);
+ D = vec4f_movehl(_L4, _L3);
+ }
+
+ Packet4f AB, DC;
+
+ // AB = A# * B, where A# denotes the adjugate of A, and * denotes matrix product.
+ AB = pmul(vec4f_swizzle2(A, A, 3, 3, 0, 0), B);
+ AB = psub(AB, pmul(vec4f_swizzle2(A, A, 1, 1, 2, 2), vec4f_swizzle2(B, B, 2, 3, 0, 1)));
+
+ // DC = D#*C
+ DC = pmul(vec4f_swizzle2(D, D, 3, 3, 0, 0), C);
+ DC = psub(DC, pmul(vec4f_swizzle2(D, D, 1, 1, 2, 2), vec4f_swizzle2(C, C, 2, 3, 0, 1)));
+
+ // determinants of the sub-matrices
+ Packet4f dA, dB, dC, dD;
+
+ dA = pmul(vec4f_swizzle2(A, A, 3, 3, 1, 1), A);
+ dA = psub(dA, vec4f_movehl(dA, dA));
+
+ dB = pmul(vec4f_swizzle2(B, B, 3, 3, 1, 1), B);
+ dB = psub(dB, vec4f_movehl(dB, dB));
+
+ dC = pmul(vec4f_swizzle2(C, C, 3, 3, 1, 1), C);
+ dC = psub(dC, vec4f_movehl(dC, dC));
+
+ dD = pmul(vec4f_swizzle2(D, D, 3, 3, 1, 1), D);
+ dD = psub(dD, vec4f_movehl(dD, dD));
+
+ Packet4f d, d1, d2;
+
+ d = pmul(vec4f_swizzle2(DC, DC, 0, 2, 1, 3), AB);
+ d = padd(d, vec4f_movehl(d, d));
+ d = padd(d, vec4f_swizzle2(d, d, 1, 0, 0, 0));
+ d1 = pmul(dA, dD);
+ d2 = pmul(dB, dC);
+
+ // determinant of the input matrix, det = |A||D| + |B||C| - trace(A#*B*D#*C)
+ Packet4f det = vec4f_duplane(psub(padd(d1, d2), d), 0);
+
+ // reciprocal of the determinant of the input matrix, rd = 1/det
+ Packet4f rd = pdiv(pset1<Packet4f>(1.0f), det);
+
+ // Four sub-matrices of the inverse
+ Packet4f iA, iB, iC, iD;
+
+ // iD = D*|A| - C*A#*B
+ iD = pmul(vec4f_swizzle2(C, C, 0, 0, 2, 2), vec4f_movelh(AB, AB));
+ iD = padd(iD, pmul(vec4f_swizzle2(C, C, 1, 1, 3, 3), vec4f_movehl(AB, AB)));
+ iD = psub(pmul(D, vec4f_duplane(dA, 0)), iD);
+
+ // iA = A*|D| - B*D#*C
+ iA = pmul(vec4f_swizzle2(B, B, 0, 0, 2, 2), vec4f_movelh(DC, DC));
+ iA = padd(iA, pmul(vec4f_swizzle2(B, B, 1, 1, 3, 3), vec4f_movehl(DC, DC)));
+ iA = psub(pmul(A, vec4f_duplane(dD, 0)), iA);
+
+ // iB = C*|B| - D * (A#B)# = C*|B| - D*B#*A
+ iB = pmul(D, vec4f_swizzle2(AB, AB, 3, 0, 3, 0));
+ iB = psub(iB, pmul(vec4f_swizzle2(D, D, 1, 0, 3, 2), vec4f_swizzle2(AB, AB, 2, 1, 2, 1)));
+ iB = psub(pmul(C, vec4f_duplane(dB, 0)), iB);
+
+ // iC = B*|C| - A * (D#C)# = B*|C| - A*C#*D
+ iC = pmul(A, vec4f_swizzle2(DC, DC, 3, 0, 3, 0));
+ iC = psub(iC, pmul(vec4f_swizzle2(A, A, 1, 0, 3, 2), vec4f_swizzle2(DC, DC, 2, 1, 2, 1)));
+ iC = psub(pmul(B, vec4f_duplane(dC, 0)), iC);
+
+ const float sign_mask[4] = {0.0f, numext::bit_cast<float>(0x80000000u), numext::bit_cast<float>(0x80000000u), 0.0f};
+ const Packet4f p4f_sign_PNNP = ploadu<Packet4f>(sign_mask);
+ rd = pxor(rd, p4f_sign_PNNP);
+ iA = pmul(iA, rd);
+ iB = pmul(iB, rd);
+ iC = pmul(iC, rd);
+ iD = pmul(iD, rd);
+
+ Index res_stride = result.outerStride();
+ float *res = result.data();
+
+ pstoret<float, Packet4f, ResultAlignment>(res + 0, vec4f_swizzle2(iA, iB, 3, 1, 3, 1));
+ pstoret<float, Packet4f, ResultAlignment>(res + res_stride, vec4f_swizzle2(iA, iB, 2, 0, 2, 0));
+ pstoret<float, Packet4f, ResultAlignment>(res + 2 * res_stride, vec4f_swizzle2(iC, iD, 3, 1, 3, 1));
+ pstoret<float, Packet4f, ResultAlignment>(res + 3 * res_stride, vec4f_swizzle2(iC, iD, 2, 0, 2, 0));
+ }
+};
+
+#if !(defined EIGEN_VECTORIZE_NEON && !(EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG))
+// same algorithm as above, except that each operand is split into
+// halves for two registers to hold.
+template <typename MatrixType, typename ResultType>
+struct compute_inverse_size4<Architecture::Target, double, MatrixType, ResultType>
+{
+ enum
+ {
+ MatrixAlignment = traits<MatrixType>::Alignment,
+ ResultAlignment = traits<ResultType>::Alignment,
+ StorageOrdersMatch = (MatrixType::Flags & RowMajorBit) == (ResultType::Flags & RowMajorBit)
+ };
+ typedef typename conditional<(MatrixType::Flags & LinearAccessBit),
+ MatrixType const &,
+ typename MatrixType::PlainObject>::type
+ ActualMatrixType;
+
+ static void run(const MatrixType &mat, ResultType &result)
+ {
+ ActualMatrixType matrix(mat);
+
+ // Four 2x2 sub-matrices of the input matrix, each is further divided into upper and lower
+ // row e.g. A1, upper row of A, A2, lower row of A
+ // input = [[A, B], = [[[A1, [B1,
+ // [C, D]] A2], B2]],
+ // [[C1, [D1,
+ // C2], D2]]]
+
+ Packet2d A1, A2, B1, B2, C1, C2, D1, D2;
+
+ const double* data = matrix.data();
+ const Index stride = matrix.innerStride();
+ if (StorageOrdersMatch)
+ {
+ A1 = ploadt<Packet2d,MatrixAlignment>(data + stride*0);
+ B1 = ploadt<Packet2d,MatrixAlignment>(data + stride*2);
+ A2 = ploadt<Packet2d,MatrixAlignment>(data + stride*4);
+ B2 = ploadt<Packet2d,MatrixAlignment>(data + stride*6);
+ C1 = ploadt<Packet2d,MatrixAlignment>(data + stride*8);
+ D1 = ploadt<Packet2d,MatrixAlignment>(data + stride*10);
+ C2 = ploadt<Packet2d,MatrixAlignment>(data + stride*12);
+ D2 = ploadt<Packet2d,MatrixAlignment>(data + stride*14);
+ }
+ else
+ {
+ Packet2d temp;
+ A1 = ploadt<Packet2d,MatrixAlignment>(data + stride*0);
+ C1 = ploadt<Packet2d,MatrixAlignment>(data + stride*2);
+ A2 = ploadt<Packet2d,MatrixAlignment>(data + stride*4);
+ C2 = ploadt<Packet2d,MatrixAlignment>(data + stride*6);
+ temp = A1;
+ A1 = vec2d_unpacklo(A1, A2);
+ A2 = vec2d_unpackhi(temp, A2);
+
+ temp = C1;
+ C1 = vec2d_unpacklo(C1, C2);
+ C2 = vec2d_unpackhi(temp, C2);
+
+ B1 = ploadt<Packet2d,MatrixAlignment>(data + stride*8);
+ D1 = ploadt<Packet2d,MatrixAlignment>(data + stride*10);
+ B2 = ploadt<Packet2d,MatrixAlignment>(data + stride*12);
+ D2 = ploadt<Packet2d,MatrixAlignment>(data + stride*14);
+
+ temp = B1;
+ B1 = vec2d_unpacklo(B1, B2);
+ B2 = vec2d_unpackhi(temp, B2);
+
+ temp = D1;
+ D1 = vec2d_unpacklo(D1, D2);
+ D2 = vec2d_unpackhi(temp, D2);
+ }
+
+ // determinants of the sub-matrices
+ Packet2d dA, dB, dC, dD;
+
+ dA = vec2d_swizzle2(A2, A2, 1);
+ dA = pmul(A1, dA);
+ dA = psub(dA, vec2d_duplane(dA, 1));
+
+ dB = vec2d_swizzle2(B2, B2, 1);
+ dB = pmul(B1, dB);
+ dB = psub(dB, vec2d_duplane(dB, 1));
+
+ dC = vec2d_swizzle2(C2, C2, 1);
+ dC = pmul(C1, dC);
+ dC = psub(dC, vec2d_duplane(dC, 1));
+
+ dD = vec2d_swizzle2(D2, D2, 1);
+ dD = pmul(D1, dD);
+ dD = psub(dD, vec2d_duplane(dD, 1));
+
+ Packet2d DC1, DC2, AB1, AB2;
+
+ // AB = A# * B, where A# denotes the adjugate of A, and * denotes matrix product.
+ AB1 = pmul(B1, vec2d_duplane(A2, 1));
+ AB2 = pmul(B2, vec2d_duplane(A1, 0));
+ AB1 = psub(AB1, pmul(B2, vec2d_duplane(A1, 1)));
+ AB2 = psub(AB2, pmul(B1, vec2d_duplane(A2, 0)));
+
+ // DC = D#*C
+ DC1 = pmul(C1, vec2d_duplane(D2, 1));
+ DC2 = pmul(C2, vec2d_duplane(D1, 0));
+ DC1 = psub(DC1, pmul(C2, vec2d_duplane(D1, 1)));
+ DC2 = psub(DC2, pmul(C1, vec2d_duplane(D2, 0)));
+
+ Packet2d d1, d2;
+
+ // determinant of the input matrix, det = |A||D| + |B||C| - trace(A#*B*D#*C)
+ Packet2d det;
+
+ // reciprocal of the determinant of the input matrix, rd = 1/det
+ Packet2d rd;
+
+ d1 = pmul(AB1, vec2d_swizzle2(DC1, DC2, 0));
+ d2 = pmul(AB2, vec2d_swizzle2(DC1, DC2, 3));
+ rd = padd(d1, d2);
+ rd = padd(rd, vec2d_duplane(rd, 1));
+
+ d1 = pmul(dA, dD);
+ d2 = pmul(dB, dC);
+
+ det = padd(d1, d2);
+ det = psub(det, rd);
+ det = vec2d_duplane(det, 0);
+ rd = pdiv(pset1<Packet2d>(1.0), det);
+
+ // rows of four sub-matrices of the inverse
+ Packet2d iA1, iA2, iB1, iB2, iC1, iC2, iD1, iD2;
+
+ // iD = D*|A| - C*A#*B
+ iD1 = pmul(AB1, vec2d_duplane(C1, 0));
+ iD2 = pmul(AB1, vec2d_duplane(C2, 0));
+ iD1 = padd(iD1, pmul(AB2, vec2d_duplane(C1, 1)));
+ iD2 = padd(iD2, pmul(AB2, vec2d_duplane(C2, 1)));
+ dA = vec2d_duplane(dA, 0);
+ iD1 = psub(pmul(D1, dA), iD1);
+ iD2 = psub(pmul(D2, dA), iD2);
+
+ // iA = A*|D| - B*D#*C
+ iA1 = pmul(DC1, vec2d_duplane(B1, 0));
+ iA2 = pmul(DC1, vec2d_duplane(B2, 0));
+ iA1 = padd(iA1, pmul(DC2, vec2d_duplane(B1, 1)));
+ iA2 = padd(iA2, pmul(DC2, vec2d_duplane(B2, 1)));
+ dD = vec2d_duplane(dD, 0);
+ iA1 = psub(pmul(A1, dD), iA1);
+ iA2 = psub(pmul(A2, dD), iA2);
+
+ // iB = C*|B| - D * (A#B)# = C*|B| - D*B#*A
+ iB1 = pmul(D1, vec2d_swizzle2(AB2, AB1, 1));
+ iB2 = pmul(D2, vec2d_swizzle2(AB2, AB1, 1));
+ iB1 = psub(iB1, pmul(vec2d_swizzle2(D1, D1, 1), vec2d_swizzle2(AB2, AB1, 2)));
+ iB2 = psub(iB2, pmul(vec2d_swizzle2(D2, D2, 1), vec2d_swizzle2(AB2, AB1, 2)));
+ dB = vec2d_duplane(dB, 0);
+ iB1 = psub(pmul(C1, dB), iB1);
+ iB2 = psub(pmul(C2, dB), iB2);
+
+ // iC = B*|C| - A * (D#C)# = B*|C| - A*C#*D
+ iC1 = pmul(A1, vec2d_swizzle2(DC2, DC1, 1));
+ iC2 = pmul(A2, vec2d_swizzle2(DC2, DC1, 1));
+ iC1 = psub(iC1, pmul(vec2d_swizzle2(A1, A1, 1), vec2d_swizzle2(DC2, DC1, 2)));
+ iC2 = psub(iC2, pmul(vec2d_swizzle2(A2, A2, 1), vec2d_swizzle2(DC2, DC1, 2)));
+ dC = vec2d_duplane(dC, 0);
+ iC1 = psub(pmul(B1, dC), iC1);
+ iC2 = psub(pmul(B2, dC), iC2);
+
+ const double sign_mask1[2] = {0.0, numext::bit_cast<double>(0x8000000000000000ull)};
+ const double sign_mask2[2] = {numext::bit_cast<double>(0x8000000000000000ull), 0.0};
+ const Packet2d sign_PN = ploadu<Packet2d>(sign_mask1);
+ const Packet2d sign_NP = ploadu<Packet2d>(sign_mask2);
+ d1 = pxor(rd, sign_PN);
+ d2 = pxor(rd, sign_NP);
+
+ Index res_stride = result.outerStride();
+ double *res = result.data();
+ pstoret<double, Packet2d, ResultAlignment>(res + 0, pmul(vec2d_swizzle2(iA2, iA1, 3), d1));
+ pstoret<double, Packet2d, ResultAlignment>(res + res_stride, pmul(vec2d_swizzle2(iA2, iA1, 0), d2));
+ pstoret<double, Packet2d, ResultAlignment>(res + 2, pmul(vec2d_swizzle2(iB2, iB1, 3), d1));
+ pstoret<double, Packet2d, ResultAlignment>(res + res_stride + 2, pmul(vec2d_swizzle2(iB2, iB1, 0), d2));
+ pstoret<double, Packet2d, ResultAlignment>(res + 2 * res_stride, pmul(vec2d_swizzle2(iC2, iC1, 3), d1));
+ pstoret<double, Packet2d, ResultAlignment>(res + 3 * res_stride, pmul(vec2d_swizzle2(iC2, iC1, 0), d2));
+ pstoret<double, Packet2d, ResultAlignment>(res + 2 * res_stride + 2, pmul(vec2d_swizzle2(iD2, iD1, 3), d1));
+ pstoret<double, Packet2d, ResultAlignment>(res + 3 * res_stride + 2, pmul(vec2d_swizzle2(iD2, iD1, 0), d2));
+ }
+};
+#endif
+} // namespace internal
+} // namespace Eigen
+#endif
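
For reference, the block identities the kernels above implement, written out in LaTeX (X^{#} denotes the adjugate of a 2x2 block; the final swizzle and sign-mask stores apply the per-block adjugates and the division by |M|, following the derivation linked in the header comment):

    M = \begin{pmatrix} A & B \\ C & D \end{pmatrix}, \qquad
    |M| = |A|\,|D| + |B|\,|C| - \operatorname{tr}\!\big((A^{\#}B)(D^{\#}C)\big)

    iA = |D|\,A - B\,(D^{\#}C) \qquad iB = |B|\,C - D\,(A^{\#}B)^{\#}
    iC = |C|\,B - A\,(D^{\#}C)^{\#} \qquad iD = |A|\,D - C\,(A^{\#}B)

    M^{-1} = \frac{1}{|M|} \begin{pmatrix} iA^{\#} & iB^{\#} \\ iC^{\#} & iD^{\#} \end{pmatrix}
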
diff --git a/Eigen/src/LU/arch/Inverse_SSE.h b/Eigen/src/LU/arch/Inverse_SSE.h
deleted file mode 100644
index ebb64a62b..000000000
--- a/Eigen/src/LU/arch/Inverse_SSE.h
+++ /dev/null
@@ -1,338 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2001 Intel Corporation
-// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
-// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-// The SSE code for the 4x4 float and double matrix inverse in this file
-// comes from the following Intel's library:
-// http://software.intel.com/en-us/articles/optimized-matrix-library-for-use-with-the-intel-pentiumr-4-processors-sse2-instructions/
-//
-// Here is the respective copyright and license statement:
-//
-// Copyright (c) 2001 Intel Corporation.
-//
-// Permition is granted to use, copy, distribute and prepare derivative works
-// of this library for any purpose and without fee, provided, that the above
-// copyright notice and this statement appear in all copies.
-// Intel makes no representations about the suitability of this software for
-// any purpose, and specifically disclaims all warranties.
-// See LEGAL.TXT for all the legal information.
-
-#ifndef EIGEN_INVERSE_SSE_H
-#define EIGEN_INVERSE_SSE_H
-
-namespace Eigen {
-
-namespace internal {
-
-template<typename MatrixType, typename ResultType>
-struct compute_inverse_size4<Architecture::SSE, float, MatrixType, ResultType>
-{
- enum {
- MatrixAlignment = traits<MatrixType>::Alignment,
- ResultAlignment = traits<ResultType>::Alignment,
- StorageOrdersMatch = (MatrixType::Flags&RowMajorBit) == (ResultType::Flags&RowMajorBit)
- };
- typedef typename conditional<(MatrixType::Flags&LinearAccessBit),MatrixType const &,typename MatrixType::PlainObject>::type ActualMatrixType;
-
- static void run(const MatrixType& mat, ResultType& result)
- {
- ActualMatrixType matrix(mat);
- EIGEN_ALIGN16 const unsigned int _Sign_PNNP[4] = { 0x00000000, 0x80000000, 0x80000000, 0x00000000 };
-
- // Load the full matrix into registers
- __m128 _L1 = matrix.template packet<MatrixAlignment>( 0);
- __m128 _L2 = matrix.template packet<MatrixAlignment>( 4);
- __m128 _L3 = matrix.template packet<MatrixAlignment>( 8);
- __m128 _L4 = matrix.template packet<MatrixAlignment>(12);
-
- // The inverse is calculated using "Divide and Conquer" technique. The
- // original matrix is divide into four 2x2 sub-matrices. Since each
- // register holds four matrix element, the smaller matrices are
- // represented as a registers. Hence we get a better locality of the
- // calculations.
-
- __m128 A, B, C, D; // the four sub-matrices
- if(!StorageOrdersMatch)
- {
- A = _mm_unpacklo_ps(_L1, _L2);
- B = _mm_unpacklo_ps(_L3, _L4);
- C = _mm_unpackhi_ps(_L1, _L2);
- D = _mm_unpackhi_ps(_L3, _L4);
- }
- else
- {
- A = _mm_movelh_ps(_L1, _L2);
- B = _mm_movehl_ps(_L2, _L1);
- C = _mm_movelh_ps(_L3, _L4);
- D = _mm_movehl_ps(_L4, _L3);
- }
-
- __m128 iA, iB, iC, iD, // partial inverse of the sub-matrices
- DC, AB;
- __m128 dA, dB, dC, dD; // determinant of the sub-matrices
- __m128 det, d, d1, d2;
- __m128 rd; // reciprocal of the determinant
-
- // AB = A# * B
- AB = _mm_mul_ps(_mm_shuffle_ps(A,A,0x0F), B);
- AB = _mm_sub_ps(AB,_mm_mul_ps(_mm_shuffle_ps(A,A,0xA5), _mm_shuffle_ps(B,B,0x4E)));
- // DC = D# * C
- DC = _mm_mul_ps(_mm_shuffle_ps(D,D,0x0F), C);
- DC = _mm_sub_ps(DC,_mm_mul_ps(_mm_shuffle_ps(D,D,0xA5), _mm_shuffle_ps(C,C,0x4E)));
-
- // dA = |A|
- dA = _mm_mul_ps(_mm_shuffle_ps(A, A, 0x5F),A);
- dA = _mm_sub_ss(dA, _mm_movehl_ps(dA,dA));
- // dB = |B|
- dB = _mm_mul_ps(_mm_shuffle_ps(B, B, 0x5F),B);
- dB = _mm_sub_ss(dB, _mm_movehl_ps(dB,dB));
-
- // dC = |C|
- dC = _mm_mul_ps(_mm_shuffle_ps(C, C, 0x5F),C);
- dC = _mm_sub_ss(dC, _mm_movehl_ps(dC,dC));
- // dD = |D|
- dD = _mm_mul_ps(_mm_shuffle_ps(D, D, 0x5F),D);
- dD = _mm_sub_ss(dD, _mm_movehl_ps(dD,dD));
-
- // d = trace(AB*DC) = trace(A#*B*D#*C)
- d = _mm_mul_ps(_mm_shuffle_ps(DC,DC,0xD8),AB);
-
- // iD = C*A#*B
- iD = _mm_mul_ps(_mm_shuffle_ps(C,C,0xA0), _mm_movelh_ps(AB,AB));
- iD = _mm_add_ps(iD,_mm_mul_ps(_mm_shuffle_ps(C,C,0xF5), _mm_movehl_ps(AB,AB)));
- // iA = B*D#*C
- iA = _mm_mul_ps(_mm_shuffle_ps(B,B,0xA0), _mm_movelh_ps(DC,DC));
- iA = _mm_add_ps(iA,_mm_mul_ps(_mm_shuffle_ps(B,B,0xF5), _mm_movehl_ps(DC,DC)));
-
- // d = trace(AB*DC) = trace(A#*B*D#*C) [continue]
- d = _mm_add_ps(d, _mm_movehl_ps(d, d));
- d = _mm_add_ss(d, _mm_shuffle_ps(d, d, 1));
- d1 = _mm_mul_ss(dA,dD);
- d2 = _mm_mul_ss(dB,dC);
-
- // iD = D*|A| - C*A#*B
- iD = _mm_sub_ps(_mm_mul_ps(D,_mm_shuffle_ps(dA,dA,0)), iD);
-
- // iA = A*|D| - B*D#*C;
- iA = _mm_sub_ps(_mm_mul_ps(A,_mm_shuffle_ps(dD,dD,0)), iA);
-
- // det = |A|*|D| + |B|*|C| - trace(A#*B*D#*C)
- det = _mm_sub_ss(_mm_add_ss(d1,d2),d);
- rd = _mm_div_ss(_mm_set_ss(1.0f), det);
-
-// #ifdef ZERO_SINGULAR
-// rd = _mm_and_ps(_mm_cmpneq_ss(det,_mm_setzero_ps()), rd);
-// #endif
-
- // iB = D * (A#B)# = D*B#*A
- iB = _mm_mul_ps(D, _mm_shuffle_ps(AB,AB,0x33));
- iB = _mm_sub_ps(iB, _mm_mul_ps(_mm_shuffle_ps(D,D,0xB1), _mm_shuffle_ps(AB,AB,0x66)));
- // iC = A * (D#C)# = A*C#*D
- iC = _mm_mul_ps(A, _mm_shuffle_ps(DC,DC,0x33));
- iC = _mm_sub_ps(iC, _mm_mul_ps(_mm_shuffle_ps(A,A,0xB1), _mm_shuffle_ps(DC,DC,0x66)));
-
- rd = _mm_shuffle_ps(rd,rd,0);
- rd = _mm_xor_ps(rd, _mm_load_ps((float*)_Sign_PNNP));
-
- // iB = C*|B| - D*B#*A
- iB = _mm_sub_ps(_mm_mul_ps(C,_mm_shuffle_ps(dB,dB,0)), iB);
-
- // iC = B*|C| - A*C#*D;
- iC = _mm_sub_ps(_mm_mul_ps(B,_mm_shuffle_ps(dC,dC,0)), iC);
-
- // iX = iX / det
- iA = _mm_mul_ps(rd,iA);
- iB = _mm_mul_ps(rd,iB);
- iC = _mm_mul_ps(rd,iC);
- iD = _mm_mul_ps(rd,iD);
-
- Index res_stride = result.outerStride();
- float* res = result.data();
- pstoret<float, Packet4f, ResultAlignment>(res+0, _mm_shuffle_ps(iA,iB,0x77));
- pstoret<float, Packet4f, ResultAlignment>(res+res_stride, _mm_shuffle_ps(iA,iB,0x22));
- pstoret<float, Packet4f, ResultAlignment>(res+2*res_stride, _mm_shuffle_ps(iC,iD,0x77));
- pstoret<float, Packet4f, ResultAlignment>(res+3*res_stride, _mm_shuffle_ps(iC,iD,0x22));
- }
-
-};
-
-template<typename MatrixType, typename ResultType>
-struct compute_inverse_size4<Architecture::SSE, double, MatrixType, ResultType>
-{
- enum {
- MatrixAlignment = traits<MatrixType>::Alignment,
- ResultAlignment = traits<ResultType>::Alignment,
- StorageOrdersMatch = (MatrixType::Flags&RowMajorBit) == (ResultType::Flags&RowMajorBit)
- };
- typedef typename conditional<(MatrixType::Flags&LinearAccessBit),MatrixType const &,typename MatrixType::PlainObject>::type ActualMatrixType;
-
- static void run(const MatrixType& mat, ResultType& result)
- {
- ActualMatrixType matrix(mat);
- const __m128d _Sign_NP = _mm_castsi128_pd(_mm_set_epi32(0x0,0x0,0x80000000,0x0));
- const __m128d _Sign_PN = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
-
- // The inverse is calculated using "Divide and Conquer" technique. The
- // original matrix is divide into four 2x2 sub-matrices. Since each
- // register of the matrix holds two elements, the smaller matrices are
- // consisted of two registers. Hence we get a better locality of the
- // calculations.
-
- // the four sub-matrices
- __m128d A1, A2, B1, B2, C1, C2, D1, D2;
-
- if(StorageOrdersMatch)
- {
- A1 = matrix.template packet<MatrixAlignment>( 0); B1 = matrix.template packet<MatrixAlignment>( 2);
- A2 = matrix.template packet<MatrixAlignment>( 4); B2 = matrix.template packet<MatrixAlignment>( 6);
- C1 = matrix.template packet<MatrixAlignment>( 8); D1 = matrix.template packet<MatrixAlignment>(10);
- C2 = matrix.template packet<MatrixAlignment>(12); D2 = matrix.template packet<MatrixAlignment>(14);
- }
- else
- {
- __m128d tmp;
- A1 = matrix.template packet<MatrixAlignment>( 0); C1 = matrix.template packet<MatrixAlignment>( 2);
- A2 = matrix.template packet<MatrixAlignment>( 4); C2 = matrix.template packet<MatrixAlignment>( 6);
- tmp = A1;
- A1 = _mm_unpacklo_pd(A1,A2);
- A2 = _mm_unpackhi_pd(tmp,A2);
- tmp = C1;
- C1 = _mm_unpacklo_pd(C1,C2);
- C2 = _mm_unpackhi_pd(tmp,C2);
-
- B1 = matrix.template packet<MatrixAlignment>( 8); D1 = matrix.template packet<MatrixAlignment>(10);
- B2 = matrix.template packet<MatrixAlignment>(12); D2 = matrix.template packet<MatrixAlignment>(14);
- tmp = B1;
- B1 = _mm_unpacklo_pd(B1,B2);
- B2 = _mm_unpackhi_pd(tmp,B2);
- tmp = D1;
- D1 = _mm_unpacklo_pd(D1,D2);
- D2 = _mm_unpackhi_pd(tmp,D2);
- }
-
- __m128d iA1, iA2, iB1, iB2, iC1, iC2, iD1, iD2, // partial invese of the sub-matrices
- DC1, DC2, AB1, AB2;
- __m128d dA, dB, dC, dD; // determinant of the sub-matrices
- __m128d det, d1, d2, rd;
-
- // dA = |A|
- dA = _mm_shuffle_pd(A2, A2, 1);
- dA = _mm_mul_pd(A1, dA);
- dA = _mm_sub_sd(dA, _mm_shuffle_pd(dA,dA,3));
- // dB = |B|
- dB = _mm_shuffle_pd(B2, B2, 1);
- dB = _mm_mul_pd(B1, dB);
- dB = _mm_sub_sd(dB, _mm_shuffle_pd(dB,dB,3));
-
- // AB = A# * B
- AB1 = _mm_mul_pd(B1, _mm_shuffle_pd(A2,A2,3));
- AB2 = _mm_mul_pd(B2, _mm_shuffle_pd(A1,A1,0));
- AB1 = _mm_sub_pd(AB1, _mm_mul_pd(B2, _mm_shuffle_pd(A1,A1,3)));
- AB2 = _mm_sub_pd(AB2, _mm_mul_pd(B1, _mm_shuffle_pd(A2,A2,0)));
-
- // dC = |C|
- dC = _mm_shuffle_pd(C2, C2, 1);
- dC = _mm_mul_pd(C1, dC);
- dC = _mm_sub_sd(dC, _mm_shuffle_pd(dC,dC,3));
- // dD = |D|
- dD = _mm_shuffle_pd(D2, D2, 1);
- dD = _mm_mul_pd(D1, dD);
- dD = _mm_sub_sd(dD, _mm_shuffle_pd(dD,dD,3));
-
- // DC = D# * C
- DC1 = _mm_mul_pd(C1, _mm_shuffle_pd(D2,D2,3));
- DC2 = _mm_mul_pd(C2, _mm_shuffle_pd(D1,D1,0));
- DC1 = _mm_sub_pd(DC1, _mm_mul_pd(C2, _mm_shuffle_pd(D1,D1,3)));
- DC2 = _mm_sub_pd(DC2, _mm_mul_pd(C1, _mm_shuffle_pd(D2,D2,0)));
-
- // rd = trace(AB*DC) = trace(A#*B*D#*C)
- d1 = _mm_mul_pd(AB1, _mm_shuffle_pd(DC1, DC2, 0));
- d2 = _mm_mul_pd(AB2, _mm_shuffle_pd(DC1, DC2, 3));
- rd = _mm_add_pd(d1, d2);
- rd = _mm_add_sd(rd, _mm_shuffle_pd(rd, rd,3));
-
- // iD = C*A#*B
- iD1 = _mm_mul_pd(AB1, _mm_shuffle_pd(C1,C1,0));
- iD2 = _mm_mul_pd(AB1, _mm_shuffle_pd(C2,C2,0));
- iD1 = _mm_add_pd(iD1, _mm_mul_pd(AB2, _mm_shuffle_pd(C1,C1,3)));
- iD2 = _mm_add_pd(iD2, _mm_mul_pd(AB2, _mm_shuffle_pd(C2,C2,3)));
-
- // iA = B*D#*C
- iA1 = _mm_mul_pd(DC1, _mm_shuffle_pd(B1,B1,0));
- iA2 = _mm_mul_pd(DC1, _mm_shuffle_pd(B2,B2,0));
- iA1 = _mm_add_pd(iA1, _mm_mul_pd(DC2, _mm_shuffle_pd(B1,B1,3)));
- iA2 = _mm_add_pd(iA2, _mm_mul_pd(DC2, _mm_shuffle_pd(B2,B2,3)));
-
- // iD = D*|A| - C*A#*B
- dA = _mm_shuffle_pd(dA,dA,0);
- iD1 = _mm_sub_pd(_mm_mul_pd(D1, dA), iD1);
- iD2 = _mm_sub_pd(_mm_mul_pd(D2, dA), iD2);
-
- // iA = A*|D| - B*D#*C;
- dD = _mm_shuffle_pd(dD,dD,0);
- iA1 = _mm_sub_pd(_mm_mul_pd(A1, dD), iA1);
- iA2 = _mm_sub_pd(_mm_mul_pd(A2, dD), iA2);
-
- d1 = _mm_mul_sd(dA, dD);
- d2 = _mm_mul_sd(dB, dC);
-
- // iB = D * (A#B)# = D*B#*A
- iB1 = _mm_mul_pd(D1, _mm_shuffle_pd(AB2,AB1,1));
- iB2 = _mm_mul_pd(D2, _mm_shuffle_pd(AB2,AB1,1));
- iB1 = _mm_sub_pd(iB1, _mm_mul_pd(_mm_shuffle_pd(D1,D1,1), _mm_shuffle_pd(AB2,AB1,2)));
- iB2 = _mm_sub_pd(iB2, _mm_mul_pd(_mm_shuffle_pd(D2,D2,1), _mm_shuffle_pd(AB2,AB1,2)));
-
- // det = |A|*|D| + |B|*|C| - trace(A#*B*D#*C)
- det = _mm_add_sd(d1, d2);
- det = _mm_sub_sd(det, rd);
-
- // iC = A * (D#C)# = A*C#*D
- iC1 = _mm_mul_pd(A1, _mm_shuffle_pd(DC2,DC1,1));
- iC2 = _mm_mul_pd(A2, _mm_shuffle_pd(DC2,DC1,1));
- iC1 = _mm_sub_pd(iC1, _mm_mul_pd(_mm_shuffle_pd(A1,A1,1), _mm_shuffle_pd(DC2,DC1,2)));
- iC2 = _mm_sub_pd(iC2, _mm_mul_pd(_mm_shuffle_pd(A2,A2,1), _mm_shuffle_pd(DC2,DC1,2)));
-
- rd = _mm_div_sd(_mm_set_sd(1.0), det);
-// #ifdef ZERO_SINGULAR
-// rd = _mm_and_pd(_mm_cmpneq_sd(det,_mm_setzero_pd()), rd);
-// #endif
- rd = _mm_shuffle_pd(rd,rd,0);
-
- // iB = C*|B| - D*B#*A
- dB = _mm_shuffle_pd(dB,dB,0);
- iB1 = _mm_sub_pd(_mm_mul_pd(C1, dB), iB1);
- iB2 = _mm_sub_pd(_mm_mul_pd(C2, dB), iB2);
-
- d1 = _mm_xor_pd(rd, _Sign_PN);
- d2 = _mm_xor_pd(rd, _Sign_NP);
-
- // iC = B*|C| - A*C#*D;
- dC = _mm_shuffle_pd(dC,dC,0);
- iC1 = _mm_sub_pd(_mm_mul_pd(B1, dC), iC1);
- iC2 = _mm_sub_pd(_mm_mul_pd(B2, dC), iC2);
-
- Index res_stride = result.outerStride();
- double* res = result.data();
- pstoret<double, Packet2d, ResultAlignment>(res+0, _mm_mul_pd(_mm_shuffle_pd(iA2, iA1, 3), d1));
- pstoret<double, Packet2d, ResultAlignment>(res+res_stride, _mm_mul_pd(_mm_shuffle_pd(iA2, iA1, 0), d2));
- pstoret<double, Packet2d, ResultAlignment>(res+2, _mm_mul_pd(_mm_shuffle_pd(iB2, iB1, 3), d1));
- pstoret<double, Packet2d, ResultAlignment>(res+res_stride+2, _mm_mul_pd(_mm_shuffle_pd(iB2, iB1, 0), d2));
- pstoret<double, Packet2d, ResultAlignment>(res+2*res_stride, _mm_mul_pd(_mm_shuffle_pd(iC2, iC1, 3), d1));
- pstoret<double, Packet2d, ResultAlignment>(res+3*res_stride, _mm_mul_pd(_mm_shuffle_pd(iC2, iC1, 0), d2));
- pstoret<double, Packet2d, ResultAlignment>(res+2*res_stride+2,_mm_mul_pd(_mm_shuffle_pd(iD2, iD1, 3), d1));
- pstoret<double, Packet2d, ResultAlignment>(res+3*res_stride+2,_mm_mul_pd(_mm_shuffle_pd(iD2, iD1, 0), d2));
- }
-};
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_INVERSE_SSE_H