diff options
Diffstat (limited to 'Eigen/src/LU')
-rw-r--r-- | Eigen/src/LU/Determinant.h | 54 | ||||
-rw-r--r-- | Eigen/src/LU/FullPivLU.h | 72 | ||||
-rw-r--r-- | Eigen/src/LU/InverseImpl.h | 35 | ||||
-rw-r--r-- | Eigen/src/LU/PartialPivLU.h | 129 | ||||
-rw-r--r-- | Eigen/src/LU/arch/InverseSize4.h | 351 | ||||
-rw-r--r-- | Eigen/src/LU/arch/Inverse_SSE.h | 338 |
6 files changed, 512 insertions, 467 deletions
diff --git a/Eigen/src/LU/Determinant.h b/Eigen/src/LU/Determinant.h index d6a3c1e5a..3a41e6fcb 100644 --- a/Eigen/src/LU/Determinant.h +++ b/Eigen/src/LU/Determinant.h @@ -15,6 +15,7 @@ namespace Eigen { namespace internal { template<typename Derived> +EIGEN_DEVICE_FUNC inline const typename Derived::Scalar bruteforce_det3_helper (const MatrixBase<Derived>& matrix, int a, int b, int c) { @@ -22,14 +23,6 @@ inline const typename Derived::Scalar bruteforce_det3_helper * (matrix.coeff(1,b) * matrix.coeff(2,c) - matrix.coeff(1,c) * matrix.coeff(2,b)); } -template<typename Derived> -const typename Derived::Scalar bruteforce_det4_helper -(const MatrixBase<Derived>& matrix, int j, int k, int m, int n) -{ - return (matrix.coeff(j,0) * matrix.coeff(k,1) - matrix.coeff(k,0) * matrix.coeff(j,1)) - * (matrix.coeff(m,2) * matrix.coeff(n,3) - matrix.coeff(n,2) * matrix.coeff(m,3)); -} - template<typename Derived, int DeterminantType = Derived::RowsAtCompileTime > struct determinant_impl @@ -44,7 +37,8 @@ template<typename Derived, template<typename Derived> struct determinant_impl<Derived, 1> { - static inline typename traits<Derived>::Scalar run(const Derived& m) + static inline EIGEN_DEVICE_FUNC + typename traits<Derived>::Scalar run(const Derived& m) { return m.coeff(0,0); } @@ -52,7 +46,8 @@ template<typename Derived> struct determinant_impl<Derived, 1> template<typename Derived> struct determinant_impl<Derived, 2> { - static inline typename traits<Derived>::Scalar run(const Derived& m) + static inline EIGEN_DEVICE_FUNC + typename traits<Derived>::Scalar run(const Derived& m) { return m.coeff(0,0) * m.coeff(1,1) - m.coeff(1,0) * m.coeff(0,1); } @@ -60,7 +55,8 @@ template<typename Derived> struct determinant_impl<Derived, 2> template<typename Derived> struct determinant_impl<Derived, 3> { - static inline typename traits<Derived>::Scalar run(const Derived& m) + static inline EIGEN_DEVICE_FUNC + typename traits<Derived>::Scalar run(const Derived& m) { return bruteforce_det3_helper(m,0,1,2) - bruteforce_det3_helper(m,1,0,2) @@ -70,15 +66,34 @@ template<typename Derived> struct determinant_impl<Derived, 3> template<typename Derived> struct determinant_impl<Derived, 4> { - static typename traits<Derived>::Scalar run(const Derived& m) + typedef typename traits<Derived>::Scalar Scalar; + static EIGEN_DEVICE_FUNC + Scalar run(const Derived& m) + { + Scalar d2_01 = det2(m, 0, 1); + Scalar d2_02 = det2(m, 0, 2); + Scalar d2_03 = det2(m, 0, 3); + Scalar d2_12 = det2(m, 1, 2); + Scalar d2_13 = det2(m, 1, 3); + Scalar d2_23 = det2(m, 2, 3); + Scalar d3_0 = det3(m, 1,d2_23, 2,d2_13, 3,d2_12); + Scalar d3_1 = det3(m, 0,d2_23, 2,d2_03, 3,d2_02); + Scalar d3_2 = det3(m, 0,d2_13, 1,d2_03, 3,d2_01); + Scalar d3_3 = det3(m, 0,d2_12, 1,d2_02, 2,d2_01); + return internal::pmadd(-m(0,3),d3_0, m(1,3)*d3_1) + + internal::pmadd(-m(2,3),d3_2, m(3,3)*d3_3); + } +protected: + static EIGEN_DEVICE_FUNC + Scalar det2(const Derived& m, Index i0, Index i1) + { + return m(i0,0) * m(i1,1) - m(i1,0) * m(i0,1); + } + + static EIGEN_DEVICE_FUNC + Scalar det3(const Derived& m, Index i0, const Scalar& d0, Index i1, const Scalar& d1, Index i2, const Scalar& d2) { - // trick by Martin Costabel to compute 4x4 det with only 30 muls - return bruteforce_det4_helper(m,0,1,2,3) - - bruteforce_det4_helper(m,0,2,1,3) - + bruteforce_det4_helper(m,0,3,1,2) - + bruteforce_det4_helper(m,1,2,0,3) - - bruteforce_det4_helper(m,1,3,0,2) - + bruteforce_det4_helper(m,2,3,0,1); + return internal::pmadd(m(i0,2), d0, internal::pmadd(-m(i1,2), d1, m(i2,2)*d2)); } }; @@ -89,6 +104,7 @@ template<typename Derived> struct determinant_impl<Derived, 4> * \returns the determinant of this matrix */ template<typename Derived> +EIGEN_DEVICE_FUNC inline typename internal::traits<Derived>::Scalar MatrixBase<Derived>::determinant() const { eigen_assert(rows() == cols()); diff --git a/Eigen/src/LU/FullPivLU.h b/Eigen/src/LU/FullPivLU.h index 03b6af706..ba1749fa6 100644 --- a/Eigen/src/LU/FullPivLU.h +++ b/Eigen/src/LU/FullPivLU.h @@ -18,6 +18,7 @@ template<typename _MatrixType> struct traits<FullPivLU<_MatrixType> > { typedef MatrixXpr XprKind; typedef SolverStorage StorageKind; + typedef int StorageIndex; enum { Flags = 0 }; }; @@ -48,12 +49,12 @@ template<typename _MatrixType> struct traits<FullPivLU<_MatrixType> > * The data of the LU decomposition can be directly accessed through the methods matrixLU(), * permutationP(), permutationQ(). * - * As an exemple, here is how the original matrix can be retrieved: + * As an example, here is how the original matrix can be retrieved: * \include class_FullPivLU.cpp * Output: \verbinclude class_FullPivLU.out * * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism. - * + * * \sa MatrixBase::fullPivLu(), MatrixBase::determinant(), MatrixBase::inverse() */ template<typename _MatrixType> class FullPivLU @@ -62,9 +63,9 @@ template<typename _MatrixType> class FullPivLU public: typedef _MatrixType MatrixType; typedef SolverBase<FullPivLU> Base; + friend class SolverBase<FullPivLU>; EIGEN_GENERIC_PUBLIC_INTERFACE(FullPivLU) - // FIXME StorageIndex defined in EIGEN_GENERIC_PUBLIC_INTERFACE should be int enum { MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime @@ -218,6 +219,7 @@ template<typename _MatrixType> class FullPivLU return internal::image_retval<FullPivLU>(*this, originalMatrix); } + #ifdef EIGEN_PARSED_BY_DOXYGEN /** \return a solution x to the equation Ax=b, where A is the matrix of which * *this is the LU decomposition. * @@ -237,14 +239,10 @@ template<typename _MatrixType> class FullPivLU * * \sa TriangularView::solve(), kernel(), inverse() */ - // FIXME this is a copy-paste of the base-class member to add the isInitialized assertion. template<typename Rhs> inline const Solve<FullPivLU, Rhs> - solve(const MatrixBase<Rhs>& b) const - { - eigen_assert(m_isInitialized && "LU is not initialized."); - return Solve<FullPivLU, Rhs>(*this, b.derived()); - } + solve(const MatrixBase<Rhs>& b) const; + #endif /** \returns an estimate of the reciprocal condition number of the matrix of which \c *this is the LU decomposition. @@ -320,7 +318,7 @@ template<typename _MatrixType> class FullPivLU return m_usePrescribedThreshold ? m_prescribedThreshold // this formula comes from experimenting (see "LU precision tuning" thread on the list) // and turns out to be identical to Higham's formula used already in LDLt. - : NumTraits<Scalar>::epsilon() * m_lu.diagonalSize(); + : NumTraits<Scalar>::epsilon() * RealScalar(m_lu.diagonalSize()); } /** \returns the rank of the matrix of which *this is the LU decomposition. @@ -406,16 +404,16 @@ template<typename _MatrixType> class FullPivLU MatrixType reconstructedMatrix() const; - EIGEN_DEVICE_FUNC inline Index rows() const { return m_lu.rows(); } - EIGEN_DEVICE_FUNC inline Index cols() const { return m_lu.cols(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index rows() const EIGEN_NOEXCEPT { return m_lu.rows(); } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + inline Index cols() const EIGEN_NOEXCEPT { return m_lu.cols(); } #ifndef EIGEN_PARSED_BY_DOXYGEN template<typename RhsType, typename DstType> - EIGEN_DEVICE_FUNC void _solve_impl(const RhsType &rhs, DstType &dst) const; template<bool Conjugate, typename RhsType, typename DstType> - EIGEN_DEVICE_FUNC void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const; #endif @@ -531,8 +529,8 @@ void FullPivLU<MatrixType>::computeInPlace() m_nonzero_pivots = k; for(Index i = k; i < size; ++i) { - m_rowsTranspositions.coeffRef(i) = i; - m_colsTranspositions.coeffRef(i) = i; + m_rowsTranspositions.coeffRef(i) = internal::convert_index<StorageIndex>(i); + m_colsTranspositions.coeffRef(i) = internal::convert_index<StorageIndex>(i); } break; } @@ -543,8 +541,8 @@ void FullPivLU<MatrixType>::computeInPlace() // Now that we've found the pivot, we need to apply the row/col swaps to // bring it to the location (k,k). - m_rowsTranspositions.coeffRef(k) = row_of_biggest_in_corner; - m_colsTranspositions.coeffRef(k) = col_of_biggest_in_corner; + m_rowsTranspositions.coeffRef(k) = internal::convert_index<StorageIndex>(row_of_biggest_in_corner); + m_colsTranspositions.coeffRef(k) = internal::convert_index<StorageIndex>(col_of_biggest_in_corner); if(k != row_of_biggest_in_corner) { m_lu.row(k).swap(m_lu.row(row_of_biggest_in_corner)); ++number_of_transpositions; @@ -757,7 +755,6 @@ void FullPivLU<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const const Index rows = this->rows(), cols = this->cols(), nonzero_pivots = this->rank(); - eigen_assert(rhs.rows() == rows); const Index smalldim = (std::min)(rows, cols); if(nonzero_pivots == 0) @@ -807,7 +804,6 @@ void FullPivLU<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType const Index rows = this->rows(), cols = this->cols(), nonzero_pivots = this->rank(); - eigen_assert(rhs.rows() == cols); const Index smalldim = (std::min)(rows, cols); if(nonzero_pivots == 0) @@ -821,29 +817,19 @@ void FullPivLU<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType // Step 1 c = permutationQ().inverse() * rhs; - if (Conjugate) { - // Step 2 - m_lu.topLeftCorner(nonzero_pivots, nonzero_pivots) - .template triangularView<Upper>() - .adjoint() - .solveInPlace(c.topRows(nonzero_pivots)); - // Step 3 - m_lu.topLeftCorner(smalldim, smalldim) - .template triangularView<UnitLower>() - .adjoint() - .solveInPlace(c.topRows(smalldim)); - } else { - // Step 2 - m_lu.topLeftCorner(nonzero_pivots, nonzero_pivots) - .template triangularView<Upper>() - .transpose() - .solveInPlace(c.topRows(nonzero_pivots)); - // Step 3 - m_lu.topLeftCorner(smalldim, smalldim) - .template triangularView<UnitLower>() - .transpose() - .solveInPlace(c.topRows(smalldim)); - } + // Step 2 + m_lu.topLeftCorner(nonzero_pivots, nonzero_pivots) + .template triangularView<Upper>() + .transpose() + .template conjugateIf<Conjugate>() + .solveInPlace(c.topRows(nonzero_pivots)); + + // Step 3 + m_lu.topLeftCorner(smalldim, smalldim) + .template triangularView<UnitLower>() + .transpose() + .template conjugateIf<Conjugate>() + .solveInPlace(c.topRows(smalldim)); // Step 4 PermutationPType invp = permutationP().inverse().eval(); diff --git a/Eigen/src/LU/InverseImpl.h b/Eigen/src/LU/InverseImpl.h index 018f99b58..a40cefa9e 100644 --- a/Eigen/src/LU/InverseImpl.h +++ b/Eigen/src/LU/InverseImpl.h @@ -77,10 +77,11 @@ inline void compute_inverse_size2_helper( const MatrixType& matrix, const typename ResultType::Scalar& invdet, ResultType& result) { + typename ResultType::Scalar temp = matrix.coeff(0,0); result.coeffRef(0,0) = matrix.coeff(1,1) * invdet; result.coeffRef(1,0) = -matrix.coeff(1,0) * invdet; result.coeffRef(0,1) = -matrix.coeff(0,1) * invdet; - result.coeffRef(1,1) = matrix.coeff(0,0) * invdet; + result.coeffRef(1,1) = temp * invdet; } template<typename MatrixType, typename ResultType> @@ -143,13 +144,18 @@ inline void compute_inverse_size3_helper( const Matrix<typename ResultType::Scalar,3,1>& cofactors_col0, ResultType& result) { - result.row(0) = cofactors_col0 * invdet; - result.coeffRef(1,0) = cofactor_3x3<MatrixType,0,1>(matrix) * invdet; - result.coeffRef(1,1) = cofactor_3x3<MatrixType,1,1>(matrix) * invdet; + // Compute cofactors in a way that avoids aliasing issues. + typedef typename ResultType::Scalar Scalar; + const Scalar c01 = cofactor_3x3<MatrixType,0,1>(matrix) * invdet; + const Scalar c11 = cofactor_3x3<MatrixType,1,1>(matrix) * invdet; + const Scalar c02 = cofactor_3x3<MatrixType,0,2>(matrix) * invdet; result.coeffRef(1,2) = cofactor_3x3<MatrixType,2,1>(matrix) * invdet; - result.coeffRef(2,0) = cofactor_3x3<MatrixType,0,2>(matrix) * invdet; result.coeffRef(2,1) = cofactor_3x3<MatrixType,1,2>(matrix) * invdet; result.coeffRef(2,2) = cofactor_3x3<MatrixType,2,2>(matrix) * invdet; + result.coeffRef(1,0) = c01; + result.coeffRef(1,1) = c11; + result.coeffRef(2,0) = c02; + result.row(0) = cofactors_col0 * invdet; } template<typename MatrixType, typename ResultType> @@ -181,14 +187,13 @@ struct compute_inverse_and_det_with_check<MatrixType, ResultType, 3> bool& invertible ) { - using std::abs; typedef typename ResultType::Scalar Scalar; Matrix<Scalar,3,1> cofactors_col0; cofactors_col0.coeffRef(0) = cofactor_3x3<MatrixType,0,0>(matrix); cofactors_col0.coeffRef(1) = cofactor_3x3<MatrixType,1,0>(matrix); cofactors_col0.coeffRef(2) = cofactor_3x3<MatrixType,2,0>(matrix); determinant = (cofactors_col0.cwiseProduct(matrix.col(0))).sum(); - invertible = abs(determinant) > absDeterminantThreshold; + invertible = Eigen::numext::abs(determinant) > absDeterminantThreshold; if(!invertible) return; const Scalar invdet = Scalar(1) / determinant; compute_inverse_size3_helper(matrix, invdet, cofactors_col0, inverse); @@ -273,7 +278,13 @@ struct compute_inverse_and_det_with_check<MatrixType, ResultType, 4> using std::abs; determinant = matrix.determinant(); invertible = abs(determinant) > absDeterminantThreshold; - if(invertible) compute_inverse<MatrixType, ResultType>::run(matrix, inverse); + if(invertible && extract_data(matrix) != extract_data(inverse)) { + compute_inverse<MatrixType, ResultType>::run(matrix, inverse); + } + else if(invertible) { + MatrixType matrix_t = matrix; + compute_inverse<MatrixType, ResultType>::run(matrix_t, inverse); + } } }; @@ -290,6 +301,7 @@ template<typename DstXprType, typename XprType> struct Assignment<DstXprType, Inverse<XprType>, internal::assign_op<typename DstXprType::Scalar,typename XprType::Scalar>, Dense2Dense> { typedef Inverse<XprType> SrcXprType; + EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename XprType::Scalar> &) { Index dstRows = src.rows(); @@ -332,6 +344,7 @@ struct Assignment<DstXprType, Inverse<XprType>, internal::assign_op<typename Dst * \sa computeInverseAndDetWithCheck() */ template<typename Derived> +EIGEN_DEVICE_FUNC inline const Inverse<Derived> MatrixBase<Derived>::inverse() const { EIGEN_STATIC_ASSERT(!NumTraits<Scalar>::IsInteger,THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES) @@ -345,6 +358,8 @@ inline const Inverse<Derived> MatrixBase<Derived>::inverse() const * * This is only for fixed-size square matrices of size up to 4x4. * + * Notice that it will trigger a copy of input matrix when trying to do the inverse in place. + * * \param inverse Reference to the matrix in which to store the inverse. * \param determinant Reference to the variable in which to store the determinant. * \param invertible Reference to the bool variable in which to store whether the matrix is invertible. @@ -385,6 +400,8 @@ inline void MatrixBase<Derived>::computeInverseAndDetWithCheck( * * This is only for fixed-size square matrices of size up to 4x4. * + * Notice that it will trigger a copy of input matrix when trying to do the inverse in place. + * * \param inverse Reference to the matrix in which to store the inverse. * \param invertible Reference to the bool variable in which to store whether the matrix is invertible. * \param absDeterminantThreshold Optional parameter controlling the invertibility check. @@ -404,7 +421,7 @@ inline void MatrixBase<Derived>::computeInverseWithCheck( const RealScalar& absDeterminantThreshold ) const { - RealScalar determinant; + Scalar determinant; // i'd love to put some static assertions there, but SFINAE means that they have no effect... eigen_assert(rows() == cols()); computeInverseAndDetWithCheck(inverse,determinant,invertible,absDeterminantThreshold); diff --git a/Eigen/src/LU/PartialPivLU.h b/Eigen/src/LU/PartialPivLU.h index d43961887..34aed7249 100644 --- a/Eigen/src/LU/PartialPivLU.h +++ b/Eigen/src/LU/PartialPivLU.h @@ -19,6 +19,7 @@ template<typename _MatrixType> struct traits<PartialPivLU<_MatrixType> > { typedef MatrixXpr XprKind; typedef SolverStorage StorageKind; + typedef int StorageIndex; typedef traits<_MatrixType> BaseTraits; enum { Flags = BaseTraits::Flags & RowMajorBit, @@ -69,7 +70,7 @@ struct enable_if_ref<Ref<T>,Derived> { * The data of the LU decomposition can be directly accessed through the methods matrixLU(), permutationP(). * * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism. - * + * * \sa MatrixBase::partialPivLu(), MatrixBase::determinant(), MatrixBase::inverse(), MatrixBase::computeInverse(), class FullPivLU */ template<typename _MatrixType> class PartialPivLU @@ -79,8 +80,9 @@ template<typename _MatrixType> class PartialPivLU typedef _MatrixType MatrixType; typedef SolverBase<PartialPivLU> Base; + friend class SolverBase<PartialPivLU>; + EIGEN_GENERIC_PUBLIC_INTERFACE(PartialPivLU) - // FIXME StorageIndex defined in EIGEN_GENERIC_PUBLIC_INTERFACE should be int enum { MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime @@ -152,6 +154,7 @@ template<typename _MatrixType> class PartialPivLU return m_p; } + #ifdef EIGEN_PARSED_BY_DOXYGEN /** This method returns the solution x to the equation Ax=b, where A is the matrix of which * *this is the LU decomposition. * @@ -169,14 +172,10 @@ template<typename _MatrixType> class PartialPivLU * * \sa TriangularView::solve(), inverse(), computeInverse() */ - // FIXME this is a copy-paste of the base-class member to add the isInitialized assertion. template<typename Rhs> inline const Solve<PartialPivLU, Rhs> - solve(const MatrixBase<Rhs>& b) const - { - eigen_assert(m_isInitialized && "PartialPivLU is not initialized."); - return Solve<PartialPivLU, Rhs>(*this, b.derived()); - } + solve(const MatrixBase<Rhs>& b) const; + #endif /** \returns an estimate of the reciprocal condition number of the matrix of which \c *this is the LU decomposition. @@ -217,8 +216,8 @@ template<typename _MatrixType> class PartialPivLU MatrixType reconstructedMatrix() const; - inline Index rows() const { return m_lu.rows(); } - inline Index cols() const { return m_lu.cols(); } + EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return m_lu.rows(); } + EIGEN_CONSTEXPR inline Index cols() const EIGEN_NOEXCEPT { return m_lu.cols(); } #ifndef EIGEN_PARSED_BY_DOXYGEN template<typename RhsType, typename DstType> @@ -231,8 +230,6 @@ template<typename _MatrixType> class PartialPivLU * Step 3: replace c by the solution x to Ux = c. */ - eigen_assert(rhs.rows() == m_lu.rows()); - // Step 1 dst = permutationP() * rhs; @@ -246,26 +243,21 @@ template<typename _MatrixType> class PartialPivLU template<bool Conjugate, typename RhsType, typename DstType> EIGEN_DEVICE_FUNC void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const { - /* The decomposition PA = LU can be rewritten as A = P^{-1} L U. + /* The decomposition PA = LU can be rewritten as A^T = U^T L^T P. * So we proceed as follows: - * Step 1: compute c = Pb. - * Step 2: replace c by the solution x to Lx = c. - * Step 3: replace c by the solution x to Ux = c. + * Step 1: compute c as the solution to L^T c = b + * Step 2: replace c by the solution x to U^T x = c. + * Step 3: update c = P^-1 c. */ eigen_assert(rhs.rows() == m_lu.cols()); - if (Conjugate) { - // Step 1 - dst = m_lu.template triangularView<Upper>().adjoint().solve(rhs); - // Step 2 - m_lu.template triangularView<UnitLower>().adjoint().solveInPlace(dst); - } else { - // Step 1 - dst = m_lu.template triangularView<Upper>().transpose().solve(rhs); - // Step 2 - m_lu.template triangularView<UnitLower>().transpose().solveInPlace(dst); - } + // Step 1 + dst = m_lu.template triangularView<Upper>().transpose() + .template conjugateIf<Conjugate>().solve(rhs); + // Step 2 + m_lu.template triangularView<UnitLower>().transpose() + .template conjugateIf<Conjugate>().solveInPlace(dst); // Step 3 dst = permutationP().transpose() * dst; } @@ -339,17 +331,18 @@ PartialPivLU<MatrixType>::PartialPivLU(EigenBase<InputType>& matrix) namespace internal { /** \internal This is the blocked version of fullpivlu_unblocked() */ -template<typename Scalar, int StorageOrder, typename PivIndex> +template<typename Scalar, int StorageOrder, typename PivIndex, int SizeAtCompileTime=Dynamic> struct partial_lu_impl { - // FIXME add a stride to Map, so that the following mapping becomes easier, - // another option would be to create an expression being able to automatically - // warp any Map, Matrix, and Block expressions as a unique type, but since that's exactly - // a Map + stride, why not adding a stride to Map, and convenient ctors from a Matrix, - // and Block. - typedef Map<Matrix<Scalar, Dynamic, Dynamic, StorageOrder> > MapLU; - typedef Block<MapLU, Dynamic, Dynamic> MatrixType; - typedef Block<MatrixType,Dynamic,Dynamic> BlockType; + static const int UnBlockedBound = 16; + static const bool UnBlockedAtCompileTime = SizeAtCompileTime!=Dynamic && SizeAtCompileTime<=UnBlockedBound; + static const int ActualSizeAtCompileTime = UnBlockedAtCompileTime ? SizeAtCompileTime : Dynamic; + // Remaining rows and columns at compile-time: + static const int RRows = SizeAtCompileTime==2 ? 1 : Dynamic; + static const int RCols = SizeAtCompileTime==2 ? 1 : Dynamic; + typedef Matrix<Scalar, ActualSizeAtCompileTime, ActualSizeAtCompileTime, StorageOrder> MatrixType; + typedef Ref<MatrixType> MatrixTypeRef; + typedef Ref<Matrix<Scalar, Dynamic, Dynamic, StorageOrder> > BlockType; typedef typename MatrixType::RealScalar RealScalar; /** \internal performs the LU decomposition in-place of the matrix \a lu @@ -362,19 +355,22 @@ struct partial_lu_impl * * \returns The index of the first pivot which is exactly zero if any, or a negative number otherwise. */ - static Index unblocked_lu(MatrixType& lu, PivIndex* row_transpositions, PivIndex& nb_transpositions) + static Index unblocked_lu(MatrixTypeRef& lu, PivIndex* row_transpositions, PivIndex& nb_transpositions) { typedef scalar_score_coeff_op<Scalar> Scoring; typedef typename Scoring::result_type Score; const Index rows = lu.rows(); const Index cols = lu.cols(); const Index size = (std::min)(rows,cols); + // For small compile-time matrices it is worth processing the last row separately: + // speedup: +100% for 2x2, +10% for others. + const Index endk = UnBlockedAtCompileTime ? size-1 : size; nb_transpositions = 0; Index first_zero_pivot = -1; - for(Index k = 0; k < size; ++k) + for(Index k = 0; k < endk; ++k) { - Index rrows = rows-k-1; - Index rcols = cols-k-1; + int rrows = internal::convert_index<int>(rows-k-1); + int rcols = internal::convert_index<int>(cols-k-1); Index row_of_biggest_in_col; Score biggest_in_corner @@ -391,9 +387,7 @@ struct partial_lu_impl ++nb_transpositions; } - // FIXME shall we introduce a safe quotient expression in cas 1/lu.coeff(k,k) - // overflow but not the actual quotient? - lu.col(k).tail(rrows) /= lu.coeff(k,k); + lu.col(k).tail(fix<RRows>(rrows)) /= lu.coeff(k,k); } else if(first_zero_pivot==-1) { @@ -403,8 +397,18 @@ struct partial_lu_impl } if(k<rows-1) - lu.bottomRightCorner(rrows,rcols).noalias() -= lu.col(k).tail(rrows) * lu.row(k).tail(rcols); + lu.bottomRightCorner(fix<RRows>(rrows),fix<RCols>(rcols)).noalias() -= lu.col(k).tail(fix<RRows>(rrows)) * lu.row(k).tail(fix<RCols>(rcols)); + } + + // special handling of the last entry + if(UnBlockedAtCompileTime) + { + Index k = endk; + row_transpositions[k] = PivIndex(k); + if (Scoring()(lu(k, k)) == Score(0) && first_zero_pivot == -1) + first_zero_pivot = k; } + return first_zero_pivot; } @@ -420,18 +424,17 @@ struct partial_lu_impl * \returns The index of the first pivot which is exactly zero if any, or a negative number otherwise. * * \note This very low level interface using pointers, etc. is to: - * 1 - reduce the number of instanciations to the strict minimum - * 2 - avoid infinite recursion of the instanciations with Block<Block<Block<...> > > + * 1 - reduce the number of instantiations to the strict minimum + * 2 - avoid infinite recursion of the instantiations with Block<Block<Block<...> > > */ static Index blocked_lu(Index rows, Index cols, Scalar* lu_data, Index luStride, PivIndex* row_transpositions, PivIndex& nb_transpositions, Index maxBlockSize=256) { - MapLU lu1(lu_data,StorageOrder==RowMajor?rows:luStride,StorageOrder==RowMajor?luStride:cols); - MatrixType lu(lu1,0,0,rows,cols); + MatrixTypeRef lu = MatrixType::Map(lu_data,rows, cols, OuterStride<>(luStride)); const Index size = (std::min)(rows,cols); // if the matrix is too small, no blocking: - if(size<=16) + if(UnBlockedAtCompileTime || size<=UnBlockedBound) { return unblocked_lu(lu, row_transpositions, nb_transpositions); } @@ -457,12 +460,12 @@ struct partial_lu_impl // A00 | A01 | A02 // lu = A_0 | A_1 | A_2 = A10 | A11 | A12 // A20 | A21 | A22 - BlockType A_0(lu,0,0,rows,k); - BlockType A_2(lu,0,k+bs,rows,tsize); - BlockType A11(lu,k,k,bs,bs); - BlockType A12(lu,k,k+bs,bs,tsize); - BlockType A21(lu,k+bs,k,trows,bs); - BlockType A22(lu,k+bs,k+bs,trows,tsize); + BlockType A_0 = lu.block(0,0,rows,k); + BlockType A_2 = lu.block(0,k+bs,rows,tsize); + BlockType A11 = lu.block(k,k,bs,bs); + BlockType A12 = lu.block(k,k+bs,bs,tsize); + BlockType A21 = lu.block(k+bs,k,trows,bs); + BlockType A22 = lu.block(k+bs,k+bs,trows,tsize); PivIndex nb_transpositions_in_panel; // recursively call the blocked LU algorithm on [A11^T A21^T]^T @@ -501,11 +504,18 @@ struct partial_lu_impl template<typename MatrixType, typename TranspositionType> void partial_lu_inplace(MatrixType& lu, TranspositionType& row_transpositions, typename TranspositionType::StorageIndex& nb_transpositions) { + // Special-case of zero matrix. + if (lu.rows() == 0 || lu.cols() == 0) { + nb_transpositions = 0; + return; + } eigen_assert(lu.cols() == row_transpositions.size()); - eigen_assert((&row_transpositions.coeffRef(1)-&row_transpositions.coeffRef(0)) == 1); + eigen_assert(row_transpositions.size() < 2 || (&row_transpositions.coeffRef(1)-&row_transpositions.coeffRef(0)) == 1); partial_lu_impl - <typename MatrixType::Scalar, MatrixType::Flags&RowMajorBit?RowMajor:ColMajor, typename TranspositionType::StorageIndex> + < typename MatrixType::Scalar, MatrixType::Flags&RowMajorBit?RowMajor:ColMajor, + typename TranspositionType::StorageIndex, + EIGEN_SIZE_MIN_PREFER_FIXED(MatrixType::RowsAtCompileTime,MatrixType::ColsAtCompileTime)> ::blocked_lu(lu.rows(), lu.cols(), &lu.coeffRef(0,0), lu.outerStride(), &row_transpositions.coeffRef(0), nb_transpositions); } @@ -519,7 +529,10 @@ void PartialPivLU<MatrixType>::compute() // the row permutation is stored as int indices, so just to be sure: eigen_assert(m_lu.rows()<NumTraits<int>::highest()); - m_l1_norm = m_lu.cwiseAbs().colwise().sum().maxCoeff(); + if(m_lu.cols()>0) + m_l1_norm = m_lu.cwiseAbs().colwise().sum().maxCoeff(); + else + m_l1_norm = RealScalar(0); eigen_assert(m_lu.rows() == m_lu.cols() && "PartialPivLU is only for square (and moreover invertible) matrices"); const Index size = m_lu.rows(); diff --git a/Eigen/src/LU/arch/InverseSize4.h b/Eigen/src/LU/arch/InverseSize4.h new file mode 100644 index 000000000..a232ffc0a --- /dev/null +++ b/Eigen/src/LU/arch/InverseSize4.h @@ -0,0 +1,351 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2001 Intel Corporation +// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr> +// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +// +// The algorithm below is a reimplementation of former \src\LU\Inverse_SSE.h using PacketMath. +// inv(M) = M#/|M|, where inv(M), M# and |M| denote the inverse of M, +// adjugate of M and determinant of M respectively. M# is computed block-wise +// using specific formulae. For proof, see: +// https://lxjk.github.io/2017/09/03/Fast-4x4-Matrix-Inverse-with-SSE-SIMD-Explained.html +// Variable names are adopted from \src\LU\Inverse_SSE.h. +// +// The SSE code for the 4x4 float and double matrix inverse in former (deprecated) \src\LU\Inverse_SSE.h +// comes from the following Intel's library: +// http://software.intel.com/en-us/articles/optimized-matrix-library-for-use-with-the-intel-pentiumr-4-processors-sse2-instructions/ +// +// Here is the respective copyright and license statement: +// +// Copyright (c) 2001 Intel Corporation. +// +// Permition is granted to use, copy, distribute and prepare derivative works +// of this library for any purpose and without fee, provided, that the above +// copyright notice and this statement appear in all copies. +// Intel makes no representations about the suitability of this software for +// any purpose, and specifically disclaims all warranties. +// See LEGAL.TXT for all the legal information. +// +// TODO: Unify implementations of different data types (i.e. float and double). +#ifndef EIGEN_INVERSE_SIZE_4_H +#define EIGEN_INVERSE_SIZE_4_H + +namespace Eigen +{ +namespace internal +{ +template <typename MatrixType, typename ResultType> +struct compute_inverse_size4<Architecture::Target, float, MatrixType, ResultType> +{ + enum + { + MatrixAlignment = traits<MatrixType>::Alignment, + ResultAlignment = traits<ResultType>::Alignment, + StorageOrdersMatch = (MatrixType::Flags & RowMajorBit) == (ResultType::Flags & RowMajorBit) + }; + typedef typename conditional<(MatrixType::Flags & LinearAccessBit), MatrixType const &, typename MatrixType::PlainObject>::type ActualMatrixType; + + static void run(const MatrixType &mat, ResultType &result) + { + ActualMatrixType matrix(mat); + + const float* data = matrix.data(); + const Index stride = matrix.innerStride(); + Packet4f _L1 = ploadt<Packet4f,MatrixAlignment>(data); + Packet4f _L2 = ploadt<Packet4f,MatrixAlignment>(data + stride*4); + Packet4f _L3 = ploadt<Packet4f,MatrixAlignment>(data + stride*8); + Packet4f _L4 = ploadt<Packet4f,MatrixAlignment>(data + stride*12); + + // Four 2x2 sub-matrices of the input matrix + // input = [[A, B], + // [C, D]] + Packet4f A, B, C, D; + + if (!StorageOrdersMatch) + { + A = vec4f_unpacklo(_L1, _L2); + B = vec4f_unpacklo(_L3, _L4); + C = vec4f_unpackhi(_L1, _L2); + D = vec4f_unpackhi(_L3, _L4); + } + else + { + A = vec4f_movelh(_L1, _L2); + B = vec4f_movehl(_L2, _L1); + C = vec4f_movelh(_L3, _L4); + D = vec4f_movehl(_L4, _L3); + } + + Packet4f AB, DC; + + // AB = A# * B, where A# denotes the adjugate of A, and * denotes matrix product. + AB = pmul(vec4f_swizzle2(A, A, 3, 3, 0, 0), B); + AB = psub(AB, pmul(vec4f_swizzle2(A, A, 1, 1, 2, 2), vec4f_swizzle2(B, B, 2, 3, 0, 1))); + + // DC = D#*C + DC = pmul(vec4f_swizzle2(D, D, 3, 3, 0, 0), C); + DC = psub(DC, pmul(vec4f_swizzle2(D, D, 1, 1, 2, 2), vec4f_swizzle2(C, C, 2, 3, 0, 1))); + + // determinants of the sub-matrices + Packet4f dA, dB, dC, dD; + + dA = pmul(vec4f_swizzle2(A, A, 3, 3, 1, 1), A); + dA = psub(dA, vec4f_movehl(dA, dA)); + + dB = pmul(vec4f_swizzle2(B, B, 3, 3, 1, 1), B); + dB = psub(dB, vec4f_movehl(dB, dB)); + + dC = pmul(vec4f_swizzle2(C, C, 3, 3, 1, 1), C); + dC = psub(dC, vec4f_movehl(dC, dC)); + + dD = pmul(vec4f_swizzle2(D, D, 3, 3, 1, 1), D); + dD = psub(dD, vec4f_movehl(dD, dD)); + + Packet4f d, d1, d2; + + d = pmul(vec4f_swizzle2(DC, DC, 0, 2, 1, 3), AB); + d = padd(d, vec4f_movehl(d, d)); + d = padd(d, vec4f_swizzle2(d, d, 1, 0, 0, 0)); + d1 = pmul(dA, dD); + d2 = pmul(dB, dC); + + // determinant of the input matrix, det = |A||D| + |B||C| - trace(A#*B*D#*C) + Packet4f det = vec4f_duplane(psub(padd(d1, d2), d), 0); + + // reciprocal of the determinant of the input matrix, rd = 1/det + Packet4f rd = pdiv(pset1<Packet4f>(1.0f), det); + + // Four sub-matrices of the inverse + Packet4f iA, iB, iC, iD; + + // iD = D*|A| - C*A#*B + iD = pmul(vec4f_swizzle2(C, C, 0, 0, 2, 2), vec4f_movelh(AB, AB)); + iD = padd(iD, pmul(vec4f_swizzle2(C, C, 1, 1, 3, 3), vec4f_movehl(AB, AB))); + iD = psub(pmul(D, vec4f_duplane(dA, 0)), iD); + + // iA = A*|D| - B*D#*C + iA = pmul(vec4f_swizzle2(B, B, 0, 0, 2, 2), vec4f_movelh(DC, DC)); + iA = padd(iA, pmul(vec4f_swizzle2(B, B, 1, 1, 3, 3), vec4f_movehl(DC, DC))); + iA = psub(pmul(A, vec4f_duplane(dD, 0)), iA); + + // iB = C*|B| - D * (A#B)# = C*|B| - D*B#*A + iB = pmul(D, vec4f_swizzle2(AB, AB, 3, 0, 3, 0)); + iB = psub(iB, pmul(vec4f_swizzle2(D, D, 1, 0, 3, 2), vec4f_swizzle2(AB, AB, 2, 1, 2, 1))); + iB = psub(pmul(C, vec4f_duplane(dB, 0)), iB); + + // iC = B*|C| - A * (D#C)# = B*|C| - A*C#*D + iC = pmul(A, vec4f_swizzle2(DC, DC, 3, 0, 3, 0)); + iC = psub(iC, pmul(vec4f_swizzle2(A, A, 1, 0, 3, 2), vec4f_swizzle2(DC, DC, 2, 1, 2, 1))); + iC = psub(pmul(B, vec4f_duplane(dC, 0)), iC); + + const float sign_mask[4] = {0.0f, numext::bit_cast<float>(0x80000000u), numext::bit_cast<float>(0x80000000u), 0.0f}; + const Packet4f p4f_sign_PNNP = ploadu<Packet4f>(sign_mask); + rd = pxor(rd, p4f_sign_PNNP); + iA = pmul(iA, rd); + iB = pmul(iB, rd); + iC = pmul(iC, rd); + iD = pmul(iD, rd); + + Index res_stride = result.outerStride(); + float *res = result.data(); + + pstoret<float, Packet4f, ResultAlignment>(res + 0, vec4f_swizzle2(iA, iB, 3, 1, 3, 1)); + pstoret<float, Packet4f, ResultAlignment>(res + res_stride, vec4f_swizzle2(iA, iB, 2, 0, 2, 0)); + pstoret<float, Packet4f, ResultAlignment>(res + 2 * res_stride, vec4f_swizzle2(iC, iD, 3, 1, 3, 1)); + pstoret<float, Packet4f, ResultAlignment>(res + 3 * res_stride, vec4f_swizzle2(iC, iD, 2, 0, 2, 0)); + } +}; + +#if !(defined EIGEN_VECTORIZE_NEON && !(EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG)) +// same algorithm as above, except that each operand is split into +// halves for two registers to hold. +template <typename MatrixType, typename ResultType> +struct compute_inverse_size4<Architecture::Target, double, MatrixType, ResultType> +{ + enum + { + MatrixAlignment = traits<MatrixType>::Alignment, + ResultAlignment = traits<ResultType>::Alignment, + StorageOrdersMatch = (MatrixType::Flags & RowMajorBit) == (ResultType::Flags & RowMajorBit) + }; + typedef typename conditional<(MatrixType::Flags & LinearAccessBit), + MatrixType const &, + typename MatrixType::PlainObject>::type + ActualMatrixType; + + static void run(const MatrixType &mat, ResultType &result) + { + ActualMatrixType matrix(mat); + + // Four 2x2 sub-matrices of the input matrix, each is further divided into upper and lower + // row e.g. A1, upper row of A, A2, lower row of A + // input = [[A, B], = [[[A1, [B1, + // [C, D]] A2], B2]], + // [[C1, [D1, + // C2], D2]]] + + Packet2d A1, A2, B1, B2, C1, C2, D1, D2; + + const double* data = matrix.data(); + const Index stride = matrix.innerStride(); + if (StorageOrdersMatch) + { + A1 = ploadt<Packet2d,MatrixAlignment>(data + stride*0); + B1 = ploadt<Packet2d,MatrixAlignment>(data + stride*2); + A2 = ploadt<Packet2d,MatrixAlignment>(data + stride*4); + B2 = ploadt<Packet2d,MatrixAlignment>(data + stride*6); + C1 = ploadt<Packet2d,MatrixAlignment>(data + stride*8); + D1 = ploadt<Packet2d,MatrixAlignment>(data + stride*10); + C2 = ploadt<Packet2d,MatrixAlignment>(data + stride*12); + D2 = ploadt<Packet2d,MatrixAlignment>(data + stride*14); + } + else + { + Packet2d temp; + A1 = ploadt<Packet2d,MatrixAlignment>(data + stride*0); + C1 = ploadt<Packet2d,MatrixAlignment>(data + stride*2); + A2 = ploadt<Packet2d,MatrixAlignment>(data + stride*4); + C2 = ploadt<Packet2d,MatrixAlignment>(data + stride*6); + temp = A1; + A1 = vec2d_unpacklo(A1, A2); + A2 = vec2d_unpackhi(temp, A2); + + temp = C1; + C1 = vec2d_unpacklo(C1, C2); + C2 = vec2d_unpackhi(temp, C2); + + B1 = ploadt<Packet2d,MatrixAlignment>(data + stride*8); + D1 = ploadt<Packet2d,MatrixAlignment>(data + stride*10); + B2 = ploadt<Packet2d,MatrixAlignment>(data + stride*12); + D2 = ploadt<Packet2d,MatrixAlignment>(data + stride*14); + + temp = B1; + B1 = vec2d_unpacklo(B1, B2); + B2 = vec2d_unpackhi(temp, B2); + + temp = D1; + D1 = vec2d_unpacklo(D1, D2); + D2 = vec2d_unpackhi(temp, D2); + } + + // determinants of the sub-matrices + Packet2d dA, dB, dC, dD; + + dA = vec2d_swizzle2(A2, A2, 1); + dA = pmul(A1, dA); + dA = psub(dA, vec2d_duplane(dA, 1)); + + dB = vec2d_swizzle2(B2, B2, 1); + dB = pmul(B1, dB); + dB = psub(dB, vec2d_duplane(dB, 1)); + + dC = vec2d_swizzle2(C2, C2, 1); + dC = pmul(C1, dC); + dC = psub(dC, vec2d_duplane(dC, 1)); + + dD = vec2d_swizzle2(D2, D2, 1); + dD = pmul(D1, dD); + dD = psub(dD, vec2d_duplane(dD, 1)); + + Packet2d DC1, DC2, AB1, AB2; + + // AB = A# * B, where A# denotes the adjugate of A, and * denotes matrix product. + AB1 = pmul(B1, vec2d_duplane(A2, 1)); + AB2 = pmul(B2, vec2d_duplane(A1, 0)); + AB1 = psub(AB1, pmul(B2, vec2d_duplane(A1, 1))); + AB2 = psub(AB2, pmul(B1, vec2d_duplane(A2, 0))); + + // DC = D#*C + DC1 = pmul(C1, vec2d_duplane(D2, 1)); + DC2 = pmul(C2, vec2d_duplane(D1, 0)); + DC1 = psub(DC1, pmul(C2, vec2d_duplane(D1, 1))); + DC2 = psub(DC2, pmul(C1, vec2d_duplane(D2, 0))); + + Packet2d d1, d2; + + // determinant of the input matrix, det = |A||D| + |B||C| - trace(A#*B*D#*C) + Packet2d det; + + // reciprocal of the determinant of the input matrix, rd = 1/det + Packet2d rd; + + d1 = pmul(AB1, vec2d_swizzle2(DC1, DC2, 0)); + d2 = pmul(AB2, vec2d_swizzle2(DC1, DC2, 3)); + rd = padd(d1, d2); + rd = padd(rd, vec2d_duplane(rd, 1)); + + d1 = pmul(dA, dD); + d2 = pmul(dB, dC); + + det = padd(d1, d2); + det = psub(det, rd); + det = vec2d_duplane(det, 0); + rd = pdiv(pset1<Packet2d>(1.0), det); + + // rows of four sub-matrices of the inverse + Packet2d iA1, iA2, iB1, iB2, iC1, iC2, iD1, iD2; + + // iD = D*|A| - C*A#*B + iD1 = pmul(AB1, vec2d_duplane(C1, 0)); + iD2 = pmul(AB1, vec2d_duplane(C2, 0)); + iD1 = padd(iD1, pmul(AB2, vec2d_duplane(C1, 1))); + iD2 = padd(iD2, pmul(AB2, vec2d_duplane(C2, 1))); + dA = vec2d_duplane(dA, 0); + iD1 = psub(pmul(D1, dA), iD1); + iD2 = psub(pmul(D2, dA), iD2); + + // iA = A*|D| - B*D#*C + iA1 = pmul(DC1, vec2d_duplane(B1, 0)); + iA2 = pmul(DC1, vec2d_duplane(B2, 0)); + iA1 = padd(iA1, pmul(DC2, vec2d_duplane(B1, 1))); + iA2 = padd(iA2, pmul(DC2, vec2d_duplane(B2, 1))); + dD = vec2d_duplane(dD, 0); + iA1 = psub(pmul(A1, dD), iA1); + iA2 = psub(pmul(A2, dD), iA2); + + // iB = C*|B| - D * (A#B)# = C*|B| - D*B#*A + iB1 = pmul(D1, vec2d_swizzle2(AB2, AB1, 1)); + iB2 = pmul(D2, vec2d_swizzle2(AB2, AB1, 1)); + iB1 = psub(iB1, pmul(vec2d_swizzle2(D1, D1, 1), vec2d_swizzle2(AB2, AB1, 2))); + iB2 = psub(iB2, pmul(vec2d_swizzle2(D2, D2, 1), vec2d_swizzle2(AB2, AB1, 2))); + dB = vec2d_duplane(dB, 0); + iB1 = psub(pmul(C1, dB), iB1); + iB2 = psub(pmul(C2, dB), iB2); + + // iC = B*|C| - A * (D#C)# = B*|C| - A*C#*D + iC1 = pmul(A1, vec2d_swizzle2(DC2, DC1, 1)); + iC2 = pmul(A2, vec2d_swizzle2(DC2, DC1, 1)); + iC1 = psub(iC1, pmul(vec2d_swizzle2(A1, A1, 1), vec2d_swizzle2(DC2, DC1, 2))); + iC2 = psub(iC2, pmul(vec2d_swizzle2(A2, A2, 1), vec2d_swizzle2(DC2, DC1, 2))); + dC = vec2d_duplane(dC, 0); + iC1 = psub(pmul(B1, dC), iC1); + iC2 = psub(pmul(B2, dC), iC2); + + const double sign_mask1[2] = {0.0, numext::bit_cast<double>(0x8000000000000000ull)}; + const double sign_mask2[2] = {numext::bit_cast<double>(0x8000000000000000ull), 0.0}; + const Packet2d sign_PN = ploadu<Packet2d>(sign_mask1); + const Packet2d sign_NP = ploadu<Packet2d>(sign_mask2); + d1 = pxor(rd, sign_PN); + d2 = pxor(rd, sign_NP); + + Index res_stride = result.outerStride(); + double *res = result.data(); + pstoret<double, Packet2d, ResultAlignment>(res + 0, pmul(vec2d_swizzle2(iA2, iA1, 3), d1)); + pstoret<double, Packet2d, ResultAlignment>(res + res_stride, pmul(vec2d_swizzle2(iA2, iA1, 0), d2)); + pstoret<double, Packet2d, ResultAlignment>(res + 2, pmul(vec2d_swizzle2(iB2, iB1, 3), d1)); + pstoret<double, Packet2d, ResultAlignment>(res + res_stride + 2, pmul(vec2d_swizzle2(iB2, iB1, 0), d2)); + pstoret<double, Packet2d, ResultAlignment>(res + 2 * res_stride, pmul(vec2d_swizzle2(iC2, iC1, 3), d1)); + pstoret<double, Packet2d, ResultAlignment>(res + 3 * res_stride, pmul(vec2d_swizzle2(iC2, iC1, 0), d2)); + pstoret<double, Packet2d, ResultAlignment>(res + 2 * res_stride + 2, pmul(vec2d_swizzle2(iD2, iD1, 3), d1)); + pstoret<double, Packet2d, ResultAlignment>(res + 3 * res_stride + 2, pmul(vec2d_swizzle2(iD2, iD1, 0), d2)); + } +}; +#endif +} // namespace internal +} // namespace Eigen +#endif diff --git a/Eigen/src/LU/arch/Inverse_SSE.h b/Eigen/src/LU/arch/Inverse_SSE.h deleted file mode 100644 index ebb64a62b..000000000 --- a/Eigen/src/LU/arch/Inverse_SSE.h +++ /dev/null @@ -1,338 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2001 Intel Corporation -// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr> -// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1@gmail.com> -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -// The SSE code for the 4x4 float and double matrix inverse in this file -// comes from the following Intel's library: -// http://software.intel.com/en-us/articles/optimized-matrix-library-for-use-with-the-intel-pentiumr-4-processors-sse2-instructions/ -// -// Here is the respective copyright and license statement: -// -// Copyright (c) 2001 Intel Corporation. -// -// Permition is granted to use, copy, distribute and prepare derivative works -// of this library for any purpose and without fee, provided, that the above -// copyright notice and this statement appear in all copies. -// Intel makes no representations about the suitability of this software for -// any purpose, and specifically disclaims all warranties. -// See LEGAL.TXT for all the legal information. - -#ifndef EIGEN_INVERSE_SSE_H -#define EIGEN_INVERSE_SSE_H - -namespace Eigen { - -namespace internal { - -template<typename MatrixType, typename ResultType> -struct compute_inverse_size4<Architecture::SSE, float, MatrixType, ResultType> -{ - enum { - MatrixAlignment = traits<MatrixType>::Alignment, - ResultAlignment = traits<ResultType>::Alignment, - StorageOrdersMatch = (MatrixType::Flags&RowMajorBit) == (ResultType::Flags&RowMajorBit) - }; - typedef typename conditional<(MatrixType::Flags&LinearAccessBit),MatrixType const &,typename MatrixType::PlainObject>::type ActualMatrixType; - - static void run(const MatrixType& mat, ResultType& result) - { - ActualMatrixType matrix(mat); - EIGEN_ALIGN16 const unsigned int _Sign_PNNP[4] = { 0x00000000, 0x80000000, 0x80000000, 0x00000000 }; - - // Load the full matrix into registers - __m128 _L1 = matrix.template packet<MatrixAlignment>( 0); - __m128 _L2 = matrix.template packet<MatrixAlignment>( 4); - __m128 _L3 = matrix.template packet<MatrixAlignment>( 8); - __m128 _L4 = matrix.template packet<MatrixAlignment>(12); - - // The inverse is calculated using "Divide and Conquer" technique. The - // original matrix is divide into four 2x2 sub-matrices. Since each - // register holds four matrix element, the smaller matrices are - // represented as a registers. Hence we get a better locality of the - // calculations. - - __m128 A, B, C, D; // the four sub-matrices - if(!StorageOrdersMatch) - { - A = _mm_unpacklo_ps(_L1, _L2); - B = _mm_unpacklo_ps(_L3, _L4); - C = _mm_unpackhi_ps(_L1, _L2); - D = _mm_unpackhi_ps(_L3, _L4); - } - else - { - A = _mm_movelh_ps(_L1, _L2); - B = _mm_movehl_ps(_L2, _L1); - C = _mm_movelh_ps(_L3, _L4); - D = _mm_movehl_ps(_L4, _L3); - } - - __m128 iA, iB, iC, iD, // partial inverse of the sub-matrices - DC, AB; - __m128 dA, dB, dC, dD; // determinant of the sub-matrices - __m128 det, d, d1, d2; - __m128 rd; // reciprocal of the determinant - - // AB = A# * B - AB = _mm_mul_ps(_mm_shuffle_ps(A,A,0x0F), B); - AB = _mm_sub_ps(AB,_mm_mul_ps(_mm_shuffle_ps(A,A,0xA5), _mm_shuffle_ps(B,B,0x4E))); - // DC = D# * C - DC = _mm_mul_ps(_mm_shuffle_ps(D,D,0x0F), C); - DC = _mm_sub_ps(DC,_mm_mul_ps(_mm_shuffle_ps(D,D,0xA5), _mm_shuffle_ps(C,C,0x4E))); - - // dA = |A| - dA = _mm_mul_ps(_mm_shuffle_ps(A, A, 0x5F),A); - dA = _mm_sub_ss(dA, _mm_movehl_ps(dA,dA)); - // dB = |B| - dB = _mm_mul_ps(_mm_shuffle_ps(B, B, 0x5F),B); - dB = _mm_sub_ss(dB, _mm_movehl_ps(dB,dB)); - - // dC = |C| - dC = _mm_mul_ps(_mm_shuffle_ps(C, C, 0x5F),C); - dC = _mm_sub_ss(dC, _mm_movehl_ps(dC,dC)); - // dD = |D| - dD = _mm_mul_ps(_mm_shuffle_ps(D, D, 0x5F),D); - dD = _mm_sub_ss(dD, _mm_movehl_ps(dD,dD)); - - // d = trace(AB*DC) = trace(A#*B*D#*C) - d = _mm_mul_ps(_mm_shuffle_ps(DC,DC,0xD8),AB); - - // iD = C*A#*B - iD = _mm_mul_ps(_mm_shuffle_ps(C,C,0xA0), _mm_movelh_ps(AB,AB)); - iD = _mm_add_ps(iD,_mm_mul_ps(_mm_shuffle_ps(C,C,0xF5), _mm_movehl_ps(AB,AB))); - // iA = B*D#*C - iA = _mm_mul_ps(_mm_shuffle_ps(B,B,0xA0), _mm_movelh_ps(DC,DC)); - iA = _mm_add_ps(iA,_mm_mul_ps(_mm_shuffle_ps(B,B,0xF5), _mm_movehl_ps(DC,DC))); - - // d = trace(AB*DC) = trace(A#*B*D#*C) [continue] - d = _mm_add_ps(d, _mm_movehl_ps(d, d)); - d = _mm_add_ss(d, _mm_shuffle_ps(d, d, 1)); - d1 = _mm_mul_ss(dA,dD); - d2 = _mm_mul_ss(dB,dC); - - // iD = D*|A| - C*A#*B - iD = _mm_sub_ps(_mm_mul_ps(D,_mm_shuffle_ps(dA,dA,0)), iD); - - // iA = A*|D| - B*D#*C; - iA = _mm_sub_ps(_mm_mul_ps(A,_mm_shuffle_ps(dD,dD,0)), iA); - - // det = |A|*|D| + |B|*|C| - trace(A#*B*D#*C) - det = _mm_sub_ss(_mm_add_ss(d1,d2),d); - rd = _mm_div_ss(_mm_set_ss(1.0f), det); - -// #ifdef ZERO_SINGULAR -// rd = _mm_and_ps(_mm_cmpneq_ss(det,_mm_setzero_ps()), rd); -// #endif - - // iB = D * (A#B)# = D*B#*A - iB = _mm_mul_ps(D, _mm_shuffle_ps(AB,AB,0x33)); - iB = _mm_sub_ps(iB, _mm_mul_ps(_mm_shuffle_ps(D,D,0xB1), _mm_shuffle_ps(AB,AB,0x66))); - // iC = A * (D#C)# = A*C#*D - iC = _mm_mul_ps(A, _mm_shuffle_ps(DC,DC,0x33)); - iC = _mm_sub_ps(iC, _mm_mul_ps(_mm_shuffle_ps(A,A,0xB1), _mm_shuffle_ps(DC,DC,0x66))); - - rd = _mm_shuffle_ps(rd,rd,0); - rd = _mm_xor_ps(rd, _mm_load_ps((float*)_Sign_PNNP)); - - // iB = C*|B| - D*B#*A - iB = _mm_sub_ps(_mm_mul_ps(C,_mm_shuffle_ps(dB,dB,0)), iB); - - // iC = B*|C| - A*C#*D; - iC = _mm_sub_ps(_mm_mul_ps(B,_mm_shuffle_ps(dC,dC,0)), iC); - - // iX = iX / det - iA = _mm_mul_ps(rd,iA); - iB = _mm_mul_ps(rd,iB); - iC = _mm_mul_ps(rd,iC); - iD = _mm_mul_ps(rd,iD); - - Index res_stride = result.outerStride(); - float* res = result.data(); - pstoret<float, Packet4f, ResultAlignment>(res+0, _mm_shuffle_ps(iA,iB,0x77)); - pstoret<float, Packet4f, ResultAlignment>(res+res_stride, _mm_shuffle_ps(iA,iB,0x22)); - pstoret<float, Packet4f, ResultAlignment>(res+2*res_stride, _mm_shuffle_ps(iC,iD,0x77)); - pstoret<float, Packet4f, ResultAlignment>(res+3*res_stride, _mm_shuffle_ps(iC,iD,0x22)); - } - -}; - -template<typename MatrixType, typename ResultType> -struct compute_inverse_size4<Architecture::SSE, double, MatrixType, ResultType> -{ - enum { - MatrixAlignment = traits<MatrixType>::Alignment, - ResultAlignment = traits<ResultType>::Alignment, - StorageOrdersMatch = (MatrixType::Flags&RowMajorBit) == (ResultType::Flags&RowMajorBit) - }; - typedef typename conditional<(MatrixType::Flags&LinearAccessBit),MatrixType const &,typename MatrixType::PlainObject>::type ActualMatrixType; - - static void run(const MatrixType& mat, ResultType& result) - { - ActualMatrixType matrix(mat); - const __m128d _Sign_NP = _mm_castsi128_pd(_mm_set_epi32(0x0,0x0,0x80000000,0x0)); - const __m128d _Sign_PN = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0)); - - // The inverse is calculated using "Divide and Conquer" technique. The - // original matrix is divide into four 2x2 sub-matrices. Since each - // register of the matrix holds two elements, the smaller matrices are - // consisted of two registers. Hence we get a better locality of the - // calculations. - - // the four sub-matrices - __m128d A1, A2, B1, B2, C1, C2, D1, D2; - - if(StorageOrdersMatch) - { - A1 = matrix.template packet<MatrixAlignment>( 0); B1 = matrix.template packet<MatrixAlignment>( 2); - A2 = matrix.template packet<MatrixAlignment>( 4); B2 = matrix.template packet<MatrixAlignment>( 6); - C1 = matrix.template packet<MatrixAlignment>( 8); D1 = matrix.template packet<MatrixAlignment>(10); - C2 = matrix.template packet<MatrixAlignment>(12); D2 = matrix.template packet<MatrixAlignment>(14); - } - else - { - __m128d tmp; - A1 = matrix.template packet<MatrixAlignment>( 0); C1 = matrix.template packet<MatrixAlignment>( 2); - A2 = matrix.template packet<MatrixAlignment>( 4); C2 = matrix.template packet<MatrixAlignment>( 6); - tmp = A1; - A1 = _mm_unpacklo_pd(A1,A2); - A2 = _mm_unpackhi_pd(tmp,A2); - tmp = C1; - C1 = _mm_unpacklo_pd(C1,C2); - C2 = _mm_unpackhi_pd(tmp,C2); - - B1 = matrix.template packet<MatrixAlignment>( 8); D1 = matrix.template packet<MatrixAlignment>(10); - B2 = matrix.template packet<MatrixAlignment>(12); D2 = matrix.template packet<MatrixAlignment>(14); - tmp = B1; - B1 = _mm_unpacklo_pd(B1,B2); - B2 = _mm_unpackhi_pd(tmp,B2); - tmp = D1; - D1 = _mm_unpacklo_pd(D1,D2); - D2 = _mm_unpackhi_pd(tmp,D2); - } - - __m128d iA1, iA2, iB1, iB2, iC1, iC2, iD1, iD2, // partial invese of the sub-matrices - DC1, DC2, AB1, AB2; - __m128d dA, dB, dC, dD; // determinant of the sub-matrices - __m128d det, d1, d2, rd; - - // dA = |A| - dA = _mm_shuffle_pd(A2, A2, 1); - dA = _mm_mul_pd(A1, dA); - dA = _mm_sub_sd(dA, _mm_shuffle_pd(dA,dA,3)); - // dB = |B| - dB = _mm_shuffle_pd(B2, B2, 1); - dB = _mm_mul_pd(B1, dB); - dB = _mm_sub_sd(dB, _mm_shuffle_pd(dB,dB,3)); - - // AB = A# * B - AB1 = _mm_mul_pd(B1, _mm_shuffle_pd(A2,A2,3)); - AB2 = _mm_mul_pd(B2, _mm_shuffle_pd(A1,A1,0)); - AB1 = _mm_sub_pd(AB1, _mm_mul_pd(B2, _mm_shuffle_pd(A1,A1,3))); - AB2 = _mm_sub_pd(AB2, _mm_mul_pd(B1, _mm_shuffle_pd(A2,A2,0))); - - // dC = |C| - dC = _mm_shuffle_pd(C2, C2, 1); - dC = _mm_mul_pd(C1, dC); - dC = _mm_sub_sd(dC, _mm_shuffle_pd(dC,dC,3)); - // dD = |D| - dD = _mm_shuffle_pd(D2, D2, 1); - dD = _mm_mul_pd(D1, dD); - dD = _mm_sub_sd(dD, _mm_shuffle_pd(dD,dD,3)); - - // DC = D# * C - DC1 = _mm_mul_pd(C1, _mm_shuffle_pd(D2,D2,3)); - DC2 = _mm_mul_pd(C2, _mm_shuffle_pd(D1,D1,0)); - DC1 = _mm_sub_pd(DC1, _mm_mul_pd(C2, _mm_shuffle_pd(D1,D1,3))); - DC2 = _mm_sub_pd(DC2, _mm_mul_pd(C1, _mm_shuffle_pd(D2,D2,0))); - - // rd = trace(AB*DC) = trace(A#*B*D#*C) - d1 = _mm_mul_pd(AB1, _mm_shuffle_pd(DC1, DC2, 0)); - d2 = _mm_mul_pd(AB2, _mm_shuffle_pd(DC1, DC2, 3)); - rd = _mm_add_pd(d1, d2); - rd = _mm_add_sd(rd, _mm_shuffle_pd(rd, rd,3)); - - // iD = C*A#*B - iD1 = _mm_mul_pd(AB1, _mm_shuffle_pd(C1,C1,0)); - iD2 = _mm_mul_pd(AB1, _mm_shuffle_pd(C2,C2,0)); - iD1 = _mm_add_pd(iD1, _mm_mul_pd(AB2, _mm_shuffle_pd(C1,C1,3))); - iD2 = _mm_add_pd(iD2, _mm_mul_pd(AB2, _mm_shuffle_pd(C2,C2,3))); - - // iA = B*D#*C - iA1 = _mm_mul_pd(DC1, _mm_shuffle_pd(B1,B1,0)); - iA2 = _mm_mul_pd(DC1, _mm_shuffle_pd(B2,B2,0)); - iA1 = _mm_add_pd(iA1, _mm_mul_pd(DC2, _mm_shuffle_pd(B1,B1,3))); - iA2 = _mm_add_pd(iA2, _mm_mul_pd(DC2, _mm_shuffle_pd(B2,B2,3))); - - // iD = D*|A| - C*A#*B - dA = _mm_shuffle_pd(dA,dA,0); - iD1 = _mm_sub_pd(_mm_mul_pd(D1, dA), iD1); - iD2 = _mm_sub_pd(_mm_mul_pd(D2, dA), iD2); - - // iA = A*|D| - B*D#*C; - dD = _mm_shuffle_pd(dD,dD,0); - iA1 = _mm_sub_pd(_mm_mul_pd(A1, dD), iA1); - iA2 = _mm_sub_pd(_mm_mul_pd(A2, dD), iA2); - - d1 = _mm_mul_sd(dA, dD); - d2 = _mm_mul_sd(dB, dC); - - // iB = D * (A#B)# = D*B#*A - iB1 = _mm_mul_pd(D1, _mm_shuffle_pd(AB2,AB1,1)); - iB2 = _mm_mul_pd(D2, _mm_shuffle_pd(AB2,AB1,1)); - iB1 = _mm_sub_pd(iB1, _mm_mul_pd(_mm_shuffle_pd(D1,D1,1), _mm_shuffle_pd(AB2,AB1,2))); - iB2 = _mm_sub_pd(iB2, _mm_mul_pd(_mm_shuffle_pd(D2,D2,1), _mm_shuffle_pd(AB2,AB1,2))); - - // det = |A|*|D| + |B|*|C| - trace(A#*B*D#*C) - det = _mm_add_sd(d1, d2); - det = _mm_sub_sd(det, rd); - - // iC = A * (D#C)# = A*C#*D - iC1 = _mm_mul_pd(A1, _mm_shuffle_pd(DC2,DC1,1)); - iC2 = _mm_mul_pd(A2, _mm_shuffle_pd(DC2,DC1,1)); - iC1 = _mm_sub_pd(iC1, _mm_mul_pd(_mm_shuffle_pd(A1,A1,1), _mm_shuffle_pd(DC2,DC1,2))); - iC2 = _mm_sub_pd(iC2, _mm_mul_pd(_mm_shuffle_pd(A2,A2,1), _mm_shuffle_pd(DC2,DC1,2))); - - rd = _mm_div_sd(_mm_set_sd(1.0), det); -// #ifdef ZERO_SINGULAR -// rd = _mm_and_pd(_mm_cmpneq_sd(det,_mm_setzero_pd()), rd); -// #endif - rd = _mm_shuffle_pd(rd,rd,0); - - // iB = C*|B| - D*B#*A - dB = _mm_shuffle_pd(dB,dB,0); - iB1 = _mm_sub_pd(_mm_mul_pd(C1, dB), iB1); - iB2 = _mm_sub_pd(_mm_mul_pd(C2, dB), iB2); - - d1 = _mm_xor_pd(rd, _Sign_PN); - d2 = _mm_xor_pd(rd, _Sign_NP); - - // iC = B*|C| - A*C#*D; - dC = _mm_shuffle_pd(dC,dC,0); - iC1 = _mm_sub_pd(_mm_mul_pd(B1, dC), iC1); - iC2 = _mm_sub_pd(_mm_mul_pd(B2, dC), iC2); - - Index res_stride = result.outerStride(); - double* res = result.data(); - pstoret<double, Packet2d, ResultAlignment>(res+0, _mm_mul_pd(_mm_shuffle_pd(iA2, iA1, 3), d1)); - pstoret<double, Packet2d, ResultAlignment>(res+res_stride, _mm_mul_pd(_mm_shuffle_pd(iA2, iA1, 0), d2)); - pstoret<double, Packet2d, ResultAlignment>(res+2, _mm_mul_pd(_mm_shuffle_pd(iB2, iB1, 3), d1)); - pstoret<double, Packet2d, ResultAlignment>(res+res_stride+2, _mm_mul_pd(_mm_shuffle_pd(iB2, iB1, 0), d2)); - pstoret<double, Packet2d, ResultAlignment>(res+2*res_stride, _mm_mul_pd(_mm_shuffle_pd(iC2, iC1, 3), d1)); - pstoret<double, Packet2d, ResultAlignment>(res+3*res_stride, _mm_mul_pd(_mm_shuffle_pd(iC2, iC1, 0), d2)); - pstoret<double, Packet2d, ResultAlignment>(res+2*res_stride+2,_mm_mul_pd(_mm_shuffle_pd(iD2, iD1, 3), d1)); - pstoret<double, Packet2d, ResultAlignment>(res+3*res_stride+2,_mm_mul_pd(_mm_shuffle_pd(iD2, iD1, 0), d2)); - } -}; - -} // end namespace internal - -} // end namespace Eigen - -#endif // EIGEN_INVERSE_SSE_H |