From d5b135998e0b76f3c3dbffd71291c0394f7e4aba Mon Sep 17 00:00:00 2001 From: Dario Sassi Date: Thu, 8 Oct 2020 08:17:27 +0000 Subject: [PATCH] Extern : aggiornamento Eigen alla versione 3.3.8. --- Eigen/CMakeLists.txt | 19 - Eigen/Core | 7 +- Eigen/Eigenvalues | 4 +- Eigen/Geometry | 4 +- Eigen/QR | 4 +- Eigen/SparseQR | 1 - Eigen/src/Core/ArrayBase.h | 4 +- Eigen/src/Core/CwiseUnaryView.h | 2 + Eigen/src/Core/DenseBase.h | 7 +- Eigen/src/Core/DenseStorage.h | 6 +- Eigen/src/Core/GenericPacketMath.h | 9 +- Eigen/src/Core/MapBase.h | 5 + Eigen/src/Core/MathFunctions.h | 16 +- Eigen/src/Core/MatrixBase.h | 3 +- Eigen/src/Core/PlainObjectBase.h | 12 +- Eigen/src/Core/ProductEvaluators.h | 28 +- Eigen/src/Core/Ref.h | 5 +- Eigen/src/Core/SolveTriangular.h | 6 +- Eigen/src/Core/Transpose.h | 2 + Eigen/src/Core/TriangularMatrix.h | 8 +- Eigen/src/Core/arch/AVX512/MathFunctions.h | 70 +- Eigen/src/Core/arch/AVX512/PacketMath.h | 719 +++++++++--------- Eigen/src/Core/arch/CUDA/Half.h | 1 + Eigen/src/Core/arch/CUDA/PacketMathHalf.h | 4 +- Eigen/src/Core/functors/UnaryFunctors.h | 2 +- .../Core/products/GeneralBlockPanelKernel.h | 7 +- Eigen/src/Core/products/GeneralMatrixMatrix.h | 37 +- .../products/GeneralMatrixMatrixTriangular.h | 68 +- .../GeneralMatrixMatrixTriangular_BLAS.h | 10 +- .../Core/products/GeneralMatrixMatrix_BLAS.h | 6 +- Eigen/src/Core/products/Parallelizer.h | 15 +- .../Core/products/SelfadjointMatrixMatrix.h | 54 +- .../products/SelfadjointMatrixMatrix_BLAS.h | 24 +- Eigen/src/Core/products/SelfadjointProduct.h | 4 +- .../Core/products/TriangularMatrixMatrix.h | 54 +- .../products/TriangularMatrixMatrix_BLAS.h | 26 +- .../Core/products/TriangularSolverMatrix.h | 62 +- .../products/TriangularSolverMatrix_BLAS.h | 12 +- Eigen/src/Core/util/BlasUtil.h | 115 ++- Eigen/src/Core/util/DisableStupidWarnings.h | 13 +- Eigen/src/Core/util/ForwardDeclarations.h | 6 +- Eigen/src/Core/util/Macros.h | 58 +- Eigen/src/Core/util/Meta.h | 34 + Eigen/src/Core/util/ReenableStupidWarnings.h | 6 +- Eigen/src/Core/util/XprHelper.h | 17 + Eigen/src/Eigenvalues/ComplexSchur.h | 9 +- Eigen/src/Eigenvalues/RealSchur.h | 15 +- .../src/Eigenvalues/SelfAdjointEigenSolver.h | 7 +- Eigen/src/Geometry/Quaternion.h | 22 +- Eigen/src/Geometry/Transform.h | 4 +- Eigen/src/Geometry/Translation.h | 6 - Eigen/src/Geometry/Umeyama.h | 2 +- Eigen/src/LU/PartialPivLU.h | 5 +- Eigen/src/LU/arch/Inverse_SSE.h | 4 +- Eigen/src/PardisoSupport/PardisoSupport.h | 3 +- Eigen/src/SVD/BDCSVD.h | 67 +- Eigen/src/SVD/SVDBase.h | 2 +- Eigen/src/SparseCholesky/SimplicialCholesky.h | 2 +- .../SparseCholesky/SimplicialCholesky_impl.h | 2 +- Eigen/src/SparseCore/AmbiVector.h | 5 +- Eigen/src/SparseCore/SparseCwiseUnaryOp.h | 2 + Eigen/src/SparseCore/SparseMatrix.h | 3 +- Eigen/src/SparseCore/SparseSelfAdjointView.h | 4 +- Eigen/src/SparseCore/SparseView.h | 1 + Eigen/src/SparseLU/SparseLU.h | 4 +- Eigen/src/StlSupport/StdDeque.h | 6 +- 66 files changed, 1047 insertions(+), 704 deletions(-) delete mode 100644 Eigen/CMakeLists.txt diff --git a/Eigen/CMakeLists.txt b/Eigen/CMakeLists.txt deleted file mode 100644 index 9eb502b..0000000 --- a/Eigen/CMakeLists.txt +++ /dev/null @@ -1,19 +0,0 @@ -include(RegexUtils) -test_escape_string_as_regex() - -file(GLOB Eigen_directory_files "*") - -escape_string_as_regex(ESCAPED_CMAKE_CURRENT_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}") - -foreach(f ${Eigen_directory_files}) - if(NOT f MATCHES "\\.txt" AND NOT f MATCHES "${ESCAPED_CMAKE_CURRENT_SOURCE_DIR}/[.].+" AND NOT f MATCHES "${ESCAPED_CMAKE_CURRENT_SOURCE_DIR}/src") - list(APPEND Eigen_directory_files_to_install ${f}) - endif() -endforeach(f ${Eigen_directory_files}) - -install(FILES - ${Eigen_directory_files_to_install} - DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen COMPONENT Devel - ) - -install(DIRECTORY src DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen COMPONENT Devel FILES_MATCHING PATTERN "*.h") diff --git a/Eigen/Core b/Eigen/Core index b923b8c..ac7c5b3 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -279,7 +279,10 @@ #include #include #include -#include +#include +#ifndef EIGEN_NO_IO + #include +#endif #include #include #include @@ -375,7 +378,9 @@ using std::ptrdiff_t; #if defined EIGEN_VECTORIZE_AVX512 #include "src/Core/arch/SSE/PacketMath.h" + #include "src/Core/arch/SSE/MathFunctions.h" #include "src/Core/arch/AVX/PacketMath.h" + #include "src/Core/arch/AVX/MathFunctions.h" #include "src/Core/arch/AVX512/PacketMath.h" #include "src/Core/arch/AVX512/MathFunctions.h" #elif defined EIGEN_VECTORIZE_AVX diff --git a/Eigen/Eigenvalues b/Eigen/Eigenvalues index f3f661b..7d6ac78 100644 --- a/Eigen/Eigenvalues +++ b/Eigen/Eigenvalues @@ -10,14 +10,14 @@ #include "Core" -#include "src/Core/util/DisableStupidWarnings.h" - #include "Cholesky" #include "Jacobi" #include "Householder" #include "LU" #include "Geometry" +#include "src/Core/util/DisableStupidWarnings.h" + /** \defgroup Eigenvalues_Module Eigenvalues module * * diff --git a/Eigen/Geometry b/Eigen/Geometry index 716d529..da88c03 100644 --- a/Eigen/Geometry +++ b/Eigen/Geometry @@ -10,12 +10,12 @@ #include "Core" -#include "src/Core/util/DisableStupidWarnings.h" - #include "SVD" #include "LU" #include +#include "src/Core/util/DisableStupidWarnings.h" + /** \defgroup Geometry_Module Geometry module * * This module provides support for: diff --git a/Eigen/QR b/Eigen/QR index c7e9144..1be1863 100644 --- a/Eigen/QR +++ b/Eigen/QR @@ -10,12 +10,12 @@ #include "Core" -#include "src/Core/util/DisableStupidWarnings.h" - #include "Cholesky" #include "Jacobi" #include "Householder" +#include "src/Core/util/DisableStupidWarnings.h" + /** \defgroup QR_Module QR module * * diff --git a/Eigen/SparseQR b/Eigen/SparseQR index a6f3b7f..f5fc5fa 100644 --- a/Eigen/SparseQR +++ b/Eigen/SparseQR @@ -28,7 +28,6 @@ * */ -#include "OrderingMethods" #include "src/SparseCore/SparseColEtree.h" #include "src/SparseQR/SparseQR.h" diff --git a/Eigen/src/Core/ArrayBase.h b/Eigen/src/Core/ArrayBase.h index 3dbc708..33f644e 100644 --- a/Eigen/src/Core/ArrayBase.h +++ b/Eigen/src/Core/ArrayBase.h @@ -153,8 +153,8 @@ template class ArrayBase // inline void evalTo(Dest& dst) const { dst = matrix(); } protected: - EIGEN_DEVICE_FUNC - ArrayBase() : Base() {} + EIGEN_DEFAULT_COPY_CONSTRUCTOR(ArrayBase) + EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(ArrayBase) private: explicit ArrayBase(Index); diff --git a/Eigen/src/Core/CwiseUnaryView.h b/Eigen/src/Core/CwiseUnaryView.h index 2710330..5a30fa8 100644 --- a/Eigen/src/Core/CwiseUnaryView.h +++ b/Eigen/src/Core/CwiseUnaryView.h @@ -121,6 +121,8 @@ class CwiseUnaryViewImpl { return derived().nestedExpression().outerStride() * sizeof(typename internal::traits::Scalar) / sizeof(Scalar); } + protected: + EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(CwiseUnaryViewImpl) }; } // end namespace Eigen diff --git a/Eigen/src/Core/DenseBase.h b/Eigen/src/Core/DenseBase.h index 90066ae..c55a682 100644 --- a/Eigen/src/Core/DenseBase.h +++ b/Eigen/src/Core/DenseBase.h @@ -40,7 +40,7 @@ static inline void check_DenseIndex_is_signed() { */ template class DenseBase #ifndef EIGEN_PARSED_BY_DOXYGEN - : public DenseCoeffsBase + : public DenseCoeffsBase::value> #else : public DenseCoeffsBase #endif // not EIGEN_PARSED_BY_DOXYGEN @@ -71,7 +71,7 @@ template class DenseBase typedef Scalar value_type; typedef typename NumTraits::Real RealScalar; - typedef DenseCoeffsBase Base; + typedef DenseCoeffsBase::value> Base; using Base::derived; using Base::const_cast_derived; @@ -587,11 +587,12 @@ template class DenseBase } protected: + EIGEN_DEFAULT_COPY_CONSTRUCTOR(DenseBase) /** Default constructor. Do nothing. */ EIGEN_DEVICE_FUNC DenseBase() { /* Just checks for self-consistency of the flags. - * Only do it when debugging Eigen, as this borders on paranoiac and could slow compilation down + * Only do it when debugging Eigen, as this borders on paranoia and could slow compilation down */ #ifdef EIGEN_INTERNAL_DEBUGGING EIGEN_STATIC_ASSERT((EIGEN_IMPLIES(MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1, int(IsRowMajor)) diff --git a/Eigen/src/Core/DenseStorage.h b/Eigen/src/Core/DenseStorage.h index 7958fee..7d6d4e6 100644 --- a/Eigen/src/Core/DenseStorage.h +++ b/Eigen/src/Core/DenseStorage.h @@ -404,7 +404,7 @@ template class DenseStorage(m_data, m_rows*m_cols); - if (size) + if (size>0) // >0 and not simply !=0 to let the compiler knows that size cannot be negative m_data = internal::conditional_aligned_new_auto(size); else m_data = 0; @@ -479,7 +479,7 @@ template class DenseStorage(m_data, _Rows*m_cols); - if (size) + if (size>0) // >0 and not simply !=0 to let the compiler knows that size cannot be negative m_data = internal::conditional_aligned_new_auto(size); else m_data = 0; @@ -553,7 +553,7 @@ template class DenseStorage(m_data, _Cols*m_rows); - if (size) + if (size>0) // >0 and not simply !=0 to let the compiler knows that size cannot be negative m_data = internal::conditional_aligned_new_auto(size); else m_data = 0; diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 029f8ac..e594437 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -351,10 +351,7 @@ template EIGEN_DEVICE_FUNC inline Packet preverse(const Packet& /** \internal \returns \a a with real and imaginary part flipped (for complex type only) */ template EIGEN_DEVICE_FUNC inline Packet pcplxflip(const Packet& a) { - // FIXME: uncomment the following in case we drop the internal imag and real functions. -// using std::imag; -// using std::real; - return Packet(imag(a),real(a)); + return Packet(a.imag(),a.real()); } /************************** @@ -524,10 +521,10 @@ inline void palign(PacketType& first, const PacketType& second) #ifndef __CUDACC__ template<> inline std::complex pmul(const std::complex& a, const std::complex& b) -{ return std::complex(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); } +{ return std::complex(a.real()*b.real() - a.imag()*b.imag(), a.imag()*b.real() + a.real()*b.imag()); } template<> inline std::complex pmul(const std::complex& a, const std::complex& b) -{ return std::complex(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); } +{ return std::complex(a.real()*b.real() - a.imag()*b.imag(), a.imag()*b.real() + a.real()*b.imag()); } #endif diff --git a/Eigen/src/Core/MapBase.h b/Eigen/src/Core/MapBase.h index 668922f..92c3b28 100644 --- a/Eigen/src/Core/MapBase.h +++ b/Eigen/src/Core/MapBase.h @@ -182,6 +182,8 @@ template class MapBase #endif protected: + EIGEN_DEFAULT_COPY_CONSTRUCTOR(MapBase) + EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(MapBase) template EIGEN_DEVICE_FUNC @@ -294,6 +296,9 @@ template class MapBase // In theory we could simply refer to Base:Base::operator=, but MSVC does not like Base::Base, // see bugs 821 and 920. using ReadOnlyMapBase::Base::operator=; + protected: + EIGEN_DEFAULT_COPY_CONSTRUCTOR(MapBase) + EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(MapBase) }; #undef EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index b249ce0..01736c2 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -287,7 +287,7 @@ struct abs2_impl_default // IsComplex EIGEN_DEVICE_FUNC static inline RealScalar run(const Scalar& x) { - return real(x)*real(x) + imag(x)*imag(x); + return x.real()*x.real() + x.imag()*x.imag(); } }; @@ -313,14 +313,17 @@ struct abs2_retval ****************************************************************************/ template -struct norm1_default_impl +struct norm1_default_impl; + +template +struct norm1_default_impl { typedef typename NumTraits::Real RealScalar; EIGEN_DEVICE_FUNC static inline RealScalar run(const Scalar& x) { EIGEN_USING_STD_MATH(abs); - return abs(real(x)) + abs(imag(x)); + return abs(x.real()) + abs(x.imag()); } }; @@ -662,8 +665,8 @@ struct random_default_impl { static inline Scalar run(const Scalar& x, const Scalar& y) { - return Scalar(random(real(x), real(y)), - random(imag(x), imag(y))); + return Scalar(random(x.real(), y.real()), + random(x.imag(), y.imag())); } static inline Scalar run() { @@ -916,6 +919,9 @@ inline EIGEN_MATHFUNC_RETVAL(abs2, Scalar) abs2(const Scalar& x) return EIGEN_MATHFUNC_IMPL(abs2, Scalar)::run(x); } +EIGEN_DEVICE_FUNC +inline bool abs2(bool x) { return x; } + template EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(norm1, Scalar) norm1(const Scalar& x) diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h index e6c3590..f8bcc8c 100644 --- a/Eigen/src/Core/MatrixBase.h +++ b/Eigen/src/Core/MatrixBase.h @@ -464,7 +464,8 @@ template class MatrixBase EIGEN_MATRIX_FUNCTION_1(MatrixComplexPowerReturnValue, pow, power to \c p, const std::complex& p) protected: - EIGEN_DEVICE_FUNC MatrixBase() : Base() {} + EIGEN_DEFAULT_COPY_CONSTRUCTOR(MatrixBase) + EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(MatrixBase) private: EIGEN_DEVICE_FUNC explicit MatrixBase(int); diff --git a/Eigen/src/Core/PlainObjectBase.h b/Eigen/src/Core/PlainObjectBase.h index 1dc7e22..0f3632c 100644 --- a/Eigen/src/Core/PlainObjectBase.h +++ b/Eigen/src/Core/PlainObjectBase.h @@ -737,8 +737,10 @@ class PlainObjectBase : public internal::dense_xpr_base::type EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _init2(Index rows, Index cols, typename internal::enable_if::type* = 0) { - EIGEN_STATIC_ASSERT(bool(NumTraits::IsInteger) && - bool(NumTraits::IsInteger), + const bool t0_is_integer_alike = internal::is_valid_index_type::value; + const bool t1_is_integer_alike = internal::is_valid_index_type::value; + EIGEN_STATIC_ASSERT(t0_is_integer_alike && + t1_is_integer_alike, FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED) resize(rows,cols); } @@ -773,9 +775,9 @@ class PlainObjectBase : public internal::dense_xpr_base::type && ((!internal::is_same::XprKind,ArrayXpr>::value || Base::SizeAtCompileTime==Dynamic)),T>::type* = 0) { // NOTE MSVC 2008 complains if we directly put bool(NumTraits::IsInteger) as the EIGEN_STATIC_ASSERT argument. - const bool is_integer = NumTraits::IsInteger; - EIGEN_UNUSED_VARIABLE(is_integer); - EIGEN_STATIC_ASSERT(is_integer, + const bool is_integer_alike = internal::is_valid_index_type::value; + EIGEN_UNUSED_VARIABLE(is_integer_alike); + EIGEN_STATIC_ASSERT(is_integer_alike, FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED) resize(size); } diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h index 9b99bd7..bce1310 100644 --- a/Eigen/src/Core/ProductEvaluators.h +++ b/Eigen/src/Core/ProductEvaluators.h @@ -396,7 +396,7 @@ struct generic_product_impl // but easier on the compiler side call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::assign_op()); } - + template static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { @@ -410,6 +410,32 @@ struct generic_product_impl // dst.noalias() -= lhs.lazyProduct(rhs); call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::sub_assign_op()); } + + // Catch "dst {,+,-}= (s*A)*B" and evaluate it lazily by moving out the scalar factor: + // dst {,+,-}= s * (A.lazyProduct(B)) + // This is a huge benefit for heap-allocated matrix types as it save one costly allocation. + // For them, this strategy is also faster than simply by-passing the heap allocation through + // stack allocation. + // For fixed sizes matrices, this is less obvious, it is sometimes x2 faster, but sometimes x3 slower, + // and the behavior depends also a lot on the compiler... so let's be conservative and enable them for dynamic-size only, + // that is when coming from generic_product_impl<...,GemmProduct> in file GeneralMatrixMatrix.h + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void eval_dynamic(Dst& dst, const CwiseBinaryOp, + const CwiseNullaryOp, Plain1>, Xpr2>& lhs, const Rhs& rhs, const Func &func) + { + call_assignment_no_alias(dst, lhs.lhs().functor().m_other * lhs.rhs().lazyProduct(rhs), func); + } + + // Here, we we always have LhsT==Lhs, but we need to make it a template type to make the above + // overload more specialized. + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void eval_dynamic(Dst& dst, const LhsT& lhs, const Rhs& rhs, const Func &func) + { + call_assignment_no_alias(dst, lhs.lazyProduct(rhs), func); + } + // template // static inline void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) diff --git a/Eigen/src/Core/Ref.h b/Eigen/src/Core/Ref.h index 9c6e3c5..17a1496 100644 --- a/Eigen/src/Core/Ref.h +++ b/Eigen/src/Core/Ref.h @@ -28,12 +28,13 @@ struct traits > template struct match { enum { + IsVectorAtCompileTime = PlainObjectType::IsVectorAtCompileTime || Derived::IsVectorAtCompileTime, HasDirectAccess = internal::has_direct_access::ret, - StorageOrderMatch = PlainObjectType::IsVectorAtCompileTime || Derived::IsVectorAtCompileTime || ((PlainObjectType::Flags&RowMajorBit)==(Derived::Flags&RowMajorBit)), + StorageOrderMatch = IsVectorAtCompileTime || ((PlainObjectType::Flags&RowMajorBit)==(Derived::Flags&RowMajorBit)), InnerStrideMatch = int(StrideType::InnerStrideAtCompileTime)==int(Dynamic) || int(StrideType::InnerStrideAtCompileTime)==int(Derived::InnerStrideAtCompileTime) || (int(StrideType::InnerStrideAtCompileTime)==0 && int(Derived::InnerStrideAtCompileTime)==1), - OuterStrideMatch = Derived::IsVectorAtCompileTime + OuterStrideMatch = IsVectorAtCompileTime || int(StrideType::OuterStrideAtCompileTime)==int(Dynamic) || int(StrideType::OuterStrideAtCompileTime)==int(Derived::OuterStrideAtCompileTime), // NOTE, this indirection of evaluator::Alignment is needed // to workaround a very strange bug in MSVC related to the instantiation diff --git a/Eigen/src/Core/SolveTriangular.h b/Eigen/src/Core/SolveTriangular.h index 4652e2e..fd0acb1 100644 --- a/Eigen/src/Core/SolveTriangular.h +++ b/Eigen/src/Core/SolveTriangular.h @@ -19,7 +19,7 @@ namespace internal { template struct triangular_solve_vector; -template +template struct triangular_solve_matrix; // small helper struct extracting some traits on the underlying solver operation @@ -98,8 +98,8 @@ struct triangular_solver_selector BlockingType blocking(rhs.rows(), rhs.cols(), size, 1, false); triangular_solve_matrix - ::run(size, othersize, &actualLhs.coeffRef(0,0), actualLhs.outerStride(), &rhs.coeffRef(0,0), rhs.outerStride(), blocking); + (Rhs::Flags&RowMajorBit) ? RowMajor : ColMajor, Rhs::InnerStrideAtCompileTime> + ::run(size, othersize, &actualLhs.coeffRef(0,0), actualLhs.outerStride(), &rhs.coeffRef(0,0), rhs.innerStride(), rhs.outerStride(), blocking); } }; diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h index 79b767b..960dc45 100644 --- a/Eigen/src/Core/Transpose.h +++ b/Eigen/src/Core/Transpose.h @@ -146,6 +146,8 @@ template class TransposeImpl { return derived().nestedExpression().coeffRef(index); } + protected: + EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(TransposeImpl) }; /** \returns an expression of the transpose of *this. diff --git a/Eigen/src/Core/TriangularMatrix.h b/Eigen/src/Core/TriangularMatrix.h index 667ef09..9abb7e3 100644 --- a/Eigen/src/Core/TriangularMatrix.h +++ b/Eigen/src/Core/TriangularMatrix.h @@ -217,9 +217,7 @@ template class TriangularView explicit inline TriangularView(MatrixType& matrix) : m_matrix(matrix) {} - using Base::operator=; - TriangularView& operator=(const TriangularView &other) - { return Base::operator=(other); } + EIGEN_INHERIT_ASSIGNMENT_OPERATORS(TriangularView) /** \copydoc EigenBase::rows() */ EIGEN_DEVICE_FUNC @@ -544,6 +542,10 @@ template class TriangularViewImpl<_Mat template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TriangularViewType& _assignProduct(const ProductType& prod, const Scalar& alpha, bool beta); + protected: + EIGEN_DEFAULT_COPY_CONSTRUCTOR(TriangularViewImpl) + EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(TriangularViewImpl) + }; /*************************************************************************** diff --git a/Eigen/src/Core/arch/AVX512/MathFunctions.h b/Eigen/src/Core/arch/AVX512/MathFunctions.h index 9c1717f..b259c1e 100644 --- a/Eigen/src/Core/arch/AVX512/MathFunctions.h +++ b/Eigen/src/Core/arch/AVX512/MathFunctions.h @@ -29,6 +29,7 @@ namespace internal { #define _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(NAME, X) \ const Packet8d p8d_##NAME = _mm512_castsi512_pd(_mm512_set1_epi64(X)) + // Natural logarithm // Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2) // and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can @@ -47,6 +48,7 @@ plog(const Packet16f& _x) { // The smallest non denormalized float number. _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(min_norm_pos, 0x00800000); _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(minus_inf, 0xff800000); + _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(pos_inf, 0x7f800000); _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(nan, 0x7fc00000); // Polynomial coefficients. @@ -64,11 +66,9 @@ plog(const Packet16f& _x) { _EIGEN_DECLARE_CONST_Packet16f(cephes_log_q2, 0.693359375f); // invalid_mask is set to true when x is NaN - __mmask16 invalid_mask = - _mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_NGE_UQ); - __mmask16 iszero_mask = - _mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_EQ_UQ); - + __mmask16 invalid_mask = _mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_NGE_UQ); + __mmask16 iszero_mask = _mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_EQ_OQ); + // Truncate input values to the minimum positive normal. x = pmax(x, p16f_min_norm_pos); @@ -118,11 +118,18 @@ plog(const Packet16f& _x) { x = padd(x, y); x = padd(x, y2); - // Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF. + __mmask16 pos_inf_mask = _mm512_cmp_ps_mask(_x,p16f_pos_inf,_CMP_EQ_OQ); + // Filter out invalid inputs, i.e.: + // - negative arg will be NAN, + // - 0 will be -INF. + // - +INF will be +INF return _mm512_mask_blend_ps(iszero_mask, - _mm512_mask_blend_ps(invalid_mask, x, p16f_nan), - p16f_minus_inf); + _mm512_mask_blend_ps(invalid_mask, + _mm512_mask_blend_ps(pos_inf_mask,x,p16f_pos_inf), + p16f_nan), + p16f_minus_inf); } + #endif // Exponential function. Works by writing "x = m*log(2) + r" where @@ -258,48 +265,39 @@ pexp(const Packet8d& _x) { template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f psqrt(const Packet16f& _x) { - _EIGEN_DECLARE_CONST_Packet16f(one_point_five, 1.5f); - _EIGEN_DECLARE_CONST_Packet16f(minus_half, -0.5f); - _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(flt_min, 0x00800000); + Packet16f neg_half = pmul(_x, pset1(-.5f)); + __mmask16 denormal_mask = _mm512_kand( + _mm512_cmp_ps_mask(_x, pset1((std::numeric_limits::min)()), + _CMP_LT_OQ), + _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_GE_OQ)); - Packet16f neg_half = pmul(_x, p16f_minus_half); - - // select only the inverse sqrt of positive normal inputs (denormals are - // flushed to zero and cause infs as well). - __mmask16 non_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_GE_OQ); - Packet16f x = _mm512_mask_blend_ps(non_zero_mask, _mm512_setzero_ps(), _mm512_rsqrt14_ps(_x)); + Packet16f x = _mm512_rsqrt14_ps(_x); // Do a single step of Newton's iteration. - x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five)); + x = pmul(x, pmadd(neg_half, pmul(x, x), pset1(1.5f))); - // Multiply the original _x by it's reciprocal square root to extract the - // square root. - return pmul(_x, x); + // Flush results for denormals to zero. + return _mm512_mask_blend_ps(denormal_mask, pmul(_x,x), _mm512_setzero_ps()); } template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d psqrt(const Packet8d& _x) { - _EIGEN_DECLARE_CONST_Packet8d(one_point_five, 1.5); - _EIGEN_DECLARE_CONST_Packet8d(minus_half, -0.5); - _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(dbl_min, 0x0010000000000000LL); + Packet8d neg_half = pmul(_x, pset1(-.5)); + __mmask16 denormal_mask = _mm512_kand( + _mm512_cmp_pd_mask(_x, pset1((std::numeric_limits::min)()), + _CMP_LT_OQ), + _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_GE_OQ)); - Packet8d neg_half = pmul(_x, p8d_minus_half); + Packet8d x = _mm512_rsqrt14_pd(_x); - // select only the inverse sqrt of positive normal inputs (denormals are - // flushed to zero and cause infs as well). - __mmask8 non_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_GE_OQ); - Packet8d x = _mm512_mask_blend_pd(non_zero_mask, _mm512_setzero_pd(), _mm512_rsqrt14_pd(_x)); - - // Do a first step of Newton's iteration. - x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five)); + // Do a single step of Newton's iteration. + x = pmul(x, pmadd(neg_half, pmul(x, x), pset1(1.5))); // Do a second step of Newton's iteration. - x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five)); + x = pmul(x, pmadd(neg_half, pmul(x, x), pset1(1.5))); - // Multiply the original _x by it's reciprocal square root to extract the - // square root. - return pmul(_x, x); + return _mm512_mask_blend_pd(denormal_mask, pmul(_x,x), _mm512_setzero_pd()); } #else template <> diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 5adddc7..000b776 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -19,10 +19,10 @@ namespace internal { #endif #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS -#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*)) +#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 #endif -#ifdef __FMA__ +#ifdef EIGEN_VECTORIZE_FMA #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD #endif @@ -54,13 +54,14 @@ template<> struct packet_traits : default_packet_traits AlignedOnScalar = 1, size = 16, HasHalfPacket = 1, -#if EIGEN_GNUC_AT_LEAST(5, 3) + HasBlend = 0, +#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT) #ifdef EIGEN_VECTORIZE_AVX512DQ HasLog = 1, #endif HasExp = 1, - HasSqrt = 1, - HasRsqrt = 1, + HasSqrt = EIGEN_FAST_MATH, + HasRsqrt = EIGEN_FAST_MATH, #endif HasDiv = 1 }; @@ -74,8 +75,8 @@ template<> struct packet_traits : default_packet_traits AlignedOnScalar = 1, size = 8, HasHalfPacket = 1, -#if EIGEN_GNUC_AT_LEAST(5, 3) - HasSqrt = 1, +#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT) + HasSqrt = EIGEN_FAST_MATH, HasRsqrt = EIGEN_FAST_MATH, #endif HasDiv = 1 @@ -98,6 +99,7 @@ template <> struct unpacket_traits { typedef float type; typedef Packet8f half; + typedef Packet16i integer_packet; enum { size = 16, alignment=Aligned64 }; }; template <> @@ -132,7 +134,7 @@ EIGEN_STRONG_INLINE Packet16f pload1(const float* from) { } template <> EIGEN_STRONG_INLINE Packet8d pload1(const double* from) { - return _mm512_broadcastsd_pd(_mm_load_pd1(from)); + return _mm512_set1_pd(*from); } template <> @@ -158,6 +160,11 @@ EIGEN_STRONG_INLINE Packet8d padd(const Packet8d& a, const Packet8d& b) { return _mm512_add_pd(a, b); } +template <> +EIGEN_STRONG_INLINE Packet16i padd(const Packet16i& a, + const Packet16i& b) { + return _mm512_add_epi32(a, b); +} template <> EIGEN_STRONG_INLINE Packet16f psub(const Packet16f& a, @@ -169,6 +176,11 @@ EIGEN_STRONG_INLINE Packet8d psub(const Packet8d& a, const Packet8d& b) { return _mm512_sub_pd(a, b); } +template <> +EIGEN_STRONG_INLINE Packet16i psub(const Packet16i& a, + const Packet16i& b) { + return _mm512_sub_epi32(a, b); +} template <> EIGEN_STRONG_INLINE Packet16f pnegate(const Packet16f& a) { @@ -202,6 +214,11 @@ EIGEN_STRONG_INLINE Packet8d pmul(const Packet8d& a, const Packet8d& b) { return _mm512_mul_pd(a, b); } +template <> +EIGEN_STRONG_INLINE Packet16i pmul(const Packet16i& a, + const Packet16i& b) { + return _mm512_mul_epi32(a, b); +} template <> EIGEN_STRONG_INLINE Packet16f pdiv(const Packet16f& a, @@ -214,7 +231,7 @@ EIGEN_STRONG_INLINE Packet8d pdiv(const Packet8d& a, return _mm512_div_pd(a, b); } -#ifdef __FMA__ +#ifdef EIGEN_VECTORIZE_FMA template <> EIGEN_STRONG_INLINE Packet16f pmadd(const Packet16f& a, const Packet16f& b, const Packet16f& c) { @@ -230,23 +247,73 @@ EIGEN_STRONG_INLINE Packet8d pmadd(const Packet8d& a, const Packet8d& b, template <> EIGEN_STRONG_INLINE Packet16f pmin(const Packet16f& a, const Packet16f& b) { - return _mm512_min_ps(a, b); + // Arguments are reversed to match NaN propagation behavior of std::min. + return _mm512_min_ps(b, a); } template <> EIGEN_STRONG_INLINE Packet8d pmin(const Packet8d& a, const Packet8d& b) { - return _mm512_min_pd(a, b); + // Arguments are reversed to match NaN propagation behavior of std::min. + return _mm512_min_pd(b, a); } template <> EIGEN_STRONG_INLINE Packet16f pmax(const Packet16f& a, const Packet16f& b) { - return _mm512_max_ps(a, b); + // Arguments are reversed to match NaN propagation behavior of std::max. + return _mm512_max_ps(b, a); } template <> EIGEN_STRONG_INLINE Packet8d pmax(const Packet8d& a, const Packet8d& b) { - return _mm512_max_pd(a, b); + // Arguments are reversed to match NaN propagation behavior of std::max. + return _mm512_max_pd(b, a); +} + +#ifdef EIGEN_VECTORIZE_AVX512DQ +template EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) { return _mm512_extractf32x8_ps(x,I_); } +template EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x) { return _mm512_extractf64x2_pd(x,I_); } +EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) { return _mm512_insertf32x8(_mm512_castps256_ps512(a),b,1); } +#else +// AVX512F does not define _mm512_extractf32x8_ps to extract _m256 from _m512 +template EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) { + return _mm256_castsi256_ps(_mm512_extracti64x4_epi64( _mm512_castps_si512(x),I_)); +} + +// AVX512F does not define _mm512_extractf64x2_pd to extract _m128 from _m512 +template EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x) { + return _mm_castsi128_pd(_mm512_extracti32x4_epi32( _mm512_castpd_si512(x),I_)); +} + +EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) { + return _mm512_castsi512_ps(_mm512_inserti64x4(_mm512_castsi256_si512(_mm256_castps_si256(a)), + _mm256_castps_si256(b),1)); +} +#endif + +// Helper function for bit packing snippet of low precision comparison. +// It packs the flags from 32x16 to 16x16. +EIGEN_STRONG_INLINE __m256i Pack32To16(Packet16f rf) { + // Split data into small pieces and handle with AVX instructions + // to guarantee internal order of vector. + // Operation: + // dst[15:0] := Saturate16(rf[31:0]) + // dst[31:16] := Saturate16(rf[63:32]) + // ... + // dst[255:240] := Saturate16(rf[255:224]) + __m256i lo = _mm256_castps_si256(extract256<0>(rf)); + __m256i hi = _mm256_castps_si256(extract256<1>(rf)); + __m128i result_lo = _mm_packs_epi32(_mm256_extractf128_si256(lo, 0), + _mm256_extractf128_si256(lo, 1)); + __m128i result_hi = _mm_packs_epi32(_mm256_extractf128_si256(hi, 0), + _mm256_extractf128_si256(hi, 1)); + return _mm256_insertf128_si256(_mm256_castsi128_si256(result_lo), result_hi, 1); +} + +template <> +EIGEN_STRONG_INLINE Packet16i pand(const Packet16i& a, + const Packet16i& b) { + return _mm512_and_si512(a,b); } template <> @@ -255,24 +322,7 @@ EIGEN_STRONG_INLINE Packet16f pand(const Packet16f& a, #ifdef EIGEN_VECTORIZE_AVX512DQ return _mm512_and_ps(a, b); #else - Packet16f res = _mm512_undefined_ps(); - Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0); - Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0); - res = _mm512_insertf32x4(res, _mm_and_ps(lane0_a, lane0_b), 0); - - Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1); - Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1); - res = _mm512_insertf32x4(res, _mm_and_ps(lane1_a, lane1_b), 1); - - Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2); - Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2); - res = _mm512_insertf32x4(res, _mm_and_ps(lane2_a, lane2_b), 2); - - Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3); - Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3); - res = _mm512_insertf32x4(res, _mm_and_ps(lane3_a, lane3_b), 3); - - return res; + return _mm512_castsi512_ps(pand(_mm512_castps_si512(a),_mm512_castps_si512(b))); #endif } template <> @@ -288,35 +338,21 @@ EIGEN_STRONG_INLINE Packet8d pand(const Packet8d& a, Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1); Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1); - res = _mm512_insertf64x4(res, _mm256_and_pd(lane1_a, lane1_b), 1); - - return res; + return _mm512_insertf64x4(res, _mm256_and_pd(lane1_a, lane1_b), 1); #endif } + template <> -EIGEN_STRONG_INLINE Packet16f por(const Packet16f& a, - const Packet16f& b) { +EIGEN_STRONG_INLINE Packet16i por(const Packet16i& a, const Packet16i& b) { + return _mm512_or_si512(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet16f por(const Packet16f& a, const Packet16f& b) { #ifdef EIGEN_VECTORIZE_AVX512DQ return _mm512_or_ps(a, b); #else - Packet16f res = _mm512_undefined_ps(); - Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0); - Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0); - res = _mm512_insertf32x4(res, _mm_or_ps(lane0_a, lane0_b), 0); - - Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1); - Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1); - res = _mm512_insertf32x4(res, _mm_or_ps(lane1_a, lane1_b), 1); - - Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2); - Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2); - res = _mm512_insertf32x4(res, _mm_or_ps(lane2_a, lane2_b), 2); - - Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3); - Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3); - res = _mm512_insertf32x4(res, _mm_or_ps(lane3_a, lane3_b), 3); - - return res; + return _mm512_castsi512_ps(por(_mm512_castps_si512(a),_mm512_castps_si512(b))); #endif } @@ -326,109 +362,67 @@ EIGEN_STRONG_INLINE Packet8d por(const Packet8d& a, #ifdef EIGEN_VECTORIZE_AVX512DQ return _mm512_or_pd(a, b); #else - Packet8d res = _mm512_undefined_pd(); - Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0); - Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0); - res = _mm512_insertf64x4(res, _mm256_or_pd(lane0_a, lane0_b), 0); - - Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1); - Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1); - res = _mm512_insertf64x4(res, _mm256_or_pd(lane1_a, lane1_b), 1); - - return res; + return _mm512_castsi512_pd(por(_mm512_castpd_si512(a),_mm512_castpd_si512(b))); #endif } template <> -EIGEN_STRONG_INLINE Packet16f pxor(const Packet16f& a, - const Packet16f& b) { +EIGEN_STRONG_INLINE Packet16i pxor(const Packet16i& a, const Packet16i& b) { + return _mm512_xor_si512(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet16f pxor(const Packet16f& a, const Packet16f& b) { #ifdef EIGEN_VECTORIZE_AVX512DQ return _mm512_xor_ps(a, b); #else - Packet16f res = _mm512_undefined_ps(); - Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0); - Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0); - res = _mm512_insertf32x4(res, _mm_xor_ps(lane0_a, lane0_b), 0); - - Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1); - Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1); - res = _mm512_insertf32x4(res, _mm_xor_ps(lane1_a, lane1_b), 1); - - Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2); - Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2); - res = _mm512_insertf32x4(res, _mm_xor_ps(lane2_a, lane2_b), 2); - - Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3); - Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3); - res = _mm512_insertf32x4(res, _mm_xor_ps(lane3_a, lane3_b), 3); - - return res; + return _mm512_castsi512_ps(pxor(_mm512_castps_si512(a),_mm512_castps_si512(b))); #endif } + template <> -EIGEN_STRONG_INLINE Packet8d pxor(const Packet8d& a, - const Packet8d& b) { +EIGEN_STRONG_INLINE Packet8d pxor(const Packet8d& a, const Packet8d& b) { #ifdef EIGEN_VECTORIZE_AVX512DQ return _mm512_xor_pd(a, b); #else - Packet8d res = _mm512_undefined_pd(); - Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0); - Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0); - res = _mm512_insertf64x4(res, _mm256_xor_pd(lane0_a, lane0_b), 0); - - Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1); - Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1); - res = _mm512_insertf64x4(res, _mm256_xor_pd(lane1_a, lane1_b), 1); - - return res; + return _mm512_castsi512_pd(pxor(_mm512_castpd_si512(a),_mm512_castpd_si512(b))); #endif } template <> -EIGEN_STRONG_INLINE Packet16f pandnot(const Packet16f& a, - const Packet16f& b) { +EIGEN_STRONG_INLINE Packet16i pandnot(const Packet16i& a, const Packet16i& b) { + return _mm512_andnot_si512(b, a); +} + +template <> +EIGEN_STRONG_INLINE Packet16f pandnot(const Packet16f& a, const Packet16f& b) { #ifdef EIGEN_VECTORIZE_AVX512DQ - return _mm512_andnot_ps(a, b); + return _mm512_andnot_ps(b, a); #else - Packet16f res = _mm512_undefined_ps(); - Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0); - Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0); - res = _mm512_insertf32x4(res, _mm_andnot_ps(lane0_a, lane0_b), 0); - - Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1); - Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1); - res = _mm512_insertf32x4(res, _mm_andnot_ps(lane1_a, lane1_b), 1); - - Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2); - Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2); - res = _mm512_insertf32x4(res, _mm_andnot_ps(lane2_a, lane2_b), 2); - - Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3); - Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3); - res = _mm512_insertf32x4(res, _mm_andnot_ps(lane3_a, lane3_b), 3); - - return res; + return _mm512_castsi512_ps(pandnot(_mm512_castps_si512(a),_mm512_castps_si512(b))); #endif } template <> -EIGEN_STRONG_INLINE Packet8d pandnot(const Packet8d& a, - const Packet8d& b) { +EIGEN_STRONG_INLINE Packet8d pandnot(const Packet8d& a,const Packet8d& b) { #ifdef EIGEN_VECTORIZE_AVX512DQ - return _mm512_andnot_pd(a, b); + return _mm512_andnot_pd(b, a); #else - Packet8d res = _mm512_undefined_pd(); - Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0); - Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0); - res = _mm512_insertf64x4(res, _mm256_andnot_pd(lane0_a, lane0_b), 0); - - Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1); - Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1); - res = _mm512_insertf64x4(res, _mm256_andnot_pd(lane1_a, lane1_b), 1); - - return res; + return _mm512_castsi512_pd(pandnot(_mm512_castpd_si512(a),_mm512_castpd_si512(b))); #endif } +template EIGEN_STRONG_INLINE Packet16i parithmetic_shift_right(Packet16i a) { + return _mm512_srai_epi32(a, N); +} + +template EIGEN_STRONG_INLINE Packet16i plogical_shift_right(Packet16i a) { + return _mm512_srli_epi32(a, N); +} + +template EIGEN_STRONG_INLINE Packet16i plogical_shift_left(Packet16i a) { + return _mm512_slli_epi32(a, N); +} + template <> EIGEN_STRONG_INLINE Packet16f pload(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_ps(from); @@ -461,75 +455,55 @@ EIGEN_STRONG_INLINE Packet16i ploadu(const int* from) { // {a0, a0 a1, a1, a2, a2, a3, a3, a4, a4, a5, a5, a6, a6, a7, a7} template <> EIGEN_STRONG_INLINE Packet16f ploaddup(const float* from) { - Packet8f lane0 = _mm256_broadcast_ps((const __m128*)(const void*)from); - // mimic an "inplace" permutation of the lower 128bits using a blend - lane0 = _mm256_blend_ps( - lane0, _mm256_castps128_ps256(_mm_permute_ps( - _mm256_castps256_ps128(lane0), _MM_SHUFFLE(1, 0, 1, 0))), - 15); - // then we can perform a consistent permutation on the global register to get - // everything in shape: - lane0 = _mm256_permute_ps(lane0, _MM_SHUFFLE(3, 3, 2, 2)); - - Packet8f lane1 = _mm256_broadcast_ps((const __m128*)(const void*)(from + 4)); - // mimic an "inplace" permutation of the lower 128bits using a blend - lane1 = _mm256_blend_ps( - lane1, _mm256_castps128_ps256(_mm_permute_ps( - _mm256_castps256_ps128(lane1), _MM_SHUFFLE(1, 0, 1, 0))), - 15); - // then we can perform a consistent permutation on the global register to get - // everything in shape: - lane1 = _mm256_permute_ps(lane1, _MM_SHUFFLE(3, 3, 2, 2)); + // an unaligned load is required here as there is no requirement + // on the alignment of input pointer 'from' + __m256i low_half = _mm256_loadu_si256(reinterpret_cast(from)); + __m512 even_elements = _mm512_castsi512_ps(_mm512_cvtepu32_epi64(low_half)); + __m512 pairs = _mm512_permute_ps(even_elements, _MM_SHUFFLE(2, 2, 0, 0)); + return pairs; +} #ifdef EIGEN_VECTORIZE_AVX512DQ - Packet16f res = _mm512_undefined_ps(); - return _mm512_insertf32x8(res, lane0, 0); - return _mm512_insertf32x8(res, lane1, 1); - return res; -#else - Packet16f res = _mm512_undefined_ps(); - res = _mm512_insertf32x4(res, _mm256_extractf128_ps(lane0, 0), 0); - res = _mm512_insertf32x4(res, _mm256_extractf128_ps(lane0, 1), 1); - res = _mm512_insertf32x4(res, _mm256_extractf128_ps(lane1, 0), 2); - res = _mm512_insertf32x4(res, _mm256_extractf128_ps(lane1, 1), 3); - return res; -#endif -} +// FIXME: this does not look optimal, better load a Packet4d and shuffle... // Loads 4 doubles from memory a returns the packet {a0, a0 a1, a1, a2, a2, a3, // a3} template <> EIGEN_STRONG_INLINE Packet8d ploaddup(const double* from) { - Packet4d lane0 = _mm256_broadcast_pd((const __m128d*)(const void*)from); - lane0 = _mm256_permute_pd(lane0, 3 << 2); - - Packet4d lane1 = _mm256_broadcast_pd((const __m128d*)(const void*)(from + 2)); - lane1 = _mm256_permute_pd(lane1, 3 << 2); - - Packet8d res = _mm512_undefined_pd(); - res = _mm512_insertf64x4(res, lane0, 0); - return _mm512_insertf64x4(res, lane1, 1); + __m512d x = _mm512_setzero_pd(); + x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[0]), 0); + x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[1]), 1); + x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[2]), 2); + x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[3]), 3); + return x; } +#else +template <> +EIGEN_STRONG_INLINE Packet8d ploaddup(const double* from) { + __m512d x = _mm512_setzero_pd(); + x = _mm512_mask_broadcastsd_pd(x, 0x3<<0, _mm_load_sd(from+0)); + x = _mm512_mask_broadcastsd_pd(x, 0x3<<2, _mm_load_sd(from+1)); + x = _mm512_mask_broadcastsd_pd(x, 0x3<<4, _mm_load_sd(from+2)); + x = _mm512_mask_broadcastsd_pd(x, 0x3<<6, _mm_load_sd(from+3)); + return x; +} +#endif // Loads 4 floats from memory a returns the packet // {a0, a0 a0, a0, a1, a1, a1, a1, a2, a2, a2, a2, a3, a3, a3, a3} template <> EIGEN_STRONG_INLINE Packet16f ploadquad(const float* from) { - Packet16f tmp = _mm512_undefined_ps(); - tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from), 0); - tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 1), 1); - tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 2), 2); - tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 3), 3); - return tmp; + Packet16f tmp = _mm512_castps128_ps512(ploadu(from)); + const Packet16i scatter_mask = _mm512_set_epi32(3,3,3,3, 2,2,2,2, 1,1,1,1, 0,0,0,0); + return _mm512_permutexvar_ps(scatter_mask, tmp); } + // Loads 2 doubles from memory a returns the packet // {a0, a0 a0, a0, a1, a1, a1, a1} template <> EIGEN_STRONG_INLINE Packet8d ploadquad(const double* from) { - Packet8d tmp = _mm512_undefined_pd(); - Packet2d tmp0 = _mm_load_pd1(from); - Packet2d tmp1 = _mm_load_pd1(from + 1); - Packet4d lane0 = _mm256_broadcastsd_pd(tmp0); - Packet4d lane1 = _mm256_broadcastsd_pd(tmp1); + __m256d lane0 = _mm256_set1_pd(*from); + __m256d lane1 = _mm256_set1_pd(*(from+1)); + __m512d tmp = _mm512_undefined_pd(); tmp = _mm512_insertf64x4(tmp, lane0, 0); return _mm512_insertf64x4(tmp, lane1, 1); } @@ -565,7 +539,7 @@ EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet16i& from) { template <> EIGEN_DEVICE_FUNC inline Packet16f pgather(const float* from, Index stride) { - Packet16i stride_vector = _mm512_set1_epi32(stride); + Packet16i stride_vector = _mm512_set1_epi32(convert_index(stride)); Packet16i stride_multiplier = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier); @@ -575,7 +549,7 @@ EIGEN_DEVICE_FUNC inline Packet16f pgather(const float* from, template <> EIGEN_DEVICE_FUNC inline Packet8d pgather(const double* from, Index stride) { - Packet8i stride_vector = _mm256_set1_epi32(stride); + Packet8i stride_vector = _mm256_set1_epi32(convert_index(stride)); Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier); @@ -586,7 +560,7 @@ template <> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet16f& from, Index stride) { - Packet16i stride_vector = _mm512_set1_epi32(stride); + Packet16i stride_vector = _mm512_set1_epi32(convert_index(stride)); Packet16i stride_multiplier = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier); @@ -596,7 +570,7 @@ template <> EIGEN_DEVICE_FUNC inline void pscatter(double* to, const Packet8d& from, Index stride) { - Packet8i stride_vector = _mm256_set1_epi32(stride); + Packet8i stride_vector = _mm256_set1_epi32(convert_index(stride)); Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier); _mm512_i32scatter_pd(to, indices, from, 8); @@ -660,8 +634,8 @@ EIGEN_STRONG_INLINE Packet8d pabs(const Packet8d& a) { #ifdef EIGEN_VECTORIZE_AVX512DQ // AVX512F does not define _mm512_extractf32x8_ps to extract _m256 from _m512 #define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT) \ - __m256 OUTPUT##_0 = _mm512_extractf32x8_ps(INPUT, 0) __m256 OUTPUT##_1 = \ - _mm512_extractf32x8_ps(INPUT, 1) + __m256 OUTPUT##_0 = _mm512_extractf32x8_ps(INPUT, 0); \ + __m256 OUTPUT##_1 = _mm512_extractf32x8_ps(INPUT, 1) #else #define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT) \ __m256 OUTPUT##_0 = _mm256_insertf128_ps( \ @@ -674,17 +648,136 @@ EIGEN_STRONG_INLINE Packet8d pabs(const Packet8d& a) { #ifdef EIGEN_VECTORIZE_AVX512DQ #define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \ - OUTPUT = _mm512_insertf32x8(OUTPUT, INPUTA, 0); \ - OUTPUT = _mm512_insertf32x8(OUTPUT, INPUTB, 1); + OUTPUT = _mm512_insertf32x8(_mm512_castps256_ps512(INPUTA), INPUTB, 1); #else #define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \ + OUTPUT = _mm512_undefined_ps(); \ OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 0), 0); \ OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 1), 1); \ OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 0), 2); \ OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 1), 3); #endif -template<> EIGEN_STRONG_INLINE Packet16f preduxp(const Packet16f* -vecs) + +template <> +EIGEN_STRONG_INLINE float predux(const Packet16f& a) { +#ifdef EIGEN_VECTORIZE_AVX512DQ + __m256 lane0 = _mm512_extractf32x8_ps(a, 0); + __m256 lane1 = _mm512_extractf32x8_ps(a, 1); + Packet8f x = _mm256_add_ps(lane0, lane1); + return predux(x); +#else + __m128 lane0 = _mm512_extractf32x4_ps(a, 0); + __m128 lane1 = _mm512_extractf32x4_ps(a, 1); + __m128 lane2 = _mm512_extractf32x4_ps(a, 2); + __m128 lane3 = _mm512_extractf32x4_ps(a, 3); + __m128 sum = _mm_add_ps(_mm_add_ps(lane0, lane1), _mm_add_ps(lane2, lane3)); + sum = _mm_hadd_ps(sum, sum); + sum = _mm_hadd_ps(sum, _mm_permute_ps(sum, 1)); + return _mm_cvtss_f32(sum); +#endif +} +template <> +EIGEN_STRONG_INLINE double predux(const Packet8d& a) { + __m256d lane0 = _mm512_extractf64x4_pd(a, 0); + __m256d lane1 = _mm512_extractf64x4_pd(a, 1); + __m256d sum = _mm256_add_pd(lane0, lane1); + __m256d tmp0 = _mm256_hadd_pd(sum, _mm256_permute2f128_pd(sum, sum, 1)); + return _mm_cvtsd_f64(_mm256_castpd256_pd128(_mm256_hadd_pd(tmp0, tmp0))); +} + +template <> +EIGEN_STRONG_INLINE Packet8f predux_downto4(const Packet16f& a) { +#ifdef EIGEN_VECTORIZE_AVX512DQ + Packet8f lane0 = _mm512_extractf32x8_ps(a, 0); + Packet8f lane1 = _mm512_extractf32x8_ps(a, 1); + return padd(lane0, lane1); +#else + Packet4f lane0 = _mm512_extractf32x4_ps(a, 0); + Packet4f lane1 = _mm512_extractf32x4_ps(a, 1); + Packet4f lane2 = _mm512_extractf32x4_ps(a, 2); + Packet4f lane3 = _mm512_extractf32x4_ps(a, 3); + Packet4f sum0 = padd(lane0, lane2); + Packet4f sum1 = padd(lane1, lane3); + return _mm256_insertf128_ps(_mm256_castps128_ps256(sum0), sum1, 1); +#endif +} +template <> +EIGEN_STRONG_INLINE Packet4d predux_downto4(const Packet8d& a) { + Packet4d lane0 = _mm512_extractf64x4_pd(a, 0); + Packet4d lane1 = _mm512_extractf64x4_pd(a, 1); + Packet4d res = padd(lane0, lane1); + return res; +} + +template <> +EIGEN_STRONG_INLINE float predux_mul(const Packet16f& a) { +//#ifdef EIGEN_VECTORIZE_AVX512DQ +#if 0 + Packet8f lane0 = _mm512_extractf32x8_ps(a, 0); + Packet8f lane1 = _mm512_extractf32x8_ps(a, 1); + Packet8f res = pmul(lane0, lane1); + res = pmul(res, _mm256_permute2f128_ps(res, res, 1)); + res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2))); + return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1)))); +#else + __m128 lane0 = _mm512_extractf32x4_ps(a, 0); + __m128 lane1 = _mm512_extractf32x4_ps(a, 1); + __m128 lane2 = _mm512_extractf32x4_ps(a, 2); + __m128 lane3 = _mm512_extractf32x4_ps(a, 3); + __m128 res = pmul(pmul(lane0, lane1), pmul(lane2, lane3)); + res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2))); + return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1)))); +#endif +} +template <> +EIGEN_STRONG_INLINE double predux_mul(const Packet8d& a) { + __m256d lane0 = _mm512_extractf64x4_pd(a, 0); + __m256d lane1 = _mm512_extractf64x4_pd(a, 1); + __m256d res = pmul(lane0, lane1); + res = pmul(res, _mm256_permute2f128_pd(res, res, 1)); + return pfirst(pmul(res, _mm256_shuffle_pd(res, res, 1))); +} + +template <> +EIGEN_STRONG_INLINE float predux_min(const Packet16f& a) { + __m128 lane0 = _mm512_extractf32x4_ps(a, 0); + __m128 lane1 = _mm512_extractf32x4_ps(a, 1); + __m128 lane2 = _mm512_extractf32x4_ps(a, 2); + __m128 lane3 = _mm512_extractf32x4_ps(a, 3); + __m128 res = _mm_min_ps(_mm_min_ps(lane0, lane1), _mm_min_ps(lane2, lane3)); + res = _mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2))); + return pfirst(_mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1)))); +} +template <> +EIGEN_STRONG_INLINE double predux_min(const Packet8d& a) { + __m256d lane0 = _mm512_extractf64x4_pd(a, 0); + __m256d lane1 = _mm512_extractf64x4_pd(a, 1); + __m256d res = _mm256_min_pd(lane0, lane1); + res = _mm256_min_pd(res, _mm256_permute2f128_pd(res, res, 1)); + return pfirst(_mm256_min_pd(res, _mm256_shuffle_pd(res, res, 1))); +} + +template <> +EIGEN_STRONG_INLINE float predux_max(const Packet16f& a) { + __m128 lane0 = _mm512_extractf32x4_ps(a, 0); + __m128 lane1 = _mm512_extractf32x4_ps(a, 1); + __m128 lane2 = _mm512_extractf32x4_ps(a, 2); + __m128 lane3 = _mm512_extractf32x4_ps(a, 3); + __m128 res = _mm_max_ps(_mm_max_ps(lane0, lane1), _mm_max_ps(lane2, lane3)); + res = _mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2))); + return pfirst(_mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1)))); +} + +template <> +EIGEN_STRONG_INLINE double predux_max(const Packet8d& a) { + __m256d lane0 = _mm512_extractf64x4_pd(a, 0); + __m256d lane1 = _mm512_extractf64x4_pd(a, 1); + __m256d res = _mm256_max_pd(lane0, lane1); + res = _mm256_max_pd(res, _mm256_permute2f128_pd(res, res, 1)); + return pfirst(_mm256_max_pd(res, _mm256_shuffle_pd(res, res, 1))); +} + +template<> EIGEN_STRONG_INLINE Packet16f preduxp(const Packet16f* vecs) { EIGEN_EXTRACT_8f_FROM_16f(vecs[0], vecs0); EIGEN_EXTRACT_8f_FROM_16f(vecs[1], vecs1); @@ -873,174 +966,7 @@ template<> EIGEN_STRONG_INLINE Packet8d preduxp(const Packet8d* vecs) return _mm512_insertf64x4(final_output, final_1, 1); } - -template <> -EIGEN_STRONG_INLINE float predux(const Packet16f& a) { - //#ifdef EIGEN_VECTORIZE_AVX512DQ -#if 0 - Packet8f lane0 = _mm512_extractf32x8_ps(a, 0); - Packet8f lane1 = _mm512_extractf32x8_ps(a, 1); - Packet8f sum = padd(lane0, lane1); - Packet8f tmp0 = _mm256_hadd_ps(sum, _mm256_permute2f128_ps(a, a, 1)); - tmp0 = _mm256_hadd_ps(tmp0, tmp0); - return pfirst(_mm256_hadd_ps(tmp0, tmp0)); -#else - Packet4f lane0 = _mm512_extractf32x4_ps(a, 0); - Packet4f lane1 = _mm512_extractf32x4_ps(a, 1); - Packet4f lane2 = _mm512_extractf32x4_ps(a, 2); - Packet4f lane3 = _mm512_extractf32x4_ps(a, 3); - Packet4f sum = padd(padd(lane0, lane1), padd(lane2, lane3)); - sum = _mm_hadd_ps(sum, sum); - sum = _mm_hadd_ps(sum, _mm_permute_ps(sum, 1)); - return pfirst(sum); -#endif -} -template <> -EIGEN_STRONG_INLINE double predux(const Packet8d& a) { - Packet4d lane0 = _mm512_extractf64x4_pd(a, 0); - Packet4d lane1 = _mm512_extractf64x4_pd(a, 1); - Packet4d sum = padd(lane0, lane1); - Packet4d tmp0 = _mm256_hadd_pd(sum, _mm256_permute2f128_pd(sum, sum, 1)); - return pfirst(_mm256_hadd_pd(tmp0, tmp0)); -} - -template <> -EIGEN_STRONG_INLINE Packet8f predux_downto4(const Packet16f& a) { -#ifdef EIGEN_VECTORIZE_AVX512DQ - Packet8f lane0 = _mm512_extractf32x8_ps(a, 0); - Packet8f lane1 = _mm512_extractf32x8_ps(a, 1); - return padd(lane0, lane1); -#else - Packet4f lane0 = _mm512_extractf32x4_ps(a, 0); - Packet4f lane1 = _mm512_extractf32x4_ps(a, 1); - Packet4f lane2 = _mm512_extractf32x4_ps(a, 2); - Packet4f lane3 = _mm512_extractf32x4_ps(a, 3); - Packet4f sum0 = padd(lane0, lane2); - Packet4f sum1 = padd(lane1, lane3); - return _mm256_insertf128_ps(_mm256_castps128_ps256(sum0), sum1, 1); -#endif -} -template <> -EIGEN_STRONG_INLINE Packet4d predux_downto4(const Packet8d& a) { - Packet4d lane0 = _mm512_extractf64x4_pd(a, 0); - Packet4d lane1 = _mm512_extractf64x4_pd(a, 1); - Packet4d res = padd(lane0, lane1); - return res; -} - -template <> -EIGEN_STRONG_INLINE float predux_mul(const Packet16f& a) { -//#ifdef EIGEN_VECTORIZE_AVX512DQ -#if 0 - Packet8f lane0 = _mm512_extractf32x8_ps(a, 0); - Packet8f lane1 = _mm512_extractf32x8_ps(a, 1); - Packet8f res = pmul(lane0, lane1); - res = pmul(res, _mm256_permute2f128_ps(res, res, 1)); - res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2))); - return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1)))); -#else - Packet4f lane0 = _mm512_extractf32x4_ps(a, 0); - Packet4f lane1 = _mm512_extractf32x4_ps(a, 1); - Packet4f lane2 = _mm512_extractf32x4_ps(a, 2); - Packet4f lane3 = _mm512_extractf32x4_ps(a, 3); - Packet4f res = pmul(pmul(lane0, lane1), pmul(lane2, lane3)); - res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2))); - return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1)))); -#endif -} -template <> -EIGEN_STRONG_INLINE double predux_mul(const Packet8d& a) { - Packet4d lane0 = _mm512_extractf64x4_pd(a, 0); - Packet4d lane1 = _mm512_extractf64x4_pd(a, 1); - Packet4d res = pmul(lane0, lane1); - res = pmul(res, _mm256_permute2f128_pd(res, res, 1)); - return pfirst(pmul(res, _mm256_shuffle_pd(res, res, 1))); -} - -template <> -EIGEN_STRONG_INLINE float predux_min(const Packet16f& a) { - Packet4f lane0 = _mm512_extractf32x4_ps(a, 0); - Packet4f lane1 = _mm512_extractf32x4_ps(a, 1); - Packet4f lane2 = _mm512_extractf32x4_ps(a, 2); - Packet4f lane3 = _mm512_extractf32x4_ps(a, 3); - Packet4f res = _mm_min_ps(_mm_min_ps(lane0, lane1), _mm_min_ps(lane2, lane3)); - res = _mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2))); - return pfirst(_mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1)))); -} -template <> -EIGEN_STRONG_INLINE double predux_min(const Packet8d& a) { - Packet4d lane0 = _mm512_extractf64x4_pd(a, 0); - Packet4d lane1 = _mm512_extractf64x4_pd(a, 1); - Packet4d res = _mm256_min_pd(lane0, lane1); - res = _mm256_min_pd(res, _mm256_permute2f128_pd(res, res, 1)); - return pfirst(_mm256_min_pd(res, _mm256_shuffle_pd(res, res, 1))); -} - -template <> -EIGEN_STRONG_INLINE float predux_max(const Packet16f& a) { - Packet4f lane0 = _mm512_extractf32x4_ps(a, 0); - Packet4f lane1 = _mm512_extractf32x4_ps(a, 1); - Packet4f lane2 = _mm512_extractf32x4_ps(a, 2); - Packet4f lane3 = _mm512_extractf32x4_ps(a, 3); - Packet4f res = _mm_max_ps(_mm_max_ps(lane0, lane1), _mm_max_ps(lane2, lane3)); - res = _mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2))); - return pfirst(_mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1)))); -} -template <> -EIGEN_STRONG_INLINE double predux_max(const Packet8d& a) { - Packet4d lane0 = _mm512_extractf64x4_pd(a, 0); - Packet4d lane1 = _mm512_extractf64x4_pd(a, 1); - Packet4d res = _mm256_max_pd(lane0, lane1); - res = _mm256_max_pd(res, _mm256_permute2f128_pd(res, res, 1)); - return pfirst(_mm256_max_pd(res, _mm256_shuffle_pd(res, res, 1))); -} - -template -struct palign_impl { - static EIGEN_STRONG_INLINE void run(Packet16f& first, - const Packet16f& second) { - if (Offset != 0) { - __m512i first_idx = _mm512_set_epi32( - Offset + 15, Offset + 14, Offset + 13, Offset + 12, Offset + 11, - Offset + 10, Offset + 9, Offset + 8, Offset + 7, Offset + 6, - Offset + 5, Offset + 4, Offset + 3, Offset + 2, Offset + 1, Offset); - - __m512i second_idx = - _mm512_set_epi32(Offset - 1, Offset - 2, Offset - 3, Offset - 4, - Offset - 5, Offset - 6, Offset - 7, Offset - 8, - Offset - 9, Offset - 10, Offset - 11, Offset - 12, - Offset - 13, Offset - 14, Offset - 15, Offset - 16); - - unsigned short mask = 0xFFFF; - mask <<= (16 - Offset); - - first = _mm512_permutexvar_ps(first_idx, first); - Packet16f tmp = _mm512_permutexvar_ps(second_idx, second); - first = _mm512_mask_blend_ps(mask, first, tmp); - } - } -}; -template -struct palign_impl { - static EIGEN_STRONG_INLINE void run(Packet8d& first, const Packet8d& second) { - if (Offset != 0) { - __m512i first_idx = _mm512_set_epi32( - 0, Offset + 7, 0, Offset + 6, 0, Offset + 5, 0, Offset + 4, 0, - Offset + 3, 0, Offset + 2, 0, Offset + 1, 0, Offset); - - __m512i second_idx = _mm512_set_epi32( - 0, Offset - 1, 0, Offset - 2, 0, Offset - 3, 0, Offset - 4, 0, - Offset - 5, 0, Offset - 6, 0, Offset - 7, 0, Offset - 8); - - unsigned char mask = 0xFF; - mask <<= (8 - Offset); - - first = _mm512_permutexvar_pd(first_idx, first); - Packet8d tmp = _mm512_permutexvar_pd(second_idx, second); - first = _mm512_mask_blend_pd(mask, first, tmp); - } - } -}; + #define PACK_OUTPUT(OUTPUT, INPUT, INDEX, STRIDE) \ @@ -1302,13 +1228,76 @@ EIGEN_STRONG_INLINE Packet16f pblend(const Selector<16>& /*ifPacket*/, return Packet16f(); } template <> -EIGEN_STRONG_INLINE Packet8d pblend(const Selector<8>& /*ifPacket*/, - const Packet8d& /*thenPacket*/, - const Packet8d& /*elsePacket*/) { - assert(false && "To be implemented"); - return Packet8d(); +EIGEN_STRONG_INLINE Packet8d pblend(const Selector<8>& ifPacket, + const Packet8d& thenPacket, + const Packet8d& elsePacket) { + __mmask8 m = (ifPacket.select[0] ) + | (ifPacket.select[1]<<1) + | (ifPacket.select[2]<<2) + | (ifPacket.select[3]<<3) + | (ifPacket.select[4]<<4) + | (ifPacket.select[5]<<5) + | (ifPacket.select[6]<<6) + | (ifPacket.select[7]<<7); + return _mm512_mask_blend_pd(m, elsePacket, thenPacket); } +template<> EIGEN_STRONG_INLINE Packet16i pcast(const Packet16f& a) { + return _mm512_cvttps_epi32(a); +} + +template<> EIGEN_STRONG_INLINE Packet16f pcast(const Packet16i& a) { + return _mm512_cvtepi32_ps(a); +} + +template +struct palign_impl { + static EIGEN_STRONG_INLINE void run(Packet16f& first, + const Packet16f& second) { + if (Offset != 0) { + __m512i first_idx = _mm512_set_epi32( + Offset + 15, Offset + 14, Offset + 13, Offset + 12, Offset + 11, + Offset + 10, Offset + 9, Offset + 8, Offset + 7, Offset + 6, + Offset + 5, Offset + 4, Offset + 3, Offset + 2, Offset + 1, Offset); + + __m512i second_idx = + _mm512_set_epi32(Offset - 1, Offset - 2, Offset - 3, Offset - 4, + Offset - 5, Offset - 6, Offset - 7, Offset - 8, + Offset - 9, Offset - 10, Offset - 11, Offset - 12, + Offset - 13, Offset - 14, Offset - 15, Offset - 16); + + unsigned short mask = 0xFFFF; + mask <<= (16 - Offset); + + first = _mm512_permutexvar_ps(first_idx, first); + Packet16f tmp = _mm512_permutexvar_ps(second_idx, second); + first = _mm512_mask_blend_ps(mask, first, tmp); + } + } +}; +template +struct palign_impl { + static EIGEN_STRONG_INLINE void run(Packet8d& first, const Packet8d& second) { + if (Offset != 0) { + __m512i first_idx = _mm512_set_epi32( + 0, Offset + 7, 0, Offset + 6, 0, Offset + 5, 0, Offset + 4, 0, + Offset + 3, 0, Offset + 2, 0, Offset + 1, 0, Offset); + + __m512i second_idx = _mm512_set_epi32( + 0, Offset - 1, 0, Offset - 2, 0, Offset - 3, 0, Offset - 4, 0, + Offset - 5, 0, Offset - 6, 0, Offset - 7, 0, Offset - 8); + + unsigned char mask = 0xFF; + mask <<= (8 - Offset); + + first = _mm512_permutexvar_pd(first_idx, first); + Packet8d tmp = _mm512_permutexvar_pd(second_idx, second); + first = _mm512_mask_blend_pd(mask, first, tmp); + } + } +}; + + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h index 755e620..59717b4 100644 --- a/Eigen/src/Core/arch/CUDA/Half.h +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -42,6 +42,7 @@ #define EIGEN_EXPLICIT_CAST(tgt_type) operator tgt_type() #endif +#include namespace Eigen { diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h index c66d384..f749c57 100644 --- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h @@ -230,7 +230,7 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux(const half2& #else float a1 = __low2float(a); float a2 = __high2float(a); - return Eigen::half(half_impl::raw_uint16_to_half(__float2half_rn(a1 + a2))); + return Eigen::half(__float2half_rn(a1 + a2)); #endif } @@ -264,7 +264,7 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul(const ha #else float a1 = __low2float(a); float a2 = __high2float(a); - return Eigen::half(half_impl::raw_uint16_to_half(__float2half_rn(a1 * a2))); + return Eigen::half(__float2half_rn(a1 * a2)); #endif } diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h index 2e6a00f..b56e7af 100644 --- a/Eigen/src/Core/functors/UnaryFunctors.h +++ b/Eigen/src/Core/functors/UnaryFunctors.h @@ -768,7 +768,7 @@ struct scalar_sign_op { if (aa==real_type(0)) return Scalar(0); aa = real_type(1)/aa; - return Scalar(real(a)*aa, imag(a)*aa ); + return Scalar(a.real()*aa, a.imag()*aa ); } //TODO //template diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index e3980f6..681451c 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -115,7 +115,8 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n // registers. However once the latency is hidden there is no point in // increasing the value of k, so we'll cap it at 320 (value determined // experimentally). - const Index k_cache = (numext::mini)((l1-ksub)/kdiv, 320); + // To avoid that k vanishes, we make k_cache at least as big as kr + const Index k_cache = numext::maxi(kr, (numext::mini)((l1-ksub)/kdiv, 320)); if (k_cache < k) { k = k_cache - (k_cache % kr); eigen_internal_assert(k > 0); @@ -648,8 +649,8 @@ public: // Vectorized path EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacketType& dest) const { - dest.first = pset1(real(*b)); - dest.second = pset1(imag(*b)); + dest.first = pset1(numext::real(*b)); + dest.second = pset1(numext::imag(*b)); } EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index 6440e1d..ed6234c 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -20,8 +20,9 @@ template class level3_blocking; template< typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, - typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs> -struct general_matrix_matrix_product + typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs, + int ResInnerStride> +struct general_matrix_matrix_product { typedef gebp_traits Traits; @@ -30,7 +31,7 @@ struct general_matrix_matrix_product& blocking, GemmParallelInfo* info = 0) @@ -39,8 +40,8 @@ struct general_matrix_matrix_product - ::run(cols,rows,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha,blocking,info); + ColMajor,ResInnerStride> + ::run(cols,rows,depth,rhs,rhsStride,lhs,lhsStride,res,resIncr,resStride,alpha,blocking,info); } }; @@ -49,8 +50,9 @@ struct general_matrix_matrix_product -struct general_matrix_matrix_product + typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs, + int ResInnerStride> +struct general_matrix_matrix_product { typedef gebp_traits Traits; @@ -59,17 +61,17 @@ typedef typename ScalarBinaryOpTraits::ReturnType ResScala static void run(Index rows, Index cols, Index depth, const LhsScalar* _lhs, Index lhsStride, const RhsScalar* _rhs, Index rhsStride, - ResScalar* _res, Index resStride, + ResScalar* _res, Index resIncr, Index resStride, ResScalar alpha, level3_blocking& blocking, GemmParallelInfo* info = 0) { typedef const_blas_data_mapper LhsMapper; typedef const_blas_data_mapper RhsMapper; - typedef blas_data_mapper ResMapper; - LhsMapper lhs(_lhs,lhsStride); - RhsMapper rhs(_rhs,rhsStride); - ResMapper res(_res, resStride); + typedef blas_data_mapper ResMapper; + LhsMapper lhs(_lhs, lhsStride); + RhsMapper rhs(_rhs, rhsStride); + ResMapper res(_res, resStride, resIncr); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction @@ -226,7 +228,7 @@ struct gemm_functor Gemm::run(rows, cols, m_lhs.cols(), &m_lhs.coeffRef(row,0), m_lhs.outerStride(), &m_rhs.coeffRef(0,col), m_rhs.outerStride(), - (Scalar*)&(m_dest.coeffRef(row,col)), m_dest.outerStride(), + (Scalar*)&(m_dest.coeffRef(row,col)), m_dest.innerStride(), m_dest.outerStride(), m_actualAlpha, m_blocking, info); } @@ -428,7 +430,7 @@ struct generic_product_impl static void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0) - lazyproduct::evalTo(dst, lhs, rhs); + lazyproduct::eval_dynamic(dst, lhs, rhs, internal::assign_op()); else { dst.setZero(); @@ -440,7 +442,7 @@ struct generic_product_impl static void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0) - lazyproduct::addTo(dst, lhs, rhs); + lazyproduct::eval_dynamic(dst, lhs, rhs, internal::add_assign_op()); else scaleAndAddTo(dst,lhs, rhs, Scalar(1)); } @@ -449,7 +451,7 @@ struct generic_product_impl static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0) - lazyproduct::subTo(dst, lhs, rhs); + lazyproduct::eval_dynamic(dst, lhs, rhs, internal::sub_assign_op()); else scaleAndAddTo(dst, lhs, rhs, Scalar(-1)); } @@ -476,7 +478,8 @@ struct generic_product_impl Index, LhsScalar, (ActualLhsTypeCleaned::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(LhsBlasTraits::NeedToConjugate), RhsScalar, (ActualRhsTypeCleaned::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(RhsBlasTraits::NeedToConjugate), - (Dest::Flags&RowMajorBit) ? RowMajor : ColMajor>, + (Dest::Flags&RowMajorBit) ? RowMajor : ColMajor, + Dest::InnerStrideAtCompileTime>, ActualLhsTypeCleaned, ActualRhsTypeCleaned, Dest, BlockingType> GemmFunctor; BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), 1, true); diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h index e844e37..d68d2f9 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h @@ -25,51 +25,54 @@ namespace internal { **********************************************************************/ // forward declarations (defined at the end of this file) -template +template struct tribb_kernel; /* Optimized matrix-matrix product evaluating only one triangular half */ template + int ResStorageOrder, int ResInnerStride, int UpLo, int Version = Specialized> struct general_matrix_matrix_triangular_product; // as usual if the result is row major => we transpose the product template -struct general_matrix_matrix_triangular_product + typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs, + int ResInnerStride, int UpLo, int Version> +struct general_matrix_matrix_triangular_product { typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* lhs, Index lhsStride, - const RhsScalar* rhs, Index rhsStride, ResScalar* res, Index resStride, + const RhsScalar* rhs, Index rhsStride, ResScalar* res, Index resIncr, Index resStride, const ResScalar& alpha, level3_blocking& blocking) { general_matrix_matrix_triangular_product - ::run(size,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha,blocking); + ColMajor, ResInnerStride, UpLo==Lower?Upper:Lower> + ::run(size,depth,rhs,rhsStride,lhs,lhsStride,res,resIncr,resStride,alpha,blocking); } }; template -struct general_matrix_matrix_triangular_product + typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs, + int ResInnerStride, int UpLo, int Version> +struct general_matrix_matrix_triangular_product { typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* _lhs, Index lhsStride, - const RhsScalar* _rhs, Index rhsStride, ResScalar* _res, Index resStride, + const RhsScalar* _rhs, Index rhsStride, + ResScalar* _res, Index resIncr, Index resStride, const ResScalar& alpha, level3_blocking& blocking) { typedef gebp_traits Traits; typedef const_blas_data_mapper LhsMapper; typedef const_blas_data_mapper RhsMapper; - typedef blas_data_mapper ResMapper; + typedef blas_data_mapper ResMapper; LhsMapper lhs(_lhs,lhsStride); RhsMapper rhs(_rhs,rhsStride); - ResMapper res(_res, resStride); + ResMapper res(_res, resStride, resIncr); Index kc = blocking.kc(); Index mc = (std::min)(size,blocking.mc()); @@ -87,7 +90,7 @@ struct general_matrix_matrix_triangular_product pack_lhs; gemm_pack_rhs pack_rhs; gebp_kernel gebp; - tribb_kernel sybb; + tribb_kernel sybb; for(Index k2=0; k2 +template struct tribb_kernel { typedef gebp_traits Traits; @@ -142,11 +144,13 @@ struct tribb_kernel enum { BlockSize = meta_least_common_multiple::ret }; - void operator()(ResScalar* _res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha) + void operator()(ResScalar* _res, Index resIncr, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha) { - typedef blas_data_mapper ResMapper; - ResMapper res(_res, resStride); - gebp_kernel gebp_kernel; + typedef blas_data_mapper ResMapper; + typedef blas_data_mapper BufferMapper; + ResMapper res(_res, resStride, resIncr); + gebp_kernel gebp_kernel1; + gebp_kernel gebp_kernel2; Matrix buffer((internal::constructor_without_unaligned_array_assert())); @@ -158,31 +162,32 @@ struct tribb_kernel const RhsScalar* actual_b = blockB+j*depth; if(UpLo==Upper) - gebp_kernel(res.getSubMapper(0, j), blockA, actual_b, j, depth, actualBlockSize, alpha, - -1, -1, 0, 0); - + gebp_kernel1(res.getSubMapper(0, j), blockA, actual_b, j, depth, actualBlockSize, alpha, + -1, -1, 0, 0); + // selfadjoint micro block { Index i = j; buffer.setZero(); // 1 - apply the kernel on the temporary buffer - gebp_kernel(ResMapper(buffer.data(), BlockSize), blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha, - -1, -1, 0, 0); + gebp_kernel2(BufferMapper(buffer.data(), BlockSize), blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha, + -1, -1, 0, 0); + // 2 - triangular accumulation for(Index j1=0; j1 internal::general_matrix_matrix_triangular_product + IsRowMajor ? RowMajor : ColMajor, MatrixType::InnerStrideAtCompileTime, UpLo&(Lower|Upper)> ::run(size, depth, &actualLhs.coeffRef(SkipDiag&&(UpLo&Lower)==Lower ? 1 : 0,0), actualLhs.outerStride(), &actualRhs.coeffRef(0,SkipDiag&&(UpLo&Upper)==Upper ? 1 : 0), actualRhs.outerStride(), - mat.data() + (SkipDiag ? (bool(IsRowMajor) != ((UpLo&Lower)==Lower) ? 1 : mat.outerStride() ) : 0), mat.outerStride(), actualAlpha, blocking); + mat.data() + (SkipDiag ? (bool(IsRowMajor) != ((UpLo&Lower)==Lower) ? mat.innerStride() : mat.outerStride() ) : 0), + mat.innerStride(), mat.outerStride(), actualAlpha, blocking); } }; diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h index f6f9ebe..691f95d 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h @@ -40,7 +40,7 @@ namespace internal { template struct general_matrix_matrix_rankupdate : general_matrix_matrix_triangular_product< - Index,Scalar,AStorageOrder,ConjugateA,Scalar,AStorageOrder,ConjugateA,ResStorageOrder,UpLo,BuiltIn> {}; + Index,Scalar,AStorageOrder,ConjugateA,Scalar,AStorageOrder,ConjugateA,ResStorageOrder,1,UpLo,BuiltIn> {}; // try to go to BLAS specialization @@ -48,9 +48,9 @@ struct general_matrix_matrix_rankupdate : template \ struct general_matrix_matrix_triangular_product { \ + Scalar,RhsStorageOrder,ConjugateRhs,ColMajor,1,UpLo,Specialized> { \ static EIGEN_STRONG_INLINE void run(Index size, Index depth,const Scalar* lhs, Index lhsStride, \ - const Scalar* rhs, Index rhsStride, Scalar* res, Index resStride, Scalar alpha, level3_blocking& blocking) \ + const Scalar* rhs, Index rhsStride, Scalar* res, Index resIncr, Index resStride, Scalar alpha, level3_blocking& blocking) \ { \ if ( lhs==rhs && ((UpLo&(Lower|Upper))==UpLo) ) { \ general_matrix_matrix_rankupdate \ @@ -59,8 +59,8 @@ struct general_matrix_matrix_triangular_product \ - ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha,blocking); \ + ColMajor, 1, UpLo, BuiltIn> \ + ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resIncr,resStride,alpha,blocking); \ } \ } \ }; diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h b/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h index b0f6b0d..71abf40 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h @@ -51,20 +51,22 @@ template< \ typename Index, \ int LhsStorageOrder, bool ConjugateLhs, \ int RhsStorageOrder, bool ConjugateRhs> \ -struct general_matrix_matrix_product \ +struct general_matrix_matrix_product \ { \ typedef gebp_traits Traits; \ \ static void run(Index rows, Index cols, Index depth, \ const EIGTYPE* _lhs, Index lhsStride, \ const EIGTYPE* _rhs, Index rhsStride, \ - EIGTYPE* res, Index resStride, \ + EIGTYPE* res, Index resIncr, Index resStride, \ EIGTYPE alpha, \ level3_blocking& /*blocking*/, \ GemmParallelInfo* /*info = 0*/) \ { \ using std::conj; \ \ + EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \ + eigen_assert(resIncr == 1); \ char transa, transb; \ BlasIndex m, n, k, lda, ldb, ldc; \ const EIGTYPE *a, *b; \ diff --git a/Eigen/src/Core/products/Parallelizer.h b/Eigen/src/Core/products/Parallelizer.h index c2f084c..67b2442 100644 --- a/Eigen/src/Core/products/Parallelizer.h +++ b/Eigen/src/Core/products/Parallelizer.h @@ -17,7 +17,8 @@ namespace internal { /** \internal */ inline void manage_multi_threading(Action action, int* v) { - static EIGEN_UNUSED int m_maxThreads = -1; + static int m_maxThreads = -1; + EIGEN_UNUSED_VARIABLE(m_maxThreads); if(action==SetAction) { @@ -131,7 +132,8 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth, ei_declare_aligned_stack_constructed_variable(GemmParallelInfo,info,threads,0); - #pragma omp parallel num_threads(threads) + int errorCount = 0; + #pragma omp parallel num_threads(threads) reduction(+: errorCount) { Index i = omp_get_thread_num(); // Note that the actual number of threads might be lower than the number of request ones. @@ -150,9 +152,14 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth, info[i].lhs_start = r0; info[i].lhs_length = actualBlockRows; - if(transpose) func(c0, actualBlockCols, 0, rows, info); - else func(0, rows, c0, actualBlockCols, info); + EIGEN_TRY { + if(transpose) func(c0, actualBlockCols, 0, rows, info); + else func(0, rows, c0, actualBlockCols, info); + } EIGEN_CATCH(...) { + ++errorCount; + } } + if (errorCount) EIGEN_THROW_X(Eigen::eigen_assert_exception()); #endif } diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h index da6f82a..04c9334 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h @@ -277,20 +277,21 @@ struct symm_pack_rhs template + int ResStorageOrder, int ResInnerStride> struct product_selfadjoint_matrix; template -struct product_selfadjoint_matrix + int RhsStorageOrder, bool RhsSelfAdjoint, bool ConjugateRhs, + int ResInnerStride> +struct product_selfadjoint_matrix { static EIGEN_STRONG_INLINE void run( Index rows, Index cols, const Scalar* lhs, Index lhsStride, const Scalar* rhs, Index rhsStride, - Scalar* res, Index resStride, + Scalar* res, Index resIncr, Index resStride, const Scalar& alpha, level3_blocking& blocking) { product_selfadjoint_matrix::IsComplex && EIGEN_LOGICAL_XOR(RhsSelfAdjoint,ConjugateRhs), EIGEN_LOGICAL_XOR(LhsSelfAdjoint,LhsStorageOrder==RowMajor) ? ColMajor : RowMajor, LhsSelfAdjoint, NumTraits::IsComplex && EIGEN_LOGICAL_XOR(LhsSelfAdjoint,ConjugateLhs), - ColMajor> - ::run(cols, rows, rhs, rhsStride, lhs, lhsStride, res, resStride, alpha, blocking); + ColMajor,ResInnerStride> + ::run(cols, rows, rhs, rhsStride, lhs, lhsStride, res, resIncr, resStride, alpha, blocking); } }; template -struct product_selfadjoint_matrix + int RhsStorageOrder, bool ConjugateRhs, + int ResInnerStride> +struct product_selfadjoint_matrix { static EIGEN_DONT_INLINE void run( Index rows, Index cols, const Scalar* _lhs, Index lhsStride, const Scalar* _rhs, Index rhsStride, - Scalar* res, Index resStride, + Scalar* res, Index resIncr, Index resStride, const Scalar& alpha, level3_blocking& blocking); }; template -EIGEN_DONT_INLINE void product_selfadjoint_matrix::run( + int RhsStorageOrder, bool ConjugateRhs, + int ResInnerStride> +EIGEN_DONT_INLINE void product_selfadjoint_matrix::run( Index rows, Index cols, const Scalar* _lhs, Index lhsStride, const Scalar* _rhs, Index rhsStride, - Scalar* _res, Index resStride, + Scalar* _res, Index resIncr, Index resStride, const Scalar& alpha, level3_blocking& blocking) { Index size = rows; @@ -334,11 +337,11 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix LhsMapper; typedef const_blas_data_mapper LhsTransposeMapper; typedef const_blas_data_mapper RhsMapper; - typedef blas_data_mapper ResMapper; + typedef blas_data_mapper ResMapper; LhsMapper lhs(_lhs,lhsStride); LhsTransposeMapper lhs_transpose(_lhs,lhsStride); RhsMapper rhs(_rhs,rhsStride); - ResMapper res(_res, resStride); + ResMapper res(_res, resStride, resIncr); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction @@ -398,26 +401,28 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix -struct product_selfadjoint_matrix + int RhsStorageOrder, bool ConjugateRhs, + int ResInnerStride> +struct product_selfadjoint_matrix { static EIGEN_DONT_INLINE void run( Index rows, Index cols, const Scalar* _lhs, Index lhsStride, const Scalar* _rhs, Index rhsStride, - Scalar* res, Index resStride, + Scalar* res, Index resIncr, Index resStride, const Scalar& alpha, level3_blocking& blocking); }; template -EIGEN_DONT_INLINE void product_selfadjoint_matrix::run( + int RhsStorageOrder, bool ConjugateRhs, + int ResInnerStride> +EIGEN_DONT_INLINE void product_selfadjoint_matrix::run( Index rows, Index cols, const Scalar* _lhs, Index lhsStride, const Scalar* _rhs, Index rhsStride, - Scalar* _res, Index resStride, + Scalar* _res, Index resIncr, Index resStride, const Scalar& alpha, level3_blocking& blocking) { Index size = cols; @@ -425,9 +430,9 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix Traits; typedef const_blas_data_mapper LhsMapper; - typedef blas_data_mapper ResMapper; + typedef blas_data_mapper ResMapper; LhsMapper lhs(_lhs,lhsStride); - ResMapper res(_res,resStride); + ResMapper res(_res,resStride, resIncr); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction @@ -503,12 +508,13 @@ struct selfadjoint_product_impl NumTraits::IsComplex && EIGEN_LOGICAL_XOR(LhsIsUpper,bool(LhsBlasTraits::NeedToConjugate)), EIGEN_LOGICAL_XOR(RhsIsUpper,internal::traits::Flags &RowMajorBit) ? RowMajor : ColMajor, RhsIsSelfAdjoint, NumTraits::IsComplex && EIGEN_LOGICAL_XOR(RhsIsUpper,bool(RhsBlasTraits::NeedToConjugate)), - internal::traits::Flags&RowMajorBit ? RowMajor : ColMajor> + internal::traits::Flags&RowMajorBit ? RowMajor : ColMajor, + Dest::InnerStrideAtCompileTime> ::run( lhs.rows(), rhs.cols(), // sizes &lhs.coeffRef(0,0), lhs.outerStride(), // lhs info &rhs.coeffRef(0,0), rhs.outerStride(), // rhs info - &dst.coeffRef(0,0), dst.outerStride(), // result info + &dst.coeffRef(0,0), dst.innerStride(), dst.outerStride(), // result info actualAlpha, blocking // alpha ); } diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h index 9a53185..61396db 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h @@ -44,16 +44,18 @@ namespace internal { template \ -struct product_selfadjoint_matrix \ +struct product_selfadjoint_matrix \ {\ \ static void run( \ Index rows, Index cols, \ const EIGTYPE* _lhs, Index lhsStride, \ const EIGTYPE* _rhs, Index rhsStride, \ - EIGTYPE* res, Index resStride, \ + EIGTYPE* res, Index resIncr, Index resStride, \ EIGTYPE alpha, level3_blocking& /*blocking*/) \ { \ + EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \ + eigen_assert(resIncr == 1); \ char side='L', uplo='L'; \ BlasIndex m, n, lda, ldb, ldc; \ const EIGTYPE *a, *b; \ @@ -91,15 +93,17 @@ struct product_selfadjoint_matrix \ -struct product_selfadjoint_matrix \ +struct product_selfadjoint_matrix \ {\ static void run( \ Index rows, Index cols, \ const EIGTYPE* _lhs, Index lhsStride, \ const EIGTYPE* _rhs, Index rhsStride, \ - EIGTYPE* res, Index resStride, \ + EIGTYPE* res, Index resIncr, Index resStride, \ EIGTYPE alpha, level3_blocking& /*blocking*/) \ { \ + EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \ + eigen_assert(resIncr == 1); \ char side='L', uplo='L'; \ BlasIndex m, n, lda, ldb, ldc; \ const EIGTYPE *a, *b; \ @@ -167,16 +171,18 @@ EIGEN_BLAS_HEMM_L(scomplex, float, cf, chemm_) template \ -struct product_selfadjoint_matrix \ +struct product_selfadjoint_matrix \ {\ \ static void run( \ Index rows, Index cols, \ const EIGTYPE* _lhs, Index lhsStride, \ const EIGTYPE* _rhs, Index rhsStride, \ - EIGTYPE* res, Index resStride, \ + EIGTYPE* res, Index resIncr, Index resStride, \ EIGTYPE alpha, level3_blocking& /*blocking*/) \ { \ + EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \ + eigen_assert(resIncr == 1); \ char side='R', uplo='L'; \ BlasIndex m, n, lda, ldb, ldc; \ const EIGTYPE *a, *b; \ @@ -213,15 +219,17 @@ struct product_selfadjoint_matrix \ -struct product_selfadjoint_matrix \ +struct product_selfadjoint_matrix \ {\ static void run( \ Index rows, Index cols, \ const EIGTYPE* _lhs, Index lhsStride, \ const EIGTYPE* _rhs, Index rhsStride, \ - EIGTYPE* res, Index resStride, \ + EIGTYPE* res, Index resIncr, Index resStride, \ EIGTYPE alpha, level3_blocking& /*blocking*/) \ { \ + EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \ + eigen_assert(resIncr == 1); \ char side='R', uplo='L'; \ BlasIndex m, n, lda, ldb, ldc; \ const EIGTYPE *a, *b; \ diff --git a/Eigen/src/Core/products/SelfadjointProduct.h b/Eigen/src/Core/products/SelfadjointProduct.h index f038d68..ef12c98 100644 --- a/Eigen/src/Core/products/SelfadjointProduct.h +++ b/Eigen/src/Core/products/SelfadjointProduct.h @@ -109,10 +109,10 @@ struct selfadjoint_product_selector internal::general_matrix_matrix_triangular_product::IsComplex, Scalar, OtherIsRowMajor ? ColMajor : RowMajor, (!OtherBlasTraits::NeedToConjugate) && NumTraits::IsComplex, - IsRowMajor ? RowMajor : ColMajor, UpLo> + IsRowMajor ? RowMajor : ColMajor, MatrixType::InnerStrideAtCompileTime, UpLo> ::run(size, depth, &actualOther.coeffRef(0,0), actualOther.outerStride(), &actualOther.coeffRef(0,0), actualOther.outerStride(), - mat.data(), mat.outerStride(), actualAlpha, blocking); + mat.data(), mat.innerStride(), mat.outerStride(), actualAlpha, blocking); } }; diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix.h b/Eigen/src/Core/products/TriangularMatrixMatrix.h index f784507..2fb408d 100644 --- a/Eigen/src/Core/products/TriangularMatrixMatrix.h +++ b/Eigen/src/Core/products/TriangularMatrixMatrix.h @@ -45,22 +45,24 @@ template + int ResStorageOrder, int ResInnerStride, + int Version = Specialized> struct product_triangular_matrix_matrix; template + int RhsStorageOrder, bool ConjugateRhs, + int ResInnerStride, int Version> struct product_triangular_matrix_matrix + RhsStorageOrder,ConjugateRhs,RowMajor,ResInnerStride,Version> { static EIGEN_STRONG_INLINE void run( Index rows, Index cols, Index depth, const Scalar* lhs, Index lhsStride, const Scalar* rhs, Index rhsStride, - Scalar* res, Index resStride, + Scalar* res, Index resIncr, Index resStride, const Scalar& alpha, level3_blocking& blocking) { product_triangular_matrix_matrix - ::run(cols, rows, depth, rhs, rhsStride, lhs, lhsStride, res, resStride, alpha, blocking); + ColMajor, ResInnerStride> + ::run(cols, rows, depth, rhs, rhsStride, lhs, lhsStride, res, resIncr, resStride, alpha, blocking); } }; // implements col-major += alpha * op(triangular) * op(general) template + int RhsStorageOrder, bool ConjugateRhs, + int ResInnerStride, int Version> struct product_triangular_matrix_matrix + RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride,Version> { typedef gebp_traits Traits; @@ -95,20 +98,21 @@ struct product_triangular_matrix_matrix& blocking); }; template + int RhsStorageOrder, bool ConjugateRhs, + int ResInnerStride, int Version> EIGEN_DONT_INLINE void product_triangular_matrix_matrix::run( + RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride,Version>::run( Index _rows, Index _cols, Index _depth, const Scalar* _lhs, Index lhsStride, const Scalar* _rhs, Index rhsStride, - Scalar* _res, Index resStride, + Scalar* _res, Index resIncr, Index resStride, const Scalar& alpha, level3_blocking& blocking) { // strip zeros @@ -119,10 +123,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix LhsMapper; typedef const_blas_data_mapper RhsMapper; - typedef blas_data_mapper ResMapper; + typedef blas_data_mapper ResMapper; LhsMapper lhs(_lhs,lhsStride); RhsMapper rhs(_rhs,rhsStride); - ResMapper res(_res, resStride); + ResMapper res(_res, resStride, resIncr); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction @@ -235,10 +239,11 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix + int RhsStorageOrder, bool ConjugateRhs, + int ResInnerStride, int Version> struct product_triangular_matrix_matrix + RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride,Version> { typedef gebp_traits Traits; enum { @@ -251,20 +256,21 @@ struct product_triangular_matrix_matrix& blocking); }; template + int RhsStorageOrder, bool ConjugateRhs, + int ResInnerStride, int Version> EIGEN_DONT_INLINE void product_triangular_matrix_matrix::run( + RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride,Version>::run( Index _rows, Index _cols, Index _depth, const Scalar* _lhs, Index lhsStride, const Scalar* _rhs, Index rhsStride, - Scalar* _res, Index resStride, + Scalar* _res, Index resIncr, Index resStride, const Scalar& alpha, level3_blocking& blocking) { const Index PacketBytes = packet_traits::size*sizeof(Scalar); @@ -276,10 +282,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix LhsMapper; typedef const_blas_data_mapper RhsMapper; - typedef blas_data_mapper ResMapper; + typedef blas_data_mapper ResMapper; LhsMapper lhs(_lhs,lhsStride); RhsMapper rhs(_rhs,rhsStride); - ResMapper res(_res, resStride); + ResMapper res(_res, resStride, resIncr); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction @@ -433,12 +439,12 @@ struct triangular_product_impl Mode, LhsIsTriangular, (internal::traits::Flags&RowMajorBit) ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate, (internal::traits::Flags&RowMajorBit) ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate, - (internal::traits::Flags&RowMajorBit) ? RowMajor : ColMajor> + (internal::traits::Flags&RowMajorBit) ? RowMajor : ColMajor, Dest::InnerStrideAtCompileTime> ::run( stripedRows, stripedCols, stripedDepth, // sizes &lhs.coeffRef(0,0), lhs.outerStride(), // lhs info &rhs.coeffRef(0,0), rhs.outerStride(), // rhs info - &dst.coeffRef(0,0), dst.outerStride(), // result info + &dst.coeffRef(0,0), dst.innerStride(), dst.outerStride(), // result info actualAlpha, blocking ); diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h b/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h index a25197a..a98d12e 100644 --- a/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +++ b/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h @@ -46,7 +46,7 @@ template {}; + RhsStorageOrder, ConjugateRhs, ResStorageOrder, 1, BuiltIn> {}; // try to go to BLAS specialization @@ -55,13 +55,15 @@ template \ struct product_triangular_matrix_matrix { \ + LhsStorageOrder,ConjugateLhs, RhsStorageOrder,ConjugateRhs,ColMajor,1,Specialized> { \ static inline void run(Index _rows, Index _cols, Index _depth, const Scalar* _lhs, Index lhsStride,\ - const Scalar* _rhs, Index rhsStride, Scalar* res, Index resStride, Scalar alpha, level3_blocking& blocking) { \ + const Scalar* _rhs, Index rhsStride, Scalar* res, Index resIncr, Index resStride, Scalar alpha, level3_blocking& blocking) { \ + EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \ + eigen_assert(resIncr == 1); \ product_triangular_matrix_matrix_trmm::run( \ - _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking); \ + _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking); \ } \ }; @@ -115,8 +117,8 @@ struct product_triangular_matrix_matrix_trmm::run( \ - _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking); \ + LhsStorageOrder,ConjugateLhs, RhsStorageOrder, ConjugateRhs, ColMajor, 1, BuiltIn>::run( \ + _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, 1, resStride, alpha, blocking); \ /*std::cout << "TRMM_L: A is not square! Go to Eigen TRMM implementation!\n";*/ \ } else { \ /* Make sense to call GEMM */ \ @@ -124,8 +126,8 @@ struct product_triangular_matrix_matrix_trmm(); \ BlasIndex aStride = convert_index(aa_tmp.outerStride()); \ gemm_blocking_space gemm_blocking(_rows,_cols,_depth, 1, true); \ - general_matrix_matrix_product::run( \ - rows, cols, depth, aa_tmp.data(), aStride, _rhs, rhsStride, res, resStride, alpha, gemm_blocking, 0); \ + general_matrix_matrix_product::run( \ + rows, cols, depth, aa_tmp.data(), aStride, _rhs, rhsStride, res, 1, resStride, alpha, gemm_blocking, 0); \ \ /*std::cout << "TRMM_L: A is not square! Go to BLAS GEMM implementation! " << nthr<<" \n";*/ \ } \ @@ -232,8 +234,8 @@ struct product_triangular_matrix_matrix_trmm::run( \ - _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking); \ + LhsStorageOrder,ConjugateLhs, RhsStorageOrder, ConjugateRhs, ColMajor, 1, BuiltIn>::run( \ + _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, 1, resStride, alpha, blocking); \ /*std::cout << "TRMM_R: A is not square! Go to Eigen TRMM implementation!\n";*/ \ } else { \ /* Make sense to call GEMM */ \ @@ -241,8 +243,8 @@ struct product_triangular_matrix_matrix_trmm(); \ BlasIndex aStride = convert_index(aa_tmp.outerStride()); \ gemm_blocking_space gemm_blocking(_rows,_cols,_depth, 1, true); \ - general_matrix_matrix_product::run( \ - rows, cols, depth, _lhs, lhsStride, aa_tmp.data(), aStride, res, resStride, alpha, gemm_blocking, 0); \ + general_matrix_matrix_product::run( \ + rows, cols, depth, _lhs, lhsStride, aa_tmp.data(), aStride, res, 1, resStride, alpha, gemm_blocking, 0); \ \ /*std::cout << "TRMM_R: A is not square! Go to BLAS GEMM implementation! " << nthr<<" \n";*/ \ } \ diff --git a/Eigen/src/Core/products/TriangularSolverMatrix.h b/Eigen/src/Core/products/TriangularSolverMatrix.h index 223c38b..e3ed2cd 100644 --- a/Eigen/src/Core/products/TriangularSolverMatrix.h +++ b/Eigen/src/Core/products/TriangularSolverMatrix.h @@ -15,48 +15,48 @@ namespace Eigen { namespace internal { // if the rhs is row major, let's transpose the product -template -struct triangular_solve_matrix +template +struct triangular_solve_matrix { static void run( Index size, Index cols, const Scalar* tri, Index triStride, - Scalar* _other, Index otherStride, + Scalar* _other, Index otherIncr, Index otherStride, level3_blocking& blocking) { triangular_solve_matrix< Scalar, Index, Side==OnTheLeft?OnTheRight:OnTheLeft, (Mode&UnitDiag) | ((Mode&Upper) ? Lower : Upper), NumTraits::IsComplex && Conjugate, - TriStorageOrder==RowMajor ? ColMajor : RowMajor, ColMajor> - ::run(size, cols, tri, triStride, _other, otherStride, blocking); + TriStorageOrder==RowMajor ? ColMajor : RowMajor, ColMajor, OtherInnerStride> + ::run(size, cols, tri, triStride, _other, otherIncr, otherStride, blocking); } }; /* Optimized triangular solver with multiple right hand side and the triangular matrix on the left */ -template -struct triangular_solve_matrix +template +struct triangular_solve_matrix { static EIGEN_DONT_INLINE void run( Index size, Index otherSize, const Scalar* _tri, Index triStride, - Scalar* _other, Index otherStride, + Scalar* _other, Index otherIncr, Index otherStride, level3_blocking& blocking); }; -template -EIGEN_DONT_INLINE void triangular_solve_matrix::run( +template +EIGEN_DONT_INLINE void triangular_solve_matrix::run( Index size, Index otherSize, const Scalar* _tri, Index triStride, - Scalar* _other, Index otherStride, + Scalar* _other, Index otherIncr, Index otherStride, level3_blocking& blocking) { Index cols = otherSize; typedef const_blas_data_mapper TriMapper; - typedef blas_data_mapper OtherMapper; + typedef blas_data_mapper OtherMapper; TriMapper tri(_tri, triStride); - OtherMapper other(_other, otherStride); + OtherMapper other(_other, otherStride, otherIncr); typedef gebp_traits Traits; @@ -128,19 +128,19 @@ EIGEN_DONT_INLINE void triangular_solve_matrix -struct triangular_solve_matrix +template +struct triangular_solve_matrix { static EIGEN_DONT_INLINE void run( Index size, Index otherSize, const Scalar* _tri, Index triStride, - Scalar* _other, Index otherStride, + Scalar* _other, Index otherIncr, Index otherStride, level3_blocking& blocking); }; -template -EIGEN_DONT_INLINE void triangular_solve_matrix::run( +template +EIGEN_DONT_INLINE void triangular_solve_matrix::run( Index size, Index otherSize, const Scalar* _tri, Index triStride, - Scalar* _other, Index otherStride, + Scalar* _other, Index otherIncr, Index otherStride, level3_blocking& blocking) { Index rows = otherSize; typedef typename NumTraits::Real RealScalar; - typedef blas_data_mapper LhsMapper; + typedef blas_data_mapper LhsMapper; typedef const_blas_data_mapper RhsMapper; - LhsMapper lhs(_other, otherStride); + LhsMapper lhs(_other, otherStride, otherIncr); RhsMapper rhs(_tri, triStride); typedef gebp_traits Traits; @@ -297,24 +297,24 @@ EIGEN_DONT_INLINE void triangular_solve_matrix \ -struct triangular_solve_matrix \ +struct triangular_solve_matrix \ { \ enum { \ IsLower = (Mode&Lower) == Lower, \ @@ -51,8 +51,10 @@ struct triangular_solve_matrix& /*blocking*/) \ + EIGTYPE* _other, Index otherIncr, Index otherStride, level3_blocking& /*blocking*/) \ { \ + EIGEN_ONLY_USED_FOR_DEBUG(otherIncr); \ + eigen_assert(otherIncr == 1); \ BlasIndex m = convert_index(size), n = convert_index(otherSize), lda, ldb; \ char side = 'L', uplo, diag='N', transa; \ /* Set alpha_ */ \ @@ -99,7 +101,7 @@ EIGEN_BLAS_TRSM_L(scomplex, float, ctrsm_) // implements RightSide general * op(triangular)^-1 #define EIGEN_BLAS_TRSM_R(EIGTYPE, BLASTYPE, BLASFUNC) \ template \ -struct triangular_solve_matrix \ +struct triangular_solve_matrix \ { \ enum { \ IsLower = (Mode&Lower) == Lower, \ @@ -110,8 +112,10 @@ struct triangular_solve_matrix& /*blocking*/) \ + EIGTYPE* _other, Index otherIncr, Index otherStride, level3_blocking& /*blocking*/) \ { \ + EIGEN_ONLY_USED_FOR_DEBUG(otherIncr); \ + eigen_assert(otherIncr == 1); \ BlasIndex m = convert_index(otherSize), n = convert_index(size), lda, ldb; \ char side = 'R', uplo, diag='N', transa; \ /* Set alpha_ */ \ diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h index 6e6ee11..3dff9bc 100644 --- a/Eigen/src/Core/util/BlasUtil.h +++ b/Eigen/src/Core/util/BlasUtil.h @@ -31,7 +31,7 @@ template< typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs, - int ResStorageOrder> + int ResStorageOrder, int ResInnerStride> struct general_matrix_matrix_product; template +class BlasLinearMapper; + template -class BlasLinearMapper { +class BlasLinearMapper { public: typedef typename packet_traits::type Packet; typedef typename packet_traits::half HalfPacket; - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data) : m_data(data) {} + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data, Index incr=1) + : m_data(data) + { + EIGEN_ONLY_USED_FOR_DEBUG(incr); + eigen_assert(incr==1); + } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void prefetch(int i) const { internal::prefetch(&operator()(i)); @@ -188,16 +196,25 @@ class BlasLinearMapper { }; // Lightweight helper class to access matrix coefficients. -template -class blas_data_mapper { - public: +template +class blas_data_mapper; + +template +class blas_data_mapper +{ +public: typedef typename packet_traits::type Packet; typedef typename packet_traits::half HalfPacket; typedef BlasLinearMapper LinearMapper; typedef BlasVectorMapper VectorMapper; - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride) : m_data(data), m_stride(stride) {} + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride, Index incr=1) + : m_data(data), m_stride(stride) + { + EIGEN_ONLY_USED_FOR_DEBUG(incr); + eigen_assert(incr==1); + } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE blas_data_mapper getSubMapper(Index i, Index j) const { @@ -251,6 +268,90 @@ class blas_data_mapper { const Index m_stride; }; +// Implementation of non-natural increment (i.e. inner-stride != 1) +// The exposed API is not complete yet compared to the Incr==1 case +// because some features makes less sense in this case. +template +class BlasLinearMapper +{ +public: + typedef typename packet_traits::type Packet; + typedef typename packet_traits::half HalfPacket; + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data,Index incr) : m_data(data), m_incr(incr) {} + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void prefetch(int i) const { + internal::prefetch(&operator()(i)); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar& operator()(Index i) const { + return m_data[i*m_incr.value()]; + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const { + return pgather(m_data + i*m_incr.value(), m_incr.value()); + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const PacketType &p) const { + pscatter(m_data + i*m_incr.value(), p, m_incr.value()); + } + +protected: + Scalar *m_data; + const internal::variable_if_dynamic m_incr; +}; + +template +class blas_data_mapper +{ +public: + typedef typename packet_traits::type Packet; + typedef typename packet_traits::half HalfPacket; + + typedef BlasLinearMapper LinearMapper; + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride, Index incr) : m_data(data), m_stride(stride), m_incr(incr) {} + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE blas_data_mapper + getSubMapper(Index i, Index j) const { + return blas_data_mapper(&operator()(i, j), m_stride, m_incr.value()); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const { + return LinearMapper(&operator()(i, j), m_incr.value()); + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Scalar& operator()(Index i, Index j) const { + return m_data[StorageOrder==RowMajor ? j*m_incr.value() + i*m_stride : i*m_incr.value() + j*m_stride]; + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const { + return pgather(&operator()(i, j),m_incr.value()); + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i, Index j) const { + return pgather(&operator()(i, j),m_incr.value()); + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, const SubPacket &p) const { + pscatter(&operator()(i, j), p, m_stride); + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE SubPacket gatherPacket(Index i, Index j) const { + return pgather(&operator()(i, j), m_stride); + } + +protected: + Scalar* EIGEN_RESTRICT m_data; + const Index m_stride; + const internal::variable_if_dynamic m_incr; +}; + // lightweight helper class to access matrix coefficients (const version) template class const_blas_data_mapper : public blas_data_mapper { diff --git a/Eigen/src/Core/util/DisableStupidWarnings.h b/Eigen/src/Core/util/DisableStupidWarnings.h index 351bd6c..74f74cc 100644 --- a/Eigen/src/Core/util/DisableStupidWarnings.h +++ b/Eigen/src/Core/util/DisableStupidWarnings.h @@ -57,7 +57,10 @@ #if __GNUC__>=6 #pragma GCC diagnostic ignored "-Wignored-attributes" #endif - + #if __GNUC__==7 + // See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89325 + #pragma GCC diagnostic ignored "-Wattributes" + #endif #endif #if defined __NVCC__ @@ -80,4 +83,12 @@ #pragma diag_suppress 2737 #endif +#else +// warnings already disabled: +# ifndef EIGEN_WARNINGS_DISABLED_2 +# define EIGEN_WARNINGS_DISABLED_2 +# elif defined(EIGEN_INTERNAL_DEBUGGING) +# error "Do not include \"DisableStupidWarnings.h\" recursively more than twice!" +# endif + #endif // not EIGEN_WARNINGS_DISABLED diff --git a/Eigen/src/Core/util/ForwardDeclarations.h b/Eigen/src/Core/util/ForwardDeclarations.h index ea10739..134544f 100644 --- a/Eigen/src/Core/util/ForwardDeclarations.h +++ b/Eigen/src/Core/util/ForwardDeclarations.h @@ -47,11 +47,7 @@ template struct NumTraits; template struct EigenBase; template class DenseBase; template class PlainObjectBase; - - -template::value > -class DenseCoeffsBase; +template class DenseCoeffsBase; templatex || (EIGEN_WORLD_VERSION>=x && \ (EIGEN_MAJOR_VERSION>y || (EIGEN_MAJOR_VERSION>=y && \ @@ -380,7 +380,8 @@ #if EIGEN_MAX_CPP_VER>=11 && \ ((defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901)) \ || (defined(__GNUC__) && defined(_GLIBCXX_USE_C99)) \ - || (defined(_LIBCPP_VERSION) && !defined(_MSC_VER))) + || (defined(_LIBCPP_VERSION) && !defined(_MSC_VER)) \ + || (EIGEN_COMP_MSVC >= 1900) ) #define EIGEN_HAS_C99_MATH 1 #else #define EIGEN_HAS_C99_MATH 0 @@ -396,6 +397,20 @@ #endif #endif +// Does the compiler support type_traits? +// - full support of type traits was added only to GCC 5.1.0. +// - 20150626 corresponds to the last release of 4.x libstdc++ +#ifndef EIGEN_HAS_TYPE_TRAITS +#if EIGEN_MAX_CPP_VER>=11 && (EIGEN_HAS_CXX11 || EIGEN_COMP_MSVC >= 1700) \ + && ((!EIGEN_COMP_GNUC_STRICT) || EIGEN_GNUC_AT_LEAST(5, 1)) \ + && ((!defined(__GLIBCXX__)) || __GLIBCXX__ > 20150626) +#define EIGEN_HAS_TYPE_TRAITS 1 +#define EIGEN_INCLUDE_TYPE_TRAITS +#else +#define EIGEN_HAS_TYPE_TRAITS 0 +#endif +#endif + // Does the compiler support variadic templates? #ifndef EIGEN_HAS_VARIADIC_TEMPLATES #if EIGEN_MAX_CPP_VER>=11 && (__cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900) \ @@ -835,11 +850,48 @@ namespace Eigen { #endif +/** + * \internal + * \brief Macro to explicitly define the default copy constructor. + * This is necessary, because the implicit definition is deprecated if the copy-assignment is overridden. + */ +#if EIGEN_HAS_CXX11 +#define EIGEN_DEFAULT_COPY_CONSTRUCTOR(CLASS) EIGEN_DEVICE_FUNC CLASS(const CLASS&) = default; +#else +#define EIGEN_DEFAULT_COPY_CONSTRUCTOR(CLASS) +#endif + + + /** \internal * \brief Macro to manually inherit assignment operators. * This is necessary, because the implicitly defined assignment operator gets deleted when a custom operator= is defined. + * With C++11 or later this also default-implements the copy-constructor */ -#define EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Derived) EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) +#define EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Derived) \ + EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \ + EIGEN_DEFAULT_COPY_CONSTRUCTOR(Derived) + +/** \internal + * \brief Macro to manually define default constructors and destructors. + * This is necessary when the copy constructor is re-defined. + * For empty helper classes this should usually be protected, to avoid accidentally creating empty objects. + * + * Hiding the default destructor lead to problems in C++03 mode together with boost::multiprecision + */ +#if EIGEN_HAS_CXX11 +#define EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(Derived) \ + EIGEN_DEVICE_FUNC Derived() = default; \ + EIGEN_DEVICE_FUNC ~Derived() = default; +#else +#define EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(Derived) \ + EIGEN_DEVICE_FUNC Derived() {}; \ + /* EIGEN_DEVICE_FUNC ~Derived() {}; */ +#endif + + + + /** * Just a side note. Commenting within defines works only by documenting diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h index d31e954..9b61ff0 100644 --- a/Eigen/src/Core/util/Meta.h +++ b/Eigen/src/Core/util/Meta.h @@ -97,6 +97,9 @@ template<> struct is_arithmetic { enum { value = true }; }; template<> struct is_arithmetic { enum { value = true }; }; template<> struct is_arithmetic { enum { value = true }; }; +#if EIGEN_HAS_CXX11 +using std::is_integral; +#else template struct is_integral { enum { value = false }; }; template<> struct is_integral { enum { value = true }; }; template<> struct is_integral { enum { value = true }; }; @@ -108,6 +111,11 @@ template<> struct is_integral { enum { value = true }; }; template<> struct is_integral { enum { value = true }; }; template<> struct is_integral { enum { value = true }; }; template<> struct is_integral { enum { value = true }; }; +#if EIGEN_COMP_MSVC +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral{ enum { value = true }; }; +#endif +#endif #if EIGEN_HAS_CXX11 using std::make_unsigned; @@ -531,4 +539,30 @@ bool not_equal_strict(const double& x,const double& y) { return std::not_equal_t } // end namespace Eigen +// Define portable (u)int{32,64} types +#if EIGEN_HAS_CXX11 +#include +namespace Eigen { +namespace numext { +typedef std::uint32_t uint32_t; +typedef std::int32_t int32_t; +typedef std::uint64_t uint64_t; +typedef std::int64_t int64_t; +} +} +#else +// Without c++11, all compilers able to compile Eigen also +// provides the C99 stdint.h header file. +#include +namespace Eigen { +namespace numext { +typedef ::uint32_t uint32_t; +typedef ::int32_t int32_t; +typedef ::uint64_t uint64_t; +typedef ::int64_t int64_t; +} +} +#endif + + #endif // EIGEN_META_H diff --git a/Eigen/src/Core/util/ReenableStupidWarnings.h b/Eigen/src/Core/util/ReenableStupidWarnings.h index ecc82b7..1ce6fd1 100644 --- a/Eigen/src/Core/util/ReenableStupidWarnings.h +++ b/Eigen/src/Core/util/ReenableStupidWarnings.h @@ -1,4 +1,8 @@ -#ifdef EIGEN_WARNINGS_DISABLED +#ifdef EIGEN_WARNINGS_DISABLED_2 +// "DisableStupidWarnings.h" was included twice recursively: Do not reenable warnings yet! +# undef EIGEN_WARNINGS_DISABLED_2 + +#elif defined(EIGEN_WARNINGS_DISABLED) #undef EIGEN_WARNINGS_DISABLED #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index ba5bd18..6bb4970 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -34,6 +34,20 @@ inline IndexDest convert_index(const IndexSrc& idx) { return IndexDest(idx); } +// true if T can be considered as an integral index (i.e., and integral type or enum) +template struct is_valid_index_type +{ + enum { value = +#if EIGEN_HAS_TYPE_TRAITS + internal::is_integral::value || std::is_enum::value +#elif EIGEN_COMP_MSVC + internal::is_integral::value || __is_enum(T) +#else + // without C++11, we use is_convertible to Index instead of is_integral in order to treat enums as Index. + internal::is_convertible::value && !internal::is_same::value && !is_same::value +#endif + }; +}; // promote_scalar_arg is an helper used in operation between an expression and a scalar, like: // expression * scalar @@ -90,6 +104,9 @@ class no_assignment_operator { private: no_assignment_operator& operator=(const no_assignment_operator&); + protected: + EIGEN_DEFAULT_COPY_CONSTRUCTOR(no_assignment_operator) + EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(no_assignment_operator) }; /** \internal return the index type with the largest number of bits */ diff --git a/Eigen/src/Eigenvalues/ComplexSchur.h b/Eigen/src/Eigenvalues/ComplexSchur.h index 7f38919..4354e40 100644 --- a/Eigen/src/Eigenvalues/ComplexSchur.h +++ b/Eigen/src/Eigenvalues/ComplexSchur.h @@ -300,10 +300,13 @@ typename ComplexSchur::ComplexScalar ComplexSchur::compu ComplexScalar trace = t.coeff(0,0) + t.coeff(1,1); ComplexScalar eival1 = (trace + disc) / RealScalar(2); ComplexScalar eival2 = (trace - disc) / RealScalar(2); - - if(numext::norm1(eival1) > numext::norm1(eival2)) + RealScalar eival1_norm = numext::norm1(eival1); + RealScalar eival2_norm = numext::norm1(eival2); + // A division by zero can only occur if eival1==eival2==0. + // In this case, det==0, and all we have to do is checking that eival2_norm!=0 + if(eival1_norm > eival2_norm) eival2 = det / eival1; - else + else if(eival2_norm!=RealScalar(0)) eival1 = det / eival2; // choose the eigenvalue closest to the bottom entry of the diagonal diff --git a/Eigen/src/Eigenvalues/RealSchur.h b/Eigen/src/Eigenvalues/RealSchur.h index 17ea903..9191519 100644 --- a/Eigen/src/Eigenvalues/RealSchur.h +++ b/Eigen/src/Eigenvalues/RealSchur.h @@ -236,7 +236,7 @@ template class RealSchur typedef Matrix Vector3s; Scalar computeNormOfT(); - Index findSmallSubdiagEntry(Index iu); + Index findSmallSubdiagEntry(Index iu, const Scalar& considerAsZero); void splitOffTwoRows(Index iu, bool computeU, const Scalar& exshift); void computeShift(Index iu, Index iter, Scalar& exshift, Vector3s& shiftInfo); void initFrancisQRStep(Index il, Index iu, const Vector3s& shiftInfo, Index& im, Vector3s& firstHouseholderVector); @@ -302,12 +302,16 @@ RealSchur& RealSchur::computeFromHessenberg(const HessMa Index totalIter = 0; // iteration count for whole matrix Scalar exshift(0); // sum of exceptional shifts Scalar norm = computeNormOfT(); + // sub-diagonal entries smaller than considerAsZero will be treated as zero. + // We use eps^2 to enable more precision in small eigenvalues. + Scalar considerAsZero = numext::maxi( norm * numext::abs2(NumTraits::epsilon()), + (std::numeric_limits::min)() ); if(norm!=Scalar(0)) { while (iu >= 0) { - Index il = findSmallSubdiagEntry(iu); + Index il = findSmallSubdiagEntry(iu,considerAsZero); // Check for convergence if (il == iu) // One root found @@ -364,14 +368,17 @@ inline typename MatrixType::Scalar RealSchur::computeNormOfT() /** \internal Look for single small sub-diagonal element and returns its index */ template -inline Index RealSchur::findSmallSubdiagEntry(Index iu) +inline Index RealSchur::findSmallSubdiagEntry(Index iu, const Scalar& considerAsZero) { using std::abs; Index res = iu; while (res > 0) { Scalar s = abs(m_matT.coeff(res-1,res-1)) + abs(m_matT.coeff(res,res)); - if (abs(m_matT.coeff(res,res-1)) <= NumTraits::epsilon() * s) + + s = numext::maxi(s * NumTraits::epsilon(), considerAsZero); + + if (abs(m_matT.coeff(res,res-1)) <= s) break; res--; } diff --git a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h index 9ddd553..d37656f 100644 --- a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +++ b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h @@ -605,7 +605,8 @@ template struct direct_selfadjoint_eigenvalues res, Ref representative) { - using std::abs; + EIGEN_USING_STD_MATH(sqrt) + EIGEN_USING_STD_MATH(abs) Index i0; // Find non-zero column i0 (by construction, there must exist a non zero coefficient on the diagonal): mat.diagonal().cwiseAbs().maxCoeff(&i0); @@ -616,8 +617,8 @@ template struct direct_selfadjoint_eigenvaluesn1) res = c0/std::sqrt(n0); - else res = c1/std::sqrt(n1); + if(n0>n1) res = c0/sqrt(n0); + else res = c1/sqrt(n1); return true; } diff --git a/Eigen/src/Geometry/Quaternion.h b/Eigen/src/Geometry/Quaternion.h index c3fd8c3..b818206 100644 --- a/Eigen/src/Geometry/Quaternion.h +++ b/Eigen/src/Geometry/Quaternion.h @@ -169,20 +169,38 @@ class QuaternionBase : public RotationBase /** return the result vector of \a v through the rotation*/ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Vector3 _transformVector(const Vector3& v) const; + #ifdef EIGEN_PARSED_BY_DOXYGEN /** \returns \c *this with scalar type casted to \a NewScalarType * * Note that if \a NewScalarType is equal to the current scalar type of \c *this * then this function smartly returns a const reference to \c *this. */ template - EIGEN_DEVICE_FUNC inline typename internal::cast_return_type >::type cast() const + EIGEN_DEVICE_FUNC inline typename internal::cast_return_type >::type cast() const; + + #else + + template + EIGEN_DEVICE_FUNC inline + typename internal::enable_if::value,const Derived&>::type cast() const { - return typename internal::cast_return_type >::type(derived()); + return derived(); } + template + EIGEN_DEVICE_FUNC inline + typename internal::enable_if::value,Quaternion >::type cast() const + { + return Quaternion(coeffs().template cast()); + } + #endif + #ifdef EIGEN_QUATERNIONBASE_PLUGIN # include EIGEN_QUATERNIONBASE_PLUGIN #endif +protected: + EIGEN_DEFAULT_COPY_CONSTRUCTOR(QuaternionBase) + EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(QuaternionBase) }; /*************************************************************************** diff --git a/Eigen/src/Geometry/Transform.h b/Eigen/src/Geometry/Transform.h index 3f31ee4..c21d9e5 100644 --- a/Eigen/src/Geometry/Transform.h +++ b/Eigen/src/Geometry/Transform.h @@ -252,11 +252,11 @@ protected: public: /** Default constructor without initialization of the meaningful coefficients. - * If Mode==Affine, then the last row is set to [0 ... 0 1] */ + * If Mode==Affine or Mode==Isometry, then the last row is set to [0 ... 0 1] */ EIGEN_DEVICE_FUNC inline Transform() { check_template_params(); - internal::transform_make_affine<(int(Mode)==Affine) ? Affine : AffineCompact>::run(m_matrix); + internal::transform_make_affine<(int(Mode)==Affine || int(Mode)==Isometry) ? Affine : AffineCompact>::run(m_matrix); } EIGEN_DEVICE_FUNC inline Transform(const Transform& other) diff --git a/Eigen/src/Geometry/Translation.h b/Eigen/src/Geometry/Translation.h index 51d9a82..0e99ce6 100644 --- a/Eigen/src/Geometry/Translation.h +++ b/Eigen/src/Geometry/Translation.h @@ -138,12 +138,6 @@ public: /** \returns the inverse translation (opposite) */ Translation inverse() const { return Translation(-m_coeffs); } - Translation& operator=(const Translation& other) - { - m_coeffs = other.m_coeffs; - return *this; - } - static const Translation Identity() { return Translation(VectorType::Zero()); } /** \returns \c *this with scalar type casted to \a NewScalarType diff --git a/Eigen/src/Geometry/Umeyama.h b/Eigen/src/Geometry/Umeyama.h index 7e933fc..6b75500 100644 --- a/Eigen/src/Geometry/Umeyama.h +++ b/Eigen/src/Geometry/Umeyama.h @@ -87,7 +87,7 @@ struct umeyama_transform_matrix_type * \f{align*} * T = \begin{bmatrix} c\mathbf{R} & \mathbf{t} \\ \mathbf{0} & 1 \end{bmatrix} * \f} -* minimizing the resudiual above. This transformation is always returned as an +* minimizing the residual above. This transformation is always returned as an * Eigen::Matrix. */ template diff --git a/Eigen/src/LU/PartialPivLU.h b/Eigen/src/LU/PartialPivLU.h index d439618..6b10f39 100644 --- a/Eigen/src/LU/PartialPivLU.h +++ b/Eigen/src/LU/PartialPivLU.h @@ -519,7 +519,10 @@ void PartialPivLU::compute() // the row permutation is stored as int indices, so just to be sure: eigen_assert(m_lu.rows()::highest()); - m_l1_norm = m_lu.cwiseAbs().colwise().sum().maxCoeff(); + if(m_lu.cols()>0) + m_l1_norm = m_lu.cwiseAbs().colwise().sum().maxCoeff(); + else + m_l1_norm = RealScalar(0); eigen_assert(m_lu.rows() == m_lu.cols() && "PartialPivLU is only for square (and moreover invertible) matrices"); const Index size = m_lu.rows(); diff --git a/Eigen/src/LU/arch/Inverse_SSE.h b/Eigen/src/LU/arch/Inverse_SSE.h index ebb64a6..4dce2ef 100644 --- a/Eigen/src/LU/arch/Inverse_SSE.h +++ b/Eigen/src/LU/arch/Inverse_SSE.h @@ -44,7 +44,7 @@ struct compute_inverse_size4 static void run(const MatrixType& mat, ResultType& result) { ActualMatrixType matrix(mat); - EIGEN_ALIGN16 const unsigned int _Sign_PNNP[4] = { 0x00000000, 0x80000000, 0x80000000, 0x00000000 }; + const Packet4f p4f_sign_PNNP = _mm_castsi128_ps(_mm_set_epi32(0x00000000, 0x80000000, 0x80000000, 0x00000000)); // Load the full matrix into registers __m128 _L1 = matrix.template packet( 0); @@ -139,7 +139,7 @@ struct compute_inverse_size4 iC = _mm_sub_ps(iC, _mm_mul_ps(_mm_shuffle_ps(A,A,0xB1), _mm_shuffle_ps(DC,DC,0x66))); rd = _mm_shuffle_ps(rd,rd,0); - rd = _mm_xor_ps(rd, _mm_load_ps((float*)_Sign_PNNP)); + rd = _mm_xor_ps(rd, p4f_sign_PNNP); // iB = C*|B| - D*B#*A iB = _mm_sub_ps(_mm_mul_ps(C,_mm_shuffle_ps(dB,dB,0)), iB); diff --git a/Eigen/src/PardisoSupport/PardisoSupport.h b/Eigen/src/PardisoSupport/PardisoSupport.h index 091c397..f8c7d07 100644 --- a/Eigen/src/PardisoSupport/PardisoSupport.h +++ b/Eigen/src/PardisoSupport/PardisoSupport.h @@ -192,7 +192,8 @@ class PardisoImpl : public SparseSolverBase void pardisoInit(int type) { m_type = type; - bool symmetric = std::abs(m_type) < 10; + EIGEN_USING_STD(abs); + bool symmetric = abs(m_type) < 10; m_iparm[0] = 1; // No solver default m_iparm[1] = 2; // use Metis for the ordering m_iparm[2] = 0; // Reserved. Set to zero. (??Numbers of processors, value of OMP_NUM_THREADS??) diff --git a/Eigen/src/SVD/BDCSVD.h b/Eigen/src/SVD/BDCSVD.h index 1134d66..a5b73f8 100644 --- a/Eigen/src/SVD/BDCSVD.h +++ b/Eigen/src/SVD/BDCSVD.h @@ -768,6 +768,21 @@ void BDCSVD::computeSingVals(const ArrayRef& col0, const ArrayRef& d // measure everything relative to shift Map diagShifted(m_workspace.data()+4*n, n); diagShifted = diag - shift; + + if(k!=actual_n-1) + { + // check that after the shift, f(mid) is still negative: + RealScalar midShifted = (right - left) / RealScalar(2); + if(shift==right) + midShifted = -midShifted; + RealScalar fMidShifted = secularEq(midShifted, col0, diag, perm, diagShifted, shift); + if(fMidShifted>0) + { + // fMid was erroneous, fix it: + shift = fMidShifted > Literal(0) ? left : right; + diagShifted = diag - shift; + } + } // initial guess RealScalar muPrev, muCur; @@ -845,11 +860,13 @@ void BDCSVD::computeSingVals(const ArrayRef& col0, const ArrayRef& d } RealScalar fLeft = secularEq(leftShifted, col0, diag, perm, diagShifted, shift); + eigen_internal_assert(fLeft::computeSingVals(const ArrayRef& col0, const ArrayRef& d } #endif eigen_internal_assert(fLeft * fRight < Literal(0)); - - while (rightShifted - leftShifted > Literal(2) * NumTraits::epsilon() * numext::maxi(abs(leftShifted), abs(rightShifted))) - { - RealScalar midShifted = (leftShifted + rightShifted) / Literal(2); - fMid = secularEq(midShifted, col0, diag, perm, diagShifted, shift); - if (fLeft * fMid < Literal(0)) - { - rightShifted = midShifted; - } - else - { - leftShifted = midShifted; - fLeft = fMid; - } - } - muCur = (leftShifted + rightShifted) / Literal(2); + if(fLeft Literal(2) * NumTraits::epsilon() * numext::maxi(abs(leftShifted), abs(rightShifted))) + { + RealScalar midShifted = (leftShifted + rightShifted) / Literal(2); + fMid = secularEq(midShifted, col0, diag, perm, diagShifted, shift); + eigen_internal_assert((numext::isfinite)(fMid)); + + if (fLeft * fMid < Literal(0)) + { + rightShifted = midShifted; + } + else + { + leftShifted = midShifted; + fLeft = fMid; + } + } + muCur = (leftShifted + rightShifted) / Literal(2); + } + else + { + // We have a problem as shifting on the left or right give either a positive or negative value + // at the middle of [left,right]... + // Instead fo abbording or entering an infinite loop, + // let's just use the middle as the estimated zero-crossing: + muCur = (right - left) * RealScalar(0.5); + if(shift == right) + muCur = -muCur; + } } singVals[k] = shift + muCur; @@ -924,7 +955,7 @@ void BDCSVD::perturbCol0 Index j = i 0.9 ) + if(i!=k && numext::abs(((singVals(j)+dk)*(mus(j)+(shifts(j)-dk)))/((diag(i)+dk)*(diag(i)-dk)) - 1) > 0.9 ) std::cout << " " << ((singVals(j)+dk)*(mus(j)+(shifts(j)-dk)))/((diag(i)+dk)*(diag(i)-dk)) << " == (" << (singVals(j)+dk) << " * " << (mus(j)+(shifts(j)-dk)) << ") / (" << (diag(i)+dk) << " * " << (diag(i)-dk) << ")\n"; #endif @@ -934,7 +965,7 @@ void BDCSVD::perturbCol0 std::cout << "zhat(" << k << ") = sqrt( " << prod << ") ; " << (singVals(last) + dk) << " * " << mus(last) + shifts(last) << " - " << dk << "\n"; #endif RealScalar tmp = sqrt(prod); - zhat(k) = col0(k) > Literal(0) ? tmp : -tmp; + zhat(k) = col0(k) > Literal(0) ? RealScalar(tmp) : RealScalar(-tmp); } } } diff --git a/Eigen/src/SVD/SVDBase.h b/Eigen/src/SVD/SVDBase.h index 3d1ef37..53da284 100644 --- a/Eigen/src/SVD/SVDBase.h +++ b/Eigen/src/SVD/SVDBase.h @@ -183,7 +183,7 @@ public: // this temporary is needed to workaround a MSVC issue Index diagSize = (std::max)(1,m_diagSize); return m_usePrescribedThreshold ? m_prescribedThreshold - : diagSize*NumTraits::epsilon(); + : RealScalar(diagSize)*NumTraits::epsilon(); } /** \returns true if \a U (full or thin) is asked for in this SVD decomposition */ diff --git a/Eigen/src/SparseCholesky/SimplicialCholesky.h b/Eigen/src/SparseCholesky/SimplicialCholesky.h index 2907f65..369e680 100644 --- a/Eigen/src/SparseCholesky/SimplicialCholesky.h +++ b/Eigen/src/SparseCholesky/SimplicialCholesky.h @@ -608,7 +608,7 @@ public: } if(Base::m_diag.size()>0) - dest = Base::m_diag.asDiagonal().inverse() * dest; + dest = Base::m_diag.real().asDiagonal().inverse() * dest; if (Base::m_matrix.nonZeros()>0) // otherwise I==I { diff --git a/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h b/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h index 31e0699..7b6183d 100644 --- a/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +++ b/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h @@ -156,7 +156,7 @@ void SimplicialCholeskyBase::factorize_preordered(const CholMatrixType& /* the nonzero entry L(k,i) */ Scalar l_ki; if(DoLDLT) - l_ki = yi / m_diag[i]; + l_ki = yi / numext::real(m_diag[i]); else yi = l_ki = yi / Lx[Lp[i]]; diff --git a/Eigen/src/SparseCore/AmbiVector.h b/Eigen/src/SparseCore/AmbiVector.h index e0295f2..2cb7747 100644 --- a/Eigen/src/SparseCore/AmbiVector.h +++ b/Eigen/src/SparseCore/AmbiVector.h @@ -28,7 +28,7 @@ class AmbiVector typedef typename NumTraits::Real RealScalar; explicit AmbiVector(Index size) - : m_buffer(0), m_zero(0), m_size(0), m_allocatedSize(0), m_allocatedElements(0), m_mode(-1) + : m_buffer(0), m_zero(0), m_size(0), m_end(0), m_allocatedSize(0), m_allocatedElements(0), m_mode(-1) { resize(size); } @@ -147,7 +147,8 @@ template void AmbiVector<_Scalar,_StorageIndex>::init(int mode) { m_mode = mode; - if (m_mode==IsSparse) + // This is only necessary in sparse mode, but we set these unconditionally to avoid some maybe-uninitialized warnings + // if (m_mode==IsSparse) { m_llSize = 0; m_llStart = -1; diff --git a/Eigen/src/SparseCore/SparseCwiseUnaryOp.h b/Eigen/src/SparseCore/SparseCwiseUnaryOp.h index ea79737..df6c28d 100644 --- a/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +++ b/Eigen/src/SparseCore/SparseCwiseUnaryOp.h @@ -49,6 +49,7 @@ template class unary_evaluator, IteratorBased>::InnerIterator : public unary_evaluator, IteratorBased>::EvalIterator { + protected: typedef typename XprType::Scalar Scalar; typedef typename unary_evaluator, IteratorBased>::EvalIterator Base; public: @@ -99,6 +100,7 @@ template class unary_evaluator, IteratorBased>::InnerIterator : public unary_evaluator, IteratorBased>::EvalIterator { + protected: typedef typename XprType::Scalar Scalar; typedef typename unary_evaluator, IteratorBased>::EvalIterator Base; public: diff --git a/Eigen/src/SparseCore/SparseMatrix.h b/Eigen/src/SparseCore/SparseMatrix.h index 0a2490b..a539653 100644 --- a/Eigen/src/SparseCore/SparseMatrix.h +++ b/Eigen/src/SparseCore/SparseMatrix.h @@ -327,7 +327,8 @@ class SparseMatrix m_outerIndex[j] = newOuterIndex[j]; m_innerNonZeros[j] = innerNNZ; } - m_outerIndex[m_outerSize] = m_outerIndex[m_outerSize-1] + m_innerNonZeros[m_outerSize-1] + reserveSizes[m_outerSize-1]; + if(m_outerSize>0) + m_outerIndex[m_outerSize] = m_outerIndex[m_outerSize-1] + m_innerNonZeros[m_outerSize-1] + reserveSizes[m_outerSize-1]; m_data.resize(m_outerIndex[m_outerSize]); } diff --git a/Eigen/src/SparseCore/SparseSelfAdjointView.h b/Eigen/src/SparseCore/SparseSelfAdjointView.h index 65611b3..76117a0 100644 --- a/Eigen/src/SparseCore/SparseSelfAdjointView.h +++ b/Eigen/src/SparseCore/SparseSelfAdjointView.h @@ -453,7 +453,7 @@ void permute_symm_to_fullsymm(const MatrixType& mat, SparseMatrix, IteratorBased> class InnerIterator : public EvalIterator { + protected: typedef typename XprType::Scalar Scalar; public: diff --git a/Eigen/src/SparseLU/SparseLU.h b/Eigen/src/SparseLU/SparseLU.h index 7104831..87f0efe 100644 --- a/Eigen/src/SparseLU/SparseLU.h +++ b/Eigen/src/SparseLU/SparseLU.h @@ -43,8 +43,8 @@ template struct SparseLUMatrixURetu * Simple example with key steps * \code * VectorXd x(n), b(n); - * SparseMatrix A; - * SparseLU, COLAMDOrdering > solver; + * SparseMatrix A; + * SparseLU, COLAMDOrdering > solver; * // fill A and b; * // Compute the ordering permutation vector from the structural pattern of A * solver.analyzePattern(A); diff --git a/Eigen/src/StlSupport/StdDeque.h b/Eigen/src/StlSupport/StdDeque.h index cf1fedf..af158f4 100644 --- a/Eigen/src/StlSupport/StdDeque.h +++ b/Eigen/src/StlSupport/StdDeque.h @@ -98,8 +98,10 @@ namespace std { { return deque_base::insert(position,x); } void insert(const_iterator position, size_type new_size, const value_type& x) { deque_base::insert(position, new_size, x); } -#elif defined(_GLIBCXX_DEQUE) && EIGEN_GNUC_AT_LEAST(4,2) +#elif defined(_GLIBCXX_DEQUE) && EIGEN_GNUC_AT_LEAST(4,2) && !EIGEN_GNUC_AT_LEAST(10, 1) // workaround GCC std::deque implementation + // GCC 10.1 doesn't let us access _Deque_impl _M_impl anymore and we have to + // fall-back to the default case void resize(size_type new_size, const value_type& x) { if (new_size < deque_base::size()) @@ -108,7 +110,7 @@ namespace std { deque_base::insert(deque_base::end(), new_size - deque_base::size(), x); } #else - // either GCC 4.1 or non-GCC + // either non-GCC or GCC between 4.1 and 10.1 // default implementation which should always work. void resize(size_type new_size, const value_type& x) {