diff --git a/Eigen/CholmodSupport b/Eigen/CholmodSupport index bed8924..1037bd5 100644 --- a/Eigen/CholmodSupport +++ b/Eigen/CholmodSupport @@ -22,7 +22,7 @@ extern "C" { * This module provides an interface to the Cholmod library which is part of the suitesparse package. * It provides the two following main factorization classes: * - class CholmodSupernodalLLT: a supernodal LLT Cholesky factorization. - * - class CholmodDecomposiiton: a general L(D)LT Cholesky factorization with automatic or explicit runtime selection of the underlying factorization method (supernodal or simplicial). + * - class CholmodDecomposition: a general L(D)LT Cholesky factorization with automatic or explicit runtime selection of the underlying factorization method (supernodal or simplicial). * * For the sake of completeness, this module also propose the two following classes: * - class CholmodSimplicialLLT diff --git a/Eigen/Core b/Eigen/Core index 5921e15..bb16c86 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -83,8 +83,8 @@ #include #include #include -#include #ifndef EIGEN_NO_IO + #include #include #endif #include @@ -109,7 +109,8 @@ #endif // required for __cpuid, needs to be included after cmath -#if EIGEN_COMP_MSVC && EIGEN_ARCH_i386_OR_x86_64 && !EIGEN_OS_WINCE +// also required for _BitScanReverse on Windows on ARM +#if EIGEN_COMP_MSVC && (EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM64) && !EIGEN_OS_WINCE #include #endif @@ -346,7 +347,7 @@ using std::ptrdiff_t; #include "src/Core/CoreIterators.h" #include "src/Core/ConditionEstimator.h" -#if defined(EIGEN_VECTORIZE_ALTIVEC) || defined(EIGEN_VECTORIZE_VSX) +#if defined(EIGEN_VECTORIZE_VSX) #include "src/Core/arch/AltiVec/MatrixProduct.h" #elif defined EIGEN_VECTORIZE_NEON #include "src/Core/arch/NEON/GeneralBlockPanelKernel.h" diff --git a/Eigen/SparseLU b/Eigen/SparseLU index 37c4a5c..047cf0d 100644 --- a/Eigen/SparseLU +++ b/Eigen/SparseLU @@ -25,8 +25,6 @@ #include "src/Core/util/DisableStupidWarnings.h" -#include "src/SparseLU/SparseLU_gemm_kernel.h" - #include "src/SparseLU/SparseLU_Structs.h" #include "src/SparseLU/SparseLU_SupernodalMatrix.h" #include "src/SparseLU/SparseLUImpl.h" diff --git a/Eigen/src/Core/ArithmeticSequence.h b/Eigen/src/Core/ArithmeticSequence.h index b6200fa..d04f726 100644 --- a/Eigen/src/Core/ArithmeticSequence.h +++ b/Eigen/src/Core/ArithmeticSequence.h @@ -172,7 +172,8 @@ seqN(FirstType first, SizeType size) { return ArithmeticSequence::type,typename internal::cleanup_index_type::type>(first,size); } -#ifdef EIGEN_PARSED_BY_DOXYGEN + +#if EIGEN_HAS_CXX11 /** \returns an ArithmeticSequence starting at \a f, up (or down) to \a l, and with positive (or negative) increment \a incr * @@ -183,24 +184,6 @@ seqN(FirstType first, SizeType size) { * * \sa seqN(FirstType,SizeType,IncrType), seq(FirstType,LastType) */ -template -auto seq(FirstType f, LastType l, IncrType incr); - -/** \returns an ArithmeticSequence starting at \a f, up (or down) to \a l, and unit increment - * - * It is essentially an alias to: - * \code - * seqN(f,l-f+1); - * \endcode - * - * \sa seqN(FirstType,SizeType), seq(FirstType,LastType,IncrType) - */ -template -auto seq(FirstType f, LastType l); - -#else // EIGEN_PARSED_BY_DOXYGEN - -#if EIGEN_HAS_CXX11 template auto seq(FirstType f, LastType l) -> decltype(seqN(typename internal::cleanup_index_type::type(f), ( typename internal::cleanup_index_type::type(l) @@ -211,6 +194,15 @@ auto seq(FirstType f, LastType l) -> decltype(seqN(typename internal::cleanup_in -typename 
internal::cleanup_index_type::type(f)+fix<1>())); } +/** \returns an ArithmeticSequence starting at \a f, up (or down) to \a l, and unit increment + * + * It is essentially an alias to: + * \code + * seqN(f,l-f+1); + * \endcode + * + * \sa seqN(FirstType,SizeType), seq(FirstType,LastType,IncrType) + */ template auto seq(FirstType f, LastType l, IncrType incr) -> decltype(seqN(typename internal::cleanup_index_type::type(f), @@ -317,26 +309,12 @@ seq(const symbolic::BaseExpr &f, const symbolic::BaseExpr)*incr, size, incr) \endcode - * - * \sa lastN(SizeType), seqN(FirstType,SizeType), seq(FirstType,LastType,IncrType) */ -template -auto lastN(SizeType size, IncrType incr) --> decltype(seqN(Eigen::last-(size-fix<1>())*incr, size, incr)) -{ - return seqN(Eigen::last-(size-fix<1>())*incr, size, incr); -} - +#if EIGEN_HAS_CXX11 /** \cpp11 * \returns a symbolic ArithmeticSequence representing the last \a size elements with a unit increment. * + * \anchor indexing_lastN + * * It is a shortcut for: \code seq(last+fix<1>-size, last) \endcode * * \sa lastN(SizeType,IncrType, seqN(FirstType,SizeType), seq(FirstType,LastType) */ @@ -346,6 +324,21 @@ auto lastN(SizeType size) { return seqN(Eigen::last+fix<1>()-size, size); } + +/** \cpp11 + * \returns a symbolic ArithmeticSequence representing the last \a size elements with increment \a incr. + * + * \anchor indexing_lastN_with_incr + * + * It is a shortcut for: \code seqN(last-(size-fix<1>)*incr, size, incr) \endcode + * + * \sa lastN(SizeType), seqN(FirstType,SizeType), seq(FirstType,LastType,IncrType) */ +template +auto lastN(SizeType size, IncrType incr) +-> decltype(seqN(Eigen::last-(size-fix<1>())*incr, size, incr)) +{ + return seqN(Eigen::last-(size-fix<1>())*incr, size, incr); +} #endif namespace internal { diff --git a/Eigen/src/Core/Array.h b/Eigen/src/Core/Array.h index 20c789b..6d50ea4 100644 --- a/Eigen/src/Core/Array.h +++ b/Eigen/src/Core/Array.h @@ -163,7 +163,15 @@ class Array #endif #if EIGEN_HAS_CXX11 - /** \copydoc PlainObjectBase(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) + /** \brief Construct a row of column vector with fixed size from an arbitrary number of coefficients. \cpp11 + * + * \only_for_vectors + * + * This constructor is for 1D array or vectors with more than 4 coefficients. + * There exists C++98 analogue constructors for fixed-size array/vector having 1, 2, 3, or 4 coefficients. + * + * \warning To construct a column (resp. row) vector of fixed length, the number of values passed to this + * constructor must match the the fixed number of rows (resp. columns) of \c *this. 
* * Example: \include Array_variadic_ctor_cxx11.cpp * Output: \verbinclude Array_variadic_ctor_cxx11.out diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h index 3206d66..9d89b60 100644 --- a/Eigen/src/Core/Block.h +++ b/Eigen/src/Core/Block.h @@ -260,19 +260,19 @@ template - inline PacketScalar packet(Index rowId, Index colId) const + EIGEN_DEVICE_FUNC inline PacketScalar packet(Index rowId, Index colId) const { return m_xpr.template packet(rowId + m_startRow.value(), colId + m_startCol.value()); } template - inline void writePacket(Index rowId, Index colId, const PacketScalar& val) + EIGEN_DEVICE_FUNC inline void writePacket(Index rowId, Index colId, const PacketScalar& val) { m_xpr.template writePacket(rowId + m_startRow.value(), colId + m_startCol.value(), val); } template - inline PacketScalar packet(Index index) const + EIGEN_DEVICE_FUNC inline PacketScalar packet(Index index) const { return m_xpr.template packet (m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index), @@ -280,7 +280,7 @@ template - inline void writePacket(Index index, const PacketScalar& val) + EIGEN_DEVICE_FUNC inline void writePacket(Index index, const PacketScalar& val) { m_xpr.template writePacket (m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index), @@ -334,6 +334,17 @@ class BlockImpl_dense enum { XprTypeIsRowMajor = (int(traits::Flags)&RowMajorBit) != 0 }; + + /** \internal Returns base+offset (unless base is null, in which case returns null). + * Adding an offset to nullptr is undefined behavior, so we must avoid it. + */ + template + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE + static Scalar* add_to_nullable_pointer(Scalar* base, Index offset) + { + return base != NULL ? base+offset : NULL; + } + public: typedef MapBase Base; @@ -344,8 +355,9 @@ class BlockImpl_dense */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockImpl_dense(XprType& xpr, Index i) - : Base(xpr.data() + i * ( ((BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) && (!XprTypeIsRowMajor)) - || ((BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) && ( XprTypeIsRowMajor)) ? xpr.innerStride() : xpr.outerStride()), + : Base((BlockRows == 0 || BlockCols == 0) ? NULL : add_to_nullable_pointer(xpr.data(), + i * ( ((BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) && (!XprTypeIsRowMajor)) + || ((BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) && ( XprTypeIsRowMajor)) ? xpr.innerStride() : xpr.outerStride())), BlockRows==1 ? 1 : xpr.rows(), BlockCols==1 ? 1 : xpr.cols()), m_xpr(xpr), @@ -359,7 +371,8 @@ class BlockImpl_dense */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockImpl_dense(XprType& xpr, Index startRow, Index startCol) - : Base(xpr.data()+xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol)), + : Base((BlockRows == 0 || BlockCols == 0) ? NULL : add_to_nullable_pointer(xpr.data(), + xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol))), m_xpr(xpr), m_startRow(startRow), m_startCol(startCol) { init(); @@ -371,7 +384,9 @@ class BlockImpl_dense BlockImpl_dense(XprType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols) - : Base(xpr.data()+xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol), blockRows, blockCols), + : Base((blockRows == 0 || blockCols == 0) ? 
NULL : add_to_nullable_pointer(xpr.data(), + xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol)), + blockRows, blockCols), m_xpr(xpr), m_startRow(startRow), m_startCol(startCol) { init(); diff --git a/Eigen/src/Core/BooleanRedux.h b/Eigen/src/Core/BooleanRedux.h index 852de8b..fa4d7c3 100644 --- a/Eigen/src/Core/BooleanRedux.h +++ b/Eigen/src/Core/BooleanRedux.h @@ -14,54 +14,56 @@ namespace Eigen { namespace internal { -template +template struct all_unroller { enum { - col = (UnrollCount-1) / Rows, - row = (UnrollCount-1) % Rows + IsRowMajor = (int(Derived::Flags) & int(RowMajor)), + i = (UnrollCount-1) / InnerSize, + j = (UnrollCount-1) % InnerSize }; EIGEN_DEVICE_FUNC static inline bool run(const Derived &mat) { - return all_unroller::run(mat) && mat.coeff(row, col); + return all_unroller::run(mat) && mat.coeff(IsRowMajor ? i : j, IsRowMajor ? j : i); } }; -template -struct all_unroller +template +struct all_unroller { EIGEN_DEVICE_FUNC static inline bool run(const Derived &/*mat*/) { return true; } }; -template -struct all_unroller +template +struct all_unroller { EIGEN_DEVICE_FUNC static inline bool run(const Derived &) { return false; } }; -template +template struct any_unroller { enum { - col = (UnrollCount-1) / Rows, - row = (UnrollCount-1) % Rows + IsRowMajor = (int(Derived::Flags) & int(RowMajor)), + i = (UnrollCount-1) / InnerSize, + j = (UnrollCount-1) % InnerSize }; - + EIGEN_DEVICE_FUNC static inline bool run(const Derived &mat) { - return any_unroller::run(mat) || mat.coeff(row, col); + return any_unroller::run(mat) || mat.coeff(IsRowMajor ? i : j, IsRowMajor ? j : i); } }; -template -struct any_unroller +template +struct any_unroller { EIGEN_DEVICE_FUNC static inline bool run(const Derived & /*mat*/) { return false; } }; -template -struct any_unroller +template +struct any_unroller { EIGEN_DEVICE_FUNC static inline bool run(const Derived &) { return false; } }; @@ -85,12 +87,12 @@ EIGEN_DEVICE_FUNC inline bool DenseBase::all() const }; Evaluator evaluator(derived()); if(unroll) - return internal::all_unroller::RowsAtCompileTime>::run(evaluator); + return internal::all_unroller::run(evaluator); else { - for(Index j = 0; j < cols(); ++j) - for(Index i = 0; i < rows(); ++i) - if (!evaluator.coeff(i, j)) return false; + for(Index i = 0; i < derived().outerSize(); ++i) + for(Index j = 0; j < derived().innerSize(); ++j) + if (!evaluator.coeff(IsRowMajor ? i : j, IsRowMajor ? j : i)) return false; return true; } } @@ -109,12 +111,12 @@ EIGEN_DEVICE_FUNC inline bool DenseBase::any() const }; Evaluator evaluator(derived()); if(unroll) - return internal::any_unroller::RowsAtCompileTime>::run(evaluator); + return internal::any_unroller::run(evaluator); else { - for(Index j = 0; j < cols(); ++j) - for(Index i = 0; i < rows(); ++i) - if (evaluator.coeff(i, j)) return true; + for(Index i = 0; i < derived().outerSize(); ++i) + for(Index j = 0; j < derived().innerSize(); ++j) + if (evaluator.coeff(IsRowMajor ? i : j, IsRowMajor ? 
j : i)) return true; return false; } } @@ -156,7 +158,7 @@ inline bool DenseBase::allFinite() const return !((derived()-derived()).hasNaN()); #endif } - + } // end namespace Eigen #endif // EIGEN_ALLANDANY_H diff --git a/Eigen/src/Core/CwiseNullaryOp.h b/Eigen/src/Core/CwiseNullaryOp.h index 289ec51..ba07e71 100644 --- a/Eigen/src/Core/CwiseNullaryOp.h +++ b/Eigen/src/Core/CwiseNullaryOp.h @@ -292,7 +292,7 @@ DenseBase::LinSpaced(Index size, const Scalar& low, const Scalar& high) } /** - * \copydoc DenseBase::LinSpaced(Index, const Scalar&, const Scalar&) + * \copydoc DenseBase::LinSpaced(Index, const DenseBase::Scalar&, const DenseBase::Scalar&) * Special version for fixed size types which does not require the size parameter. */ template diff --git a/Eigen/src/Core/DenseBase.h b/Eigen/src/Core/DenseBase.h index 9b16db6..cdd0f5f 100644 --- a/Eigen/src/Core/DenseBase.h +++ b/Eigen/src/Core/DenseBase.h @@ -324,9 +324,9 @@ template class DenseBase typedef Transpose TransposeReturnType; EIGEN_DEVICE_FUNC TransposeReturnType transpose(); - typedef typename internal::add_const >::type ConstTransposeReturnType; + typedef Transpose ConstTransposeReturnType; EIGEN_DEVICE_FUNC - ConstTransposeReturnType transpose() const; + const ConstTransposeReturnType transpose() const; EIGEN_DEVICE_FUNC void transposeInPlace(); diff --git a/Eigen/src/Core/Diagonal.h b/Eigen/src/Core/Diagonal.h index 3112d2c..ad5bccd 100644 --- a/Eigen/src/Core/Diagonal.h +++ b/Eigen/src/Core/Diagonal.h @@ -191,7 +191,8 @@ MatrixBase::diagonal() /** This is the const version of diagonal(). */ template -EIGEN_DEVICE_FUNC inline typename MatrixBase::ConstDiagonalReturnType +EIGEN_DEVICE_FUNC inline +const typename MatrixBase::ConstDiagonalReturnType MatrixBase::diagonal() const { return ConstDiagonalReturnType(derived()); @@ -209,18 +210,18 @@ MatrixBase::diagonal() const * * \sa MatrixBase::diagonal(), class Diagonal */ template -EIGEN_DEVICE_FUNC inline typename MatrixBase::DiagonalDynamicIndexReturnType +EIGEN_DEVICE_FUNC inline Diagonal MatrixBase::diagonal(Index index) { - return DiagonalDynamicIndexReturnType(derived(), index); + return Diagonal(derived(), index); } /** This is the const version of diagonal(Index). */ template -EIGEN_DEVICE_FUNC inline typename MatrixBase::ConstDiagonalDynamicIndexReturnType +EIGEN_DEVICE_FUNC inline const Diagonal MatrixBase::diagonal(Index index) const { - return ConstDiagonalDynamicIndexReturnType(derived(), index); + return Diagonal(derived(), index); } /** \returns an expression of the \a DiagIndex-th sub or super diagonal of the matrix \c *this @@ -237,20 +238,20 @@ MatrixBase::diagonal(Index index) const template template EIGEN_DEVICE_FUNC -inline typename MatrixBase::template DiagonalIndexReturnType::Type +inline Diagonal MatrixBase::diagonal() { - return typename DiagonalIndexReturnType::Type(derived()); + return Diagonal(derived()); } /** This is the const version of diagonal(). */ template template EIGEN_DEVICE_FUNC -inline typename MatrixBase::template ConstDiagonalIndexReturnType::Type +inline const Diagonal MatrixBase::diagonal() const { - return typename ConstDiagonalIndexReturnType::Type(derived()); + return Diagonal(derived()); } } // end namespace Eigen diff --git a/Eigen/src/Core/Dot.h b/Eigen/src/Core/Dot.h index 5c3441b..abac7ad 100644 --- a/Eigen/src/Core/Dot.h +++ b/Eigen/src/Core/Dot.h @@ -18,14 +18,9 @@ namespace internal { // with mismatched types, the compiler emits errors about failing to instantiate cwiseProduct BEFORE // looking at the static assertions. 
Thus this is a trick to get better compile errors. template + bool NeedToTranspose = T::IsVectorAtCompileTime && U::IsVectorAtCompileTime && + ((int(T::RowsAtCompileTime) == 1 && int(U::ColsAtCompileTime) == 1) || + (int(T::ColsAtCompileTime) == 1 && int(U::RowsAtCompileTime) == 1))> struct dot_nocheck { typedef scalar_conj_product_op::Scalar,typename traits::Scalar> conj_prod; diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index cf677a1..af45f39 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -160,7 +160,7 @@ struct eigen_packet_wrapper { EIGEN_ALWAYS_INLINE operator T&() { return m_val; } EIGEN_ALWAYS_INLINE operator const T&() const { return m_val; } - EIGEN_ALWAYS_INLINE eigen_packet_wrapper() {} + EIGEN_ALWAYS_INLINE eigen_packet_wrapper() {}; EIGEN_ALWAYS_INLINE eigen_packet_wrapper(const T &v) : m_val(v) {} EIGEN_ALWAYS_INLINE eigen_packet_wrapper& operator=(const T &v) { m_val = v; @@ -258,7 +258,7 @@ struct ptrue_impl { // uses a comparison to zero, so this should still work in most cases. We don't // have another option, since the scalar type requires initialization. template -struct ptrue_impl::value && NumTraits::RequireInitialization>::type > { static EIGEN_DEVICE_FUNC inline T run(const T& /*a*/){ return T(1); @@ -356,16 +356,16 @@ struct bytewise_bitwise_helper { EIGEN_DEVICE_FUNC static inline T bitwise_and(const T& a, const T& b) { return binary(a, b, bit_and()); } - EIGEN_DEVICE_FUNC static inline T bitwise_or(const T& a, const T& b) { + EIGEN_DEVICE_FUNC static inline T bitwise_or(const T& a, const T& b) { return binary(a, b, bit_or()); } EIGEN_DEVICE_FUNC static inline T bitwise_xor(const T& a, const T& b) { return binary(a, b, bit_xor()); } - EIGEN_DEVICE_FUNC static inline T bitwise_not(const T& a) { + EIGEN_DEVICE_FUNC static inline T bitwise_not(const T& a) { return unary(a,bit_not()); } - + private: template EIGEN_DEVICE_FUNC static inline T unary(const T& a, Op op) { @@ -440,7 +440,7 @@ struct pselect_impl { // For scalars, use ternary select. template -struct pselect_impl::value>::type > { static EIGEN_DEVICE_FUNC inline Packet run(const Packet& mask, const Packet& a, const Packet& b) { return numext::equal_strict(mask, Packet(0)) ? 
b : a; @@ -807,7 +807,7 @@ Packet plog10(const Packet& a) { EIGEN_USING_STD(log10); return log10(a); } template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog2(const Packet& a) { typedef typename internal::unpacket_traits::type Scalar; - return pmul(pset1(Scalar(EIGEN_LOG2E)), plog(a)); + return pmul(pset1(Scalar(EIGEN_LOG2E)), plog(a)); } /** \internal \returns the square-root of \a a (coeff-wise) */ @@ -881,7 +881,7 @@ predux(const Packet& a) template EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux_mul( const Packet& a) { - typedef typename unpacket_traits::type Scalar; + typedef typename unpacket_traits::type Scalar; return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmul))); } @@ -889,14 +889,14 @@ EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux_mul( template EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux_min( const Packet &a) { - typedef typename unpacket_traits::type Scalar; + typedef typename unpacket_traits::type Scalar; return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmin))); } template EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux_min( const Packet& a) { - typedef typename unpacket_traits::type Scalar; + typedef typename unpacket_traits::type Scalar; return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmin))); } @@ -904,14 +904,14 @@ EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux_min( template EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux_max( const Packet &a) { - typedef typename unpacket_traits::type Scalar; + typedef typename unpacket_traits::type Scalar; return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmax))); } template EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux_max( const Packet& a) { - typedef typename unpacket_traits::type Scalar; + typedef typename unpacket_traits::type Scalar; return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmax))); } diff --git a/Eigen/src/Core/IndexedView.h b/Eigen/src/Core/IndexedView.h index 0847625..05c2bc9 100644 --- a/Eigen/src/Core/IndexedView.h +++ b/Eigen/src/Core/IndexedView.h @@ -122,10 +122,10 @@ public: {} /** \returns number of rows */ - Index rows() const { return internal::size(m_rowIndices); } + Index rows() const { return internal::index_list_size(m_rowIndices); } /** \returns number of columns */ - Index cols() const { return internal::size(m_colIndices); } + Index cols() const { return internal::index_list_size(m_colIndices); } /** \returns the nested expression */ const typename internal::remove_all::type& @@ -189,12 +189,16 @@ struct unary_evaluator, IndexBased> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const { + eigen_assert(m_xpr.rowIndices()[row] >= 0 && m_xpr.rowIndices()[row] < m_xpr.nestedExpression().rows() + && m_xpr.colIndices()[col] >= 0 && m_xpr.colIndices()[col] < m_xpr.nestedExpression().cols()); return m_argImpl.coeff(m_xpr.rowIndices()[row], m_xpr.colIndices()[col]); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col) { + eigen_assert(m_xpr.rowIndices()[row] >= 0 && m_xpr.rowIndices()[row] < m_xpr.nestedExpression().rows() + && m_xpr.colIndices()[col] >= 0 && m_xpr.colIndices()[col] < m_xpr.nestedExpression().cols()); return m_argImpl.coeffRef(m_xpr.rowIndices()[row], m_xpr.colIndices()[col]); } @@ -204,6 +208,8 @@ struct unary_evaluator, IndexBased> EIGEN_STATIC_ASSERT_LVALUE(XprType) Index row = XprType::RowsAtCompileTime == 1 ? 
0 : index; Index col = XprType::RowsAtCompileTime == 1 ? index : 0; + eigen_assert(m_xpr.rowIndices()[row] >= 0 && m_xpr.rowIndices()[row] < m_xpr.nestedExpression().rows() + && m_xpr.colIndices()[col] >= 0 && m_xpr.colIndices()[col] < m_xpr.nestedExpression().cols()); return m_argImpl.coeffRef( m_xpr.rowIndices()[row], m_xpr.colIndices()[col]); } @@ -212,6 +218,8 @@ struct unary_evaluator, IndexBased> { Index row = XprType::RowsAtCompileTime == 1 ? 0 : index; Index col = XprType::RowsAtCompileTime == 1 ? index : 0; + eigen_assert(m_xpr.rowIndices()[row] >= 0 && m_xpr.rowIndices()[row] < m_xpr.nestedExpression().rows() + && m_xpr.colIndices()[col] >= 0 && m_xpr.colIndices()[col] < m_xpr.nestedExpression().cols()); return m_argImpl.coeffRef( m_xpr.rowIndices()[row], m_xpr.colIndices()[col]); } @@ -220,6 +228,8 @@ struct unary_evaluator, IndexBased> { Index row = XprType::RowsAtCompileTime == 1 ? 0 : index; Index col = XprType::RowsAtCompileTime == 1 ? index : 0; + eigen_assert(m_xpr.rowIndices()[row] >= 0 && m_xpr.rowIndices()[row] < m_xpr.nestedExpression().rows() + && m_xpr.colIndices()[col] >= 0 && m_xpr.colIndices()[col] < m_xpr.nestedExpression().cols()); return m_argImpl.coeff( m_xpr.rowIndices()[row], m_xpr.colIndices()[col]); } diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 61b78f4..764c41c 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -588,12 +588,8 @@ struct arg_default_impl { EIGEN_DEVICE_FUNC static inline RealScalar run(const Scalar& x) { - #if defined(EIGEN_HIP_DEVICE_COMPILE) - // HIP does not seem to have a native device side implementation for the math routine "arg" + // There is no official ::arg on device in CUDA/HIP, so we always need to use std::arg. using std::arg; - #else - EIGEN_USING_STD(arg); - #endif return static_cast(arg(x)); } }; @@ -881,13 +877,159 @@ struct meta_floor_log2 // no value, error at compile time }; -template -struct random_default_impl -{ - static inline Scalar run(const Scalar& x, const Scalar& y) - { - if (y <= x) - return x; +template +struct count_bits_impl { + static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) { + EIGEN_STATIC_ASSERT( + is_integral::value && !NumTraits::IsSigned, + THIS_TYPE_IS_NOT_SUPPORTED); + int n = CHAR_BIT * sizeof(BitsType); + int shift = n / 2; + while (bits > 0 && shift > 0) { + BitsType y = bits >> shift; + if (y > 0) { + n -= shift; + bits = y; + } + shift /= 2; + } + if (shift == 0) { + --n; + } + return n; + } + + static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) { + EIGEN_STATIC_ASSERT( + is_integral::value && !NumTraits::IsSigned, + THIS_TYPE_IS_NOT_SUPPORTED); + int n = CHAR_BIT * sizeof(BitsType); + int shift = n / 2; + while (bits > 0 && shift > 0) { + BitsType y = bits << shift; + if (y > 0) { + n -= shift; + bits = y; + } + shift /= 2; + } + if (shift == 0) { + --n; + } + return n; + } +}; + +// Count leading zeros. +template +EIGEN_DEVICE_FUNC inline int clz(BitsType bits) { + return count_bits_impl::clz(bits); +} + +// Count trailing zeros. 
+template +EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) { + return count_bits_impl::ctz(bits); +} + +#if EIGEN_COMP_GNUC || EIGEN_COMP_CLANG + +template +struct count_bits_impl::type> { + static const int kNumBits = static_cast(sizeof(BitsType) * CHAR_BIT); + static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) { + EIGEN_STATIC_ASSERT(is_integral::value, THIS_TYPE_IS_NOT_SUPPORTED); + static const int kLeadingBitsOffset = (sizeof(unsigned int) - sizeof(BitsType)) * CHAR_BIT; + return bits == 0 ? kNumBits : __builtin_clz(static_cast(bits)) - kLeadingBitsOffset; + } + + static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) { + EIGEN_STATIC_ASSERT(is_integral::value, THIS_TYPE_IS_NOT_SUPPORTED); + return bits == 0 ? kNumBits : __builtin_ctz(static_cast(bits)); + } +}; + +template +struct count_bits_impl< + BitsType, typename enable_if::type> { + static const int kNumBits = static_cast(sizeof(BitsType) * CHAR_BIT); + static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) { + EIGEN_STATIC_ASSERT(is_integral::value, THIS_TYPE_IS_NOT_SUPPORTED); + static const int kLeadingBitsOffset = (sizeof(unsigned long) - sizeof(BitsType)) * CHAR_BIT; + return bits == 0 ? kNumBits : __builtin_clzl(static_cast(bits)) - kLeadingBitsOffset; + } + + static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) { + EIGEN_STATIC_ASSERT(is_integral::value, THIS_TYPE_IS_NOT_SUPPORTED); + return bits == 0 ? kNumBits : __builtin_ctzl(static_cast(bits)); + } +}; + +template +struct count_bits_impl::type> { + static const int kNumBits = static_cast(sizeof(BitsType) * CHAR_BIT); + static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) { + EIGEN_STATIC_ASSERT(is_integral::value, THIS_TYPE_IS_NOT_SUPPORTED); + static const int kLeadingBitsOffset = (sizeof(unsigned long long) - sizeof(BitsType)) * CHAR_BIT; + return bits == 0 ? kNumBits : __builtin_clzll(static_cast(bits)) - kLeadingBitsOffset; + } + + static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) { + EIGEN_STATIC_ASSERT(is_integral::value, THIS_TYPE_IS_NOT_SUPPORTED); + return bits == 0 ? kNumBits : __builtin_ctzll(static_cast(bits)); + } +}; + +#elif EIGEN_COMP_MSVC + +template +struct count_bits_impl::type> { + static const int kNumBits = static_cast(sizeof(BitsType) * CHAR_BIT); + static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) { + EIGEN_STATIC_ASSERT(is_integral::value, THIS_TYPE_IS_NOT_SUPPORTED); + unsigned long out; + _BitScanReverse(&out, static_cast(bits)); + return bits == 0 ? kNumBits : (kNumBits - 1) - static_cast(out); + } + + static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) { + EIGEN_STATIC_ASSERT(is_integral::value, THIS_TYPE_IS_NOT_SUPPORTED); + unsigned long out; + _BitScanForward(&out, static_cast(bits)); + return bits == 0 ? kNumBits : static_cast(out); + } +}; + +#ifdef _WIN64 + +template +struct count_bits_impl< + BitsType, typename enable_if::type> { + static const int kNumBits = static_cast(sizeof(BitsType) * CHAR_BIT); + static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) { + EIGEN_STATIC_ASSERT(is_integral::value, THIS_TYPE_IS_NOT_SUPPORTED); + unsigned long out; + _BitScanReverse64(&out, static_cast(bits)); + return bits == 0 ? kNumBits : (kNumBits - 1) - static_cast(out); + } + + static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) { + EIGEN_STATIC_ASSERT(is_integral::value, THIS_TYPE_IS_NOT_SUPPORTED); + unsigned long out; + _BitScanForward64(&out, static_cast(bits)); + return bits == 0 ? 
kNumBits : static_cast(out); + } +}; + +#endif // _WIN64 + +#endif // EIGEN_COMP_GNUC || EIGEN_COMP_CLANG + +template +struct random_default_impl { + static inline Scalar run(const Scalar& x, const Scalar& y) { + if (y <= x) return x; // ScalarU is the unsigned counterpart of Scalar, possibly Scalar itself. typedef typename make_unsigned::type ScalarU; // ScalarX is the widest of ScalarU and unsigned int. @@ -1032,11 +1174,15 @@ template EIGEN_DEVICE_FUNC bool isinf_msvc_helper(T x) } //MSVC defines a _isnan builtin function, but for double only +#ifndef EIGEN_GPU_COMPILE_PHASE EIGEN_DEVICE_FUNC inline bool isnan_impl(const long double& x) { return _isnan(x)!=0; } +#endif EIGEN_DEVICE_FUNC inline bool isnan_impl(const double& x) { return _isnan(x)!=0; } EIGEN_DEVICE_FUNC inline bool isnan_impl(const float& x) { return _isnan(x)!=0; } +#ifndef EIGEN_GPU_COMPILE_PHASE EIGEN_DEVICE_FUNC inline bool isinf_impl(const long double& x) { return isinf_msvc_helper(x); } +#endif EIGEN_DEVICE_FUNC inline bool isinf_impl(const double& x) { return isinf_msvc_helper(x); } EIGEN_DEVICE_FUNC inline bool isinf_impl(const float& x) { return isinf_msvc_helper(x); } @@ -1050,12 +1196,16 @@ EIGEN_DEVICE_FUNC inline bool isinf_impl(const float& x) { return isinf_ms #define EIGEN_TMP_NOOPT_ATTRIB EIGEN_DEVICE_FUNC inline __attribute__((noinline,optimize("no-finite-math-only"))) #endif +#ifndef EIGEN_GPU_COMPILE_PHASE template<> EIGEN_TMP_NOOPT_ATTRIB bool isnan_impl(const long double& x) { return __builtin_isnan(x); } +#endif template<> EIGEN_TMP_NOOPT_ATTRIB bool isnan_impl(const double& x) { return __builtin_isnan(x); } template<> EIGEN_TMP_NOOPT_ATTRIB bool isnan_impl(const float& x) { return __builtin_isnan(x); } template<> EIGEN_TMP_NOOPT_ATTRIB bool isinf_impl(const double& x) { return __builtin_isinf(x); } template<> EIGEN_TMP_NOOPT_ATTRIB bool isinf_impl(const float& x) { return __builtin_isinf(x); } +#ifndef EIGEN_GPU_COMPILE_PHASE template<> EIGEN_TMP_NOOPT_ATTRIB bool isinf_impl(const long double& x) { return __builtin_isinf(x); } +#endif #undef EIGEN_TMP_NOOPT_ATTRIB @@ -1112,6 +1262,8 @@ EIGEN_ALWAYS_INLINE double mini(const double& x, const double& y) { return fmin(x, y); } + +#ifndef EIGEN_GPU_COMPILE_PHASE template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE long double mini(const long double& x, const long double& y) @@ -1123,6 +1275,7 @@ EIGEN_ALWAYS_INLINE long double mini(const long double& x, const long double& y) return fminl(x, y); #endif } +#endif template EIGEN_DEVICE_FUNC @@ -1142,6 +1295,7 @@ EIGEN_ALWAYS_INLINE double maxi(const double& x, const double& y) { return fmax(x, y); } +#ifndef EIGEN_GPU_COMPILE_PHASE template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE long double maxi(const long double& x, const long double& y) @@ -1154,6 +1308,7 @@ EIGEN_ALWAYS_INLINE long double maxi(const long double& x, const long double& y) #endif } #endif +#endif #if defined(SYCL_DEVICE_ONLY) @@ -1310,8 +1465,8 @@ EIGEN_ALWAYS_INLINE double absdiff(const double& x, const double& y) return fabs(x - y); } -#if !defined(EIGEN_GPUCC) // HIP and CUDA do not support long double. 
+#ifndef EIGEN_GPU_COMPILE_PHASE template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE long double absdiff(const long double& x, const long double& y) { diff --git a/Eigen/src/Core/Matrix.h b/Eigen/src/Core/Matrix.h index f0e59a9..29c3b5c 100644 --- a/Eigen/src/Core/Matrix.h +++ b/Eigen/src/Core/Matrix.h @@ -225,8 +225,6 @@ class Matrix return Base::_set(other); } - /* Here, doxygen failed to copy the brief information when using \copydoc */ - /** * \brief Copies the generic expression \a other into *this. * \copydetails DenseBase::operator=(const EigenBase &other) @@ -284,7 +282,15 @@ class Matrix #endif #if EIGEN_HAS_CXX11 - /** \copydoc PlainObjectBase(const Scalar&, const Scalar&, const Scalar&, const Scalar&, const ArgTypes&... args) + /** \brief Construct a row of column vector with fixed size from an arbitrary number of coefficients. \cpp11 + * + * \only_for_vectors + * + * This constructor is for 1D array or vectors with more than 4 coefficients. + * There exists C++98 analogue constructors for fixed-size array/vector having 1, 2, 3, or 4 coefficients. + * + * \warning To construct a column (resp. row) vector of fixed length, the number of values passed to this + * constructor must match the the fixed number of rows (resp. columns) of \c *this. * * Example: \include Matrix_variadic_ctor_cxx11.cpp * Output: \verbinclude Matrix_variadic_ctor_cxx11.out @@ -297,6 +303,8 @@ class Matrix : Base(a0, a1, a2, a3, args...) {} /** \brief Constructs a Matrix and initializes it from the coefficients given as initializer-lists grouped by row. \cpp11 + * + * \anchor matrix_constructor_initializer_list * * In the general case, the constructor takes a list of rows, each row being represented as a list of coefficients: * @@ -480,16 +488,21 @@ class Matrix #define EIGEN_MAKE_TYPEDEFS(Type, TypeSuffix, Size, SizeSuffix) \ /** \ingroup matrixtypedefs */ \ +/** \brief \noop */ \ typedef Matrix Matrix##SizeSuffix##TypeSuffix; \ /** \ingroup matrixtypedefs */ \ +/** \brief \noop */ \ typedef Matrix Vector##SizeSuffix##TypeSuffix; \ /** \ingroup matrixtypedefs */ \ +/** \brief \noop */ \ typedef Matrix RowVector##SizeSuffix##TypeSuffix; #define EIGEN_MAKE_FIXED_TYPEDEFS(Type, TypeSuffix, Size) \ /** \ingroup matrixtypedefs */ \ +/** \brief \noop */ \ typedef Matrix Matrix##Size##X##TypeSuffix; \ /** \ingroup matrixtypedefs */ \ +/** \brief \noop */ \ typedef Matrix Matrix##X##Size##TypeSuffix; #define EIGEN_MAKE_TYPEDEFS_ALL_SIZES(Type, TypeSuffix) \ diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h index 45c3a59..d93a7e3 100644 --- a/Eigen/src/Core/MatrixBase.h +++ b/Eigen/src/Core/MatrixBase.h @@ -206,28 +206,22 @@ template class MatrixBase EIGEN_DEVICE_FUNC DiagonalReturnType diagonal(); - typedef typename internal::add_const >::type ConstDiagonalReturnType; + typedef Diagonal ConstDiagonalReturnType; EIGEN_DEVICE_FUNC - ConstDiagonalReturnType diagonal() const; - - template struct DiagonalIndexReturnType { typedef Diagonal Type; }; - template struct ConstDiagonalIndexReturnType { typedef const Diagonal Type; }; + const ConstDiagonalReturnType diagonal() const; template EIGEN_DEVICE_FUNC - typename DiagonalIndexReturnType::Type diagonal(); + Diagonal diagonal(); template EIGEN_DEVICE_FUNC - typename ConstDiagonalIndexReturnType::Type diagonal() const; - - typedef Diagonal DiagonalDynamicIndexReturnType; - typedef typename internal::add_const >::type ConstDiagonalDynamicIndexReturnType; + const Diagonal diagonal() const; EIGEN_DEVICE_FUNC - DiagonalDynamicIndexReturnType diagonal(Index 
index); + Diagonal diagonal(Index index); EIGEN_DEVICE_FUNC - ConstDiagonalDynamicIndexReturnType diagonal(Index index) const; + const Diagonal diagonal(Index index) const; template struct TriangularViewReturnType { typedef TriangularView Type; }; template struct ConstTriangularViewReturnType { typedef const TriangularView Type; }; diff --git a/Eigen/src/Core/NumTraits.h b/Eigen/src/Core/NumTraits.h index 72eac5a..7c2c50b 100644 --- a/Eigen/src/Core/NumTraits.h +++ b/Eigen/src/Core/NumTraits.h @@ -98,6 +98,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Tgt bit_cast(const Src& src) { } } // namespace numext +// clang-format off /** \class NumTraits * \ingroup Core_Module * @@ -109,45 +110,47 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Tgt bit_cast(const Src& src) { * * The provided data consists of: * \li A typedef \c Real, giving the "real part" type of \a T. If \a T is already real, - * then \c Real is just a typedef to \a T. If \a T is \c std::complex then \c Real + * then \c Real is just a typedef to \a T. If \a T is `std::complex` then \c Real * is a typedef to \a U. * \li A typedef \c NonInteger, giving the type that should be used for operations producing non-integral values, * such as quotients, square roots, etc. If \a T is a floating-point type, then this typedef just gives - * \a T again. Note however that many Eigen functions such as internal::sqrt simply refuse to + * \a T again. Note however that many Eigen functions such as `internal::sqrt` simply refuse to * take integers. Outside of a few cases, Eigen doesn't do automatic type promotion. Thus, this typedef is * only intended as a helper for code that needs to explicitly promote types. - * \li A typedef \c Literal giving the type to use for numeric literals such as "2" or "0.5". For instance, for \c std::complex, Literal is defined as \c U. + * \li A typedef \c Literal giving the type to use for numeric literals such as "2" or "0.5". For instance, for `std::complex`, + * Literal is defined as \c U. * Of course, this type must be fully compatible with \a T. In doubt, just use \a T here. - * \li A typedef \a Nested giving the type to use to nest a value inside of the expression tree. If you don't know what + * \li A typedef \c Nested giving the type to use to nest a value inside of the expression tree. If you don't know what * this means, just use \a T here. - * \li An enum value \a IsComplex. It is equal to 1 if \a T is a \c std::complex + * \li An enum value \c IsComplex. It is equal to 1 if \a T is a \c std::complex * type, and to 0 otherwise. - * \li An enum value \a IsInteger. It is equal to \c 1 if \a T is an integer type such as \c int, + * \li An enum value \c IsInteger. It is equal to \c 1 if \a T is an integer type such as \c int, * and to \c 0 otherwise. - * \li Enum values ReadCost, AddCost and MulCost representing a rough estimate of the number of CPU cycles needed + * \li Enum values \c ReadCost, \c AddCost and \c MulCost representing a rough estimate of the number of CPU cycles needed * to by move / add / mul instructions respectively, assuming the data is already stored in CPU registers. * Stay vague here. No need to do architecture-specific stuff. If you don't know what this means, just use \c Eigen::HugeCost. - * \li An enum value \a IsSigned. It is equal to \c 1 if \a T is a signed type and to 0 if \a T is unsigned. - * \li An enum value \a RequireInitialization. It is equal to \c 1 if the constructor of the numeric type \a T must + * \li An enum value \c IsSigned. 
It is equal to \c 1 if \a T is a signed type and to 0 if \a T is unsigned. + * \li An enum value \c RequireInitialization. It is equal to \c 1 if the constructor of the numeric type \a T must * be called, and to 0 if it is safe not to call it. Default is 0 if \a T is an arithmetic type, and 1 otherwise. - * \li An epsilon() function which, unlike std::numeric_limits::epsilon(), - * it returns a \a Real instead of a \a T. - * \li A dummy_precision() function returning a weak epsilon value. It is mainly used as a default + * \li An `epsilon()` function which, unlike `std::numeric_limits::epsilon()`, + * it returns a \c Real instead of a \a T. + * \li A `dummy_precision()` function returning a weak epsilon value. It is mainly used as a default * value by the fuzzy comparison operators. - * \li highest() and lowest() functions returning the highest and lowest possible values respectively. - * \li digits() function returning the number of radix digits (non-sign digits for integers, mantissa for floating-point). This is + * \li `highest()` and `lowest()` functions returning the highest and lowest possible values respectively. + * \li `digits()` function returning the number of radix digits (non-sign digits for integers, mantissa for floating-point). This is * the analogue of std::numeric_limits::digits * which is used as the default implementation if specialized. - * \li digits10() function returning the number of decimal digits that can be represented without change. This is + * \li `digits10()` function returning the number of decimal digits that can be represented without change. This is * the analogue of std::numeric_limits::digits10 * which is used as the default implementation if specialized. - * \li min_exponent() and max_exponent() functions returning the highest and lowest possible values, respectively, + * \li `min_exponent()` and `max_exponent()` functions returning the highest and lowest possible values, respectively, * such that the radix raised to the power exponent-1 is a normalized floating-point number. These are equivalent to - * std::numeric_limits::min_exponent/ - * std::numeric_limits::max_exponent. - * \li infinity() function returning a representation of positive infinity, if available. - * \li quiet_NaN function returning a non-signaling "not-a-number", if available. + * `std::numeric_limits::min_exponent`/ + * `std::numeric_limits::max_exponent`. + * \li `infinity()` function returning a representation of positive infinity, if available. + * \li `quiet_NaN` function returning a non-signaling "not-a-number", if available. */ + // clang-format on template struct GenericNumTraits { @@ -245,12 +248,25 @@ template<> struct NumTraits : GenericNumTraits static inline double dummy_precision() { return 1e-12; } }; +// GPU devices treat `long double` as `double`. 
+#ifndef EIGEN_GPU_COMPILE_PHASE template<> struct NumTraits : GenericNumTraits { - EIGEN_CONSTEXPR - static inline long double dummy_precision() { return 1e-15l; } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + static inline long double dummy_precision() { return static_cast(1e-15l); } + +#if defined(EIGEN_ARCH_PPC) && (__LDBL_MANT_DIG__ == 106) + // PowerPC double double causes issues with some values + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + static inline long double epsilon() + { + // 2^(-(__LDBL_MANT_DIG__)+1) + return static_cast(2.4651903288156618919116517665087e-32l); + } +#endif }; +#endif template struct NumTraits > : GenericNumTraits > diff --git a/Eigen/src/Core/PartialReduxEvaluator.h b/Eigen/src/Core/PartialReduxEvaluator.h index 29abf35..17c06f0 100644 --- a/Eigen/src/Core/PartialReduxEvaluator.h +++ b/Eigen/src/Core/PartialReduxEvaluator.h @@ -54,12 +54,17 @@ struct packetwise_redux_traits /* Value to be returned when size==0 , by default let's return 0 */ template EIGEN_DEVICE_FUNC -PacketType packetwise_redux_empty_value(const Func& ) { return pset1(0); } +PacketType packetwise_redux_empty_value(const Func& ) { + const typename unpacket_traits::type zero(0); + return pset1(zero); +} /* For products the default is 1 */ template EIGEN_DEVICE_FUNC -PacketType packetwise_redux_empty_value(const scalar_product_op& ) { return pset1(1); } +PacketType packetwise_redux_empty_value(const scalar_product_op& ) { + return pset1(Scalar(1)); +} /* Perform the actual reduction */ template class Ref inline Ref(DenseBase& expr) #endif { - EIGEN_STATIC_ASSERT(bool(internal::is_lvalue::value), THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY); - EIGEN_STATIC_ASSERT(bool(Traits::template match::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH); + EIGEN_STATIC_ASSERT((static_cast(internal::is_lvalue::value)), THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY); + EIGEN_STATIC_ASSERT((static_cast(Traits::template match::MatchAtCompileTime)), STORAGE_LAYOUT_DOES_NOT_MATCH); EIGEN_STATIC_ASSERT(!Derived::IsPlainObjectBase,THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY); // Construction must pass since we will not create temporary storage in the non-const case. const bool success = Base::construct(expr.const_cast_derived()); diff --git a/Eigen/src/Core/Reshaped.h b/Eigen/src/Core/Reshaped.h index 52de73b..882314c 100644 --- a/Eigen/src/Core/Reshaped.h +++ b/Eigen/src/Core/Reshaped.h @@ -250,7 +250,7 @@ class ReshapedImpl_dense EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const { - return ((Flags&RowMajorBit)==RowMajorBit) ? this->cols() : this->rows(); + return (((Flags&RowMajorBit)==RowMajorBit) ? this->cols() : this->rows()) * m_xpr.innerStride(); } protected: diff --git a/Eigen/src/Core/SolverBase.h b/Eigen/src/Core/SolverBase.h index 5014610..e38b3d5 100644 --- a/Eigen/src/Core/SolverBase.h +++ b/Eigen/src/Core/SolverBase.h @@ -110,7 +110,7 @@ class SolverBase : public EigenBase } /** \internal the return type of transpose() */ - typedef typename internal::add_const >::type ConstTransposeReturnType; + typedef Transpose ConstTransposeReturnType; /** \returns an expression of the transposed of the factored matrix. 
* * A typical usage is to solve for the transposed problem A^T x = b: @@ -118,16 +118,16 @@ class SolverBase : public EigenBase * * \sa adjoint(), solve() */ - inline ConstTransposeReturnType transpose() const + inline const ConstTransposeReturnType transpose() const { return ConstTransposeReturnType(derived()); } /** \internal the return type of adjoint() */ typedef typename internal::conditional::IsComplex, - CwiseUnaryOp, ConstTransposeReturnType>, - ConstTransposeReturnType - >::type AdjointReturnType; + CwiseUnaryOp, const ConstTransposeReturnType>, + const ConstTransposeReturnType + >::type AdjointReturnType; /** \returns an expression of the adjoint of the factored matrix * * A typical usage is to solve for the adjoint problem A' x = b: @@ -137,7 +137,7 @@ class SolverBase : public EigenBase * * \sa transpose(), solve() */ - inline AdjointReturnType adjoint() const + inline const AdjointReturnType adjoint() const { return AdjointReturnType(derived().transpose()); } diff --git a/Eigen/src/Core/Stride.h b/Eigen/src/Core/Stride.h index 6494d51..d164e53 100644 --- a/Eigen/src/Core/Stride.h +++ b/Eigen/src/Core/Stride.h @@ -38,10 +38,14 @@ namespace Eigen { * \include Map_general_stride.cpp * Output: \verbinclude Map_general_stride.out * - * Both strides can be negative, however, a negative stride of -1 cannot be specified at compiletime + * Both strides can be negative. However, a negative stride of -1 cannot be specified at compile time * because of the ambiguity with Dynamic which is defined to -1 (historically, negative strides were * not allowed). * + * Note that for compile-time vectors (ColsAtCompileTime==1 or RowsAtCompile==1), + * the inner stride is the pointer increment between two consecutive elements, + * regardless of storage layout. 
+ * * \sa class InnerStride, class OuterStride, \ref TopicStorageOrders */ template diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h index 2bc658f..741504d 100644 --- a/Eigen/src/Core/Transpose.h +++ b/Eigen/src/Core/Transpose.h @@ -178,7 +178,7 @@ template class TransposeImpl * \sa transposeInPlace(), adjoint() */ template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -Transpose +typename DenseBase::TransposeReturnType DenseBase::transpose() { return TransposeReturnType(derived()); @@ -191,7 +191,7 @@ DenseBase::transpose() * \sa transposeInPlace(), adjoint() */ template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -typename DenseBase::ConstTransposeReturnType +const typename DenseBase::ConstTransposeReturnType DenseBase::transpose() const { return ConstTransposeReturnType(derived()); diff --git a/Eigen/src/Core/TriangularMatrix.h b/Eigen/src/Core/TriangularMatrix.h index fdb8bc1..bd722cf 100644 --- a/Eigen/src/Core/TriangularMatrix.h +++ b/Eigen/src/Core/TriangularMatrix.h @@ -100,12 +100,10 @@ template class TriangularBase : public EigenBase return coeffRef(row,col); } - #ifndef EIGEN_PARSED_BY_DOXYGEN EIGEN_DEVICE_FUNC inline const Derived& derived() const { return *static_cast(this); } EIGEN_DEVICE_FUNC inline Derived& derived() { return *static_cast(this); } - #endif // not EIGEN_PARSED_BY_DOXYGEN template EIGEN_DEVICE_FUNC @@ -442,7 +440,6 @@ template class TriangularViewImpl<_Mat EIGEN_DEVICE_FUNC TriangularViewType& operator=(const MatrixBase& other); -#ifndef EIGEN_PARSED_BY_DOXYGEN EIGEN_DEVICE_FUNC TriangularViewType& operator=(const TriangularViewImpl& other) { return *this = other.derived().nestedExpression(); } @@ -456,7 +453,6 @@ template class TriangularViewImpl<_Mat /** \deprecated */ EIGEN_DEPRECATED EIGEN_DEVICE_FUNC void lazyAssign(const MatrixBase& other); -#endif /** Efficient triangular matrix times vector/matrix product */ template @@ -524,11 +520,7 @@ template class TriangularViewImpl<_Mat /** Swaps the coefficients of the common triangular parts of two matrices */ template EIGEN_DEVICE_FUNC -#ifdef EIGEN_PARSED_BY_DOXYGEN - void swap(TriangularBase &other) -#else void swap(TriangularBase const & other) -#endif { EIGEN_STATIC_ASSERT_LVALUE(OtherDerived); call_assignment(derived(), other.const_cast_derived(), internal::swap_assign_op()); @@ -552,9 +544,10 @@ template class TriangularViewImpl<_Mat this->solveInPlace(dst); } - template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TriangularViewType& _assignProduct(const ProductType& prod, const Scalar& alpha, bool beta); + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TriangularViewType& _assignProduct(const ProductType& prod, const Scalar& alpha, + bool beta); + protected: EIGEN_DEFAULT_COPY_CONSTRUCTOR(TriangularViewImpl) EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(TriangularViewImpl) diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h index ab7bd6c..c200620 100644 --- a/Eigen/src/Core/arch/AVX/Complex.h +++ b/Eigen/src/Core/arch/AVX/Complex.h @@ -99,7 +99,9 @@ template<> EIGEN_STRONG_INLINE Packet4cf ploadu(const std::complex EIGEN_STRONG_INLINE Packet4cf pset1(const std::complex& from) { - return Packet4cf(_mm256_castpd_ps(_mm256_broadcast_sd((const double*)(const void*)&from))); + const float re = std::real(from); + const float im = std::imag(from); + return Packet4cf(_mm256_set_ps(im, re, im, re, im, re, im, re)); } template<> EIGEN_STRONG_INLINE Packet4cf ploaddup(const std::complex* from) @@ -167,15 +169,12 @@ template<> EIGEN_STRONG_INLINE std::complex predux_mul(const 
P Packet2cf(_mm256_extractf128_ps(a.v, 1)))); } + EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet4cf,Packet8f) template<> EIGEN_STRONG_INLINE Packet4cf pdiv(const Packet4cf& a, const Packet4cf& b) { - Packet4cf num = pmul(a, pconj(b)); - __m256 tmp = _mm256_mul_ps(b.v, b.v); - __m256 tmp2 = _mm256_shuffle_ps(tmp,tmp,0xB1); - __m256 denom = _mm256_add_ps(tmp, tmp2); - return Packet4cf(_mm256_div_ps(num.v, denom)); + return pdiv_complex(a, b); } template<> EIGEN_STRONG_INLINE Packet4cf pcplxflip(const Packet4cf& x) @@ -321,10 +320,7 @@ EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cd,Packet4d) template<> EIGEN_STRONG_INLINE Packet2cd pdiv(const Packet2cd& a, const Packet2cd& b) { - Packet2cd num = pmul(a, pconj(b)); - __m256d tmp = _mm256_mul_pd(b.v, b.v); - __m256d denom = _mm256_hadd_pd(tmp, tmp); - return Packet2cd(_mm256_div_pd(num.v, denom)); + return pdiv_complex(a, b); } template<> EIGEN_STRONG_INLINE Packet2cd pcplxflip(const Packet2cd& x) diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 7fc32fd..24e01c4 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -285,11 +285,13 @@ template<> EIGEN_STRONG_INLINE Packet8i psub(const Packet8i& a, const template<> EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a) { - return _mm256_sub_ps(_mm256_set1_ps(0.0),a); + const Packet8f mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)); + return _mm256_xor_ps(a, mask); } template<> EIGEN_STRONG_INLINE Packet4d pnegate(const Packet4d& a) { - return _mm256_sub_pd(_mm256_set1_pd(0.0),a); + const Packet4d mask = _mm256_castsi256_pd(_mm256_set1_epi64x(0x8000000000000000ULL)); + return _mm256_xor_pd(a, mask); } template<> EIGEN_STRONG_INLINE Packet8f pconj(const Packet8f& a) { return a; } @@ -628,11 +630,23 @@ template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet4d& template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet8i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); } template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet8f& from, uint8_t umask) { +#ifdef EIGEN_VECTORIZE_AVX512 + __mmask16 mask = static_cast<__mmask16>(umask & 0x00FF); + EIGEN_DEBUG_UNALIGNED_STORE _mm512_mask_storeu_ps(to, mask, _mm512_castps256_ps512(from)); +#else Packet8i mask = _mm256_set1_epi8(static_cast(umask)); - const Packet8i bit_mask = _mm256_set_epi32(0xffffff7f, 0xffffffbf, 0xffffffdf, 0xffffffef, 0xfffffff7, 0xfffffffb, 0xfffffffd, 0xfffffffe); + const Packet8i bit_mask = _mm256_set_epi32(0x7f7f7f7f, 0xbfbfbfbf, 0xdfdfdfdf, 0xefefefef, 0xf7f7f7f7, 0xfbfbfbfb, 0xfdfdfdfd, 0xfefefefe); mask = por(mask, bit_mask); mask = pcmp_eq(mask, _mm256_set1_epi32(0xffffffff)); - EIGEN_DEBUG_UNALIGNED_STORE return _mm256_maskstore_ps(to, mask, from); +#if EIGEN_COMP_MSVC + // MSVC sometimes seems to use a bogus mask with maskstore. 
+ const __m256i ifrom = _mm256_castps_si256(from); + EIGEN_DEBUG_UNALIGNED_STORE _mm_maskmoveu_si128(_mm256_extractf128_si256(ifrom, 0), _mm256_extractf128_si256(mask, 0), reinterpret_cast(to)); + EIGEN_DEBUG_UNALIGNED_STORE _mm_maskmoveu_si128(_mm256_extractf128_si256(ifrom, 1), _mm256_extractf128_si256(mask, 1), reinterpret_cast(to + 4)); +#else + EIGEN_DEBUG_UNALIGNED_STORE _mm256_maskstore_ps(to, mask, from); +#endif +#endif } // NOTE: leverage _mm256_i32gather_ps and _mm256_i32gather_pd if AVX2 instructions are available @@ -1006,7 +1020,7 @@ EIGEN_STRONG_INLINE Packet8f half2float(const Packet8h& a) { EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) { #ifdef EIGEN_HAS_FP16_C - return _mm256_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC); + return _mm256_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT); #else EIGEN_ALIGN32 float aux[8]; pstore(aux, a); diff --git a/Eigen/src/Core/arch/AVX512/Complex.h b/Eigen/src/Core/arch/AVX512/Complex.h index 49c72b3..ebf00c2 100644 --- a/Eigen/src/Core/arch/AVX512/Complex.h +++ b/Eigen/src/Core/arch/AVX512/Complex.h @@ -37,7 +37,7 @@ template<> struct packet_traits > : default_packet_traits HasMul = 1, HasDiv = 1, HasNegate = 1, - HasSqrt = 1, + HasSqrt = EIGEN_HAS_AVX512_MATH, HasAbs = 0, HasAbs2 = 0, HasMin = 0, @@ -97,7 +97,9 @@ template<> EIGEN_STRONG_INLINE Packet8cf ploadu(const std::complex EIGEN_STRONG_INLINE Packet8cf pset1(const std::complex& from) { - return Packet8cf(_mm512_castpd_ps(pload1((const double*)(const void*)&from))); + const float re = std::real(from); + const float im = std::imag(from); + return Packet8cf(_mm512_set_ps(im, re, im, re, im, re, im, re, im, re, im, re, im, re, im, re)); } template<> EIGEN_STRONG_INLINE Packet8cf ploaddup(const std::complex* from) @@ -157,11 +159,7 @@ EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet8cf,Packet16f) template<> EIGEN_STRONG_INLINE Packet8cf pdiv(const Packet8cf& a, const Packet8cf& b) { - Packet8cf num = pmul(a, pconj(b)); - __m512 tmp = _mm512_mul_ps(b.v, b.v); - __m512 tmp2 = _mm512_shuffle_ps(tmp,tmp,0xB1); - __m512 denom = _mm512_add_ps(tmp, tmp2); - return Packet8cf(_mm512_div_ps(num.v, denom)); + return pdiv_complex(a, b); } template<> EIGEN_STRONG_INLINE Packet8cf pcplxflip(const Packet8cf& x) @@ -192,7 +190,7 @@ template<> struct packet_traits > : default_packet_traits HasMul = 1, HasDiv = 1, HasNegate = 1, - HasSqrt = 1, + HasSqrt = EIGEN_HAS_AVX512_MATH, HasAbs = 0, HasAbs2 = 0, HasMin = 0, @@ -253,11 +251,7 @@ template<> EIGEN_STRONG_INLINE Packet4cd ploadu(const std::complex EIGEN_STRONG_INLINE Packet4cd pset1(const std::complex& from) { - #ifdef EIGEN_VECTORIZE_AVX512DQ - return Packet4cd(_mm512_broadcast_f64x2(pset1(from).v)); - #else return Packet4cd(_mm512_castps_pd(_mm512_broadcast_f32x4( _mm_castpd_ps(pset1(from).v)))); - #endif } template<> EIGEN_STRONG_INLINE Packet4cd ploaddup(const std::complex* from) { @@ -309,47 +303,11 @@ template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet2cd(_mm512_extractf64x4_pd(a.v,1)))); } -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet4cd pmadd(const Packet4cd& x, const Packet4cd& y, const Packet4cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet4cd pmul(const Packet4cd& a, const Packet4cd& b) const - { - return internal::pmul(a, pconj(b)); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet4cd pmadd(const Packet4cd& x, const Packet4cd& y, const Packet4cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet4cd pmul(const Packet4cd& a, 
const Packet4cd& b) const - { - return internal::pmul(pconj(a), b); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet4cd pmadd(const Packet4cd& x, const Packet4cd& y, const Packet4cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet4cd pmul(const Packet4cd& a, const Packet4cd& b) const - { - return pconj(internal::pmul(a, b)); - } -}; - EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet4cd,Packet8d) template<> EIGEN_STRONG_INLINE Packet4cd pdiv(const Packet4cd& a, const Packet4cd& b) { - Packet4cd num = pmul(a, pconj(b)); - __m512d tmp = _mm512_mul_pd(b.v, b.v); - __m512d denom = padd(_mm512_permute_pd(tmp,0x55), tmp); - return Packet4cd(_mm512_div_pd(num.v, denom)); + return pdiv_complex(a, b); } template<> EIGEN_STRONG_INLINE Packet4cd pcplxflip(const Packet4cd& x) @@ -408,6 +366,8 @@ ptranspose(PacketBlock& kernel) { kernel.packet[0] = Packet4cd(_mm512_shuffle_f64x2(T0, T2, (shuffle_mask<0,2,0,2>::mask))); // [a0 b0 c0 d0] } +#if EIGEN_HAS_AVX512_MATH + template<> EIGEN_STRONG_INLINE Packet4cd psqrt(const Packet4cd& a) { return psqrt_complex(a); } @@ -416,6 +376,8 @@ template<> EIGEN_STRONG_INLINE Packet8cf psqrt(const Packet8cf& a) { return psqrt_complex(a); } +#endif + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/AVX512/MathFunctions.h b/Eigen/src/Core/arch/AVX512/MathFunctions.h index 6fd726d..017d6bf 100644 --- a/Eigen/src/Core/arch/AVX512/MathFunctions.h +++ b/Eigen/src/Core/arch/AVX512/MathFunctions.h @@ -14,8 +14,7 @@ namespace Eigen { namespace internal { -// Disable the code for older versions of gcc that don't support many of the required avx512 instrinsics. -#if EIGEN_GNUC_AT_LEAST(5, 3) || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC >= 1923 +#if EIGEN_HAS_AVX512_MATH #define _EIGEN_DECLARE_CONST_Packet16f(NAME, X) \ const Packet16f p16f_##NAME = pset1(X) @@ -326,7 +325,7 @@ Packet16f pexpm1(const Packet16f& _x) { F16_PACKET_FUNCTION(Packet16f, Packet16h, pexpm1) BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pexpm1) -#endif +#endif // EIGEN_HAS_AVX512_MATH template <> diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 34d49ab..4ab100c 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -28,6 +28,13 @@ namespace internal { #endif #endif +// Disable the code for older versions of gcc that don't support many of the required avx512 math instrinsics. 
+#if EIGEN_GNUC_AT_LEAST(5, 3) || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC >= 1923 || EIGEN_COMP_ICC >= 1900 +#define EIGEN_HAS_AVX512_MATH 1 +#else +#define EIGEN_HAS_AVX512_MATH 0 +#endif + typedef __m512 Packet16f; typedef __m512i Packet16i; typedef __m512d Packet8d; @@ -72,12 +79,14 @@ struct packet_traits : default_packet_traits { HasMax = 1, HasConj = 1, HasSetLinear = 0, - HasLog = 1, - HasLog1p = 1, - HasExpm1 = 1, - HasExp = 1, - HasSqrt = 1, - HasRsqrt = 1, + HasLog = EIGEN_HAS_AVX512_MATH, + HasLog1p = EIGEN_HAS_AVX512_MATH, + HasExp = EIGEN_HAS_AVX512_MATH, + HasExpm1 = EIGEN_HAS_AVX512_MATH, + HasSqrt = EIGEN_HAS_AVX512_MATH, + HasRsqrt = EIGEN_HAS_AVX512_MATH, + HasBessel = EIGEN_HAS_AVX512_MATH, + HasNdtri = EIGEN_HAS_AVX512_MATH, HasSin = EIGEN_FAST_MATH, HasCos = EIGEN_FAST_MATH, HasTanh = EIGEN_FAST_MATH, @@ -86,9 +95,7 @@ struct packet_traits : default_packet_traits { HasRound = 1, HasFloor = 1, HasCeil = 1, - HasRint = 1, - HasBessel = 1, - HasNdtri = 1 + HasRint = 1 }; }; @@ -109,7 +116,7 @@ template<> struct packet_traits : default_packet_traits HasBlend = 0, HasSin = EIGEN_FAST_MATH, HasCos = EIGEN_FAST_MATH, -#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT) +#if EIGEN_HAS_AVX512_MATH HasLog = 1, HasLog1p = 1, HasExpm1 = 1, @@ -138,7 +145,7 @@ template<> struct packet_traits : default_packet_traits AlignedOnScalar = 1, size = 8, HasHalfPacket = 1, -#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT) +#if EIGEN_HAS_AVX512_MATH HasLog = 1, HasExp = 1, HasSqrt = EIGEN_FAST_MATH, @@ -289,11 +296,20 @@ EIGEN_STRONG_INLINE Packet16i psub(const Packet16i& a, template <> EIGEN_STRONG_INLINE Packet16f pnegate(const Packet16f& a) { - return _mm512_sub_ps(_mm512_set1_ps(0.0), a); + // NOTE: MSVC seems to struggle with _mm512_set1_epi32, leading to random results. + // The intel docs give it a relatively high latency as well, so we're probably + // better off with using _mm512_set_epi32 directly anyways. 
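+  // Illustrative scalar analogue (assuming IEEE-754 binary32): flipping the sign bit negates a float,
+  //   uint32_t b; std::memcpy(&b, &x, 4); b ^= 0x80000000u; std::memcpy(&x, &b, 4);  // x becomes -x
+  // The intrinsics below apply the same sign-bit XOR to all 16 lanes at once via a 0x80000000 mask.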
+ const __m512i mask = _mm512_set_epi32(0x80000000,0x80000000,0x80000000,0x80000000, + 0x80000000,0x80000000,0x80000000,0x80000000, + 0x80000000,0x80000000,0x80000000,0x80000000, + 0x80000000,0x80000000,0x80000000,0x80000000); + return _mm512_castsi512_ps(_mm512_xor_epi32(_mm512_castps_si512(a), mask)); } template <> EIGEN_STRONG_INLINE Packet8d pnegate(const Packet8d& a) { - return _mm512_sub_pd(_mm512_set1_pd(0.0), a); + const __m512i mask = _mm512_set_epi64(0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL, + 0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL); + return _mm512_castsi512_pd(_mm512_xor_epi64(_mm512_castpd_si512(a), mask)); } template <> @@ -686,7 +702,7 @@ EIGEN_STRONG_INLINE Packet8d pload(const double* from) { template <> EIGEN_STRONG_INLINE Packet16i pload(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_si512( - reinterpret_cast(from)); + reinterpret_cast(from)); } template <> @@ -1426,60 +1442,11 @@ ploadquad(const Eigen::half* from) { } EIGEN_STRONG_INLINE Packet16f half2float(const Packet16h& a) { -#ifdef EIGEN_HAS_FP16_C return _mm512_cvtph_ps(a); -#else - EIGEN_ALIGN64 half aux[16]; - pstore(aux, a); - float f0(aux[0]); - float f1(aux[1]); - float f2(aux[2]); - float f3(aux[3]); - float f4(aux[4]); - float f5(aux[5]); - float f6(aux[6]); - float f7(aux[7]); - float f8(aux[8]); - float f9(aux[9]); - float fa(aux[10]); - float fb(aux[11]); - float fc(aux[12]); - float fd(aux[13]); - float fe(aux[14]); - float ff(aux[15]); - - return _mm512_set_ps( - ff, fe, fd, fc, fb, fa, f9, f8, f7, f6, f5, f4, f3, f2, f1, f0); -#endif } EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) { -#ifdef EIGEN_HAS_FP16_C return _mm512_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC); -#else - EIGEN_ALIGN64 float aux[16]; - pstore(aux, a); - half h0(aux[0]); - half h1(aux[1]); - half h2(aux[2]); - half h3(aux[3]); - half h4(aux[4]); - half h5(aux[5]); - half h6(aux[6]); - half h7(aux[7]); - half h8(aux[8]); - half h9(aux[9]); - half ha(aux[10]); - half hb(aux[11]); - half hc(aux[12]); - half hd(aux[13]); - half he(aux[14]); - half hf(aux[15]); - - return _mm256_set_epi16( - hf.x, he.x, hd.x, hc.x, hb.x, ha.x, h9.x, h8.x, - h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x); -#endif } template<> EIGEN_STRONG_INLINE Packet16h ptrue(const Packet16h& a) { @@ -1852,7 +1819,7 @@ struct packet_traits : default_packet_traits { HasInsert = 1, HasSin = EIGEN_FAST_MATH, HasCos = EIGEN_FAST_MATH, -#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT) +#if EIGEN_HAS_AVX512_MATH #ifdef EIGEN_VECTORIZE_AVX512DQ HasLog = 1, // Currently fails test with bad accuracy. 
HasLog1p = 1, diff --git a/Eigen/src/Core/arch/AltiVec/Complex.h b/Eigen/src/Core/arch/AltiVec/Complex.h index f424f11..b05d0d3 100644 --- a/Eigen/src/Core/arch/AltiVec/Complex.h +++ b/Eigen/src/Core/arch/AltiVec/Complex.h @@ -15,8 +15,10 @@ namespace Eigen { namespace internal { -static Packet4ui p4ui_CONJ_XOR = vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_MZERO);//{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 }; -#ifdef __VSX__ +inline Packet4ui p4ui_CONJ_XOR() { + return vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_MZERO);//{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 }; +} +#ifdef EIGEN_VECTORIZE_VSX #if defined(_BIG_ENDIAN) static Packet2ul p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2d_MZERO, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 }; static Packet2ul p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO, (Packet4ui) p2d_MZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 }; @@ -44,7 +46,7 @@ struct Packet2cf v1 = vec_madd(v1, b.v, p4f_ZERO); // multiply a_im * b and get the conjugate result v2 = vec_madd(v2, b.v, p4f_ZERO); - v2 = reinterpret_cast(pxor(v2, reinterpret_cast(p4ui_CONJ_XOR))); + v2 = reinterpret_cast(pxor(v2, reinterpret_cast(p4ui_CONJ_XOR()))); // permute back to a proper order v2 = vec_perm(v2, v2, p16uc_COMPLEX32_REV); @@ -100,7 +102,8 @@ template<> struct packet_traits > : default_packet_traits HasAbs2 = 0, HasMin = 0, HasMax = 0, -#ifdef __VSX__ + HasSqrt = 1, +#ifdef EIGEN_VECTORIZE_VSX HasBlend = 1, #endif HasSetLinear = 0 @@ -127,20 +130,20 @@ template<> EIGEN_STRONG_INLINE Packet2cf ploaddup(const std::complex< template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet2cf& from) { pstore((float*)to, from.v); } template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet2cf& from) { pstoreu((float*)to, from.v); } -EIGEN_STRONG_INLINE Packet2cf pload2(const std::complex* from0, const std::complex* from1) +EIGEN_STRONG_INLINE Packet2cf pload2(const std::complex& from0, const std::complex& from1) { Packet4f res0, res1; -#ifdef __VSX__ - __asm__ ("lxsdx %x0,%y1" : "=wa" (res0) : "Z" (*from0)); - __asm__ ("lxsdx %x0,%y1" : "=wa" (res1) : "Z" (*from1)); +#ifdef EIGEN_VECTORIZE_VSX + __asm__ ("lxsdx %x0,%y1" : "=wa" (res0) : "Z" (from0)); + __asm__ ("lxsdx %x0,%y1" : "=wa" (res1) : "Z" (from1)); #ifdef _BIG_ENDIAN __asm__ ("xxpermdi %x0, %x1, %x2, 0" : "=wa" (res0) : "wa" (res0), "wa" (res1)); #else __asm__ ("xxpermdi %x0, %x2, %x1, 0" : "=wa" (res0) : "wa" (res0), "wa" (res1)); #endif #else - *reinterpret_cast *>(&res0) = *from0; - *reinterpret_cast *>(&res1) = *from1; + *reinterpret_cast *>(&res0) = from0; + *reinterpret_cast *>(&res1) = from1; res0 = vec_perm(res0, res1, p16uc_TRANSPOSE64_HI); #endif return Packet2cf(res0); @@ -164,7 +167,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf template<> EIGEN_STRONG_INLINE Packet2cf padd(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(a.v + b.v); } template<> EIGEN_STRONG_INLINE Packet2cf psub(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(a.v - b.v); } template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(a.v)); } -template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { return Packet2cf(pxor(a.v, reinterpret_cast(p4ui_CONJ_XOR))); } +template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { return Packet2cf(pxor(a.v, reinterpret_cast(p4ui_CONJ_XOR()))); } template<> EIGEN_STRONG_INLINE Packet2cf pand (const Packet2cf& a, const 
Packet2cf& b) { return Packet2cf(pand(a.v, b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf por (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(por(a.v, b.v)); } @@ -210,10 +213,7 @@ EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) { - // TODO optimize it for AltiVec - Packet2cf res = pmul(a, pconj(b)); - Packet4f s = pmul(b.v, b.v); - return Packet2cf(pdiv(res.v, padd(s, vec_perm(s, s, p16uc_COMPLEX32_REV)))); + return pdiv_complex(a, b); } template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip(const Packet2cf& x) @@ -233,21 +233,21 @@ template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packe return Packet2cf(vec_and(eq, vec_perm(eq, eq, p16uc_COMPLEX32_REV))); } -#ifdef __VSX__ +#ifdef EIGEN_VECTORIZE_VSX template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) { Packet2cf result; result.v = reinterpret_cast(pblend(ifPacket, reinterpret_cast(thenPacket.v), reinterpret_cast(elsePacket.v))); return result; } -#endif template<> EIGEN_STRONG_INLINE Packet2cf psqrt(const Packet2cf& a) { return psqrt_complex(a); } +#endif //---------- double ---------- -#ifdef __VSX__ +#ifdef EIGEN_VECTORIZE_VSX struct Packet1cd { EIGEN_STRONG_INLINE Packet1cd() {} @@ -320,6 +320,7 @@ template<> struct packet_traits > : default_packet_traits HasAbs2 = 0, HasMin = 0, HasMax = 0, + HasSqrt = 1, HasSetLinear = 0 }; }; @@ -375,10 +376,7 @@ EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d) template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) { - // TODO optimize it for AltiVec - Packet1cd res = pmul(a,pconj(b)); - Packet2d s = pmul(b.v, b.v); - return Packet1cd(pdiv(res.v, padd(s, vec_perm(s, s, p16uc_REVERSE64)))); + return pdiv_complex(a, b); } EIGEN_STRONG_INLINE Packet1cd pcplxflip/**/(const Packet1cd& x) @@ -409,7 +407,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd psqrt(const Packet1cd& a) return psqrt_complex(a); } -#endif // __VSX__ +#endif // EIGEN_VECTORIZE_VSX } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/AltiVec/MathFunctions.h b/Eigen/src/Core/arch/AltiVec/MathFunctions.h index 3a7a329..e3d7616 100644 --- a/Eigen/src/Core/arch/AltiVec/MathFunctions.h +++ b/Eigen/src/Core/arch/AltiVec/MathFunctions.h @@ -40,16 +40,14 @@ Packet4f pcos(const Packet4f& _x) return pcos_float(_x); } +#ifdef EIGEN_VECTORIZE_VSX #ifndef EIGEN_COMP_CLANG template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f prsqrt(const Packet4f& x) { return vec_rsqrt(x); } -#endif -#ifdef __VSX__ -#ifndef EIGEN_COMP_CLANG template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d prsqrt(const Packet2d& x) { @@ -57,7 +55,7 @@ Packet2d prsqrt(const Packet2d& x) } #endif -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f psqrt(const Packet4f& x) { return vec_sqrt(x); @@ -69,12 +67,43 @@ Packet2d psqrt(const Packet2d& x) return vec_sqrt(x); } -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +#if !EIGEN_COMP_CLANG +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet4f prsqrt(const Packet4f& x) +{ + return pset1(1.0f) / psqrt(x); +// vec_rsqrt returns different results from the generic version +// return vec_rsqrt(x); +} + +template<> 
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet2d prsqrt(const Packet2d& x) +{ + return pset1(1.0) / psqrt(x); +// vec_rsqrt returns different results from the generic version +// return vec_rsqrt(x); +} +#endif + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2d pexp(const Packet2d& _x) { return pexp_double(_x); } -#endif + +template<> EIGEN_STRONG_INLINE Packet8bf psqrt (const Packet8bf& a){ + BF16_TO_F32_UNARY_OP_WRAPPER(vec_sqrt, a); +} + +template<> EIGEN_STRONG_INLINE Packet8bf prsqrt (const Packet8bf& a){ + BF16_TO_F32_UNARY_OP_WRAPPER(prsqrt, a); +} + +template<> EIGEN_STRONG_INLINE Packet8bf pexp (const Packet8bf& a){ + BF16_TO_F32_UNARY_OP_WRAPPER(pexp_float, a); +} + +#endif // EIGEN_VECTORIZE_VSX // Hyperbolic Tangent function. template <> diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h index 3f79b97..ea77496 100644 --- a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +++ b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h @@ -17,24 +17,35 @@ #include "MatrixProductCommon.h" -// Since LLVM doesn't support dynamic dispatching, force either always MMA or VSX -#if EIGEN_COMP_LLVM -#if !defined(EIGEN_ALTIVEC_DISABLE_MMA) && !defined(EIGEN_ALTIVEC_MMA_ONLY) -#ifdef __MMA__ -#define EIGEN_ALTIVEC_MMA_ONLY -#else -#define EIGEN_ALTIVEC_DISABLE_MMA -#endif -#endif +#if !defined(EIGEN_ALTIVEC_DISABLE_MMA) +#define EIGEN_ALTIVEC_DISABLE_MMA 0 #endif -#ifdef __has_builtin +// Check for MMA builtin support. +#if !EIGEN_ALTIVEC_DISABLE_MMA && defined(__has_builtin) #if __has_builtin(__builtin_mma_assemble_acc) - #define ALTIVEC_MMA_SUPPORT + #define EIGEN_ALTIVEC_MMA_SUPPORT #endif #endif -#if defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) +// Check if and how we should actually use MMA if supported. +#if defined(EIGEN_ALTIVEC_MMA_SUPPORT) + +#if !defined(EIGEN_ALTIVEC_ENABLE_MMA_DYNAMIC_DISPATCH) +#define EIGEN_ALTIVEC_ENABLE_MMA_DYNAMIC_DISPATCH 0 +#endif + +// Check if we want to enable dynamic dispatch. Not supported by LLVM. +#if EIGEN_ALTIVEC_ENABLE_MMA_DYNAMIC_DISPATCH && !EIGEN_COMP_LLVM +#define EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH 1 +// Otherwise, use MMA by default if available. +#elif defined(__MMA__) +#define EIGEN_ALTIVEC_MMA_ONLY 1 +#endif + +#endif // EIGEN_ALTIVEC_MMA_SUPPORT + +#if defined(EIGEN_ALTIVEC_MMA_ONLY) || defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) #include "MatrixProductMMA.h" #endif @@ -164,24 +175,23 @@ EIGEN_STRONG_INLINE void symm_pack_complex_rhs_helper(std::complex* bloc rir += vectorDelta; } - if (j < cols) + + for(; j < cols; j++) { - rii = rir + ((cols - j) * rows); + rii = rir + rows; for(Index i = k2; i < depth; i++) { - Index k = j; - for(; k < cols; k++) - { - std::complex v = getAdjointVal(i, k, rhs); + std::complex v = getAdjointVal(i, j, rhs); - blockBf[rir] = v.real(); - blockBf[rii] = v.imag(); + blockBf[rir] = v.real(); + blockBf[rii] = v.imag(); - rir += 1; - rii += 1; - } + rir += 1; + rii += 1; } + + rir += rows; } } @@ -260,19 +270,15 @@ EIGEN_STRONG_INLINE void symm_pack_rhs_helper(Scalar* blockB, const Scalar* _rhs } } - if (j < cols) + for(; j < cols; j++) { for(Index i = k2; i < depth; i++) { - Index k = j; - for(; k < cols; k++) - { - if(k <= i) - blockB[ri] = rhs(i, k); - else - blockB[ri] = rhs(k, i); - ri += 1; - } + if(j <= i) + blockB[ri] = rhs(i, j); + else + blockB[ri] = rhs(j, i); + ri += 1; } } } @@ -406,22 +412,18 @@ struct symm_pack_lhs * and offset and behaves accordingly. 
**/ -template -EIGEN_ALWAYS_INLINE void storeBlock(Scalar* to, PacketBlock& block) -{ - const Index size = 16 / sizeof(Scalar); - pstore(to + (0 * size), block.packet[0]); - pstore(to + (1 * size), block.packet[1]); - pstore(to + (2 * size), block.packet[2]); - pstore(to + (3 * size), block.packet[3]); -} - -template -EIGEN_ALWAYS_INLINE void storeBlock(Scalar* to, PacketBlock& block) +template +EIGEN_ALWAYS_INLINE void storeBlock(Scalar* to, PacketBlock& block) { const Index size = 16 / sizeof(Scalar); pstore(to + (0 * size), block.packet[0]); pstore(to + (1 * size), block.packet[1]); + if (N > 2) { + pstore(to + (2 * size), block.packet[2]); + } + if (N > 3) { + pstore(to + (3 * size), block.packet[3]); + } } // General template for lhs & rhs complex packing. @@ -447,9 +449,9 @@ struct dhs_cpack { PacketBlock cblock; if (UseLhs) { - bload(cblock, lhs, j, i); + bload(cblock, lhs, j, i); } else { - bload(cblock, lhs, i, j); + bload(cblock, lhs, i, j); } blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[4].v, p16uc_GETREAL32); @@ -476,8 +478,8 @@ struct dhs_cpack { ptranspose(blocki); } - storeBlock(blockAt + rir, blockr); - storeBlock(blockAt + rii, blocki); + storeBlock(blockAt + rir, blockr); + storeBlock(blockAt + rii, blocki); rir += 4*vectorSize; rii += 4*vectorSize; @@ -497,21 +499,12 @@ struct dhs_cpack { cblock.packet[1] = lhs.template loadPacket(i, j + 2); } } else { - std::complex lhs0, lhs1; if (UseLhs) { - lhs0 = lhs(j + 0, i); - lhs1 = lhs(j + 1, i); - cblock.packet[0] = pload2(&lhs0, &lhs1); - lhs0 = lhs(j + 2, i); - lhs1 = lhs(j + 3, i); - cblock.packet[1] = pload2(&lhs0, &lhs1); + cblock.packet[0] = pload2(lhs(j + 0, i), lhs(j + 1, i)); + cblock.packet[1] = pload2(lhs(j + 2, i), lhs(j + 3, i)); } else { - lhs0 = lhs(i, j + 0); - lhs1 = lhs(i, j + 1); - cblock.packet[0] = pload2(&lhs0, &lhs1); - lhs0 = lhs(i, j + 2); - lhs1 = lhs(i, j + 3); - cblock.packet[1] = pload2(&lhs0, &lhs1); + cblock.packet[0] = pload2(lhs(i, j + 0), lhs(i, j + 1)); + cblock.packet[1] = pload2(lhs(i, j + 2), lhs(i, j + 3)); } } @@ -533,34 +526,50 @@ struct dhs_cpack { rir += ((PanelMode) ? (vectorSize*(2*stride - depth)) : vectorDelta); } - if (j < rows) + if (!UseLhs) { - if(PanelMode) rir += (offset*(rows - j - vectorSize)); - rii = rir + (((PanelMode) ? stride : depth) * (rows - j)); + if(PanelMode) rir -= (offset*(vectorSize - 1)); - for(Index i = 0; i < depth; i++) + for(; j < rows; j++) { - Index k = j; - for(; k < rows; k++) + rii = rir + ((PanelMode) ? stride : depth); + + for(Index i = 0; i < depth; i++) { - if (UseLhs) { + blockAt[rir] = lhs(i, j).real(); + + if(Conjugate) + blockAt[rii] = -lhs(i, j).imag(); + else + blockAt[rii] = lhs(i, j).imag(); + + rir += 1; + rii += 1; + } + + rir += ((PanelMode) ? (2*stride - depth) : depth); + } + } else { + if (j < rows) + { + if(PanelMode) rir += (offset*(rows - j - vectorSize)); + rii = rir + (((PanelMode) ? 
stride : depth) * (rows - j)); + + for(Index i = 0; i < depth; i++) + { + Index k = j; + for(; k < rows; k++) + { blockAt[rir] = lhs(k, i).real(); if(Conjugate) blockAt[rii] = -lhs(k, i).imag(); else blockAt[rii] = lhs(k, i).imag(); - } else { - blockAt[rir] = lhs(i, k).real(); - if(Conjugate) - blockAt[rii] = -lhs(i, k).imag(); - else - blockAt[rii] = lhs(i, k).imag(); + rir += 1; + rii += 1; } - - rir += 1; - rii += 1; } } } @@ -586,16 +595,16 @@ struct dhs_pack{ PacketBlock block; if (UseLhs) { - bload(block, lhs, j, i); + bload(block, lhs, j, i); } else { - bload(block, lhs, i, j); + bload(block, lhs, i, j); } if(((StorageOrder == RowMajor) && UseLhs) || ((StorageOrder == ColMajor) && !UseLhs)) { ptranspose(block); } - storeBlock(blockA + ri, block); + storeBlock(blockA + ri, block); ri += 4*vectorSize; } @@ -630,22 +639,34 @@ struct dhs_pack{ if(PanelMode) ri += vectorSize*(stride - offset - depth); } - if (j < rows) + if (!UseLhs) { - if(PanelMode) ri += offset*(rows - j); + if(PanelMode) ri += offset; - for(Index i = 0; i < depth; i++) + for(; j < rows; j++) { - Index k = j; - for(; k < rows; k++) + for(Index i = 0; i < depth; i++) { - if (UseLhs) { - blockA[ri] = lhs(k, i); - } else { - blockA[ri] = lhs(i, k); - } + blockA[ri] = lhs(i, j); ri += 1; } + + if(PanelMode) ri += stride - depth; + } + } else { + if (j < rows) + { + if(PanelMode) ri += offset*(rows - j); + + for(Index i = 0; i < depth; i++) + { + Index k = j; + for(; k < rows; k++) + { + blockA[ri] = lhs(k, i); + ri += 1; + } + } } } } @@ -680,7 +701,7 @@ struct dhs_pack(j, i + 1); } - storeBlock(blockA + ri, block); + storeBlock(blockA + ri, block); ri += 2*vectorSize; } @@ -757,7 +778,7 @@ struct dhs_pack(i + 1, j + 0); //[b1 b2] block.packet[3] = rhs.template loadPacket(i + 1, j + 2); //[b3 b4] - storeBlock(blockB + ri, block); + storeBlock(blockB + ri, block); } ri += 4*vectorSize; @@ -788,19 +809,17 @@ struct dhs_pack(blockAt + rir, blockr); - storeBlock(blockAt + rii, blocki); + storeBlock(blockAt + rir, blockr); + storeBlock(blockAt + rii, blocki); rir += 2*vectorSize; rii += 2*vectorSize; @@ -941,7 +960,7 @@ struct dhs_cpack cblock; PacketBlock blockr, blocki; - bload(cblock, rhs, i, j); + bload(cblock, rhs, i, j); blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETREAL64); blockr.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETREAL64); @@ -955,8 +974,8 @@ struct dhs_cpack(blockBt + rir, blockr); - storeBlock(blockBt + rii, blocki); + storeBlock(blockBt + rir, blockr); + storeBlock(blockBt + rii, blocki); rir += 2*vectorSize; rii += 2*vectorSize; @@ -965,27 +984,26 @@ struct dhs_cpack -EIGEN_ALWAYS_INLINE void pger_common(PacketBlock* acc, const Packet& lhsV, const Packet* rhsV) -{ - if(NegativeAccumulate) - { - acc->packet[0] = vec_nmsub(lhsV, rhsV[0], acc->packet[0]); - acc->packet[1] = vec_nmsub(lhsV, rhsV[1], acc->packet[1]); - acc->packet[2] = vec_nmsub(lhsV, rhsV[2], acc->packet[2]); - acc->packet[3] = vec_nmsub(lhsV, rhsV[3], acc->packet[3]); - } else { - acc->packet[0] = vec_madd(lhsV, rhsV[0], acc->packet[0]); - acc->packet[1] = vec_madd(lhsV, rhsV[1], acc->packet[1]); - acc->packet[2] = vec_madd(lhsV, rhsV[2], acc->packet[2]); - acc->packet[3] = vec_madd(lhsV, rhsV[3], acc->packet[3]); - } -} - -template -EIGEN_ALWAYS_INLINE void pger_common(PacketBlock* acc, const Packet& lhsV, const Packet* rhsV) +template +EIGEN_ALWAYS_INLINE void pger_common(PacketBlock* acc, const Packet& lhsV, const Packet* rhsV) { if(NegativeAccumulate) { acc->packet[0] = 
vec_nmsub(lhsV, rhsV[0], acc->packet[0]); + if (N > 1) { + acc->packet[1] = vec_nmsub(lhsV, rhsV[1], acc->packet[1]); + } + if (N > 2) { + acc->packet[2] = vec_nmsub(lhsV, rhsV[2], acc->packet[2]); + } + if (N > 3) { + acc->packet[3] = vec_nmsub(lhsV, rhsV[3], acc->packet[3]); + } } else { acc->packet[0] = vec_madd(lhsV, rhsV[0], acc->packet[0]); + if (N > 1) { + acc->packet[1] = vec_madd(lhsV, rhsV[1], acc->packet[1]); + } + if (N > 2) { + acc->packet[2] = vec_madd(lhsV, rhsV[2], acc->packet[2]); + } + if (N > 3) { + acc->packet[3] = vec_madd(lhsV, rhsV[3], acc->packet[3]); + } } } @@ -1028,11 +1047,11 @@ EIGEN_ALWAYS_INLINE void pger(PacketBlock* acc, const Scalar* lhs, con { Packet lhsV = pload(lhs); - pger_common(acc, lhsV, rhsV); + pger_common(acc, lhsV, rhsV); } -template -EIGEN_ALWAYS_INLINE void loadPacketRemaining(const Scalar* lhs, Packet &lhsV, Index remaining_rows) +template +EIGEN_ALWAYS_INLINE void loadPacketRemaining(const Scalar* lhs, Packet &lhsV) { #ifdef _ARCH_PWR9 lhsV = vec_xl_len((Scalar *)lhs, remaining_rows * sizeof(Scalar)); @@ -1044,32 +1063,32 @@ EIGEN_ALWAYS_INLINE void loadPacketRemaining(const Scalar* lhs, Packet &lhsV, In #endif } -template -EIGEN_ALWAYS_INLINE void pger(PacketBlock* acc, const Scalar* lhs, const Packet* rhsV, Index remaining_rows) +template +EIGEN_ALWAYS_INLINE void pger(PacketBlock* acc, const Scalar* lhs, const Packet* rhsV) { Packet lhsV; - loadPacketRemaining(lhs, lhsV, remaining_rows); + loadPacketRemaining(lhs, lhsV); - pger_common(acc, lhsV, rhsV); + pger_common(acc, lhsV, rhsV); } // 512-bits rank1-update of complex acc. It takes decoupled accumulators as entries. It also takes cares of mixed types real * complex and complex * real. template EIGEN_ALWAYS_INLINE void pgerc_common(PacketBlock* accReal, PacketBlock* accImag, const Packet &lhsV, const Packet &lhsVi, const Packet* rhsV, const Packet* rhsVi) { - pger_common(accReal, lhsV, rhsV); + pger_common(accReal, lhsV, rhsV); if(LhsIsReal) { - pger_common(accImag, lhsV, rhsVi); + pger_common(accImag, lhsV, rhsVi); EIGEN_UNUSED_VARIABLE(lhsVi); } else { if (!RhsIsReal) { - pger_common(accReal, lhsVi, rhsVi); - pger_common(accImag, lhsV, rhsVi); + pger_common(accReal, lhsVi, rhsVi); + pger_common(accImag, lhsV, rhsVi); } else { EIGEN_UNUSED_VARIABLE(rhsVi); } - pger_common(accImag, lhsVi, rhsV); + pger_common(accImag, lhsVi, rhsV); } } @@ -1084,8 +1103,8 @@ EIGEN_ALWAYS_INLINE void pgerc(PacketBlock* accReal, PacketBlock(accReal, accImag, lhsV, lhsVi, rhsV, rhsVi); } -template -EIGEN_ALWAYS_INLINE void loadPacketRemaining(const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, Packet &lhsV, Packet &lhsVi, Index remaining_rows) +template +EIGEN_ALWAYS_INLINE void loadPacketRemaining(const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, Packet &lhsV, Packet &lhsVi) { #ifdef _ARCH_PWR9 lhsV = vec_xl_len((Scalar *)lhs_ptr, remaining_rows * sizeof(Scalar)); @@ -1101,11 +1120,11 @@ EIGEN_ALWAYS_INLINE void loadPacketRemaining(const Scalar* lhs_ptr, const Scalar #endif } -template -EIGEN_ALWAYS_INLINE void pgerc(PacketBlock* accReal, PacketBlock* accImag, const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, const Packet* rhsV, const Packet* rhsVi, Index remaining_rows) +template +EIGEN_ALWAYS_INLINE void pgerc(PacketBlock* accReal, PacketBlock* accImag, const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, const Packet* rhsV, const Packet* rhsVi) { Packet lhsV, lhsVi; - loadPacketRemaining(lhs_ptr, lhs_ptr_imag, lhsV, lhsVi, remaining_rows); + loadPacketRemaining(lhs_ptr, lhs_ptr_imag, lhsV, lhsVi); 
pgerc_common(accReal, accImag, lhsV, lhsVi, rhsV, rhsVi); } @@ -1117,132 +1136,142 @@ EIGEN_ALWAYS_INLINE Packet ploadLhs(const Scalar* lhs) } // Zero the accumulator on PacketBlock. -template -EIGEN_ALWAYS_INLINE void bsetzero(PacketBlock& acc) -{ - acc.packet[0] = pset1((Scalar)0); - acc.packet[1] = pset1((Scalar)0); - acc.packet[2] = pset1((Scalar)0); - acc.packet[3] = pset1((Scalar)0); -} - -template -EIGEN_ALWAYS_INLINE void bsetzero(PacketBlock& acc) +template +EIGEN_ALWAYS_INLINE void bsetzero(PacketBlock& acc) { acc.packet[0] = pset1((Scalar)0); + if (N > 1) { + acc.packet[1] = pset1((Scalar)0); + } + if (N > 2) { + acc.packet[2] = pset1((Scalar)0); + } + if (N > 3) { + acc.packet[3] = pset1((Scalar)0); + } } // Scale the PacketBlock vectors by alpha. -template -EIGEN_ALWAYS_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha) +template +EIGEN_ALWAYS_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha) { acc.packet[0] = pmadd(pAlpha, accZ.packet[0], acc.packet[0]); - acc.packet[1] = pmadd(pAlpha, accZ.packet[1], acc.packet[1]); - acc.packet[2] = pmadd(pAlpha, accZ.packet[2], acc.packet[2]); - acc.packet[3] = pmadd(pAlpha, accZ.packet[3], acc.packet[3]); + if (N > 1) { + acc.packet[1] = pmadd(pAlpha, accZ.packet[1], acc.packet[1]); + } + if (N > 2) { + acc.packet[2] = pmadd(pAlpha, accZ.packet[2], acc.packet[2]); + } + if (N > 3) { + acc.packet[3] = pmadd(pAlpha, accZ.packet[3], acc.packet[3]); + } } -template -EIGEN_ALWAYS_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha) -{ - acc.packet[0] = pmadd(pAlpha, accZ.packet[0], acc.packet[0]); -} - -template -EIGEN_ALWAYS_INLINE void bscalec_common(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha) -{ - acc.packet[0] = pmul(accZ.packet[0], pAlpha); - acc.packet[1] = pmul(accZ.packet[1], pAlpha); - acc.packet[2] = pmul(accZ.packet[2], pAlpha); - acc.packet[3] = pmul(accZ.packet[3], pAlpha); -} - -template -EIGEN_ALWAYS_INLINE void bscalec_common(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha) +template +EIGEN_ALWAYS_INLINE void bscalec_common(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha) { acc.packet[0] = pmul(accZ.packet[0], pAlpha); + if (N > 1) { + acc.packet[1] = pmul(accZ.packet[1], pAlpha); + } + if (N > 2) { + acc.packet[2] = pmul(accZ.packet[2], pAlpha); + } + if (N > 3) { + acc.packet[3] = pmul(accZ.packet[3], pAlpha); + } } // Complex version of PacketBlock scaling. 
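// For reference: with a complex scale factor alpha = bReal + i*bImag and decoupled accumulators
// a = aReal + i*aImag, the scaled result is
//   cReal = aReal*bReal - aImag*bImag,   cImag = aImag*bReal + aReal*bImag,
// which is what the bscalec_common/pger_common calls below compute lane-wise on each packet.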
template EIGEN_ALWAYS_INLINE void bscalec(PacketBlock& aReal, PacketBlock& aImag, const Packet& bReal, const Packet& bImag, PacketBlock& cReal, PacketBlock& cImag) { - bscalec_common(cReal, aReal, bReal); + bscalec_common(cReal, aReal, bReal); - bscalec_common(cImag, aImag, bReal); + bscalec_common(cImag, aImag, bReal); - pger_common(&cReal, bImag, aImag.packet); + pger_common(&cReal, bImag, aImag.packet); - pger_common(&cImag, bImag, aReal.packet); + pger_common(&cImag, bImag, aReal.packet); } -template -EIGEN_ALWAYS_INLINE void band(PacketBlock& acc, const Packet& pMask) +template +EIGEN_ALWAYS_INLINE void band(PacketBlock& acc, const Packet& pMask) { acc.packet[0] = pand(acc.packet[0], pMask); - acc.packet[1] = pand(acc.packet[1], pMask); - acc.packet[2] = pand(acc.packet[2], pMask); - acc.packet[3] = pand(acc.packet[3], pMask); + if (N > 1) { + acc.packet[1] = pand(acc.packet[1], pMask); + } + if (N > 2) { + acc.packet[2] = pand(acc.packet[2], pMask); + } + if (N > 3) { + acc.packet[3] = pand(acc.packet[3], pMask); + } } -template -EIGEN_ALWAYS_INLINE void bscalec(PacketBlock& aReal, PacketBlock& aImag, const Packet& bReal, const Packet& bImag, PacketBlock& cReal, PacketBlock& cImag, const Packet& pMask) +template +EIGEN_ALWAYS_INLINE void bscalec(PacketBlock& aReal, PacketBlock& aImag, const Packet& bReal, const Packet& bImag, PacketBlock& cReal, PacketBlock& cImag, const Packet& pMask) { - band(aReal, pMask); - band(aImag, pMask); + band(aReal, pMask); + band(aImag, pMask); - bscalec(aReal, aImag, bReal, bImag, cReal, cImag); + bscalec(aReal, aImag, bReal, bImag, cReal, cImag); } // Load a PacketBlock, the N parameters make tunning gemm easier so we can add more accumulators as needed. -template -EIGEN_ALWAYS_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col) +template +EIGEN_ALWAYS_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col) { if (StorageOrder == RowMajor) { - acc.packet[0] = res.template loadPacket(row + 0, col + N*accCols); - acc.packet[1] = res.template loadPacket(row + 1, col + N*accCols); - acc.packet[2] = res.template loadPacket(row + 2, col + N*accCols); - acc.packet[3] = res.template loadPacket(row + 3, col + N*accCols); + acc.packet[0] = res.template loadPacket(row + 0, col); + if (N > 1) { + acc.packet[1] = res.template loadPacket(row + 1, col); + } + if (N > 2) { + acc.packet[2] = res.template loadPacket(row + 2, col); + } + if (N > 3) { + acc.packet[3] = res.template loadPacket(row + 3, col); + } + if (Complex) { + acc.packet[0+N] = res.template loadPacket(row + 0, col + accCols); + if (N > 1) { + acc.packet[1+N] = res.template loadPacket(row + 1, col + accCols); + } + if (N > 2) { + acc.packet[2+N] = res.template loadPacket(row + 2, col + accCols); + } + if (N > 3) { + acc.packet[3+N] = res.template loadPacket(row + 3, col + accCols); + } + } } else { - acc.packet[0] = res.template loadPacket(row + N*accCols, col + 0); - acc.packet[1] = res.template loadPacket(row + N*accCols, col + 1); - acc.packet[2] = res.template loadPacket(row + N*accCols, col + 2); - acc.packet[3] = res.template loadPacket(row + N*accCols, col + 3); + acc.packet[0] = res.template loadPacket(row, col + 0); + if (N > 1) { + acc.packet[1] = res.template loadPacket(row, col + 1); + } + if (N > 2) { + acc.packet[2] = res.template loadPacket(row, col + 2); + } + if (N > 3) { + acc.packet[3] = res.template loadPacket(row, col + 3); + } + if (Complex) { + acc.packet[0+N] = res.template loadPacket(row + accCols, col + 0); + if (N > 1) { + 
acc.packet[1+N] = res.template loadPacket(row + accCols, col + 1); + } + if (N > 2) { + acc.packet[2+N] = res.template loadPacket(row + accCols, col + 2); + } + if (N > 3) { + acc.packet[3+N] = res.template loadPacket(row + accCols, col + 3); + } + } } } -// An overload of bload when you have a PacketBLock with 8 vectors. -template -EIGEN_ALWAYS_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col) -{ - if (StorageOrder == RowMajor) { - acc.packet[0] = res.template loadPacket(row + 0, col + N*accCols); - acc.packet[1] = res.template loadPacket(row + 1, col + N*accCols); - acc.packet[2] = res.template loadPacket(row + 2, col + N*accCols); - acc.packet[3] = res.template loadPacket(row + 3, col + N*accCols); - acc.packet[4] = res.template loadPacket(row + 0, col + (N+1)*accCols); - acc.packet[5] = res.template loadPacket(row + 1, col + (N+1)*accCols); - acc.packet[6] = res.template loadPacket(row + 2, col + (N+1)*accCols); - acc.packet[7] = res.template loadPacket(row + 3, col + (N+1)*accCols); - } else { - acc.packet[0] = res.template loadPacket(row + N*accCols, col + 0); - acc.packet[1] = res.template loadPacket(row + N*accCols, col + 1); - acc.packet[2] = res.template loadPacket(row + N*accCols, col + 2); - acc.packet[3] = res.template loadPacket(row + N*accCols, col + 3); - acc.packet[4] = res.template loadPacket(row + (N+1)*accCols, col + 0); - acc.packet[5] = res.template loadPacket(row + (N+1)*accCols, col + 1); - acc.packet[6] = res.template loadPacket(row + (N+1)*accCols, col + 2); - acc.packet[7] = res.template loadPacket(row + (N+1)*accCols, col + 3); - } -} - -template -EIGEN_ALWAYS_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col) -{ - acc.packet[0] = res.template loadPacket(row + N*accCols, col + 0); - acc.packet[1] = res.template loadPacket(row + (N+1)*accCols, col + 0); -} - const static Packet4i mask41 = { -1, 0, 0, 0 }; const static Packet4i mask42 = { -1, -1, 0, 0 }; const static Packet4i mask43 = { -1, -1, -1, 0 }; @@ -1273,22 +1302,44 @@ EIGEN_ALWAYS_INLINE Packet2d bmask(const int remaining_rows) } } -template -EIGEN_ALWAYS_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha, const Packet& pMask) +template +EIGEN_ALWAYS_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha, const Packet& pMask) { - band(accZ, pMask); + band(accZ, pMask); - bscale(acc, accZ, pAlpha); + bscale(acc, accZ, pAlpha); } -template -EIGEN_ALWAYS_INLINE void pbroadcast4_old(const __UNPACK_TYPE__(Packet)* a, Packet& a0, Packet& a1, Packet& a2, Packet& a3) +template EIGEN_ALWAYS_INLINE void +pbroadcastN_old(const __UNPACK_TYPE__(Packet) *a, + Packet& a0, Packet& a1, Packet& a2, Packet& a3) { - pbroadcast4(a, a0, a1, a2, a3); + a0 = pset1(a[0]); + if (N > 1) { + a1 = pset1(a[1]); + } else { + EIGEN_UNUSED_VARIABLE(a1); + } + if (N > 2) { + a2 = pset1(a[2]); + } else { + EIGEN_UNUSED_VARIABLE(a2); + } + if (N > 3) { + a3 = pset1(a[3]); + } else { + EIGEN_UNUSED_VARIABLE(a3); + } } template<> -EIGEN_ALWAYS_INLINE void pbroadcast4_old(const double* a, Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3) +EIGEN_ALWAYS_INLINE void pbroadcastN_old(const float* a, Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) +{ + pbroadcast4(a, a0, a1, a2, a3); +} + +template<> +EIGEN_ALWAYS_INLINE void pbroadcastN_old(const double* a, Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3) { a1 = pload(a); a3 = pload(a + 2); @@ -1298,26 +1349,96 @@ EIGEN_ALWAYS_INLINE void pbroadcast4_old(const double* a, 
Packet2d& a0 a3 = vec_splat(a3, 1); } -// PEEL loop factor. -#define PEEL 7 - -template -EIGEN_ALWAYS_INLINE void MICRO_EXTRA_COL( - const Scalar* &lhs_ptr, - const Scalar* &rhs_ptr, - PacketBlock &accZero, - Index remaining_rows, - Index remaining_cols) +template EIGEN_ALWAYS_INLINE void +pbroadcastN(const __UNPACK_TYPE__(Packet) *a, + Packet& a0, Packet& a1, Packet& a2, Packet& a3) { - Packet rhsV[1]; - rhsV[0] = pset1(rhs_ptr[0]); - pger<1,Scalar, Packet, false>(&accZero, lhs_ptr, rhsV); - lhs_ptr += remaining_rows; - rhs_ptr += remaining_cols; + a0 = pset1(a[0]); + if (N > 1) { + a1 = pset1(a[1]); + } else { + EIGEN_UNUSED_VARIABLE(a1); + } + if (N > 2) { + a2 = pset1(a[2]); + } else { + EIGEN_UNUSED_VARIABLE(a2); + } + if (N > 3) { + a3 = pset1(a[3]); + } else { + EIGEN_UNUSED_VARIABLE(a3); + } } -template -EIGEN_STRONG_INLINE void gemm_extra_col( +template<> EIGEN_ALWAYS_INLINE void +pbroadcastN(const float *a, + Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) +{ + a3 = pload(a); + a0 = vec_splat(a3, 0); + a1 = vec_splat(a3, 1); + a2 = vec_splat(a3, 2); + a3 = vec_splat(a3, 3); +} + +// PEEL loop factor. +#define PEEL 7 +#define PEEL_ROW 7 + +#define MICRO_UNROLL_PEEL(func) \ + func(0) func(1) func(2) func(3) func(4) func(5) func(6) func(7) + +#define MICRO_ZERO_PEEL(peel) \ + if ((PEEL_ROW > peel) && (peel != 0)) { \ + bsetzero(accZero##peel); \ + } else { \ + EIGEN_UNUSED_VARIABLE(accZero##peel); \ + } + +#define MICRO_ZERO_PEEL_ROW \ + MICRO_UNROLL_PEEL(MICRO_ZERO_PEEL); + +#define MICRO_WORK_PEEL(peel) \ + if (PEEL_ROW > peel) { \ + pbroadcastN(rhs_ptr + (accRows * peel), rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \ + pger(&accZero##peel, lhs_ptr + (remaining_rows * peel), rhsV##peel); \ + } else { \ + EIGEN_UNUSED_VARIABLE(rhsV##peel); \ + } + +#define MICRO_WORK_PEEL_ROW \ + Packet rhsV0[4], rhsV1[4], rhsV2[4], rhsV3[4], rhsV4[4], rhsV5[4], rhsV6[4], rhsV7[4]; \ + MICRO_UNROLL_PEEL(MICRO_WORK_PEEL); \ + lhs_ptr += (remaining_rows * PEEL_ROW); \ + rhs_ptr += (accRows * PEEL_ROW); + +#define MICRO_ADD_PEEL(peel, sum) \ + if (PEEL_ROW > peel) { \ + for (Index i = 0; i < accRows; i++) { \ + accZero##sum.packet[i] += accZero##peel.packet[i]; \ + } \ + } + +#define MICRO_ADD_PEEL_ROW \ + MICRO_ADD_PEEL(4, 0) MICRO_ADD_PEEL(5, 1) MICRO_ADD_PEEL(6, 2) MICRO_ADD_PEEL(7, 3) \ + MICRO_ADD_PEEL(2, 0) MICRO_ADD_PEEL(3, 1) MICRO_ADD_PEEL(1, 0) + +template +EIGEN_ALWAYS_INLINE void MICRO_EXTRA_ROW( + const Scalar* &lhs_ptr, + const Scalar* &rhs_ptr, + PacketBlock &accZero) +{ + Packet rhsV[4]; + pbroadcastN(rhs_ptr, rhsV[0], rhsV[1], rhsV[2], rhsV[3]); + pger(&accZero, lhs_ptr, rhsV); + lhs_ptr += remaining_rows; + rhs_ptr += accRows; +} + +template +EIGEN_ALWAYS_INLINE void gemm_unrolled_row_iteration( const DataMapper& res, const Scalar* lhs_base, const Scalar* rhs_base, @@ -1326,61 +1447,60 @@ EIGEN_STRONG_INLINE void gemm_extra_col( Index offsetA, Index row, Index col, - Index remaining_rows, - Index remaining_cols, - const Packet& pAlpha) + Index rows, + Index cols, + const Packet& pAlpha, + const Packet& pMask) { const Scalar* rhs_ptr = rhs_base; const Scalar* lhs_ptr = lhs_base + row*strideA + remaining_rows*offsetA; - PacketBlock accZero; + PacketBlock accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7, acc; - bsetzero(accZero); + bsetzero(accZero0); - Index remaining_depth = (depth & -accRows); + Index remaining_depth = (col + quad_traits::rows < cols) ? 
depth : (depth & -quad_traits::rows); Index k = 0; - for(; k + PEEL <= remaining_depth; k+= PEEL) - { - EIGEN_POWER_PREFETCH(rhs_ptr); - EIGEN_POWER_PREFETCH(lhs_ptr); - for (int l = 0; l < PEEL; l++) { - MICRO_EXTRA_COL(lhs_ptr, rhs_ptr, accZero, remaining_rows, remaining_cols); - } + if (remaining_depth >= PEEL_ROW) { + MICRO_ZERO_PEEL_ROW + do + { + EIGEN_POWER_PREFETCH(rhs_ptr); + EIGEN_POWER_PREFETCH(lhs_ptr); + MICRO_WORK_PEEL_ROW + } while ((k += PEEL_ROW) + PEEL_ROW <= remaining_depth); + MICRO_ADD_PEEL_ROW } for(; k < remaining_depth; k++) { - MICRO_EXTRA_COL(lhs_ptr, rhs_ptr, accZero, remaining_rows, remaining_cols); + MICRO_EXTRA_ROW(lhs_ptr, rhs_ptr, accZero0); } - for(; k < depth; k++) + + if ((remaining_depth == depth) && (rows >= accCols)) { - Packet rhsV[1]; - rhsV[0] = pset1(rhs_ptr[0]); - pger<1, Scalar, Packet, Index, false>(&accZero, lhs_ptr, rhsV, remaining_rows); - lhs_ptr += remaining_rows; - rhs_ptr += remaining_cols; - } + bload(acc, res, row, 0); + bscale(acc, accZero0, pAlpha, pMask); + res.template storePacketBlock(row, 0, acc); + } else { + for(; k < depth; k++) + { + Packet rhsV[4]; + pbroadcastN(rhs_ptr, rhsV[0], rhsV[1], rhsV[2], rhsV[3]); + pger(&accZero0, lhs_ptr, rhsV); + lhs_ptr += remaining_rows; + rhs_ptr += accRows; + } - accZero.packet[0] = vec_mul(pAlpha, accZero.packet[0]); - for(Index i = 0; i < remaining_rows; i++) { - res(row + i, col) += accZero.packet[0][i]; + for(Index j = 0; j < accRows; j++) { + accZero0.packet[j] = vec_mul(pAlpha, accZero0.packet[j]); + for(Index i = 0; i < remaining_rows; i++) { + res(row + i, j) += accZero0.packet[j][i]; + } + } } } -template -EIGEN_ALWAYS_INLINE void MICRO_EXTRA_ROW( - const Scalar* &lhs_ptr, - const Scalar* &rhs_ptr, - PacketBlock &accZero, - Index remaining_rows) -{ - Packet rhsV[4]; - pbroadcast4(rhs_ptr, rhsV[0], rhsV[1], rhsV[2], rhsV[3]); - pger<4, Scalar, Packet, false>(&accZero, lhs_ptr, rhsV); - lhs_ptr += remaining_rows; - rhs_ptr += accRows; -} - template -EIGEN_STRONG_INLINE void gemm_extra_row( +EIGEN_ALWAYS_INLINE void gemm_extra_row( const DataMapper& res, const Scalar* lhs_base, const Scalar* rhs_base, @@ -1395,52 +1515,20 @@ EIGEN_STRONG_INLINE void gemm_extra_row( const Packet& pAlpha, const Packet& pMask) { - const Scalar* rhs_ptr = rhs_base; - const Scalar* lhs_ptr = lhs_base + row*strideA + remaining_rows*offsetA; - PacketBlock accZero, acc; - - bsetzero(accZero); - - Index remaining_depth = (col + accRows < cols) ? 
depth : (depth & -accRows); - Index k = 0; - for(; k + PEEL <= remaining_depth; k+= PEEL) - { - EIGEN_POWER_PREFETCH(rhs_ptr); - EIGEN_POWER_PREFETCH(lhs_ptr); - for (int l = 0; l < PEEL; l++) { - MICRO_EXTRA_ROW(lhs_ptr, rhs_ptr, accZero, remaining_rows); - } - } - for(; k < remaining_depth; k++) - { - MICRO_EXTRA_ROW(lhs_ptr, rhs_ptr, accZero, remaining_rows); - } - - if ((remaining_depth == depth) && (rows >= accCols)) - { - for(Index j = 0; j < 4; j++) { - acc.packet[j] = res.template loadPacket(row, col + j); - } - bscale(acc, accZero, pAlpha, pMask); - res.template storePacketBlock(row, col, acc); - } else { - for(; k < depth; k++) - { - Packet rhsV[4]; - pbroadcast4(rhs_ptr, rhsV[0], rhsV[1], rhsV[2], rhsV[3]); - pger<4, Scalar, Packet, Index, false>(&accZero, lhs_ptr, rhsV, remaining_rows); - lhs_ptr += remaining_rows; - rhs_ptr += accRows; - } - - for(Index j = 0; j < 4; j++) { - accZero.packet[j] = vec_mul(pAlpha, accZero.packet[j]); - } - for(Index j = 0; j < 4; j++) { - for(Index i = 0; i < remaining_rows; i++) { - res(row + i, col + j) += accZero.packet[j][i]; + switch(remaining_rows) { + case 1: + gemm_unrolled_row_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, rows, cols, pAlpha, pMask); + break; + case 2: + if (sizeof(Scalar) == sizeof(float)) { + gemm_unrolled_row_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, rows, cols, pAlpha, pMask); } - } + break; + default: + if (sizeof(Scalar) == sizeof(float)) { + gemm_unrolled_row_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, rows, cols, pAlpha, pMask); + } + break; } } @@ -1462,34 +1550,24 @@ EIGEN_STRONG_INLINE void gemm_extra_row( #define MICRO_WORK_ONE(iter, peel) \ if (unroll_factor > iter) { \ - pger_common(&accZero##iter, lhsV##iter, rhsV##peel); \ + pger_common(&accZero##iter, lhsV##iter, rhsV##peel); \ } #define MICRO_TYPE_PEEL4(func, func2, peel) \ if (PEEL > peel) { \ Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4, lhsV5, lhsV6, lhsV7; \ - pbroadcast4(rhs_ptr + (accRows * peel), rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \ - MICRO_UNROLL_WORK(func, func2, peel) \ - } else { \ - EIGEN_UNUSED_VARIABLE(rhsV##peel); \ - } - -#define MICRO_TYPE_PEEL1(func, func2, peel) \ - if (PEEL > peel) { \ - Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4, lhsV5, lhsV6, lhsV7; \ - rhsV##peel[0] = pset1(rhs_ptr[remaining_cols * peel]); \ + pbroadcastN(rhs_ptr + (accRows * peel), rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \ MICRO_UNROLL_WORK(func, func2, peel) \ } else { \ EIGEN_UNUSED_VARIABLE(rhsV##peel); \ } #define MICRO_UNROLL_TYPE_PEEL(M, func, func1, func2) \ - Packet rhsV0[M], rhsV1[M], rhsV2[M], rhsV3[M], rhsV4[M], rhsV5[M], rhsV6[M], rhsV7[M], rhsV8[M], rhsV9[M]; \ + Packet rhsV0[M], rhsV1[M], rhsV2[M], rhsV3[M], rhsV4[M], rhsV5[M], rhsV6[M], rhsV7[M]; \ func(func1,func2,0); func(func1,func2,1); \ func(func1,func2,2); func(func1,func2,3); \ func(func1,func2,4); func(func1,func2,5); \ - func(func1,func2,6); func(func1,func2,7); \ - func(func1,func2,8); func(func1,func2,9); + func(func1,func2,6); func(func1,func2,7); #define MICRO_UNROLL_TYPE_ONE(M, func, func1, func2) \ Packet rhsV0[M]; \ @@ -1503,17 +1581,9 @@ EIGEN_STRONG_INLINE void gemm_extra_row( MICRO_UNROLL_TYPE_ONE(4, MICRO_TYPE_PEEL4, MICRO_WORK_ONE, MICRO_LOAD_ONE); \ rhs_ptr += accRows; -#define MICRO_ONE_PEEL1 \ - MICRO_UNROLL_TYPE_PEEL(1, MICRO_TYPE_PEEL1, MICRO_WORK_ONE, MICRO_LOAD_ONE); \ - rhs_ptr += (remaining_cols * PEEL); - -#define MICRO_ONE1 \ - 
MICRO_UNROLL_TYPE_ONE(1, MICRO_TYPE_PEEL1, MICRO_WORK_ONE, MICRO_LOAD_ONE); \ - rhs_ptr += remaining_cols; - #define MICRO_DST_PTR_ONE(iter) \ if (unroll_factor > iter) { \ - bsetzero(accZero##iter); \ + bsetzero(accZero##iter); \ } else { \ EIGEN_UNUSED_VARIABLE(accZero##iter); \ } @@ -1522,7 +1592,7 @@ EIGEN_STRONG_INLINE void gemm_extra_row( #define MICRO_SRC_PTR_ONE(iter) \ if (unroll_factor > iter) { \ - lhs_ptr##iter = lhs_base + ( (row/accCols) + iter )*strideA*accCols + accCols*offsetA; \ + lhs_ptr##iter = lhs_base + ( (row/accCols) + iter )*strideA*accCols; \ } else { \ EIGEN_UNUSED_VARIABLE(lhs_ptr##iter); \ } @@ -1538,25 +1608,13 @@ EIGEN_STRONG_INLINE void gemm_extra_row( #define MICRO_STORE_ONE(iter) \ if (unroll_factor > iter) { \ - acc.packet[0] = res.template loadPacket(row + iter*accCols, col + 0); \ - acc.packet[1] = res.template loadPacket(row + iter*accCols, col + 1); \ - acc.packet[2] = res.template loadPacket(row + iter*accCols, col + 2); \ - acc.packet[3] = res.template loadPacket(row + iter*accCols, col + 3); \ - bscale(acc, accZero##iter, pAlpha); \ - res.template storePacketBlock(row + iter*accCols, col, acc); \ + bload(acc, res, row + iter*accCols, 0); \ + bscale(acc, accZero##iter, pAlpha); \ + res.template storePacketBlock(row + iter*accCols, 0, acc); \ } #define MICRO_STORE MICRO_UNROLL(MICRO_STORE_ONE) -#define MICRO_COL_STORE_ONE(iter) \ - if (unroll_factor > iter) { \ - acc.packet[0] = res.template loadPacket(row + iter*accCols, col + 0); \ - bscale(acc, accZero##iter, pAlpha); \ - res.template storePacketBlock(row + iter*accCols, col, acc); \ - } - -#define MICRO_COL_STORE MICRO_UNROLL(MICRO_COL_STORE_ONE) - template EIGEN_STRONG_INLINE void gemm_unrolled_iteration( const DataMapper& res, @@ -1564,15 +1622,13 @@ EIGEN_STRONG_INLINE void gemm_unrolled_iteration( const Scalar* rhs_base, Index depth, Index strideA, - Index offsetA, Index& row, - Index col, const Packet& pAlpha) { const Scalar* rhs_ptr = rhs_base; const Scalar* lhs_ptr0 = NULL, * lhs_ptr1 = NULL, * lhs_ptr2 = NULL, * lhs_ptr3 = NULL, * lhs_ptr4 = NULL, * lhs_ptr5 = NULL, * lhs_ptr6 = NULL, * lhs_ptr7 = NULL; - PacketBlock accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7; - PacketBlock acc; + PacketBlock accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7; + PacketBlock acc; MICRO_SRC_PTR MICRO_DST_PTR @@ -1593,101 +1649,100 @@ EIGEN_STRONG_INLINE void gemm_unrolled_iteration( row += unroll_factor*accCols; } -template -EIGEN_STRONG_INLINE void gemm_unrolled_col_iteration( +template +EIGEN_ALWAYS_INLINE void gemm_cols( const DataMapper& res, - const Scalar* lhs_base, - const Scalar* rhs_base, + const Scalar* blockA, + const Scalar* blockB, Index depth, Index strideA, Index offsetA, - Index& row, + Index strideB, + Index offsetB, Index col, - Index remaining_cols, - const Packet& pAlpha) -{ - const Scalar* rhs_ptr = rhs_base; - const Scalar* lhs_ptr0 = NULL, * lhs_ptr1 = NULL, * lhs_ptr2 = NULL, * lhs_ptr3 = NULL, * lhs_ptr4 = NULL, * lhs_ptr5 = NULL, * lhs_ptr6 = NULL, *lhs_ptr7 = NULL; - PacketBlock accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7; - PacketBlock acc; - - MICRO_SRC_PTR - MICRO_DST_PTR - - Index k = 0; - for(; k + PEEL <= depth; k+= PEEL) - { - EIGEN_POWER_PREFETCH(rhs_ptr); - MICRO_PREFETCH - MICRO_ONE_PEEL1 - } - for(; k < depth; k++) - { - MICRO_ONE1 - } - MICRO_COL_STORE - - row += unroll_factor*accCols; -} - -template -EIGEN_STRONG_INLINE void gemm_unrolled_col( - const DataMapper& res, - 
const Scalar* lhs_base, - const Scalar* rhs_base, - Index depth, - Index strideA, - Index offsetA, - Index& row, Index rows, - Index col, - Index remaining_cols, - const Packet& pAlpha) + Index cols, + Index remaining_rows, + const Packet& pAlpha, + const Packet& pMask) { + const DataMapper res3 = res.getSubMapper(0, col); + + const Scalar* rhs_base = blockB + col*strideB + accRows*offsetB; + const Scalar* lhs_base = blockA + accCols*offsetA; + Index row = 0; + #define MAX_UNROLL 6 while(row + MAX_UNROLL*accCols <= rows) { - gemm_unrolled_col_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); + gemm_unrolled_iteration(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); } switch( (rows-row)/accCols ) { #if MAX_UNROLL > 7 case 7: - gemm_unrolled_col_iteration<7, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); + gemm_unrolled_iteration<7, Scalar, Packet, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); break; #endif #if MAX_UNROLL > 6 case 6: - gemm_unrolled_col_iteration<6, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); + gemm_unrolled_iteration<6, Scalar, Packet, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); break; #endif #if MAX_UNROLL > 5 - case 5: - gemm_unrolled_col_iteration<5, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); + case 5: + gemm_unrolled_iteration<5, Scalar, Packet, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); break; #endif #if MAX_UNROLL > 4 - case 4: - gemm_unrolled_col_iteration<4, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); + case 4: + gemm_unrolled_iteration<4, Scalar, Packet, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); break; #endif #if MAX_UNROLL > 3 - case 3: - gemm_unrolled_col_iteration<3, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); - break; + case 3: + gemm_unrolled_iteration<3, Scalar, Packet, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); + break; #endif #if MAX_UNROLL > 2 - case 2: - gemm_unrolled_col_iteration<2, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); - break; + case 2: + gemm_unrolled_iteration<2, Scalar, Packet, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); + break; #endif #if MAX_UNROLL > 1 - case 1: - gemm_unrolled_col_iteration<1, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); - break; + case 1: + gemm_unrolled_iteration<1, Scalar, Packet, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); + break; #endif - default: - break; + default: + break; } #undef MAX_UNROLL + + if(remaining_rows > 0) + { + gemm_extra_row(res3, blockA, rhs_base, depth, strideA, offsetA, row, col, rows, cols, remaining_rows, pAlpha, pMask); + } +} + +template +EIGEN_STRONG_INLINE void gemm_extra_cols( + const DataMapper& res, + const Scalar* 
blockA, + const Scalar* blockB, + Index depth, + Index strideA, + Index offsetA, + Index strideB, + Index offsetB, + Index col, + Index rows, + Index cols, + Index remaining_rows, + const Packet& pAlpha, + const Packet& pMask) +{ + for (; col < cols; col++) { + gemm_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlpha, pMask); + } } /**************** @@ -1697,7 +1752,6 @@ template(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - } - switch( (rows-row)/accCols ) { -#if MAX_UNROLL > 7 - case 7: - gemm_unrolled_iteration<7, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; -#endif -#if MAX_UNROLL > 6 - case 6: - gemm_unrolled_iteration<6, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; -#endif -#if MAX_UNROLL > 5 - case 5: - gemm_unrolled_iteration<5, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; -#endif -#if MAX_UNROLL > 4 - case 4: - gemm_unrolled_iteration<4, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; -#endif -#if MAX_UNROLL > 3 - case 3: - gemm_unrolled_iteration<3, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; -#endif -#if MAX_UNROLL > 2 - case 2: - gemm_unrolled_iteration<2, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; -#endif -#if MAX_UNROLL > 1 - case 1: - gemm_unrolled_iteration<1, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; -#endif - default: - break; - } -#undef MAX_UNROLL - - if(remaining_rows > 0) - { - gemm_extra_row(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, rows, cols, remaining_rows, pAlpha, pMask); - } - } - - if(remaining_cols > 0) - { - const Scalar* rhs_base = blockB + col*strideB + remaining_cols*offsetB; - const Scalar* lhs_base = blockA; - - for(; col < cols; col++) - { - Index row = 0; - - gemm_unrolled_col(res, lhs_base, rhs_base, depth, strideA, offsetA, row, rows, col, remaining_cols, pAlpha); - - if (remaining_rows > 0) - { - gemm_extra_col(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_rows, remaining_cols, pAlpha); - } - rhs_base++; + gemm_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlpha, pMask); } - } + + gemm_extra_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlpha, pMask); } #define accColsC (accCols / 2) @@ -1789,29 +1774,76 @@ EIGEN_STRONG_INLINE void gemm(const DataMapper& res, const Scalar* blockA, const // PEEL_COMPLEX loop factor. 
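// The peel factors below give the number of depth-loop (k) rank-1 updates unrolled per pass in the
// complex kernels; the remainder loops in those kernels handle any leftover depth iterations.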
#define PEEL_COMPLEX 3 +#define PEEL_COMPLEX_ROW 3 -template -EIGEN_ALWAYS_INLINE void MICRO_COMPLEX_EXTRA_COL( +#define MICRO_COMPLEX_UNROLL_PEEL(func) \ + func(0) func(1) func(2) func(3) + +#define MICRO_COMPLEX_ZERO_PEEL(peel) \ + if ((PEEL_COMPLEX_ROW > peel) && (peel != 0)) { \ + bsetzero(accReal##peel); \ + bsetzero(accImag##peel); \ + } else { \ + EIGEN_UNUSED_VARIABLE(accReal##peel); \ + EIGEN_UNUSED_VARIABLE(accImag##peel); \ + } + +#define MICRO_COMPLEX_ZERO_PEEL_ROW \ + MICRO_COMPLEX_UNROLL_PEEL(MICRO_COMPLEX_ZERO_PEEL); + +#define MICRO_COMPLEX_WORK_PEEL(peel) \ + if (PEEL_COMPLEX_ROW > peel) { \ + pbroadcastN_old(rhs_ptr_real + (accRows * peel), rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \ + if(!RhsIsReal) pbroadcastN_old(rhs_ptr_imag + (accRows * peel), rhsVi##peel[0], rhsVi##peel[1], rhsVi##peel[2], rhsVi##peel[3]); \ + pgerc(&accReal##peel, &accImag##peel, lhs_ptr_real + (remaining_rows * peel), lhs_ptr_imag + (remaining_rows * peel), rhsV##peel, rhsVi##peel); \ + } else { \ + EIGEN_UNUSED_VARIABLE(rhsV##peel); \ + EIGEN_UNUSED_VARIABLE(rhsVi##peel); \ + } + +#define MICRO_COMPLEX_WORK_PEEL_ROW \ + Packet rhsV0[4], rhsV1[4], rhsV2[4], rhsV3[4]; \ + Packet rhsVi0[4], rhsVi1[4], rhsVi2[4], rhsVi3[4]; \ + MICRO_COMPLEX_UNROLL_PEEL(MICRO_COMPLEX_WORK_PEEL); \ + lhs_ptr_real += (remaining_rows * PEEL_COMPLEX_ROW); \ + if(!LhsIsReal) lhs_ptr_imag += (remaining_rows * PEEL_COMPLEX_ROW); \ + else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag); \ + rhs_ptr_real += (accRows * PEEL_COMPLEX_ROW); \ + if(!RhsIsReal) rhs_ptr_imag += (accRows * PEEL_COMPLEX_ROW); \ + else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); + +#define MICRO_COMPLEX_ADD_PEEL(peel, sum) \ + if (PEEL_COMPLEX_ROW > peel) { \ + for (Index i = 0; i < accRows; i++) { \ + accReal##sum.packet[i] += accReal##peel.packet[i]; \ + accImag##sum.packet[i] += accImag##peel.packet[i]; \ + } \ + } + +#define MICRO_COMPLEX_ADD_PEEL_ROW \ + MICRO_COMPLEX_ADD_PEEL(2, 0) MICRO_COMPLEX_ADD_PEEL(3, 1) \ + MICRO_COMPLEX_ADD_PEEL(1, 0) + +template +EIGEN_ALWAYS_INLINE void MICRO_COMPLEX_EXTRA_ROW( const Scalar* &lhs_ptr_real, const Scalar* &lhs_ptr_imag, const Scalar* &rhs_ptr_real, const Scalar* &rhs_ptr_imag, - PacketBlock &accReal, PacketBlock &accImag, - Index remaining_rows, - Index remaining_cols) + PacketBlock &accReal, PacketBlock &accImag) { - Packet rhsV[1], rhsVi[1]; - rhsV[0] = pset1(rhs_ptr_real[0]); - if(!RhsIsReal) rhsVi[0] = pset1(rhs_ptr_imag[0]); - pgerc<1, Scalar, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi); + Packet rhsV[4], rhsVi[4]; + pbroadcastN_old(rhs_ptr_real, rhsV[0], rhsV[1], rhsV[2], rhsV[3]); + if(!RhsIsReal) pbroadcastN_old(rhs_ptr_imag, rhsVi[0], rhsVi[1], rhsVi[2], rhsVi[3]); + pgerc(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi); lhs_ptr_real += remaining_rows; if(!LhsIsReal) lhs_ptr_imag += remaining_rows; else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag); - rhs_ptr_real += remaining_cols; - if(!RhsIsReal) rhs_ptr_imag += remaining_cols; + rhs_ptr_real += accRows; + if(!RhsIsReal) rhs_ptr_imag += accRows; else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); } -template -EIGEN_STRONG_INLINE void gemm_complex_extra_col( +template +EIGEN_ALWAYS_INLINE void gemm_unrolled_complex_row_iteration( const DataMapper& res, const Scalar* lhs_base, const Scalar* rhs_base, @@ -1821,95 +1853,94 @@ EIGEN_STRONG_INLINE void gemm_complex_extra_col( Index strideB, Index row, Index col, - Index remaining_rows, - Index remaining_cols, + Index rows, + Index 
cols, const Packet& pAlphaReal, - const Packet& pAlphaImag) + const Packet& pAlphaImag, + const Packet& pMask) { const Scalar* rhs_ptr_real = rhs_base; - const Scalar* rhs_ptr_imag; - if(!RhsIsReal) rhs_ptr_imag = rhs_base + remaining_cols*strideB; + const Scalar* rhs_ptr_imag = NULL; + if(!RhsIsReal) rhs_ptr_imag = rhs_base + accRows*strideB; else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); const Scalar* lhs_ptr_real = lhs_base + advanceRows*row*strideA + remaining_rows*offsetA; - const Scalar* lhs_ptr_imag; + const Scalar* lhs_ptr_imag = NULL; if(!LhsIsReal) lhs_ptr_imag = lhs_ptr_real + remaining_rows*strideA; else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag); - PacketBlock accReal, accImag; - PacketBlock taccReal, taccImag; - PacketBlock acc0, acc1; + PacketBlock accReal0, accImag0, accReal1, accImag1, accReal2, accImag2, accReal3, accImag3; + PacketBlock taccReal, taccImag; + PacketBlock acc0, acc1; + PacketBlock tRes; - bsetzero(accReal); - bsetzero(accImag); + bsetzero(accReal0); + bsetzero(accImag0); - Index remaining_depth = (depth & -accRows); + Index remaining_depth = (col + quad_traits::rows < cols) ? depth : (depth & -quad_traits::rows); Index k = 0; - for(; k + PEEL_COMPLEX <= remaining_depth; k+= PEEL_COMPLEX) - { - EIGEN_POWER_PREFETCH(rhs_ptr_real); - if(!RhsIsReal) { - EIGEN_POWER_PREFETCH(rhs_ptr_imag); - } - EIGEN_POWER_PREFETCH(lhs_ptr_real); - if(!LhsIsReal) { - EIGEN_POWER_PREFETCH(lhs_ptr_imag); - } - for (int l = 0; l < PEEL_COMPLEX; l++) { - MICRO_COMPLEX_EXTRA_COL(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal, accImag, remaining_rows, remaining_cols); - } + if (remaining_depth >= PEEL_COMPLEX_ROW) { + MICRO_COMPLEX_ZERO_PEEL_ROW + do + { + EIGEN_POWER_PREFETCH(rhs_ptr_real); + if(!RhsIsReal) { + EIGEN_POWER_PREFETCH(rhs_ptr_imag); + } + EIGEN_POWER_PREFETCH(lhs_ptr_real); + if(!LhsIsReal) { + EIGEN_POWER_PREFETCH(lhs_ptr_imag); + } + MICRO_COMPLEX_WORK_PEEL_ROW + } while ((k += PEEL_COMPLEX_ROW) + PEEL_COMPLEX_ROW <= remaining_depth); + MICRO_COMPLEX_ADD_PEEL_ROW } for(; k < remaining_depth; k++) { - MICRO_COMPLEX_EXTRA_COL(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal, accImag, remaining_rows, remaining_cols); + MICRO_COMPLEX_EXTRA_ROW(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal0, accImag0); } - for(; k < depth; k++) + if ((remaining_depth == depth) && (rows >= accCols)) { - Packet rhsV[1], rhsVi[1]; - rhsV[0] = pset1(rhs_ptr_real[0]); - if(!RhsIsReal) rhsVi[0] = pset1(rhs_ptr_imag[0]); - pgerc<1, Scalar, Packet, Index, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi, remaining_rows); - lhs_ptr_real += remaining_rows; - if(!LhsIsReal) lhs_ptr_imag += remaining_rows; - rhs_ptr_real += remaining_cols; - if(!RhsIsReal) rhs_ptr_imag += remaining_cols; - } - - bscalec(accReal, accImag, pAlphaReal, pAlphaImag, taccReal, taccImag); - bcouple_common(taccReal, taccImag, acc0, acc1); - - if ((sizeof(Scalar) == sizeof(float)) && (remaining_rows == 1)) - { - res(row + 0, col + 0) += pfirst(acc0.packet[0]); + bload(tRes, res, row, 0); + bscalec(accReal0, accImag0, pAlphaReal, pAlphaImag, taccReal, taccImag, pMask); + bcouple(taccReal, taccImag, tRes, acc0, acc1); + res.template storePacketBlock(row + 0, 0, acc0); + res.template storePacketBlock(row + accColsC, 0, acc1); } else { - acc0.packet[0] += res.template loadPacket(row + 0, col + 0); - res.template storePacketBlock(row + 0, col + 0, acc0); - if(remaining_rows > accColsC) { - res(row + accColsC, col + 0) += 
pfirst(acc1.packet[0]); + for(; k < depth; k++) + { + Packet rhsV[4], rhsVi[4]; + pbroadcastN_old(rhs_ptr_real, rhsV[0], rhsV[1], rhsV[2], rhsV[3]); + if(!RhsIsReal) pbroadcastN_old(rhs_ptr_imag, rhsVi[0], rhsVi[1], rhsVi[2], rhsVi[3]); + pgerc(&accReal0, &accImag0, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi); + lhs_ptr_real += remaining_rows; + if(!LhsIsReal) lhs_ptr_imag += remaining_rows; + rhs_ptr_real += accRows; + if(!RhsIsReal) rhs_ptr_imag += accRows; + } + + bscalec(accReal0, accImag0, pAlphaReal, pAlphaImag, taccReal, taccImag); + bcouple_common(taccReal, taccImag, acc0, acc1); + + if ((sizeof(Scalar) == sizeof(float)) && (remaining_rows == 1)) + { + for(Index j = 0; j < accRows; j++) { + res(row + 0, j) += pfirst(acc0.packet[j]); + } + } else { + for(Index j = 0; j < accRows; j++) { + PacketBlock acc2; + acc2.packet[0] = res.template loadPacket(row + 0, j) + acc0.packet[j]; + res.template storePacketBlock(row + 0, j, acc2); + if(remaining_rows > accColsC) { + res(row + accColsC, j) += pfirst(acc1.packet[j]); + } + } } } } -template -EIGEN_ALWAYS_INLINE void MICRO_COMPLEX_EXTRA_ROW( - const Scalar* &lhs_ptr_real, const Scalar* &lhs_ptr_imag, - const Scalar* &rhs_ptr_real, const Scalar* &rhs_ptr_imag, - PacketBlock &accReal, PacketBlock &accImag, - Index remaining_rows) -{ - Packet rhsV[4], rhsVi[4]; - pbroadcast4_old(rhs_ptr_real, rhsV[0], rhsV[1], rhsV[2], rhsV[3]); - if(!RhsIsReal) pbroadcast4_old(rhs_ptr_imag, rhsVi[0], rhsVi[1], rhsVi[2], rhsVi[3]); - pgerc<4, Scalar, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi); - lhs_ptr_real += remaining_rows; - if(!LhsIsReal) lhs_ptr_imag += remaining_rows; - else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag); - rhs_ptr_real += accRows; - if(!RhsIsReal) rhs_ptr_imag += accRows; - else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); -} - template -EIGEN_STRONG_INLINE void gemm_complex_extra_row( +EIGEN_ALWAYS_INLINE void gemm_complex_extra_row( const DataMapper& res, const Scalar* lhs_base, const Scalar* rhs_base, @@ -1926,101 +1957,39 @@ EIGEN_STRONG_INLINE void gemm_complex_extra_row( const Packet& pAlphaImag, const Packet& pMask) { - const Scalar* rhs_ptr_real = rhs_base; - const Scalar* rhs_ptr_imag; - if(!RhsIsReal) rhs_ptr_imag = rhs_base + accRows*strideB; - else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); - const Scalar* lhs_ptr_real = lhs_base + advanceRows*row*strideA + remaining_rows*offsetA; - const Scalar* lhs_ptr_imag; - if(!LhsIsReal) lhs_ptr_imag = lhs_ptr_real + remaining_rows*strideA; - else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag); - PacketBlock accReal, accImag; - PacketBlock taccReal, taccImag; - PacketBlock acc0, acc1; - PacketBlock tRes; - - bsetzero(accReal); - bsetzero(accImag); - - Index remaining_depth = (col + accRows < cols) ? 
depth : (depth & -accRows); - Index k = 0; - for(; k + PEEL_COMPLEX <= remaining_depth; k+= PEEL_COMPLEX) - { - EIGEN_POWER_PREFETCH(rhs_ptr_real); - if(!RhsIsReal) { - EIGEN_POWER_PREFETCH(rhs_ptr_imag); - } - EIGEN_POWER_PREFETCH(lhs_ptr_real); - if(!LhsIsReal) { - EIGEN_POWER_PREFETCH(lhs_ptr_imag); - } - for (int l = 0; l < PEEL_COMPLEX; l++) { - MICRO_COMPLEX_EXTRA_ROW(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal, accImag, remaining_rows); - } - } - for(; k < remaining_depth; k++) - { - MICRO_COMPLEX_EXTRA_ROW(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal, accImag, remaining_rows); - } - - if ((remaining_depth == depth) && (rows >= accCols)) - { - bload(tRes, res, row, col); - bscalec(accReal, accImag, pAlphaReal, pAlphaImag, taccReal, taccImag, pMask); - bcouple(taccReal, taccImag, tRes, acc0, acc1); - res.template storePacketBlock(row + 0, col, acc0); - res.template storePacketBlock(row + accColsC, col, acc1); - } else { - for(; k < depth; k++) - { - Packet rhsV[4], rhsVi[4]; - pbroadcast4_old(rhs_ptr_real, rhsV[0], rhsV[1], rhsV[2], rhsV[3]); - if(!RhsIsReal) pbroadcast4_old(rhs_ptr_imag, rhsVi[0], rhsVi[1], rhsVi[2], rhsVi[3]); - pgerc<4, Scalar, Packet, Index, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi, remaining_rows); - lhs_ptr_real += remaining_rows; - if(!LhsIsReal) lhs_ptr_imag += remaining_rows; - rhs_ptr_real += accRows; - if(!RhsIsReal) rhs_ptr_imag += accRows; - } - - bscalec(accReal, accImag, pAlphaReal, pAlphaImag, taccReal, taccImag); - bcouple_common(taccReal, taccImag, acc0, acc1); - - if ((sizeof(Scalar) == sizeof(float)) && (remaining_rows == 1)) - { - for(Index j = 0; j < 4; j++) { - res(row + 0, col + j) += pfirst(acc0.packet[j]); + switch(remaining_rows) { + case 1: + gemm_unrolled_complex_row_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, rows, cols, pAlphaReal, pAlphaImag, pMask); + break; + case 2: + if (sizeof(Scalar) == sizeof(float)) { + gemm_unrolled_complex_row_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, rows, cols, pAlphaReal, pAlphaImag, pMask); } - } else { - for(Index j = 0; j < 4; j++) { - PacketBlock acc2; - acc2.packet[0] = res.template loadPacket(row + 0, col + j) + acc0.packet[j]; - res.template storePacketBlock(row + 0, col + j, acc2); - if(remaining_rows > accColsC) { - res(row + accColsC, col + j) += pfirst(acc1.packet[j]); - } + break; + default: + if (sizeof(Scalar) == sizeof(float)) { + gemm_unrolled_complex_row_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, rows, cols, pAlphaReal, pAlphaImag, pMask); } - } + break; } } #define MICRO_COMPLEX_UNROLL(func) \ - func(0) func(1) func(2) func(3) func(4) + func(0) func(1) func(2) func(3) #define MICRO_COMPLEX_UNROLL_WORK(func, func2, peel) \ MICRO_COMPLEX_UNROLL(func2); \ - func(0,peel) func(1,peel) func(2,peel) func(3,peel) func(4,peel) + func(0,peel) func(1,peel) func(2,peel) func(3,peel) #define MICRO_COMPLEX_LOAD_ONE(iter) \ if (unroll_factor > iter) { \ lhsV##iter = ploadLhs(lhs_ptr_real##iter); \ - lhs_ptr_real##iter += accCols; \ if(!LhsIsReal) { \ - lhsVi##iter = ploadLhs(lhs_ptr_imag##iter); \ - lhs_ptr_imag##iter += accCols; \ + lhsVi##iter = ploadLhs(lhs_ptr_real##iter + imag_delta); \ } else { \ EIGEN_UNUSED_VARIABLE(lhsVi##iter); \ } \ + lhs_ptr_real##iter += accCols; \ } else { \ EIGEN_UNUSED_VARIABLE(lhsV##iter); \ EIGEN_UNUSED_VARIABLE(lhsVi##iter); \ @@ -2028,37 +1997,16 @@ 
EIGEN_STRONG_INLINE void gemm_complex_extra_row( #define MICRO_COMPLEX_WORK_ONE4(iter, peel) \ if (unroll_factor > iter) { \ - pgerc_common<4, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##iter, &accImag##iter, lhsV##iter, lhsVi##iter, rhsV##peel, rhsVi##peel); \ - } - -#define MICRO_COMPLEX_WORK_ONE1(iter, peel) \ - if (unroll_factor > iter) { \ - pgerc_common<1, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##iter, &accImag##iter, lhsV##iter, lhsVi##iter, rhsV##peel, rhsVi##peel); \ + pgerc_common(&accReal##iter, &accImag##iter, lhsV##iter, lhsVi##iter, rhsV##peel, rhsVi##peel); \ } #define MICRO_COMPLEX_TYPE_PEEL4(func, func2, peel) \ if (PEEL_COMPLEX > peel) { \ - Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4; \ - Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3, lhsVi4; \ - pbroadcast4_old(rhs_ptr_real + (accRows * peel), rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \ + Packet lhsV0, lhsV1, lhsV2, lhsV3; \ + Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3; \ + pbroadcastN_old(rhs_ptr_real + (accRows * peel), rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \ if(!RhsIsReal) { \ - pbroadcast4_old(rhs_ptr_imag + (accRows * peel), rhsVi##peel[0], rhsVi##peel[1], rhsVi##peel[2], rhsVi##peel[3]); \ - } else { \ - EIGEN_UNUSED_VARIABLE(rhsVi##peel); \ - } \ - MICRO_COMPLEX_UNROLL_WORK(func, func2, peel) \ - } else { \ - EIGEN_UNUSED_VARIABLE(rhsV##peel); \ - EIGEN_UNUSED_VARIABLE(rhsVi##peel); \ - } - -#define MICRO_COMPLEX_TYPE_PEEL1(func, func2, peel) \ - if (PEEL_COMPLEX > peel) { \ - Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4; \ - Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3, lhsVi4; \ - rhsV##peel[0] = pset1(rhs_ptr_real[remaining_cols * peel]); \ - if(!RhsIsReal) { \ - rhsVi##peel[0] = pset1(rhs_ptr_imag[remaining_cols * peel]); \ + pbroadcastN_old(rhs_ptr_imag + (accRows * peel), rhsVi##peel[0], rhsVi##peel[1], rhsVi##peel[2], rhsVi##peel[3]); \ } else { \ EIGEN_UNUSED_VARIABLE(rhsVi##peel); \ } \ @@ -2069,13 +2017,10 @@ EIGEN_STRONG_INLINE void gemm_complex_extra_row( } #define MICRO_COMPLEX_UNROLL_TYPE_PEEL(M, func, func1, func2) \ - Packet rhsV0[M], rhsV1[M], rhsV2[M], rhsV3[M], rhsV4[M], rhsV5[M], rhsV6[M], rhsV7[M], rhsV8[M], rhsV9[M]; \ - Packet rhsVi0[M], rhsVi1[M], rhsVi2[M], rhsVi3[M], rhsVi4[M], rhsVi5[M], rhsVi6[M], rhsVi7[M], rhsVi8[M], rhsVi9[M]; \ + Packet rhsV0[M], rhsV1[M], rhsV2[M], rhsV3[M]; \ + Packet rhsVi0[M], rhsVi1[M], rhsVi2[M], rhsVi3[M]; \ func(func1,func2,0); func(func1,func2,1); \ - func(func1,func2,2); func(func1,func2,3); \ - func(func1,func2,4); func(func1,func2,5); \ - func(func1,func2,6); func(func1,func2,7); \ - func(func1,func2,8); func(func1,func2,9); + func(func1,func2,2); func(func1,func2,3); #define MICRO_COMPLEX_UNROLL_TYPE_ONE(M, func, func1, func2) \ Packet rhsV0[M], rhsVi0[M];\ @@ -2091,20 +2036,10 @@ EIGEN_STRONG_INLINE void gemm_complex_extra_row( rhs_ptr_real += accRows; \ if(!RhsIsReal) rhs_ptr_imag += accRows; -#define MICRO_COMPLEX_ONE_PEEL1 \ - MICRO_COMPLEX_UNROLL_TYPE_PEEL(1, MICRO_COMPLEX_TYPE_PEEL1, MICRO_COMPLEX_WORK_ONE1, MICRO_COMPLEX_LOAD_ONE); \ - rhs_ptr_real += (remaining_cols * PEEL_COMPLEX); \ - if(!RhsIsReal) rhs_ptr_imag += (remaining_cols * PEEL_COMPLEX); - -#define MICRO_COMPLEX_ONE1 \ - MICRO_COMPLEX_UNROLL_TYPE_ONE(1, MICRO_COMPLEX_TYPE_PEEL1, MICRO_COMPLEX_WORK_ONE1, MICRO_COMPLEX_LOAD_ONE); \ - rhs_ptr_real += remaining_cols; \ - if(!RhsIsReal) rhs_ptr_imag += remaining_cols; - #define MICRO_COMPLEX_DST_PTR_ONE(iter) \ if (unroll_factor > iter) { \ - bsetzero(accReal##iter); \ - 
bsetzero(accImag##iter); \ + bsetzero(accReal##iter); \ + bsetzero(accImag##iter); \ } else { \ EIGEN_UNUSED_VARIABLE(accReal##iter); \ EIGEN_UNUSED_VARIABLE(accImag##iter); \ @@ -2114,15 +2049,9 @@ EIGEN_STRONG_INLINE void gemm_complex_extra_row( #define MICRO_COMPLEX_SRC_PTR_ONE(iter) \ if (unroll_factor > iter) { \ - lhs_ptr_real##iter = lhs_base + ( ((advanceRows*row)/accCols) + iter*advanceRows )*strideA*accCols + accCols*offsetA; \ - if(!LhsIsReal) { \ - lhs_ptr_imag##iter = lhs_ptr_real##iter + accCols*strideA; \ - } else { \ - EIGEN_UNUSED_VARIABLE(lhs_ptr_imag##iter); \ - } \ + lhs_ptr_real##iter = lhs_base + ( ((advanceRows*row)/accCols) + iter*advanceRows )*strideA*accCols; \ } else { \ EIGEN_UNUSED_VARIABLE(lhs_ptr_real##iter); \ - EIGEN_UNUSED_VARIABLE(lhs_ptr_imag##iter); \ } #define MICRO_COMPLEX_SRC_PTR MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_SRC_PTR_ONE) @@ -2130,35 +2059,21 @@ EIGEN_STRONG_INLINE void gemm_complex_extra_row( #define MICRO_COMPLEX_PREFETCH_ONE(iter) \ if (unroll_factor > iter) { \ EIGEN_POWER_PREFETCH(lhs_ptr_real##iter); \ - if(!LhsIsReal) { \ - EIGEN_POWER_PREFETCH(lhs_ptr_imag##iter); \ - } \ } #define MICRO_COMPLEX_PREFETCH MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_PREFETCH_ONE) #define MICRO_COMPLEX_STORE_ONE(iter) \ if (unroll_factor > iter) { \ - bload(tRes, res, row + iter*accCols, col); \ - bscalec(accReal##iter, accImag##iter, pAlphaReal, pAlphaImag, taccReal, taccImag); \ - bcouple(taccReal, taccImag, tRes, acc0, acc1); \ - res.template storePacketBlock(row + iter*accCols + 0, col, acc0); \ - res.template storePacketBlock(row + iter*accCols + accColsC, col, acc1); \ + bload(tRes, res, row + iter*accCols, 0); \ + bscalec(accReal##iter, accImag##iter, pAlphaReal, pAlphaImag, taccReal, taccImag); \ + bcouple(taccReal, taccImag, tRes, acc0, acc1); \ + res.template storePacketBlock(row + iter*accCols + 0, 0, acc0); \ + res.template storePacketBlock(row + iter*accCols + accColsC, 0, acc1); \ } #define MICRO_COMPLEX_STORE MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_STORE_ONE) -#define MICRO_COMPLEX_COL_STORE_ONE(iter) \ - if (unroll_factor > iter) { \ - bload(tRes, res, row + iter*accCols, col); \ - bscalec(accReal##iter, accImag##iter, pAlphaReal, pAlphaImag, taccReal, taccImag); \ - bcouple(taccReal, taccImag, tRes, acc0, acc1); \ - res.template storePacketBlock(row + iter*accCols + 0, col, acc0); \ - res.template storePacketBlock(row + iter*accCols + accColsC, col, acc1); \ - } - -#define MICRO_COMPLEX_COL_STORE MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_COL_STORE_ONE) - template EIGEN_STRONG_INLINE void gemm_complex_unrolled_iteration( const DataMapper& res, @@ -2166,29 +2081,26 @@ EIGEN_STRONG_INLINE void gemm_complex_unrolled_iteration( const Scalar* rhs_base, Index depth, Index strideA, - Index offsetA, Index strideB, Index& row, - Index col, const Packet& pAlphaReal, const Packet& pAlphaImag) { const Scalar* rhs_ptr_real = rhs_base; - const Scalar* rhs_ptr_imag; + const Scalar* rhs_ptr_imag = NULL; + const Index imag_delta = accCols*strideA; if(!RhsIsReal) { rhs_ptr_imag = rhs_base + accRows*strideB; } else { EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); } - const Scalar* lhs_ptr_real0 = NULL, * lhs_ptr_imag0 = NULL, * lhs_ptr_real1 = NULL, * lhs_ptr_imag1 = NULL; - const Scalar* lhs_ptr_real2 = NULL, * lhs_ptr_imag2 = NULL, * lhs_ptr_real3 = NULL, * lhs_ptr_imag3 = NULL; - const Scalar* lhs_ptr_real4 = NULL, * lhs_ptr_imag4 = NULL; - PacketBlock accReal0, accImag0, accReal1, accImag1; - PacketBlock accReal2, accImag2, accReal3, accImag3; - PacketBlock accReal4, accImag4; - 
PacketBlock taccReal, taccImag; - PacketBlock acc0, acc1; - PacketBlock tRes; + const Scalar* lhs_ptr_real0 = NULL, * lhs_ptr_real1 = NULL; + const Scalar* lhs_ptr_real2 = NULL, * lhs_ptr_real3 = NULL; + PacketBlock accReal0, accImag0, accReal1, accImag1; + PacketBlock accReal2, accImag2, accReal3, accImag3; + PacketBlock taccReal, taccImag; + PacketBlock acc0, acc1; + PacketBlock tRes; MICRO_COMPLEX_SRC_PTR MICRO_COMPLEX_DST_PTR @@ -2212,112 +2124,93 @@ EIGEN_STRONG_INLINE void gemm_complex_unrolled_iteration( row += unroll_factor*accCols; } -template -EIGEN_STRONG_INLINE void gemm_complex_unrolled_col_iteration( +template +EIGEN_ALWAYS_INLINE void gemm_complex_cols( const DataMapper& res, - const Scalar* lhs_base, - const Scalar* rhs_base, + const Scalar* blockA, + const Scalar* blockB, Index depth, Index strideA, Index offsetA, Index strideB, - Index& row, + Index offsetB, Index col, - Index remaining_cols, - const Packet& pAlphaReal, - const Packet& pAlphaImag) -{ - const Scalar* rhs_ptr_real = rhs_base; - const Scalar* rhs_ptr_imag; - if(!RhsIsReal) { - rhs_ptr_imag = rhs_base + remaining_cols*strideB; - } else { - EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); - } - const Scalar* lhs_ptr_real0 = NULL, * lhs_ptr_imag0 = NULL, * lhs_ptr_real1 = NULL, * lhs_ptr_imag1 = NULL; - const Scalar* lhs_ptr_real2 = NULL, * lhs_ptr_imag2 = NULL, * lhs_ptr_real3 = NULL, * lhs_ptr_imag3 = NULL; - const Scalar* lhs_ptr_real4 = NULL, * lhs_ptr_imag4 = NULL; - PacketBlock accReal0, accImag0, accReal1, accImag1; - PacketBlock accReal2, accImag2, accReal3, accImag3; - PacketBlock accReal4, accImag4; - PacketBlock taccReal, taccImag; - PacketBlock acc0, acc1; - PacketBlock tRes; - - MICRO_COMPLEX_SRC_PTR - MICRO_COMPLEX_DST_PTR - - Index k = 0; - for(; k + PEEL_COMPLEX <= depth; k+= PEEL_COMPLEX) - { - EIGEN_POWER_PREFETCH(rhs_ptr_real); - if(!RhsIsReal) { - EIGEN_POWER_PREFETCH(rhs_ptr_imag); - } - MICRO_COMPLEX_PREFETCH - MICRO_COMPLEX_ONE_PEEL1 - } - for(; k < depth; k++) - { - MICRO_COMPLEX_ONE1 - } - MICRO_COMPLEX_COL_STORE - - row += unroll_factor*accCols; -} - -template -EIGEN_STRONG_INLINE void gemm_complex_unrolled_col( - const DataMapper& res, - const Scalar* lhs_base, - const Scalar* rhs_base, - Index depth, - Index strideA, - Index offsetA, - Index strideB, - Index& row, Index rows, - Index col, - Index remaining_cols, + Index cols, + Index remaining_rows, const Packet& pAlphaReal, - const Packet& pAlphaImag) + const Packet& pAlphaImag, + const Packet& pMask) { + const DataMapper res3 = res.getSubMapper(0, col); + + const Scalar* rhs_base = blockB + advanceCols*col*strideB + accRows*offsetB; + const Scalar* lhs_base = blockA + accCols*offsetA; + Index row = 0; + #define MAX_COMPLEX_UNROLL 3 while(row + MAX_COMPLEX_UNROLL*accCols <= rows) { - gemm_complex_unrolled_col_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag); + gemm_complex_unrolled_iteration(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag); } switch( (rows-row)/accCols ) { #if MAX_COMPLEX_UNROLL > 4 - case 4: - gemm_complex_unrolled_col_iteration<4, Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag); - break; + case 4: + gemm_complex_unrolled_iteration<4, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, 
rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag); + break; #endif #if MAX_COMPLEX_UNROLL > 3 - case 3: - gemm_complex_unrolled_col_iteration<3, Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag); - break; + case 3: + gemm_complex_unrolled_iteration<3, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag); + break; #endif #if MAX_COMPLEX_UNROLL > 2 - case 2: - gemm_complex_unrolled_col_iteration<2, Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag); - break; + case 2: + gemm_complex_unrolled_iteration<2, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag); + break; #endif #if MAX_COMPLEX_UNROLL > 1 - case 1: - gemm_complex_unrolled_col_iteration<1, Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag); - break; + case 1: + gemm_complex_unrolled_iteration<1, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag); + break; #endif - default: - break; + default: + break; } #undef MAX_COMPLEX_UNROLL + + if(remaining_rows > 0) + { + gemm_complex_extra_row(res3, blockA, rhs_base, depth, strideA, offsetA, strideB, row, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask); + } +} + +template +EIGEN_STRONG_INLINE void gemm_complex_extra_cols( + const DataMapper& res, + const Scalar* blockA, + const Scalar* blockB, + Index depth, + Index strideA, + Index offsetA, + Index strideB, + Index offsetB, + Index col, + Index rows, + Index cols, + Index remaining_rows, + const Packet& pAlphaReal, + const Packet& pAlphaImag, + const Packet& pMask) +{ + for (; col < cols; col++) { + gemm_complex_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask); + } } template EIGEN_STRONG_INLINE void gemm_complex(const DataMapper& res, const LhsScalar* blockAc, const RhsScalar* blockBc, Index rows, Index depth, Index cols, Scalarc alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) { const Index remaining_rows = rows % accCols; - const Index remaining_cols = cols % accRows; if( strideA == -1 ) strideA = depth; if( strideB == -1 ) strideB = depth; @@ -2332,64 +2225,10 @@ EIGEN_STRONG_INLINE void gemm_complex(const DataMapper& res, const LhsScalar* bl Index col = 0; for(; col + accRows <= cols; col += accRows) { - const Scalar* rhs_base = blockB + advanceCols*col*strideB + accRows*offsetB; - const Scalar* lhs_base = blockA; - Index row = 0; - -#define MAX_COMPLEX_UNROLL 3 - while(row + MAX_COMPLEX_UNROLL*accCols <= rows) { - gemm_complex_unrolled_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); - } - switch( (rows-row)/accCols ) { -#if MAX_COMPLEX_UNROLL > 4 - case 4: - 
gemm_complex_unrolled_iteration<4, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); - break; -#endif -#if MAX_COMPLEX_UNROLL > 3 - case 3: - gemm_complex_unrolled_iteration<3, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); - break; -#endif -#if MAX_COMPLEX_UNROLL > 2 - case 2: - gemm_complex_unrolled_iteration<2, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); - break; -#endif -#if MAX_COMPLEX_UNROLL > 1 - case 1: - gemm_complex_unrolled_iteration<1, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); - break; -#endif - default: - break; - } -#undef MAX_COMPLEX_UNROLL - - if(remaining_rows > 0) - { - gemm_complex_extra_row(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask); - } + gemm_complex_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask); } - if(remaining_cols > 0) - { - const Scalar* rhs_base = blockB + advanceCols*col*strideB + remaining_cols*offsetB; - const Scalar* lhs_base = blockA; - - for(; col < cols; col++) - { - Index row = 0; - - gemm_complex_unrolled_col(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, rows, col, remaining_cols, pAlphaReal, pAlphaImag); - - if (remaining_rows > 0) - { - gemm_complex_extra_col(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_rows, remaining_cols, pAlphaReal, pAlphaImag); - } - rhs_base++; - } - } + gemm_complex_extra_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask); } #undef accColsC @@ -2649,10 +2488,10 @@ void gebp_kernel::size; void (*gemm_function)(const DataMapper&, const float*, const float*, Index, Index, Index, float, Index, Index, Index, Index); - #ifdef EIGEN_ALTIVEC_MMA_ONLY + #if defined(EIGEN_ALTIVEC_MMA_ONLY) //generate with MMA only gemm_function = &Eigen::internal::gemmMMA; - #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) + #elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ gemm_function = &Eigen::internal::gemmMMA; } @@ -2662,7 +2501,7 @@ void gebp_kernel; #endif - gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); + gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); } template @@ -2688,20 +2527,20 @@ void gebp_kernel, std::complex, Index, DataMapper, mr void (*gemm_function)(const DataMapper&, const std::complex*, const std::complex*, Index, Index, Index, std::complex, Index, Index, Index, Index); - #ifdef EIGEN_ALTIVEC_MMA_ONLY - //generate with MMA only - gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, 
false, false>; - #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) - if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ - gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; - } - else{ - gemm_function = &Eigen::internal::gemm_complex, std::complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; - } - #else - gemm_function = &Eigen::internal::gemm_complex, std::complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; - #endif - gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); + #if defined(EIGEN_ALTIVEC_MMA_ONLY) + //generate with MMA only + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; + #elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) + if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; + } + else{ + gemm_function = &Eigen::internal::gemm_complex, std::complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; + } + #else + gemm_function = &Eigen::internal::gemm_complex, std::complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; + #endif + gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); } template @@ -2726,20 +2565,20 @@ void gebp_kernel, Index, DataMapper, mr, nr, Conjugat const Index accCols = quad_traits::size; void (*gemm_function)(const DataMapper&, const float*, const std::complex*, Index, Index, Index, std::complex, Index, Index, Index, Index); - #ifdef EIGEN_ALTIVEC_MMA_ONLY - //generate with MMA only - gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; - #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) - if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ - gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; - } - else{ - gemm_function = &Eigen::internal::gemm_complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; - } - #else - gemm_function = &Eigen::internal::gemm_complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; - #endif - gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); + #if defined(EIGEN_ALTIVEC_MMA_ONLY) + //generate with MMA only + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, 
ConjugateRhs, true, false>; + #elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) + if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + } + else{ + gemm_function = &Eigen::internal::gemm_complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + } + #else + gemm_function = &Eigen::internal::gemm_complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + #endif + gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); } template @@ -2764,20 +2603,20 @@ void gebp_kernel, float, Index, DataMapper, mr, nr, Conjugat const Index accCols = quad_traits::size; void (*gemm_function)(const DataMapper&, const std::complex*, const float*, Index, Index, Index, std::complex, Index, Index, Index, Index); - #ifdef EIGEN_ALTIVEC_MMA_ONLY - //generate with MMA only - gemm_function = &Eigen::internal::gemm_complexMMA, float, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; - #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) - if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ - gemm_function = &Eigen::internal::gemm_complexMMA, float, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; - } - else{ - gemm_function = &Eigen::internal::gemm_complex, float, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; - } - #else - gemm_function = &Eigen::internal::gemm_complex, float, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; - #endif - gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); + #if defined(EIGEN_ALTIVEC_MMA_ONLY) + //generate with MMA only + gemm_function = &Eigen::internal::gemm_complexMMA, float, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; + #elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) + if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ + gemm_function = &Eigen::internal::gemm_complexMMA, float, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; + } + else{ + gemm_function = &Eigen::internal::gemm_complex, float, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; + } + #else + gemm_function = &Eigen::internal::gemm_complex, float, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; + #endif + gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); } template @@ -2801,10 +2640,10 @@ void gebp_kernel::size; void (*gemm_function)(const DataMapper&, const double*, const double*, Index, Index, Index, double, Index, Index, Index, Index); - #ifdef EIGEN_ALTIVEC_MMA_ONLY + #if defined(EIGEN_ALTIVEC_MMA_ONLY) //generate with MMA only 
gemm_function = &Eigen::internal::gemmMMA; - #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) + #elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ gemm_function = &Eigen::internal::gemmMMA; } @@ -2814,7 +2653,7 @@ void gebp_kernel; #endif - gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); + gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); } template @@ -2839,20 +2678,20 @@ void gebp_kernel, std::complex, Index, DataMapper, const Index accCols = quad_traits::size; void (*gemm_function)(const DataMapper&, const std::complex*, const std::complex*, Index, Index, Index, std::complex, Index, Index, Index, Index); - #ifdef EIGEN_ALTIVEC_MMA_ONLY - //generate with MMA only - gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; - #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) - if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ - gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; - } - else{ - gemm_function = &Eigen::internal::gemm_complex, std::complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; - } - #else - gemm_function = &Eigen::internal::gemm_complex, std::complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; - #endif - gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); + #if defined(EIGEN_ALTIVEC_MMA_ONLY) + //generate with MMA only + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; + #elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) + if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; + } + else{ + gemm_function = &Eigen::internal::gemm_complex, std::complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; + } + #else + gemm_function = &Eigen::internal::gemm_complex, std::complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; + #endif + gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); } template @@ -2877,20 +2716,20 @@ void gebp_kernel, double, Index, DataMapper, mr, nr, Conjug const Index accCols = quad_traits::size; void (*gemm_function)(const DataMapper&, const std::complex*, const double*, Index, Index, Index, std::complex, Index, Index, Index, Index); - #ifdef EIGEN_ALTIVEC_MMA_ONLY - //generate with MMA only - gemm_function = &Eigen::internal::gemm_complexMMA, double, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, 
ConjugateRhs, false, true>; - #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) - if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ - gemm_function = &Eigen::internal::gemm_complexMMA, double, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; - } - else{ - gemm_function = &Eigen::internal::gemm_complex, double, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; - } - #else - gemm_function = &Eigen::internal::gemm_complex, double, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; - #endif - gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); + #if defined(EIGEN_ALTIVEC_MMA_ONLY) + //generate with MMA only + gemm_function = &Eigen::internal::gemm_complexMMA, double, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; + #elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) + if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ + gemm_function = &Eigen::internal::gemm_complexMMA, double, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; + } + else{ + gemm_function = &Eigen::internal::gemm_complex, double, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; + } + #else + gemm_function = &Eigen::internal::gemm_complex, double, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; + #endif + gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); } template @@ -2915,20 +2754,20 @@ void gebp_kernel, Index, DataMapper, mr, nr, Conjug const Index accCols = quad_traits::size; void (*gemm_function)(const DataMapper&, const double*, const std::complex*, Index, Index, Index, std::complex, Index, Index, Index, Index); - #ifdef EIGEN_ALTIVEC_MMA_ONLY - //generate with MMA only - gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; - #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) - if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ - gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; - } - else{ - gemm_function = &Eigen::internal::gemm_complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; - } - #else - gemm_function = &Eigen::internal::gemm_complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; - #endif - gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); + #if defined(EIGEN_ALTIVEC_MMA_ONLY) + //generate with MMA only + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + #elif 
defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) + if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + } + else{ + gemm_function = &Eigen::internal::gemm_complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + } + #else + gemm_function = &Eigen::internal::gemm_complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + #endif + gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); } } // end namespace internal diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h b/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h index 33d5434..bf01dba 100644 --- a/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +++ b/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h @@ -9,22 +9,8 @@ namespace Eigen { namespace internal { -template -EIGEN_STRONG_INLINE void gemm_extra_col( - const DataMapper& res, - const Scalar* lhs_base, - const Scalar* rhs_base, - Index depth, - Index strideA, - Index offsetA, - Index row, - Index col, - Index remaining_rows, - Index remaining_cols, - const Packet& pAlpha); - template -EIGEN_STRONG_INLINE void gemm_extra_row( +EIGEN_ALWAYS_INLINE void gemm_extra_row( const DataMapper& res, const Scalar* lhs_base, const Scalar* rhs_base, @@ -39,41 +25,28 @@ EIGEN_STRONG_INLINE void gemm_extra_row( const Packet& pAlpha, const Packet& pMask); -template -EIGEN_STRONG_INLINE void gemm_unrolled_col( +template +EIGEN_STRONG_INLINE void gemm_extra_cols( const DataMapper& res, - const Scalar* lhs_base, - const Scalar* rhs_base, + const Scalar* blockA, + const Scalar* blockB, Index depth, Index strideA, Index offsetA, - Index& row, - Index rows, + Index strideB, + Index offsetB, Index col, - Index remaining_cols, - const Packet& pAlpha); + Index rows, + Index cols, + Index remaining_rows, + const Packet& pAlpha, + const Packet& pMask); template EIGEN_ALWAYS_INLINE Packet bmask(const int remaining_rows); template -EIGEN_STRONG_INLINE void gemm_complex_extra_col( - const DataMapper& res, - const Scalar* lhs_base, - const Scalar* rhs_base, - Index depth, - Index strideA, - Index offsetA, - Index strideB, - Index row, - Index col, - Index remaining_rows, - Index remaining_cols, - const Packet& pAlphaReal, - const Packet& pAlphaImag); - -template -EIGEN_STRONG_INLINE void gemm_complex_extra_row( +EIGEN_ALWAYS_INLINE void gemm_complex_extra_row( const DataMapper& res, const Scalar* lhs_base, const Scalar* rhs_base, @@ -91,123 +64,88 @@ EIGEN_STRONG_INLINE void gemm_complex_extra_row( const Packet& pMask); template -EIGEN_STRONG_INLINE void gemm_complex_unrolled_col( +EIGEN_STRONG_INLINE void gemm_complex_extra_cols( const DataMapper& res, - const Scalar* lhs_base, - const Scalar* rhs_base, + const Scalar* blockA, + const Scalar* blockB, Index depth, Index strideA, Index offsetA, Index strideB, - Index& row, - Index rows, + Index offsetB, Index col, - Index remaining_cols, + Index rows, + Index cols, + Index remaining_rows, const Packet& pAlphaReal, - const Packet& pAlphaImag); + const Packet& pAlphaImag, + const Packet& pMask); template EIGEN_ALWAYS_INLINE Packet ploadLhs(const Scalar* lhs); -template -EIGEN_ALWAYS_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, 
Index col); +template +EIGEN_ALWAYS_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col); -template -EIGEN_ALWAYS_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col); - -template -EIGEN_ALWAYS_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha); +template +EIGEN_ALWAYS_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha); template EIGEN_ALWAYS_INLINE void bscalec(PacketBlock& aReal, PacketBlock& aImag, const Packet& bReal, const Packet& bImag, PacketBlock& cReal, PacketBlock& cImag); -const static Packet16uc p16uc_SETCOMPLEX32_FIRST = { 0, 1, 2, 3, - 16, 17, 18, 19, - 4, 5, 6, 7, - 20, 21, 22, 23}; - -const static Packet16uc p16uc_SETCOMPLEX32_SECOND = { 8, 9, 10, 11, - 24, 25, 26, 27, - 12, 13, 14, 15, - 28, 29, 30, 31}; -//[a,b],[ai,bi] = [a,ai] - This is equivalent to p16uc_GETREAL64 -const static Packet16uc p16uc_SETCOMPLEX64_FIRST = { 0, 1, 2, 3, 4, 5, 6, 7, - 16, 17, 18, 19, 20, 21, 22, 23}; - -//[a,b],[ai,bi] = [b,bi] - This is equivalent to p16uc_GETIMAG64 -const static Packet16uc p16uc_SETCOMPLEX64_SECOND = { 8, 9, 10, 11, 12, 13, 14, 15, - 24, 25, 26, 27, 28, 29, 30, 31}; - - // Grab two decouples real/imaginary PacketBlocks and return two coupled (real/imaginary pairs) PacketBlocks. -template -EIGEN_ALWAYS_INLINE void bcouple_common(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& acc1, PacketBlock& acc2) +template +EIGEN_ALWAYS_INLINE void bcouple_common(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& acc1, PacketBlock& acc2) { - acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX32_FIRST); - acc1.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX32_FIRST); - acc1.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], p16uc_SETCOMPLEX32_FIRST); - acc1.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], p16uc_SETCOMPLEX32_FIRST); + acc1.packet[0].v = vec_mergeh(taccReal.packet[0], taccImag.packet[0]); + if (N > 1) { + acc1.packet[1].v = vec_mergeh(taccReal.packet[1], taccImag.packet[1]); + } + if (N > 2) { + acc1.packet[2].v = vec_mergeh(taccReal.packet[2], taccImag.packet[2]); + } + if (N > 3) { + acc1.packet[3].v = vec_mergeh(taccReal.packet[3], taccImag.packet[3]); + } - acc2.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX32_SECOND); - acc2.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX32_SECOND); - acc2.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], p16uc_SETCOMPLEX32_SECOND); - acc2.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], p16uc_SETCOMPLEX32_SECOND); + acc2.packet[0].v = vec_mergel(taccReal.packet[0], taccImag.packet[0]); + if (N > 1) { + acc2.packet[1].v = vec_mergel(taccReal.packet[1], taccImag.packet[1]); + } + if (N > 2) { + acc2.packet[2].v = vec_mergel(taccReal.packet[2], taccImag.packet[2]); + } + if (N > 3) { + acc2.packet[3].v = vec_mergel(taccReal.packet[3], taccImag.packet[3]); + } } -template -EIGEN_ALWAYS_INLINE void bcouple(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& tRes, PacketBlock& acc1, PacketBlock& acc2) +template +EIGEN_ALWAYS_INLINE void bcouple(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& tRes, PacketBlock& acc1, PacketBlock& acc2) { - bcouple_common(taccReal, taccImag, acc1, acc2); + bcouple_common(taccReal, taccImag, acc1, acc2); acc1.packet[0] = padd(tRes.packet[0], acc1.packet[0]); - acc1.packet[1] = 
padd(tRes.packet[1], acc1.packet[1]); - acc1.packet[2] = padd(tRes.packet[2], acc1.packet[2]); - acc1.packet[3] = padd(tRes.packet[3], acc1.packet[3]); + if (N > 1) { + acc1.packet[1] = padd(tRes.packet[1], acc1.packet[1]); + } + if (N > 2) { + acc1.packet[2] = padd(tRes.packet[2], acc1.packet[2]); + } + if (N > 3) { + acc1.packet[3] = padd(tRes.packet[3], acc1.packet[3]); + } - acc2.packet[0] = padd(tRes.packet[4], acc2.packet[0]); - acc2.packet[1] = padd(tRes.packet[5], acc2.packet[1]); - acc2.packet[2] = padd(tRes.packet[6], acc2.packet[2]); - acc2.packet[3] = padd(tRes.packet[7], acc2.packet[3]); -} - -template -EIGEN_ALWAYS_INLINE void bcouple_common(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& acc1, PacketBlock& acc2) -{ - acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX32_FIRST); - - acc2.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX32_SECOND); -} - -template -EIGEN_ALWAYS_INLINE void bcouple(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& tRes, PacketBlock& acc1, PacketBlock& acc2) -{ - bcouple_common(taccReal, taccImag, acc1, acc2); - - acc1.packet[0] = padd(tRes.packet[0], acc1.packet[0]); - - acc2.packet[0] = padd(tRes.packet[1], acc2.packet[0]); -} - -template<> -EIGEN_ALWAYS_INLINE void bcouple_common(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& acc1, PacketBlock& acc2) -{ - acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX64_FIRST); - acc1.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX64_FIRST); - acc1.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], p16uc_SETCOMPLEX64_FIRST); - acc1.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], p16uc_SETCOMPLEX64_FIRST); - - acc2.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX64_SECOND); - acc2.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX64_SECOND); - acc2.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], p16uc_SETCOMPLEX64_SECOND); - acc2.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], p16uc_SETCOMPLEX64_SECOND); -} - -template<> -EIGEN_ALWAYS_INLINE void bcouple_common(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& acc1, PacketBlock& acc2) -{ - acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX64_FIRST); - - acc2.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX64_SECOND); + acc2.packet[0] = padd(tRes.packet[0+N], acc2.packet[0]); + if (N > 1) { + acc2.packet[1] = padd(tRes.packet[1+N], acc2.packet[1]); + } + if (N > 2) { + acc2.packet[2] = padd(tRes.packet[2+N], acc2.packet[2]); + } + if (N > 3) { + acc2.packet[3] = padd(tRes.packet[3+N], acc2.packet[3]); + } } // This is necessary because ploadRhs for double returns a pair of vectors when MMA is enabled. diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h b/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h index 6540c6f..7dda423 100644 --- a/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +++ b/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h @@ -11,7 +11,11 @@ #ifndef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H #define EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H -#pragma GCC target("cpu=power10") +// If using dynamic dispatch, set the CPU target. 
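For context on this guard: with EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH the gebp_kernel specializations above choose between the generic VSX kernels (gemm, gemm_complex) and the MMA kernels (gemmMMA, gemm_complexMMA) through a function pointer tested with __builtin_cpu_supports, and the Power10 target pragma that follows is applied only in that configuration, so the rest of the library keeps its baseline ISA. A minimal sketch of the selection step, assuming GCC on a powerpc64le target and using illustrative kernel names rather than Eigen's:

#include <cstdio>

static void kernel_generic() { std::puts("generic VSX kernel"); }
static void kernel_power10() { std::puts("Power10 MMA kernel"); }

// Pick the kernel once at run time from the hardware capabilities, in the same
// way the specializations above assign gemm_function before calling it.
void run_dispatch()
{
  void (*kernel)() = &kernel_generic;
  if (__builtin_cpu_supports("arch_3_1") && __builtin_cpu_supports("mma"))
    kernel = &kernel_power10;  // ISA 3.1 with MMA available: take the fast path
  kernel();
}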
+#if defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) +#pragma GCC push_options +#pragma GCC target("cpu=power10,htm") +#endif #ifdef __has_builtin #if !__has_builtin(__builtin_vsx_assemble_pair) @@ -30,37 +34,37 @@ EIGEN_ALWAYS_INLINE void bsetzeroMMA(__vector_quad* acc) } template -EIGEN_ALWAYS_INLINE void storeAccumulator(Index i, Index j, const DataMapper& data, const Packet& alpha, __vector_quad* acc) +EIGEN_ALWAYS_INLINE void storeAccumulator(Index i, const DataMapper& data, const Packet& alpha, __vector_quad* acc) { PacketBlock result; __builtin_mma_disassemble_acc(&result.packet, acc); PacketBlock tRes; - bload(tRes, data, i, j); + bload(tRes, data, i, 0); - bscale(tRes, result, alpha); + bscale(tRes, result, alpha); - data.template storePacketBlock(i, j, tRes); + data.template storePacketBlock(i, 0, tRes); } -template -EIGEN_ALWAYS_INLINE void storeComplexAccumulator(Index i, Index j, const DataMapper& data, const Packet& alphaReal, const Packet& alphaImag, __vector_quad* accReal, __vector_quad* accImag) +template +EIGEN_ALWAYS_INLINE void storeComplexAccumulator(Index i, const DataMapper& data, const Packet& alphaReal, const Packet& alphaImag, __vector_quad* accReal, __vector_quad* accImag) { PacketBlock resultReal, resultImag; __builtin_mma_disassemble_acc(&resultReal.packet, accReal); __builtin_mma_disassemble_acc(&resultImag.packet, accImag); PacketBlock tRes; - bload(tRes, data, i, j); + bload(tRes, data, i, 0); PacketBlock taccReal, taccImag; bscalec(resultReal, resultImag, alphaReal, alphaImag, taccReal, taccImag); PacketBlock acc1, acc2; - bcouple(taccReal, taccImag, tRes, acc1, acc2); + bcouple(taccReal, taccImag, tRes, acc1, acc2); - data.template storePacketBlock(i + N*accColsC, j, acc1); - data.template storePacketBlock(i + (N+1)*accColsC, j, acc2); + data.template storePacketBlock(i, 0, acc1); + data.template storePacketBlock(i + accColsC, 0, acc2); } // Defaults to float32, since Eigen still supports C++03 we can't use default template arguments @@ -125,7 +129,7 @@ EIGEN_ALWAYS_INLINE void pgercMMA(__vector_quad* accReal, __vector_quad* accImag template EIGEN_ALWAYS_INLINE void ploadRhsMMA(const Scalar* rhs, Packet& rhsV) { - rhsV = ploadRhs((const Scalar*)(rhs)); + rhsV = ploadRhs(rhs); } template<> @@ -184,12 +188,11 @@ EIGEN_ALWAYS_INLINE void ploadRhsMMA(const float*, __vector_pair&) } #define MICRO_MMA_UNROLL_TYPE_PEEL(func, func2, type) \ - type rhsV0, rhsV1, rhsV2, rhsV3, rhsV4, rhsV5, rhsV6, rhsV7, rhsV8, rhsV9; \ + type rhsV0, rhsV1, rhsV2, rhsV3, rhsV4, rhsV5, rhsV6, rhsV7; \ MICRO_MMA_TYPE_PEEL(func,func2,type,0); MICRO_MMA_TYPE_PEEL(func,func2,type,1); \ MICRO_MMA_TYPE_PEEL(func,func2,type,2); MICRO_MMA_TYPE_PEEL(func,func2,type,3); \ MICRO_MMA_TYPE_PEEL(func,func2,type,4); MICRO_MMA_TYPE_PEEL(func,func2,type,5); \ - MICRO_MMA_TYPE_PEEL(func,func2,type,6); MICRO_MMA_TYPE_PEEL(func,func2,type,7); \ - MICRO_MMA_TYPE_PEEL(func,func2,type,8); MICRO_MMA_TYPE_PEEL(func,func2,type,9); + MICRO_MMA_TYPE_PEEL(func,func2,type,6); MICRO_MMA_TYPE_PEEL(func,func2,type,7); #define MICRO_MMA_UNROLL_TYPE_ONE(func, func2, type) \ type rhsV0; \ @@ -222,7 +225,7 @@ EIGEN_ALWAYS_INLINE void ploadRhsMMA(const float*, __vector_pair&) #define MICRO_MMA_SRC_PTR_ONE(iter) \ if (unroll_factor > iter) { \ - lhs_ptr##iter = lhs_base + ( (row/accCols) + iter )*strideA*accCols + accCols*offsetA; \ + lhs_ptr##iter = lhs_base + ( (row/accCols) + iter )*strideA*accCols; \ } else { \ EIGEN_UNUSED_VARIABLE(lhs_ptr##iter); \ } @@ -238,21 +241,19 @@ EIGEN_ALWAYS_INLINE void ploadRhsMMA(const 
float*, __vector_pair&) #define MICRO_MMA_STORE_ONE(iter) \ if (unroll_factor > iter) { \ - storeAccumulator(row + iter*accCols, col, res, pAlpha, &accZero##iter); \ + storeAccumulator(row + iter*accCols, res, pAlpha, &accZero##iter); \ } #define MICRO_MMA_STORE MICRO_MMA_UNROLL(MICRO_MMA_STORE_ONE) template -EIGEN_STRONG_INLINE void gemm_unrolled_MMA_iteration( +EIGEN_ALWAYS_INLINE void gemm_unrolled_MMA_iteration( const DataMapper& res, const Scalar* lhs_base, const Scalar* rhs_base, Index depth, Index strideA, - Index offsetA, Index& row, - Index col, const Packet& pAlpha) { const Scalar* rhs_ptr = rhs_base; @@ -278,11 +279,84 @@ EIGEN_STRONG_INLINE void gemm_unrolled_MMA_iteration( row += unroll_factor*accCols; } +template +EIGEN_ALWAYS_INLINE void gemmMMA_cols( + const DataMapper& res, + const Scalar* blockA, + const Scalar* blockB, + Index depth, + Index strideA, + Index offsetA, + Index strideB, + Index offsetB, + Index col, + Index rows, + Index cols, + Index remaining_rows, + const Packet& pAlpha, + const Packet& pMask) +{ + const DataMapper res3 = res.getSubMapper(0, col); + + const Scalar* rhs_base = blockB + col*strideB + accRows*offsetB; + const Scalar* lhs_base = blockA + accCols*offsetA; + Index row = 0; + +#define MAX_MMA_UNROLL 7 + while(row + MAX_MMA_UNROLL*accCols <= rows) { + gemm_unrolled_MMA_iteration(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); + } + switch( (rows-row)/accCols ) { +#if MAX_MMA_UNROLL > 7 + case 7: + gemm_unrolled_MMA_iteration<7, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); + break; +#endif +#if MAX_MMA_UNROLL > 6 + case 6: + gemm_unrolled_MMA_iteration<6, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); + break; +#endif +#if MAX_MMA_UNROLL > 5 + case 5: + gemm_unrolled_MMA_iteration<5, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); + break; +#endif +#if MAX_MMA_UNROLL > 4 + case 4: + gemm_unrolled_MMA_iteration<4, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); + break; +#endif +#if MAX_MMA_UNROLL > 3 + case 3: + gemm_unrolled_MMA_iteration<3, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); + break; +#endif +#if MAX_MMA_UNROLL > 2 + case 2: + gemm_unrolled_MMA_iteration<2, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); + break; +#endif +#if MAX_MMA_UNROLL > 1 + case 1: + gemm_unrolled_MMA_iteration<1, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); + break; +#endif + default: + break; + } +#undef MAX_MMA_UNROLL + + if(remaining_rows > 0) + { + gemm_extra_row(res3, blockA, rhs_base, depth, strideA, offsetA, row, col, rows, cols, remaining_rows, pAlpha, pMask); + } +} + template void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index rows, Index depth, Index cols, Scalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) { const Index remaining_rows = rows % accCols; - const Index remaining_cols = cols % accRows; if( strideA == -1 ) strideA = depth; if( strideB == -1 ) strideB = depth; @@ -293,79 +367,10 @@ void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index col = 
0; for(; col + accRows <= cols; col += accRows) { - const Scalar* rhs_base = blockB + col*strideB + accRows*offsetB; - const Scalar* lhs_base = blockA; - - Index row = 0; -#define MAX_MMA_UNROLL 7 - while(row + MAX_MMA_UNROLL*accCols <= rows) { - gemm_unrolled_MMA_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - } - switch( (rows-row)/accCols ) { -#if MAX_MMA_UNROLL > 7 - case 7: - gemm_unrolled_MMA_iteration<7, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; -#endif -#if MAX_MMA_UNROLL > 6 - case 6: - gemm_unrolled_MMA_iteration<6, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; -#endif -#if MAX_MMA_UNROLL > 5 - case 5: - gemm_unrolled_MMA_iteration<5, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; -#endif -#if MAX_MMA_UNROLL > 4 - case 4: - gemm_unrolled_MMA_iteration<4, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; -#endif -#if MAX_MMA_UNROLL > 3 - case 3: - gemm_unrolled_MMA_iteration<3, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; -#endif -#if MAX_MMA_UNROLL > 2 - case 2: - gemm_unrolled_MMA_iteration<2, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; -#endif -#if MAX_MMA_UNROLL > 1 - case 1: - gemm_unrolled_MMA_iteration<1, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; -#endif - default: - break; - } -#undef MAX_MMA_UNROLL - - if(remaining_rows > 0) - { - gemm_extra_row(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, rows, cols, remaining_rows, pAlpha, pMask); - } + gemmMMA_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlpha, pMask); } - if(remaining_cols > 0) - { - const Scalar* rhs_base = blockB + col*strideB + remaining_cols*offsetB; - const Scalar* lhs_base = blockA; - - for(; col < cols; col++) - { - Index row = 0; - - gemm_unrolled_col(res, lhs_base, rhs_base, depth, strideA, offsetA, row, rows, col, remaining_cols, pAlpha); - - if (remaining_rows > 0) - { - gemm_extra_col(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_rows, remaining_cols, pAlpha); - } - rhs_base++; - } - } + gemm_extra_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlpha, pMask); } #define accColsC (accCols / 2) @@ -373,21 +378,20 @@ void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, #define advanceCols ((RhsIsReal) ? 1 : 2) // PEEL_COMPLEX_MMA loop factor. 
-#define PEEL_COMPLEX_MMA 7 +#define PEEL_COMPLEX_MMA 3 #define MICRO_COMPLEX_MMA_UNROLL(func) \ - func(0) func(1) func(2) func(3) func(4) + func(0) func(1) func(2) func(3) #define MICRO_COMPLEX_MMA_LOAD_ONE(iter) \ if (unroll_factor > iter) { \ lhsV##iter = ploadLhs(lhs_ptr_real##iter); \ - lhs_ptr_real##iter += accCols; \ if(!LhsIsReal) { \ - lhsVi##iter = ploadLhs(lhs_ptr_imag##iter); \ - lhs_ptr_imag##iter += accCols; \ + lhsVi##iter = ploadLhs(lhs_ptr_real##iter + imag_delta); \ } else { \ EIGEN_UNUSED_VARIABLE(lhsVi##iter); \ } \ + lhs_ptr_real##iter += accCols; \ } else { \ EIGEN_UNUSED_VARIABLE(lhsV##iter); \ EIGEN_UNUSED_VARIABLE(lhsVi##iter); \ @@ -400,8 +404,8 @@ void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, #define MICRO_COMPLEX_MMA_TYPE_PEEL(func, func2, type, peel) \ if (PEEL_COMPLEX_MMA > peel) { \ - Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4; \ - Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3, lhsVi4; \ + Packet lhsV0, lhsV1, lhsV2, lhsV3; \ + Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3; \ ploadRhsMMA(rhs_ptr_real + (accRows * peel), rhsV##peel); \ if(!RhsIsReal) { \ ploadRhsMMA(rhs_ptr_imag + (accRows * peel), rhsVi##peel); \ @@ -409,20 +413,17 @@ void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, EIGEN_UNUSED_VARIABLE(rhsVi##peel); \ } \ MICRO_COMPLEX_MMA_UNROLL(func2); \ - func(0,type,peel) func(1,type,peel) func(2,type,peel) func(3,type,peel) func(4,type,peel) \ + func(0,type,peel) func(1,type,peel) func(2,type,peel) func(3,type,peel) \ } else { \ EIGEN_UNUSED_VARIABLE(rhsV##peel); \ EIGEN_UNUSED_VARIABLE(rhsVi##peel); \ } #define MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL(func, func2, type) \ - type rhsV0, rhsV1, rhsV2, rhsV3, rhsV4, rhsV5, rhsV6, rhsV7, rhsV8, rhsV9; \ - type rhsVi0, rhsVi1, rhsVi2, rhsVi3, rhsVi4, rhsVi5, rhsVi6, rhsVi7, rhsVi8, rhsVi9; \ + type rhsV0, rhsV1, rhsV2, rhsV3; \ + type rhsVi0, rhsVi1, rhsVi2, rhsVi3; \ MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,0); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,1); \ - MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,2); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,3); \ - MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,4); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,5); \ - MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,6); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,7); \ - MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,8); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,9); + MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,2); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,3); #define MICRO_COMPLEX_MMA_UNROLL_TYPE_ONE(func, func2, type) \ type rhsV0, rhsVi0; \ @@ -459,15 +460,9 @@ void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, #define MICRO_COMPLEX_MMA_SRC_PTR_ONE(iter) \ if (unroll_factor > iter) { \ - lhs_ptr_real##iter = lhs_base + ( ((advanceRows*row)/accCols) + iter*advanceRows )*strideA*accCols + accCols*offsetA; \ - if(!LhsIsReal) { \ - lhs_ptr_imag##iter = lhs_ptr_real##iter + accCols*strideA; \ - } else { \ - EIGEN_UNUSED_VARIABLE(lhs_ptr_imag##iter); \ - } \ + lhs_ptr_real##iter = lhs_base + ( ((advanceRows*row)/accCols) + iter*advanceRows )*strideA*accCols; \ } else { \ EIGEN_UNUSED_VARIABLE(lhs_ptr_real##iter); \ - EIGEN_UNUSED_VARIABLE(lhs_ptr_imag##iter); \ } #define MICRO_COMPLEX_MMA_SRC_PTR MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_SRC_PTR_ONE) @@ -475,45 +470,40 @@ void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, #define MICRO_COMPLEX_MMA_PREFETCH_ONE(iter) \ if (unroll_factor > iter) { \ 
EIGEN_POWER_PREFETCH(lhs_ptr_real##iter); \ - if(!LhsIsReal) { \ - EIGEN_POWER_PREFETCH(lhs_ptr_imag##iter); \ - } \ } #define MICRO_COMPLEX_MMA_PREFETCH MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_PREFETCH_ONE) #define MICRO_COMPLEX_MMA_STORE_ONE(iter) \ if (unroll_factor > iter) { \ - storeComplexAccumulator(row + iter*accCols, col, res, pAlphaReal, pAlphaImag, &accReal##iter, &accImag##iter); \ + storeComplexAccumulator(row + iter*accCols, res, pAlphaReal, pAlphaImag, &accReal##iter, &accImag##iter); \ } #define MICRO_COMPLEX_MMA_STORE MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_STORE_ONE) template -EIGEN_STRONG_INLINE void gemm_complex_unrolled_MMA_iteration( +EIGEN_ALWAYS_INLINE void gemm_complex_unrolled_MMA_iteration( const DataMapper& res, const Scalar* lhs_base, const Scalar* rhs_base, Index depth, Index strideA, - Index offsetA, Index strideB, Index& row, - Index col, const Packet& pAlphaReal, const Packet& pAlphaImag) { const Scalar* rhs_ptr_real = rhs_base; - const Scalar* rhs_ptr_imag; + const Scalar* rhs_ptr_imag = NULL; + const Index imag_delta = accCols*strideA; if(!RhsIsReal) { rhs_ptr_imag = rhs_base + accRows*strideB; } else { EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); } - const Scalar* lhs_ptr_real0 = NULL, * lhs_ptr_imag0 = NULL, * lhs_ptr_real1 = NULL, * lhs_ptr_imag1 = NULL; - const Scalar* lhs_ptr_real2 = NULL, * lhs_ptr_imag2 = NULL, * lhs_ptr_real3 = NULL, * lhs_ptr_imag3 = NULL; - const Scalar* lhs_ptr_real4 = NULL, * lhs_ptr_imag4 = NULL; - __vector_quad accReal0, accImag0, accReal1, accImag1, accReal2, accImag2, accReal3, accImag3, accReal4, accImag4; + const Scalar* lhs_ptr_real0 = NULL, * lhs_ptr_real1 = NULL; + const Scalar* lhs_ptr_real2 = NULL, * lhs_ptr_real3 = NULL; + __vector_quad accReal0, accImag0, accReal1, accImag1, accReal2, accImag2, accReal3, accImag3; MICRO_COMPLEX_MMA_SRC_PTR MICRO_COMPLEX_MMA_DST_PTR @@ -537,11 +527,70 @@ EIGEN_STRONG_INLINE void gemm_complex_unrolled_MMA_iteration( row += unroll_factor*accCols; } +template +EIGEN_ALWAYS_INLINE void gemmMMA_complex_cols( + const DataMapper& res, + const Scalar* blockA, + const Scalar* blockB, + Index depth, + Index strideA, + Index offsetA, + Index strideB, + Index offsetB, + Index col, + Index rows, + Index cols, + Index remaining_rows, + const Packet& pAlphaReal, + const Packet& pAlphaImag, + const Packet& pMask) +{ + const DataMapper res3 = res.getSubMapper(0, col); + + const Scalar* rhs_base = blockB + advanceCols*col*strideB + accRows*offsetB; + const Scalar* lhs_base = blockA + accCols*offsetA; + Index row = 0; + +#define MAX_COMPLEX_MMA_UNROLL 4 + while(row + MAX_COMPLEX_MMA_UNROLL*accCols <= rows) { + gemm_complex_unrolled_MMA_iteration(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag); + } + switch( (rows-row)/accCols ) { +#if MAX_COMPLEX_MMA_UNROLL > 4 + case 4: + gemm_complex_unrolled_MMA_iteration<4, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag); + break; +#endif +#if MAX_COMPLEX_MMA_UNROLL > 3 + case 3: + gemm_complex_unrolled_MMA_iteration<3, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag); + break; +#endif +#if MAX_COMPLEX_MMA_UNROLL > 2 + case 2: + gemm_complex_unrolled_MMA_iteration<2, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, 
accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag); + break; +#endif +#if MAX_COMPLEX_MMA_UNROLL > 1 + case 1: + gemm_complex_unrolled_MMA_iteration<1, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag); + break; +#endif + default: + break; + } +#undef MAX_COMPLEX_MMA_UNROLL + + if(remaining_rows > 0) + { + gemm_complex_extra_row(res3, blockA, rhs_base, depth, strideA, offsetA, strideB, row, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask); + } +} + template void gemm_complexMMA(const DataMapper& res, const LhsScalar* blockAc, const RhsScalar* blockBc, Index rows, Index depth, Index cols, Scalarc alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) { const Index remaining_rows = rows % accCols; - const Index remaining_cols = cols % accRows; if( strideA == -1 ) strideA = depth; if( strideB == -1 ) strideB = depth; @@ -556,74 +605,23 @@ void gemm_complexMMA(const DataMapper& res, const LhsScalar* blockAc, const RhsS Index col = 0; for(; col + accRows <= cols; col += accRows) { - const Scalar* rhs_base = blockB + advanceCols*col*strideB + accRows*offsetB; - const Scalar* lhs_base = blockA; - Index row = 0; - -#define MAX_COMPLEX_MMA_UNROLL 4 - while(row + MAX_COMPLEX_MMA_UNROLL*accCols <= rows) { - gemm_complex_unrolled_MMA_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); - } - switch( (rows-row)/accCols ) { -#if MAX_COMPLEX_MMA_UNROLL > 4 - case 4: - gemm_complex_unrolled_MMA_iteration<4, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); - break; -#endif -#if MAX_COMPLEX_MMA_UNROLL > 3 - case 3: - gemm_complex_unrolled_MMA_iteration<3, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); - break; -#endif -#if MAX_COMPLEX_MMA_UNROLL > 2 - case 2: - gemm_complex_unrolled_MMA_iteration<2, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); - break; -#endif -#if MAX_COMPLEX_MMA_UNROLL > 1 - case 1: - gemm_complex_unrolled_MMA_iteration<1, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); - break; -#endif - default: - break; - } -#undef MAX_COMPLEX_MMA_UNROLL - - if(remaining_rows > 0) - { - gemm_complex_extra_row(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask); - } + gemmMMA_complex_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask); } - if(remaining_cols > 0) - { - const Scalar* rhs_base = blockB + advanceCols*col*strideB + remaining_cols*offsetB; - const Scalar* lhs_base = blockA; - - for(; col < cols; col++) - { - Index row = 0; - - gemm_complex_unrolled_col(res, 
lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, rows, col, remaining_cols, pAlphaReal, pAlphaImag); - - if (remaining_rows > 0) - { - gemm_complex_extra_col(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_rows, remaining_cols, pAlphaReal, pAlphaImag); - } - rhs_base++; - } - } + gemm_complex_extra_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask); } #undef accColsC #undef advanceRows #undef advanceCols -#pragma GCC reset_options } // end namespace internal } // end namespace Eigen +#if defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) +#pragma GCC pop_options +#endif + #endif // EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H diff --git a/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.h b/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.h new file mode 100644 index 0000000..bb84ac9 --- /dev/null +++ b/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.h @@ -0,0 +1,2400 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2021 Chip Kerchner (chip.kerchner@ibm.com) +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_MATRIX_VECTOR_PRODUCT_ALTIVEC_H +#define EIGEN_MATRIX_VECTOR_PRODUCT_ALTIVEC_H + +#include "../../InternalHeaderCheck.h" + +#if defined(__MMA__) && !EIGEN_ALTIVEC_DISABLE_MMA +#if EIGEN_COMP_LLVM || (__GNUC__ > 10 || __GNUC_MINOR__ >= 3) +#define USE_GEMV_MMA +#endif + +#if !EIGEN_COMP_LLVM && (__GNUC__ == 10 && __GNUC_MINOR__ <= 3) +// Only allow one vector_pair in buggy gcc - gcc 10.3 has a bug +#define GCC_ONE_VECTORPAIR_BUG +#endif +#endif + +//#define USE_SLOWER_GEMV_MMA // MMA is currently not as fast as VSX in complex double GEMV (revisit when gcc is improved) + +//#define EIGEN_POWER_USE_GEMV_PREFETCH +#ifdef EIGEN_POWER_USE_GEMV_PREFETCH +#define EIGEN_POWER_GEMV_PREFETCH(p) prefetch(p) +#else +#define EIGEN_POWER_GEMV_PREFETCH(p) +#endif + +#ifdef __has_builtin +#if !__has_builtin(__builtin_vsx_assemble_pair) +#define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair +#endif +#if !__has_builtin(__builtin_vsx_disassemble_pair) +#define __builtin_vsx_disassemble_pair __builtin_mma_disassemble_pair +#endif +#endif + +#if EIGEN_COMP_LLVM +#define GEMV_BUILDPAIR_MMA(dst, src1, src2) \ + __builtin_vsx_assemble_pair(&dst, (__vector unsigned char)src2, (__vector unsigned char)src1) +#else +#if (__GNUC__ <= 10) +#if (__GNUC_MINOR__ > 3) +#define GEMV_BUILDPAIR_MMA(dst, src1, src2) \ + __builtin_vsx_assemble_pair(&dst, (__vector unsigned char)src2, (__vector unsigned char)src1) +#else +#define GEMV_BUILDPAIR_MMA(dst, src1, src2) \ + __builtin_vsx_assemble_pair(&dst, (__vector unsigned char)src1, (__vector unsigned char)src2) +#endif +#else +#define GEMV_BUILDPAIR_MMA(dst, src1, src2) \ + __builtin_vsx_build_pair(&dst, (__vector unsigned char)src1, (__vector unsigned char)src2) +#endif +#endif + +#define GEMV_IS_COMPLEX_COMPLEX ((sizeof(LhsPacket) == 16) && (sizeof(RhsPacket) == 16)) +#define GEMV_IS_FLOAT (ResPacketSize == (16 / sizeof(float))) +#define GEMV_IS_SCALAR (sizeof(ResPacket) != 16) +#define GEMV_IS_COMPLEX_FLOAT (ResPacketSize == (16 / sizeof(std::complex))) + +/** \internal multiply and add and store results */ +template +EIGEN_ALWAYS_INLINE void storeMaddData(ResScalar* res, ResPacket& palpha, ResPacket& data) +{ + pstoreu(res, 
pmadd(data, palpha, ploadu(res))); +} + +template +EIGEN_ALWAYS_INLINE void storeMaddData(ResScalar* res, ResScalar& alpha, ResScalar& data) +{ + *res += (alpha * data); +} + +#define GEMV_UNROLL(func, N) \ + func(0, N) func(1, N) func(2, N) func(3, N) \ + func(4, N) func(5, N) func(6, N) func(7, N) + +#define GEMV_UNROLL_HALF(func, N) \ + func(0, 0, 1, N) func(1, 2, 3, N) func(2, 4, 5, N) func(3, 6, 7, N) + +#define GEMV_GETN(N) (((N) * ResPacketSize) >> 2) + +#define GEMV_LOADPACKET_COL(iter) \ + lhs.template load(i + ((iter) * LhsPacketSize), j) + +#ifdef USE_GEMV_MMA +#define GEMV_UNROLL3(func, N, which) \ + func(0, N, which) func(1, N, which) func(2, N, which) func(3, N, which) \ + func(4, N, which) func(5, N, which) func(6, N, which) func(7, N, which) + +#define GEMV_UNUSED_VAR(iter, N, which) \ + if (GEMV_GETN(N) <= iter) { \ + EIGEN_UNUSED_VARIABLE(which##iter); \ + } + +#define GEMV_UNUSED_EXTRA_VAR(iter, N, which) \ + if (N <= iter) { \ + EIGEN_UNUSED_VARIABLE(which##iter); \ + } + +#define GEMV_UNUSED_EXTRA(N, which) \ + GEMV_UNROLL3(GEMV_UNUSED_EXTRA_VAR, N, which) + +#define GEMV_UNUSED(N, which) \ + GEMV_UNROLL3(GEMV_UNUSED_VAR, N, which) + +#define GEMV_INIT_MMA(iter, N) \ + if (GEMV_GETN(N) > iter) { \ + __builtin_mma_xxsetaccz(&e##iter); \ + } + +#if EIGEN_COMP_LLVM +#define GEMV_LOADPAIR_COL_MMA(iter1, iter2) \ + GEMV_BUILDPAIR_MMA(b##iter1, GEMV_LOADPACKET_COL(iter2), GEMV_LOADPACKET_COL((iter2) + 1)); +#else +#define GEMV_LOADPAIR_COL_MMA(iter1, iter2) \ + const LhsScalar& src##iter1 = lhs(i + ((iter1 * 32) / sizeof(LhsScalar)), j); \ + b##iter1 = *reinterpret_cast<__vector_pair *>(const_cast(&src##iter1)); +#endif + +#define GEMV_LOAD1A_COL_MMA(iter, N) \ + if (GEMV_GETN(N) > iter) { \ + if (GEMV_IS_FLOAT) { \ + g##iter = GEMV_LOADPACKET_COL(iter); \ + EIGEN_UNUSED_VARIABLE(b##iter); \ + } else { \ + GEMV_LOADPAIR_COL_MMA(iter, iter << 1) \ + EIGEN_UNUSED_VARIABLE(g##iter); \ + } \ + } else { \ + EIGEN_UNUSED_VARIABLE(b##iter); \ + EIGEN_UNUSED_VARIABLE(g##iter); \ + } + +#define GEMV_WORK1A_COL_MMA(iter, N) \ + if (GEMV_GETN(N) > iter) { \ + if (GEMV_IS_FLOAT) { \ + pger_vecMMA_acc(&e##iter, a0, g##iter); \ + } else { \ + pger_vecMMA_acc(&e##iter, b##iter, a0); \ + } \ + } + +#define GEMV_LOAD1B_COL_MMA(iter1, iter2, iter3, N) \ + if (GEMV_GETN(N) > iter1) { \ + if (GEMV_IS_FLOAT) { \ + GEMV_LOADPAIR_COL_MMA(iter2, iter2) \ + EIGEN_UNUSED_VARIABLE(b##iter3); \ + } else { \ + GEMV_LOADPAIR_COL_MMA(iter2, iter2 << 1) \ + GEMV_LOADPAIR_COL_MMA(iter3, iter3 << 1) \ + } \ + } else { \ + EIGEN_UNUSED_VARIABLE(b##iter2); \ + EIGEN_UNUSED_VARIABLE(b##iter3); \ + } \ + EIGEN_UNUSED_VARIABLE(g##iter2); \ + EIGEN_UNUSED_VARIABLE(g##iter3); + +#define GEMV_WORK1B_COL_MMA(iter1, iter2, iter3, N) \ + if (GEMV_GETN(N) > iter1) { \ + if (GEMV_IS_FLOAT) { \ + LhsPacket h[2]; \ + __builtin_vsx_disassemble_pair(reinterpret_cast(h), &b##iter2); \ + pger_vecMMA_acc(&e##iter2, a0, h[0]); \ + pger_vecMMA_acc(&e##iter3, a0, h[1]); \ + } else { \ + pger_vecMMA_acc(&e##iter2, b##iter2, a0); \ + pger_vecMMA_acc(&e##iter3, b##iter3, a0); \ + } \ + } + +#if EIGEN_COMP_LLVM +#define GEMV_LOAD_COL_MMA(N) \ + if (GEMV_GETN(N) > 1) { \ + GEMV_UNROLL_HALF(GEMV_LOAD1B_COL_MMA, (N >> 1)) \ + } else { \ + GEMV_UNROLL(GEMV_LOAD1A_COL_MMA, N) \ + } + +#define GEMV_WORK_COL_MMA(N) \ + if (GEMV_GETN(N) > 1) { \ + GEMV_UNROLL_HALF(GEMV_WORK1B_COL_MMA, (N >> 1)) \ + } else { \ + GEMV_UNROLL(GEMV_WORK1A_COL_MMA, N) \ + } +#else +#define GEMV_LOAD_COL_MMA(N) \ + GEMV_UNROLL(GEMV_LOAD1A_COL_MMA, N) + +#define 
GEMV_WORK_COL_MMA(N) \ + GEMV_UNROLL(GEMV_WORK1A_COL_MMA, N) +#endif + +#define GEMV_DISASSEMBLE_MMA(iter, N) \ + if (GEMV_GETN(N) > iter) { \ + __builtin_mma_disassemble_acc(&result##iter.packet, &e##iter); \ + if (!GEMV_IS_FLOAT) { \ + result##iter.packet[0][1] = result##iter.packet[1][0]; \ + result##iter.packet[2][1] = result##iter.packet[3][0]; \ + } \ + } + +#define GEMV_LOADPAIR2_COL_MMA(iter1, iter2) \ + b##iter1 = *reinterpret_cast<__vector_pair *>(res + i + ((iter2) * ResPacketSize)); + +#define GEMV_LOAD2_COL_MMA(iter1, iter2, iter3, N) \ + if (GEMV_GETN(N) > iter1) { \ + if (GEMV_IS_FLOAT) { \ + GEMV_LOADPAIR2_COL_MMA(iter2, iter2); \ + EIGEN_UNUSED_VARIABLE(b##iter3); \ + } else { \ + GEMV_LOADPAIR2_COL_MMA(iter2, iter2 << 1); \ + GEMV_LOADPAIR2_COL_MMA(iter3, iter3 << 1); \ + } \ + } else { \ + EIGEN_UNUSED_VARIABLE(b##iter2); \ + EIGEN_UNUSED_VARIABLE(b##iter3); \ + } + +#if EIGEN_COMP_LLVM +#define GEMV_WORKPAIR2_COL_MMA(iter2, iter3, iter4) \ + ResPacket f##iter2[2]; \ + __builtin_vsx_disassemble_pair(reinterpret_cast(f##iter2), &b##iter2); \ + f##iter2[0] = pmadd(result##iter2.packet[0], palpha, f##iter2[0]); \ + f##iter2[1] = pmadd(result##iter3.packet[(iter2 == iter3) ? 2 : 0], palpha, f##iter2[1]); \ + GEMV_BUILDPAIR_MMA(b##iter2, f##iter2[0], f##iter2[1]); +#else +#define GEMV_WORKPAIR2_COL_MMA(iter2, iter3, iter4) \ + if (GEMV_IS_FLOAT) { \ + __asm__ ("xvmaddasp %0,%x1,%x3\n\txvmaddasp %L0,%x2,%x3" : "+&d" (b##iter2) : "wa" (result##iter3.packet[0]), "wa" (result##iter2.packet[0]), "wa" (palpha)); \ + } else { \ + __asm__ ("xvmaddadp %0,%x1,%x3\n\txvmaddadp %L0,%x2,%x3" : "+&d" (b##iter2) : "wa" (result##iter2.packet[2]), "wa" (result##iter2.packet[0]), "wa" (palpha)); \ + } +#endif + +#define GEMV_WORK2_COL_MMA(iter1, iter2, iter3, N) \ + if (GEMV_GETN(N) > iter1) { \ + if (GEMV_IS_FLOAT) { \ + GEMV_WORKPAIR2_COL_MMA(iter2, iter3, iter2); \ + } else { \ + GEMV_WORKPAIR2_COL_MMA(iter2, iter2, iter2 << 1); \ + GEMV_WORKPAIR2_COL_MMA(iter3, iter3, iter3 << 1); \ + } \ + } + +#define GEMV_STOREPAIR2_COL_MMA(iter1, iter2) \ + *reinterpret_cast<__vector_pair *>(res + i + ((iter2) * ResPacketSize)) = b##iter1; + +#define GEMV_STORE_COL_MMA(iter, N) \ + if (GEMV_GETN(N) > iter) { \ + if (GEMV_IS_FLOAT) { \ + storeMaddData(res + i + (iter * ResPacketSize), palpha, result##iter.packet[0]); \ + } else { \ + GEMV_LOADPAIR2_COL_MMA(iter, iter << 1) \ + GEMV_WORKPAIR2_COL_MMA(iter, iter, iter << 1) \ + GEMV_STOREPAIR2_COL_MMA(iter, iter << 1) \ + } \ + } + +#define GEMV_STORE2_COL_MMA(iter1, iter2, iter3, N) \ + if (GEMV_GETN(N) > iter1) { \ + if (GEMV_IS_FLOAT) { \ + GEMV_STOREPAIR2_COL_MMA(iter2, iter2); \ + } else { \ + GEMV_STOREPAIR2_COL_MMA(iter2, iter2 << 1) \ + GEMV_STOREPAIR2_COL_MMA(iter3, iter3 << 1) \ + } \ + } + +#define GEMV_PROCESS_COL_ONE_MMA(N) \ + GEMV_UNROLL(GEMV_INIT_MMA, N) \ + Index j = j2; \ + __vector_pair b0, b1, b2, b3, b4, b5, b6, b7; \ + do { \ + LhsPacket g0, g1, g2, g3, g4, g5, g6, g7; \ + RhsPacket a0 = pset1(rhs2(j, 0)); \ + GEMV_UNROLL(GEMV_PREFETCH, N) \ + GEMV_LOAD_COL_MMA(N) \ + GEMV_WORK_COL_MMA(N) \ + } while (++j < jend); \ + GEMV_UNROLL(GEMV_DISASSEMBLE_MMA, N) \ + if (GEMV_GETN(N) <= 1) { \ + GEMV_UNROLL(GEMV_STORE_COL_MMA, N) \ + } else { \ + GEMV_UNROLL_HALF(GEMV_LOAD2_COL_MMA, (N >> 1)) \ + GEMV_UNROLL_HALF(GEMV_WORK2_COL_MMA, (N >> 1)) \ + GEMV_UNROLL_HALF(GEMV_STORE2_COL_MMA, (N >> 1)) \ + } \ + i += (ResPacketSize * N); +#endif + +#define GEMV_INIT(iter, N) \ + if (N > iter) { \ + c##iter = pset1(ResScalar(0)); \ + } else { \ + 
EIGEN_UNUSED_VARIABLE(c##iter); \ + } + +#ifdef EIGEN_POWER_USE_GEMV_PREFETCH +#define GEMV_PREFETCH(iter, N) \ + if (GEMV_GETN(N) > ((iter >> 1) + ((N >> 1) * (iter & 1)))) { \ + lhs.prefetch(i + (iter * LhsPacketSize) + prefetch_dist, j); \ + } +#else +#define GEMV_PREFETCH(iter, N) +#endif + +#define GEMV_WORK_COL(iter, N) \ + if (N > iter) { \ + c##iter = pcj.pmadd(GEMV_LOADPACKET_COL(iter), a0, c##iter); \ + } + +#define GEMV_STORE_COL(iter, N) \ + if (N > iter) { \ + pstoreu(res + i + (iter * ResPacketSize), pmadd(c##iter, palpha, ploadu(res + i + (iter * ResPacketSize)))); \ + } + +/** \internal main macro for gemv_col - initialize accumulators, multiply and add inputs, and store results */ +#define GEMV_PROCESS_COL_ONE(N) \ + GEMV_UNROLL(GEMV_INIT, N) \ + Index j = j2; \ + do { \ + RhsPacket a0 = pset1(rhs2(j, 0)); \ + GEMV_UNROLL(GEMV_PREFETCH, N) \ + GEMV_UNROLL(GEMV_WORK_COL, N) \ + } while (++j < jend); \ + GEMV_UNROLL(GEMV_STORE_COL, N) \ + i += (ResPacketSize * N); + +#ifdef USE_GEMV_MMA +#define GEMV_PROCESS_COL(N) \ + GEMV_PROCESS_COL_ONE_MMA(N) +#else +#define GEMV_PROCESS_COL(N) \ + GEMV_PROCESS_COL_ONE(N) +#endif + +/** \internal perform a matrix multiply and accumulate of packet a and packet b */ +#ifdef USE_GEMV_MMA +template +EIGEN_ALWAYS_INLINE void pger_vecMMA_acc(__vector_quad* acc, const RhsPacket& a, const LhsPacket& b) +{ + if (accumulate) + { + __builtin_mma_xvf32gerpp(acc, (__vector unsigned char)a, (__vector unsigned char)b); + } + else + { + __builtin_mma_xvf32ger(acc, (__vector unsigned char)a, (__vector unsigned char)b); + } +} + +/** \internal perform a matrix multiply and accumulate of vector_pair a and packet b */ +template +EIGEN_ALWAYS_INLINE void pger_vecMMA_acc(__vector_quad* acc, __vector_pair& a, const LhsPacket& b) +{ + if (accumulate) + { + __builtin_mma_xvf64gerpp(acc, a, (__vector unsigned char)b); + } + else + { + __builtin_mma_xvf64ger(acc, a, (__vector unsigned char)b); + } +} +#endif + +template +EIGEN_STRONG_INLINE void gemv_col( + Index rows, Index cols, + const LhsMapper& alhs, + const RhsMapper& rhs, + ResScalar* res, Index resIncr, + ResScalar alpha) +{ + typedef gemv_traits Traits; + + typedef typename Traits::LhsPacket LhsPacket; + typedef typename Traits::RhsPacket RhsPacket; + typedef typename Traits::ResPacket ResPacket; + + EIGEN_UNUSED_VARIABLE(resIncr); + eigen_internal_assert(resIncr == 1); + + // The following copy tells the compiler that lhs's attributes are not modified outside this function + // This helps GCC to generate proper code. + LhsMapper lhs(alhs); + RhsMapper rhs2(rhs); + + conj_helper cj; + conj_helper pcj; + + const Index lhsStride = lhs.stride(); + // TODO: for padded aligned inputs, we could enable aligned reads + enum { + LhsAlignment = Unaligned, + ResPacketSize = Traits::ResPacketSize, + LhsPacketSize = Traits::LhsPacketSize, + RhsPacketSize = Traits::RhsPacketSize, + }; + +#ifndef GCC_ONE_VECTORPAIR_BUG + const Index n8 = rows - 8 * ResPacketSize + 1; + const Index n4 = rows - 4 * ResPacketSize + 1; + const Index n2 = rows - 2 * ResPacketSize + 1; +#endif + const Index n1 = rows - 1 * ResPacketSize + 1; +#ifdef EIGEN_POWER_USE_GEMV_PREFETCH + const Index prefetch_dist = 64 * LhsPacketSize; +#endif + + // TODO: improve the following heuristic: + const Index block_cols = cols < 128 ? cols : (lhsStride * sizeof(LhsScalar) < 16000 ? 
16 : 8); + ResPacket palpha = pset1(alpha); + + for (Index j2 = 0; j2 < cols; j2 += block_cols) + { + Index jend = numext::mini(j2 + block_cols, cols); + Index i = 0; + ResPacket c0, c1, c2, c3, c4, c5, c6, c7; +#ifdef USE_GEMV_MMA + __vector_quad e0, e1, e2, e3, e4, e5, e6, e7; + PacketBlock result0, result1, result2, result3, result4, result5, result6, result7; + GEMV_UNUSED(8, e) + GEMV_UNUSED(8, result) + GEMV_UNUSED_EXTRA(1, c) +#endif +#ifndef GCC_ONE_VECTORPAIR_BUG + while (i < n8) + { + GEMV_PROCESS_COL(8) + } + if (i < n4) + { + GEMV_PROCESS_COL(4) + } + if (i < n2) + { + GEMV_PROCESS_COL(2) + } + if (i < n1) +#else + while (i < n1) +#endif + { + GEMV_PROCESS_COL_ONE(1) + } + for (;i < rows;++i) + { + ResScalar d0(0); + Index j = j2; + do { + d0 += cj.pmul(lhs(i, j), rhs2(j, 0)); + } while (++j < jend); + res[i] += alpha * d0; + } + } +} + +const Packet16uc p16uc_COMPLEX32_XORFLIP = { 0x44,0x55,0x66,0x77, 0x00,0x11,0x22,0x33, 0xcc,0xdd,0xee,0xff, 0x88,0x99,0xaa,0xbb }; +const Packet16uc p16uc_COMPLEX64_XORFLIP = { 0x88,0x99,0xaa,0xbb, 0xcc,0xdd,0xee,0xff, 0x00,0x11,0x22,0x33, 0x44,0x55,0x66,0x77 }; + +#ifdef _BIG_ENDIAN +const Packet16uc p16uc_COMPLEX32_CONJ_XOR = { 0x00,0x00,0x00,0x00, 0x80,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x80,0x00,0x00,0x00 }; +const Packet16uc p16uc_COMPLEX64_CONJ_XOR = { 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x80,0x00,0x00,0x00, 0x00,0x00,0x00,0x00 }; +const Packet16uc p16uc_COMPLEX32_CONJ_XOR2 = { 0x80,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x80,0x00,0x00,0x00, 0x00,0x00,0x00,0x00 }; +const Packet16uc p16uc_COMPLEX64_CONJ_XOR2 = { 0x80,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00 }; +const Packet16uc p16uc_COMPLEX32_NEGATE = { 0x80,0x00,0x00,0x00, 0x80,0x00,0x00,0x00, 0x80,0x00,0x00,0x00, 0x80,0x00,0x00,0x00 }; +const Packet16uc p16uc_COMPLEX64_NEGATE = { 0x80,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x80,0x00,0x00,0x00, 0x00,0x00,0x00,0x00 }; +#else +const Packet16uc p16uc_COMPLEX32_CONJ_XOR = { 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x80, 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x80 }; +const Packet16uc p16uc_COMPLEX64_CONJ_XOR = { 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x80 }; +const Packet16uc p16uc_COMPLEX32_CONJ_XOR2 = { 0x00,0x00,0x00,0x80, 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x80, 0x00,0x00,0x00,0x00 }; +const Packet16uc p16uc_COMPLEX64_CONJ_XOR2 = { 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x80, 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00 }; +const Packet16uc p16uc_COMPLEX32_NEGATE = { 0x00,0x00,0x00,0x80, 0x00,0x00,0x00,0x80, 0x00,0x00,0x00,0x80, 0x00,0x00,0x00,0x80 }; +const Packet16uc p16uc_COMPLEX64_NEGATE = { 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x80, 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x80 }; +#endif + +#ifdef _BIG_ENDIAN +#define COMPLEX_DELTA 0 +#else +#define COMPLEX_DELTA 2 +#endif + +/** \internal packet conjugate (same as pconj but uses the constants in pcplxflipconj for better code generation) */ +EIGEN_ALWAYS_INLINE Packet2cf pconj2(const Packet2cf& a) { + return Packet2cf(pxor(a.v, reinterpret_cast(p16uc_COMPLEX32_CONJ_XOR))); +} + +EIGEN_ALWAYS_INLINE Packet1cd pconj2(const Packet1cd& a) { + return Packet1cd(pxor(a.v, reinterpret_cast(p16uc_COMPLEX64_CONJ_XOR))); +} + +/** \internal packet conjugate with real & imaginary operation inverted */ +EIGEN_ALWAYS_INLINE Packet2cf pconjinv(const Packet2cf& a) { +#ifdef __POWER8_VECTOR__ + return Packet2cf(Packet4f(vec_neg(Packet2d(a.v)))); +#else + return Packet2cf(pxor(a.v, reinterpret_cast(p16uc_COMPLEX32_CONJ_XOR2))); +#endif +} + 
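// --- Illustrative aside (not part of the patch) ------------------------------
// The CONJ_XOR byte masks above encode nothing more than "flip the IEEE-754
// sign bit of selected float lanes". A minimal standalone sketch, using plain
// scalars and hypothetical helper names, showing that flipping the imaginary
// lane's sign bit is complex conjugation (as in pconj2) and flipping the real
// lane's sign bit yields -conj(a) (as in the pconjinv fallback):
#include <cassert>
#include <complex>
#include <cstdint>
#include <cstring>

static float flip_sign_bit(float x) {
  std::uint32_t u;
  std::memcpy(&u, &x, sizeof(u));
  u ^= 0x80000000u;  // the same bit the 0x80 byte in the masks targets
  std::memcpy(&x, &u, sizeof(x));
  return x;
}

int main() {
  const std::complex<float> a(1.5f, -2.0f);
  // pconj2-like: negate the imaginary lane only -> conj(a)
  const std::complex<float> conj_a(a.real(), flip_sign_bit(a.imag()));
  assert(conj_a == std::conj(a));
  // pconjinv-like: negate the real lane only -> -conj(a)
  const std::complex<float> conjinv_a(flip_sign_bit(a.real()), a.imag());
  assert(conjinv_a == -std::conj(a));
  return 0;
}
// ------------------------------------------------------------------------------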
+EIGEN_ALWAYS_INLINE Packet1cd pconjinv(const Packet1cd& a) { + return Packet1cd(pxor(a.v, reinterpret_cast(p16uc_COMPLEX64_CONJ_XOR2))); +} + +#if defined(_ARCH_PWR8) && (!EIGEN_COMP_LLVM || __clang_major__ >= 12) +#define PERMXOR_GOOD // Clang had a bug with vec_permxor and endianness prior to version 12 +#endif + +/** \internal flip the real & imaginary results and packet conjugate */ +EIGEN_ALWAYS_INLINE Packet2cf pcplxflipconj(Packet2cf a) +{ +#ifdef PERMXOR_GOOD + return Packet2cf(Packet4f(vec_permxor(Packet16uc(a.v), p16uc_COMPLEX32_CONJ_XOR, p16uc_COMPLEX32_XORFLIP))); +#else + return pcplxflip(pconj2(a)); +#endif +} + +EIGEN_ALWAYS_INLINE Packet1cd pcplxflipconj(Packet1cd a) +{ +#ifdef PERMXOR_GOOD + return Packet1cd(Packet2d(vec_permxor(Packet16uc(a.v), p16uc_COMPLEX64_CONJ_XOR, p16uc_COMPLEX64_XORFLIP))); +#else + return pcplxflip(pconj2(a)); +#endif +} + +/** \internal packet conjugate and flip the real & imaginary results */ +EIGEN_ALWAYS_INLINE Packet2cf pcplxconjflip(Packet2cf a) +{ +#ifdef PERMXOR_GOOD + return Packet2cf(Packet4f(vec_permxor(Packet16uc(a.v), p16uc_COMPLEX32_CONJ_XOR2, p16uc_COMPLEX32_XORFLIP))); +#else + return pconj2(pcplxflip(a)); +#endif +} + +EIGEN_ALWAYS_INLINE Packet1cd pcplxconjflip(Packet1cd a) +{ +#ifdef PERMXOR_GOOD + return Packet1cd(Packet2d(vec_permxor(Packet16uc(a.v), p16uc_COMPLEX64_CONJ_XOR2, p16uc_COMPLEX64_XORFLIP))); +#else + return pconj2(pcplxflip(a)); +#endif +} + +/** \internal packet negate */ +EIGEN_ALWAYS_INLINE Packet2cf pnegate2(Packet2cf a) +{ +#ifdef __POWER8_VECTOR__ + return Packet2cf(vec_neg(a.v)); +#else + return Packet2cf(pxor(a.v, reinterpret_cast(p16uc_COMPLEX32_NEGATE))); +#endif +} + +EIGEN_ALWAYS_INLINE Packet1cd pnegate2(Packet1cd a) +{ +#ifdef __POWER8_VECTOR__ + return Packet1cd(vec_neg(a.v)); +#else + return Packet1cd(pxor(a.v, reinterpret_cast(p16uc_COMPLEX64_NEGATE))); +#endif +} + +/** \internal flip the real & imaginary results and negate */ +EIGEN_ALWAYS_INLINE Packet2cf pcplxflipnegate(Packet2cf a) +{ +#ifdef PERMXOR_GOOD + return Packet2cf(Packet4f(vec_permxor(Packet16uc(a.v), p16uc_COMPLEX32_NEGATE, p16uc_COMPLEX32_XORFLIP))); +#else + return pcplxflip(pnegate2(a)); +#endif +} + +EIGEN_ALWAYS_INLINE Packet1cd pcplxflipnegate(Packet1cd a) +{ +#ifdef PERMXOR_GOOD + return Packet1cd(Packet2d(vec_permxor(Packet16uc(a.v), p16uc_COMPLEX64_NEGATE, p16uc_COMPLEX64_XORFLIP))); +#else + return pcplxflip(pnegate2(a)); +#endif +} + +/** \internal flip the real & imaginary results */ +EIGEN_ALWAYS_INLINE Packet2cf pcplxflip2(Packet2cf a) +{ + return Packet2cf(Packet4f(vec_perm(Packet16uc(a.v), Packet16uc(a.v), p16uc_COMPLEX32_XORFLIP))); +} + +EIGEN_ALWAYS_INLINE Packet1cd pcplxflip2(Packet1cd a) +{ +#ifdef EIGEN_VECTORIZE_VSX + return Packet1cd(__builtin_vsx_xxpermdi(a.v, a.v, 2)); +#else + return Packet1cd(Packet2d(vec_perm(Packet16uc(a.v), Packet16uc(a.v), p16uc_COMPLEX64_XORFLIP))); +#endif +} + +/** \internal load half a vector with one complex value */ +EIGEN_ALWAYS_INLINE Packet4f pload_complex_half(std::complex* src) +{ + Packet4f t; +#ifdef EIGEN_VECTORIZE_VSX + // Load float64/two float32 (doubleword alignment) + __asm__("lxsdx %x0,%y1" : "=wa" (t) : "Z" (*src)); +#else + *reinterpret_cast*>(reinterpret_cast(&t) + COMPLEX_DELTA) = *src; +#endif + return t; +} + +/** \internal load two vectors from the real and imaginary portions of a complex value */ +template +EIGEN_ALWAYS_INLINE void pload_realimag(RhsScalar* src, Packet4f& r, Packet4f& i) +{ +#ifdef _ARCH_PWR9 + __asm__("lxvwsx %x0,%y1" : "=wa" (r) : "Z" 
(*(reinterpret_cast(src) + 0))); + __asm__("lxvwsx %x0,%y1" : "=wa" (i) : "Z" (*(reinterpret_cast(src) + 1))); +#else + Packet4f t = pload_complex_half(src); + r = vec_splat(t, COMPLEX_DELTA + 0); + i = vec_splat(t, COMPLEX_DELTA + 1); +#endif +} + +template +EIGEN_ALWAYS_INLINE void pload_realimag(RhsScalar* src, Packet2d& r, Packet2d& i) +{ +#ifdef EIGEN_VECTORIZE_VSX + __asm__("lxvdsx %x0,%y1" : "=wa" (r) : "Z" (*(reinterpret_cast(src) + 0))); + __asm__("lxvdsx %x0,%y1" : "=wa" (i) : "Z" (*(reinterpret_cast(src) + 1))); +#else + Packet2d t = ploadu(reinterpret_cast(src)); + r = vec_splat(t, 0); + i = vec_splat(t, 1); +#endif +} + +#ifndef __POWER8_VECTOR__ +const Packet16uc p16uc_MERGEE = { 0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, 0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B }; + +const Packet16uc p16uc_MERGEO = { 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F }; +#endif + +/** \internal load two vectors from the interleaved real & imaginary values of src */ +template +EIGEN_ALWAYS_INLINE void pload_realimag_row(RhsScalar* src, Packet4f& r, Packet4f& i) +{ + Packet4f t = ploadu(reinterpret_cast(src)); +#ifdef __POWER8_VECTOR__ + r = vec_mergee(t, t); + i = vec_mergeo(t, t); +#else + r = vec_perm(t, t, p16uc_MERGEE); + i = vec_perm(t, t, p16uc_MERGEO); +#endif +} + +template +EIGEN_ALWAYS_INLINE void pload_realimag_row(RhsScalar* src, Packet2d& r, Packet2d& i) +{ + return pload_realimag(src, r, i); +} + +/** \internal load and splat a complex value into a vector - column-wise */ +EIGEN_ALWAYS_INLINE Packet4f pload_realimag_combine(std::complex* src) +{ +#ifdef EIGEN_VECTORIZE_VSX + Packet4f ret; + __asm__("lxvdsx %x0,%y1" : "=wa" (ret) : "Z" (*(reinterpret_cast(src) + 0))); + return ret; +#else + return Packet4f(ploaddup(reinterpret_cast(src))); +#endif +} + +EIGEN_ALWAYS_INLINE Packet2d pload_realimag_combine(std::complex* src) +{ + return ploadu(src).v; +} + +/** \internal load a complex value into a vector - row-wise */ +EIGEN_ALWAYS_INLINE Packet4f pload_realimag_combine_row(std::complex* src) +{ + return ploadu(src).v; +} + +EIGEN_ALWAYS_INLINE Packet2d pload_realimag_combine_row(std::complex* src) +{ + return ploadu(src).v; +} + +/** \internal load a scalar or a vector from complex location */ +template +EIGEN_ALWAYS_INLINE Packet4f pload_complex(std::complex* src) +{ + if (GEMV_IS_SCALAR) { + return pload_complex_half(src); + } + else + { + return ploadu(reinterpret_cast(src)); + } +} + +template +EIGEN_ALWAYS_INLINE Packet2d pload_complex(std::complex* src) +{ + return ploadu(reinterpret_cast(src)); +} + +/** \internal load from a complex vector and convert to a real vector */ +template +EIGEN_ALWAYS_INLINE Packet4f pload_complex(Packet2cf* src) +{ + return src->v; +} + +template +EIGEN_ALWAYS_INLINE Packet2d pload_complex(Packet1cd* src) +{ + return src->v; +} + +/** \internal load a full vector from complex location - column-wise */ +EIGEN_ALWAYS_INLINE Packet4f pload_complex_full(std::complex* src) +{ + return Packet4f(ploaddup(reinterpret_cast(src))); +} + +EIGEN_ALWAYS_INLINE Packet2d pload_complex_full(std::complex* src) +{ + return ploadu(src).v; +} + +/** \internal load a full vector from complex location - row-wise */ +EIGEN_ALWAYS_INLINE Packet4f pload_complex_full_row(std::complex* src) +{ + return ploadu(src).v; +} + +EIGEN_ALWAYS_INLINE Packet2d pload_complex_full_row(std::complex* src) +{ + return pload_complex_full(src); +} + +/** \internal load a vector from a real-only scalar location - column-wise */ 
+EIGEN_ALWAYS_INLINE Packet4f pload_real(float* src) +{ + return pset1(*src); +} + +EIGEN_ALWAYS_INLINE Packet2d pload_real(double* src) +{ + return pset1(*src); +} + +EIGEN_ALWAYS_INLINE Packet4f pload_real(Packet4f& src) +{ + return src; +} + +EIGEN_ALWAYS_INLINE Packet2d pload_real(Packet2d& src) +{ + return src; +} + +/** \internal load a vector from a real-only vector location */ +EIGEN_ALWAYS_INLINE Packet4f pload_real_full(float* src) +{ + Packet4f ret = ploadu(src); + return vec_mergeh(ret, ret); +} + +EIGEN_ALWAYS_INLINE Packet2d pload_real_full(double* src) +{ + return pload_real(src); +} + +EIGEN_ALWAYS_INLINE Packet4f pload_real_full(std::complex* src) +{ + return pload_complex_full(src); // Just for compilation +} + +EIGEN_ALWAYS_INLINE Packet2d pload_real_full(std::complex* src) +{ + return pload_complex_full(src); // Just for compilation +} + +/** \internal load a vector from a real-only scalar location - row-wise */ +template +EIGEN_ALWAYS_INLINE Packet4f pload_real_row(float* src) +{ + if (GEMV_IS_SCALAR) { + return pload_real_full(src); + } + else { + return ploadu(src); + } +} + +template +EIGEN_ALWAYS_INLINE Packet2d pload_real_row(double* src) +{ + return pload_real(src); +} + +EIGEN_ALWAYS_INLINE Packet2cf padd(Packet2cf& a, std::complex& b) +{ + EIGEN_UNUSED_VARIABLE(b); + return a; // Just for compilation +} + +EIGEN_ALWAYS_INLINE Packet1cd padd(Packet1cd& a, std::complex& b) +{ + EIGEN_UNUSED_VARIABLE(b); + return a; // Just for compilation +} + +/** \internal set a scalar from complex location */ +template +EIGEN_ALWAYS_INLINE Scalar pset1_realimag(ResScalar& alpha, int which, int conj) +{ + return (which) ? ((conj) ? -alpha.real() : alpha.real()) : ((conj) ? -alpha.imag() : alpha.imag()); +} + +/** \internal set a vector from complex location */ +template +EIGEN_ALWAYS_INLINE Packet2cf pset1_complex(std::complex& alpha) +{ + Packet2cf ret; + ret.v[COMPLEX_DELTA + 0] = pset1_realimag(alpha, (which & 0x01), (which & 0x04)); + ret.v[COMPLEX_DELTA + 1] = pset1_realimag(alpha, (which & 0x02), (which & 0x08)); + ret.v[2 - COMPLEX_DELTA] = ret.v[COMPLEX_DELTA + 0]; + ret.v[3 - COMPLEX_DELTA] = ret.v[COMPLEX_DELTA + 1]; + return ret; +} + +template +EIGEN_ALWAYS_INLINE Packet1cd pset1_complex(std::complex& alpha) +{ + Packet1cd ret; + ret.v[0] = pset1_realimag(alpha, (which & 0x01), (which & 0x04)); + ret.v[1] = pset1_realimag(alpha, (which & 0x02), (which & 0x08)); + return ret; +} + +/** \internal zero out a vector for real or complex forms */ +template +EIGEN_ALWAYS_INLINE Packet pset_zero() +{ + return pset1(__UNPACK_TYPE__(Packet)(0)); +} + +template<> +EIGEN_ALWAYS_INLINE Packet2cf pset_zero() +{ + return Packet2cf(pset1(float(0))); +} + +template<> +EIGEN_ALWAYS_INLINE Packet1cd pset_zero() +{ + return Packet1cd(pset1(double(0))); +} + +/** \internal initialize a vector from another vector */ +template +EIGEN_ALWAYS_INLINE Packet pset_init(Packet& c1) +{ + if (GEMV_IS_COMPLEX_COMPLEX) { + EIGEN_UNUSED_VARIABLE(c1); + return pset_zero(); + } + else + { + return c1; // Intentionally left uninitialized + } +} + +template +struct alpha_store +{ + alpha_store(ResScalar& alpha) { + separate.r = pset1_complex(alpha); + separate.i = pset1_complex(alpha); + } + struct ri { + PResPacket r; + PResPacket i; + } separate; +}; + +/** \internal multiply and add for complex math */ +template +EIGEN_ALWAYS_INLINE ScalarPacket pmadd_complex(ScalarPacket& c0, ScalarPacket& c2, ScalarPacket& c4, AlphaData& b0) +{ + return pmadd(c2, b0.separate.i.v, pmadd(c0, b0.separate.r.v, c4)); +} 
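// --- Illustrative aside (not part of the patch) ------------------------------
// pmadd_complex above relies on the identity
//   alpha*c = alpha.re*(cr, ci) + alpha.im*(-ci, cr),
// where (-ci, cr) is the conjugate-then-flip of c produced by pcplxflipconj and
// alpha.re / alpha.im are the splatted packets held in alpha_store. A minimal
// standalone check of that identity with plain std::complex values (assuming no
// conjugation of alpha; the variable names are hypothetical):
#include <cassert>
#include <complex>

int main() {
  const std::complex<float> alpha(0.5f, -1.25f);
  const std::complex<float> c(3.0f, 2.0f);
  const std::complex<float> acc(10.0f, -4.0f);  // plays the role of c4, the value loaded from res

  // pcplxflipconj-like: (cr, ci) -> (-ci, cr)
  const std::complex<float> c_flipconj(-c.imag(), c.real());

  // Lane-wise acc + alpha.re*c + alpha.im*flipconj(c), as the packet code does it.
  const std::complex<float> lanewise(
      acc.real() + alpha.real() * c.real() + alpha.imag() * c_flipconj.real(),
      acc.imag() + alpha.real() * c.imag() + alpha.imag() * c_flipconj.imag());

  const std::complex<float> reference = acc + alpha * c;
  assert(std::abs(lanewise - reference) < 1e-5f);
  return 0;
}
// ------------------------------------------------------------------------------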
+ +/** \internal store and madd for complex math */ +template +EIGEN_ALWAYS_INLINE void pstoreu_pmadd_complex(PResPacket& c0, AlphaData& b0, ResScalar* res) +{ + PResPacket c2 = pcplxflipconj(c0); + if (GEMV_IS_SCALAR) { + ScalarPacket c4 = ploadu(reinterpret_cast(res)); + ScalarPacket c3 = pmadd_complex(c0.v, c2.v, c4, b0); + pstoreu(reinterpret_cast(res), c3); + } else { + ScalarPacket c4 = pload_complex(res); + PResPacket c3 = PResPacket(pmadd_complex(c0.v, c2.v, c4, b0)); + pstoreu(res, c3); + } +} + +template +EIGEN_ALWAYS_INLINE void pstoreu_pmadd_complex(PResPacket& c0, PResPacket& c1, AlphaData& b0, ResScalar* res) +{ + PResPacket c2 = pcplxflipconj(c0); + PResPacket c3 = pcplxflipconj(c1); +#if !defined(_ARCH_PWR10) + ScalarPacket c4 = pload_complex(res + (iter2 * ResPacketSize)); + ScalarPacket c5 = pload_complex(res + ((iter2 + 1) * ResPacketSize)); + PResPacket c6 = PResPacket(pmadd_complex(c0.v, c2.v, c4, b0)); + PResPacket c7 = PResPacket(pmadd_complex(c1.v, c3.v, c5, b0)); + pstoreu(res + (iter2 * ResPacketSize), c6); + pstoreu(res + ((iter2 + 1) * ResPacketSize), c7); +#else + __vector_pair a = *reinterpret_cast<__vector_pair *>(res + (iter2 * ResPacketSize)); +#if EIGEN_COMP_LLVM + PResPacket c6[2]; + __builtin_vsx_disassemble_pair(reinterpret_cast(c6), &a); + c6[0] = PResPacket(pmadd_complex(c0.v, c2.v, c6[0].v, b0)); + c6[1] = PResPacket(pmadd_complex(c1.v, c3.v, c6[1].v, b0)); + GEMV_BUILDPAIR_MMA(a, c6[0].v, c6[1].v); +#else + if (GEMV_IS_COMPLEX_FLOAT) { + __asm__ ("xvmaddasp %L0,%x1,%x2\n\txvmaddasp %0,%x1,%x3" : "+&d" (a) : "wa" (b0.separate.r.v), "wa" (c0.v), "wa" (c1.v)); + __asm__ ("xvmaddasp %L0,%x1,%x2\n\txvmaddasp %0,%x1,%x3" : "+&d" (a) : "wa" (b0.separate.i.v), "wa" (c2.v), "wa" (c3.v)); + } else { + __asm__ ("xvmaddadp %L0,%x1,%x2\n\txvmaddadp %0,%x1,%x3" : "+&d" (a) : "wa" (b0.separate.r.v), "wa" (c0.v), "wa" (c1.v)); + __asm__ ("xvmaddadp %L0,%x1,%x2\n\txvmaddadp %0,%x1,%x3" : "+&d" (a) : "wa" (b0.separate.i.v), "wa" (c2.v), "wa" (c3.v)); + } +#endif + *reinterpret_cast<__vector_pair *>(res + (iter2 * ResPacketSize)) = a; +#endif +} + +/** \internal load lhs packet */ +template +EIGEN_ALWAYS_INLINE LhsPacket loadLhsPacket(LhsMapper& lhs, Index i, Index j) +{ + if (sizeof(Scalar) == sizeof(LhsScalar)) { + const LhsScalar& src = lhs(i + 0, j); + return LhsPacket(pload_real_full(const_cast(&src))); + } + return lhs.template load(i + 0, j); +} + +/** \internal madd for complex times complex */ +template +EIGEN_ALWAYS_INLINE RealPacket pmadd_complex_complex(RealPacket& a, RealPacket& b, RealPacket& c) +{ + if (ConjugateLhs && ConjugateRhs) { + return vec_madd(a, pconj2(ComplexPacket(b)).v, c); + } + else if (Negate && !ConjugateLhs && ConjugateRhs) { + return vec_nmsub(a, b, c); + } + else { + return vec_madd(a, b, c); + } +} + +/** \internal madd for complex times real */ +template +EIGEN_ALWAYS_INLINE RealPacket pmadd_complex_real(RealPacket& a, RealPacket& b, RealPacket& c) +{ + if (Conjugate) { + return vec_madd(a, pconj2(ComplexPacket(b)).v, c); + } + else { + return vec_madd(a, b, c); + } +} + +template +EIGEN_ALWAYS_INLINE void gemv_mult_generic(LhsPacket& a0, RhsScalar* b, PResPacket& c0) +{ + conj_helper pcj; + RhsPacket b0; + if (StorageOrder == ColMajor) { + b0 = pset1(*b); + } + else { + b0 = ploadu(b); + } + c0 = pcj.pmadd(a0, b0, c0); +} + +/** \internal core multiply operation for vectors - complex times complex */ +template +EIGEN_ALWAYS_INLINE void gemv_mult_complex_complex(LhsPacket& a0, RhsScalar* b, PResPacket& c0, ResPacket& c1) +{ + 
ScalarPacket br, bi; + if (StorageOrder == ColMajor) { + pload_realimag(b, br, bi); + } + else { + pload_realimag_row(b, br, bi); + } + if (ConjugateLhs && !ConjugateRhs) a0 = pconj2(a0); + LhsPacket a1 = pcplxflipconj(a0); + ScalarPacket cr = pmadd_complex_complex(a0.v, br, c0.v); + ScalarPacket ci = pmadd_complex_complex(a1.v, bi, c1.v); + c1 = ResPacket(ci); + c0 = PResPacket(cr); +} + +/** \internal core multiply operation for vectors - real times complex */ +template +EIGEN_ALWAYS_INLINE void gemv_mult_real_complex(LhsPacket& a0, RhsScalar* b, PResPacket& c0) +{ + ScalarPacket b0; + if (StorageOrder == ColMajor) { + b0 = pload_complex_full(b); + } + else { + b0 = pload_complex_full_row(b); + } + ScalarPacket cri = pmadd_complex_real(a0, b0, c0.v); + c0 = PResPacket(cri); +} + +/** \internal core multiply operation for vectors - complex times real */ +template +EIGEN_ALWAYS_INLINE void gemv_mult_complex_real(LhsPacket& a0, RhsScalar* b, PResPacket& c0) +{ + ScalarPacket a1 = pload_complex(&a0); + ScalarPacket b0; + if (StorageOrder == ColMajor) { + b0 = pload_real(b); + } + else { + b0 = pload_real_row(b); + } + ScalarPacket cri = pmadd_complex_real(a1, b0, c0.v); + c0 = PResPacket(cri); +} + +#define GEMV_MULT_COMPLEX_COMPLEX(LhsType, RhsType, ResType) \ +template \ +EIGEN_ALWAYS_INLINE void gemv_mult_complex(LhsType& a0, RhsType* b, ResType& c0, ResType& c1) \ +{ \ + gemv_mult_complex_complex(a0, b, c0, c1); \ +} + +GEMV_MULT_COMPLEX_COMPLEX(Packet2cf, std::complex, Packet2cf) +GEMV_MULT_COMPLEX_COMPLEX(Packet1cd, std::complex, Packet1cd) + +#define GEMV_MULT_REAL_COMPLEX(LhsType, RhsType, ResType) \ +template \ +EIGEN_ALWAYS_INLINE void gemv_mult_complex(LhsType& a0, RhsType* b, ResType& c0, RhsType&) \ +{ \ + gemv_mult_real_complex(a0, b, c0); \ +} + +GEMV_MULT_REAL_COMPLEX(float, std::complex, Packet2cf) +GEMV_MULT_REAL_COMPLEX(double, std::complex, Packet1cd) +GEMV_MULT_REAL_COMPLEX(Packet4f, std::complex, Packet2cf) +GEMV_MULT_REAL_COMPLEX(Packet2d, std::complex, Packet1cd) + +#define GEMV_MULT_COMPLEX_REAL(LhsType, RhsType, ResType1, ResType2) \ +template \ +EIGEN_ALWAYS_INLINE void gemv_mult_complex(LhsType& a0, RhsType* b, ResType1& c0, ResType2&) \ +{ \ + gemv_mult_complex_real(a0, b, c0); \ +} + +GEMV_MULT_COMPLEX_REAL(Packet2cf, float, Packet2cf, std::complex) +GEMV_MULT_COMPLEX_REAL(Packet1cd, double, Packet1cd, std::complex) +GEMV_MULT_COMPLEX_REAL(std::complex, float, Packet2cf, std::complex) +GEMV_MULT_COMPLEX_REAL(std::complex, double, Packet1cd, std::complex) + +#ifdef USE_GEMV_MMA +/** \internal convert packet to real form */ +template +EIGEN_ALWAYS_INLINE T convertReal(T a) +{ + return a; +} + +EIGEN_ALWAYS_INLINE Packet4f convertReal(Packet2cf a) +{ + return a.v; +} + +EIGEN_ALWAYS_INLINE Packet2d convertReal(Packet1cd a) +{ + return a.v; +} + +/** \internal convert packet to complex form */ +template +EIGEN_ALWAYS_INLINE T convertComplex(T a) +{ + return a; +} + +EIGEN_ALWAYS_INLINE Packet2cf convertComplex(Packet4f a) +{ + return Packet2cf(a); +} + +EIGEN_ALWAYS_INLINE Packet1cd convertComplex(Packet2d a) +{ + return Packet1cd(a); +} + +/** \internal load a vector from a complex location (for MMA version) */ +template +EIGEN_ALWAYS_INLINE void pload_complex_MMA(SLhsPacket& a) +{ + a = SLhsPacket(pload_complex(&a)); +} + +template +EIGEN_ALWAYS_INLINE void pload_complex_MMA(__vector_pair&) +{ + // Pass thru +} + +/** \internal perform a matrix multiply and accumulate (positive and negative) of packet a and packet b */ +template +EIGEN_ALWAYS_INLINE void 
pger_vecMMA(__vector_quad* acc, RhsPacket& a, LhsPacket& b) +{ + if (NegativeAccumulate) + { + __builtin_mma_xvf32gernp(acc, (__vector unsigned char)a, (__vector unsigned char)b); + } + else { + __builtin_mma_xvf32gerpp(acc, (__vector unsigned char)a, (__vector unsigned char)b); + } +} + +/** \internal perform a matrix multiply and accumulate (positive and negative) of vector_pair a and packet b */ +template +EIGEN_ALWAYS_INLINE void pger_vecMMA(__vector_quad* acc, __vector_pair& a, Packet2d& b) +{ + if (NegativeAccumulate) + { + __builtin_mma_xvf64gernp(acc, (__vector_pair)a, (__vector unsigned char)b); + } + else { + __builtin_mma_xvf64gerpp(acc, (__vector_pair)a, (__vector unsigned char)b); + } +} + +template +EIGEN_ALWAYS_INLINE void pger_vecMMA(__vector_quad*, __vector_pair&, Packet4f&) +{ + // Just for compilation +} + +/** \internal madd for complex times complex (MMA version) */ +template +EIGEN_ALWAYS_INLINE void pmadd_complex_complex_MMA(LhsPacket& a, RealPacket& b, __vector_quad* c) +{ + if (ConjugateLhs && ConjugateRhs) { + RealPacket b2 = pconj2(convertComplex(b)).v; + return pger_vecMMA(c, b2, a.v); + } + else if (Negate && !ConjugateLhs && ConjugateRhs) { + return pger_vecMMA(c, b, a.v); + } + else { + return pger_vecMMA(c, b, a.v); + } +} + +template +EIGEN_ALWAYS_INLINE void pmadd_complex_complex_MMA(__vector_pair& a, RealPacket& b, __vector_quad* c) +{ + if (ConjugateLhs && ConjugateRhs) { + RealPacket b2 = pconj2(convertComplex(b)).v; + return pger_vecMMA(c, a, b2); + } + else if (Negate && !ConjugateLhs && ConjugateRhs) { + return pger_vecMMA(c, a, b); + } + else { + return pger_vecMMA(c, a, b); + } +} + +/** \internal madd for complex times real (MMA version) */ +template +EIGEN_ALWAYS_INLINE void pmadd_complex_real_MMA(LhsPacket& a, RealPacket& b, __vector_quad* c) +{ + RealPacket a2 = convertReal(a); + if (Conjugate) { + RealPacket b2 = pconj2(convertComplex(b)).v; + if (StorageOrder == ColMajor) { + return pger_vecMMA(c, b2, a2); + } else { + return pger_vecMMA(c, a2, b2); + } + } + else { + if (StorageOrder == ColMajor) { + return pger_vecMMA(c, b, a2); + } else { + return pger_vecMMA(c, a2, b); + } + } +} + +/** \internal madd for real times complex (MMA version) */ +template +EIGEN_ALWAYS_INLINE void pmadd_complex_real_MMA(__vector_pair& a, RealPacket& b, __vector_quad* c) +{ + if (Conjugate) { + RealPacket b2 = pconj2(convertComplex(b)).v; + return pger_vecMMA(c, a, b2); + } + else { + return pger_vecMMA(c, a, b); + } +} + +/** \internal core multiply operation for vectors (MMA version) - complex times complex */ +template +EIGEN_ALWAYS_INLINE void gemv_mult_complex_complex_MMA(SLhsPacket& a0, RhsScalar* b, __vector_quad* c0) +{ + ScalarPacket b0; + if (StorageOrder == ColMajor) { + b0 = pload_realimag_combine(b); + } else { + b0 = pload_realimag_combine_row(b); + } + pmadd_complex_complex_MMA(a0, b0, c0); +} + +/** \internal core multiply operation for vectors (MMA version) - complex times real */ +template +EIGEN_ALWAYS_INLINE void gemv_mult_complex_real_MMA(SLhsPacket& a0, RhsScalar* b, __vector_quad* c0) +{ + pload_complex_MMA(a0); + ScalarPacket b0; + if (StorageOrder == ColMajor) { + b0 = pload_real(b); + } + else { + b0 = pload_real_row(b); + } + pmadd_complex_real_MMA(a0, b0, c0); +} + +/** \internal core multiply operation for vectors (MMA version) - real times complex */ +template +EIGEN_ALWAYS_INLINE void gemv_mult_real_complex_MMA(SLhsPacket& a0, RhsScalar* b, __vector_quad* c0) +{ + ScalarPacket b0; + if (StorageOrder == ColMajor) { + b0 = 
pload_complex_full(b); + } + else { + b0 = pload_complex_full_row(b); + } + pmadd_complex_real_MMA)) ? StorageOrder : ColMajor>(a0, b0, c0); +} + +#define GEMV_MULT_COMPLEX_COMPLEX_MMA(LhsType, RhsType) \ +template \ +EIGEN_ALWAYS_INLINE void gemv_mult_complex_MMA(LhsType& a0, RhsType* b, __vector_quad* c0) \ +{ \ + gemv_mult_complex_complex_MMA(a0, b, c0); \ +} + +GEMV_MULT_COMPLEX_COMPLEX_MMA(Packet2cf, std::complex) +GEMV_MULT_COMPLEX_COMPLEX_MMA(__vector_pair, std::complex) +GEMV_MULT_COMPLEX_COMPLEX_MMA(Packet1cd, std::complex) + +/** \internal core multiply operation for vectors (MMA version) - complex times complex */ +template +EIGEN_ALWAYS_INLINE void gemv_mult_complex_MMA(__vector_pair& a0, std::complex* b, __vector_quad* c0) +{ + if (sizeof(LhsScalar) == 16) { + gemv_mult_complex_complex_MMA(a0, b, c0); + } + else { + gemv_mult_real_complex_MMA(a0, b, c0); + } +} + +#define GEMV_MULT_REAL_COMPLEX_MMA(LhsType, RhsType) \ +template \ +EIGEN_ALWAYS_INLINE void gemv_mult_complex_MMA(LhsType& a0, RhsType* b, __vector_quad* c0) \ +{ \ + gemv_mult_real_complex_MMA(a0, b, c0); \ +} + +GEMV_MULT_REAL_COMPLEX_MMA(Packet4f, std::complex) +GEMV_MULT_REAL_COMPLEX_MMA(Packet2d, std::complex) + +#define GEMV_MULT_COMPLEX_REAL_MMA(LhsType, RhsType) \ +template \ +EIGEN_ALWAYS_INLINE void gemv_mult_complex_MMA(LhsType& a0, RhsType* b, __vector_quad* c0) \ +{ \ + gemv_mult_complex_real_MMA(a0, b, c0); \ +} + +GEMV_MULT_COMPLEX_REAL_MMA(Packet2cf, float) +GEMV_MULT_COMPLEX_REAL_MMA(Packet1cd, double) +GEMV_MULT_COMPLEX_REAL_MMA(__vector_pair, float) +GEMV_MULT_COMPLEX_REAL_MMA(__vector_pair, double) + +/** \internal disassemble MMA accumulator results into packets */ +template +EIGEN_ALWAYS_INLINE void disassembleResults2(__vector_quad* c0, PacketBlock& result0) +{ + __builtin_mma_disassemble_acc(&result0.packet, c0); + if (sizeof(LhsPacket) == 16) { + if (sizeof(RhsPacket) == 16) { + ScalarPacket tmp0, tmp2; + tmp2 = vec_mergeh(result0.packet[2], result0.packet[3]); + tmp0 = vec_mergeh(result0.packet[0], result0.packet[1]); + result0.packet[3] = vec_mergel(result0.packet[3], result0.packet[2]); + result0.packet[1] = vec_mergel(result0.packet[1], result0.packet[0]); + result0.packet[2] = tmp2; + result0.packet[0] = tmp0; + + if (ConjugateLhs) { + result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v; + result0.packet[2] = pconj2(convertComplex(result0.packet[2])).v; + } else if (ConjugateRhs) { + result0.packet[1] = pconj2(convertComplex(result0.packet[1])).v; + result0.packet[3] = pconj2(convertComplex(result0.packet[3])).v; + } else { + result0.packet[1] = pconjinv(convertComplex(result0.packet[1])).v; + result0.packet[3] = pconjinv(convertComplex(result0.packet[3])).v; + } + result0.packet[0] = vec_add(result0.packet[0], result0.packet[1]); + result0.packet[2] = vec_add(result0.packet[2], result0.packet[3]); + } else { + result0.packet[0][1] = result0.packet[1][1]; + result0.packet[2][1] = result0.packet[3][1]; + } + } +} + +template +EIGEN_ALWAYS_INLINE void disassembleResults4(__vector_quad* c0, PacketBlock& result0) +{ + __builtin_mma_disassemble_acc(&result0.packet, c0); + if (GEMV_IS_COMPLEX_COMPLEX) { + if (ConjugateLhs) { + result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v; + result0.packet[1] = pcplxflip2(convertComplex(result0.packet[1])).v; + } else { + if (ConjugateRhs) { + result0.packet[1] = pcplxconjflip(convertComplex(result0.packet[1])).v; + } else { + result0.packet[1] = pcplxflipconj(convertComplex(result0.packet[1])).v; + } + } + result0.packet[0] = 
vec_add(result0.packet[0], result0.packet[1]); + } else if (sizeof(LhsPacket) == sizeof(std::complex)) { + if (ConjugateLhs) { + result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v; + } + } else { + result0.packet[0] = vec_mergee(result0.packet[0], result0.packet[1]); + } +} + +template +EIGEN_ALWAYS_INLINE void disassembleResults(__vector_quad* c0, PacketBlock& result0) +{ + if (!GEMV_IS_COMPLEX_FLOAT) { + disassembleResults2(c0, result0); + } else { + disassembleResults4(c0, result0); + } +} +#endif + +#define GEMV_GETN_COMPLEX(N) (((N) * ResPacketSize) >> 1) + +#define GEMV_LOADPACKET_COL_COMPLEX(iter) \ + loadLhsPacket(lhs, i + ((iter) * ResPacketSize), j) + +#define GEMV_LOADPACKET_COL_COMPLEX_DATA(iter) \ + convertReal(GEMV_LOADPACKET_COL_COMPLEX(iter)) + +#ifdef USE_GEMV_MMA +#define GEMV_INIT_COL_COMPLEX_MMA(iter, N) \ + if (GEMV_GETN_COMPLEX(N) > iter) { \ + __builtin_mma_xxsetaccz(&e0##iter); \ + } + +#if EIGEN_COMP_LLVM +#define GEMV_LOADPAIR_COL_COMPLEX_MMA(iter1, iter2) \ + GEMV_BUILDPAIR_MMA(a##iter1, GEMV_LOADPACKET_COL_COMPLEX_DATA(iter2), GEMV_LOADPACKET_COL_COMPLEX_DATA((iter2) + 1)); \ + EIGEN_UNUSED_VARIABLE(f##iter1); +#else +#define GEMV_LOADPAIR_COL_COMPLEX_MMA(iter1, iter2) \ + if (sizeof(LhsPacket) == 16) { \ + const LhsScalar& src = lhs(i + ((32 * iter1) / sizeof(LhsScalar)), j); \ + a##iter1 = *reinterpret_cast<__vector_pair *>(const_cast(&src)); \ + EIGEN_UNUSED_VARIABLE(f##iter1); \ + } else { \ + f##iter1 = lhs.template load(i + ((iter2) * ResPacketSize), j); \ + GEMV_BUILDPAIR_MMA(a##iter1, vec_splat(convertReal(f##iter1), 0), vec_splat(convertReal(f##iter1), 1)); \ + } +#endif + +#define GEMV_LOAD1_COL_COMPLEX_MMA(iter, N) \ + if (GEMV_GETN_COMPLEX(N) > iter) { \ + if (GEMV_IS_COMPLEX_FLOAT) { \ + f##iter = GEMV_LOADPACKET_COL_COMPLEX(iter); \ + EIGEN_UNUSED_VARIABLE(a##iter); \ + } else { \ + GEMV_LOADPAIR_COL_COMPLEX_MMA(iter, iter << 1) \ + } \ + } else { \ + EIGEN_UNUSED_VARIABLE(a##iter); \ + EIGEN_UNUSED_VARIABLE(f##iter); \ + } + +#define GEMV_WORK1_COL_COMPLEX_MMA(iter, N) \ + if (GEMV_GETN_COMPLEX(N) > iter) { \ + if (GEMV_IS_COMPLEX_FLOAT) { \ + gemv_mult_complex_MMA(f##iter, b, &e0##iter); \ + } else { \ + gemv_mult_complex_MMA(a##iter, b, &e0##iter); \ + } \ + } + +#define GEMV_LOADPAIR2_COL_COMPLEX_MMA(iter1, iter2) \ + GEMV_BUILDPAIR_MMA(a##iter1, GEMV_LOADPACKET_COL_COMPLEX_DATA(iter2), GEMV_LOADPACKET_COL_COMPLEX_DATA((iter2) + 1)); + +#define GEMV_LOAD2_COL_COMPLEX_MMA(iter1, iter2, iter3, N) \ + if (GEMV_GETN_COMPLEX(N) > iter1) { \ + if (GEMV_IS_COMPLEX_FLOAT) { \ + GEMV_LOADPAIR2_COL_COMPLEX_MMA(iter2, iter2); \ + EIGEN_UNUSED_VARIABLE(a##iter3) \ + } else { \ + GEMV_LOADPAIR2_COL_COMPLEX_MMA(iter2, iter2 << 1); \ + GEMV_LOADPAIR2_COL_COMPLEX_MMA(iter3, iter3 << 1); \ + } \ + } else { \ + EIGEN_UNUSED_VARIABLE(a##iter2); \ + EIGEN_UNUSED_VARIABLE(a##iter3); \ + } \ + EIGEN_UNUSED_VARIABLE(f##iter2); \ + EIGEN_UNUSED_VARIABLE(f##iter3); + +#define GEMV_WORK2_COL_COMPLEX_MMA(iter1, iter2, iter3, N) \ + if (GEMV_GETN_COMPLEX(N) > iter1) { \ + if (GEMV_IS_COMPLEX_FLOAT) { \ + PLhsPacket g[2]; \ + __builtin_vsx_disassemble_pair(reinterpret_cast(g), &a##iter2); \ + gemv_mult_complex_MMA(g[0], b, &e0##iter2); \ + gemv_mult_complex_MMA(g[1], b, &e0##iter3); \ + } else { \ + gemv_mult_complex_MMA(a##iter2, b, &e0##iter2); \ + gemv_mult_complex_MMA(a##iter3, b, &e0##iter3); \ + } \ + } + +#if EIGEN_COMP_LLVM +#define GEMV_LOAD_COL_COMPLEX_MMA(N) \ + if (GEMV_GETN_COMPLEX(N) > 1) { \ + GEMV_UNROLL_HALF(GEMV_LOAD2_COL_COMPLEX_MMA, (N >> 
1)) \ + } else { \ + GEMV_UNROLL(GEMV_LOAD1_COL_COMPLEX_MMA, N) \ + } + +#define GEMV_WORK_COL_COMPLEX_MMA(N) \ + if (GEMV_GETN_COMPLEX(N) > 1) { \ + GEMV_UNROLL_HALF(GEMV_WORK2_COL_COMPLEX_MMA, (N >> 1)) \ + } else { \ + GEMV_UNROLL(GEMV_WORK1_COL_COMPLEX_MMA, N) \ + } +#else +#define GEMV_LOAD_COL_COMPLEX_MMA(N) \ + GEMV_UNROLL(GEMV_LOAD1_COL_COMPLEX_MMA, N) + +#define GEMV_WORK_COL_COMPLEX_MMA(N) \ + GEMV_UNROLL(GEMV_WORK1_COL_COMPLEX_MMA, N) +#endif + +#define GEMV_DISASSEMBLE_COMPLEX_MMA(iter) \ + disassembleResults(&e0##iter, result0##iter); + +#define GEMV_STORE_COL_COMPLEX_MMA(iter, N) \ + if (GEMV_GETN_COMPLEX(N) > iter) { \ + GEMV_DISASSEMBLE_COMPLEX_MMA(iter); \ + c0##iter = PResPacket(result0##iter.packet[0]); \ + if (GEMV_IS_COMPLEX_FLOAT) { \ + pstoreu_pmadd_complex(c0##iter, alpha_data, res + i + (iter * ResPacketSize)); \ + } else { \ + pstoreu_pmadd_complex(c0##iter, alpha_data, res + i + ((iter << 1) * ResPacketSize)); \ + c0##iter = PResPacket(result0##iter.packet[2]); \ + pstoreu_pmadd_complex(c0##iter, alpha_data, res + i + (((iter << 1) + 1) * ResPacketSize)); \ + } \ + } + +#define GEMV_STORE2_COL_COMPLEX_MMA(iter1, iter2, iter3, N) \ + if (GEMV_GETN_COMPLEX(N) > iter1) { \ + GEMV_DISASSEMBLE_COMPLEX_MMA(iter2); \ + GEMV_DISASSEMBLE_COMPLEX_MMA(iter3); \ + c0##iter2 = PResPacket(result0##iter2.packet[0]); \ + if (GEMV_IS_COMPLEX_FLOAT) { \ + c0##iter3 = PResPacket(result0##iter3.packet[0]); \ + pstoreu_pmadd_complex(c0##iter2, c0##iter3, alpha_data, res + i); \ + } else { \ + c0##iter3 = PResPacket(result0##iter2.packet[2]); \ + pstoreu_pmadd_complex(c0##iter2, c0##iter3, alpha_data, res + i); \ + c0##iter2 = PResPacket(result0##iter3.packet[0]); \ + c0##iter3 = PResPacket(result0##iter3.packet[2]); \ + pstoreu_pmadd_complex(c0##iter2, c0##iter3, alpha_data, res + i); \ + } \ + } + +#define GEMV_PROCESS_COL_COMPLEX_ONE_MMA(N) \ + GEMV_UNROLL(GEMV_INIT_COL_COMPLEX_MMA, N) \ + Index j = j2; \ + do { \ + const RhsScalar& b1 = rhs2(j, 0); \ + RhsScalar* b = const_cast(&b1); \ + GEMV_UNROLL(GEMV_PREFETCH, N) \ + GEMV_LOAD_COL_COMPLEX_MMA(N) \ + GEMV_WORK_COL_COMPLEX_MMA(N) \ + } while (++j < jend); \ + if (GEMV_GETN(N) <= 2) { \ + GEMV_UNROLL(GEMV_STORE_COL_COMPLEX_MMA, N) \ + } else { \ + GEMV_UNROLL_HALF(GEMV_STORE2_COL_COMPLEX_MMA, (N >> 1)) \ + } \ + i += (ResPacketSize * N); +#endif + +#define GEMV_INIT_COMPLEX(iter, N) \ + if (N > iter) { \ + c0##iter = pset_zero(); \ + c1##iter = pset_init(c1##iter); \ + } else { \ + EIGEN_UNUSED_VARIABLE(c0##iter); \ + EIGEN_UNUSED_VARIABLE(c1##iter); \ + } + +#define GEMV_WORK_COL_COMPLEX(iter, N) \ + if (N > iter) { \ + f##iter = GEMV_LOADPACKET_COL_COMPLEX(iter); \ + gemv_mult_complex(f##iter, b, c0##iter, c1##iter); \ + } else { \ + EIGEN_UNUSED_VARIABLE(f##iter); \ + } + +#define GEMV_STORE_COL_COMPLEX(iter, N) \ + if (N > iter) { \ + if (GEMV_IS_COMPLEX_COMPLEX) { \ + c0##iter = padd(c0##iter, c1##iter); \ + } \ + pstoreu_pmadd_complex(c0##iter, alpha_data, res + i + (iter * ResPacketSize)); \ + } + +/** \internal main macro for gemv_complex_col - initialize accumulators, multiply and add inputs, and store results */ +#define GEMV_PROCESS_COL_COMPLEX_ONE(N) \ + GEMV_UNROLL(GEMV_INIT_COMPLEX, N) \ + Index j = j2; \ + do { \ + const RhsScalar& b1 = rhs2(j, 0); \ + RhsScalar* b = const_cast(&b1); \ + GEMV_UNROLL(GEMV_PREFETCH, N) \ + GEMV_UNROLL(GEMV_WORK_COL_COMPLEX, N) \ + } while (++j < jend); \ + GEMV_UNROLL(GEMV_STORE_COL_COMPLEX, N) \ + i += (ResPacketSize * N); + +#if defined(USE_GEMV_MMA) && (EIGEN_COMP_LLVM || 
defined(USE_SLOWER_GEMV_MMA)) +#define USE_GEMV_COL_COMPLEX_MMA +#endif + +#ifdef USE_GEMV_COL_COMPLEX_MMA +#define GEMV_PROCESS_COL_COMPLEX(N) \ + GEMV_PROCESS_COL_COMPLEX_ONE_MMA(N) +#else +#if defined(USE_GEMV_MMA) && (__GNUC__ > 10) +#define GEMV_PROCESS_COL_COMPLEX(N) \ + if (sizeof(Scalar) != sizeof(LhsPacket)) { \ + GEMV_PROCESS_COL_COMPLEX_ONE_MMA(N) \ + } else { \ + GEMV_PROCESS_COL_COMPLEX_ONE(N) \ + } +#else +#define GEMV_PROCESS_COL_COMPLEX(N) \ + GEMV_PROCESS_COL_COMPLEX_ONE(N) +#endif +#endif + +template +EIGEN_STRONG_INLINE void gemv_complex_col( + Index rows, Index cols, + const LhsMapper& alhs, + const RhsMapper& rhs, + ResScalar* res, Index resIncr, + ResScalar alpha) +{ + typedef gemv_traits Traits; + + typedef typename Traits::LhsPacket LhsPacket; + typedef typename Traits::RhsPacket RhsPacket; + typedef typename Traits::ResPacket ResPacket; + + typedef typename packet_traits::type ScalarPacket; + typedef typename packet_traits::type PLhsPacket; + typedef typename packet_traits::type PResPacket; + typedef gemv_traits PTraits; + + EIGEN_UNUSED_VARIABLE(resIncr); + eigen_internal_assert(resIncr == 1); + + // The following copy tells the compiler that lhs's attributes are not modified outside this function + // This helps GCC to generate proper code. + LhsMapper lhs(alhs); + RhsMapper rhs2(rhs); + + conj_helper cj; + + const Index lhsStride = lhs.stride(); + // TODO: for padded aligned inputs, we could enable aligned reads + enum { + LhsAlignment = Unaligned, + ResPacketSize = PTraits::ResPacketSize, + LhsPacketSize = PTraits::LhsPacketSize, + RhsPacketSize = PTraits::RhsPacketSize, + }; +#ifdef EIGEN_POWER_USE_GEMV_PREFETCH + const Index prefetch_dist = 64 * LhsPacketSize; +#endif + +#ifndef GCC_ONE_VECTORPAIR_BUG + const Index n8 = rows - 8 * ResPacketSize + 1; + const Index n4 = rows - 4 * ResPacketSize + 1; + const Index n2 = rows - 2 * ResPacketSize + 1; +#endif + const Index n1 = rows - 1 * ResPacketSize + 1; + + // TODO: improve the following heuristic: + const Index block_cols = cols < 128 ? cols : (lhsStride * sizeof(LhsScalar) < 16000 ? 
16 : 8); + + typedef alpha_store AlphaData; + AlphaData alpha_data(alpha); + + for (Index j2 = 0; j2 < cols; j2 += block_cols) + { + Index jend = numext::mini(j2 + block_cols, cols); + Index i = 0; + PResPacket c00, c01, c02, c03, c04, c05, c06, c07; + ResPacket c10, c11, c12, c13, c14, c15, c16, c17; + PLhsPacket f0, f1, f2, f3, f4, f5, f6, f7; +#ifdef USE_GEMV_MMA + __vector_quad e00, e01, e02, e03, e04, e05, e06, e07; + __vector_pair a0, a1, a2, a3, a4, a5, a6, a7; + PacketBlock result00, result01, result02, result03, result04, result05, result06, result07; + GEMV_UNUSED(8, e0) + GEMV_UNUSED(8, result0) + GEMV_UNUSED(8, a) + GEMV_UNUSED(8, f) +#if !defined(GCC_ONE_VECTORPAIR_BUG) && defined(USE_GEMV_COL_COMPLEX_MMA) + if (GEMV_IS_COMPLEX_COMPLEX || !GEMV_IS_COMPLEX_FLOAT) +#endif +#endif +#ifndef GCC_ONE_VECTORPAIR_BUG + { + while (i < n8) + { + GEMV_PROCESS_COL_COMPLEX(8) + } + } + while (i < n4) + { + GEMV_PROCESS_COL_COMPLEX(4) + } + if (i < n2) + { + GEMV_PROCESS_COL_COMPLEX(2) + } + if (i < n1) +#else + while (i < n1) +#endif + { + GEMV_PROCESS_COL_COMPLEX_ONE(1) + } + for (;i < rows;++i) + { + ResScalar d0(0); + Index j = j2; + do { + d0 += cj.pmul(lhs(i, j), rhs2(j, 0)); + } while (++j < jend); + res[i] += alpha * d0; + } + } +} + +template struct ScalarBlock { + Scalar scalar[N]; +}; + +#ifdef USE_GEMV_MMA +static Packet16uc p16uc_ELEMENT_3 = { 0x0c,0x0d,0x0e,0x0f, 0x1c,0x1d,0x1e,0x1f, 0x0c,0x0d,0x0e,0x0f, 0x1c,0x1d,0x1e,0x1f }; + +/** \internal predux (add elements of a vector) from a MMA accumulator - real results */ +template +EIGEN_ALWAYS_INLINE ScalarBlock predux_real(__vector_quad* acc0, __vector_quad* acc1) +{ + PacketBlock result0, result1; + __builtin_mma_disassemble_acc(&result0.packet, acc0); + __builtin_mma_disassemble_acc(&result1.packet, acc1); + result0.packet[0] = vec_mergeh(result0.packet[0], result1.packet[0]); + result0.packet[1] = vec_mergeo(result0.packet[1], result1.packet[1]); + result0.packet[2] = vec_mergel(result0.packet[2], result1.packet[2]); + result0.packet[3] = vec_perm(result0.packet[3], result1.packet[3], p16uc_ELEMENT_3); + result0.packet[0] = vec_add(vec_add(result0.packet[0], result0.packet[2]), vec_add(result0.packet[1], result0.packet[3])); + return *reinterpret_cast *>(&result0.packet[0]); +} + +template<> +EIGEN_ALWAYS_INLINE ScalarBlock predux_real(__vector_quad* acc0, __vector_quad* acc1) +{ + PacketBlock result0, result1; + __builtin_mma_disassemble_acc(&result0.packet, acc0); + __builtin_mma_disassemble_acc(&result1.packet, acc1); + result0.packet[0] = vec_add(vec_mergeh(result0.packet[0], result1.packet[0]), vec_mergel(result0.packet[1], result1.packet[1])); + return *reinterpret_cast *>(&result0.packet[0]); +} + +/** \internal add complex results together */ +template +EIGEN_ALWAYS_INLINE ScalarBlock, 2> addComplexResults(PacketBlock& result0, PacketBlock& result1) +{ + ScalarBlock, 2> cc0; + result0.packet[0] = reinterpret_cast(vec_mergeh(reinterpret_cast(result0.packet[0]), reinterpret_cast(result1.packet[0]))); + result0.packet[2] = reinterpret_cast(vec_mergel(reinterpret_cast(result0.packet[2]), reinterpret_cast(result1.packet[2]))); + result0.packet[0] = vec_add(result0.packet[0], result0.packet[2]); + if (GEMV_IS_COMPLEX_COMPLEX) { + result0.packet[1] = reinterpret_cast(vec_mergeh(reinterpret_cast(result0.packet[1]), reinterpret_cast(result1.packet[1]))); + result0.packet[3] = reinterpret_cast(vec_mergel(reinterpret_cast(result0.packet[3]), reinterpret_cast(result1.packet[3]))); + result0.packet[1] = vec_add(result0.packet[1], 
result0.packet[3]); + if (ConjugateLhs) { + result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v; + result0.packet[1] = pcplxflip2(convertComplex(result0.packet[1])).v; + } else if (ConjugateRhs) { + result0.packet[1] = pcplxconjflip(convertComplex(result0.packet[1])).v; + } else { + result0.packet[1] = pcplxflipconj(convertComplex(result0.packet[1])).v; + } + result0.packet[0] = vec_add(result0.packet[0], result0.packet[1]); + } else { + if (ConjugateLhs && (sizeof(LhsPacket) == sizeof(std::complex))) { + result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v; + } + } + cc0.scalar[0].real(result0.packet[0][0]); + cc0.scalar[0].imag(result0.packet[0][1]); + cc0.scalar[1].real(result0.packet[0][2]); + cc0.scalar[1].imag(result0.packet[0][3]); + return cc0; +} + +template +EIGEN_ALWAYS_INLINE ScalarBlock, 2> addComplexResults(PacketBlock&, PacketBlock&) +{ + ScalarBlock, 2> cc0; + EIGEN_UNUSED_VARIABLE(cc0); + return cc0; // Just for compilation +} + +/** \internal predux (add elements of a vector) from a MMA accumulator - complex results */ +template +EIGEN_ALWAYS_INLINE ScalarBlock predux_complex(__vector_quad* acc0, __vector_quad* acc1) +{ + PacketBlock result0, result1; + __builtin_mma_disassemble_acc(&result0.packet, acc0); + __builtin_mma_disassemble_acc(&result1.packet, acc1); + return addComplexResults(result0, result1); +} + +template +EIGEN_ALWAYS_INLINE ScalarBlock predux_real(__vector_quad* acc0) +{ + PacketBlock result0; + __builtin_mma_disassemble_acc(&result0.packet, acc0); + result0.packet[0] = vec_add(vec_mergeh(result0.packet[0], result0.packet[2]), vec_mergel(result0.packet[1], result0.packet[3])); + return *reinterpret_cast *>(&result0.packet[0]); +} + +template +EIGEN_ALWAYS_INLINE ScalarBlock predux_complex(__vector_quad* acc0) +{ + ScalarBlock cc0; + PacketBlock result0; + __builtin_mma_disassemble_acc(&result0.packet, acc0); + if (GEMV_IS_COMPLEX_COMPLEX) { + if (ConjugateLhs) { + result0.packet[1] = pconjinv(convertComplex(result0.packet[1])).v; + result0.packet[3] = pconjinv(convertComplex(result0.packet[3])).v; + } else if (ConjugateRhs) { + result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v; + result0.packet[2] = pconj2(convertComplex(result0.packet[2])).v; + } else { + result0.packet[1] = pconj2(convertComplex(result0.packet[1])).v; + result0.packet[3] = pconj2(convertComplex(result0.packet[3])).v; + } + result0.packet[0] = vec_add(result0.packet[0], __builtin_vsx_xxpermdi(result0.packet[1], result0.packet[1], 2)); + result0.packet[2] = vec_add(result0.packet[2], __builtin_vsx_xxpermdi(result0.packet[3], result0.packet[3], 2)); + } else { + result0.packet[0] = __builtin_vsx_xxpermdi(result0.packet[0], result0.packet[1], 1); + result0.packet[2] = __builtin_vsx_xxpermdi(result0.packet[2], result0.packet[3], 1); + } + cc0.scalar[0].real(result0.packet[0][0]); + cc0.scalar[0].imag(result0.packet[0][1]); + cc0.scalar[1].real(result0.packet[2][0]); + cc0.scalar[1].imag(result0.packet[2][1]); + return cc0; +} +#endif + +template +EIGEN_ALWAYS_INLINE ScalarBlock predux_real(ResPacket& a, ResPacket& b) +{ + ScalarBlock cc0; + cc0.scalar[0] = predux(a); + cc0.scalar[1] = predux(b); + return cc0; +} + +template +EIGEN_ALWAYS_INLINE ScalarBlock predux_complex(ResPacket& a, ResPacket& b) +{ + return predux_real(a, b); +} + +#define GEMV_UNROLL_ROW(func, N) \ + func(0, N) func(1, N) func(2, N) func(3, N) func(4, N) func(5, N) func(6, N) func(7, N) + +#define GEMV_UNROLL_ROW_HALF(func, N) \ + func(0, 0, 1, N) func(1, 2, 3, N) func(2, 4, 5, N) 
func(3, 6, 7, N) + +#define GEMV_LOADPACKET_ROW(iter) \ + lhs.template load(i + (iter), j) + +#ifdef USE_GEMV_MMA +#define GEMV_UNROLL3_ROW(func, N, which) \ + func(0, N, which) func(1, N, which) func(2, N, which) func(3, N, which) \ + func(4, N, which) func(5, N, which) func(6, N, which) func(7, N, which) + +#define GEMV_UNUSED_ROW(N, which) \ + GEMV_UNROLL3_ROW(GEMV_UNUSED_VAR, N, which) + +#define GEMV_INIT_ROW(iter, N) \ + if (GEMV_GETN(N) > iter) { \ + __builtin_mma_xxsetaccz(&c##iter); \ + } + +#define GEMV_LOADPAIR_ROW(iter1, iter2) \ + GEMV_BUILDPAIR_MMA(b##iter1, GEMV_LOADPACKET_ROW(iter2), GEMV_LOADPACKET_ROW((iter2) + 1)); + +#define GEMV_WORK_ROW(iter, N) \ + if (GEMV_GETN(N) > iter) { \ + if (GEMV_IS_FLOAT) { \ + pger_vecMMA_acc(&c##iter, a0, GEMV_LOADPACKET_ROW(iter)); \ + } else { \ + __vector_pair b##iter; \ + GEMV_LOADPAIR_ROW(iter, iter << 1) \ + pger_vecMMA_acc(&c##iter, b##iter, a0); \ + } \ + } + +#define GEMV_PREDUX2(iter1, iter2, iter3, N) \ + if (N > iter1) { \ + if (GEMV_IS_FLOAT) { \ + cc##iter1 = predux_real(&c##iter2, &c##iter3); \ + } else { \ + cc##iter1 = predux_real(&c##iter1); \ + } \ + } else { \ + EIGEN_UNUSED_VARIABLE(cc##iter1); \ + } +#else +#define GEMV_INIT_ROW(iter, N) \ + if (N > iter) { \ + c##iter = pset1(ResScalar(0)); \ + } else { \ + EIGEN_UNUSED_VARIABLE(c##iter); \ + } + +#define GEMV_WORK_ROW(iter, N) \ + if (N > iter) { \ + c##iter = pcj.pmadd(GEMV_LOADPACKET_ROW(iter), a0, c##iter); \ + } + +#define GEMV_PREDUX2(iter1, iter2, iter3, N) \ + if (N > iter1) { \ + cc##iter1 = predux_real(c##iter2, c##iter3); \ + } else { \ + EIGEN_UNUSED_VARIABLE(cc##iter1); \ + } +#endif + +#define GEMV_MULT(iter1, iter2, iter3, N) \ + if (N > iter1) { \ + cc##iter1.scalar[0] += cj.pmul(lhs(i + iter2, j), a0); \ + cc##iter1.scalar[1] += cj.pmul(lhs(i + iter3, j), a0); \ + } + +#define GEMV_STORE_ROW(iter1, iter2, iter3, N) \ + if (N > iter1) { \ + storeMaddData(res + ((i + iter2) * resIncr), alpha, cc##iter1.scalar[0]); \ + storeMaddData(res + ((i + iter3) * resIncr), alpha, cc##iter1.scalar[1]); \ + } + +/** \internal main macro for gemv_row - initialize accumulators, multiply and add inputs, predux and store results */ +#define GEMV_PROCESS_ROW(N) \ + for (; i < n##N; i += N) { \ + GEMV_UNROLL_ROW(GEMV_INIT_ROW, N) \ + Index j = 0; \ + for (; j + LhsPacketSize <= cols; j += LhsPacketSize) { \ + RhsPacket a0 = rhs2.template load(j); \ + GEMV_UNROLL_ROW(GEMV_WORK_ROW, N) \ + } \ + GEMV_UNROLL_ROW_HALF(GEMV_PREDUX2, (N >> 1)) \ + for (; j < cols; ++j) { \ + RhsScalar a0 = rhs2(j); \ + GEMV_UNROLL_ROW_HALF(GEMV_MULT, (N >> 1)) \ + } \ + GEMV_UNROLL_ROW_HALF(GEMV_STORE_ROW, (N >> 1)) \ + } + +template +EIGEN_STRONG_INLINE void gemv_row( + Index rows, Index cols, + const LhsMapper& alhs, + const RhsMapper& rhs, + ResScalar* res, Index resIncr, + ResScalar alpha) +{ + typedef gemv_traits Traits; + + typedef typename Traits::LhsPacket LhsPacket; + typedef typename Traits::RhsPacket RhsPacket; + typedef typename Traits::ResPacket ResPacket; + + // The following copy tells the compiler that lhs's attributes are not modified outside this function + // This helps GCC to generate proper code. + LhsMapper lhs(alhs); + typename RhsMapper::LinearMapper rhs2 = rhs.getLinearMapper(0, 0); + + eigen_internal_assert(rhs.stride() == 1); + conj_helper cj; + conj_helper pcj; + + // TODO: fine tune the following heuristic. The rationale is that if the matrix is very large, + // processing 8 rows at once might be counter productive wrt cache. 
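[Editorial sketch, not part of the patch] The n8/n4/n2 bounds computed just below drive GEMV_PROCESS_ROW(8/4/2); in scalar form the blocked row traversal they describe looks as follows. This is an illustration only: the function name and the use of std::ptrdiff_t in place of Eigen's Index are assumptions made for the sketch.

#include <cstddef>

// Illustrative only: nN = rows - (N - 1), so "i < nN" holds exactly while a
// full block of N rows is still available; the final loop handles the tail.
void blocked_row_traversal(std::ptrdiff_t rows) {
  std::ptrdiff_t i = 0;
  for (; i < rows - 7; i += 8) { /* accumulate 8 row dot products at once */ }
  for (; i < rows - 3; i += 4) { /* accumulate 4 row dot products at once */ }
  for (; i < rows - 1; i += 2) { /* accumulate 2 row dot products at once */ }
  for (; i < rows; ++i)        { /* scalar tail, one row at a time */ }
}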
+#ifndef GCC_ONE_VECTORPAIR_BUG + const Index n8 = lhs.stride() * sizeof(LhsScalar) > 32000 ? (rows - 7) : (rows - 7); + const Index n4 = rows - 3; + const Index n2 = rows - 1; +#endif + + // TODO: for padded aligned inputs, we could enable aligned reads + enum { + LhsAlignment = Unaligned, + ResPacketSize = Traits::ResPacketSize, + LhsPacketSize = Traits::LhsPacketSize, + RhsPacketSize = Traits::RhsPacketSize, + }; + + Index i = 0; +#ifdef USE_GEMV_MMA + __vector_quad c0, c1, c2, c3, c4, c5, c6, c7; + GEMV_UNUSED_ROW(8, c) +#else + ResPacket c0, c1, c2, c3, c4, c5, c6, c7; +#endif +#ifndef GCC_ONE_VECTORPAIR_BUG + ScalarBlock cc0, cc1, cc2, cc3; + GEMV_PROCESS_ROW(8) + GEMV_PROCESS_ROW(4) + GEMV_PROCESS_ROW(2) +#endif + for (; i < rows; ++i) + { + ResPacket d0 = pset1(ResScalar(0)); + Index j = 0; + for (; j + LhsPacketSize <= cols; j += LhsPacketSize) + { + RhsPacket b0 = rhs2.template load(j); + + d0 = pcj.pmadd(lhs.template load(i + 0, j), b0, d0); + } + ResScalar dd0 = predux(d0); + for (; j < cols; ++j) + { + dd0 += cj.pmul(lhs(i, j), rhs2(j)); + } + res[i * resIncr] += alpha * dd0; + } +} + +#define EIGEN_POWER_GEMV_REAL_SPECIALIZE_COL(Scalar) \ +template \ +struct general_matrix_vector_product \ +{ \ + typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; \ +\ + EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run( \ + Index rows, Index cols, \ + const LhsMapper& lhs, \ + const RhsMapper& rhs, \ + ResScalar* res, Index resIncr, \ + ResScalar alpha) { \ + gemv_col(rows, cols, lhs, rhs, res, resIncr, alpha); \ + } \ +}; + +#define EIGEN_POWER_GEMV_REAL_SPECIALIZE_ROW(Scalar) \ +template \ +struct general_matrix_vector_product \ +{ \ + typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; \ +\ + EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run( \ + Index rows, Index cols, \ + const LhsMapper& lhs, \ + const RhsMapper& rhs, \ + ResScalar* res, Index resIncr, \ + ResScalar alpha) { \ + gemv_row(rows, cols, lhs, rhs, res, resIncr, alpha); \ + } \ +}; + +EIGEN_POWER_GEMV_REAL_SPECIALIZE_COL(float) +EIGEN_POWER_GEMV_REAL_SPECIALIZE_COL(double) +EIGEN_POWER_GEMV_REAL_SPECIALIZE_ROW(float) +EIGEN_POWER_GEMV_REAL_SPECIALIZE_ROW(double) + +template +EIGEN_ALWAYS_INLINE ScalarBlock predux_complex(PResPacket& a0, PResPacket& b0, ResPacket& a1, ResPacket& b1) +{ + if (GEMV_IS_COMPLEX_COMPLEX) { + a0 = padd(a0, a1); + b0 = padd(b0, b1); + } + return predux_complex(a0, b0); +} + +#define GEMV_LOADPACKET_ROW_COMPLEX(iter) \ + loadLhsPacket(lhs, i + (iter), j) + +#define GEMV_LOADPACKET_ROW_COMPLEX_DATA(iter) \ + convertReal(GEMV_LOADPACKET_ROW_COMPLEX(iter)) + +#define GEMV_PROCESS_ROW_COMPLEX_SINGLE_WORK(which, N) \ + j = 0; \ + for (; j + LhsPacketSize <= cols; j += LhsPacketSize) { \ + const RhsScalar& b1 = rhs2(j); \ + RhsScalar* b = const_cast(&b1); \ + GEMV_UNROLL_ROW(which, N) \ + } + +#define GEMV_PROCESS_END_ROW_COMPLEX(N) \ + for (; j < cols; ++j) { \ + RhsScalar b0 = rhs2(j); \ + GEMV_UNROLL_ROW_HALF(GEMV_MULT_COMPLEX, (N >> 1)) \ + } \ + GEMV_UNROLL_ROW_HALF(GEMV_STORE_ROW_COMPLEX, (N >> 1)) + +#ifdef USE_GEMV_MMA +#define GEMV_INIT_ROW_COMPLEX_MMA(iter, N) \ + if (GEMV_GETN_COMPLEX(N) > iter) { \ + __builtin_mma_xxsetaccz(&e0##iter); \ + } + +#define GEMV_LOADPAIR_ROW_COMPLEX_MMA(iter1, iter2) \ + GEMV_BUILDPAIR_MMA(a##iter1, GEMV_LOADPACKET_ROW_COMPLEX_DATA(iter2), GEMV_LOADPACKET_ROW_COMPLEX_DATA((iter2) + 1)); + +#define GEMV_WORK_ROW_COMPLEX_MMA(iter, N) \ + if (GEMV_GETN_COMPLEX(N) > iter) { \ + if (GEMV_IS_COMPLEX_FLOAT) { \ + PLhsPacket a##iter = 
GEMV_LOADPACKET_ROW_COMPLEX(iter); \ + gemv_mult_complex_MMA(a##iter, b, &e0##iter); \ + } else { \ + __vector_pair a##iter; \ + GEMV_LOADPAIR_ROW_COMPLEX_MMA(iter, iter << 1) \ + gemv_mult_complex_MMA(a##iter, b, &e0##iter); \ + } \ + } + +#define GEMV_PREDUX4_COMPLEX_MMA(iter1, iter2, iter3, N) \ + if (N > iter1) { \ + if (GEMV_IS_COMPLEX_FLOAT) { \ + cc##iter1 = predux_complex(&e0##iter2, &e0##iter3); \ + } else { \ + cc##iter1 = predux_complex(&e0##iter1); \ + } \ + } else { \ + EIGEN_UNUSED_VARIABLE(cc##iter1); \ + } + +#define GEMV_PROCESS_ROW_COMPLEX_SINGLE_MMA(N) \ + GEMV_UNROLL_ROW(GEMV_INIT_ROW_COMPLEX_MMA, N) \ + GEMV_PROCESS_ROW_COMPLEX_SINGLE_WORK(GEMV_WORK_ROW_COMPLEX_MMA, N) + +#define GEMV_PROCESS_ROW_COMPLEX_ONE_MMA(N) \ + for (; i < n##N; i += N) { \ + GEMV_PROCESS_ROW_COMPLEX_SINGLE_MMA(N) \ + GEMV_UNROLL_ROW_HALF(GEMV_PREDUX4_COMPLEX_MMA, (N >> 1)) \ + GEMV_PROCESS_END_ROW_COMPLEX(N); \ + } +#endif + +#define GEMV_WORK_ROW_COMPLEX(iter, N) \ + if (N > iter) { \ + PLhsPacket a##iter = GEMV_LOADPACKET_ROW_COMPLEX(iter); \ + gemv_mult_complex(a##iter, b, c0##iter, c1##iter); \ + } + +#define GEMV_PREDUX4_COMPLEX(iter1, iter2, iter3, N) \ + if (N > iter1) { \ + cc##iter1 = predux_complex(c0##iter2, c0##iter3, c1##iter2, c1##iter3); \ + } else { \ + EIGEN_UNUSED_VARIABLE(cc##iter1); \ + } + +#define GEMV_MULT_COMPLEX(iter1, iter2, iter3, N) \ + if (N > iter1) { \ + cc##iter1.scalar[0] += cj.pmul(lhs(i + iter2, j), b0); \ + cc##iter1.scalar[1] += cj.pmul(lhs(i + iter3, j), b0); \ + } + +#define GEMV_STORE_ROW_COMPLEX(iter1, iter2, iter3, N) \ + if (N > iter1) { \ + storeMaddData(res + ((i + iter2) * resIncr), alpha, cc##iter1.scalar[0]); \ + storeMaddData(res + ((i + iter3) * resIncr), alpha, cc##iter1.scalar[1]); \ + } + +#define GEMV_PROCESS_ROW_COMPLEX_SINGLE_NEW(N) \ + GEMV_UNROLL_ROW(GEMV_INIT_COMPLEX, N) \ + GEMV_PROCESS_ROW_COMPLEX_SINGLE_WORK(GEMV_WORK_ROW_COMPLEX, N) + +/** \internal main macro for gemv_complex_row - initialize accumulators, multiply and add inputs, predux and store results */ +#define GEMV_PROCESS_ROW_COMPLEX_ONE_NEW(N) \ + for (; i < n##N; i += N) { \ + GEMV_PROCESS_ROW_COMPLEX_SINGLE_NEW(N) \ + GEMV_UNROLL_ROW_HALF(GEMV_PREDUX4_COMPLEX, (N >> 1)) \ + GEMV_PROCESS_END_ROW_COMPLEX(N); \ + } + +#define GEMV_PROCESS_ROW_COMPLEX_PREDUX_NEW(iter) \ + if (GEMV_IS_COMPLEX_COMPLEX) { \ + c0##iter = padd(c0##iter, c1##iter); \ + } \ + dd0 = predux(c0##iter); + +#if EIGEN_COMP_LLVM +#define GEMV_PROCESS_ROW_COMPLEX_SINGLE(N) \ + GEMV_PROCESS_ROW_COMPLEX_SINGLE_NEW(N) + +#define GEMV_PROCESS_ROW_COMPLEX_ONE(N) \ + GEMV_PROCESS_ROW_COMPLEX_ONE_NEW(N) + +#define GEMV_PROCESS_ROW_COMPLEX_PREDUX(iter) \ + GEMV_PROCESS_ROW_COMPLEX_PREDUX_NEW(iter) +#else +// gcc seems to be reading and writing registers unnecessarily to memory. +// Use the old way for complex double until it is fixed. 
+ +#define GEMV_LOADPACKET_ROW_COMPLEX_OLD(iter) \ + lhs.template load(i + (iter), j) + +#define GEMV_INIT_COMPLEX_OLD(iter, N) \ + EIGEN_UNUSED_VARIABLE(c0##iter); \ + if (N > iter) { \ + c1##iter = pset_zero(); \ + } else { \ + EIGEN_UNUSED_VARIABLE(c1##iter); \ + } + +#define GEMV_WORK_ROW_COMPLEX_OLD(iter, N) \ + if (N > iter) { \ + LhsPacket a##iter = GEMV_LOADPACKET_ROW_COMPLEX_OLD(iter); \ + c1##iter = pcj.pmadd(a##iter, b0, c1##iter); \ + } + +#define GEMV_PREDUX4_COMPLEX_OLD(iter1, iter2, iter3, N) \ + if (N > iter1) { \ + cc##iter1.scalar[0] = predux(c1##iter2); \ + cc##iter1.scalar[1] = predux(c1##iter3); \ + } else { \ + EIGEN_UNUSED_VARIABLE(cc##iter1); \ + } + +#define GEMV_PROCESS_ROW_COMPLEX_SINGLE_OLD(N) \ + GEMV_UNROLL_ROW(GEMV_INIT_COMPLEX_OLD, N) \ + j = 0; \ + for (; j + LhsPacketSize <= cols; j += LhsPacketSize) { \ + RhsPacket b0 = rhs2.template load(j); \ + GEMV_UNROLL_ROW(GEMV_WORK_ROW_COMPLEX_OLD, N) \ + } + +#define GEMV_PROCESS_ROW_COMPLEX_ONE_OLD(N) \ + for (; i < n##N; i += N) { \ + GEMV_PROCESS_ROW_COMPLEX_SINGLE_OLD(N) \ + GEMV_UNROLL_ROW_HALF(GEMV_PREDUX4_COMPLEX_OLD, (N >> 1)) \ + GEMV_PROCESS_END_ROW_COMPLEX(N) \ + } + +#define GEMV_PROCESS_ROW_COMPLEX_PREDUX_OLD(iter) \ + dd0 = predux(c1##iter); + +#if (__GNUC__ > 10) +#define GEMV_PROCESS_ROW_COMPLEX_IS_NEW 1 +#else +#define GEMV_PROCESS_ROW_COMPLEX_IS_NEW \ + (sizeof(Scalar) == sizeof(float)) || GEMV_IS_COMPLEX_COMPLEX +#endif + +#define GEMV_PROCESS_ROW_COMPLEX_SINGLE(N) \ + if (GEMV_PROCESS_ROW_COMPLEX_IS_NEW) { \ + GEMV_PROCESS_ROW_COMPLEX_SINGLE_NEW(N) \ + } else { \ + GEMV_PROCESS_ROW_COMPLEX_SINGLE_OLD(N) \ + } + +#define GEMV_PROCESS_ROW_COMPLEX_ONE(N) \ + if (GEMV_PROCESS_ROW_COMPLEX_IS_NEW) { \ + GEMV_PROCESS_ROW_COMPLEX_ONE_NEW(N) \ + } else { \ + GEMV_PROCESS_ROW_COMPLEX_ONE_OLD(N) \ + } + +#define GEMV_PROCESS_ROW_COMPLEX_PREDUX(iter) \ + if (GEMV_PROCESS_ROW_COMPLEX_IS_NEW) { \ + GEMV_PROCESS_ROW_COMPLEX_PREDUX_NEW(iter) \ + } else { \ + GEMV_PROCESS_ROW_COMPLEX_PREDUX_OLD(iter) \ + } +#endif + +#ifdef USE_GEMV_MMA +#define GEMV_PROCESS_ROW_COMPLEX(N) \ + GEMV_PROCESS_ROW_COMPLEX_ONE_MMA(N) +#else +#define GEMV_PROCESS_ROW_COMPLEX(N) \ + GEMV_PROCESS_ROW_COMPLEX_ONE(N) +#endif + +template +EIGEN_STRONG_INLINE void gemv_complex_row( + Index rows, Index cols, + const LhsMapper& alhs, + const RhsMapper& rhs, + ResScalar* res, Index resIncr, + ResScalar alpha) +{ + typedef gemv_traits Traits; + + typedef typename Traits::LhsPacket LhsPacket; + typedef typename Traits::RhsPacket RhsPacket; + typedef typename Traits::ResPacket ResPacket; + + typedef typename packet_traits::type ScalarPacket; + typedef typename packet_traits::type PLhsPacket; + typedef typename packet_traits::type PResPacket; + typedef gemv_traits PTraits; + + // The following copy tells the compiler that lhs's attributes are not modified outside this function + // This helps GCC to generate proper code. + LhsMapper lhs(alhs); + typename RhsMapper::LinearMapper rhs2 = rhs.getLinearMapper(0, 0); + + eigen_internal_assert(rhs.stride() == 1); + conj_helper cj; +#if !EIGEN_COMP_LLVM + conj_helper pcj; +#endif + + // TODO: fine tune the following heuristic. The rationale is that if the matrix is very large, + // processing 8 rows at once might be counter productive wrt cache. +#ifndef GCC_ONE_VECTORPAIR_BUG + const Index n8 = lhs.stride() * sizeof(LhsScalar) > 32000 ? 
(rows - 7) : (rows - 7); + const Index n4 = rows - 3; + const Index n2 = rows - 1; +#endif + + // TODO: for padded aligned inputs, we could enable aligned reads + enum { + LhsAlignment = Unaligned, + ResPacketSize = PTraits::ResPacketSize, + LhsPacketSize = PTraits::LhsPacketSize, + RhsPacketSize = PTraits::RhsPacketSize, + }; + + Index i = 0, j; + PResPacket c00, c01, c02, c03, c04, c05, c06, c07; + ResPacket c10, c11, c12, c13, c14, c15, c16, c17; +#ifdef USE_GEMV_MMA + __vector_quad e00, e01, e02, e03, e04, e05, e06, e07; + GEMV_UNUSED_ROW(8, e0) + GEMV_UNUSED_EXTRA(1, c0) + GEMV_UNUSED_EXTRA(1, c1) +#endif + ResScalar dd0; +#ifndef GCC_ONE_VECTORPAIR_BUG + ScalarBlock cc0, cc1, cc2, cc3; +#ifdef USE_GEMV_MMA + if (!GEMV_IS_COMPLEX_COMPLEX) +#endif + { + GEMV_PROCESS_ROW_COMPLEX(8) + } + GEMV_PROCESS_ROW_COMPLEX(4) + GEMV_PROCESS_ROW_COMPLEX(2) +#endif + for (; i < rows; ++i) + { + GEMV_PROCESS_ROW_COMPLEX_SINGLE(1) + GEMV_PROCESS_ROW_COMPLEX_PREDUX(0) + for (; j < cols; ++j) + { + dd0 += cj.pmul(lhs(i, j), rhs2(j)); + } + res[i * resIncr] += alpha * dd0; + } +} + +#define EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(Scalar, LhsScalar, RhsScalar) \ +template \ +struct general_matrix_vector_product \ +{ \ + typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; \ +\ + EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run( \ + Index rows, Index cols, \ + const LhsMapper& lhs, \ + const RhsMapper& rhs, \ + ResScalar* res, Index resIncr, \ + ResScalar alpha) { \ + gemv_complex_col(rows, cols, lhs, rhs, res, resIncr, alpha); \ + } \ +}; + +#define EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(Scalar, LhsScalar, RhsScalar) \ +template \ +struct general_matrix_vector_product \ +{ \ + typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; \ +\ + EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run( \ + Index rows, Index cols, \ + const LhsMapper& lhs, \ + const RhsMapper& rhs, \ + ResScalar* res, Index resIncr, \ + ResScalar alpha) { \ + gemv_complex_row(rows, cols, lhs, rhs, res, resIncr, alpha); \ + } \ +}; + +EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(float, float, std::complex) +EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(float, std::complex, float) +EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(float, std::complex, std::complex) +EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(double, double, std::complex) +EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(double, std::complex, double) +EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(double, std::complex, std::complex) +EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(float, float, std::complex) +EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(float, std::complex, float) +EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(float, std::complex, std::complex) +EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(double, double, std::complex) +EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(double, std::complex, double) +EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(double, std::complex, std::complex) + +#endif // EIGEN_MATRIX_VECTOR_PRODUCT_ALTIVEC_H + diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index 2a44054..5c1abe7 100644 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -84,7 +84,7 @@ static _EIGEN_DECLARE_CONST_FAST_Packet4ui(PREV0DOT5, 0x3EFFFFFFu); static _EIGEN_DECLARE_CONST_FAST_Packet8us(ONE,1); //{ 1, 1, 1, 1, 1, 1, 1, 1} static _EIGEN_DECLARE_CONST_FAST_Packet16uc(ONE,1); static Packet4f p4f_MZERO = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1); //{ 0x80000000, 0x80000000, 0x80000000, 0x80000000} -#ifndef 
__VSX__ +#ifndef EIGEN_VECTORIZE_VSX static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); //{ 1.0, 1.0, 1.0, 1.0} #endif @@ -114,7 +114,7 @@ static Packet16uc p16uc_QUADRUPLICATE16_HI = { 0,1,0,1,0,1,0,1, 2,3,2,3,2,3,2,3 // Define global static constants: #ifdef _BIG_ENDIAN static Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0); -#ifdef __VSX__ +#ifdef EIGEN_VECTORIZE_VSX static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; #endif static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 }; @@ -168,13 +168,16 @@ struct packet_traits : default_packet_traits { HasCos = EIGEN_FAST_MATH, HasLog = 1, HasExp = 1, -#ifdef __VSX__ +#ifdef EIGEN_VECTORIZE_VSX HasSqrt = 1, #if !EIGEN_COMP_CLANG HasRsqrt = 1, #else HasRsqrt = 0, #endif + HasTanh = EIGEN_FAST_MATH, + HasErf = EIGEN_FAST_MATH, + HasRint = 1, #else HasSqrt = 0, HasRsqrt = 0, @@ -184,7 +187,6 @@ struct packet_traits : default_packet_traits { HasRound = 1, HasFloor = 1, HasCeil = 1, - HasRint = 1, HasNegate = 1, HasBlend = 1 }; @@ -210,23 +212,24 @@ struct packet_traits : default_packet_traits { HasCos = EIGEN_FAST_MATH, HasLog = 1, HasExp = 1, -#ifdef __VSX__ +#ifdef EIGEN_VECTORIZE_VSX HasSqrt = 1, #if !EIGEN_COMP_CLANG HasRsqrt = 1, #else HasRsqrt = 0, #endif + HasRint = 1, #else HasSqrt = 0, HasRsqrt = 0, - HasTanh = EIGEN_FAST_MATH, - HasErf = EIGEN_FAST_MATH, + HasRint = 0, #endif + HasTanh = 0, + HasErf = 0, HasRound = 1, HasFloor = 1, HasCeil = 1, - HasRint = 1, HasNegate = 1, HasBlend = 1 }; @@ -432,7 +435,7 @@ EIGEN_STRONG_INLINE Packet pload_common(const __UNPACK_TYPE__(Packet)* from) // ignoring these warnings for now. EIGEN_UNUSED_VARIABLE(from); EIGEN_DEBUG_ALIGNED_LOAD -#ifdef __VSX__ +#ifdef EIGEN_VECTORIZE_VSX return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from)); #else return vec_ld(0, from); @@ -481,7 +484,7 @@ EIGEN_STRONG_INLINE void pstore_common(__UNPACK_TYPE__(Packet)* to, const Packet // ignoring these warnings for now. 
EIGEN_UNUSED_VARIABLE(to); EIGEN_DEBUG_ALIGNED_STORE -#ifdef __VSX__ +#ifdef EIGEN_VECTORIZE_VSX vec_xst(from, 0, to); #else vec_st(from, 0, to); @@ -786,8 +789,22 @@ template<> EIGEN_STRONG_INLINE Packet8us psub (const Packet8us& a, template<> EIGEN_STRONG_INLINE Packet16c psub (const Packet16c& a, const Packet16c& b) { return a - b; } template<> EIGEN_STRONG_INLINE Packet16uc psub(const Packet16uc& a, const Packet16uc& b) { return a - b; } -template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return p4f_ZERO - a; } -template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return p4i_ZERO - a; } +template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) +{ +#ifdef __POWER8_VECTOR__ + return vec_neg(a); +#else + return vec_xor(a, p4f_MZERO); +#endif +} +template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) +{ +#ifdef __POWER8_VECTOR__ + return vec_neg(a); +#else + return p4i_ZERO - a; +#endif +} template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; } template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; } @@ -802,7 +819,7 @@ template<> EIGEN_STRONG_INLINE Packet16uc pmul(const Packet16uc& a, template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) { -#ifndef __VSX__ // VSX actually provides a div instruction +#ifndef EIGEN_VECTORIZE_VSX // VSX actually provides a div instruction Packet4f t, y_0, y_1; // Altivec does not offer a divide instruction, we have to do a reciprocal approximation @@ -831,7 +848,7 @@ template<> EIGEN_STRONG_INLINE Packet8us pmadd(const Packet8us& a, const Packet8 template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) { - #ifdef __VSX__ + #ifdef EIGEN_VECTORIZE_VSX // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN Packet4f ret; __asm__ ("xvcmpgesp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b)); @@ -849,7 +866,7 @@ template<> EIGEN_STRONG_INLINE Packet16uc pmin(const Packet16uc& a, template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) { - #ifdef __VSX__ + #ifdef EIGEN_VECTORIZE_VSX // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN Packet4f ret; __asm__ ("xvcmpgtsp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b)); @@ -865,26 +882,39 @@ template<> EIGEN_STRONG_INLINE Packet16c pmax(const Packet16c& a, con template<> EIGEN_STRONG_INLINE Packet16uc pmax(const Packet16uc& a, const Packet16uc& b) { return vec_max(a, b); } template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) { return reinterpret_cast(vec_cmple(a,b)); } +// To fix bug with vec_cmplt on older versions +#if defined(__POWER8_VECTOR__) || EIGEN_COMP_LLVM template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) { return reinterpret_cast(vec_cmplt(a,b)); } +#endif template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return reinterpret_cast(vec_cmpeq(a,b)); } template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { Packet4f c = reinterpret_cast(vec_cmpge(a,b)); return vec_nor(c,c); } +#ifdef __VSX__ template<> EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) { return reinterpret_cast(vec_cmple(a,b)); } +#endif template<> EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) { return reinterpret_cast(vec_cmplt(a,b)); } template<> 
EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return reinterpret_cast(vec_cmpeq(a,b)); } +#ifdef __VSX__ template<> EIGEN_STRONG_INLINE Packet8s pcmp_le(const Packet8s& a, const Packet8s& b) { return reinterpret_cast(vec_cmple(a,b)); } +#endif template<> EIGEN_STRONG_INLINE Packet8s pcmp_lt(const Packet8s& a, const Packet8s& b) { return reinterpret_cast(vec_cmplt(a,b)); } template<> EIGEN_STRONG_INLINE Packet8s pcmp_eq(const Packet8s& a, const Packet8s& b) { return reinterpret_cast(vec_cmpeq(a,b)); } +#ifdef __VSX__ template<> EIGEN_STRONG_INLINE Packet8us pcmp_le(const Packet8us& a, const Packet8us& b) { return reinterpret_cast(vec_cmple(a,b)); } +#endif template<> EIGEN_STRONG_INLINE Packet8us pcmp_lt(const Packet8us& a, const Packet8us& b) { return reinterpret_cast(vec_cmplt(a,b)); } template<> EIGEN_STRONG_INLINE Packet8us pcmp_eq(const Packet8us& a, const Packet8us& b) { return reinterpret_cast(vec_cmpeq(a,b)); } +#ifdef __VSX__ template<> EIGEN_STRONG_INLINE Packet16c pcmp_le(const Packet16c& a, const Packet16c& b) { return reinterpret_cast(vec_cmple(a,b)); } +#endif template<> EIGEN_STRONG_INLINE Packet16c pcmp_lt(const Packet16c& a, const Packet16c& b) { return reinterpret_cast(vec_cmplt(a,b)); } template<> EIGEN_STRONG_INLINE Packet16c pcmp_eq(const Packet16c& a, const Packet16c& b) { return reinterpret_cast(vec_cmpeq(a,b)); } +#ifdef __VSX__ template<> EIGEN_STRONG_INLINE Packet16uc pcmp_le(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast(vec_cmple(a,b)); } +#endif template<> EIGEN_STRONG_INLINE Packet16uc pcmp_lt(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast(vec_cmplt(a,b)); } template<> EIGEN_STRONG_INLINE Packet16uc pcmp_eq(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast(vec_cmpeq(a,b)); } @@ -923,7 +953,7 @@ template<> EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) Packet4f t = vec_add(reinterpret_cast(vec_or(vec_and(reinterpret_cast(a), p4ui_SIGN), p4ui_PREV0DOT5)), a); Packet4f res; -#ifdef __VSX__ +#ifdef EIGEN_VECTORIZE_VSX __asm__("xvrspiz %x0, %x1\n\t" : "=&wa" (res) : "wa" (t)); @@ -937,6 +967,7 @@ template<> EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) } template<> EIGEN_STRONG_INLINE Packet4f pceil(const Packet4f& a) { return vec_ceil(a); } template<> EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) { return vec_floor(a); } +#ifdef __VSX__ template<> EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) { Packet4f res; @@ -947,21 +978,19 @@ template<> EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) return res; } +#endif template EIGEN_STRONG_INLINE Packet ploadu_common(const __UNPACK_TYPE__(Packet)* from) { - EIGEN_DEBUG_ALIGNED_LOAD -#ifdef _BIG_ENDIAN - Packet16uc MSQ, LSQ; - Packet16uc mask; - MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword - LSQ = vec_ld(15, (unsigned char *)from); // least significant quadword - mask = vec_lvsl(0, from); // create the permute mask - //TODO: Add static_cast here - return (Packet) vec_perm(MSQ, LSQ, mask); // align the data -#else EIGEN_DEBUG_UNALIGNED_LOAD +#ifdef EIGEN_VECTORIZE_VSX return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from)); +#else + Packet16uc mask = vec_lvsl(0, from); // create the permute mask + Packet16uc MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword + Packet16uc LSQ = vec_ld(15, (unsigned char *)from); // least significant quadword + //TODO: Add static_cast here + return (Packet) vec_perm(MSQ, LSQ, mask); // align the data #endif } @@ -1066,7 
+1095,9 @@ template<> EIGEN_STRONG_INLINE Packet16uc ploaddup(const unsigned ch template EIGEN_STRONG_INLINE void pstoreu_common(__UNPACK_TYPE__(Packet)* to, const Packet& from) { EIGEN_DEBUG_UNALIGNED_STORE -#ifdef _BIG_ENDIAN +#ifdef EIGEN_VECTORIZE_VSX + vec_xst(from, 0, to); +#else // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html // Warning: not thread safe! Packet16uc MSQ, LSQ, edges; @@ -1081,8 +1112,6 @@ template EIGEN_STRONG_INLINE void pstoreu_common(__UNPACK_TYPE_ LSQ = vec_perm((Packet16uc)from,edges,align); // misalign the data (LSQ) vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ part first vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part second -#else - vec_xst(from, 0, to); #endif } template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) @@ -1341,16 +1370,6 @@ template<> EIGEN_STRONG_INLINE Packet8bf psub(const Packet8bf& a, con BF16_TO_F32_BINARY_OP_WRAPPER(psub, a, b); } -template<> EIGEN_STRONG_INLINE Packet8bf psqrt (const Packet8bf& a){ - BF16_TO_F32_UNARY_OP_WRAPPER(vec_sqrt, a); -} -template<> EIGEN_STRONG_INLINE Packet8bf prsqrt (const Packet8bf& a){ - BF16_TO_F32_UNARY_OP_WRAPPER(prsqrt, a); -} -template<> EIGEN_STRONG_INLINE Packet8bf pexp (const Packet8bf& a){ - BF16_TO_F32_UNARY_OP_WRAPPER(pexp_float, a); -} - template<> EIGEN_STRONG_INLINE Packet4f pldexp(const Packet4f& a, const Packet4f& exponent) { return pldexp_generic(a,exponent); } @@ -1390,9 +1409,11 @@ template<> EIGEN_STRONG_INLINE Packet8bf pceil (const Packet8bf& a){ template<> EIGEN_STRONG_INLINE Packet8bf pround (const Packet8bf& a){ BF16_TO_F32_UNARY_OP_WRAPPER(pround, a); } +#ifdef __VSX__ template<> EIGEN_STRONG_INLINE Packet8bf print (const Packet8bf& a){ BF16_TO_F32_UNARY_OP_WRAPPER(print, a); } +#endif template<> EIGEN_STRONG_INLINE Packet8bf pmadd(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) { Packet4f a_even = Bf16ToF32Even(a); Packet4f a_odd = Bf16ToF32Odd(a); @@ -2252,7 +2273,7 @@ template<> EIGEN_STRONG_INLINE Packet4f preinterpret(const Pa //---------- double ---------- -#ifdef __VSX__ +#ifdef EIGEN_VECTORIZE_VSX typedef __vector double Packet2d; typedef __vector unsigned long long Packet2ul; typedef __vector long long Packet2l; @@ -2304,7 +2325,11 @@ template<> struct packet_traits : default_packet_traits HasLog = 0, HasExp = 1, HasSqrt = 1, +#if !EIGEN_COMP_CLANG HasRsqrt = 1, +#else + HasRsqrt = 0, +#endif HasRound = 1, HasFloor = 1, HasCeil = 1, @@ -2393,7 +2418,14 @@ template<> EIGEN_STRONG_INLINE Packet2d padd(const Packet2d& a, const template<> EIGEN_STRONG_INLINE Packet2d psub(const Packet2d& a, const Packet2d& b) { return a - b; } -template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return p2d_ZERO - a; } +template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) +{ +#ifdef __POWER8_VECTOR__ + return vec_neg(a); +#else + return vec_xor(a, p2d_MZERO); +#endif +} template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; } @@ -2703,7 +2735,7 @@ template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, cons } -#endif // __VSX__ +#endif // EIGEN_VECTORIZE_VSX } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/CUDA/Complex.h b/Eigen/src/Core/arch/CUDA/Complex.h index deb4c86..45f6ddb 100644 --- a/Eigen/src/Core/arch/CUDA/Complex.h +++ b/Eigen/src/Core/arch/CUDA/Complex.h @@ -11,13 +11,24 @@ #ifndef EIGEN_COMPLEX_CUDA_H #define EIGEN_COMPLEX_CUDA_H -// clang-format off // Many std::complex methods such as 
operator+, operator-, operator* and // operator/ are not constexpr. Due to this, GCC and older versions of clang do // not treat them as device functions and thus Eigen functors making use of // these operators fail to compile. Here, we manually specialize these // operators and functors for complex types when building for CUDA to enable // their use on-device. +// +// NOTES: +// - Compound assignment operators +=,-=,*=,/=(Scalar) will not work on device, +// since they are already specialized in the standard. Using them will result +// in silent kernel failures. +// - Compiling with MSVC and using +=,-=,*=,/=(std::complex) will lead +// to duplicate definition errors, since these are already specialized in +// Visual Studio's header (contrary to the standard). This is +// preferable to removing such definitions, which will lead to silent kernel +// failures. +// - Compiling with ICC requires defining _USE_COMPLEX_SPECIALIZATION_ prior +// to the first inclusion of . #if defined(EIGEN_CUDACC) && defined(EIGEN_GPU_COMPILE_PHASE) diff --git a/Eigen/src/Core/arch/Default/BFloat16.h b/Eigen/src/Core/arch/Default/BFloat16.h index 1c28f4f..f21d1a0 100644 --- a/Eigen/src/Core/arch/Default/BFloat16.h +++ b/Eigen/src/Core/arch/Default/BFloat16.h @@ -251,12 +251,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw truncate_to_bfloat16(const output.value = std::signbit(v) ? 0xFFC0: 0x7FC0; return output; } - const uint16_t* p = reinterpret_cast(&v); -#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - output.value = p[0]; -#else - output.value = p[1]; -#endif + output.value = static_cast(numext::bit_cast(v) >> 16); return output; } @@ -462,14 +457,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne(&result); -#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - q[0] = h.value; -#else - q[1] = h.value; -#endif - return result; + return numext::bit_cast(static_cast(h.value) << 16); } // --- standard functions --- diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h index c9fbaf6..a76ea0f 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h @@ -642,10 +642,10 @@ Packet psincos_float(const Packet& _x) PacketI y_int = preinterpret(y_round); // last 23 digits represent integer (if abs(x)<2^24) y = psub(y_round, cst_rounding_magic); // nearest integer to x*4/pi - // Reduce x by y octants to get: -Pi/4 <= x <= +Pi/4 + // Subtract y * Pi/2 to reduce x to the interval -Pi/4 <= x <= +Pi/4 // using "Extended precision modular arithmetic" - #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) - // This version requires true FMA for high accuracy + #if defined(EIGEN_VECTORIZE_FMA) + // This version requires true FMA for high accuracy. // It provides a max error of 1ULP up to (with absolute_error < 5.9605e-08): const float huge_th = ComputeSine ? 117435.992f : 71476.0625f; x = pmadd(y, pset1(-1.57079601287841796875f), x); @@ -757,6 +757,26 @@ Packet pcos_float(const Packet& x) return psincos_float(x); } +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED Packet pdiv_complex(const Packet& x, const Packet& y) { + typedef typename unpacket_traits::as_real RealPacket; + // In the following we annotate the code for the case where the inputs + // are a pair length-2 SIMD vectors representing a single pair of complex + // numbers x = a + i*b, y = c + i*d. 
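[Editorial sketch, not part of the patch] The annotated packet code that follows is an overflow-safe complex division: scale the denominator by max(|c|, |d|) before forming |y|^2, divide, then undo the scaling. For reference, here is the same computation for a single std::complex<float> pair; the helper name scaled_complex_div is made up for this illustration and is not an Eigen API.

#include <algorithm>
#include <cmath>
#include <complex>

// x = a + i*b, y = c + i*d; returns x / y with the denominator pre-scaled by
// m = max(|c|, |d|) so that c*c + d*d cannot overflow or underflow prematurely.
std::complex<float> scaled_complex_div(std::complex<float> x, std::complex<float> y) {
  const float a = x.real(), b = x.imag();
  const float c = y.real(), d = y.imag();
  const float m = std::max(std::abs(c), std::abs(d));  // y_max
  const float cs = c / m, ds = d / m;                  // y_scaled
  const float denom = cs * cs + ds * ds;               // scaled |y|^2
  // x * conj(y_scaled), divided by denom, then rescaled by m.
  return std::complex<float>((a * cs + b * ds) / denom / m,
                             (b * cs - a * ds) / denom / m);
}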
+ const RealPacket y_abs = pabs(y.v); // |c|, |d| + const RealPacket y_abs_flip = pcplxflip(Packet(y_abs)).v; // |d|, |c| + const RealPacket y_max = pmax(y_abs, y_abs_flip); // max(|c|, |d|), max(|c|, |d|) + const RealPacket y_scaled = pdiv(y.v, y_max); // c / max(|c|, |d|), d / max(|c|, |d|) + // Compute scaled denominator. + const RealPacket y_scaled_sq = pmul(y_scaled, y_scaled); // c'**2, d'**2 + const RealPacket denom = padd(y_scaled_sq, pcplxflip(Packet(y_scaled_sq)).v); + Packet result_scaled = pmul(x, pconj(Packet(y_scaled))); // a * c' + b * d', -a * d + b * c + // Divide elementwise by denom. + result_scaled = Packet(pdiv(result_scaled.v, denom)); + // Rescale result + return Packet(pdiv(result_scaled.v, y_max)); +} template EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS @@ -895,7 +915,7 @@ void fast_twosum(const Packet& x, const Packet& y, Packet& s_hi, Packet& s_lo) { s_lo = psub(y, t); } -#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#ifdef EIGEN_VECTORIZE_FMA // This function implements the extended precision product of // a pair of floating point numbers. Given {x, y}, it computes the pair // {p_hi, p_lo} such that x * y = p_hi + p_lo holds exactly and @@ -946,7 +966,7 @@ void twoprod(const Packet& x, const Packet& y, p_lo = pmadd(x_lo, y_lo, p_lo); } -#endif // EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#endif // EIGEN_VECTORIZE_FMA // This function implements Dekker's algorithm for the addition @@ -1443,39 +1463,40 @@ EIGEN_STRONG_INLINE Packet generic_pow_impl(const Packet& x, const Packet& y) { } // Generic implementation of pow(x,y). -template -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -EIGEN_UNUSED -Packet generic_pow(const Packet& x, const Packet& y) { +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_pow(const Packet& x, const Packet& y) { typedef typename unpacket_traits::type Scalar; const Packet cst_pos_inf = pset1(NumTraits::infinity()); + const Packet cst_neg_inf = pset1(-NumTraits::infinity()); const Packet cst_zero = pset1(Scalar(0)); const Packet cst_one = pset1(Scalar(1)); const Packet cst_nan = pset1(NumTraits::quiet_NaN()); const Packet abs_x = pabs(x); // Predicates for sign and magnitude of x. - const Packet x_is_zero = pcmp_eq(x, cst_zero); - const Packet x_is_neg = pcmp_lt(x, cst_zero); + const Packet abs_x_is_zero = pcmp_eq(abs_x, cst_zero); + const Packet x_has_signbit = pcmp_eq(por(pand(x, cst_neg_inf), cst_pos_inf), cst_neg_inf); + const Packet x_is_neg = pandnot(x_has_signbit, abs_x_is_zero); + const Packet x_is_neg_zero = pand(x_has_signbit, abs_x_is_zero); const Packet abs_x_is_inf = pcmp_eq(abs_x, cst_pos_inf); - const Packet abs_x_is_one = pcmp_eq(abs_x, cst_one); + const Packet abs_x_is_one = pcmp_eq(abs_x, cst_one); const Packet abs_x_is_gt_one = pcmp_lt(cst_one, abs_x); const Packet abs_x_is_lt_one = pcmp_lt(abs_x, cst_one); - const Packet x_is_one = pandnot(abs_x_is_one, x_is_neg); - const Packet x_is_neg_one = pand(abs_x_is_one, x_is_neg); + const Packet x_is_one = pandnot(abs_x_is_one, x_is_neg); + const Packet x_is_neg_one = pand(abs_x_is_one, x_is_neg); const Packet x_is_nan = pandnot(ptrue(x), pcmp_eq(x, x)); // Predicates for sign and magnitude of y. 
+ const Packet abs_y = pabs(y); const Packet y_is_one = pcmp_eq(y, cst_one); - const Packet y_is_zero = pcmp_eq(y, cst_zero); + const Packet abs_y_is_zero = pcmp_eq(abs_y, cst_zero); const Packet y_is_neg = pcmp_lt(y, cst_zero); - const Packet y_is_pos = pandnot(ptrue(y), por(y_is_zero, y_is_neg)); + const Packet y_is_pos = pandnot(ptrue(y), por(abs_y_is_zero, y_is_neg)); const Packet y_is_nan = pandnot(ptrue(y), pcmp_eq(y, y)); - const Packet abs_y_is_inf = pcmp_eq(pabs(y), cst_pos_inf); + const Packet abs_y_is_inf = pcmp_eq(abs_y, cst_pos_inf); EIGEN_CONSTEXPR Scalar huge_exponent = - (NumTraits::max_exponent() * Scalar(EIGEN_LN2)) / - NumTraits::epsilon(); + (NumTraits::max_exponent() * Scalar(EIGEN_LN2)) / NumTraits::epsilon(); const Packet abs_y_is_huge = pcmp_le(pset1(huge_exponent), pabs(y)); // Predicates for whether y is integer and/or even. @@ -1484,39 +1505,31 @@ Packet generic_pow(const Packet& x, const Packet& y) { const Packet y_is_even = pcmp_eq(pround(y_div_2), y_div_2); // Predicates encoding special cases for the value of pow(x,y) - const Packet invalid_negative_x = pandnot(pandnot(pandnot(x_is_neg, abs_x_is_inf), - y_is_int), - abs_y_is_inf); - const Packet pow_is_one = por(por(x_is_one, y_is_zero), - pand(x_is_neg_one, - por(abs_y_is_inf, pandnot(y_is_even, invalid_negative_x)))); + const Packet invalid_negative_x = pandnot(pandnot(pandnot(x_is_neg, abs_x_is_inf), y_is_int), abs_y_is_inf); const Packet pow_is_nan = por(invalid_negative_x, por(x_is_nan, y_is_nan)); - const Packet pow_is_zero = por(por(por(pand(x_is_zero, y_is_pos), - pand(abs_x_is_inf, y_is_neg)), - pand(pand(abs_x_is_lt_one, abs_y_is_huge), - y_is_pos)), - pand(pand(abs_x_is_gt_one, abs_y_is_huge), - y_is_neg)); - const Packet pow_is_inf = por(por(por(pand(x_is_zero, y_is_neg), - pand(abs_x_is_inf, y_is_pos)), - pand(pand(abs_x_is_lt_one, abs_y_is_huge), - y_is_neg)), - pand(pand(abs_x_is_gt_one, abs_y_is_huge), - y_is_pos)); + const Packet pow_is_one = + por(por(x_is_one, abs_y_is_zero), pand(x_is_neg_one, por(abs_y_is_inf, pandnot(y_is_even, invalid_negative_x)))); + const Packet pow_is_zero = por(por(por(pand(abs_x_is_zero, y_is_pos), pand(abs_x_is_inf, y_is_neg)), + pand(pand(abs_x_is_lt_one, abs_y_is_huge), y_is_pos)), + pand(pand(abs_x_is_gt_one, abs_y_is_huge), y_is_neg)); + const Packet pow_is_inf = por(por(por(pand(abs_x_is_zero, y_is_neg), pand(abs_x_is_inf, y_is_pos)), + pand(pand(abs_x_is_lt_one, abs_y_is_huge), y_is_neg)), + pand(pand(abs_x_is_gt_one, abs_y_is_huge), y_is_pos)); + const Packet inf_val = + pselect(pandnot(pand(por(pand(abs_x_is_inf, x_is_neg), pand(x_is_neg_zero, y_is_neg)), y_is_int), y_is_even), + cst_neg_inf, cst_pos_inf); // General computation of pow(x,y) for positive x or negative x and integer y. 
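[Editorial note, not part of the patch] The predicate masks built above (pow_is_one, pow_is_nan, pow_is_zero, pow_is_inf, inf_val) reproduce the C99/IEEE-754 special values of pow, including the signed-zero and negative-infinity cases that the new x_has_signbit/x_is_neg_zero logic distinguishes, before the general computation below handles ordinary inputs. A few of the values being encoded, as a small standalone check against std::pow (illustrative only):

#include <cassert>
#include <cmath>
#include <limits>

int main() {
  const double inf = std::numeric_limits<double>::infinity();
  assert(std::pow(-1.0, inf) == 1.0);       // |x| == 1, |y| == inf       -> 1 (pow_is_one)
  assert(std::pow(0.0, -3.0) == inf);       // +0 to negative y           -> +inf (pow_is_inf)
  assert(std::pow(-0.0, -3.0) == -inf);     // -0 to negative odd int y   -> -inf (inf_val path)
  assert(std::pow(-0.0, -2.0) == inf);      // -0 to negative even y      -> +inf
  assert(std::isnan(std::pow(-2.0, 0.5)));  // negative x, non-integer y  -> NaN (pow_is_nan)
  return 0;
}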
const Packet negate_pow_abs = pandnot(x_is_neg, y_is_even); const Packet pow_abs = generic_pow_impl(abs_x, y); - return pselect(y_is_one, x, - pselect(pow_is_one, cst_one, - pselect(pow_is_nan, cst_nan, - pselect(pow_is_inf, cst_pos_inf, - pselect(pow_is_zero, cst_zero, - pselect(negate_pow_abs, pnegate(pow_abs), pow_abs)))))); + return pselect( + y_is_one, x, + pselect(pow_is_one, cst_one, + pselect(pow_is_nan, cst_nan, + pselect(pow_is_inf, inf_val, + pselect(pow_is_zero, cst_zero, pselect(negate_pow_abs, pnegate(pow_abs), pow_abs)))))); } - - /* polevl (modified for Eigen) * * Evaluate polynomial diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h index 177a04e..730cc73 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h @@ -101,6 +101,12 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet psqrt_complex(const Packet& a); +/** \internal \returns x / y for complex types */ +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +EIGEN_UNUSED +Packet pdiv_complex(const Packet& x, const Packet& y); + template struct ppolevl; diff --git a/Eigen/src/Core/arch/Default/Half.h b/Eigen/src/Core/arch/Default/Half.h index 9f8e8cc..6e2b31f 100644 --- a/Eigen/src/Core/arch/Default/Half.h +++ b/Eigen/src/Core/arch/Default/Half.h @@ -36,8 +36,6 @@ #ifndef EIGEN_HALF_H #define EIGEN_HALF_H -#include - #if defined(EIGEN_HAS_GPU_FP16) || defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) // When compiling with GPU support, the "__half_raw" base class as well as // some other routines are defined in the GPU compiler header files @@ -334,7 +332,7 @@ EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) { } #endif -#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) +#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) && !defined(EIGEN_GPU_COMPILE_PHASE) EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) { return half(vaddh_f16(a.x, b.x)); } @@ -534,7 +532,12 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) { #elif defined(EIGEN_HAS_FP16_C) __half_raw h; - h.x = _cvtss_sh(ff, 0); + #if EIGEN_COMP_MSVC + // MSVC does not have scalar instructions. + h.x =_mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(ff), 0), 0); + #else + h.x = _cvtss_sh(ff, 0); + #endif return h; #elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) @@ -595,7 +598,12 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) { (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) return __half2float(h); #elif defined(EIGEN_HAS_FP16_C) - return _cvtsh_ss(h.x); + #if EIGEN_COMP_MSVC + // MSVC does not have scalar instructions. + return _mm_cvtss_f32(_mm_cvtph_ps(_mm_set1_epi16(h.x))); + #else + return _cvtsh_ss(h.x); + #endif #elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) return static_cast(h.x); #else diff --git a/Eigen/src/Core/arch/GPU/PacketMath.h b/Eigen/src/Core/arch/GPU/PacketMath.h index 689110d..bfc11ef 100644 --- a/Eigen/src/Core/arch/GPU/PacketMath.h +++ b/Eigen/src/Core/arch/GPU/PacketMath.h @@ -121,7 +121,6 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1(const do // invoked by NVCC’ (e.g. on MacOS). The former needs to see both host and device implementation // of the functions, while the latter can only deal with one of them. 
#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) -namespace { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_and(const float& a, const float& b) { @@ -180,8 +179,6 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double lt_mask(const double& a, return __longlong_as_double(a < b ? 0xffffffffffffffffull : 0ull); } -} // namespace - template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pand(const float4& a, const float4& b) { @@ -493,9 +490,10 @@ ptranspose(PacketBlock& kernel) { #endif // defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU) -// Packet4h2 must be defined in the macro without EIGEN_CUDA_ARCH, meaning -// its corresponding packet_traits must be visible on host. -#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16) +// Half-packet functions are not available on the host for CUDA 9.0-9.2, only +// on device. There is no benefit to using them on the host anyways, since they are +// emulated. +#if (defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)) && defined(EIGEN_GPU_COMPILE_PHASE) typedef ulonglong2 Packet4h2; template<> struct unpacket_traits { typedef Eigen::half type; enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4h2 half; }; @@ -526,42 +524,9 @@ template<> struct packet_traits : default_packet_traits }; }; -namespace { -// This is equivalent to make_half2, which is undocumented and doesn't seem to always exist. -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 combine_half(const __half& a, const __half& b) { -#if defined(EIGEN_GPU_COMPILE_PHASE) - return __halves2half2(a, b); -#else - // Round-about way since __halves2half2 is a __device__ function. - return __floats2half2_rn(__half2float(a), __half2float(b)); -#endif -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE __half get_half2_low(const half2& a) { -#if defined(EIGEN_GPU_COMPILE_PHASE) - return __low2half(a); -#else - return __float2half(__low2float(a)); -#endif -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE __half get_half2_high(const half2& a) { -#if defined(EIGEN_GPU_COMPILE_PHASE) - return __high2half(a); -#else - return __float2half(__high2float(a)); -#endif -} -} // namespace - template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1(const Eigen::half& from) { -#if defined(EIGEN_GPU_COMPILE_PHASE) return __half2half2(from); -#else - const float f = __half2float(from); - return __floats2half2_rn(f, f); -#endif } template <> @@ -576,8 +541,6 @@ pset1(const Eigen::half& from) { return r; } -// We now need this visible on both host and device. 
-// #if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) namespace { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload(const Eigen::half* from) { @@ -585,11 +548,11 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload(const Eigen::half* from) { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu(const Eigen::half* from) { - return combine_half(from[0], from[1]); + return __halves2half2(from[0], from[1]); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploaddup(const Eigen::half* from) { - return combine_half(from[0], from[0]); + return __halves2half2(from[0], from[0]); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(Eigen::half* to, @@ -599,8 +562,8 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(Eigen::half* to, EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const half2& from) { - to[0] = get_half2_low(from); - to[1] = get_half2_high(from); + to[0] = __low2half(from); + to[1] = __high2half(from); } @@ -610,7 +573,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_aligned( // Input is guaranteed to be properly aligned. return __ldg(reinterpret_cast(from)); #else - return combine_half(*(from+0), *(from+1)); + return __halves2half2(*(from+0), *(from+1)); #endif } @@ -619,31 +582,31 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_unaligned( #if defined(EIGEN_GPU_HAS_LDG) return __halves2half2(__ldg(from+0), __ldg(from+1)); #else - return combine_half(*(from+0), *(from+1)); + return __halves2half2(*(from+0), *(from+1)); #endif } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pgather(const Eigen::half* from, Index stride) { - return combine_half(from[0*stride], from[1*stride]); + return __halves2half2(from[0*stride], from[1*stride]); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter( Eigen::half* to, const half2& from, Index stride) { - to[stride*0] = get_half2_low(from); - to[stride*1] = get_half2_high(from); + to[stride*0] = __low2half(from); + to[stride*1] = __high2half(from); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst(const half2& a) { - return get_half2_low(a); + return __low2half(a); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs(const half2& a) { - half a1 = get_half2_low(a); - half a2 = get_half2_high(a); + half a1 = __low2half(a); + half a2 = __high2half(a); half result1 = half_impl::raw_uint16_to_half(a1.x & 0x7FFF); half result2 = half_impl::raw_uint16_to_half(a2.x & 0x7FFF); - return combine_half(result1, result2); + return __halves2half2(result1, result2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ptrue(const half2& /*a*/) { @@ -658,12 +621,12 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pzero(const half2& /*a*/) { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { - __half a1 = get_half2_low(kernel.packet[0]); - __half a2 = get_half2_high(kernel.packet[0]); - __half b1 = get_half2_low(kernel.packet[1]); - __half b2 = get_half2_high(kernel.packet[1]); - kernel.packet[0] = combine_half(a1, b1); - kernel.packet[1] = combine_half(a2, b2); + __half a1 = __low2half(kernel.packet[0]); + __half a2 = __high2half(kernel.packet[0]); + __half b1 = __low2half(kernel.packet[1]); + __half b2 = __high2half(kernel.packet[1]); + kernel.packet[0] = __halves2half2(a1, b1); + kernel.packet[1] = __halves2half2(a2, b2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) { @@ -671,88 +634,88 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) { return __halves2half2(a, __hadd(a, 
__float2half(1.0f))); #else float f = __half2float(a) + 1.0f; - return combine_half(a, __float2half(f)); + return __halves2half2(a, __float2half(f)); #endif } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pselect(const half2& mask, const half2& a, const half2& b) { - half mask_low = get_half2_low(mask); - half mask_high = get_half2_high(mask); - half result_low = mask_low == half(0) ? get_half2_low(b) : get_half2_low(a); - half result_high = mask_high == half(0) ? get_half2_high(b) : get_half2_high(a); - return combine_half(result_low, result_high); + half mask_low = __low2half(mask); + half mask_high = __high2half(mask); + half result_low = mask_low == half(0) ? __low2half(b) : __low2half(a); + half result_high = mask_high == half(0) ? __high2half(b) : __high2half(a); + return __halves2half2(result_low, result_high); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_eq(const half2& a, const half2& b) { half true_half = half_impl::raw_uint16_to_half(0xffffu); half false_half = half_impl::raw_uint16_to_half(0x0000u); - half a1 = get_half2_low(a); - half a2 = get_half2_high(a); - half b1 = get_half2_low(b); - half b2 = get_half2_high(b); + half a1 = __low2half(a); + half a2 = __high2half(a); + half b1 = __low2half(b); + half b2 = __high2half(b); half eq1 = __half2float(a1) == __half2float(b1) ? true_half : false_half; half eq2 = __half2float(a2) == __half2float(b2) ? true_half : false_half; - return combine_half(eq1, eq2); + return __halves2half2(eq1, eq2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_lt(const half2& a, const half2& b) { half true_half = half_impl::raw_uint16_to_half(0xffffu); half false_half = half_impl::raw_uint16_to_half(0x0000u); - half a1 = get_half2_low(a); - half a2 = get_half2_high(a); - half b1 = get_half2_low(b); - half b2 = get_half2_high(b); + half a1 = __low2half(a); + half a2 = __high2half(a); + half b1 = __low2half(b); + half b2 = __high2half(b); half eq1 = __half2float(a1) < __half2float(b1) ? true_half : false_half; half eq2 = __half2float(a2) < __half2float(b2) ? 
true_half : false_half; - return combine_half(eq1, eq2); + return __halves2half2(eq1, eq2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pand(const half2& a, const half2& b) { - half a1 = get_half2_low(a); - half a2 = get_half2_high(a); - half b1 = get_half2_low(b); - half b2 = get_half2_high(b); + half a1 = __low2half(a); + half a2 = __high2half(a); + half b1 = __low2half(b); + half b2 = __high2half(b); half result1 = half_impl::raw_uint16_to_half(a1.x & b1.x); half result2 = half_impl::raw_uint16_to_half(a2.x & b2.x); - return combine_half(result1, result2); + return __halves2half2(result1, result2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 por(const half2& a, const half2& b) { - half a1 = get_half2_low(a); - half a2 = get_half2_high(a); - half b1 = get_half2_low(b); - half b2 = get_half2_high(b); + half a1 = __low2half(a); + half a2 = __high2half(a); + half b1 = __low2half(b); + half b2 = __high2half(b); half result1 = half_impl::raw_uint16_to_half(a1.x | b1.x); half result2 = half_impl::raw_uint16_to_half(a2.x | b2.x); - return combine_half(result1, result2); + return __halves2half2(result1, result2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pxor(const half2& a, const half2& b) { - half a1 = get_half2_low(a); - half a2 = get_half2_high(a); - half b1 = get_half2_low(b); - half b2 = get_half2_high(b); + half a1 = __low2half(a); + half a2 = __high2half(a); + half b1 = __low2half(b); + half b2 = __high2half(b); half result1 = half_impl::raw_uint16_to_half(a1.x ^ b1.x); half result2 = half_impl::raw_uint16_to_half(a2.x ^ b2.x); - return combine_half(result1, result2); + return __halves2half2(result1, result2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pandnot(const half2& a, const half2& b) { - half a1 = get_half2_low(a); - half a2 = get_half2_high(a); - half b1 = get_half2_low(b); - half b2 = get_half2_high(b); + half a1 = __low2half(a); + half a2 = __high2half(a); + half b1 = __low2half(b); + half b2 = __high2half(b); half result1 = half_impl::raw_uint16_to_half(a1.x & ~b1.x); half result2 = half_impl::raw_uint16_to_half(a2.x & ~b2.x); - return combine_half(result1, result2); + return __halves2half2(result1, result2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a, @@ -851,9 +814,9 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2& a, float a2 = __high2float(a); float b1 = __low2float(b); float b2 = __high2float(b); - __half r1 = a1 < b1 ? get_half2_low(a) : get_half2_low(b); - __half r2 = a2 < b2 ? get_half2_high(a) : get_half2_high(b); - return combine_half(r1, r2); + __half r1 = a1 < b1 ? __low2half(a) : __low2half(b); + __half r2 = a2 < b2 ? __high2half(a) : __high2half(b); + return __halves2half2(r1, r2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a, @@ -862,9 +825,9 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a, float a2 = __high2float(a); float b1 = __low2float(b); float b2 = __high2float(b); - __half r1 = a1 > b1 ? get_half2_low(a) : get_half2_low(b); - __half r2 = a2 > b2 ? get_half2_high(a) : get_half2_high(b); - return combine_half(r1, r2); + __half r1 = a1 > b1 ? __low2half(a) : __low2half(b); + __half r2 = a2 > b2 ? __high2half(a) : __high2half(b); + return __halves2half2(r1, r2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux(const half2& a) { @@ -885,7 +848,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max(const half2& a) { #else float a1 = __low2float(a); float a2 = __high2float(a); - return a1 > a2 ? get_half2_low(a) : get_half2_high(a); + return a1 > a2 ? 
__low2half(a) : __high2half(a); #endif } @@ -897,7 +860,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min(const half2& a) { #else float a1 = __low2float(a); float a2 = __high2float(a); - return a1 < a2 ? get_half2_low(a) : get_half2_high(a); + return a1 < a2 ? __low2half(a) : __high2half(a); #endif } @@ -1068,10 +1031,10 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pgather(const Eigen::half* from, Index stride) { Packet4h2 r; half2* p_alias = reinterpret_cast(&r); - p_alias[0] = combine_half(from[0 * stride], from[1 * stride]); - p_alias[1] = combine_half(from[2 * stride], from[3 * stride]); - p_alias[2] = combine_half(from[4 * stride], from[5 * stride]); - p_alias[3] = combine_half(from[6 * stride], from[7 * stride]); + p_alias[0] = __halves2half2(from[0 * stride], from[1 * stride]); + p_alias[1] = __halves2half2(from[2 * stride], from[3 * stride]); + p_alias[2] = __halves2half2(from[4 * stride], from[5 * stride]); + p_alias[3] = __halves2half2(from[6 * stride], from[7 * stride]); return r; } @@ -1152,12 +1115,12 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_half2( EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_half(half2& f0, half2& f1) { - __half a1 = get_half2_low(f0); - __half a2 = get_half2_high(f0); - __half b1 = get_half2_low(f1); - __half b2 = get_half2_high(f1); - f0 = combine_half(a1, b1); - f1 = combine_half(a2, b2); + __half a1 = __low2half(f0); + __half a2 = __high2half(f0); + __half b1 = __low2half(f1); + __half b2 = __high2half(f1); + f0 = __halves2half2(a1, b1); + f1 = __halves2half2(a2, b2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void @@ -1254,10 +1217,10 @@ plset(const Eigen::half& a) { float f = __half2float(a); Packet4h2 r; half2* p_alias = reinterpret_cast(&r); - p_alias[0] = combine_half(a, __float2half(f + 1.0f)); - p_alias[1] = combine_half(__float2half(f + 2.0f), __float2half(f + 3.0f)); - p_alias[2] = combine_half(__float2half(f + 4.0f), __float2half(f + 5.0f)); - p_alias[3] = combine_half(__float2half(f + 6.0f), __float2half(f + 7.0f)); + p_alias[0] = __halves2half2(a, __float2half(f + 1.0f)); + p_alias[1] = __halves2half2(__float2half(f + 2.0f), __float2half(f + 3.0f)); + p_alias[2] = __halves2half2(__float2half(f + 4.0f), __float2half(f + 5.0f)); + p_alias[3] = __halves2half2(__float2half(f + 6.0f), __float2half(f + 7.0f)); return r; #endif } @@ -1477,9 +1440,9 @@ template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max( const Packet4h2& a) { const half2* a_alias = reinterpret_cast(&a); - half2 m0 = combine_half(predux_max(a_alias[0]), + half2 m0 = __halves2half2(predux_max(a_alias[0]), predux_max(a_alias[1])); - half2 m1 = combine_half(predux_max(a_alias[2]), + half2 m1 = __halves2half2(predux_max(a_alias[2]), predux_max(a_alias[3])); __half first = predux_max(m0); __half second = predux_max(m1); @@ -1496,9 +1459,9 @@ template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min( const Packet4h2& a) { const half2* a_alias = reinterpret_cast(&a); - half2 m0 = combine_half(predux_min(a_alias[0]), + half2 m0 = __halves2half2(predux_min(a_alias[0]), predux_min(a_alias[1])); - half2 m1 = combine_half(predux_min(a_alias[2]), + half2 m1 = __halves2half2(predux_min(a_alias[2]), predux_min(a_alias[3])); __half first = predux_min(m0); __half second = predux_min(m1); @@ -1652,9 +1615,9 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2& a, float a2 = __high2float(a); float b1 = __low2float(b); float b2 = __high2float(b); - __half r1 = a1 < b1 ? 
get_half2_low(a) : get_half2_low(b); - __half r2 = a2 < b2 ? get_half2_high(a) : get_half2_high(b); - return combine_half(r1, r2); + __half r1 = a1 < b1 ? __low2half(a) : __low2half(b); + __half r2 = a2 < b2 ? __high2half(a) : __high2half(b); + return __halves2half2(r1, r2); } template<> @@ -1664,14 +1627,12 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a, float a2 = __high2float(a); float b1 = __low2float(b); float b2 = __high2float(b); - __half r1 = a1 > b1 ? get_half2_low(a) : get_half2_low(b); - __half r2 = a2 > b2 ? get_half2_high(a) : get_half2_high(b); - return combine_half(r1, r2); + __half r1 = a1 > b1 ? __low2half(a) : __low2half(b); + __half r2 = a2 > b2 ? __high2half(a) : __high2half(b); + return __halves2half2(r1, r2); } -// #endif // defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) - -#endif // defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16) +#endif // (defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)) && defined(EIGEN_GPU_COMPILE_PHASE) #undef EIGEN_GPU_HAS_LDG #undef EIGEN_CUDA_HAS_FP16_ARITHMETIC diff --git a/Eigen/src/Core/arch/GPU/TypeCasting.h b/Eigen/src/Core/arch/GPU/TypeCasting.h index 7545462..c8195bb 100644 --- a/Eigen/src/Core/arch/GPU/TypeCasting.h +++ b/Eigen/src/Core/arch/GPU/TypeCasting.h @@ -15,8 +15,7 @@ namespace Eigen { namespace internal { #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ - (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) - + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) template <> struct type_casting_traits { diff --git a/Eigen/src/Core/arch/MSA/Complex.h b/Eigen/src/Core/arch/MSA/Complex.h index 53dacfa..76e9f7c 100644 --- a/Eigen/src/Core/arch/MSA/Complex.h +++ b/Eigen/src/Core/arch/MSA/Complex.h @@ -75,15 +75,12 @@ struct Packet2cf { EIGEN_STRONG_INLINE Packet2cf operator-(const Packet2cf& b) const { return Packet2cf(*this) -= b; } - EIGEN_STRONG_INLINE Packet2cf& operator/=(const Packet2cf& b) { - *this *= b.conjugate(); - Packet4f s = pmul(b.v, b.v); - s = padd(s, (Packet4f)__builtin_msa_shf_w((v4i32)s, EIGEN_MSA_SHF_I8(1, 0, 3, 2))); - v = pdiv(v, s); - return *this; - } EIGEN_STRONG_INLINE Packet2cf operator/(const Packet2cf& b) const { - return Packet2cf(*this) /= b; + return pdiv_complex(Packet2cf(*this), b); + } + EIGEN_STRONG_INLINE Packet2cf& operator/=(const Packet2cf& b) { + *this = Packet2cf(*this) / b; + return *this; } EIGEN_STRONG_INLINE Packet2cf operator-(void) const { return Packet2cf(pnegate(v)); diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h index f40af7f..6cfe867 100644 --- a/Eigen/src/Core/arch/NEON/Complex.h +++ b/Eigen/src/Core/arch/NEON/Complex.h @@ -129,12 +129,12 @@ template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Pa template<> EIGEN_STRONG_INLINE Packet1cf pconj(const Packet1cf& a) { - const Packet2ui b = vreinterpret_u32_f32(a.v); + const Packet2ui b = Packet2ui(vreinterpret_u32_f32(a.v)); return Packet1cf(vreinterpret_f32_u32(veor_u32(b, p2ui_CONJ_XOR()))); } template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { - const Packet4ui b = vreinterpretq_u32_f32(a.v); + const Packet4ui b = Packet4ui(vreinterpretq_u32_f32(a.v)); return Packet2cf(vreinterpretq_f32_u32(veorq_u32(b, p4ui_CONJ_XOR()))); } @@ -347,27 +347,11 @@ EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) template<> EIGEN_STRONG_INLINE Packet1cf pdiv(const Packet1cf& a, 
const Packet1cf& b) { - // TODO optimize it for NEON - Packet1cf res = pmul(a, pconj(b)); - Packet2f s, rev_s; - - // this computes the norm - s = vmul_f32(b.v, b.v); - rev_s = vrev64_f32(s); - - return Packet1cf(pdiv(res.v, vadd_f32(s, rev_s))); + return pdiv_complex(a, b); } template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) { - // TODO optimize it for NEON - Packet2cf res = pmul(a,pconj(b)); - Packet4f s, rev_s; - - // this computes the norm - s = vmulq_f32(b.v, b.v); - rev_s = vrev64q_f32(s); - - return Packet2cf(pdiv(res.v, vaddq_f32(s, rev_s))); + return pdiv_complex(a, b); } EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& /*kernel*/) {} @@ -389,13 +373,10 @@ template<> EIGEN_STRONG_INLINE Packet2cf psqrt(const Packet2cf& a) { //---------- double ---------- #if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG -// See bug 1325, clang fails to call vld1q_u64. -#if EIGEN_COMP_CLANG || EIGEN_COMP_CASTXML - static uint64x2_t p2ul_CONJ_XOR = {0x0, 0x8000000000000000}; -#else - const uint64_t p2ul_conj_XOR_DATA[] = { 0x0, 0x8000000000000000 }; - static uint64x2_t p2ul_CONJ_XOR = vld1q_u64( p2ul_conj_XOR_DATA ); -#endif +inline uint64x2_t p2ul_CONJ_XOR() { + static const uint64_t p2ul_conj_XOR_DATA[] = {0x0, 0x8000000000000000}; + return vld1q_u64(p2ul_conj_XOR_DATA); +} struct Packet1cd { @@ -465,7 +446,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(a.v)); } template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) -{ return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v), p2ul_CONJ_XOR))); } +{ return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v), p2ul_CONJ_XOR()))); } template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) { @@ -480,7 +461,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, con // Multiply the imag a with b v2 = vmulq_f64(v2, b.v); // Conjugate v2 - v2 = vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(v2), p2ul_CONJ_XOR)); + v2 = vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(v2), p2ul_CONJ_XOR())); // Swap real/imag elements in v2. 
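Both the MSA operator/ and the NEON pdiv specializations above now forward to the shared pdiv_complex helper declared in GenericPacketMathFunctionsFwd.h. The formula the removed kernels implemented is the usual conjugate trick; a scalar sketch follows (the centralized version may add scaling for robustness, which is not shown here).

#include <complex>

// a / b = a * conj(b) / |b|^2, with |b|^2 = re(b)^2 + im(b)^2.
inline std::complex<float> complex_div_by_conjugate(std::complex<float> a,
                                                    std::complex<float> b) {
  const float norm = b.real() * b.real() + b.imag() * b.imag();
  const std::complex<float> num = a * std::conj(b);
  return std::complex<float>(num.real() / norm, num.imag() / norm);
}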
v2 = preverse(v2); // Add and return the result @@ -553,12 +534,7 @@ EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d) template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) { - // TODO optimize it for NEON - Packet1cd res = pmul(a,pconj(b)); - Packet2d s = pmul(b.v, b.v); - Packet2d rev_s = preverse(s); - - return Packet1cd(pdiv(res.v, padd(s,rev_s))); + return pdiv_complex(a, b); } EIGEN_STRONG_INLINE Packet1cd pcplxflip/**/(const Packet1cd& x) diff --git a/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h b/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h index 3481f33..0963b0f 100644 --- a/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h @@ -24,7 +24,7 @@ struct gebp_traits template EIGEN_STRONG_INLINE void madd(const Packet4f& a, const Packet4f& b, - Packet4f& c, Packet4f& tmp, + Packet4f& c, Packet4f&, const LaneIdType&) const { acc(a, b, c); } diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index d2aeef4..f24fdfb 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -57,6 +57,16 @@ typedef eigen_packet_wrapper Packet4ui; typedef eigen_packet_wrapper Packet2l; typedef eigen_packet_wrapper Packet2ul; +EIGEN_ALWAYS_INLINE Packet4f make_packet4f(float a, float b, float c, float d) { + float from[4] = {a, b, c, d}; + return vld1q_f32(from); +} + +EIGEN_ALWAYS_INLINE Packet2f make_packet2f(float a, float b) { + float from[2] = {a, b}; + return vld1_f32(from); +} + #else typedef float32x2_t Packet2f; @@ -78,11 +88,22 @@ typedef uint32x4_t Packet4ui; typedef int64x2_t Packet2l; typedef uint64x2_t Packet2ul; +EIGEN_ALWAYS_INLINE Packet4f make_packet4f(float a, float b, float c, float d) { + const Packet2f low = {a, b}; + const Packet2f high = {c, d}; + return vcombine_f32(low, high); +} + +EIGEN_ALWAYS_INLINE Packet2f make_packet2f(float a, float b) { + const Packet2f result = {a, b}; + return result; +} + #endif // EIGEN_COMP_MSVC_STRICT EIGEN_STRONG_INLINE Packet4f shuffle1(const Packet4f& m, int mask){ const float* a = reinterpret_cast(&m); - Packet4f res = {*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3 )), *(a + ((mask >> 6) & 3))}; + Packet4f res = make_packet4f(*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3 )), *(a + ((mask >> 6) & 3))); return res; } @@ -95,7 +116,7 @@ EIGEN_STRONG_INLINE Packet4f shuffle2(const Packet4f &m, const Packet4f &n, int { const float* a = reinterpret_cast(&m); const float* b = reinterpret_cast(&n); - Packet4f res = {*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(b + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3))}; + Packet4f res = make_packet4f(*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(b + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3))); return res; } @@ -104,7 +125,7 @@ EIGEN_STRONG_INLINE Packet4f shuffle2(const Packet4f &m, const Packet4f &n { const float* a = reinterpret_cast(&m); const float* b = reinterpret_cast(&n); - Packet4f res = {*(a + (mask & 3)), *(b + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3))}; + Packet4f res = make_packet4f(*(a + (mask & 3)), *(b + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3))); return res; } @@ -135,7 +156,7 @@ EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b return shuffle2(a,b,eigen_neon_shuffle_mask(2, 2, 3, 3)); } #define vec4f_duplane(a, p) \ - vdupq_lane_f32(vget_low_f32(a), p) + 
Packet4f(vdupq_lane_f32(vget_low_f32(a), p)) #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ const Packet4f p4f_##NAME = pset1(X) @@ -146,7 +167,7 @@ EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ const Packet4i p4i_##NAME = pset1(X) -#if EIGEN_ARCH_ARM64 +#if EIGEN_ARCH_ARM64 && EIGEN_COMP_GNUC // __builtin_prefetch tends to do nothing on ARM64 compilers because the // prefetch instructions there are too detailed for __builtin_prefetch to map // meaningfully to them. @@ -155,7 +176,7 @@ EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b #define EIGEN_ARM_PREFETCH(ADDR) __builtin_prefetch(ADDR); #elif defined __pld #define EIGEN_ARM_PREFETCH(ADDR) __pld(ADDR) -#elif EIGEN_ARCH_ARM32 +#elif EIGEN_ARCH_ARM #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ("pld [%[addr]]\n" :: [addr] "r" (ADDR) : ); #else // by default no explicit prefetching @@ -862,12 +883,12 @@ template<> EIGEN_STRONG_INLINE Packet2ul psub(const Packet2ul& a, con template<> EIGEN_STRONG_INLINE Packet2f pxor(const Packet2f& a, const Packet2f& b); template<> EIGEN_STRONG_INLINE Packet2f paddsub(const Packet2f& a, const Packet2f & b) { - Packet2f mask = {numext::bit_cast(0x80000000u), 0.0f}; + Packet2f mask = make_packet2f(numext::bit_cast(0x80000000u), 0.0f); return padd(a, pxor(mask, b)); } template<> EIGEN_STRONG_INLINE Packet4f pxor(const Packet4f& a, const Packet4f& b); template<> EIGEN_STRONG_INLINE Packet4f paddsub(const Packet4f& a, const Packet4f& b) { - Packet4f mask = {numext::bit_cast(0x80000000u), 0.0f, numext::bit_cast(0x80000000u), 0.0f}; + Packet4f mask = make_packet4f(numext::bit_cast(0x80000000u), 0.0f, numext::bit_cast(0x80000000u), 0.0f); return padd(a, pxor(mask, b)); } @@ -947,57 +968,6 @@ template<> EIGEN_STRONG_INLINE Packet2ul pmul(const Packet2ul& a, con vdup_n_u64(vgetq_lane_u64(a, 1)*vgetq_lane_u64(b, 1))); } -template<> EIGEN_STRONG_INLINE Packet2f pdiv(const Packet2f& a, const Packet2f& b) -{ -#if EIGEN_ARCH_ARM64 - return vdiv_f32(a,b); -#else - Packet2f inv, restep, div; - - // NEON does not offer a divide instruction, we have to do a reciprocal approximation - // However NEON in contrast to other SIMD engines (AltiVec/SSE), offers - // a reciprocal estimate AND a reciprocal step -which saves a few instructions - // vrecpeq_f32() returns an estimate to 1/b, which we will finetune with - // Newton-Raphson and vrecpsq_f32() - inv = vrecpe_f32(b); - - // This returns a differential, by which we will have to multiply inv to get a better - // approximation of 1/b. - restep = vrecps_f32(b, inv); - inv = vmul_f32(restep, inv); - - // Finally, multiply a by 1/b and get the wanted result of the division. - div = vmul_f32(a, inv); - - return div; -#endif -} -template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) -{ -#if EIGEN_ARCH_ARM64 - return vdivq_f32(a,b); -#else - Packet4f inv, restep, div; - - // NEON does not offer a divide instruction, we have to do a reciprocal approximation - // However NEON in contrast to other SIMD engines (AltiVec/SSE), offers - // a reciprocal estimate AND a reciprocal step -which saves a few instructions - // vrecpeq_f32() returns an estimate to 1/b, which we will finetune with - // Newton-Raphson and vrecpsq_f32() - inv = vrecpeq_f32(b); - - // This returns a differential, by which we will have to multiply inv to get a better - // approximation of 1/b. 
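The ARM32 fallback being deleted above documents the vrecpe/vrecps pairing: vrecpe_f32 yields a coarse estimate of 1/b and each vrecps_f32(b, x) returns 2 - b*x, so multiplying the two is one Newton-Raphson refinement. A scalar model of that step, with the initial estimate assumed to come from the hardware instruction:

// One Newton-Raphson refinement of a reciprocal estimate.
// The removed kernel applies a single step; the preciprocal() helper added
// further below applies two for extra accuracy before multiplying by a.
inline float refine_reciprocal(float b, float x /* coarse estimate of 1/b */) {
  return x * (2.0f - b * x);   // exactly what vmul(vrecps(b, x), x) computes
}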
- restep = vrecpsq_f32(b, inv); - inv = vmulq_f32(restep, inv); - - // Finally, multiply a by 1/b and get the wanted result of the division. - div = vmulq_f32(a, inv); - - return div; -#endif -} - template<> EIGEN_STRONG_INLINE Packet4c pdiv(const Packet4c& /*a*/, const Packet4c& /*b*/) { eigen_assert(false && "packet integer division are not supported by NEON"); @@ -1079,12 +1049,15 @@ template<> EIGEN_STRONG_INLINE Packet2ul pdiv(const Packet2ul& /*a*/, return pset1(0ULL); } - -#ifdef __ARM_FEATURE_FMA -template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) -{ return vfmaq_f32(c,a,b); } -template<> EIGEN_STRONG_INLINE Packet2f pmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c) -{ return vfma_f32(c,a,b); } +#ifdef EIGEN_VECTORIZE_FMA +template <> +EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { + return vfmaq_f32(c, a, b); +} +template <> +EIGEN_STRONG_INLINE Packet2f pmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c) { + return vfma_f32(c, a, b); +} #else template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { @@ -2499,7 +2472,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us predux_half_dowto4(co template<> EIGEN_STRONG_INLINE float predux_mul(const Packet2f& a) { return vget_lane_f32(a, 0) * vget_lane_f32(a, 1); } template<> EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) -{ return predux_mul(vmul_f32(vget_low_f32(a), vget_high_f32(a))); } +{ return predux_mul(vmul_f32(vget_low_f32(a), vget_high_f32(a))); } template<> EIGEN_STRONG_INLINE int8_t predux_mul(const Packet4c& a) { int8x8_t prod = vreinterpret_s8_s32(vdup_n_s32(a)); @@ -2513,7 +2486,7 @@ template<> EIGEN_STRONG_INLINE int8_t predux_mul(const Packet8c& a) return vget_lane_s8(prod, 0) * vget_lane_s8(prod, 4); } template<> EIGEN_STRONG_INLINE int8_t predux_mul(const Packet16c& a) -{ return predux_mul(vmul_s8(vget_low_s8(a), vget_high_s8(a))); } +{ return predux_mul(vmul_s8(vget_low_s8(a), vget_high_s8(a))); } template<> EIGEN_STRONG_INLINE uint8_t predux_mul(const Packet4uc& a) { uint8x8_t prod = vreinterpret_u8_u32(vdup_n_u32(a)); @@ -2527,7 +2500,7 @@ template<> EIGEN_STRONG_INLINE uint8_t predux_mul(const Packet8uc& a) return vget_lane_u8(prod, 0) * vget_lane_u8(prod, 4); } template<> EIGEN_STRONG_INLINE uint8_t predux_mul(const Packet16uc& a) -{ return predux_mul(vmul_u8(vget_low_u8(a), vget_high_u8(a))); } +{ return predux_mul(vmul_u8(vget_low_u8(a), vget_high_u8(a))); } template<> EIGEN_STRONG_INLINE int16_t predux_mul(const Packet4s& a) { const int16x4_t prod = vmul_s16(a, vrev32_s16(a)); @@ -2563,11 +2536,11 @@ template<> EIGEN_STRONG_INLINE uint16_t predux_mul(const Packet8us& a template<> EIGEN_STRONG_INLINE int32_t predux_mul(const Packet2i& a) { return vget_lane_s32(a, 0) * vget_lane_s32(a, 1); } template<> EIGEN_STRONG_INLINE int32_t predux_mul(const Packet4i& a) -{ return predux_mul(vmul_s32(vget_low_s32(a), vget_high_s32(a))); } +{ return predux_mul(vmul_s32(vget_low_s32(a), vget_high_s32(a))); } template<> EIGEN_STRONG_INLINE uint32_t predux_mul(const Packet2ui& a) { return vget_lane_u32(a, 0) * vget_lane_u32(a, 1); } template<> EIGEN_STRONG_INLINE uint32_t predux_mul(const Packet4ui& a) -{ return predux_mul(vmul_u32(vget_low_u32(a), vget_high_u32(a))); } +{ return predux_mul(vmul_u32(vget_low_u32(a), vget_high_u32(a))); } template<> EIGEN_STRONG_INLINE int64_t predux_mul(const Packet2l& a) { return vgetq_lane_s64(a, 0) * 
vgetq_lane_s64(a, 1); } template<> EIGEN_STRONG_INLINE uint64_t predux_mul(const Packet2ul& a) @@ -3180,7 +3153,7 @@ template<> EIGEN_STRONG_INLINE Packet2f pceil(const Packet2f& a) return padd(tmp, mask); } -#endif +#endif // EIGEN_ARCH_ARMV8 /** * Computes the integer square root @@ -3273,40 +3246,115 @@ template<> EIGEN_STRONG_INLINE Packet4ui psqrt(const Packet4ui& a) { return res; } -template<> EIGEN_STRONG_INLINE Packet4f prsqrt(const Packet4f& a) { +EIGEN_STRONG_INLINE Packet4f prsqrt_float_unsafe(const Packet4f& a) { // Compute approximate reciprocal sqrt. - Packet4f x = vrsqrteq_f32(a); - // Do Newton iterations for 1/sqrt(x). - x = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, x), x), x); - x = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, x), x), x); - const Packet4f infinity = pset1(NumTraits::infinity()); - return pselect(pcmp_eq(a, pzero(a)), infinity, x); + // Does not correctly handle +/- 0 or +inf + float32x4_t result = vrsqrteq_f32(a); + result = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, result), result), result); + result = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, result), result), result); + return result; +} + +EIGEN_STRONG_INLINE Packet2f prsqrt_float_unsafe(const Packet2f& a) { + // Compute approximate reciprocal sqrt. + // Does not correctly handle +/- 0 or +inf + float32x2_t result = vrsqrte_f32(a); + result = vmul_f32(vrsqrts_f32(vmul_f32(a, result), result), result); + result = vmul_f32(vrsqrts_f32(vmul_f32(a, result), result), result); + return result; +} + +template Packet prsqrt_float_common(const Packet& a) { + const Packet cst_zero = pzero(a); + const Packet cst_inf = pset1(NumTraits::infinity()); + Packet return_zero = pcmp_eq(a, cst_inf); + Packet return_inf = pcmp_eq(a, cst_zero); + Packet result = prsqrt_float_unsafe(a); + result = pselect(return_inf, por(cst_inf, a), result); + result = pandnot(result, return_zero); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet4f prsqrt(const Packet4f& a) { + return prsqrt_float_common(a); } template<> EIGEN_STRONG_INLINE Packet2f prsqrt(const Packet2f& a) { - // Compute approximate reciprocal sqrt. - Packet2f x = vrsqrte_f32(a); - // Do Newton iterations for 1/sqrt(x). - x = vmul_f32(vrsqrts_f32(vmul_f32(a, x), x), x); - x = vmul_f32(vrsqrts_f32(vmul_f32(a, x), x), x); - const Packet2f infinity = pset1(NumTraits::infinity()); - return pselect(pcmp_eq(a, pzero(a)), infinity, x); + return prsqrt_float_common(a); +} + +EIGEN_STRONG_INLINE Packet4f preciprocal(const Packet4f& a) +{ + // Compute approximate reciprocal. + float32x4_t result = vrecpeq_f32(a); + result = vmulq_f32(vrecpsq_f32(a, result), result); + result = vmulq_f32(vrecpsq_f32(a, result), result); + return result; +} + +EIGEN_STRONG_INLINE Packet2f preciprocal(const Packet2f& a) +{ + // Compute approximate reciprocal. + float32x2_t result = vrecpe_f32(a); + result = vmul_f32(vrecps_f32(a, result), result); + result = vmul_f32(vrecps_f32(a, result), result); + return result; } // Unfortunately vsqrt_f32 is only available for A64. 
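The *_unsafe and *_common split above exists because the raw estimate/refinement sequence breaks down at the boundary values: vrsqrte_f32(0) is infinity and vrsqrte_f32(inf) is 0, and each refinement step vrsqrts_f32(a*x, x) returns (3 - (a*x)*x)/2, so it would multiply 0 by infinity and produce NaN. A scalar model of what prsqrt_float_common does instead, again assuming the estimate comes from the hardware instruction:

#include <cmath>
#include <limits>

// Two Newton-Raphson refinements of a reciprocal-square-root estimate,
// mirroring prsqrt_float_unsafe.
inline float refine_rsqrt(float a, float x /* coarse estimate of 1/sqrt(a) */) {
  x = x * (3.0f - (a * x) * x) * 0.5f;
  x = x * (3.0f - (a * x) * x) * 0.5f;
  return x;
}

// Special-case wrapper, mirroring prsqrt_float_common: +/-0 maps to a signed
// infinity and +inf maps to 0, bypassing the NaN-producing refinement.
inline float rsqrt_with_guards(float a, float estimate) {
  if (a == 0.0f) return std::copysign(std::numeric_limits<float>::infinity(), a);
  if (a == std::numeric_limits<float>::infinity()) return 0.0f;
  return refine_rsqrt(a, estimate);
}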
#if EIGEN_ARCH_ARM64 -template<> EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& _x){return vsqrtq_f32(_x);} -template<> EIGEN_STRONG_INLINE Packet2f psqrt(const Packet2f& _x){return vsqrt_f32(_x); } +template<> EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) { return vsqrtq_f32(a); } + +template<> EIGEN_STRONG_INLINE Packet2f psqrt(const Packet2f& a) { return vsqrt_f32(a); } + +template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) { return vdivq_f32(a, b); } + +template<> EIGEN_STRONG_INLINE Packet2f pdiv(const Packet2f& a, const Packet2f& b) { return vdiv_f32(a, b); } #else -template<> EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) { - const Packet4f infinity = pset1(NumTraits::infinity()); - const Packet4f is_zero_or_inf = por(pcmp_eq(a, pzero(a)), pcmp_eq(a, infinity)); - return pselect(is_zero_or_inf, a, pmul(a, prsqrt(a))); +template +EIGEN_STRONG_INLINE Packet psqrt_float_common(const Packet& a) { + const Packet cst_zero = pzero(a); + const Packet cst_inf = pset1(NumTraits::infinity()); + + Packet result = pmul(a, prsqrt_float_unsafe(a)); + Packet a_is_zero = pcmp_eq(a, cst_zero); + Packet a_is_inf = pcmp_eq(a, cst_inf); + Packet return_a = por(a_is_zero, a_is_inf); + + result = pselect(return_a, a, result); + return result; } + +template<> EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) { + return psqrt_float_common(a); +} + template<> EIGEN_STRONG_INLINE Packet2f psqrt(const Packet2f& a) { - const Packet2f infinity = pset1(NumTraits::infinity()); - const Packet2f is_zero_or_inf = por(pcmp_eq(a, pzero(a)), pcmp_eq(a, infinity)); - return pselect(is_zero_or_inf, a, pmul(a, prsqrt(a))); + return psqrt_float_common(a); +} + +template +EIGEN_STRONG_INLINE Packet pdiv_float_common(const Packet& a, const Packet& b) { + // if b is large, NEON intrinsics will flush preciprocal(b) to zero + // avoid underflow with the following manipulation: + // a / b = f * (a * reciprocal(f * b)) + + const Packet cst_one = pset1(1.0f); + const Packet cst_quarter = pset1(0.25f); + const Packet cst_thresh = pset1(NumTraits::highest() / 4.0f); + + Packet b_will_underflow = pcmp_le(cst_thresh, pabs(b)); + Packet f = pselect(b_will_underflow, cst_quarter, cst_one); + Packet result = pmul(f, pmul(a, preciprocal(pmul(b, f)))); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) { + return pdiv_float_common(a, b); +} + +template<> EIGEN_STRONG_INLINE Packet2f pdiv(const Packet2f& a, const Packet2f& b) { + return pdiv_float_common(a, b); } #endif @@ -3388,7 +3436,7 @@ EIGEN_STRONG_INLINE Packet4bf F32ToBf16(const Packet4f& p) { // See the scalar implementation in BFloat16.h for a comprehensible explanation // of this fast rounding algorithm - Packet4ui input = reinterpret_cast(p); + Packet4ui input = Packet4ui(vreinterpretq_u32_f32(p)); // lsb = (input >> 16) & 1 Packet4ui lsb = vandq_u32(vshrq_n_u32(input, 16), vdupq_n_u32(1)); @@ -3413,7 +3461,7 @@ EIGEN_STRONG_INLINE Packet4bf F32ToBf16(const Packet4f& p) EIGEN_STRONG_INLINE Packet4f Bf16ToF32(const Packet4bf& p) { - return reinterpret_cast(vshlq_n_u32(vmovl_u16(p), 16)); + return Packet4f(vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(p), 16))); } EIGEN_STRONG_INLINE Packet4bf F32MaskToBf16Mask(const Packet4f& p) { @@ -3421,21 +3469,21 @@ EIGEN_STRONG_INLINE Packet4bf F32MaskToBf16Mask(const Packet4f& p) { } template<> EIGEN_STRONG_INLINE Packet4bf pset1(const bfloat16& from) { - return pset1(from.value); + return Packet4bf(pset1(from.value)); } template<>
EIGEN_STRONG_INLINE bfloat16 pfirst(const Packet4bf& from) { - return bfloat16_impl::raw_uint16_to_bfloat16(static_cast(pfirst(from))); + return bfloat16_impl::raw_uint16_to_bfloat16(static_cast(pfirst(Packet4us(from)))); } template<> EIGEN_STRONG_INLINE Packet4bf pload(const bfloat16* from) { - return pload(reinterpret_cast(from)); + return Packet4bf(pload(reinterpret_cast(from))); } template<> EIGEN_STRONG_INLINE Packet4bf ploadu(const bfloat16* from) { - return ploadu(reinterpret_cast(from)); + return Packet4bf(ploadu(reinterpret_cast(from))); } template<> EIGEN_STRONG_INLINE void pstore(bfloat16* to, const Packet4bf& from) @@ -3450,7 +3498,7 @@ template<> EIGEN_STRONG_INLINE void pstoreu(bfloat16* to, const Packet template<> EIGEN_STRONG_INLINE Packet4bf ploaddup(const bfloat16* from) { - return ploaddup(reinterpret_cast(from)); + return Packet4bf(ploaddup(reinterpret_cast(from))); } template <> EIGEN_STRONG_INLINE Packet4bf pabs(const Packet4bf& a) { @@ -3497,25 +3545,25 @@ template<> EIGEN_STRONG_INLINE Packet4bf plset(const bfloat16& a) } template<> EIGEN_STRONG_INLINE Packet4bf por(const Packet4bf& a,const Packet4bf& b) { - return por(a, b); + return Packet4bf(por(Packet4us(a), Packet4us(b))); } template<> EIGEN_STRONG_INLINE Packet4bf pxor(const Packet4bf& a,const Packet4bf& b) { - return pxor(a, b); + return Packet4bf(pxor(Packet4us(a), Packet4us(b))); } template<> EIGEN_STRONG_INLINE Packet4bf pand(const Packet4bf& a,const Packet4bf& b) { - return pand(a, b); + return Packet4bf(pand(Packet4us(a), Packet4us(b))); } template<> EIGEN_STRONG_INLINE Packet4bf pandnot(const Packet4bf& a,const Packet4bf& b) { - return pandnot(a, b); + return Packet4bf(pandnot(Packet4us(a), Packet4us(b))); } template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4bf pselect(const Packet4bf& mask, const Packet4bf& a, const Packet4bf& b) { - return pselect(mask, a, b); + return Packet4bf(pselect(Packet4us(mask), Packet4us(a), Packet4us(b))); } template<> EIGEN_STRONG_INLINE Packet4bf print(const Packet4bf& a) @@ -3554,13 +3602,13 @@ template<> EIGEN_STRONG_INLINE Packet4bf pdiv(const Packet4bf& a, con template<> EIGEN_STRONG_INLINE Packet4bf pgather(const bfloat16* from, Index stride) { - return pgather(reinterpret_cast(from), stride); + return Packet4bf(pgather(reinterpret_cast(from), stride)); } template<> EIGEN_STRONG_INLINE void pscatter(bfloat16* to, const Packet4bf& from, Index stride) { - pscatter(reinterpret_cast(to), from, stride); + pscatter(reinterpret_cast(to), Packet4us(from), stride); } template<> EIGEN_STRONG_INLINE bfloat16 predux(const Packet4bf& a) @@ -3585,7 +3633,7 @@ template<> EIGEN_STRONG_INLINE bfloat16 predux_mul(const Packet4bf& a template<> EIGEN_STRONG_INLINE Packet4bf preverse(const Packet4bf& a) { - return preverse(a); + return Packet4bf(preverse(Packet4us(a))); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) @@ -3620,7 +3668,7 @@ template<> EIGEN_STRONG_INLINE Packet4bf pcmp_le(const Packet4bf& a, template<> EIGEN_STRONG_INLINE Packet4bf pnegate(const Packet4bf& a) { - return pxor(a, pset1(static_cast(0x8000))); + return Packet4bf(pxor(Packet4us(a), pset1(static_cast(0x8000)))); } //---------- double ---------- @@ -3638,17 +3686,35 @@ template<> EIGEN_STRONG_INLINE Packet4bf pnegate(const Packet4bf& a) #if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG +#if EIGEN_COMP_GNUC // Bug 907: workaround missing declarations of the following two functions in the ADK // Defining these functions as templates ensures that if these intrinsics are // already 
defined in arm_neon.h, then our workaround doesn't cause a conflict // and has lower priority in overload resolution. +// This doesn't work with MSVC though, since the function names are macros. template uint64x2_t vreinterpretq_u64_f64(T a) { return (uint64x2_t) a; } - template float64x2_t vreinterpretq_f64_u64(T a) { return (float64x2_t) a; } +#endif +#if EIGEN_COMP_MSVC_STRICT +typedef eigen_packet_wrapper Packet2d; +typedef eigen_packet_wrapper Packet1d; + +EIGEN_ALWAYS_INLINE Packet2d make_packet2d(double a, double b) { + double from[2] = {a, b}; + return vld1q_f64(from); +} + +#else typedef float64x2_t Packet2d; typedef float64x1_t Packet1d; +EIGEN_ALWAYS_INLINE Packet2d make_packet2d(double a, double b) { + double from[2] = {a, b}; + return vld1q_f64(from); +} +#endif + // functionally equivalent to _mm_shuffle_pd in SSE (i.e. shuffle(m, n, mask) equals _mm_shuffle_pd(m,n,mask)) // Currently used in LU/arch/InverseSize4.h to enable a shared implementation // for fast inversion of matrices of size 4. @@ -3656,7 +3722,7 @@ EIGEN_STRONG_INLINE Packet2d shuffle(const Packet2d& m, const Packet2d& n, int m { const double* a = reinterpret_cast(&m); const double* b = reinterpret_cast(&n); - Packet2d res = {*(a + (mask & 1)), *(b + ((mask >> 1) & 1))}; + Packet2d res = make_packet2d(*(a + (mask & 1)), *(b + ((mask >> 1) & 1))); return res; } @@ -3673,7 +3739,7 @@ EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(const Packet2d& a,const Packet2d& b) return shuffle(a, b, 3); } #define vec2d_duplane(a, p) \ - vdupq_laneq_f64(a, p) + Packet2d(vdupq_laneq_f64(a, p)) template<> struct packet_traits : default_packet_traits { @@ -3747,7 +3813,7 @@ template<> EIGEN_STRONG_INLINE Packet2d psub(const Packet2d& a, const template<> EIGEN_STRONG_INLINE Packet2d pxor(const Packet2d& , const Packet2d& ); template<> EIGEN_STRONG_INLINE Packet2d paddsub(const Packet2d& a, const Packet2d& b){ - const Packet2d mask = {numext::bit_cast(0x8000000000000000ull),0.0}; + const Packet2d mask = make_packet2d(numext::bit_cast(0x8000000000000000ull), 0.0); return padd(a, pxor(mask, b)); } @@ -3759,7 +3825,7 @@ template<> EIGEN_STRONG_INLINE Packet2d pmul(const Packet2d& a, const template<> EIGEN_STRONG_INLINE Packet2d pdiv(const Packet2d& a, const Packet2d& b) { return vdivq_f64(a,b); } -#ifdef __ARM_FEATURE_FMA +#ifdef EIGEN_VECTORIZE_FMA // See bug 936. See above comment about FMA for float. template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vfmaq_f64(c,a,b); } @@ -3862,7 +3928,7 @@ template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) { return (vget_low_f64(a) * vget_high_f64(a))[0]; } #else template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) -{ return vget_lane_f64(vget_low_f64(a) * vget_high_f64(a), 0); } +{ return vget_lane_f64(vmul_f64(vget_low_f64(a), vget_high_f64(a)), 0); } #endif // min @@ -3918,7 +3984,7 @@ template<> EIGEN_STRONG_INLINE Packet2d prsqrt(const Packet2d& a) { template<> EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& _x){ return vsqrtq_f64(_x); } -#endif // EIGEN_ARCH_ARM64 +#endif // EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG // Do we have fp16 types and supporting Neon intrinsics?
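Looking back at the pdiv_float_common helper introduced a few hunks earlier: the rescaling guards against the reciprocal estimate underflowing when |b| is close to the largest float, since a subnormal 1/b is flushed to zero and would turn every quotient into 0. A scalar sketch of the failure mode and the fix; flushed_reciprocal only mimics the flush-to-zero behaviour of the NEON estimate and is not an Eigen function.

#include <cmath>
#include <limits>

// Stand-in for a reciprocal estimate that flushes subnormal results to zero.
inline float flushed_reciprocal(float x) {
  const float r = 1.0f / x;
  return std::fpclassify(r) == FP_SUBNORMAL ? 0.0f : r;
}

// a / b computed as f * (a * reciprocal(f * b)), with f = 0.25 only when |b|
// is large enough that 1/b could be subnormal.
inline float div_with_rescaling(float a, float b) {
  const float thresh = std::numeric_limits<float>::max() / 4.0f;
  const float f = (std::fabs(b) >= thresh) ? 0.25f : 1.0f;
  return f * (a * flushed_reciprocal(b * f));
}

// Example: a = 1.0e38f, b = 2.0e38f, true quotient 0.5f.
//   naive:     1/b is about 5e-39, subnormal, flushed to 0, so a * 0 == 0.
//   rescaled:  1/(0.25f*b) is about 2e-38, a normal float, and
//              0.25f * (a * 2e-38f) recovers roughly 0.5f.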
#if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC diff --git a/Eigen/src/Core/arch/NEON/TypeCasting.h b/Eigen/src/Core/arch/NEON/TypeCasting.h index 54f9733..c546466 100644 --- a/Eigen/src/Core/arch/NEON/TypeCasting.h +++ b/Eigen/src/Core/arch/NEON/TypeCasting.h @@ -15,6 +15,113 @@ namespace Eigen { namespace internal { +//============================================================================== +// preinterpret +//============================================================================== +template <> +EIGEN_STRONG_INLINE Packet2f preinterpret(const Packet2i& a) { + return Packet2f(vreinterpret_f32_s32(a)); +} +template <> +EIGEN_STRONG_INLINE Packet2f preinterpret(const Packet2ui& a) { + return Packet2f(vreinterpret_f32_u32(a)); +} +template <> +EIGEN_STRONG_INLINE Packet4f preinterpret(const Packet4i& a) { + return Packet4f(vreinterpretq_f32_s32(a)); +} +template <> +EIGEN_STRONG_INLINE Packet4f preinterpret(const Packet4ui& a) { + return Packet4f(vreinterpretq_f32_u32(a)); +} + +template <> +EIGEN_STRONG_INLINE Packet4c preinterpret(const Packet4uc& a) { + return static_cast(a); +} +template <> +EIGEN_STRONG_INLINE Packet8c preinterpret(const Packet8uc& a) { + return Packet8c(vreinterpret_s8_u8(a)); +} +template <> +EIGEN_STRONG_INLINE Packet16c preinterpret(const Packet16uc& a) { + return Packet16c(vreinterpretq_s8_u8(a)); +} + +template <> +EIGEN_STRONG_INLINE Packet4uc preinterpret(const Packet4c& a) { + return static_cast(a); +} +template <> +EIGEN_STRONG_INLINE Packet8uc preinterpret(const Packet8c& a) { + return Packet8uc(vreinterpret_u8_s8(a)); +} +template <> +EIGEN_STRONG_INLINE Packet16uc preinterpret(const Packet16c& a) { + return Packet16uc(vreinterpretq_u8_s8(a)); +} + +template <> +EIGEN_STRONG_INLINE Packet4s preinterpret(const Packet4us& a) { + return Packet4s(vreinterpret_s16_u16(a)); +} +template <> +EIGEN_STRONG_INLINE Packet8s preinterpret(const Packet8us& a) { + return Packet8s(vreinterpretq_s16_u16(a)); +} + +template <> +EIGEN_STRONG_INLINE Packet4us preinterpret(const Packet4s& a) { + return Packet4us(vreinterpret_u16_s16(a)); +} +template <> +EIGEN_STRONG_INLINE Packet8us preinterpret(const Packet8s& a) { + return Packet8us(vreinterpretq_u16_s16(a)); +} + +template <> +EIGEN_STRONG_INLINE Packet2i preinterpret(const Packet2f& a) { + return Packet2i(vreinterpret_s32_f32(a)); +} +template <> +EIGEN_STRONG_INLINE Packet2i preinterpret(const Packet2ui& a) { + return Packet2i(vreinterpret_s32_u32(a)); +} +template <> +EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet4f& a) { + return Packet4i(vreinterpretq_s32_f32(a)); +} +template <> +EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet4ui& a) { + return Packet4i(vreinterpretq_s32_u32(a)); +} + +template <> +EIGEN_STRONG_INLINE Packet2ui preinterpret(const Packet2f& a) { + return Packet2ui(vreinterpret_u32_f32(a)); +} +template <> +EIGEN_STRONG_INLINE Packet2ui preinterpret(const Packet2i& a) { + return Packet2ui(vreinterpret_u32_s32(a)); +} +template <> +EIGEN_STRONG_INLINE Packet4ui preinterpret(const Packet4f& a) { + return Packet4ui(vreinterpretq_u32_f32(a)); +} +template <> +EIGEN_STRONG_INLINE Packet4ui preinterpret(const Packet4i& a) { + return Packet4ui(vreinterpretq_u32_s32(a)); +} + +template <> +EIGEN_STRONG_INLINE Packet2l preinterpret(const Packet2ul& a) { + return Packet2l(vreinterpretq_s64_u64(a)); +} +template <> +EIGEN_STRONG_INLINE Packet2ul preinterpret(const Packet2l& a) { + return Packet2ul(vreinterpretq_u64_s64(a)); +} + 
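The preinterpret overloads gathered above only relabel the bits of a packet (wrapping the vreinterpret intrinsics in explicit Packet constructors, presumably for the wrapper packet types used under MSVC), whereas pcast converts values. A scalar analogue of the distinction:

#include <cstdint>
#include <cstring>

// preinterpret: same bits, different label.
inline std::int32_t bits_of(float x) {
  std::int32_t b;
  std::memcpy(&b, &x, sizeof(b));   // 1.0f yields 0x3F800000
  return b;
}

// pcast: value conversion.
inline std::int32_t value_of(float x) {
  return static_cast<std::int32_t>(x);   // 1.0f yields 1
}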
//============================================================================== // pcast, SrcType = float //============================================================================== @@ -188,7 +295,7 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet2ul pcast(const Packet16c& a) { - return vreinterpretq_u64_s64(pcast(a)); + return preinterpret(pcast(a)); } template <> @@ -212,11 +319,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet4ui pcast(const Packet16c& a) { - return vreinterpretq_u32_s32(pcast(a)); + return preinterpret(pcast(a)); } template <> EIGEN_STRONG_INLINE Packet2ui pcast(const Packet8c& a) { - return vreinterpret_u32_s32(pcast(a)); + return preinterpret(pcast(a)); } template <> @@ -240,11 +347,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet8us pcast(const Packet16c& a) { - return vreinterpretq_u16_s16(pcast(a)); + return preinterpret(pcast(a)); } template <> EIGEN_STRONG_INLINE Packet4us pcast(const Packet8c& a) { - return vreinterpret_u16_s16(pcast(a)); + return preinterpret(pcast(a)); } template <> @@ -270,11 +377,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet16uc pcast(const Packet16c& a) { - return vreinterpretq_u8_s8(a); + return preinterpret(a); } template <> EIGEN_STRONG_INLINE Packet8uc pcast(const Packet8c& a) { - return vreinterpret_u8_s8(a); + return preinterpret(a); } template <> EIGEN_STRONG_INLINE Packet4uc pcast(const Packet4c& a) { @@ -315,7 +422,7 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet2l pcast(const Packet16uc& a) { - return vreinterpretq_s64_u64(pcast(a)); + return preinterpret(pcast(a)); } template <> @@ -339,11 +446,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet4i pcast(const Packet16uc& a) { - return vreinterpretq_s32_u32(pcast(a)); + return preinterpret(pcast(a)); } template <> EIGEN_STRONG_INLINE Packet2i pcast(const Packet8uc& a) { - return vreinterpret_s32_u32(pcast(a)); + return preinterpret(pcast(a)); } template <> @@ -367,11 +474,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet8s pcast(const Packet16uc& a) { - return vreinterpretq_s16_u16(pcast(a)); + return preinterpret(pcast(a)); } template <> EIGEN_STRONG_INLINE Packet4s pcast(const Packet8uc& a) { - return vreinterpret_s16_u16(pcast(a)); + return preinterpret(pcast(a)); } template <> @@ -397,11 +504,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet16c pcast(const Packet16uc& a) { - return vreinterpretq_s8_u8(a); + return preinterpret(a); } template <> EIGEN_STRONG_INLINE Packet8c pcast(const Packet8uc& a) { - return vreinterpret_s8_u8(a); + return preinterpret(a); } template <> EIGEN_STRONG_INLINE Packet4c pcast(const Packet4uc& a) { @@ -442,7 +549,7 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet2ul pcast(const Packet8s& a) { - return vreinterpretq_u64_s64(pcast(a)); + return preinterpret(pcast(a)); } template <> @@ -466,11 +573,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet4ui pcast(const Packet8s& a) { - return vreinterpretq_u32_s32(pcast(a)); + return preinterpret(pcast(a)); } template <> EIGEN_STRONG_INLINE Packet2ui pcast(const Packet4s& a) { - return vreinterpret_u32_s32(pcast(a)); + return preinterpret(pcast(a)); } template <> @@ -492,11 +599,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet8us pcast(const Packet8s& a) { - return vreinterpretq_u16_s16(a); + return 
preinterpret(a); } template <> EIGEN_STRONG_INLINE Packet4us pcast(const Packet4s& a) { - return vreinterpret_u16_s16(a); + return preinterpret(a); } template <> @@ -559,7 +666,7 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet2l pcast(const Packet8us& a) { - return vreinterpretq_s64_u64(pcast(a)); + return preinterpret(pcast(a)); } template <> @@ -583,11 +690,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet4i pcast(const Packet8us& a) { - return vreinterpretq_s32_u32(pcast(a)); + return preinterpret(pcast(a)); } template <> EIGEN_STRONG_INLINE Packet2i pcast(const Packet4us& a) { - return vreinterpret_s32_u32(pcast(a)); + return preinterpret(pcast(a)); } template <> @@ -609,11 +716,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet8s pcast(const Packet8us& a) { - return vreinterpretq_s16_u16(a); + return preinterpret(a); } template <> EIGEN_STRONG_INLINE Packet4s pcast(const Packet4us& a) { - return vreinterpret_s16_u16(a); + return preinterpret(a); } template <> @@ -635,11 +742,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet16c pcast(const Packet8us& a, const Packet8us& b) { - return vreinterpretq_s8_u8(pcast(a, b)); + return preinterpret(pcast(a, b)); } template <> EIGEN_STRONG_INLINE Packet8c pcast(const Packet4us& a, const Packet4us& b) { - return vreinterpret_s8_u8(pcast(a, b)); + return preinterpret(pcast(a, b)); } //============================================================================== @@ -674,7 +781,7 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet2ul pcast(const Packet4i& a) { - return vreinterpretq_u64_s64(pcast(a)); + return preinterpret(pcast(a)); } template <> @@ -696,11 +803,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet4ui pcast(const Packet4i& a) { - return vreinterpretq_u32_s32(a); + return preinterpret(a); } template <> EIGEN_STRONG_INLINE Packet2ui pcast(const Packet2i& a) { - return vreinterpret_u32_s32(a); + return preinterpret(a); } template <> @@ -799,7 +906,7 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet2l pcast(const Packet4ui& a) { - return vreinterpretq_s64_u64(pcast(a)); + return preinterpret(pcast(a)); } template <> @@ -821,11 +928,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet4i pcast(const Packet4ui& a) { - return vreinterpretq_s32_u32(a); + return preinterpret(a); } template <> EIGEN_STRONG_INLINE Packet2i pcast(const Packet2ui& a) { - return vreinterpret_s32_u32(a); + return preinterpret(a); } template <> @@ -847,11 +954,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet8s pcast(const Packet4ui& a, const Packet4ui& b) { - return vreinterpretq_s16_u16(pcast(a, b)); + return preinterpret(pcast(a, b)); } template <> EIGEN_STRONG_INLINE Packet4s pcast(const Packet2ui& a, const Packet2ui& b) { - return vreinterpret_s16_u16(pcast(a, b)); + return preinterpret(pcast(a, b)); } template <> @@ -880,12 +987,12 @@ struct type_casting_traits { template <> EIGEN_STRONG_INLINE Packet16c pcast(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c, const Packet4ui& d) { - return vreinterpretq_s8_u8(pcast(a, b, c, d)); + return preinterpret(pcast(a, b, c, d)); } template <> EIGEN_STRONG_INLINE Packet8c pcast(const Packet2ui& a, const Packet2ui& b, const Packet2ui& c, const Packet2ui& d) { - return vreinterpret_s8_u8(pcast(a, b, c, d)); + return preinterpret(pcast(a, b, c, d)); } 
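A pattern repeated throughout these casts: a conversion to an unsigned target is written as the cast to the signed type of the same width followed by a preinterpret. The scalar equivalent shows why the two routes agree; C++ defines conversion to an unsigned type modulo 2^N, which matches reinterpreting the sign-extended result.

#include <cstdint>

// int16 -> uint32 via the signed intermediate, as in the packet casts above.
inline std::uint32_t widen_to_u32(std::int16_t x) {
  const std::int32_t wide = static_cast<std::int32_t>(x);  // sign-extend
  return static_cast<std::uint32_t>(wide);                 // relabel the same bits
}
// widen_to_u32(-1) == 0xFFFFFFFFu, the same value that a direct
// static_cast<std::uint32_t>(std::int16_t(-1)) produces.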
//============================================================================== @@ -915,7 +1022,7 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet2ul pcast(const Packet2l& a) { - return vreinterpretq_u64_s64(a); + return preinterpret(a); } template <> @@ -1013,7 +1120,7 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet2l pcast(const Packet2ul& a) { - return vreinterpretq_s64_u64(a); + return preinterpret(a); } template <> @@ -1031,7 +1138,7 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet4i pcast(const Packet2ul& a, const Packet2ul& b) { - return vreinterpretq_s32_u32(pcast(a, b)); + return preinterpret(pcast(a, b)); } template <> @@ -1053,7 +1160,7 @@ struct type_casting_traits { template <> EIGEN_STRONG_INLINE Packet8s pcast(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c, const Packet2ul& d) { - return vreinterpretq_s16_u16(pcast(a, b, c, d)); + return preinterpret(pcast(a, b, c, d)); } template <> @@ -1077,114 +1184,7 @@ template <> EIGEN_STRONG_INLINE Packet16c pcast(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c, const Packet2ul& d, const Packet2ul& e, const Packet2ul& f, const Packet2ul& g, const Packet2ul& h) { - return vreinterpretq_s8_u8(pcast(a, b, c, d, e, f, g, h)); -} - -//============================================================================== -// preinterpret -//============================================================================== -template <> -EIGEN_STRONG_INLINE Packet2f preinterpret(const Packet2i& a) { - return vreinterpret_f32_s32(a); -} -template <> -EIGEN_STRONG_INLINE Packet2f preinterpret(const Packet2ui& a) { - return vreinterpret_f32_u32(a); -} -template <> -EIGEN_STRONG_INLINE Packet4f preinterpret(const Packet4i& a) { - return vreinterpretq_f32_s32(a); -} -template <> -EIGEN_STRONG_INLINE Packet4f preinterpret(const Packet4ui& a) { - return vreinterpretq_f32_u32(a); -} - -template <> -EIGEN_STRONG_INLINE Packet4c preinterpret(const Packet4uc& a) { - return static_cast(a); -} -template <> -EIGEN_STRONG_INLINE Packet8c preinterpret(const Packet8uc& a) { - return vreinterpret_s8_u8(a); -} -template <> -EIGEN_STRONG_INLINE Packet16c preinterpret(const Packet16uc& a) { - return vreinterpretq_s8_u8(a); -} - -template <> -EIGEN_STRONG_INLINE Packet4uc preinterpret(const Packet4c& a) { - return static_cast(a); -} -template <> -EIGEN_STRONG_INLINE Packet8uc preinterpret(const Packet8c& a) { - return vreinterpret_u8_s8(a); -} -template <> -EIGEN_STRONG_INLINE Packet16uc preinterpret(const Packet16c& a) { - return vreinterpretq_u8_s8(a); -} - -template <> -EIGEN_STRONG_INLINE Packet4s preinterpret(const Packet4us& a) { - return vreinterpret_s16_u16(a); -} -template <> -EIGEN_STRONG_INLINE Packet8s preinterpret(const Packet8us& a) { - return vreinterpretq_s16_u16(a); -} - -template <> -EIGEN_STRONG_INLINE Packet4us preinterpret(const Packet4s& a) { - return vreinterpret_u16_s16(a); -} -template <> -EIGEN_STRONG_INLINE Packet8us preinterpret(const Packet8s& a) { - return vreinterpretq_u16_s16(a); -} - -template <> -EIGEN_STRONG_INLINE Packet2i preinterpret(const Packet2f& a) { - return vreinterpret_s32_f32(a); -} -template <> -EIGEN_STRONG_INLINE Packet2i preinterpret(const Packet2ui& a) { - return vreinterpret_s32_u32(a); -} -template <> -EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet4f& a) { - return vreinterpretq_s32_f32(a); -} -template <> -EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet4ui& a) { - return vreinterpretq_s32_u32(a); -} - 
-template <> -EIGEN_STRONG_INLINE Packet2ui preinterpret(const Packet2f& a) { - return vreinterpret_u32_f32(a); -} -template <> -EIGEN_STRONG_INLINE Packet2ui preinterpret(const Packet2i& a) { - return vreinterpret_u32_s32(a); -} -template <> -EIGEN_STRONG_INLINE Packet4ui preinterpret(const Packet4f& a) { - return vreinterpretq_u32_f32(a); -} -template <> -EIGEN_STRONG_INLINE Packet4ui preinterpret(const Packet4i& a) { - return vreinterpretq_u32_s32(a); -} - -template <> -EIGEN_STRONG_INLINE Packet2l preinterpret(const Packet2ul& a) { - return vreinterpretq_s64_u64(a); -} -template <> -EIGEN_STRONG_INLINE Packet2ul preinterpret(const Packet2l& a) { - return vreinterpretq_u64_s64(a); + return preinterpret(pcast(a, b, c, d, e, f, g, h)); } #if EIGEN_ARCH_ARM64 @@ -1193,6 +1193,31 @@ EIGEN_STRONG_INLINE Packet2ul preinterpret(const Packet2l& // pcast/preinterpret, Double //============================================================================== +template <> +EIGEN_STRONG_INLINE Packet2d preinterpret(const Packet2l& a) { + return Packet2d(vreinterpretq_f64_s64(a)); +} +template <> +EIGEN_STRONG_INLINE Packet2d preinterpret(const Packet2ul& a) { + return Packet2d(vreinterpretq_f64_u64(a)); +} +template <> +EIGEN_STRONG_INLINE Packet2l preinterpret(const Packet2d& a) { + return Packet2l(vreinterpretq_s64_f64(a)); +} +template <> +EIGEN_STRONG_INLINE Packet2ul preinterpret(const Packet2d& a) { + return Packet2ul(vreinterpretq_u64_f64(a)); +} +template <> +EIGEN_STRONG_INLINE Packet2d preinterpret(const Packet4i& a) { + return Packet2d(vreinterpretq_f64_s32(a)); +} +template <> +EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet2d& a) { + return Packet4i(vreinterpretq_s32_f64(a)); +} + template <> struct type_casting_traits { enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; @@ -1314,7 +1339,9 @@ struct type_casting_traits { template <> EIGEN_STRONG_INLINE Packet2d pcast(const Packet16c& a) { // Discard all but first two values. - return vcvt_f64_f32(pcast(vget_low_s8(a))); + // MSVC defines most intrinsics as macros, so we need to do this in two lines for portability. + Packet2f tmp = pcast(vget_low_s8(a)); + return vcvt_f64_f32(tmp); } template <> @@ -1324,7 +1351,8 @@ struct type_casting_traits { template <> EIGEN_STRONG_INLINE Packet2d pcast(const Packet16uc& a) { // Discard all but first two values. - return vcvt_f64_f32(pcast(vget_low_u8(a))); + Packet2f tmp = pcast(vget_low_u8(a)); + return vcvt_f64_f32(tmp); } template <> @@ -1334,7 +1362,8 @@ struct type_casting_traits { template <> EIGEN_STRONG_INLINE Packet2d pcast(const Packet8s& a) { // Discard all but first two values. - return vcvt_f64_f32(pcast(vget_low_s16(a))); + Packet2f tmp = pcast(vget_low_s16(a)); + return vcvt_f64_f32(tmp); } template <> @@ -1344,7 +1373,8 @@ struct type_casting_traits { template <> EIGEN_STRONG_INLINE Packet2d pcast(const Packet8us& a) { // Discard all but first two values. 
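On the two-line split above: the upstream comment only says that MSVC implements many intrinsics as macros. One common way that bites, shown here with a made-up macro rather than a real intrinsic, is that a comma inside a template-argument list is taken as a macro-argument separator, so hoisting the nested call into a named temporary sidesteps the problem.

// Hypothetical illustration only; FAKE_INTRINSIC is not a real intrinsic.
#define FAKE_INTRINSIC(x) (x)

template <class A, class B>
int convert(int v) { return v; }

// FAKE_INTRINSIC(convert<int, long>(0));  // error: macro given 2 arguments, expects 1
inline int two_line_form() {
  const int tmp = convert<int, long>(0);   // evaluate the templated call first
  return FAKE_INTRINSIC(tmp);              // the macro now sees a single argument
}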
- return vcvt_f64_f32(pcast(vget_low_u16(a))); + Packet2f tmp = pcast(vget_low_u16(a)); + return vcvt_f64_f32(tmp); } template <> @@ -1385,31 +1415,6 @@ EIGEN_STRONG_INLINE Packet2d pcast(const Packet2ul& a) { return vcvtq_f64_u64(a); } -template <> -EIGEN_STRONG_INLINE Packet2d preinterpret(const Packet2l& a) { - return vreinterpretq_f64_s64(a); -} -template <> -EIGEN_STRONG_INLINE Packet2d preinterpret(const Packet2ul& a) { - return vreinterpretq_f64_u64(a); -} -template <> -EIGEN_STRONG_INLINE Packet2l preinterpret(const Packet2d& a) { - return vreinterpretq_s64_f64(a); -} -template <> -EIGEN_STRONG_INLINE Packet2ul preinterpret(const Packet2d& a) { - return vreinterpretq_u64_f64(a); -} -template <> -EIGEN_STRONG_INLINE Packet2d preinterpret(const Packet4i& a) { - return vreinterpretq_f64_s32(a); -} -template <> -EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet2d& a) { - return vreinterpretq_s32_f64(a); -} - #endif // EIGEN_ARCH_ARM64 } // end namespace internal diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index 8fe22da..e9ab2c4 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -106,14 +106,9 @@ template<> EIGEN_STRONG_INLINE Packet2cf ploadu(const std::complex EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) { - Packet2cf res; -#ifdef EIGEN_VECTORIZE_SSE3 - res.v = _mm_castpd_ps(_mm_loaddup_pd(reinterpret_cast(&from))); -#else - res.v = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(&from))); - res.v = _mm_movelh_ps(res.v, res.v); -#endif - return res; + const float re = std::real(from); + const float im = std::imag(from); + return Packet2cf(_mm_set_ps(im, re, im, re)); } template<> EIGEN_STRONG_INLINE Packet2cf ploaddup(const std::complex* from) { return pset1(*from); } @@ -174,14 +169,9 @@ EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) { - // TODO optimize it for SSE3 and 4 - Packet2cf res = pmul(a, pconj(b)); - __m128 s = _mm_mul_ps(b.v,b.v); - return Packet2cf(_mm_div_ps(res.v,_mm_add_ps(s,vec4f_swizzle1(s, 1, 0, 3, 2)))); + return pdiv_complex(a, b); } - - //---------- double ---------- struct Packet1cd { @@ -299,10 +289,7 @@ EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d) template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) { - // TODO optimize it for SSE3 and 4 - Packet1cd res = pmul(a,pconj(b)); - __m128d s = _mm_mul_pd(b.v,b.v); - return Packet1cd(_mm_div_pd(res.v, _mm_add_pd(s,_mm_shuffle_pd(s, s, 0x1)))); + return pdiv_complex(a, b); } EIGEN_STRONG_INLINE Packet1cd pcplxflip/* */(const Packet1cd& x) diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index db102c7..b485e0d 100644 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -444,7 +444,7 @@ template<> EIGEN_STRONG_INLINE Packet16b pcmp_eq(const Packet16b& a, const Packe template<> EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) { return por(pcmp_lt(a,b), pcmp_eq(a,b)); } template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) { -#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 +#if EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC < 63 // There appears to be a bug in GCC, by which the optimizer may // flip the argument order in calls to _mm_min_ps, so we have to // resort to inline ASM here. 
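// Illustrative aside (not part of the patch): why the operand order of _mm_min_ps
// matters for the GCC workaround described in the surrounding comment. minps
// returns its *second* operand whenever either operand is NaN, so a compiler that
// swaps the arguments silently changes NaN propagation. Hypothetical check:
#include <xmmintrin.h>
#include <cmath>
inline void pmin_nan_order_demo() {
  __m128 qnan = _mm_set1_ps(std::nanf(""));
  __m128 one  = _mm_set1_ps(1.0f);
  __m128 r_ab = _mm_min_ps(qnan, one); // every lane is 1.0f (second operand wins)
  __m128 r_ba = _mm_min_ps(one, qnan); // every lane is NaN  (second operand wins)
  (void)r_ab; (void)r_ba;
}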
This is supposed to be fixed in gcc6.3, @@ -463,7 +463,7 @@ template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const #endif } template<> EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { -#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 +#if EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC < 63 // There appears to be a bug in GCC, by which the optimizer may // flip the argument order in calls to _mm_min_pd, so we have to // resort to inline ASM here. This is supposed to be fixed in gcc6.3, @@ -494,7 +494,7 @@ template<> EIGEN_STRONG_INLINE Packet4i pmin(const Packet4i& a, const template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) { -#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 +#if EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC < 63 // There appears to be a bug in GCC, by which the optimizer may // flip the argument order in calls to _mm_max_ps, so we have to // resort to inline ASM here. This is supposed to be fixed in gcc6.3, @@ -513,7 +513,7 @@ template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const #endif } template<> EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) { -#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 +#if EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC < 63 // There appears to be a bug in GCC, by which the optimizer may // flip the argument order in calls to _mm_max_pd, so we have to // resort to inline ASM here. This is supposed to be fixed in gcc6.3, diff --git a/Eigen/src/Core/arch/ZVector/Complex.h b/Eigen/src/Core/arch/ZVector/Complex.h index 0b9b33d..3c38066 100644 --- a/Eigen/src/Core/arch/ZVector/Complex.h +++ b/Eigen/src/Core/arch/ZVector/Complex.h @@ -16,7 +16,9 @@ namespace Eigen { namespace internal { #if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) -static Packet4ui p4ui_CONJ_XOR = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 }; //vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_MZERO); +inline Packet4ui p4ui_CONJ_XOR() { + return { 0x00000000, 0x80000000, 0x00000000, 0x80000000 }; //vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_MZERO); +} #endif static Packet2ul p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2d_ZERO_, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 }; @@ -91,8 +93,18 @@ template<> struct packet_traits > : default_packet_traits }; }; -template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2cf half; }; -template<> struct unpacket_traits { typedef std::complex type; enum {size=1, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet1cd half; }; +template<> struct unpacket_traits { + typedef std::complex type; + enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; + typedef Packet2cf half; + typedef Packet4f as_real; +}; +template<> struct unpacket_traits { + typedef std::complex type; + enum {size=1, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; + typedef Packet1cd half; + typedef Packet2d as_real; +}; /* Forward declaration */ EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel); @@ -150,7 +162,7 @@ template<> EIGEN_STRONG_INLINE void prefetch >(const std::c template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet1cd& a) { - std::complex EIGEN_ALIGN16 res; + EIGEN_ALIGN16 std::complex res; pstore >(&res, a); 
return res; @@ -169,10 +181,7 @@ EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d) template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) { - // TODO optimize it for AltiVec - Packet1cd res = pmul(a,pconj(b)); - Packet2d s = vec_madd(b.v, b.v, p2d_ZERO_); - return Packet1cd(pdiv(res.v, s + vec_perm(s, s, p16uc_REVERSE64))); + return pdiv_complex(a, b); } EIGEN_STRONG_INLINE Packet1cd pcplxflip/**/(const Packet1cd& x) @@ -195,7 +204,7 @@ template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cf& a) { - std::complex EIGEN_ALIGN16 res[2]; + EIGEN_ALIGN16 std::complex res[2]; pstore >(res, a); return res[0]; @@ -225,14 +234,14 @@ template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex EIGEN_DEVICE_FUNC inline Packet2cf pgather, Packet2cf>(const std::complex* from, Index stride) { - std::complex EIGEN_ALIGN16 af[2]; + EIGEN_ALIGN16 std::complex af[2]; af[0] = from[0*stride]; af[1] = from[1*stride]; return pload(af); } template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf>(std::complex* to, const Packet2cf& from, Index stride) { - std::complex EIGEN_ALIGN16 af[2]; + EIGEN_ALIGN16 std::complex af[2]; pstore >((std::complex *) af, from); to[0*stride] = af[0]; to[1*stride] = af[1]; @@ -308,11 +317,7 @@ EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) { - // TODO optimize it for AltiVec - Packet2cf res; - res.cd[0] = pdiv(a.cd[0], b.cd[0]); - res.cd[1] = pdiv(a.cd[1], b.cd[1]); - return res; + return pdiv_complex(a, b); } EIGEN_STRONG_INLINE Packet2cf pcplxflip/**/(const Packet2cf& x) @@ -342,7 +347,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packe Packet4f tmp = { eq[1], eq[0], eq[3], eq[2] }; return (Packet2cf)pand(eq, tmp); } -template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { return Packet2cf(pxor(a.v, reinterpret_cast(p4ui_CONJ_XOR))); } +template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { return Packet2cf(pxor(a.v, reinterpret_cast(p4ui_CONJ_XOR()))); } template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) { Packet4f a_re, a_im, prod, prod_im; @@ -355,7 +360,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, con // multiply a_im * b and get the conjugate result prod_im = a_im * b.v; - prod_im = pxor(prod_im, reinterpret_cast(p4ui_CONJ_XOR)); + prod_im = pxor(prod_im, reinterpret_cast(p4ui_CONJ_XOR())); // permute back to a proper order prod_im = vec_perm(prod_im, prod_im, p16uc_COMPLEX32_REV); @@ -394,10 +399,7 @@ EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) { - // TODO optimize it for AltiVec - Packet2cf res = pmul(a, pconj(b)); - Packet4f s = pmul(b.v, b.v); - return Packet2cf(pdiv(res.v, padd(s, vec_perm(s, s, p16uc_COMPLEX32_REV)))); + return pdiv_complex(a, b); } template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip(const Packet2cf& x) diff --git a/Eigen/src/Core/arch/ZVector/PacketMath.h b/Eigen/src/Core/arch/ZVector/PacketMath.h index 1f55a90..a7b59c8 100644 --- a/Eigen/src/Core/arch/ZVector/PacketMath.h +++ b/Eigen/src/Core/arch/ZVector/PacketMath.h @@ -91,8 +91,8 @@ static _EIGEN_DECLARE_CONST_FAST_Packet2l(ZERO, 0); static _EIGEN_DECLARE_CONST_FAST_Packet2l(ONE, 1); static Packet2d p2d_ONE = { 1.0, 1.0 }; -static Packet2d p2d_ZERO_ = { numext::bit_cast0x8000000000000000ull), - 
numext::bit_cast0x8000000000000000ull) }; +static Packet2d p2d_ZERO_ = { numext::bit_cast(0x8000000000000000ull), + numext::bit_cast(0x8000000000000000ull) }; #if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) #define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \ @@ -358,7 +358,7 @@ pbroadcast4(const double *a, template<> EIGEN_DEVICE_FUNC inline Packet4i pgather(const int* from, Index stride) { - int EIGEN_ALIGN16 ai[4]; + EIGEN_ALIGN16 int ai[4]; ai[0] = from[0*stride]; ai[1] = from[1*stride]; ai[2] = from[2*stride]; @@ -368,7 +368,7 @@ template<> EIGEN_DEVICE_FUNC inline Packet4i pgather(const int* f template<> EIGEN_DEVICE_FUNC inline Packet2d pgather(const double* from, Index stride) { - double EIGEN_ALIGN16 af[2]; + EIGEN_ALIGN16 double af[2]; af[0] = from[0*stride]; af[1] = from[1*stride]; return pload(af); @@ -376,7 +376,7 @@ template<> EIGEN_DEVICE_FUNC inline Packet2d pgather(const dou template<> EIGEN_DEVICE_FUNC inline void pscatter(int* to, const Packet4i& from, Index stride) { - int EIGEN_ALIGN16 ai[4]; + EIGEN_ALIGN16 int ai[4]; pstore((int *)ai, from); to[0*stride] = ai[0]; to[1*stride] = ai[1]; @@ -386,7 +386,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter(int* to, const template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, const Packet2d& from, Index stride) { - double EIGEN_ALIGN16 af[2]; + EIGEN_ALIGN16 double af[2]; pstore(af, from); to[0*stride] = af[0]; to[1*stride] = af[1]; @@ -460,8 +460,8 @@ template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { EIGEN_ZVECTOR_PREFETCH(addr); } template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { EIGEN_ZVECTOR_PREFETCH(addr); } -template<> EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; pstore(x, a); return x[0]; } -template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore(x, a); return x[0]; } +template<> EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { EIGEN_ALIGN16 int x[4]; pstore(x, a); return x[0]; } +template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { EIGEN_ALIGN16 double x[2]; pstore(x, a); return x[0]; } template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { @@ -639,7 +639,7 @@ pbroadcast4(const float *a, template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) { - float EIGEN_ALIGN16 ai[4]; + EIGEN_ALIGN16 float ai[4]; ai[0] = from[0*stride]; ai[1] = from[1*stride]; ai[2] = from[2*stride]; @@ -649,7 +649,7 @@ template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const floa template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, Index stride) { - float EIGEN_ALIGN16 ai[4]; + EIGEN_ALIGN16 float ai[4]; pstore((float *)ai, from); to[0*stride] = ai[0]; to[1*stride] = ai[1]; @@ -785,7 +785,7 @@ template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) return p; } -template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { float EIGEN_ALIGN16 x[2]; vec_st2f(a.v4f[0], &x[0]); return x[0]; } +template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { EIGEN_ALIGN16 float x[2]; vec_st2f(a.v4f[0], &x[0]); return x[0]; } template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { @@ -943,7 +943,7 @@ pbroadcast4(const float *a, template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) { - float EIGEN_ALIGN16 af[4]; + EIGEN_ALIGN16 float af[4]; af[0] = from[0*stride]; af[1] = from[1*stride]; af[2] = 
from[2*stride]; @@ -953,7 +953,7 @@ template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const floa template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, Index stride) { - float EIGEN_ALIGN16 af[4]; + EIGEN_ALIGN16 float af[4]; pstore((float*)af, from); to[0*stride] = af[0]; to[1*stride] = af[1]; @@ -978,7 +978,7 @@ template<> EIGEN_STRONG_INLINE Packet4f pround (const Packet4f& a) { r template<> EIGEN_STRONG_INLINE Packet4f pceil (const Packet4f& a) { return vec_ceil(a); } template<> EIGEN_STRONG_INLINE Packet4f pfloor (const Packet4f& a) { return vec_floor(a); } template<> EIGEN_STRONG_INLINE Packet4f pabs (const Packet4f& a) { return vec_abs(a); } -template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; pstore(x, a); return x[0]; } +template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { EIGEN_ALIGN16 float x[4]; pstore(x, a); return x[0]; } template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) { diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index f35b760..4c649a2 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -2269,8 +2269,8 @@ EIGEN_DONT_INLINE void gemm_pack_lhs0) { Index remaining_rows = rows-i; @@ -2290,21 +2290,21 @@ EIGEN_DONT_INLINE void gemm_pack_lhs kernel; - for (int p = 0; p < psize; ++p) kernel.packet[p] = lhs.template loadPacket(i+p+m, k); + for (Index p = 0; p < psize; ++p) kernel.packet[p] = lhs.template loadPacket(i+p+m, k); ptranspose(kernel); - for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p])); + for (Index p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p])); } else if (HasHalf && psize == HalfPacketSize) { gone_half = true; PacketBlock kernel_half; - for (int p = 0; p < psize; ++p) kernel_half.packet[p] = lhs.template loadPacket(i+p+m, k); + for (Index p = 0; p < psize; ++p) kernel_half.packet[p] = lhs.template loadPacket(i+p+m, k); ptranspose(kernel_half); - for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_half.packet[p])); + for (Index p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_half.packet[p])); } else if (HasQuarter && psize == QuarterPacketSize) { gone_quarter = true; PacketBlock kernel_quarter; - for (int p = 0; p < psize; ++p) kernel_quarter.packet[p] = lhs.template loadPacket(i+p+m, k); + for (Index p = 0; p < psize; ++p) kernel_quarter.packet[p] = lhs.template loadPacket(i+p+m, k); ptranspose(kernel_quarter); - for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_quarter.packet[p])); + for (Index p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_quarter.packet[p])); } } count += psize*pack; diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index caa65fc..73ddd26 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -59,9 +59,9 @@ typedef gebp_traits Traits; typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; static void run(Index rows, Index cols, Index depth, - const LhsScalar* _lhs, Index lhsStride, - const RhsScalar* _rhs, Index rhsStride, - ResScalar* _res, Index resIncr, Index resStride, + const LhsScalar* lhs_, Index lhsStride, + const RhsScalar* rhs_, Index rhsStride, + ResScalar* res_, Index resIncr, Index resStride, 
ResScalar alpha, level3_blocking& blocking, GemmParallelInfo* info = 0) @@ -69,9 +69,9 @@ static void run(Index rows, Index cols, Index depth, typedef const_blas_data_mapper LhsMapper; typedef const_blas_data_mapper RhsMapper; typedef blas_data_mapper ResMapper; - LhsMapper lhs(_lhs, lhsStride); - RhsMapper rhs(_rhs, rhsStride); - ResMapper res(_res, resStride, resIncr); + LhsMapper lhs(lhs_, lhsStride); + RhsMapper rhs(rhs_, rhsStride); + ResMapper res(res_, resStride, resIncr); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h index 6ba0d9b..dddba6f 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h @@ -60,9 +60,9 @@ template { typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; - static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* _lhs, Index lhsStride, - const RhsScalar* _rhs, Index rhsStride, - ResScalar* _res, Index resIncr, Index resStride, + static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* lhs_, Index lhsStride, + const RhsScalar* rhs_, Index rhsStride, + ResScalar* res_, Index resIncr, Index resStride, const ResScalar& alpha, level3_blocking& blocking) { typedef gebp_traits Traits; @@ -70,9 +70,9 @@ struct general_matrix_matrix_triangular_product LhsMapper; typedef const_blas_data_mapper RhsMapper; typedef blas_data_mapper ResMapper; - LhsMapper lhs(_lhs,lhsStride); - RhsMapper rhs(_rhs,rhsStride); - ResMapper res(_res, resStride, resIncr); + LhsMapper lhs(lhs_,lhsStride); + RhsMapper rhs(rhs_,rhsStride); + ResMapper res(res_, resStride, resIncr); Index kc = blocking.kc(); Index mc = (std::min)(size,blocking.mc()); @@ -113,7 +113,7 @@ struct general_matrix_matrix_triangular_product::ret }; - void operator()(ResScalar* _res, Index resIncr, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha) + void operator()(ResScalar* res_, Index resIncr, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha) { typedef blas_data_mapper ResMapper; typedef blas_data_mapper BufferMapper; - ResMapper res(_res, resStride, resIncr); + ResMapper res(res_, resStride, resIncr); gebp_kernel gebp_kernel1; gebp_kernel gebp_kernel2; @@ -300,14 +300,19 @@ struct general_product_to_triangular_selector } }; -template -template -EIGEN_DEVICE_FUNC TriangularView& TriangularViewImpl::_assignProduct(const ProductType& prod, const Scalar& alpha, bool beta) -{ - EIGEN_STATIC_ASSERT((UpLo&UnitDiag)==0, WRITING_TO_TRIANGULAR_PART_WITH_UNIT_DIAGONAL_IS_NOT_SUPPORTED); +template +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename TriangularViewImpl<_MatrixType, _Mode, Dense>::TriangularViewType& +TriangularViewImpl<_MatrixType, _Mode, Dense>::_assignProduct( + const ProductType& prod, const typename TriangularViewImpl<_MatrixType, _Mode, Dense>::Scalar& alpha, bool beta) { + EIGEN_STATIC_ASSERT((_Mode & UnitDiag) == 0, WRITING_TO_TRIANGULAR_PART_WITH_UNIT_DIAGONAL_IS_NOT_SUPPORTED); eigen_assert(derived().nestedExpression().rows() == prod.rows() && derived().cols() == prod.cols()); - general_product_to_triangular_selector::InnerSize==1>::run(derived().nestedExpression().const_cast_derived(), prod, alpha, beta); + 
general_product_to_triangular_selector<_MatrixType, ProductType, _Mode, + internal::traits::InnerSize == 1>::run(derived() + .nestedExpression() + .const_cast_derived(), + prod, alpha, beta); return derived(); } diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h index dfb6aeb..974a047 100644 --- a/Eigen/src/Core/products/GeneralMatrixVector.h +++ b/Eigen/src/Core/products/GeneralMatrixVector.h @@ -359,6 +359,11 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product::type UnsignedIndex; + const Index fullColBlockEnd = LhsPacketSize * (UnsignedIndex(cols) / LhsPacketSize); + const Index halfColBlockEnd = LhsPacketSizeHalf * (UnsignedIndex(cols) / LhsPacketSizeHalf); + const Index quarterColBlockEnd = LhsPacketSizeQuarter * (UnsignedIndex(cols) / LhsPacketSizeQuarter); + Index i=0; for(; i(ResScalar(0)), c7 = pset1(ResScalar(0)); - Index j=0; - for(; j+LhsPacketSize<=cols; j+=LhsPacketSize) + for (Index j = 0; j < fullColBlockEnd; j += LhsPacketSize) { RhsPacket b0 = rhs.template load(j,0); @@ -393,7 +397,8 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product(ResScalar(0)), c3 = pset1(ResScalar(0)); - Index j=0; - for(; j+LhsPacketSize<=cols; j+=LhsPacketSize) + for (Index j = 0; j < fullColBlockEnd; j += LhsPacketSize) { RhsPacket b0 = rhs.template load(j,0); @@ -436,7 +440,8 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product(ResScalar(0)), c1 = pset1(ResScalar(0)); - Index j=0; - for(; j+LhsPacketSize<=cols; j+=LhsPacketSize) + for (Index j = 0; j < fullColBlockEnd; j += LhsPacketSize) { RhsPacket b0 = rhs.template load(j,0); @@ -465,7 +469,8 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product(ResScalar(0)); ResPacketHalf c0_h = pset1(ResScalar(0)); ResPacketQuarter c0_q = pset1(ResScalar(0)); - Index j=0; - for(; j+LhsPacketSize<=cols; j+=LhsPacketSize) + + for (Index j = 0; j < fullColBlockEnd; j += LhsPacketSize) { RhsPacket b0 = rhs.template load(j,0); c0 = pcj.pmadd(lhs.template load(i,j),b0,c0); } ResScalar cc0 = predux(c0); if (HasHalf) { - for(; j+LhsPacketSizeHalf<=cols; j+=LhsPacketSizeHalf) + for (Index j = fullColBlockEnd; j < halfColBlockEnd; j += LhsPacketSizeHalf) { RhsPacketHalf b0 = rhs.template load(j,0); c0_h = pcj_half.pmadd(lhs.template load(i,j),b0,c0_h); @@ -496,14 +501,14 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product(j,0); c0_q = pcj_quarter.pmadd(lhs.template load(i,j),b0,c0_q); } cc0 += predux(c0_q); } - for(; j::type>::half HalfPacket; typedef typename unpacket_traits::type>::half>::half QuarterPacket; @@ -53,7 +53,7 @@ struct symm_pack_lhs HasHalf = (int)HalfPacketSize < (int)PacketSize, HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize}; - const_blas_data_mapper lhs(_lhs,lhsStride); + const_blas_data_mapper lhs(lhs_,lhsStride); Index count = 0; //Index peeled_mc3 = (rows/Pack1)*Pack1; @@ -101,11 +101,11 @@ template struct symm_pack_rhs { enum { PacketSize = packet_traits::size }; - void operator()(Scalar* blockB, const Scalar* _rhs, Index rhsStride, Index rows, Index cols, Index k2) + void operator()(Scalar* blockB, const Scalar* rhs_, Index rhsStride, Index rows, Index cols, Index k2) { Index end_k = k2 + rows; Index count = 0; - const_blas_data_mapper rhs(_rhs,rhsStride); + const_blas_data_mapper rhs(rhs_,rhsStride); Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0; Index packet_cols4 = nr>=4 ? 
(cols/4) * 4 : 0; @@ -330,8 +330,8 @@ struct product_selfadjoint_matrix& blocking); }; @@ -342,9 +342,9 @@ template EIGEN_DONT_INLINE void product_selfadjoint_matrix::run( Index rows, Index cols, - const Scalar* _lhs, Index lhsStride, - const Scalar* _rhs, Index rhsStride, - Scalar* _res, Index resIncr, Index resStride, + const Scalar* lhs_, Index lhsStride, + const Scalar* rhs_, Index rhsStride, + Scalar* res_, Index resIncr, Index resStride, const Scalar& alpha, level3_blocking& blocking) { Index size = rows; @@ -355,10 +355,10 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix LhsTransposeMapper; typedef const_blas_data_mapper RhsMapper; typedef blas_data_mapper ResMapper; - LhsMapper lhs(_lhs,lhsStride); - LhsTransposeMapper lhs_transpose(_lhs,lhsStride); - RhsMapper rhs(_rhs,rhsStride); - ResMapper res(_res, resStride, resIncr); + LhsMapper lhs(lhs_,lhsStride); + LhsTransposeMapper lhs_transpose(lhs_,lhsStride); + RhsMapper rhs(rhs_,rhsStride); + ResMapper res(res_, resStride, resIncr); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction @@ -425,8 +425,8 @@ struct product_selfadjoint_matrix& blocking); }; @@ -437,9 +437,9 @@ template EIGEN_DONT_INLINE void product_selfadjoint_matrix::run( Index rows, Index cols, - const Scalar* _lhs, Index lhsStride, - const Scalar* _rhs, Index rhsStride, - Scalar* _res, Index resIncr, Index resStride, + const Scalar* lhs_, Index lhsStride, + const Scalar* rhs_, Index rhsStride, + Scalar* res_, Index resIncr, Index resStride, const Scalar& alpha, level3_blocking& blocking) { Index size = cols; @@ -448,8 +448,8 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix LhsMapper; typedef blas_data_mapper ResMapper; - LhsMapper lhs(_lhs,lhsStride); - ResMapper res(_res,resStride, resIncr); + LhsMapper lhs(lhs_,lhsStride); + ResMapper res(res_,resStride, resIncr); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction @@ -466,7 +466,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix GEPP for(Index i2=0; i2::IsComplex && Conjugate> cj; -// const_blas_data_mapper lhs(_lhs,lhsStride); +// const_blas_data_mapper lhs(lhs_,lhsStride); // int count = 0; // const int peeled_mc = (rows/mr)*mr; // for(int i=0; i& blocking); }; @@ -110,9 +110,9 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix::run( Index _rows, Index _cols, Index _depth, - const Scalar* _lhs, Index lhsStride, - const Scalar* _rhs, Index rhsStride, - Scalar* _res, Index resIncr, Index resStride, + const Scalar* lhs_, Index lhsStride, + const Scalar* rhs_, Index rhsStride, + Scalar* res_, Index resIncr, Index resStride, const Scalar& alpha, level3_blocking& blocking) { // strip zeros @@ -124,9 +124,9 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix LhsMapper; typedef const_blas_data_mapper RhsMapper; typedef blas_data_mapper ResMapper; - LhsMapper lhs(_lhs,lhsStride); - RhsMapper rhs(_rhs,rhsStride); - ResMapper res(_res, resStride, resIncr); + LhsMapper lhs(lhs_,lhsStride); + RhsMapper rhs(rhs_,rhsStride); + ResMapper res(res_, resStride, resIncr); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction @@ -254,8 +254,8 @@ struct product_triangular_matrix_matrix& blocking); }; @@ -268,9 +268,9 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix::run( Index _rows, Index _cols, Index 
_depth, - const Scalar* _lhs, Index lhsStride, - const Scalar* _rhs, Index rhsStride, - Scalar* _res, Index resIncr, Index resStride, + const Scalar* lhs_, Index lhsStride, + const Scalar* rhs_, Index rhsStride, + Scalar* res_, Index resIncr, Index resStride, const Scalar& alpha, level3_blocking& blocking) { const Index PacketBytes = packet_traits::size*sizeof(Scalar); @@ -283,9 +283,9 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix LhsMapper; typedef const_blas_data_mapper RhsMapper; typedef blas_data_mapper ResMapper; - LhsMapper lhs(_lhs,lhsStride); - RhsMapper rhs(_rhs,rhsStride); - ResMapper res(_res, resStride, resIncr); + LhsMapper lhs(lhs_,lhsStride); + RhsMapper rhs(rhs_,rhsStride); + ResMapper res(res_, resStride, resIncr); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction diff --git a/Eigen/src/Core/products/TriangularMatrixVector.h b/Eigen/src/Core/products/TriangularMatrixVector.h index 76bfa15..0a25748 100644 --- a/Eigen/src/Core/products/TriangularMatrixVector.h +++ b/Eigen/src/Core/products/TriangularMatrixVector.h @@ -26,30 +26,30 @@ struct triangular_matrix_vector_product EIGEN_DONT_INLINE void triangular_matrix_vector_product - ::run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride, - const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const RhsScalar& alpha) + ::run(Index rows_, Index cols_, const LhsScalar* lhs_, Index lhsStride, + const RhsScalar* rhs_, Index rhsIncr, ResScalar* res_, Index resIncr, const RhsScalar& alpha) { static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH; - Index size = (std::min)(_rows,_cols); - Index rows = IsLower ? _rows : (std::min)(_rows,_cols); - Index cols = IsLower ? (std::min)(_rows,_cols) : _cols; + Index size = (std::min)(rows_,cols_); + Index rows = IsLower ? rows_ : (std::min)(rows_,cols_); + Index cols = IsLower ? (std::min)(rows_,cols_) : cols_; typedef Map, 0, OuterStride<> > LhsMap; - const LhsMap lhs(_lhs,rows,cols,OuterStride<>(lhsStride)); + const LhsMap lhs(lhs_,rows,cols,OuterStride<>(lhsStride)); typename conj_expr_if::type cjLhs(lhs); typedef Map, 0, InnerStride<> > RhsMap; - const RhsMap rhs(_rhs,cols,InnerStride<>(rhsIncr)); + const RhsMap rhs(rhs_,cols,InnerStride<>(rhsIncr)); typename conj_expr_if::type cjRhs(rhs); typedef Map > ResMap; - ResMap res(_res,rows); + ResMap res(res_,rows); typedef const_blas_data_mapper LhsMapper; typedef const_blas_data_mapper RhsMapper; @@ -84,7 +84,7 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product EIGEN_DONT_INLINE void triangular_matrix_vector_product - ::run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride, - const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const ResScalar& alpha) + ::run(Index rows_, Index cols_, const LhsScalar* lhs_, Index lhsStride, + const RhsScalar* rhs_, Index rhsIncr, ResScalar* res_, Index resIncr, const ResScalar& alpha) { static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH; - Index diagSize = (std::min)(_rows,_cols); - Index rows = IsLower ? _rows : diagSize; - Index cols = IsLower ? diagSize : _cols; + Index diagSize = (std::min)(rows_,cols_); + Index rows = IsLower ? rows_ : diagSize; + Index cols = IsLower ? 
diagSize : cols_; typedef Map, 0, OuterStride<> > LhsMap; - const LhsMap lhs(_lhs,rows,cols,OuterStride<>(lhsStride)); + const LhsMap lhs(lhs_,rows,cols,OuterStride<>(lhsStride)); typename conj_expr_if::type cjLhs(lhs); typedef Map > RhsMap; - const RhsMap rhs(_rhs,cols); + const RhsMap rhs(rhs_,cols); typename conj_expr_if::type cjRhs(rhs); typedef Map, 0, InnerStride<> > ResMap; - ResMap res(_res,rows,InnerStride<>(resIncr)); + ResMap res(res_,rows,InnerStride<>(resIncr)); typedef const_blas_data_mapper LhsMapper; typedef const_blas_data_mapper RhsMapper; diff --git a/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h b/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h index 3d47a2b..0f8d3a1 100644 --- a/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +++ b/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h @@ -50,18 +50,18 @@ struct triangular_matrix_vector_product_trmv : #define EIGEN_BLAS_TRMV_SPECIALIZE(Scalar) \ template \ struct triangular_matrix_vector_product { \ - static void run(Index _rows, Index _cols, const Scalar* _lhs, Index lhsStride, \ - const Scalar* _rhs, Index rhsIncr, Scalar* _res, Index resIncr, Scalar alpha) { \ + static void run(Index rows_, Index cols_, const Scalar* lhs_, Index lhsStride, \ + const Scalar* rhs_, Index rhsIncr, Scalar* res_, Index resIncr, Scalar alpha) { \ triangular_matrix_vector_product_trmv::run( \ - _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \ + rows_, cols_, lhs_, lhsStride, rhs_, rhsIncr, res_, resIncr, alpha); \ } \ }; \ template \ struct triangular_matrix_vector_product { \ - static void run(Index _rows, Index _cols, const Scalar* _lhs, Index lhsStride, \ - const Scalar* _rhs, Index rhsIncr, Scalar* _res, Index resIncr, Scalar alpha) { \ + static void run(Index rows_, Index cols_, const Scalar* lhs_, Index lhsStride, \ + const Scalar* rhs_, Index rhsIncr, Scalar* res_, Index resIncr, Scalar alpha) { \ triangular_matrix_vector_product_trmv::run( \ - _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \ + rows_, cols_, lhs_, lhsStride, rhs_, rhsIncr, res_, resIncr, alpha); \ } \ }; @@ -81,23 +81,23 @@ struct triangular_matrix_vector_product_trmv::run( \ - _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \ + rows_, cols_, lhs_, lhsStride, rhs_, rhsIncr, res_, resIncr, alpha); \ return; \ }\ - Index size = (std::min)(_rows,_cols); \ - Index rows = IsLower ? _rows : size; \ - Index cols = IsLower ? size : _cols; \ + Index size = (std::min)(rows_,cols_); \ + Index rows = IsLower ? rows_ : size; \ + Index cols = IsLower ? size : cols_; \ \ typedef VectorX##EIGPREFIX VectorRhs; \ EIGTYPE *x, *y;\ \ /* Set x*/ \ - Map > rhs(_rhs,cols,InnerStride<>(rhsIncr)); \ + Map > rhs(rhs_,cols,InnerStride<>(rhsIncr)); \ VectorRhs x_tmp; \ if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \ x = x_tmp.data(); \ @@ -121,24 +121,24 @@ struct triangular_matrix_vector_product_trmv(rows-size); \ n = convert_index(size); \ } \ else { \ x += size; \ - y = _res; \ - a = _lhs + size*lda; \ + y = res_; \ + a = lhs_ + size*lda; \ m = convert_index(size); \ n = convert_index(cols-size); \ } \ @@ -170,23 +170,23 @@ struct triangular_matrix_vector_product_trmv::run( \ - _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \ + rows_, cols_, lhs_, lhsStride, rhs_, rhsIncr, res_, resIncr, alpha); \ return; \ }\ - Index size = (std::min)(_rows,_cols); \ - Index rows = IsLower ? _rows : size; \ - Index cols = IsLower ? 
size : _cols; \ + Index size = (std::min)(rows_,cols_); \ + Index rows = IsLower ? rows_ : size; \ + Index cols = IsLower ? size : cols_; \ \ typedef VectorX##EIGPREFIX VectorRhs; \ EIGTYPE *x, *y;\ \ /* Set x*/ \ - Map > rhs(_rhs,cols,InnerStride<>(rhsIncr)); \ + Map > rhs(rhs_,cols,InnerStride<>(rhsIncr)); \ VectorRhs x_tmp; \ if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \ x = x_tmp.data(); \ @@ -210,24 +210,24 @@ struct triangular_matrix_vector_product_trmv(rows-size); \ n = convert_index(size); \ } \ else { \ x += size; \ - y = _res; \ - a = _lhs + size; \ + y = res_; \ + a = lhs_ + size; \ m = convert_index(size); \ n = convert_index(cols-size); \ } \ diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h index af4e696..7d51426 100644 --- a/Eigen/src/Core/util/ConfigureVectorization.h +++ b/Eigen/src/Core/util/ConfigureVectorization.h @@ -339,7 +339,7 @@ extern "C" { // In theory we should only include immintrin.h and not the other *mmintrin.h header files directly. // Doing so triggers some issues with ICC. However old gcc versions seems to not have this file, thus: - #if EIGEN_COMP_ICC >= 1110 + #if EIGEN_COMP_ICC >= 1110 || EIGEN_COMP_EMSCRIPTEN #include #else #include @@ -363,10 +363,11 @@ #endif } // end extern "C" - #elif defined __VSX__ + #elif defined(__VSX__) && !defined(__APPLE__) #define EIGEN_VECTORIZE - #define EIGEN_VECTORIZE_VSX + #define EIGEN_VECTORIZE_VSX 1 + #define EIGEN_VECTORIZE_FMA #include // We need to #undef all these ugly tokens defined in // => use __vector instead of vector @@ -378,6 +379,7 @@ #define EIGEN_VECTORIZE #define EIGEN_VECTORIZE_ALTIVEC + #define EIGEN_VECTORIZE_FMA #include // We need to #undef all these ugly tokens defined in // => use __vector instead of vector @@ -438,13 +440,20 @@ #include #endif -#if defined(__F16C__) && (!defined(EIGEN_GPUCC) && (!defined(EIGEN_COMP_CLANG) || EIGEN_COMP_CLANG>=380)) +// Enable FMA for ARM. +#if defined(__ARM_FEATURE_FMA) +#define EIGEN_VECTORIZE_FMA +#endif + +#if defined(__F16C__) && !defined(EIGEN_GPUCC) && (!EIGEN_COMP_CLANG_STRICT || EIGEN_COMP_CLANG>=380) // We can use the optimized fp16 to float and float to fp16 conversion routines #define EIGEN_HAS_FP16_C - #if defined(EIGEN_COMP_CLANG) - // Workaround for clang: The FP16C intrinsics for clang are included by - // immintrin.h, as opposed to emmintrin.h as suggested by Intel: + #if EIGEN_COMP_GNUC + // Make sure immintrin.h is included, even if e.g. vectorization is + // explicitly disabled (see also issue #2395). + // Note that FP16C intrinsics for gcc and clang are included by immintrin.h, + // as opposed to emmintrin.h as suggested by Intel: // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#othertechs=FP16C&expand=1711 #include #endif diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h index 35dcaa7..0667b1c 100644 --- a/Eigen/src/Core/util/Constants.h +++ b/Eigen/src/Core/util/Constants.h @@ -134,7 +134,7 @@ const unsigned int LinearAccessBit = 0x10; * Means the expression has a coeffRef() method, i.e. is writable as its individual coefficients are directly addressable. * This rules out read-only expressions. 
* - * Note that DirectAccessBit and LvalueBit are mutually orthogonal, as there are examples of expression having one but note + * Note that DirectAccessBit and LvalueBit are mutually orthogonal, as there are examples of expression having one but not * the other: * \li writable expressions that don't have a very simple memory layout as a strided array, have LvalueBit but not DirectAccessBit * \li Map-to-const expressions, for example Map, have DirectAccessBit but not LvalueBit diff --git a/Eigen/src/Core/util/DisableStupidWarnings.h b/Eigen/src/Core/util/DisableStupidWarnings.h index fe0cfec..0865fb6 100644 --- a/Eigen/src/Core/util/DisableStupidWarnings.h +++ b/Eigen/src/Core/util/DisableStupidWarnings.h @@ -1,9 +1,10 @@ #ifndef EIGEN_WARNINGS_DISABLED #define EIGEN_WARNINGS_DISABLED -#ifdef _MSC_VER +#if defined(_MSC_VER) // 4100 - unreferenced formal parameter (occurred e.g. in aligned_allocator::destroy(pointer p)) // 4101 - unreferenced local variable + // 4127 - conditional expression is constant // 4181 - qualifier applied to reference type ignored // 4211 - nonstandard extension used : redefined extern to static // 4244 - 'argument' : conversion from 'type1' to 'type2', possible loss of data @@ -19,7 +20,7 @@ #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS #pragma warning( push ) #endif - #pragma warning( disable : 4100 4101 4181 4211 4244 4273 4324 4503 4512 4522 4700 4714 4717 4800) + #pragma warning( disable : 4100 4101 4127 4181 4211 4244 4273 4324 4503 4512 4522 4700 4714 4717 4800) #elif defined __INTEL_COMPILER // 2196 - routine is both "inline" and "noinline" ("noinline" assumed) @@ -35,25 +36,28 @@ #pragma warning disable 2196 279 1684 2259 #elif defined __clang__ - // -Wconstant-logical-operand - warning: use of logical && with constant operand; switch to bitwise & or remove constant - // this is really a stupid warning as it warns on compile-time expressions involving enums #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS #pragma clang diagnostic push #endif - #pragma clang diagnostic ignored "-Wconstant-logical-operand" - #if __clang_major__ >= 3 && __clang_minor__ >= 5 - #pragma clang diagnostic ignored "-Wabsolute-value" - #endif - #if __clang_major__ >= 10 - #pragma clang diagnostic ignored "-Wimplicit-int-float-conversion" - #endif - #if ( defined(__ALTIVEC__) || defined(__VSX__) ) && __cplusplus < 201103L - // warning: generic selections are a C11-specific feature - // ignoring warnings thrown at vec_ctf in Altivec/PacketMath.h - #pragma clang diagnostic ignored "-Wc11-extensions" + #if defined(__has_warning) + // -Wconstant-logical-operand - warning: use of logical && with constant operand; switch to bitwise & or remove constant + // this is really a stupid warning as it warns on compile-time expressions involving enums + #if __has_warning("-Wconstant-logical-operand") + #pragma clang diagnostic ignored "-Wconstant-logical-operand" + #endif + #if __has_warning("-Wimplicit-int-float-conversion") + #pragma clang diagnostic ignored "-Wimplicit-int-float-conversion" + #endif + #if ( defined(__ALTIVEC__) || defined(__VSX__) ) && __cplusplus < 201103L + // warning: generic selections are a C11-specific feature + // ignoring warnings thrown at vec_ctf in Altivec/PacketMath.h + #if __has_warning("-Wc11-extensions") + #pragma clang diagnostic ignored "-Wc11-extensions" + #endif + #endif #endif -#elif defined __GNUC__ +#elif defined __GNUC__ && !defined(__FUJITSU) #if (!defined(EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS)) && (__GNUC__ > 4 || (__GNUC__ == 4 && 
__GNUC_MINOR__ >= 6)) #pragma GCC diagnostic push @@ -74,25 +78,53 @@ #endif #if defined __NVCC__ - #pragma diag_suppress boolean_controlling_expr_is_constant + // MSVC 14.16 (required by CUDA 9.*) does not support the _Pragma keyword, so + // we instead use Microsoft's __pragma extension. + #if defined _MSC_VER + #define EIGEN_MAKE_PRAGMA(X) __pragma(#X) + #else + #define EIGEN_MAKE_PRAGMA(X) _Pragma(#X) + #endif + #if defined __NVCC_DIAG_PRAGMA_SUPPORT__ + #define EIGEN_NV_DIAG_SUPPRESS(X) EIGEN_MAKE_PRAGMA(nv_diag_suppress X) + #else + #define EIGEN_NV_DIAG_SUPPRESS(X) EIGEN_MAKE_PRAGMA(diag_suppress X) + #endif + + EIGEN_NV_DIAG_SUPPRESS(boolean_controlling_expr_is_constant) // Disable the "statement is unreachable" message - #pragma diag_suppress code_is_unreachable + EIGEN_NV_DIAG_SUPPRESS(code_is_unreachable) // Disable the "dynamic initialization in unreachable code" message - #pragma diag_suppress initialization_not_reachable + EIGEN_NV_DIAG_SUPPRESS(initialization_not_reachable) // Disable the "invalid error number" message that we get with older versions of nvcc - #pragma diag_suppress 1222 + EIGEN_NV_DIAG_SUPPRESS(1222) // Disable the "calling a __host__ function from a __host__ __device__ function is not allowed" messages (yes, there are many of them and they seem to change with every version of the compiler) - #pragma diag_suppress 2527 - #pragma diag_suppress 2529 - #pragma diag_suppress 2651 - #pragma diag_suppress 2653 - #pragma diag_suppress 2668 - #pragma diag_suppress 2669 - #pragma diag_suppress 2670 - #pragma diag_suppress 2671 - #pragma diag_suppress 2735 - #pragma diag_suppress 2737 - #pragma diag_suppress 2739 + EIGEN_NV_DIAG_SUPPRESS(2527) + EIGEN_NV_DIAG_SUPPRESS(2529) + EIGEN_NV_DIAG_SUPPRESS(2651) + EIGEN_NV_DIAG_SUPPRESS(2653) + EIGEN_NV_DIAG_SUPPRESS(2668) + EIGEN_NV_DIAG_SUPPRESS(2669) + EIGEN_NV_DIAG_SUPPRESS(2670) + EIGEN_NV_DIAG_SUPPRESS(2671) + EIGEN_NV_DIAG_SUPPRESS(2735) + EIGEN_NV_DIAG_SUPPRESS(2737) + EIGEN_NV_DIAG_SUPPRESS(2739) + EIGEN_NV_DIAG_SUPPRESS(2885) + EIGEN_NV_DIAG_SUPPRESS(2888) + EIGEN_NV_DIAG_SUPPRESS(2976) + EIGEN_NV_DIAG_SUPPRESS(2979) + EIGEN_NV_DIAG_SUPPRESS(20011) + EIGEN_NV_DIAG_SUPPRESS(20014) + // Disable the "// __device__ annotation is ignored on a function(...) that is + // explicitly defaulted on its first declaration" message. + // The __device__ annotation seems to actually be needed in some cases, + // otherwise resulting in kernel runtime errors. 
+ EIGEN_NV_DIAG_SUPPRESS(2886) + EIGEN_NV_DIAG_SUPPRESS(2977) + EIGEN_NV_DIAG_SUPPRESS(20012) + #undef EIGEN_NV_DIAG_SUPPRESS + #undef EIGEN_MAKE_PRAGMA #endif #else diff --git a/Eigen/src/Core/util/IndexedViewHelper.h b/Eigen/src/Core/util/IndexedViewHelper.h index f85de30..0d64d5e 100644 --- a/Eigen/src/Core/util/IndexedViewHelper.h +++ b/Eigen/src/Core/util/IndexedViewHelper.h @@ -168,7 +168,7 @@ template struct get_compile_time_incr > { * \ingroup Core_Module * Can be used as a parameter to DenseBase::operator()(const RowIndices&, const ColIndices&) to index all rows or columns */ -static const Eigen::internal::all_t all; // PLEASE use Eigen::all instead of Eigen::placeholders::all +static const Eigen::internal::all_t all; namespace placeholders { diff --git a/Eigen/src/Core/util/IntegralConstant.h b/Eigen/src/Core/util/IntegralConstant.h index 945d426..e0092f6 100644 --- a/Eigen/src/Core/util/IntegralConstant.h +++ b/Eigen/src/Core/util/IntegralConstant.h @@ -138,7 +138,7 @@ template struct get_fixed_value,Default> { static const int value = N; }; -#if !EIGEN_HAS_CXX14 +#if !EIGEN_HAS_CXX14_VARIABLE_TEMPLATES template struct get_fixed_value (*)(),Default> { static const int value = N; }; @@ -154,7 +154,7 @@ struct get_fixed_value,Default> { }; template EIGEN_DEVICE_FUNC Index get_runtime_value(const T &x) { return x; } -#if !EIGEN_HAS_CXX14 +#if !EIGEN_HAS_CXX14_VARIABLE_TEMPLATES template EIGEN_DEVICE_FUNC Index get_runtime_value(FixedInt (*)()) { return N; } #endif @@ -166,7 +166,7 @@ template struct clea // Convert any integral type (e.g., short, int, unsigned int, etc.) to Eigen::Index template struct cleanup_index_type::value>::type> { typedef Index type; }; -#if !EIGEN_HAS_CXX14 +#if !EIGEN_HAS_CXX14_VARIABLE_TEMPLATES // In c++98/c++11, fix is a pointer to function that we better cleanup to a true FixedInt: template struct cleanup_index_type (*)(), DynamicKey> { typedef FixedInt type; }; #endif diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 986c3d4..eb88e5f 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -17,7 +17,7 @@ #define EIGEN_WORLD_VERSION 3 #define EIGEN_MAJOR_VERSION 4 -#define EIGEN_MINOR_VERSION 0 +#define EIGEN_MINOR_VERSION 1 #define EIGEN_VERSION_AT_LEAST(x,y,z) (EIGEN_WORLD_VERSION>x || (EIGEN_WORLD_VERSION>=x && \ (EIGEN_MAJOR_VERSION>y || (EIGEN_MAJOR_VERSION>=y && \ @@ -179,6 +179,13 @@ #define EIGEN_COMP_PGI 0 #endif +/// \internal EIGEN_COMP_NVHPC set to NVHPC version if the compiler is nvc++ +#if defined(__NVCOMPILER) +#define EIGEN_COMP_NVHPC (__NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__) +#else +#define EIGEN_COMP_NVHPC 0 +#endif + /// \internal EIGEN_COMP_ARM set to 1 if the compiler is ARM Compiler #if defined(__CC_ARM) || defined(__ARMCC_VERSION) #define EIGEN_COMP_ARM 1 @@ -275,7 +282,7 @@ /// \internal EIGEN_HAS_ARM64_FP16 set to 1 if the architecture provides an IEEE /// compliant Arm fp16 type -#if EIGEN_ARCH_ARM64 +#if EIGEN_ARCH_ARM_OR_ARM64 #ifndef EIGEN_HAS_ARM64_FP16 #if defined(__ARM_FP16_FORMAT_IEEE) #define EIGEN_HAS_ARM64_FP16 1 @@ -287,7 +294,7 @@ /// \internal EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC set to 1 if the architecture /// supports Neon vector intrinsics for fp16. 
-#if EIGEN_ARCH_ARM64 +#if EIGEN_ARCH_ARM_OR_ARM64 #ifndef EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) #define EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC 1 @@ -299,7 +306,7 @@ /// \internal EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC set to 1 if the architecture /// supports Neon scalar intrinsics for fp16. -#if EIGEN_ARCH_ARM64 +#if EIGEN_ARCH_ARM_OR_ARM64 #ifndef EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC #if defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC) #define EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC 1 @@ -329,7 +336,7 @@ #endif /// \internal EIGEN_ARCH_PPC set to 1 if the architecture is PowerPC -#if defined(__powerpc__) || defined(__ppc__) || defined(_M_PPC) +#if defined(__powerpc__) || defined(__ppc__) || defined(_M_PPC) || defined(__POWERPC__) #define EIGEN_ARCH_PPC 1 #else #define EIGEN_ARCH_PPC 0 @@ -565,6 +572,32 @@ // #endif +/// \internal EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC set to 1 if the architecture +/// supports Neon vector intrinsics for fp16. +#if EIGEN_ARCH_ARM64 + #ifndef EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC + // Clang only supports FP16 on aarch64, and not all intrinsics are available + // on A32 anyways even in GCC (e.g. vdiv_f16, vsqrt_f16). + #if EIGEN_ARCH_ARM64 && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(EIGEN_GPU_COMPILE_PHASE) + #define EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC 1 + #else + #define EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC 0 + #endif + #endif +#endif + +/// \internal EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC set to 1 if the architecture +/// supports Neon scalar intrinsics for fp16. +#if EIGEN_ARCH_ARM64 + #ifndef EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC + // Clang only supports FP16 on aarch64, and not all intrinsics are available + // on A32 anyways, even in GCC (e.g. vceqh_f16). + #if EIGEN_ARCH_ARM64 && defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC) && !defined(EIGEN_GPU_COMPILE_PHASE) + #define EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC 1 + #endif + #endif +#endif + #if defined(EIGEN_USE_SYCL) && defined(__SYCL_DEVICE_ONLY__) // EIGEN_USE_SYCL is a user-defined macro while __SYCL_DEVICE_ONLY__ is a compiler-defined macro. // In most cases we want to check if both macros are defined which can be done using the define below. @@ -1124,14 +1157,28 @@ namespace Eigen { // directly for std::complex, Eigen::half, Eigen::bfloat16. For these, // you will need to apply to the underlying POD type. #if EIGEN_ARCH_PPC && EIGEN_COMP_GNUC_STRICT - // This seems to be broken on clang. Packet4f is loaded into a single - // register rather than a vector, zeroing out some entries. Integer + // This seems to be broken on clang. Packet4f is loaded into a single + // register rather than a vector, zeroing out some entries. Integer // types also generate a compile error. - // General, Altivec, VSX. - #define EIGEN_OPTIMIZATION_BARRIER(X) __asm__ ("" : "+r,v,wa" (X)); + #if EIGEN_OS_MAC + // General, Altivec for Apple (VSX were added in ISA v2.06): + #define EIGEN_OPTIMIZATION_BARRIER(X) __asm__ ("" : "+r,v" (X)); + #else + // General, Altivec, VSX otherwise: + #define EIGEN_OPTIMIZATION_BARRIER(X) __asm__ ("" : "+r,v,wa" (X)); + #endif #elif EIGEN_ARCH_ARM_OR_ARM64 // General, NEON. 
- #define EIGEN_OPTIMIZATION_BARRIER(X) __asm__ ("" : "+g,w" (X)); + // Clang doesn't like "r", + // error: non-trivial scalar-to-vector conversion, possible invalid + // constraint for vector type + // GCC < 5 doesn't like "g", + // error: 'asm' operand requires impossible reload + #if EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_AT_MOST(5, 0) + #define EIGEN_OPTIMIZATION_BARRIER(X) __asm__ ("" : "+r,w" (X)); + #else + #define EIGEN_OPTIMIZATION_BARRIER(X) __asm__ ("" : "+g,w" (X)); + #endif #elif EIGEN_ARCH_i386_OR_x86_64 // General, SSE. #define EIGEN_OPTIMIZATION_BARRIER(X) __asm__ ("" : "+g,x" (X)); @@ -1185,7 +1232,7 @@ namespace Eigen { #define EIGEN_USING_STD(FUNC) using std::FUNC; #endif -#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || (EIGEN_COMP_MSVC == 1900 && EIGEN_COMP_NVCC)) +#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1916 || (EIGEN_COMP_MSVC == 1916 && EIGEN_COMP_NVCC)) // For older MSVC versions, as well as 1900 && CUDA 8, using the base operator is necessary, // otherwise we get duplicate definition errors // For later MSVC versions, we require explicit operator= definition, otherwise we get @@ -1216,7 +1263,7 @@ namespace Eigen { * This is necessary, because the implicit definition is deprecated if the copy-assignment is overridden. */ #if EIGEN_HAS_CXX11 -#define EIGEN_DEFAULT_COPY_CONSTRUCTOR(CLASS) CLASS(const CLASS&) = default; +#define EIGEN_DEFAULT_COPY_CONSTRUCTOR(CLASS) EIGEN_DEVICE_FUNC CLASS(const CLASS&) = default; #else #define EIGEN_DEFAULT_COPY_CONSTRUCTOR(CLASS) #endif @@ -1241,12 +1288,12 @@ namespace Eigen { */ #if EIGEN_HAS_CXX11 #define EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(Derived) \ - Derived() = default; \ - ~Derived() = default; + EIGEN_DEVICE_FUNC Derived() = default; \ + EIGEN_DEVICE_FUNC ~Derived() = default; #else #define EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(Derived) \ - Derived() {}; \ - /* ~Derived() {}; */ + EIGEN_DEVICE_FUNC Derived() {}; \ + /* EIGEN_DEVICE_FUNC ~Derived() {}; */ #endif diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index 875318c..3aea7df 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -292,20 +292,59 @@ template EIGEN_DEVICE_FUNC inline void destruct_elements_of_array(T /** \internal Constructs the elements of an array. * The \a size parameter tells on how many objects to call the constructor of T. */ -template EIGEN_DEVICE_FUNC inline T* construct_elements_of_array(T *ptr, std::size_t size) +template EIGEN_DEVICE_FUNC inline T* default_construct_elements_of_array(T *ptr, std::size_t size) { - std::size_t i; + std::size_t i=0; EIGEN_TRY { - for (i = 0; i < size; ++i) ::new (ptr + i) T; - return ptr; + for (i = 0; i < size; ++i) ::new (ptr + i) T; } EIGEN_CATCH(...) { destruct_elements_of_array(ptr, i); EIGEN_THROW; } - return NULL; + return ptr; +} + +/** \internal Copy-constructs the elements of an array. + * The \a size parameter tells on how many objects to copy. + */ +template EIGEN_DEVICE_FUNC inline T* copy_construct_elements_of_array(T *ptr, const T* src, std::size_t size) +{ + std::size_t i=0; + EIGEN_TRY + { + for (i = 0; i < size; ++i) ::new (ptr + i) T(*(src + i)); + } + EIGEN_CATCH(...) + { + destruct_elements_of_array(ptr, i); + EIGEN_THROW; + } + return ptr; +} + +/** \internal Move-constructs the elements of an array. + * The \a size parameter tells on how many objects to move. 
+ */ +template EIGEN_DEVICE_FUNC inline T* move_construct_elements_of_array(T *ptr, T* src, std::size_t size) +{ + std::size_t i=0; + EIGEN_TRY + { +#if EIGEN_HAS_RVALUE_REFERENCES + for (i = 0; i < size; ++i) ::new (ptr + i) T(std::move(*(src + i))); +#else + for (i = 0; i < size; ++i) ::new (ptr + i) T(*(src + i)); +#endif + } + EIGEN_CATCH(...) + { + destruct_elements_of_array(ptr, i); + EIGEN_THROW; + } + return ptr; } /***************************************************************************** @@ -326,10 +365,10 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void check_size_for_overflow(std::size_t s template EIGEN_DEVICE_FUNC inline T* aligned_new(std::size_t size) { check_size_for_overflow(size); - T *result = reinterpret_cast(aligned_malloc(sizeof(T)*size)); + T *result = static_cast(aligned_malloc(sizeof(T)*size)); EIGEN_TRY { - return construct_elements_of_array(result, size); + return default_construct_elements_of_array(result, size); } EIGEN_CATCH(...) { @@ -342,10 +381,10 @@ template EIGEN_DEVICE_FUNC inline T* aligned_new(std::size_t size) template EIGEN_DEVICE_FUNC inline T* conditional_aligned_new(std::size_t size) { check_size_for_overflow(size); - T *result = reinterpret_cast(conditional_aligned_malloc(sizeof(T)*size)); + T *result = static_cast(conditional_aligned_malloc(sizeof(T)*size)); EIGEN_TRY { - return construct_elements_of_array(result, size); + return default_construct_elements_of_array(result, size); } EIGEN_CATCH(...) { @@ -377,21 +416,32 @@ template EIGEN_DEVICE_FUNC inline T* conditional_aligned { check_size_for_overflow(new_size); check_size_for_overflow(old_size); - if(new_size < old_size) - destruct_elements_of_array(pts+new_size, old_size-new_size); - T *result = reinterpret_cast(conditional_aligned_realloc(reinterpret_cast(pts), sizeof(T)*new_size, sizeof(T)*old_size)); - if(new_size > old_size) + + // If elements need to be explicitly initialized, we cannot simply realloc + // (or memcpy) the memory block - each element needs to be reconstructed. + // Otherwise, objects that contain internal pointers like mpfr or + // AnnoyingScalar can be pointing to the wrong thing. + T* result = static_cast(conditional_aligned_malloc(sizeof(T)*new_size)); + EIGEN_TRY { - EIGEN_TRY - { - construct_elements_of_array(result+old_size, new_size-old_size); - } - EIGEN_CATCH(...) - { - conditional_aligned_free(result); - EIGEN_THROW; + // Move-construct initial elements. + std::size_t copy_size = (std::min)(old_size, new_size); + move_construct_elements_of_array(result, pts, copy_size); + + // Default-construct remaining elements. + if (new_size > old_size) { + default_construct_elements_of_array(result + copy_size, new_size - old_size); } + + // Delete old elements. + conditional_aligned_delete(pts, old_size); } + EIGEN_CATCH(...) + { + conditional_aligned_free(result); + EIGEN_THROW; + } + return result; } @@ -401,12 +451,12 @@ template EIGEN_DEVICE_FUNC inline T* conditional_aligned if(size==0) return 0; // short-cut. Also fixes Bug 884 check_size_for_overflow(size); - T *result = reinterpret_cast(conditional_aligned_malloc(sizeof(T)*size)); + T *result = static_cast(conditional_aligned_malloc(sizeof(T)*size)); if(NumTraits::RequireInitialization) { EIGEN_TRY { - construct_elements_of_array(result, size); + default_construct_elements_of_array(result, size); } EIGEN_CATCH(...) 
{ @@ -419,24 +469,13 @@ template EIGEN_DEVICE_FUNC inline T* conditional_aligned template inline T* conditional_aligned_realloc_new_auto(T* pts, std::size_t new_size, std::size_t old_size) { + if (NumTraits::RequireInitialization) { + return conditional_aligned_realloc_new(pts, new_size, old_size); + } + check_size_for_overflow(new_size); check_size_for_overflow(old_size); - if(NumTraits::RequireInitialization && (new_size < old_size)) - destruct_elements_of_array(pts+new_size, old_size-new_size); - T *result = reinterpret_cast(conditional_aligned_realloc(reinterpret_cast(pts), sizeof(T)*new_size, sizeof(T)*old_size)); - if(NumTraits::RequireInitialization && (new_size > old_size)) - { - EIGEN_TRY - { - construct_elements_of_array(result+old_size, new_size-old_size); - } - EIGEN_CATCH(...) - { - conditional_aligned_free(result); - EIGEN_THROW; - } - } - return result; + return static_cast(conditional_aligned_realloc(static_cast(pts), sizeof(T)*new_size, sizeof(T)*old_size)); } template EIGEN_DEVICE_FUNC inline void conditional_aligned_delete_auto(T *ptr, std::size_t size) @@ -617,7 +656,7 @@ template class aligned_stack_memory_handler : noncopyable : m_ptr(ptr), m_size(size), m_deallocate(dealloc) { if(NumTraits::RequireInitialization && m_ptr) - Eigen::internal::construct_elements_of_array(m_ptr, size); + Eigen::internal::default_construct_elements_of_array(m_ptr, size); } EIGEN_DEVICE_FUNC ~aligned_stack_memory_handler() @@ -668,7 +707,7 @@ struct local_nested_eval_wrapper m_deallocate(ptr==0) { if(NumTraits::RequireInitialization && object.data()) - Eigen::internal::construct_elements_of_array(object.data(), object.size()); + Eigen::internal::default_construct_elements_of_array(object.data(), object.size()); object = xpr; } diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h index 81ae2a3..de1a638 100644 --- a/Eigen/src/Core/util/Meta.h +++ b/Eigen/src/Core/util/Meta.h @@ -133,7 +133,10 @@ template struct remove_all { typedef typename remove_all< template struct is_arithmetic { enum { value = false }; }; template<> struct is_arithmetic { enum { value = true }; }; template<> struct is_arithmetic { enum { value = true }; }; +// GPU devices treat `long double` as `double`. 
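The reworked conditional_aligned_realloc_new above deliberately avoids realloc/memcpy when the element type requires initialization, because every element has to be reconstructed in the new storage. A minimal standalone sketch of that move-construct-into-fresh-storage pattern in plain standard C++ follows; the grow_buffer helper is hypothetical and not part of Eigen.

#include <algorithm>
#include <cstddef>
#include <new>
#include <utility>

// Hypothetical helper: grow a buffer of T without realloc/memcpy, so objects
// holding internal pointers (mpfr-like types, self-referential wrappers)
// never end up pointing into a freed block.
template <typename T>
T* grow_buffer(T* old_ptr, std::size_t old_size, std::size_t new_size) {
  // Eigen uses its aligned allocators here; plain operator new keeps the sketch short.
  T* result = static_cast<T*>(::operator new(sizeof(T) * new_size));
  std::size_t constructed = 0;
  try {
    // Move-construct the elements that survive the resize.
    const std::size_t keep = std::min(old_size, new_size);
    for (; constructed < keep; ++constructed)
      ::new (result + constructed) T(std::move(old_ptr[constructed]));
    // Default-construct any additional elements when growing.
    for (; constructed < new_size; ++constructed)
      ::new (result + constructed) T();
    // Only now destroy the old elements and release the old block.
    for (std::size_t i = old_size; i > 0; --i) old_ptr[i - 1].~T();
    ::operator delete(old_ptr);
  } catch (...) {
    while (constructed > 0) result[--constructed].~T();
    ::operator delete(result);
    throw;
  }
  return result;
}

As in the patch, the old elements are torn down only once the replacement buffer is fully constructed, so a throwing constructor leaves the caller's data intact.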
+#ifndef EIGEN_GPU_COMPILE_PHASE template<> struct is_arithmetic { enum { value = true }; }; +#endif template<> struct is_arithmetic { enum { value = true }; }; template<> struct is_arithmetic { enum { value = true }; }; template<> struct is_arithmetic { enum { value = true }; }; @@ -167,10 +170,8 @@ template<> struct is_integral { enum { value = true }; } template<> struct is_integral { enum { value = true }; }; template<> struct is_integral { enum { value = true }; }; template<> struct is_integral { enum { value = true }; }; -#if EIGEN_COMP_MSVC -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; -#endif +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; #endif #if EIGEN_HAS_CXX11 @@ -189,21 +190,9 @@ template<> struct make_unsigned { typedef unsigned int type; } template<> struct make_unsigned { typedef unsigned int type; }; template<> struct make_unsigned { typedef unsigned long type; }; template<> struct make_unsigned { typedef unsigned long type; }; -#if EIGEN_COMP_MSVC -template<> struct make_unsigned { typedef unsigned __int64 type; }; -template<> struct make_unsigned { typedef unsigned __int64 type; }; -#endif - -// Some platforms define int64_t as `long long` even for C++03, where -// `long long` is not guaranteed by the standard. In this case we are missing -// the definition for make_unsigned. If we just define it, we run into issues -// where `long long` doesn't exist in some compilers for C++03. We therefore add -// the specialization for these platforms only. -#if EIGEN_OS_MAC || EIGEN_COMP_MINGW template<> struct make_unsigned { typedef unsigned long long type; }; template<> struct make_unsigned { typedef unsigned long long type; }; #endif -#endif template struct add_const { typedef const T type; }; template struct add_const { typedef T& type; }; @@ -466,20 +455,32 @@ template struct array_size > { }; #endif + /** \internal - * Analogue of the std::size free function. - * It returns the size of the container or view \a x of type \c T + * Analogue of the std::ssize free function. + * It returns the signed size of the container or view \a x of type \c T * * It currently supports: * - any types T defining a member T::size() const * - plain C arrays as T[N] * + * For C++20, this function just forwards to `std::ssize`, or any ADL discoverable `ssize` function. 
*/ -template -EIGEN_CONSTEXPR Index size(const T& x) { return x.size(); } +#if EIGEN_COMP_CXXVER < 20 +template +EIGEN_CONSTEXPR std::ptrdiff_t index_list_size(const T& x) { + return static_cast(x.size()); +} -template -EIGEN_CONSTEXPR Index size(const T (&) [N]) { return N; } +template +EIGEN_CONSTEXPR std::ptrdiff_t index_list_size(const T (&)[N]) { return N; } +#else +template +EIGEN_CONSTEXPR auto index_list_size(T&& x) { + using std::ssize; + return ssize(std::forward(x)); +} +#endif // EIGEN_COMP_CXXVER /** \internal * Convenient struct to get the result type of a nullary, unary, binary, or @@ -696,8 +697,7 @@ struct has_binary_operator template Y))) > - // use ?: instead of || just to shut up a stupid gcc 4.3 warning + bool Done = ((SupX - InfX) <= 1 || ((SupX * SupX <= Y) && ((SupX + 1) * (SupX + 1) > Y)))> class meta_sqrt { enum { diff --git a/Eigen/src/Eigenvalues/ComplexEigenSolver.h b/Eigen/src/Eigenvalues/ComplexEigenSolver.h index 081e918..6a99f4c 100644 --- a/Eigen/src/Eigenvalues/ComplexEigenSolver.h +++ b/Eigen/src/Eigenvalues/ComplexEigenSolver.h @@ -316,9 +316,8 @@ void ComplexEigenSolver::doComputeEigenvectors(RealScalar matrixnorm // Compute V as V = U X; now A = U T U^* = U X D X^(-1) U^* = V D V^(-1) m_eivec.noalias() = m_schur.matrixU() * m_matX; // .. and normalize the eigenvectors - for(Index k=0 ; k class GeneralizedEigenSolver : m_eivec(), m_alphas(), m_betas(), - m_valuesOkay(false), - m_vectorsOkay(false), + m_computeEigenvectors(false), + m_isInitialized(false), m_realQZ() {} @@ -134,8 +134,8 @@ template class GeneralizedEigenSolver : m_eivec(size, size), m_alphas(size), m_betas(size), - m_valuesOkay(false), - m_vectorsOkay(false), + m_computeEigenvectors(false), + m_isInitialized(false), m_realQZ(size), m_tmp(size) {} @@ -156,8 +156,8 @@ template class GeneralizedEigenSolver : m_eivec(A.rows(), A.cols()), m_alphas(A.cols()), m_betas(A.cols()), - m_valuesOkay(false), - m_vectorsOkay(false), + m_computeEigenvectors(false), + m_isInitialized(false), m_realQZ(A.cols()), m_tmp(A.cols()) { @@ -177,7 +177,8 @@ template class GeneralizedEigenSolver * \sa eigenvalues() */ EigenvectorsType eigenvectors() const { - eigen_assert(m_vectorsOkay && "Eigenvectors for GeneralizedEigenSolver were not calculated."); + eigen_assert(info() == Success && "GeneralizedEigenSolver failed to compute eigenvectors"); + eigen_assert(m_computeEigenvectors && "Eigenvectors for GeneralizedEigenSolver were not calculated"); return m_eivec; } @@ -201,7 +202,7 @@ template class GeneralizedEigenSolver */ EigenvalueType eigenvalues() const { - eigen_assert(m_valuesOkay && "GeneralizedEigenSolver is not initialized."); + eigen_assert(info() == Success && "GeneralizedEigenSolver failed to compute eigenvalues."); return EigenvalueType(m_alphas,m_betas); } @@ -212,7 +213,7 @@ template class GeneralizedEigenSolver * \sa betas(), eigenvalues() */ ComplexVectorType alphas() const { - eigen_assert(m_valuesOkay && "GeneralizedEigenSolver is not initialized."); + eigen_assert(info() == Success && "GeneralizedEigenSolver failed to compute alphas."); return m_alphas; } @@ -223,7 +224,7 @@ template class GeneralizedEigenSolver * \sa alphas(), eigenvalues() */ VectorType betas() const { - eigen_assert(m_valuesOkay && "GeneralizedEigenSolver is not initialized."); + eigen_assert(info() == Success && "GeneralizedEigenSolver failed to compute betas."); return m_betas; } @@ -254,7 +255,7 @@ template class GeneralizedEigenSolver ComputationInfo info() const { - eigen_assert(m_valuesOkay && "EigenSolver is not 
initialized."); + eigen_assert(m_isInitialized && "EigenSolver is not initialized."); return m_realQZ.info(); } @@ -277,7 +278,8 @@ template class GeneralizedEigenSolver EigenvectorsType m_eivec; ComplexVectorType m_alphas; VectorType m_betas; - bool m_valuesOkay, m_vectorsOkay; + bool m_computeEigenvectors; + bool m_isInitialized; RealQZ m_realQZ; ComplexVectorType m_tmp; }; @@ -292,8 +294,6 @@ GeneralizedEigenSolver::compute(const MatrixType& A, const MatrixTyp using std::abs; eigen_assert(A.cols() == A.rows() && B.cols() == A.rows() && B.cols() == B.rows()); Index size = A.cols(); - m_valuesOkay = false; - m_vectorsOkay = false; // Reduce to generalized real Schur form: // A = Q S Z and B = Q T Z m_realQZ.compute(A, B, computeEigenvectors); @@ -406,10 +406,9 @@ GeneralizedEigenSolver::compute(const MatrixType& A, const MatrixTyp i += 2; } } - - m_valuesOkay = true; - m_vectorsOkay = computeEigenvectors; } + m_computeEigenvectors = computeEigenvectors; + m_isInitialized = true; return *this; } diff --git a/Eigen/src/Eigenvalues/RealSchur.h b/Eigen/src/Eigenvalues/RealSchur.h index 7304ef3..37394e1 100644 --- a/Eigen/src/Eigenvalues/RealSchur.h +++ b/Eigen/src/Eigenvalues/RealSchur.h @@ -435,34 +435,33 @@ inline void RealSchur::computeShift(Index iu, Index iter, Scalar& ex shiftInfo.coeffRef(1) = m_matT.coeff(iu-1,iu-1); shiftInfo.coeffRef(2) = m_matT.coeff(iu,iu-1) * m_matT.coeff(iu-1,iu); - // Wilkinson's original ad hoc shift - if (iter == 10) - { - exshift += shiftInfo.coeff(0); - for (Index i = 0; i <= iu; ++i) - m_matT.coeffRef(i,i) -= shiftInfo.coeff(0); - Scalar s = abs(m_matT.coeff(iu,iu-1)) + abs(m_matT.coeff(iu-1,iu-2)); - shiftInfo.coeffRef(0) = Scalar(0.75) * s; - shiftInfo.coeffRef(1) = Scalar(0.75) * s; - shiftInfo.coeffRef(2) = Scalar(-0.4375) * s * s; - } - - // MATLAB's new ad hoc shift - if (iter == 30) - { - Scalar s = (shiftInfo.coeff(1) - shiftInfo.coeff(0)) / Scalar(2.0); - s = s * s + shiftInfo.coeff(2); - if (s > Scalar(0)) - { - s = sqrt(s); - if (shiftInfo.coeff(1) < shiftInfo.coeff(0)) - s = -s; - s = s + (shiftInfo.coeff(1) - shiftInfo.coeff(0)) / Scalar(2.0); - s = shiftInfo.coeff(0) - shiftInfo.coeff(2) / s; - exshift += s; + // Alternate exceptional shifting strategy every 16 iterations. 
+ if (iter % 16 == 0) { + // Wilkinson's original ad hoc shift + if (iter % 32 != 0) { + exshift += shiftInfo.coeff(0); for (Index i = 0; i <= iu; ++i) - m_matT.coeffRef(i,i) -= s; - shiftInfo.setConstant(Scalar(0.964)); + m_matT.coeffRef(i,i) -= shiftInfo.coeff(0); + Scalar s = abs(m_matT.coeff(iu,iu-1)) + abs(m_matT.coeff(iu-1,iu-2)); + shiftInfo.coeffRef(0) = Scalar(0.75) * s; + shiftInfo.coeffRef(1) = Scalar(0.75) * s; + shiftInfo.coeffRef(2) = Scalar(-0.4375) * s * s; + } else { + // MATLAB's new ad hoc shift + Scalar s = (shiftInfo.coeff(1) - shiftInfo.coeff(0)) / Scalar(2.0); + s = s * s + shiftInfo.coeff(2); + if (s > Scalar(0)) + { + s = sqrt(s); + if (shiftInfo.coeff(1) < shiftInfo.coeff(0)) + s = -s; + s = s + (shiftInfo.coeff(1) - shiftInfo.coeff(0)) / Scalar(2.0); + s = shiftInfo.coeff(0) - shiftInfo.coeff(2) / s; + exshift += s; + for (Index i = 0; i <= iu; ++i) + m_matT.coeffRef(i,i) -= s; + shiftInfo.setConstant(Scalar(0.964)); + } } } } diff --git a/Eigen/src/Eigenvalues/Tridiagonalization.h b/Eigen/src/Eigenvalues/Tridiagonalization.h index 674c92a..eda8279 100644 --- a/Eigen/src/Eigenvalues/Tridiagonalization.h +++ b/Eigen/src/Eigenvalues/Tridiagonalization.h @@ -440,9 +440,8 @@ void tridiagonalization_inplace(MatrixType& mat, DiagonalType& diag, SubDiagonal template struct tridiagonalization_inplace_selector { - typedef typename Tridiagonalization::CoeffVectorType CoeffVectorType; typedef typename Tridiagonalization::HouseholderSequenceType HouseholderSequenceType; - template + template static EIGEN_DEVICE_FUNC void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, CoeffVectorType& hCoeffs, bool extractQ) { diff --git a/Eigen/src/Geometry/Transform.h b/Eigen/src/Geometry/Transform.h index 52b8c2a..a7756be 100644 --- a/Eigen/src/Geometry/Transform.h +++ b/Eigen/src/Geometry/Transform.h @@ -985,7 +985,10 @@ Transform::preshear(const Scalar& sx, const Scalar& sy) { EIGEN_STATIC_ASSERT(int(Dim)==2, YOU_MADE_A_PROGRAMMING_MISTAKE) EIGEN_STATIC_ASSERT(Mode!=int(Isometry), THIS_METHOD_IS_ONLY_FOR_SPECIFIC_TRANSFORMATIONS) - m_matrix.template block(0,0) = LinearMatrixType(1, sx, sy, 1) * m_matrix.template block(0,0); + LinearMatrixType shear = LinearMatrixType::Identity(2, 2); + shear.coeffRef(0, 1) = sy; + shear.coeffRef(1, 0) = sx; + m_matrix.template block(0, 0) = shear * m_matrix.template block(0, 0); return *this; } diff --git a/Eigen/src/Geometry/Umeyama.h b/Eigen/src/Geometry/Umeyama.h index 6b75500..2a5c395 100644 --- a/Eigen/src/Geometry/Umeyama.h +++ b/Eigen/src/Geometry/Umeyama.h @@ -136,8 +136,10 @@ umeyama(const MatrixBase& src, const MatrixBase& dst, boo // Eq. (39) VectorType S = VectorType::Ones(m); - if ( svd.matrixU().determinant() * svd.matrixV().determinant() < 0 ) - S(m-1) = -1; + if ( svd.matrixU().determinant() * svd.matrixV().determinant() < 0 ) { + Index tmp = m - 1; + S(tmp) = -1; + } // Eq. 
(40) and (43) Rt.block(0,0,m,m).noalias() = svd.matrixU() * S.asDiagonal() * svd.matrixV().transpose(); diff --git a/Eigen/src/Householder/Householder.h b/Eigen/src/Householder/Householder.h index 5bc037f..d8984a3 100644 --- a/Eigen/src/Householder/Householder.h +++ b/Eigen/src/Householder/Householder.h @@ -69,7 +69,7 @@ void MatrixBase::makeHouseholder( Scalar& tau, RealScalar& beta) const { - using std::sqrt; + using numext::sqrt; using numext::conj; EIGEN_STATIC_ASSERT_VECTOR_ONLY(EssentialPart) diff --git a/Eigen/src/Householder/HouseholderSequence.h b/Eigen/src/Householder/HouseholderSequence.h index 022f6c3..b9c4aee 100644 --- a/Eigen/src/Householder/HouseholderSequence.h +++ b/Eigen/src/Householder/HouseholderSequence.h @@ -14,7 +14,9 @@ namespace Eigen { /** \ingroup Householder_Module + * * \householder_module + * * \class HouseholderSequence * \brief Sequence of Householder reflections acting on subspaces with decreasing size * \tparam VectorsType type of matrix containing the Householder vectors @@ -518,7 +520,10 @@ typename internal::matrix_type_times_scalar_type householderSequence(const VectorsTyp return HouseholderSequence(v, h); } -/** \ingroup Householder_Module \householder_module +/** \ingroup Householder_Module + * + * \householder_module + * * \brief Convenience function for constructing a Householder sequence. * \returns A HouseholderSequence constructed from the specified arguments. * \details This function differs from householderSequence() in that the template argument \p OnTheSide of diff --git a/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h b/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h index 153acef..1c9ade5 100644 --- a/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +++ b/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h @@ -49,9 +49,9 @@ bool bicgstab(const MatrixType& mat, const Rhs& rhs, Dest& x, x.setZero(); return true; } - Scalar rho = 1; - Scalar alpha = 1; - Scalar w = 1; + Scalar rho (1); + Scalar alpha (1); + Scalar w (1); VectorType v = VectorType::Zero(n), p = VectorType::Zero(n); VectorType y(n), z(n); diff --git a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h index 5d8c6b4..c3ca0ad 100644 --- a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +++ b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h @@ -29,8 +29,6 @@ void conjugate_gradient(const MatrixType& mat, const Rhs& rhs, Dest& x, const Preconditioner& precond, Index& iters, typename Dest::RealScalar& tol_error) { - using std::sqrt; - using std::abs; typedef typename Dest::RealScalar RealScalar; typedef typename Dest::Scalar Scalar; typedef Matrix VectorType; @@ -56,7 +54,7 @@ void conjugate_gradient(const MatrixType& mat, const Rhs& rhs, Dest& x, if (residualNorm2 < threshold) { iters = 0; - tol_error = sqrt(residualNorm2 / rhsNorm2); + tol_error = numext::sqrt(residualNorm2 / rhsNorm2); return; } @@ -86,7 +84,7 @@ void conjugate_gradient(const MatrixType& mat, const Rhs& rhs, Dest& x, p = z + beta * p; // update search direction i++; } - tol_error = sqrt(residualNorm2 / rhsNorm2); + tol_error = numext::sqrt(residualNorm2 / rhsNorm2); iters = i; } diff --git a/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h b/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h index 7803fd8..5e632c4 100644 --- a/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +++ b/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h @@ -160,13 +160,13 @@ class IncompleteCholesky : public SparseSolverBase(0x80000000u), numext::bit_cast(0x80000000u), 
0.0f}; - const Packet4f p4f_sign_PNNP = ploadu(sign_mask); + EIGEN_ALIGN_MAX const float sign_mask[4] = {0.0f, -0.0f, -0.0f, 0.0f}; + const Packet4f p4f_sign_PNNP = pload(sign_mask); rd = pxor(rd, p4f_sign_PNNP); iA = pmul(iA, rd); iB = pmul(iB, rd); @@ -326,10 +333,10 @@ struct compute_inverse_size4(0x8000000000000000ull)}; - const double sign_mask2[2] = {numext::bit_cast(0x8000000000000000ull), 0.0}; - const Packet2d sign_PN = ploadu(sign_mask1); - const Packet2d sign_NP = ploadu(sign_mask2); + EIGEN_ALIGN_MAX const double sign_mask1[2] = {0.0, -0.0}; + EIGEN_ALIGN_MAX const double sign_mask2[2] = {-0.0, 0.0}; + const Packet2d sign_PN = pload(sign_mask1); + const Packet2d sign_NP = pload(sign_mask2); d1 = pxor(rd, sign_PN); d2 = pxor(rd, sign_NP); @@ -348,4 +355,9 @@ struct compute_inverse_size4 > int m_ordering; // Ordering method to use, see SPQR's manual int m_allow_tol; // Allow to use some tolerance during numerical factorization. RealScalar m_tolerance; // treat columns with 2-norm below this tolerance as zero - mutable cholmod_sparse *m_cR; // The sparse R factor in cholmod format + mutable cholmod_sparse *m_cR = nullptr; // The sparse R factor in cholmod format mutable MatrixType m_R; // The sparse matrix R in Eigen format - mutable StorageIndex *m_E; // The permutation applied to columns - mutable cholmod_sparse *m_H; //The householder vectors - mutable StorageIndex *m_HPinv; // The row permutation of H - mutable cholmod_dense *m_HTau; // The Householder coefficients + mutable StorageIndex *m_E = nullptr; // The permutation applied to columns + mutable cholmod_sparse *m_H = nullptr; //The householder vectors + mutable StorageIndex *m_HPinv = nullptr; // The row permutation of H + mutable cholmod_dense *m_HTau = nullptr; // The Householder coefficients mutable Index m_rank; // The rank of the matrix mutable cholmod_common m_cc; // Workspace and parameters bool m_useDefaultThreshold; // Use default threshold diff --git a/Eigen/src/SVD/BDCSVD.h b/Eigen/src/SVD/BDCSVD.h index 17f8e44..79a6562 100644 --- a/Eigen/src/SVD/BDCSVD.h +++ b/Eigen/src/SVD/BDCSVD.h @@ -27,6 +27,10 @@ #define eigen_internal_assert(X) assert(X); #endif +#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE +#include +#endif + namespace Eigen { #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE @@ -172,7 +176,7 @@ public: void setSwitchSize(int s) { - eigen_assert(s>3 && "BDCSVD the size of the algo switch has to be greater than 3"); + eigen_assert(s>=3 && "BDCSVD the size of the algo switch has to be at least 3."); m_algoswap = s; } @@ -404,7 +408,7 @@ void BDCSVD::structured_update(Block A, co //@param lastCol : The Index of the last column of the submatrix of m_computed and for m_naiveU; // lastCol + 1 - firstCol is the size of the submatrix. //@param firstRowW : The Index of the first row of the matrix W that we are to change. (see the reference paper section 1 for more information on W) -//@param firstRowW : Same as firstRowW with the column. +//@param firstColW : Same as firstRowW with the column. //@param shift : Each time one takes the left submatrix, one must add 1 to the shift. Why? Because! We actually want the last column of the U submatrix // to become the first column (*coeff) and to shift all the other columns to the right. There are more details on the reference paper. 
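The sign masks in the 4x4 inverse kernels above now spell the sign bit as -0.0f / -0.0 (loaded with an aligned pload) rather than bit-casting 0x80000000. A small standalone check of the XOR-with-sign-bit trick those masks rely on, written in plain standard C++:

#include <cassert>
#include <cstdint>
#include <cstring>

// Flip the sign of x by XOR-ing its bit pattern with that of -0.0f,
// whose only set bit is the IEEE-754 sign bit.
float flip_sign(float x) {
  const float mask = -0.0f;               // bit pattern 0x80000000
  std::uint32_t xi, mi;
  std::memcpy(&xi, &x, sizeof(float));
  std::memcpy(&mi, &mask, sizeof(float));
  xi ^= mi;                               // toggle the sign bit only
  float r;
  std::memcpy(&r, &xi, sizeof(float));
  return r;
}

int main() {
  assert(flip_sign(2.5f) == -2.5f);
  assert(flip_sign(-1.0f) == 1.0f);
  return 0;
}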
template @@ -899,7 +903,7 @@ void BDCSVD::computeSingVals(const ArrayRef& col0, const ArrayRef& d RealScalar fLeft = secularEq(leftShifted, col0, diag, perm, diagShifted, shift); eigen_internal_assert(fLeft::computeSingVals(const ArrayRef& col0, const ArrayRef& d // perturb singular value slightly if it equals diagonal entry to avoid division by zero later // (deflation is supposed to avoid this from happening) // - this does no seem to be necessary anymore - -// if (singVals[k] == left) singVals[k] *= 1 + NumTraits::epsilon(); -// if (singVals[k] == right) singVals[k] *= 1 - NumTraits::epsilon(); + // if (singVals[k] == left) singVals[k] *= 1 + NumTraits::epsilon(); + // if (singVals[k] == right) singVals[k] *= 1 - NumTraits::epsilon(); } } @@ -1029,7 +1033,14 @@ void BDCSVD::perturbCol0 std::cout << " " << "j=" << j << "\n"; } #endif - Index j = i= k && l == 0) { + m_info = NumericalIssue; + prod = 0; + break; + } + Index j = i 0 ? perm(l-1) : i; #ifdef EIGEN_BDCSVD_SANITY_CHECKS if(!(dk!=Literal(0) || diag(i)!=Literal(0))) { @@ -1242,8 +1253,8 @@ void BDCSVD::deflation(Eigen::Index firstCol, Eigen::Index lastCol, #endif { // Check for total deflation - // If we have a total deflation, then we have to consider col0(0)==diag(0) as a singular value during sorting - bool total_deflation = (col0.tail(length-1).array()::compute(const MatrixType& matrix, unsig if (!(numext::isfinite)(scale)) { m_isInitialized = true; m_info = InvalidInput; + m_nonzeroSingularValues = 0; return *this; } if(scale==RealScalar(0)) scale = RealScalar(1); diff --git a/Eigen/src/SVD/UpperBidiagonalization.h b/Eigen/src/SVD/UpperBidiagonalization.h index 997defc..a5b2f60 100644 --- a/Eigen/src/SVD/UpperBidiagonalization.h +++ b/Eigen/src/SVD/UpperBidiagonalization.h @@ -161,13 +161,14 @@ void upperbidiagonalization_blocked_helper(MatrixType& A, typedef typename MatrixType::Scalar Scalar; typedef typename MatrixType::RealScalar RealScalar; typedef typename NumTraits::Literal Literal; - enum { StorageOrder = traits::Flags & RowMajorBit }; - typedef InnerStride ColInnerStride; - typedef InnerStride RowInnerStride; + static const int StorageOrder = + (traits::Flags & RowMajorBit) ? RowMajor : ColMajor; + typedef InnerStride ColInnerStride; + typedef InnerStride RowInnerStride; typedef Ref, 0, ColInnerStride> SubColumnType; typedef Ref, 0, RowInnerStride> SubRowType; typedef Ref > SubMatType; - + Index brows = A.rows(); Index bcols = A.cols(); @@ -293,7 +294,7 @@ void upperbidiagonalization_inplace_blocked(MatrixType& A, BidiagType& bidiagona Index size = (std::min)(rows, cols); // X and Y are work space - enum { StorageOrder = traits::Flags & RowMajorBit }; + enum { StorageOrder = (traits::Flags & RowMajorBit) ? RowMajor : ColMajor }; Matrix, IteratorBa enum { IsRowMajor = XprType::IsRowMajor, - - OuterVector = (BlockCols==1 && ArgType::IsRowMajor) - | // FIXME | instead of || to please GCC 4.4.0 stupid warning "suggest parentheses around &&". - // revert to || as soon as not needed anymore. 
- (BlockRows==1 && !ArgType::IsRowMajor), - + OuterVector = (BlockCols == 1 && ArgType::IsRowMajor) || (BlockRows == 1 && !ArgType::IsRowMajor), CoeffReadCost = evaluator::CoeffReadCost, Flags = XprType::Flags }; diff --git a/Eigen/src/SparseCore/SparseMap.h b/Eigen/src/SparseCore/SparseMap.h index f99be33..6247d79 100644 --- a/Eigen/src/SparseCore/SparseMap.h +++ b/Eigen/src/SparseCore/SparseMap.h @@ -237,6 +237,7 @@ class Map /** Constructs a read-write Map to a sparse matrix of size \a rows x \a cols, containing \a nnz non-zero coefficients, * stored as a sparse format as defined by the pointers \a outerIndexPtr, \a innerIndexPtr, and \a valuePtr. * If the optional parameter \a innerNonZerosPtr is the null pointer, then a standard compressed format is assumed. + * The inner indices must be sorted appropriately. * * This constructor is available only if \c SparseMatrixType is non-const. * diff --git a/Eigen/src/SparseCore/SparseMatrix.h b/Eigen/src/SparseCore/SparseMatrix.h index 616b4a0..5522769 100644 --- a/Eigen/src/SparseCore/SparseMatrix.h +++ b/Eigen/src/SparseCore/SparseMatrix.h @@ -781,18 +781,17 @@ class SparseMatrix return *this; } -#ifndef EIGEN_PARSED_BY_DOXYGEN template inline SparseMatrix& operator=(const EigenBase& other) { return Base::operator=(other.derived()); } template inline SparseMatrix& operator=(const Product& other); -#endif // EIGEN_PARSED_BY_DOXYGEN template EIGEN_DONT_INLINE SparseMatrix& operator=(const SparseMatrixBase& other); +#ifndef EIGEN_NO_IO friend std::ostream & operator << (std::ostream & s, const SparseMatrix& m) { EIGEN_DBG_SPARSE( @@ -837,6 +836,7 @@ class SparseMatrix s << static_cast&>(m); return s; } +#endif /** Destructor */ inline ~SparseMatrix() diff --git a/Eigen/src/SparseCore/SparseMatrixBase.h b/Eigen/src/SparseCore/SparseMatrixBase.h index 229449f..417a236 100644 --- a/Eigen/src/SparseCore/SparseMatrixBase.h +++ b/Eigen/src/SparseCore/SparseMatrixBase.h @@ -113,7 +113,7 @@ template class SparseMatrixBase Transpose >::type AdjointReturnType; typedef Transpose TransposeReturnType; - typedef typename internal::add_const >::type ConstTransposeReturnType; + typedef Transpose ConstTransposeReturnType; // FIXME storage order do not match evaluator storage order typedef SparseMatrix PlainObject; @@ -214,7 +214,7 @@ template class SparseMatrixBase inline void assignGeneric(const OtherDerived& other); public: - +#ifndef EIGEN_NO_IO friend std::ostream & operator << (std::ostream & s, const SparseMatrixBase& m) { typedef typename Derived::Nested Nested; @@ -263,6 +263,7 @@ template class SparseMatrixBase } return s; } +#endif template Derived& operator+=(const SparseMatrixBase& other); diff --git a/Eigen/src/SparseCore/SparseProduct.h b/Eigen/src/SparseCore/SparseProduct.h index af8a774..f55e8ff 100644 --- a/Eigen/src/SparseCore/SparseProduct.h +++ b/Eigen/src/SparseCore/SparseProduct.h @@ -165,6 +165,7 @@ protected: } // end namespace internal // sparse matrix = sparse-product (can be sparse*sparse, sparse*perm, etc.) 
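The added sentence in the Map<SparseMatrix> constructor documentation above requires the inner indices to be sorted within each outer vector. A hedged usage sketch mapping existing compressed-column-storage arrays (the values here are purely illustrative):

#include <Eigen/SparseCore>

int main() {
  // 3x3 matrix [1 0 2; 0 3 0; 0 0 4] in compressed column storage.
  int    outer[4] = {0, 1, 2, 4};   // column start offsets
  int    inner[4] = {0, 1, 0, 2};   // row indices, sorted within each column
  double vals[4]  = {1.0, 3.0, 2.0, 4.0};

  // Wrap the raw arrays without copying them.
  Eigen::Map<Eigen::SparseMatrix<double> > m(3, 3, 4, outer, inner, vals);
  return m.coeff(0, 2) == 2.0 ? 0 : 1;  // read back through the map
}

Omitting innerNonZerosPtr, which defaults to null, selects the standard compressed format described in the documentation.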
+ template template SparseMatrix& SparseMatrix::operator=(const Product& src) diff --git a/Eigen/src/SparseCore/SparseSparseProductWithPruning.h b/Eigen/src/SparseCore/SparseSparseProductWithPruning.h index 88820a4..25ce404 100644 --- a/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +++ b/Eigen/src/SparseCore/SparseSparseProductWithPruning.h @@ -90,9 +90,9 @@ struct sparse_sparse_product_with_pruning_selector::type _res(res.rows(), res.cols()); - internal::sparse_sparse_product_with_pruning_impl(lhs, rhs, _res, tolerance); - res.swap(_res); + typename remove_all::type res_(res.rows(), res.cols()); + internal::sparse_sparse_product_with_pruning_impl(lhs, rhs, res_, tolerance); + res.swap(res_); } }; @@ -104,9 +104,9 @@ struct sparse_sparse_product_with_pruning_selector SparseTemporaryType; - SparseTemporaryType _res(res.rows(), res.cols()); - internal::sparse_sparse_product_with_pruning_impl(lhs, rhs, _res, tolerance); - res = _res; + SparseTemporaryType res_(res.rows(), res.cols()); + internal::sparse_sparse_product_with_pruning_impl(lhs, rhs, res_, tolerance); + res = res_; } }; @@ -117,9 +117,9 @@ struct sparse_sparse_product_with_pruning_selector::type _res(res.rows(), res.cols()); - internal::sparse_sparse_product_with_pruning_impl(rhs, lhs, _res, tolerance); - res.swap(_res); + typename remove_all::type res_(res.rows(), res.cols()); + internal::sparse_sparse_product_with_pruning_impl(rhs, lhs, res_, tolerance); + res.swap(res_); } }; @@ -137,9 +137,9 @@ struct sparse_sparse_product_with_pruning_selector SparseTemporaryType; -// SparseTemporaryType _res(res.cols(), res.rows()); -// sparse_sparse_product_with_pruning_impl(rhs, lhs, _res); -// res = _res.transpose(); +// SparseTemporaryType res_(res.cols(), res.rows()); +// sparse_sparse_product_with_pruning_impl(rhs, lhs, res_); +// res = res_.transpose(); } }; diff --git a/Eigen/src/SparseCore/SparseVector.h b/Eigen/src/SparseCore/SparseVector.h index 05779be..106925b 100644 --- a/Eigen/src/SparseCore/SparseVector.h +++ b/Eigen/src/SparseCore/SparseVector.h @@ -329,6 +329,7 @@ class SparseVector } #endif +#ifndef EIGEN_NO_IO friend std::ostream & operator << (std::ostream & s, const SparseVector& m) { for (Index i=0; i } - Index count = 0; +// Index count = 0; // FIXME compute a reference value to filter zeros for (typename AmbiVector::Iterator it(tempVector/*,1e-12*/); it; ++it) { - ++ count; +// ++ count; // std::cerr << "fill " << it.index() << ", " << col << "\n"; // std::cout << it.value() << " "; // FIXME use insertBack diff --git a/Eigen/src/SparseLU/SparseLU.h b/Eigen/src/SparseLU/SparseLU.h index 0c8d893..6eb7950 100644 --- a/Eigen/src/SparseLU/SparseLU.h +++ b/Eigen/src/SparseLU/SparseLU.h @@ -35,9 +35,10 @@ public: MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime }; - SparseLUTransposeView() : m_sparseLU(NULL) {} - SparseLUTransposeView(const SparseLUTransposeView& view) { + SparseLUTransposeView() : APIBase(), m_sparseLU(NULL) {} + SparseLUTransposeView(const SparseLUTransposeView& view) : APIBase() { this->m_sparseLU = view.m_sparseLU; + this->m_isInitialized = view.m_isInitialized; } void setIsInitialized(const bool isInitialized) {this->m_isInitialized = isInitialized;} void setSparseLU(SparseLUType* sparseLU) {m_sparseLU = sparseLU;} @@ -752,10 +753,13 @@ void SparseLU::factorize(const MatrixType& matrix) info = Base::pivotL(jj, m_diagpivotthresh, m_perm_r.indices(), iperm_c.indices(), pivrow, m_glu); if ( info ) { - m_lastError = "THE MATRIX IS STRUCTURALLY SINGULAR ... 
ZERO COLUMN AT "; + m_lastError = "THE MATRIX IS STRUCTURALLY SINGULAR"; +#ifndef EIGEN_NO_IO std::ostringstream returnInfo; - returnInfo << info; + returnInfo << " ... ZERO COLUMN AT "; + returnInfo << info; m_lastError += returnInfo.str(); +#endif m_info = NumericalIssue; m_factorizationIsOk = false; return; @@ -830,7 +834,6 @@ struct SparseLUMatrixUReturnType : internal::no_assignment_operator template void solveInPlace(MatrixBase &X) const { Index nrhs = X.cols(); - Index n = X.rows(); // Backward solve with U for (Index k = m_mapL.nsuper(); k >= 0; k--) { @@ -850,7 +853,7 @@ struct SparseLUMatrixUReturnType : internal::no_assignment_operator { // FIXME: the following lines should use Block expressions and not Map! Map, 0, OuterStride<> > A( &(m_mapL.valuePtr()[luptr]), nsupc, nsupc, OuterStride<>(lda) ); - Map< Matrix, 0, OuterStride<> > U (&(X.coeffRef(fsupc,0)), nsupc, nrhs, OuterStride<>(n) ); + typename Dest::RowsBlockXpr U = X.derived().middleRows(fsupc, nsupc); U = A.template triangularView().solve(U); } @@ -873,7 +876,6 @@ struct SparseLUMatrixUReturnType : internal::no_assignment_operator { using numext::conj; Index nrhs = X.cols(); - Index n = X.rows(); // Forward solve with U for (Index k = 0; k <= m_mapL.nsuper(); k++) { @@ -904,7 +906,7 @@ struct SparseLUMatrixUReturnType : internal::no_assignment_operator else { Map, 0, OuterStride<> > A( &(m_mapL.valuePtr()[luptr]), nsupc, nsupc, OuterStride<>(lda) ); - Map< Matrix, 0, OuterStride<> > U (&(X(fsupc,0)), nsupc, nrhs, OuterStride<>(n) ); + typename Dest::RowsBlockXpr U = X.derived().middleRows(fsupc, nsupc); if(Conjugate) U = A.adjoint().template triangularView().solve(U); else diff --git a/Eigen/src/SparseLU/SparseLU_Structs.h b/Eigen/src/SparseLU/SparseLU_Structs.h index cf5ec44..16a0c41 100644 --- a/Eigen/src/SparseLU/SparseLU_Structs.h +++ b/Eigen/src/SparseLU/SparseLU_Structs.h @@ -70,8 +70,8 @@ #define EIGEN_LU_STRUCTS namespace Eigen { namespace internal { - -typedef enum {LUSUP, UCOL, LSUB, USUB, LLVL, ULVL} MemType; + +enum MemType {LUSUP, UCOL, LSUB, USUB, LLVL, ULVL}; template struct LU_GlobalLU_t { diff --git a/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h b/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h index 0be293d..fd5e9fa 100644 --- a/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +++ b/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h @@ -274,9 +274,8 @@ void MappedSuperNodalMatrix::solveInPlace( MatrixBase&X) co // Triangular solve Map, 0, OuterStride<> > A( &(Lval[luptr]), nsupc, nsupc, OuterStride<>(lda) ); - Map< Matrix, 0, OuterStride<> > U (&(X(fsupc,0)), nsupc, nrhs, OuterStride<>(n) ); - U = A.template triangularView().solve(U); - + typename Dest::RowsBlockXpr U = X.derived().middleRows(fsupc, nsupc); + U = A.template triangularView().solve(U); // Matrix-vector product new (&A) Map, 0, OuterStride<> > ( &(Lval[luptr+nsupc]), nrow, nsupc, OuterStride<>(lda) ); work.topRows(nrow).noalias() = A * U; @@ -349,7 +348,7 @@ void MappedSuperNodalMatrix::solveTransposedInPlace( MatrixBase, 0, OuterStride<> > A( &(Lval[luptr+nsupc]), nrow, nsupc, OuterStride<>(lda) ); - Map< Matrix, 0, OuterStride<> > U (&(X(fsupc,0)), nsupc, nrhs, OuterStride<>(n) ); + typename Dest::RowsBlockXpr U = X.derived().middleRows(fsupc, nsupc); if(Conjugate) U = U - A.adjoint() * work.topRows(nrow); else diff --git a/Eigen/src/SparseLU/SparseLU_gemm_kernel.h b/Eigen/src/SparseLU/SparseLU_gemm_kernel.h deleted file mode 100644 index e37c2fe..0000000 --- a/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +++ /dev/null @@ -1,280 +0,0 @@ -// 
This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2012 Gael Guennebaud -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_SPARSELU_GEMM_KERNEL_H -#define EIGEN_SPARSELU_GEMM_KERNEL_H - -namespace Eigen { - -namespace internal { - - -/** \internal - * A general matrix-matrix product kernel optimized for the SparseLU factorization. - * - A, B, and C must be column major - * - lda and ldc must be multiples of the respective packet size - * - C must have the same alignment as A - */ -template -EIGEN_DONT_INLINE -void sparselu_gemm(Index m, Index n, Index d, const Scalar* A, Index lda, const Scalar* B, Index ldb, Scalar* C, Index ldc) -{ - using namespace Eigen::internal; - - typedef typename packet_traits::type Packet; - enum { - NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, - PacketSize = packet_traits::size, - PM = 8, // peeling in M - RN = 2, // register blocking - RK = NumberOfRegisters>=16 ? 4 : 2, // register blocking - BM = 4096/sizeof(Scalar), // number of rows of A-C per chunk - SM = PM*PacketSize // step along M - }; - Index d_end = (d/RK)*RK; // number of columns of A (rows of B) suitable for full register blocking - Index n_end = (n/RN)*RN; // number of columns of B-C suitable for processing RN columns at once - Index i0 = internal::first_default_aligned(A,m); - - eigen_internal_assert(((lda%PacketSize)==0) && ((ldc%PacketSize)==0) && (i0==internal::first_default_aligned(C,m))); - - // handle the non aligned rows of A and C without any optimization: - for(Index i=0; i(BM, m-ib); // actual number of rows - Index actual_b_end1 = (actual_b/SM)*SM; // actual number of rows suitable for peeling - Index actual_b_end2 = (actual_b/PacketSize)*PacketSize; // actual number of rows suitable for vectorization - - // Let's process two columns of B-C at once - for(Index j=0; j(Bc0[0]); } - { b10 = pset1(Bc0[1]); } - if(RK==4) { b20 = pset1(Bc0[2]); } - if(RK==4) { b30 = pset1(Bc0[3]); } - { b01 = pset1(Bc1[0]); } - { b11 = pset1(Bc1[1]); } - if(RK==4) { b21 = pset1(Bc1[2]); } - if(RK==4) { b31 = pset1(Bc1[3]); } - - Packet a0, a1, a2, a3, c0, c1, t0, t1; - - const Scalar* A0 = A+ib+(k+0)*lda; - const Scalar* A1 = A+ib+(k+1)*lda; - const Scalar* A2 = A+ib+(k+2)*lda; - const Scalar* A3 = A+ib+(k+3)*lda; - - Scalar* C0 = C+ib+(j+0)*ldc; - Scalar* C1 = C+ib+(j+1)*ldc; - - a0 = pload(A0); - a1 = pload(A1); - if(RK==4) - { - a2 = pload(A2); - a3 = pload(A3); - } - else - { - // workaround "may be used uninitialized in this function" warning - a2 = a3 = a0; - } - -#define KMADD(c, a, b, tmp) {tmp = b; tmp = pmul(a,tmp); c = padd(c,tmp);} -#define WORK(I) \ - c0 = pload(C0+i+(I)*PacketSize); \ - c1 = pload(C1+i+(I)*PacketSize); \ - KMADD(c0, a0, b00, t0) \ - KMADD(c1, a0, b01, t1) \ - a0 = pload(A0+i+(I+1)*PacketSize); \ - KMADD(c0, a1, b10, t0) \ - KMADD(c1, a1, b11, t1) \ - a1 = pload(A1+i+(I+1)*PacketSize); \ - if(RK==4){ KMADD(c0, a2, b20, t0) }\ - if(RK==4){ KMADD(c1, a2, b21, t1) }\ - if(RK==4){ a2 = pload(A2+i+(I+1)*PacketSize); }\ - if(RK==4){ KMADD(c0, a3, b30, t0) }\ - if(RK==4){ KMADD(c1, a3, b31, t1) }\ - if(RK==4){ a3 = pload(A3+i+(I+1)*PacketSize); }\ - pstore(C0+i+(I)*PacketSize, c0); \ - pstore(C1+i+(I)*PacketSize, c1) - - // process rows of A' - C' with aggressive vectorization and peeling - for(Index i=0; i0) - { - const Scalar* Bc0 = 
B+(n-1)*ldb; - - for(Index k=0; k(Bc0[0]); - b10 = pset1(Bc0[1]); - if(RK==4) b20 = pset1(Bc0[2]); - if(RK==4) b30 = pset1(Bc0[3]); - - Packet a0, a1, a2, a3, c0, t0/*, t1*/; - - const Scalar* A0 = A+ib+(k+0)*lda; - const Scalar* A1 = A+ib+(k+1)*lda; - const Scalar* A2 = A+ib+(k+2)*lda; - const Scalar* A3 = A+ib+(k+3)*lda; - - Scalar* C0 = C+ib+(n_end)*ldc; - - a0 = pload(A0); - a1 = pload(A1); - if(RK==4) - { - a2 = pload(A2); - a3 = pload(A3); - } - else - { - // workaround "may be used uninitialized in this function" warning - a2 = a3 = a0; - } - -#define WORK(I) \ - c0 = pload(C0+i+(I)*PacketSize); \ - KMADD(c0, a0, b00, t0) \ - a0 = pload(A0+i+(I+1)*PacketSize); \ - KMADD(c0, a1, b10, t0) \ - a1 = pload(A1+i+(I+1)*PacketSize); \ - if(RK==4){ KMADD(c0, a2, b20, t0) }\ - if(RK==4){ a2 = pload(A2+i+(I+1)*PacketSize); }\ - if(RK==4){ KMADD(c0, a3, b30, t0) }\ - if(RK==4){ a3 = pload(A3+i+(I+1)*PacketSize); }\ - pstore(C0+i+(I)*PacketSize, c0); - - // aggressive vectorization and peeling - for(Index i=0; i0) - { - for(Index j=0; j1 ? Aligned : 0 - }; - typedef Map, Alignment > MapVector; - typedef Map, Alignment > ConstMapVector; - if(rd==1) MapVector(C+j*ldc+ib,actual_b) += B[0+d_end+j*ldb] * ConstMapVector(A+(d_end+0)*lda+ib, actual_b); - - else if(rd==2) MapVector(C+j*ldc+ib,actual_b) += B[0+d_end+j*ldb] * ConstMapVector(A+(d_end+0)*lda+ib, actual_b) - + B[1+d_end+j*ldb] * ConstMapVector(A+(d_end+1)*lda+ib, actual_b); - - else MapVector(C+j*ldc+ib,actual_b) += B[0+d_end+j*ldb] * ConstMapVector(A+(d_end+0)*lda+ib, actual_b) - + B[1+d_end+j*ldb] * ConstMapVector(A+(d_end+1)*lda+ib, actual_b) - + B[2+d_end+j*ldb] * ConstMapVector(A+(d_end+2)*lda+ib, actual_b); - } - } - - } // blocking on the rows of A and C -} -#undef KMADD - -} // namespace internal - -} // namespace Eigen - -#endif // EIGEN_SPARSELU_GEMM_KERNEL_H diff --git a/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h b/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h index 6f75d50..7aecbca 100644 --- a/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +++ b/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h @@ -75,8 +75,6 @@ void SparseLUImpl::heap_relax_snode (const Index n, IndexVe // Identify the relaxed supernodes by postorder traversal of the etree Index snode_start; // beginning of a snode StorageIndex k; - Index nsuper_et_post = 0; // Number of relaxed snodes in postordered etree - Index nsuper_et = 0; // Number of relaxed snodes in the original etree StorageIndex l; for (j = 0; j < n; ) { @@ -88,7 +86,6 @@ void SparseLUImpl::heap_relax_snode (const Index n, IndexVe parent = et(j); } // Found a supernode in postordered etree, j is the last column - ++nsuper_et_post; k = StorageIndex(n); for (Index i = snode_start; i <= j; ++i) k = (std::min)(k, inv_post(i)); @@ -97,7 +94,6 @@ void SparseLUImpl::heap_relax_snode (const Index n, IndexVe { // This is also a supernode in the original etree relax_end(k) = l; // Record last column - ++nsuper_et; } else { @@ -107,7 +103,6 @@ void SparseLUImpl::heap_relax_snode (const Index n, IndexVe if (descendants(i) == 0) { relax_end(l) = l; - ++nsuper_et; } } } diff --git a/Eigen/src/SparseLU/SparseLU_kernel_bmod.h b/Eigen/src/SparseLU/SparseLU_kernel_bmod.h index 8c1b3e8..7a101ea 100644 --- a/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +++ b/Eigen/src/SparseLU/SparseLU_kernel_bmod.h @@ -69,8 +69,7 @@ EIGEN_DONT_INLINE void LU_kernel_bmod::run(const Index seg Index aligned_with_B_offset = (PacketSize-internal::first_default_aligned(B.data(), PacketSize))%PacketSize; Map, 0, OuterStride<> > 
l(tempv.data()+segsize+aligned_offset+aligned_with_B_offset, nrow, OuterStride<>(ldl) ); - l.setZero(); - internal::sparselu_gemm(l.rows(), l.cols(), B.cols(), B.data(), B.outerStride(), u.data(), u.outerStride(), l.data(), l.outerStride()); + l.noalias() = B * u; // Scatter tempv[] into SPA dense[] as a temporary storage isub = lptr + no_zeros; diff --git a/Eigen/src/SparseLU/SparseLU_panel_bmod.h b/Eigen/src/SparseLU/SparseLU_panel_bmod.h index f052001..92cdb0e 100644 --- a/Eigen/src/SparseLU/SparseLU_panel_bmod.h +++ b/Eigen/src/SparseLU/SparseLU_panel_bmod.h @@ -148,8 +148,7 @@ void SparseLUImpl::panel_bmod(const Index m, const Index w, Index offset = (PacketSize-internal::first_default_aligned(B.data(), PacketSize)) % PacketSize; MappedMatrixBlock L(tempv.data()+w*ldu+offset, nrow, u_cols, OuterStride<>(ldl)); - L.setZero(); - internal::sparselu_gemm(L.rows(), L.cols(), B.cols(), B.data(), B.outerStride(), U.data(), U.outerStride(), L.data(), L.outerStride()); + L.noalias() = B * U; // scatter U and L u_col = 0; diff --git a/Eigen/src/SparseQR/SparseQR.h b/Eigen/src/SparseQR/SparseQR.h index d1fb96f..07802f4 100644 --- a/Eigen/src/SparseQR/SparseQR.h +++ b/Eigen/src/SparseQR/SparseQR.h @@ -69,7 +69,7 @@ namespace internal { * detailed in the following paper: * * Tim Davis, "Algorithm 915, SuiteSparseQR: Multifrontal Multithreaded Rank-Revealing - * Sparse QR Factorization, ACM Trans. on Math. Soft. 38(1), 2011. + * Sparse QR Factorization", ACM Trans. on Math. Soft. 38(1), 2011. * * Even though it is qualified as "rank-revealing", this strategy might fail for some * rank deficient problems. When this class is used to solve linear or least-square problems diff --git a/Eigen/src/plugins/ArrayCwiseBinaryOps.h b/Eigen/src/plugins/ArrayCwiseBinaryOps.h index 0e5d544..6a79563 100644 --- a/Eigen/src/plugins/ArrayCwiseBinaryOps.h +++ b/Eigen/src/plugins/ArrayCwiseBinaryOps.h @@ -30,15 +30,40 @@ operator/(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const * * \sa max() */ -EIGEN_MAKE_CWISE_BINARY_OP(min,min) +template +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const OtherDerived> +#ifdef EIGEN_PARSED_BY_DOXYGEN +min +#else +(min) +#endif +(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const +{ + return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); +} + +template +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const OtherDerived> +#ifdef EIGEN_PARSED_BY_DOXYGEN +min +#else +(min) +#endif +(const OtherDerived &other) const +{ + return (min)(other); +} /** \returns an expression of the coefficient-wise min of \c *this and scalar \a other * * \sa max() */ +template EIGEN_DEVICE_FUNC -EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, - const CwiseNullaryOp, PlainObject> > +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, + const CwiseNullaryOp, PlainObject> > #ifdef EIGEN_PARSED_BY_DOXYGEN min #else @@ -46,7 +71,20 @@ min #endif (const Scalar &other) const { - return (min)(Derived::PlainObject::Constant(rows(), cols(), other)); + return (min)(Derived::PlainObject::Constant(rows(), cols(), other)); +} + +EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, + const CwiseNullaryOp, PlainObject> > +#ifdef EIGEN_PARSED_BY_DOXYGEN +min +#else +(min) +#endif +(const Scalar &other) const +{ + return (min)(Derived::PlainObject::Constant(rows(), cols(), other)); } /** \returns an expression of the coefficient-wise max of \c *this and \a other @@ -56,14 +94,39 @@ min * * \sa 
min() */ -EIGEN_MAKE_CWISE_BINARY_OP(max,max) +template +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const OtherDerived> +#ifdef EIGEN_PARSED_BY_DOXYGEN +max +#else +(max) +#endif +(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const +{ + return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); +} + +template +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const OtherDerived> +#ifdef EIGEN_PARSED_BY_DOXYGEN +max +#else +(max) +#endif +(const OtherDerived &other) const +{ + return (max)(other); +} /** \returns an expression of the coefficient-wise max of \c *this and scalar \a other * * \sa min() */ +template EIGEN_DEVICE_FUNC -EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const CwiseNullaryOp, PlainObject> > #ifdef EIGEN_PARSED_BY_DOXYGEN max @@ -72,13 +135,23 @@ max #endif (const Scalar &other) const { - return (max)(Derived::PlainObject::Constant(rows(), cols(), other)); + return (max)(Derived::PlainObject::Constant(rows(), cols(), other)); +} + +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, + const CwiseNullaryOp, PlainObject> > +#ifdef EIGEN_PARSED_BY_DOXYGEN +max +#else +(max) +#endif +(const Scalar &other) const +{ + return (max)(Derived::PlainObject::Constant(rows(), cols(), other)); } /** \returns an expression of the coefficient-wise absdiff of \c *this and \a other - * - * Example: \include Cwise_absolute_difference.cpp - * Output: \verbinclude Cwise_absolute_difference.out * * \sa absolute_difference() */ diff --git a/Eigen/src/plugins/IndexedViewMethods.h b/Eigen/src/plugins/IndexedViewMethods.h index 5bfb19a..15c35b0 100644 --- a/Eigen/src/plugins/IndexedViewMethods.h +++ b/Eigen/src/plugins/IndexedViewMethods.h @@ -90,8 +90,8 @@ operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_IND return BlockType(derived(), internal::first(actualRowIndices), internal::first(actualColIndices), - internal::size(actualRowIndices), - internal::size(actualColIndices)); + internal::index_list_size(actualRowIndices), + internal::index_list_size(actualColIndices)); } // The following overload returns a Scalar @@ -168,7 +168,7 @@ operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) typename IvcType::type actualIndices = ivcSize(indices); return VectorBlock::value> - (derived(), internal::first(actualIndices), internal::size(actualIndices)); + (derived(), internal::first(actualIndices), internal::index_list_size(actualIndices)); } template diff --git a/Eigen/src/plugins/MatrixCwiseBinaryOps.h b/Eigen/src/plugins/MatrixCwiseBinaryOps.h index a0feef8..514d83a 100644 --- a/Eigen/src/plugins/MatrixCwiseBinaryOps.h +++ b/Eigen/src/plugins/MatrixCwiseBinaryOps.h @@ -72,23 +72,39 @@ cwiseNotEqual(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const * * \sa class CwiseBinaryOp, max() */ -template +template EIGEN_DEVICE_FUNC -EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const OtherDerived> +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const OtherDerived> cwiseMin(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const { - return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); +} + +template +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const OtherDerived> +cwiseMin(const 
EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const +{ + return cwiseMin(other); } /** \returns an expression of the coefficient-wise min of *this and scalar \a other * * \sa class CwiseBinaryOp, min() */ +template EIGEN_DEVICE_FUNC -EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const ConstantReturnType> +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const ConstantReturnType> cwiseMin(const Scalar &other) const { - return cwiseMin(Derived::Constant(rows(), cols(), other)); + return cwiseMin(Derived::Constant(rows(), cols(), other)); +} + +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const ConstantReturnType> +cwiseMin(const Scalar &other) const +{ + return cwiseMin(Derived::Constant(rows(), cols(), other)); } /** \returns an expression of the coefficient-wise max of *this and \a other @@ -98,23 +114,39 @@ cwiseMin(const Scalar &other) const * * \sa class CwiseBinaryOp, min() */ -template +template EIGEN_DEVICE_FUNC -EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const OtherDerived> +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const OtherDerived> cwiseMax(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const { - return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); +} + +template +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const OtherDerived> +cwiseMax(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const +{ + return cwiseMax(other); } /** \returns an expression of the coefficient-wise max of *this and scalar \a other * * \sa class CwiseBinaryOp, min() */ +template EIGEN_DEVICE_FUNC -EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const ConstantReturnType> +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const ConstantReturnType> cwiseMax(const Scalar &other) const { - return cwiseMax(Derived::Constant(rows(), cols(), other)); + return cwiseMax(Derived::Constant(rows(), cols(), other)); +} + +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const ConstantReturnType> +cwiseMax(const Scalar &other) const +{ + return cwiseMax(Derived::Constant(rows(), cols(), other)); }