Extern:
- Eigen upgraded to version 3.4.1.
@@ -22,7 +22,7 @@ extern "C" {
 * This module provides an interface to the Cholmod library which is part of the <a href="http://www.suitesparse.com">suitesparse</a> package.
 * It provides the two following main factorization classes:
 * - class CholmodSupernodalLLT: a supernodal LLT Cholesky factorization.
-* - class CholmodDecomposiiton: a general L(D)LT Cholesky factorization with automatic or explicit runtime selection of the underlying factorization method (supernodal or simplicial).
+* - class CholmodDecomposition: a general L(D)LT Cholesky factorization with automatic or explicit runtime selection of the underlying factorization method (supernodal or simplicial).
 *
 * For the sake of completeness, this module also propose the two following classes:
 * - class CholmodSimplicialLLT
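Note: the class name fixed in the documentation above, CholmodDecomposition, is the user-facing entry point of the CholmodSupport module. A minimal usage sketch, assuming SuiteSparse/CHOLMOD is installed and linked (matrix and vector names are illustrative, not part of the commit):

    #include <Eigen/Sparse>
    #include <Eigen/CholmodSupport>

    // Solve A*x = b for a symmetric positive definite sparse matrix A.
    Eigen::SparseMatrix<double> A;          // filled elsewhere, SPD
    Eigen::VectorXd b;                      // right-hand side
    Eigen::CholmodDecomposition<Eigen::SparseMatrix<double>> solver;
    solver.compute(A);                      // analyze + factorize (supernodal or simplicial chosen at runtime)
    Eigen::VectorXd x = solver.solve(b);    // back-substitution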
@@ -83,8 +83,8 @@
 #include <cmath>
 #include <cassert>
 #include <functional>
-#include <sstream>
 #ifndef EIGEN_NO_IO
+#include <sstream>
 #include <iosfwd>
 #endif
 #include <cstring>
@@ -109,7 +109,8 @@
 #endif

 // required for __cpuid, needs to be included after cmath
-#if EIGEN_COMP_MSVC && EIGEN_ARCH_i386_OR_x86_64 && !EIGEN_OS_WINCE
+// also required for _BitScanReverse on Windows on ARM
+#if EIGEN_COMP_MSVC && (EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM64) && !EIGEN_OS_WINCE
 #include <intrin.h>
 #endif

@@ -346,7 +347,7 @@ using std::ptrdiff_t;
 #include "src/Core/CoreIterators.h"
 #include "src/Core/ConditionEstimator.h"

-#if defined(EIGEN_VECTORIZE_ALTIVEC) || defined(EIGEN_VECTORIZE_VSX)
+#if defined(EIGEN_VECTORIZE_VSX)
 #include "src/Core/arch/AltiVec/MatrixProduct.h"
 #elif defined EIGEN_VECTORIZE_NEON
 #include "src/Core/arch/NEON/GeneralBlockPanelKernel.h"
@@ -25,8 +25,6 @@

 #include "src/Core/util/DisableStupidWarnings.h"

-#include "src/SparseLU/SparseLU_gemm_kernel.h"
-
 #include "src/SparseLU/SparseLU_Structs.h"
 #include "src/SparseLU/SparseLU_SupernodalMatrix.h"
 #include "src/SparseLU/SparseLUImpl.h"
@@ -172,7 +172,8 @@ seqN(FirstType first, SizeType size) {
 return ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,typename internal::cleanup_index_type<SizeType>::type>(first,size);
 }

-#ifdef EIGEN_PARSED_BY_DOXYGEN
+#if EIGEN_HAS_CXX11

 /** \returns an ArithmeticSequence starting at \a f, up (or down) to \a l, and with positive (or negative) increment \a incr
 *
@@ -183,24 +184,6 @@ seqN(FirstType first, SizeType size) {
 *
 * \sa seqN(FirstType,SizeType,IncrType), seq(FirstType,LastType)
 */
-template<typename FirstType,typename LastType, typename IncrType>
-auto seq(FirstType f, LastType l, IncrType incr);
-
-/** \returns an ArithmeticSequence starting at \a f, up (or down) to \a l, and unit increment
-*
-* It is essentially an alias to:
-* \code
-* seqN(f,l-f+1);
-* \endcode
-*
-* \sa seqN(FirstType,SizeType), seq(FirstType,LastType,IncrType)
-*/
-template<typename FirstType,typename LastType>
-auto seq(FirstType f, LastType l);
-
-#else // EIGEN_PARSED_BY_DOXYGEN
-
-#if EIGEN_HAS_CXX11
 template<typename FirstType,typename LastType>
 auto seq(FirstType f, LastType l) -> decltype(seqN(typename internal::cleanup_index_type<FirstType>::type(f),
 ( typename internal::cleanup_index_type<LastType>::type(l)
@@ -211,6 +194,15 @@ auto seq(FirstType f, LastType l) -> decltype(seqN(typename internal::cleanup_in
 -typename internal::cleanup_index_type<FirstType>::type(f)+fix<1>()));
 }

+/** \returns an ArithmeticSequence starting at \a f, up (or down) to \a l, and unit increment
+*
+* It is essentially an alias to:
+* \code
+* seqN(f,l-f+1);
+* \endcode
+*
+* \sa seqN(FirstType,SizeType), seq(FirstType,LastType,IncrType)
+*/
 template<typename FirstType,typename LastType, typename IncrType>
 auto seq(FirstType f, LastType l, IncrType incr)
 -> decltype(seqN(typename internal::cleanup_index_type<FirstType>::type(f),
@@ -317,26 +309,12 @@ seq(const symbolic::BaseExpr<FirstTypeDerived> &f, const symbolic::BaseExpr<Last
 }
 #endif // EIGEN_HAS_CXX11

-#endif // EIGEN_PARSED_BY_DOXYGEN
+#if EIGEN_HAS_CXX11


-#if EIGEN_HAS_CXX11 || defined(EIGEN_PARSED_BY_DOXYGEN)
-/** \cpp11
-* \returns a symbolic ArithmeticSequence representing the last \a size elements with increment \a incr.
-*
-* It is a shortcut for: \code seqN(last-(size-fix<1>)*incr, size, incr) \endcode
-*
-* \sa lastN(SizeType), seqN(FirstType,SizeType), seq(FirstType,LastType,IncrType) */
-template<typename SizeType,typename IncrType>
-auto lastN(SizeType size, IncrType incr)
--> decltype(seqN(Eigen::last-(size-fix<1>())*incr, size, incr))
-{
-return seqN(Eigen::last-(size-fix<1>())*incr, size, incr);
-}

 /** \cpp11
 * \returns a symbolic ArithmeticSequence representing the last \a size elements with a unit increment.
 *
+* \anchor indexing_lastN
+*
 * It is a shortcut for: \code seq(last+fix<1>-size, last) \endcode
 *
 * \sa lastN(SizeType,IncrType, seqN(FirstType,SizeType), seq(FirstType,LastType) */
@@ -346,6 +324,21 @@ auto lastN(SizeType size)
 {
 return seqN(Eigen::last+fix<1>()-size, size);
 }

+/** \cpp11
+* \returns a symbolic ArithmeticSequence representing the last \a size elements with increment \a incr.
+*
+* \anchor indexing_lastN_with_incr
+*
+* It is a shortcut for: \code seqN(last-(size-fix<1>)*incr, size, incr) \endcode
+*
+* \sa lastN(SizeType), seqN(FirstType,SizeType), seq(FirstType,LastType,IncrType) */
+template<typename SizeType,typename IncrType>
+auto lastN(SizeType size, IncrType incr)
+-> decltype(seqN(Eigen::last-(size-fix<1>())*incr, size, incr))
+{
+return seqN(Eigen::last-(size-fix<1>())*incr, size, incr);
+}
 #endif

 namespace internal {
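Note: seq, seqN and lastN are the public slicing helpers whose documentation is rearranged above; the hunks mainly move the declarations out of the Doxygen-only branch. A minimal usage sketch (matrix contents are illustrative):

    #include <Eigen/Dense>
    using Eigen::seq; using Eigen::seqN; using Eigen::lastN; using Eigen::last; using Eigen::fix;

    Eigen::MatrixXd A = Eigen::MatrixXd::Random(6, 6);
    auto top     = A(seq(0, 2), Eigen::all);     // rows 0..2, every column
    auto evenRow = A(seqN(0, 3, fix<2>), 0);     // rows 0, 2, 4 of column 0
    auto corner  = A(lastN(2), lastN(2));        // bottom-right 2x2 corner
    auto lastRow = A(last, Eigen::all);          // last row, via the symbolic "last"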
@@ -163,7 +163,15 @@ class Array
 #endif

 #if EIGEN_HAS_CXX11
-/** \copydoc PlainObjectBase(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args)
+/** \brief Construct a row of column vector with fixed size from an arbitrary number of coefficients. \cpp11
+*
+* \only_for_vectors
+*
+* This constructor is for 1D array or vectors with more than 4 coefficients.
+* There exists C++98 analogue constructors for fixed-size array/vector having 1, 2, 3, or 4 coefficients.
+*
+* \warning To construct a column (resp. row) vector of fixed length, the number of values passed to this
+* constructor must match the the fixed number of rows (resp. columns) of \c *this.
 *
 * Example: \include Array_variadic_ctor_cxx11.cpp
 * Output: \verbinclude Array_variadic_ctor_cxx11.out
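Note: the variadic constructor documented above takes one scalar per coefficient of a fixed-size vector type. A minimal sketch (type and values are illustrative):

    #include <Eigen/Dense>

    // Fixed-size 1D array with more than 4 coefficients: one argument per entry.
    Eigen::Array<double, 5, 1> a(1.0, 2.0, 3.0, 4.0, 5.0);
    // The argument count must match the fixed size, otherwise compilation fails.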
@@ -260,19 +260,19 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H
 }

 template<int LoadMode>
-inline PacketScalar packet(Index rowId, Index colId) const
+EIGEN_DEVICE_FUNC inline PacketScalar packet(Index rowId, Index colId) const
 {
 return m_xpr.template packet<Unaligned>(rowId + m_startRow.value(), colId + m_startCol.value());
 }

 template<int LoadMode>
-inline void writePacket(Index rowId, Index colId, const PacketScalar& val)
+EIGEN_DEVICE_FUNC inline void writePacket(Index rowId, Index colId, const PacketScalar& val)
 {
 m_xpr.template writePacket<Unaligned>(rowId + m_startRow.value(), colId + m_startCol.value(), val);
 }

 template<int LoadMode>
-inline PacketScalar packet(Index index) const
+EIGEN_DEVICE_FUNC inline PacketScalar packet(Index index) const
 {
 return m_xpr.template packet<Unaligned>
 (m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
@@ -280,7 +280,7 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H
 }

 template<int LoadMode>
-inline void writePacket(Index index, const PacketScalar& val)
+EIGEN_DEVICE_FUNC inline void writePacket(Index index, const PacketScalar& val)
 {
 m_xpr.template writePacket<Unaligned>
 (m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
@@ -334,6 +334,17 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
 enum {
 XprTypeIsRowMajor = (int(traits<XprType>::Flags)&RowMajorBit) != 0
 };

+/** \internal Returns base+offset (unless base is null, in which case returns null).
+* Adding an offset to nullptr is undefined behavior, so we must avoid it.
+*/
+template <typename Scalar>
+EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE
+static Scalar* add_to_nullable_pointer(Scalar* base, Index offset)
+{
+return base != NULL ? base+offset : NULL;
+}
+
 public:

 typedef MapBase<BlockType> Base;
@@ -344,8 +355,9 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
 */
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 BlockImpl_dense(XprType& xpr, Index i)
-: Base(xpr.data() + i * ( ((BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) && (!XprTypeIsRowMajor))
-|| ((BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) && ( XprTypeIsRowMajor)) ? xpr.innerStride() : xpr.outerStride()),
+: Base((BlockRows == 0 || BlockCols == 0) ? NULL : add_to_nullable_pointer(xpr.data(),
+i * ( ((BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) && (!XprTypeIsRowMajor))
+|| ((BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) && ( XprTypeIsRowMajor)) ? xpr.innerStride() : xpr.outerStride())),
 BlockRows==1 ? 1 : xpr.rows(),
 BlockCols==1 ? 1 : xpr.cols()),
 m_xpr(xpr),
@@ -359,7 +371,8 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
 */
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 BlockImpl_dense(XprType& xpr, Index startRow, Index startCol)
-: Base(xpr.data()+xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol)),
+: Base((BlockRows == 0 || BlockCols == 0) ? NULL : add_to_nullable_pointer(xpr.data(),
+xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol))),
 m_xpr(xpr), m_startRow(startRow), m_startCol(startCol)
 {
 init();
@@ -371,7 +384,9 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
 BlockImpl_dense(XprType& xpr,
 Index startRow, Index startCol,
 Index blockRows, Index blockCols)
-: Base(xpr.data()+xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol), blockRows, blockCols),
+: Base((blockRows == 0 || blockCols == 0) ? NULL : add_to_nullable_pointer(xpr.data(),
+xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol)),
+blockRows, blockCols),
 m_xpr(xpr), m_startRow(startRow), m_startCol(startCol)
 {
 init();
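Note: the new add_to_nullable_pointer helper exists because forming "null pointer + nonzero offset" is undefined behavior in C++, even when the result is never dereferenced; an empty block of an expression whose data pointer is null could previously hit that case. A standalone sketch of the same guard (names are illustrative, not part of the commit):

    #include <cstddef>

    template <typename T>
    T* offset_or_null(T* base, std::ptrdiff_t offset)
    {
      // Only apply the offset when the base pointer is valid.
      return base != nullptr ? base + offset : nullptr;
    }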
@@ -14,54 +14,56 @@ namespace Eigen {

 namespace internal {

-template<typename Derived, int UnrollCount, int Rows>
+template<typename Derived, int UnrollCount, int InnerSize>
 struct all_unroller
 {
 enum {
-col = (UnrollCount-1) / Rows,
-row = (UnrollCount-1) % Rows
+IsRowMajor = (int(Derived::Flags) & int(RowMajor)),
+i = (UnrollCount-1) / InnerSize,
+j = (UnrollCount-1) % InnerSize
 };

 EIGEN_DEVICE_FUNC static inline bool run(const Derived &mat)
 {
-return all_unroller<Derived, UnrollCount-1, Rows>::run(mat) && mat.coeff(row, col);
+return all_unroller<Derived, UnrollCount-1, InnerSize>::run(mat) && mat.coeff(IsRowMajor ? i : j, IsRowMajor ? j : i);
 }
 };

-template<typename Derived, int Rows>
-struct all_unroller<Derived, 0, Rows>
+template<typename Derived, int InnerSize>
+struct all_unroller<Derived, 0, InnerSize>
 {
 EIGEN_DEVICE_FUNC static inline bool run(const Derived &/*mat*/) { return true; }
 };

-template<typename Derived, int Rows>
-struct all_unroller<Derived, Dynamic, Rows>
+template<typename Derived, int InnerSize>
+struct all_unroller<Derived, Dynamic, InnerSize>
 {
 EIGEN_DEVICE_FUNC static inline bool run(const Derived &) { return false; }
 };

-template<typename Derived, int UnrollCount, int Rows>
+template<typename Derived, int UnrollCount, int InnerSize>
 struct any_unroller
 {
 enum {
-col = (UnrollCount-1) / Rows,
-row = (UnrollCount-1) % Rows
+IsRowMajor = (int(Derived::Flags) & int(RowMajor)),
+i = (UnrollCount-1) / InnerSize,
+j = (UnrollCount-1) % InnerSize
 };

 EIGEN_DEVICE_FUNC static inline bool run(const Derived &mat)
 {
-return any_unroller<Derived, UnrollCount-1, Rows>::run(mat) || mat.coeff(row, col);
+return any_unroller<Derived, UnrollCount-1, InnerSize>::run(mat) || mat.coeff(IsRowMajor ? i : j, IsRowMajor ? j : i);
 }
 };

-template<typename Derived, int Rows>
-struct any_unroller<Derived, 0, Rows>
+template<typename Derived, int InnerSize>
+struct any_unroller<Derived, 0, InnerSize>
 {
 EIGEN_DEVICE_FUNC static inline bool run(const Derived & /*mat*/) { return false; }
 };

-template<typename Derived, int Rows>
-struct any_unroller<Derived, Dynamic, Rows>
+template<typename Derived, int InnerSize>
+struct any_unroller<Derived, Dynamic, InnerSize>
 {
 EIGEN_DEVICE_FUNC static inline bool run(const Derived &) { return false; }
 };
@@ -85,12 +87,12 @@ EIGEN_DEVICE_FUNC inline bool DenseBase<Derived>::all() const
 };
 Evaluator evaluator(derived());
 if(unroll)
-return internal::all_unroller<Evaluator, unroll ? int(SizeAtCompileTime) : Dynamic, internal::traits<Derived>::RowsAtCompileTime>::run(evaluator);
+return internal::all_unroller<Evaluator, unroll ? int(SizeAtCompileTime) : Dynamic, InnerSizeAtCompileTime>::run(evaluator);
 else
 {
-for(Index j = 0; j < cols(); ++j)
-for(Index i = 0; i < rows(); ++i)
-if (!evaluator.coeff(i, j)) return false;
+for(Index i = 0; i < derived().outerSize(); ++i)
+for(Index j = 0; j < derived().innerSize(); ++j)
+if (!evaluator.coeff(IsRowMajor ? i : j, IsRowMajor ? j : i)) return false;
 return true;
 }
 }
@@ -109,12 +111,12 @@ EIGEN_DEVICE_FUNC inline bool DenseBase<Derived>::any() const
 };
 Evaluator evaluator(derived());
 if(unroll)
-return internal::any_unroller<Evaluator, unroll ? int(SizeAtCompileTime) : Dynamic, internal::traits<Derived>::RowsAtCompileTime>::run(evaluator);
+return internal::any_unroller<Evaluator, unroll ? int(SizeAtCompileTime) : Dynamic, InnerSizeAtCompileTime>::run(evaluator);
 else
 {
-for(Index j = 0; j < cols(); ++j)
-for(Index i = 0; i < rows(); ++i)
-if (evaluator.coeff(i, j)) return true;
+for(Index i = 0; i < derived().outerSize(); ++i)
+for(Index j = 0; j < derived().innerSize(); ++j)
+if (evaluator.coeff(IsRowMajor ? i : j, IsRowMajor ? j : i)) return true;
 return false;
 }
 }
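Note: the rewritten all()/any() reductions walk coefficients in storage order (inner/outer index) instead of always column by column, so the result is unchanged but the traversal now matches RowMajor as well as ColMajor layouts. Basic usage, for reference (values are illustrative):

    #include <Eigen/Dense>

    Eigen::Array33d a = Eigen::Array33d::Constant(1.0);
    bool allPositive = (a > 0.0).all();   // true: every coefficient satisfies the predicate
    bool anyNegative = (a < 0.0).any();   // false: no coefficient satisfies it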
@@ -292,7 +292,7 @@ DenseBase<Derived>::LinSpaced(Index size, const Scalar& low, const Scalar& high)
 }

 /**
-* \copydoc DenseBase::LinSpaced(Index, const Scalar&, const Scalar&)
+* \copydoc DenseBase::LinSpaced(Index, const DenseBase::Scalar&, const DenseBase::Scalar&)
 * Special version for fixed size types which does not require the size parameter.
 */
 template<typename Derived>
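Note: the \copydoc target fixed above refers to the two LinSpaced overloads; the dynamic-size one takes the length explicitly, the fixed-size one does not. For reference (values are illustrative):

    #include <Eigen/Dense>

    Eigen::VectorXd v = Eigen::VectorXd::LinSpaced(5, 0.0, 1.0);   // 0, 0.25, 0.5, 0.75, 1
    Eigen::Vector4d w = Eigen::Vector4d::LinSpaced(0.0, 3.0);      // fixed size: no length argument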
@@ -324,9 +324,9 @@ template<typename Derived> class DenseBase
 typedef Transpose<Derived> TransposeReturnType;
 EIGEN_DEVICE_FUNC
 TransposeReturnType transpose();
-typedef typename internal::add_const<Transpose<const Derived> >::type ConstTransposeReturnType;
+typedef Transpose<const Derived> ConstTransposeReturnType;
 EIGEN_DEVICE_FUNC
-ConstTransposeReturnType transpose() const;
+const ConstTransposeReturnType transpose() const;
 EIGEN_DEVICE_FUNC
 void transposeInPlace();

@@ -191,7 +191,8 @@ MatrixBase<Derived>::diagonal()

 /** This is the const version of diagonal(). */
 template<typename Derived>
-EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::ConstDiagonalReturnType
+EIGEN_DEVICE_FUNC inline
+const typename MatrixBase<Derived>::ConstDiagonalReturnType
 MatrixBase<Derived>::diagonal() const
 {
 return ConstDiagonalReturnType(derived());
@@ -209,18 +210,18 @@ MatrixBase<Derived>::diagonal() const
 *
 * \sa MatrixBase::diagonal(), class Diagonal */
 template<typename Derived>
-EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::DiagonalDynamicIndexReturnType
+EIGEN_DEVICE_FUNC inline Diagonal<Derived, DynamicIndex>
 MatrixBase<Derived>::diagonal(Index index)
 {
-return DiagonalDynamicIndexReturnType(derived(), index);
+return Diagonal<Derived, DynamicIndex>(derived(), index);
 }

 /** This is the const version of diagonal(Index). */
 template<typename Derived>
-EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::ConstDiagonalDynamicIndexReturnType
+EIGEN_DEVICE_FUNC inline const Diagonal<const Derived, DynamicIndex>
 MatrixBase<Derived>::diagonal(Index index) const
 {
-return ConstDiagonalDynamicIndexReturnType(derived(), index);
+return Diagonal<const Derived, DynamicIndex>(derived(), index);
 }

 /** \returns an expression of the \a DiagIndex-th sub or super diagonal of the matrix \c *this
@@ -237,20 +238,20 @@ MatrixBase<Derived>::diagonal(Index index) const
 template<typename Derived>
 template<int Index_>
 EIGEN_DEVICE_FUNC
-inline typename MatrixBase<Derived>::template DiagonalIndexReturnType<Index_>::Type
+inline Diagonal<Derived, Index_>
 MatrixBase<Derived>::diagonal()
 {
-return typename DiagonalIndexReturnType<Index_>::Type(derived());
+return Diagonal<Derived, Index_>(derived());
 }

 /** This is the const version of diagonal<int>(). */
 template<typename Derived>
 template<int Index_>
 EIGEN_DEVICE_FUNC
-inline typename MatrixBase<Derived>::template ConstDiagonalIndexReturnType<Index_>::Type
+inline const Diagonal<const Derived, Index_>
 MatrixBase<Derived>::diagonal() const
 {
-return typename ConstDiagonalIndexReturnType<Index_>::Type(derived());
+return Diagonal<const Derived, Index_>(derived());
 }

 } // end namespace Eigen
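Note: the diagonal() accessors above now spell their return type directly as Diagonal<...> instead of going through the removed *ReturnType helper typedefs; user-side calls are unaffected. Typical usage, for reference (values are illustrative):

    #include <Eigen/Dense>

    Eigen::Matrix3d m = Eigen::Matrix3d::Random();
    Eigen::Vector3d d = m.diagonal();      // main diagonal
    auto super        = m.diagonal<1>();   // first super-diagonal (compile-time index)
    auto sub          = m.diagonal(-1);    // first sub-diagonal (runtime index)
    m.diagonal().setZero();                // writable view on the main diagonal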
@@ -18,14 +18,9 @@ namespace internal {
 // with mismatched types, the compiler emits errors about failing to instantiate cwiseProduct BEFORE
 // looking at the static assertions. Thus this is a trick to get better compile errors.
 template<typename T, typename U,
-// the NeedToTranspose condition here is taken straight from Assign.h
-bool NeedToTranspose = T::IsVectorAtCompileTime
-&& U::IsVectorAtCompileTime
-&& ((int(T::RowsAtCompileTime) == 1 && int(U::ColsAtCompileTime) == 1)
-| // FIXME | instead of || to please GCC 4.4.0 stupid warning "suggest parentheses around &&".
-// revert to || as soon as not needed anymore.
-(int(T::ColsAtCompileTime) == 1 && int(U::RowsAtCompileTime) == 1))
->
+bool NeedToTranspose = T::IsVectorAtCompileTime && U::IsVectorAtCompileTime &&
+((int(T::RowsAtCompileTime) == 1 && int(U::ColsAtCompileTime) == 1) ||
+(int(T::ColsAtCompileTime) == 1 && int(U::RowsAtCompileTime) == 1))>
 struct dot_nocheck
 {
 typedef scalar_conj_product_op<typename traits<T>::Scalar,typename traits<U>::Scalar> conj_prod;
@@ -160,7 +160,7 @@ struct eigen_packet_wrapper
 {
 EIGEN_ALWAYS_INLINE operator T&() { return m_val; }
 EIGEN_ALWAYS_INLINE operator const T&() const { return m_val; }
-EIGEN_ALWAYS_INLINE eigen_packet_wrapper() {}
+EIGEN_ALWAYS_INLINE eigen_packet_wrapper() {};
 EIGEN_ALWAYS_INLINE eigen_packet_wrapper(const T &v) : m_val(v) {}
 EIGEN_ALWAYS_INLINE eigen_packet_wrapper& operator=(const T &v) {
 m_val = v;
@@ -122,10 +122,10 @@ public:
 {}

 /** \returns number of rows */
-Index rows() const { return internal::size(m_rowIndices); }
+Index rows() const { return internal::index_list_size(m_rowIndices); }

 /** \returns number of columns */
-Index cols() const { return internal::size(m_colIndices); }
+Index cols() const { return internal::index_list_size(m_colIndices); }

 /** \returns the nested expression */
 const typename internal::remove_all<XprType>::type&
@@ -189,12 +189,16 @@ struct unary_evaluator<IndexedView<ArgType, RowIndices, ColIndices>, IndexBased>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 CoeffReturnType coeff(Index row, Index col) const
 {
+eigen_assert(m_xpr.rowIndices()[row] >= 0 && m_xpr.rowIndices()[row] < m_xpr.nestedExpression().rows()
+&& m_xpr.colIndices()[col] >= 0 && m_xpr.colIndices()[col] < m_xpr.nestedExpression().cols());
 return m_argImpl.coeff(m_xpr.rowIndices()[row], m_xpr.colIndices()[col]);
 }

 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 Scalar& coeffRef(Index row, Index col)
 {
+eigen_assert(m_xpr.rowIndices()[row] >= 0 && m_xpr.rowIndices()[row] < m_xpr.nestedExpression().rows()
+&& m_xpr.colIndices()[col] >= 0 && m_xpr.colIndices()[col] < m_xpr.nestedExpression().cols());
 return m_argImpl.coeffRef(m_xpr.rowIndices()[row], m_xpr.colIndices()[col]);
 }

@@ -204,6 +208,8 @@ struct unary_evaluator<IndexedView<ArgType, RowIndices, ColIndices>, IndexBased>
 EIGEN_STATIC_ASSERT_LVALUE(XprType)
 Index row = XprType::RowsAtCompileTime == 1 ? 0 : index;
 Index col = XprType::RowsAtCompileTime == 1 ? index : 0;
+eigen_assert(m_xpr.rowIndices()[row] >= 0 && m_xpr.rowIndices()[row] < m_xpr.nestedExpression().rows()
+&& m_xpr.colIndices()[col] >= 0 && m_xpr.colIndices()[col] < m_xpr.nestedExpression().cols());
 return m_argImpl.coeffRef( m_xpr.rowIndices()[row], m_xpr.colIndices()[col]);
 }

@@ -212,6 +218,8 @@ struct unary_evaluator<IndexedView<ArgType, RowIndices, ColIndices>, IndexBased>
 {
 Index row = XprType::RowsAtCompileTime == 1 ? 0 : index;
 Index col = XprType::RowsAtCompileTime == 1 ? index : 0;
+eigen_assert(m_xpr.rowIndices()[row] >= 0 && m_xpr.rowIndices()[row] < m_xpr.nestedExpression().rows()
+&& m_xpr.colIndices()[col] >= 0 && m_xpr.colIndices()[col] < m_xpr.nestedExpression().cols());
 return m_argImpl.coeffRef( m_xpr.rowIndices()[row], m_xpr.colIndices()[col]);
 }

@@ -220,6 +228,8 @@ struct unary_evaluator<IndexedView<ArgType, RowIndices, ColIndices>, IndexBased>
 {
 Index row = XprType::RowsAtCompileTime == 1 ? 0 : index;
 Index col = XprType::RowsAtCompileTime == 1 ? index : 0;
+eigen_assert(m_xpr.rowIndices()[row] >= 0 && m_xpr.rowIndices()[row] < m_xpr.nestedExpression().rows()
+&& m_xpr.colIndices()[col] >= 0 && m_xpr.colIndices()[col] < m_xpr.nestedExpression().cols());
 return m_argImpl.coeff( m_xpr.rowIndices()[row], m_xpr.colIndices()[col]);
 }

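Note: the eigen_assert calls added above make out-of-range entries in an index list fail loudly in debug builds instead of silently reading out of bounds. This evaluator backs Eigen's index-array views, e.g. (indices and values are illustrative):

    #include <Eigen/Dense>
    #include <vector>

    Eigen::MatrixXd A = Eigen::MatrixXd::Random(4, 4);
    std::vector<int> rows = {0, 2, 3};
    std::vector<int> cols = {1, 3};
    Eigen::MatrixXd B = A(rows, cols);   // 3x2 selection; each index is now range-checked via eigen_assert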
@@ -588,12 +588,8 @@ struct arg_default_impl<Scalar, true> {
 EIGEN_DEVICE_FUNC
 static inline RealScalar run(const Scalar& x)
 {
-#if defined(EIGEN_HIP_DEVICE_COMPILE)
-// HIP does not seem to have a native device side implementation for the math routine "arg"
+// There is no official ::arg on device in CUDA/HIP, so we always need to use std::arg.
 using std::arg;
-#else
-EIGEN_USING_STD(arg);
-#endif
 return static_cast<RealScalar>(arg(x));
 }
 };
@@ -881,13 +877,159 @@ struct meta_floor_log2<n, lower, upper, meta_floor_log2_bogus>
 // no value, error at compile time
 };

-template<typename Scalar>
-struct random_default_impl<Scalar, false, true>
-{
-static inline Scalar run(const Scalar& x, const Scalar& y)
-{
-if (y <= x)
-return x;
+template <typename BitsType, typename EnableIf = void>
+struct count_bits_impl {
+static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) {
+EIGEN_STATIC_ASSERT(
+is_integral<BitsType>::value && !NumTraits<BitsType>::IsSigned,
+THIS_TYPE_IS_NOT_SUPPORTED);
+int n = CHAR_BIT * sizeof(BitsType);
+int shift = n / 2;
+while (bits > 0 && shift > 0) {
+BitsType y = bits >> shift;
+if (y > 0) {
+n -= shift;
+bits = y;
+}
+shift /= 2;
+}
+if (shift == 0) {
+--n;
+}
+return n;
+}
+
+static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) {
+EIGEN_STATIC_ASSERT(
+is_integral<BitsType>::value && !NumTraits<BitsType>::IsSigned,
+THIS_TYPE_IS_NOT_SUPPORTED);
+int n = CHAR_BIT * sizeof(BitsType);
+int shift = n / 2;
+while (bits > 0 && shift > 0) {
+BitsType y = bits << shift;
+if (y > 0) {
+n -= shift;
+bits = y;
+}
+shift /= 2;
+}
+if (shift == 0) {
+--n;
+}
+return n;
+}
+};
+
+// Count leading zeros.
+template <typename BitsType>
+EIGEN_DEVICE_FUNC inline int clz(BitsType bits) {
+return count_bits_impl<BitsType>::clz(bits);
+}
+
+// Count trailing zeros.
+template <typename BitsType>
+EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) {
+return count_bits_impl<BitsType>::ctz(bits);
+}
+
+#if EIGEN_COMP_GNUC || EIGEN_COMP_CLANG
+
+template <typename BitsType>
+struct count_bits_impl<BitsType, typename enable_if<sizeof(BitsType) <= sizeof(unsigned int)>::type> {
+static const int kNumBits = static_cast<int>(sizeof(BitsType) * CHAR_BIT);
+static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) {
+EIGEN_STATIC_ASSERT(is_integral<BitsType>::value, THIS_TYPE_IS_NOT_SUPPORTED);
+static const int kLeadingBitsOffset = (sizeof(unsigned int) - sizeof(BitsType)) * CHAR_BIT;
+return bits == 0 ? kNumBits : __builtin_clz(static_cast<unsigned int>(bits)) - kLeadingBitsOffset;
+}
+
+static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) {
+EIGEN_STATIC_ASSERT(is_integral<BitsType>::value, THIS_TYPE_IS_NOT_SUPPORTED);
+return bits == 0 ? kNumBits : __builtin_ctz(static_cast<unsigned int>(bits));
+}
+};
+
+template <typename BitsType>
+struct count_bits_impl<
+BitsType, typename enable_if<sizeof(unsigned int) < sizeof(BitsType) && sizeof(BitsType) <= sizeof(unsigned long)>::type> {
+static const int kNumBits = static_cast<int>(sizeof(BitsType) * CHAR_BIT);
+static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) {
+EIGEN_STATIC_ASSERT(is_integral<BitsType>::value, THIS_TYPE_IS_NOT_SUPPORTED);
+static const int kLeadingBitsOffset = (sizeof(unsigned long) - sizeof(BitsType)) * CHAR_BIT;
+return bits == 0 ? kNumBits : __builtin_clzl(static_cast<unsigned long>(bits)) - kLeadingBitsOffset;
+}
+
+static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) {
+EIGEN_STATIC_ASSERT(is_integral<BitsType>::value, THIS_TYPE_IS_NOT_SUPPORTED);
+return bits == 0 ? kNumBits : __builtin_ctzl(static_cast<unsigned long>(bits));
+}
+};
+
+template <typename BitsType>
+struct count_bits_impl<BitsType, typename enable_if<sizeof(unsigned long) < sizeof(BitsType) &&
+sizeof(BitsType) <= sizeof(unsigned long long)>::type> {
+static const int kNumBits = static_cast<int>(sizeof(BitsType) * CHAR_BIT);
+static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) {
+EIGEN_STATIC_ASSERT(is_integral<BitsType>::value, THIS_TYPE_IS_NOT_SUPPORTED);
+static const int kLeadingBitsOffset = (sizeof(unsigned long long) - sizeof(BitsType)) * CHAR_BIT;
+return bits == 0 ? kNumBits : __builtin_clzll(static_cast<unsigned long long>(bits)) - kLeadingBitsOffset;
+}
+
+static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) {
+EIGEN_STATIC_ASSERT(is_integral<BitsType>::value, THIS_TYPE_IS_NOT_SUPPORTED);
+return bits == 0 ? kNumBits : __builtin_ctzll(static_cast<unsigned long long>(bits));
+}
+};
+
+#elif EIGEN_COMP_MSVC
+
+template <typename BitsType>
+struct count_bits_impl<BitsType, typename enable_if<sizeof(BitsType) <= sizeof(unsigned long)>::type> {
+static const int kNumBits = static_cast<int>(sizeof(BitsType) * CHAR_BIT);
+static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) {
+EIGEN_STATIC_ASSERT(is_integral<BitsType>::value, THIS_TYPE_IS_NOT_SUPPORTED);
+unsigned long out;
+_BitScanReverse(&out, static_cast<unsigned long>(bits));
+return bits == 0 ? kNumBits : (kNumBits - 1) - static_cast<int>(out);
+}
+
+static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) {
+EIGEN_STATIC_ASSERT(is_integral<BitsType>::value, THIS_TYPE_IS_NOT_SUPPORTED);
+unsigned long out;
+_BitScanForward(&out, static_cast<unsigned long>(bits));
+return bits == 0 ? kNumBits : static_cast<int>(out);
+}
+};
+
+#ifdef _WIN64
+
+template <typename BitsType>
+struct count_bits_impl<
+BitsType, typename enable_if<sizeof(unsigned long) < sizeof(BitsType) && sizeof(BitsType) <= sizeof(__int64)>::type> {
+static const int kNumBits = static_cast<int>(sizeof(BitsType) * CHAR_BIT);
+static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) {
+EIGEN_STATIC_ASSERT(is_integral<BitsType>::value, THIS_TYPE_IS_NOT_SUPPORTED);
+unsigned long out;
+_BitScanReverse64(&out, static_cast<unsigned __int64>(bits));
+return bits == 0 ? kNumBits : (kNumBits - 1) - static_cast<int>(out);
+}
+
+static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) {
+EIGEN_STATIC_ASSERT(is_integral<BitsType>::value, THIS_TYPE_IS_NOT_SUPPORTED);
+unsigned long out;
+_BitScanForward64(&out, static_cast<unsigned __int64>(bits));
+return bits == 0 ? kNumBits : static_cast<int>(out);
+}
+};
+
+#endif // _WIN64
+
+#endif // EIGEN_COMP_GNUC || EIGEN_COMP_CLANG
+
+template <typename Scalar>
+struct random_default_impl<Scalar, false, true> {
+static inline Scalar run(const Scalar& x, const Scalar& y) {
+if (y <= x) return x;
 // ScalarU is the unsigned counterpart of Scalar, possibly Scalar itself.
 typedef typename make_unsigned<Scalar>::type ScalarU;
 // ScalarX is the widest of ScalarU and unsigned int.
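Note: clz/ctz count the leading and trailing zero bits of an unsigned integer; the generic fallback above halves the search window each iteration, while the compiler-specific specializations map to __builtin_clz/__builtin_ctz or _BitScanReverse/_BitScanForward. The intended semantics, sketched in standard C++ for a 32-bit value (this snippet is illustrative, not part of the commit):

    #include <cstdint>

    int clz32(std::uint32_t x) {           // leading zeros; 32 when x == 0
      int n = 0;
      for (std::uint32_t m = 0x80000000u; m != 0 && !(x & m); m >>= 1) ++n;
      return n;
    }

    int ctz32(std::uint32_t x) {           // trailing zeros; 32 when x == 0
      if (x == 0) return 32;
      int n = 0;
      while (!(x & 1u)) { x >>= 1; ++n; }
      return n;
    }

    // clz32(1) == 31, clz32(0x80000000u) == 0, ctz32(8) == 3, ctz32(0) == 32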
@@ -1032,11 +1174,15 @@ template<typename T> EIGEN_DEVICE_FUNC bool isinf_msvc_helper(T x)
 }

 //MSVC defines a _isnan builtin function, but for double only
+#ifndef EIGEN_GPU_COMPILE_PHASE
 EIGEN_DEVICE_FUNC inline bool isnan_impl(const long double& x) { return _isnan(x)!=0; }
+#endif
 EIGEN_DEVICE_FUNC inline bool isnan_impl(const double& x) { return _isnan(x)!=0; }
 EIGEN_DEVICE_FUNC inline bool isnan_impl(const float& x) { return _isnan(x)!=0; }

+#ifndef EIGEN_GPU_COMPILE_PHASE
 EIGEN_DEVICE_FUNC inline bool isinf_impl(const long double& x) { return isinf_msvc_helper(x); }
+#endif
 EIGEN_DEVICE_FUNC inline bool isinf_impl(const double& x) { return isinf_msvc_helper(x); }
 EIGEN_DEVICE_FUNC inline bool isinf_impl(const float& x) { return isinf_msvc_helper(x); }

@@ -1050,12 +1196,16 @@ EIGEN_DEVICE_FUNC inline bool isinf_impl(const float& x) { return isinf_ms
 #define EIGEN_TMP_NOOPT_ATTRIB EIGEN_DEVICE_FUNC inline __attribute__((noinline,optimize("no-finite-math-only")))
 #endif

+#ifndef EIGEN_GPU_COMPILE_PHASE
 template<> EIGEN_TMP_NOOPT_ATTRIB bool isnan_impl(const long double& x) { return __builtin_isnan(x); }
+#endif
 template<> EIGEN_TMP_NOOPT_ATTRIB bool isnan_impl(const double& x) { return __builtin_isnan(x); }
 template<> EIGEN_TMP_NOOPT_ATTRIB bool isnan_impl(const float& x) { return __builtin_isnan(x); }
 template<> EIGEN_TMP_NOOPT_ATTRIB bool isinf_impl(const double& x) { return __builtin_isinf(x); }
 template<> EIGEN_TMP_NOOPT_ATTRIB bool isinf_impl(const float& x) { return __builtin_isinf(x); }
+#ifndef EIGEN_GPU_COMPILE_PHASE
 template<> EIGEN_TMP_NOOPT_ATTRIB bool isinf_impl(const long double& x) { return __builtin_isinf(x); }
+#endif

 #undef EIGEN_TMP_NOOPT_ATTRIB

@@ -1112,6 +1262,8 @@ EIGEN_ALWAYS_INLINE double mini(const double& x, const double& y)
 {
 return fmin(x, y);
 }

+#ifndef EIGEN_GPU_COMPILE_PHASE
 template<>
 EIGEN_DEVICE_FUNC
 EIGEN_ALWAYS_INLINE long double mini(const long double& x, const long double& y)
@@ -1123,6 +1275,7 @@ EIGEN_ALWAYS_INLINE long double mini(const long double& x, const long double& y)
 return fminl(x, y);
 #endif
 }
+#endif

 template<typename T>
 EIGEN_DEVICE_FUNC
@@ -1142,6 +1295,7 @@ EIGEN_ALWAYS_INLINE double maxi(const double& x, const double& y)
 {
 return fmax(x, y);
 }
+#ifndef EIGEN_GPU_COMPILE_PHASE
 template<>
 EIGEN_DEVICE_FUNC
 EIGEN_ALWAYS_INLINE long double maxi(const long double& x, const long double& y)
@@ -1154,6 +1308,7 @@ EIGEN_ALWAYS_INLINE long double maxi(const long double& x, const long double& y)
 #endif
 }
 #endif
+#endif

 #if defined(SYCL_DEVICE_ONLY)

@@ -1310,8 +1465,8 @@ EIGEN_ALWAYS_INLINE double absdiff(const double& x, const double& y)
 return fabs(x - y);
 }

-#if !defined(EIGEN_GPUCC)
 // HIP and CUDA do not support long double.
+#ifndef EIGEN_GPU_COMPILE_PHASE
 template<>
 EIGEN_DEVICE_FUNC
 EIGEN_ALWAYS_INLINE long double absdiff(const long double& x, const long double& y) {
@@ -225,8 +225,6 @@ class Matrix
 return Base::_set(other);
 }

-/* Here, doxygen failed to copy the brief information when using \copydoc */
-
 /**
 * \brief Copies the generic expression \a other into *this.
 * \copydetails DenseBase::operator=(const EigenBase<OtherDerived> &other)
@@ -284,7 +282,15 @@ class Matrix
 #endif

 #if EIGEN_HAS_CXX11
-/** \copydoc PlainObjectBase(const Scalar&, const Scalar&, const Scalar&, const Scalar&, const ArgTypes&... args)
+/** \brief Construct a row of column vector with fixed size from an arbitrary number of coefficients. \cpp11
+*
+* \only_for_vectors
+*
+* This constructor is for 1D array or vectors with more than 4 coefficients.
+* There exists C++98 analogue constructors for fixed-size array/vector having 1, 2, 3, or 4 coefficients.
+*
+* \warning To construct a column (resp. row) vector of fixed length, the number of values passed to this
+* constructor must match the the fixed number of rows (resp. columns) of \c *this.
 *
 * Example: \include Matrix_variadic_ctor_cxx11.cpp
 * Output: \verbinclude Matrix_variadic_ctor_cxx11.out
@@ -297,6 +303,8 @@ class Matrix
 : Base(a0, a1, a2, a3, args...) {}

 /** \brief Constructs a Matrix and initializes it from the coefficients given as initializer-lists grouped by row. \cpp11
+*
+* \anchor matrix_constructor_initializer_list
 *
 * In the general case, the constructor takes a list of rows, each row being represented as a list of coefficients:
 *
@@ -480,16 +488,21 @@ class Matrix

 #define EIGEN_MAKE_TYPEDEFS(Type, TypeSuffix, Size, SizeSuffix) \
 /** \ingroup matrixtypedefs */ \
+/** \brief \noop */ \
 typedef Matrix<Type, Size, Size> Matrix##SizeSuffix##TypeSuffix; \
 /** \ingroup matrixtypedefs */ \
+/** \brief \noop */ \
 typedef Matrix<Type, Size, 1> Vector##SizeSuffix##TypeSuffix; \
 /** \ingroup matrixtypedefs */ \
+/** \brief \noop */ \
 typedef Matrix<Type, 1, Size> RowVector##SizeSuffix##TypeSuffix;

 #define EIGEN_MAKE_FIXED_TYPEDEFS(Type, TypeSuffix, Size) \
 /** \ingroup matrixtypedefs */ \
+/** \brief \noop */ \
 typedef Matrix<Type, Size, Dynamic> Matrix##Size##X##TypeSuffix; \
 /** \ingroup matrixtypedefs */ \
+/** \brief \noop */ \
 typedef Matrix<Type, Dynamic, Size> Matrix##X##Size##TypeSuffix;

 #define EIGEN_MAKE_TYPEDEFS_ALL_SIZES(Type, TypeSuffix) \
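Note: the constructors documented above cover the two C++11 initialization styles of Matrix: one scalar per coefficient for fixed-size vectors, and nested initializer lists grouped by row for matrices. For reference (values are illustrative):

    #include <Eigen/Dense>

    Eigen::Matrix<double, 5, 1> v(1.0, 2.0, 3.0, 4.0, 5.0);   // variadic ctor: one value per coefficient
    Eigen::Matrix<double, 2, 3> m{{1.0, 2.0, 3.0},
                                  {4.0, 5.0, 6.0}};           // initializer lists grouped by row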
@@ -206,28 +206,22 @@ template<typename Derived> class MatrixBase
 EIGEN_DEVICE_FUNC
 DiagonalReturnType diagonal();

-typedef typename internal::add_const<Diagonal<const Derived> >::type ConstDiagonalReturnType;
+typedef Diagonal<const Derived> ConstDiagonalReturnType;
 EIGEN_DEVICE_FUNC
-ConstDiagonalReturnType diagonal() const;
+const ConstDiagonalReturnType diagonal() const;

-template<int Index> struct DiagonalIndexReturnType { typedef Diagonal<Derived,Index> Type; };
-template<int Index> struct ConstDiagonalIndexReturnType { typedef const Diagonal<const Derived,Index> Type; };

 template<int Index>
 EIGEN_DEVICE_FUNC
-typename DiagonalIndexReturnType<Index>::Type diagonal();
+Diagonal<Derived, Index> diagonal();

 template<int Index>
 EIGEN_DEVICE_FUNC
-typename ConstDiagonalIndexReturnType<Index>::Type diagonal() const;
+const Diagonal<const Derived, Index> diagonal() const;

-typedef Diagonal<Derived,DynamicIndex> DiagonalDynamicIndexReturnType;
-typedef typename internal::add_const<Diagonal<const Derived,DynamicIndex> >::type ConstDiagonalDynamicIndexReturnType;

 EIGEN_DEVICE_FUNC
-DiagonalDynamicIndexReturnType diagonal(Index index);
+Diagonal<Derived, DynamicIndex> diagonal(Index index);
 EIGEN_DEVICE_FUNC
-ConstDiagonalDynamicIndexReturnType diagonal(Index index) const;
+const Diagonal<const Derived, DynamicIndex> diagonal(Index index) const;

 template<unsigned int Mode> struct TriangularViewReturnType { typedef TriangularView<Derived, Mode> Type; };
 template<unsigned int Mode> struct ConstTriangularViewReturnType { typedef const TriangularView<const Derived, Mode> Type; };
@@ -98,6 +98,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Tgt bit_cast(const Src& src) {
 }
 }  // namespace numext
 
+// clang-format off
 /** \class NumTraits
   * \ingroup Core_Module
   *
@@ -109,45 +110,47 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Tgt bit_cast(const Src& src) {
   *
   * The provided data consists of:
   * \li A typedef \c Real, giving the "real part" type of \a T. If \a T is already real,
-  * then \c Real is just a typedef to \a T. If \a T is \c std::complex<U> then \c Real
+  * then \c Real is just a typedef to \a T. If \a T is `std::complex<U>` then \c Real
   * is a typedef to \a U.
   * \li A typedef \c NonInteger, giving the type that should be used for operations producing non-integral values,
   * such as quotients, square roots, etc. If \a T is a floating-point type, then this typedef just gives
-  * \a T again. Note however that many Eigen functions such as internal::sqrt simply refuse to
+  * \a T again. Note however that many Eigen functions such as `internal::sqrt` simply refuse to
   * take integers. Outside of a few cases, Eigen doesn't do automatic type promotion. Thus, this typedef is
   * only intended as a helper for code that needs to explicitly promote types.
-  * \li A typedef \c Literal giving the type to use for numeric literals such as "2" or "0.5". For instance, for \c std::complex<U>, Literal is defined as \c U.
+  * \li A typedef \c Literal giving the type to use for numeric literals such as "2" or "0.5". For instance, for `std::complex<U>`,
+  * Literal is defined as \c U.
   * Of course, this type must be fully compatible with \a T. In doubt, just use \a T here.
-  * \li A typedef \a Nested giving the type to use to nest a value inside of the expression tree. If you don't know what
+  * \li A typedef \c Nested giving the type to use to nest a value inside of the expression tree. If you don't know what
   * this means, just use \a T here.
-  * \li An enum value \a IsComplex. It is equal to 1 if \a T is a \c std::complex
+  * \li An enum value \c IsComplex. It is equal to 1 if \a T is a \c std::complex
   * type, and to 0 otherwise.
-  * \li An enum value \a IsInteger. It is equal to \c 1 if \a T is an integer type such as \c int,
+  * \li An enum value \c IsInteger. It is equal to \c 1 if \a T is an integer type such as \c int,
   * and to \c 0 otherwise.
-  * \li Enum values ReadCost, AddCost and MulCost representing a rough estimate of the number of CPU cycles needed
+  * \li Enum values \c ReadCost, \c AddCost and \c MulCost representing a rough estimate of the number of CPU cycles needed
   * to by move / add / mul instructions respectively, assuming the data is already stored in CPU registers.
   * Stay vague here. No need to do architecture-specific stuff. If you don't know what this means, just use \c Eigen::HugeCost.
-  * \li An enum value \a IsSigned. It is equal to \c 1 if \a T is a signed type and to 0 if \a T is unsigned.
+  * \li An enum value \c IsSigned. It is equal to \c 1 if \a T is a signed type and to 0 if \a T is unsigned.
-  * \li An enum value \a RequireInitialization. It is equal to \c 1 if the constructor of the numeric type \a T must
+  * \li An enum value \c RequireInitialization. It is equal to \c 1 if the constructor of the numeric type \a T must
   * be called, and to 0 if it is safe not to call it. Default is 0 if \a T is an arithmetic type, and 1 otherwise.
-  * \li An epsilon() function which, unlike <a href="http://en.cppreference.com/w/cpp/types/numeric_limits/epsilon">std::numeric_limits::epsilon()</a>,
+  * \li An `epsilon()` function which, unlike <a href="http://en.cppreference.com/w/cpp/types/numeric_limits/epsilon">`std::numeric_limits::epsilon()`</a>,
-  * it returns a \a Real instead of a \a T.
+  * it returns a \c Real instead of a \a T.
-  * \li A dummy_precision() function returning a weak epsilon value. It is mainly used as a default
+  * \li A `dummy_precision()` function returning a weak epsilon value. It is mainly used as a default
   * value by the fuzzy comparison operators.
-  * \li highest() and lowest() functions returning the highest and lowest possible values respectively.
+  * \li `highest()` and `lowest()` functions returning the highest and lowest possible values respectively.
-  * \li digits() function returning the number of radix digits (non-sign digits for integers, mantissa for floating-point). This is
+  * \li `digits()` function returning the number of radix digits (non-sign digits for integers, mantissa for floating-point). This is
   * the analogue of <a href="http://en.cppreference.com/w/cpp/types/numeric_limits/digits">std::numeric_limits<T>::digits</a>
   * which is used as the default implementation if specialized.
-  * \li digits10() function returning the number of decimal digits that can be represented without change. This is
+  * \li `digits10()` function returning the number of decimal digits that can be represented without change. This is
   * the analogue of <a href="http://en.cppreference.com/w/cpp/types/numeric_limits/digits10">std::numeric_limits<T>::digits10</a>
   * which is used as the default implementation if specialized.
-  * \li min_exponent() and max_exponent() functions returning the highest and lowest possible values, respectively,
+  * \li `min_exponent()` and `max_exponent()` functions returning the highest and lowest possible values, respectively,
   * such that the radix raised to the power exponent-1 is a normalized floating-point number. These are equivalent to
-  * <a href="http://en.cppreference.com/w/cpp/types/numeric_limits/min_exponent">std::numeric_limits<T>::min_exponent</a>/
+  * <a href="http://en.cppreference.com/w/cpp/types/numeric_limits/min_exponent">`std::numeric_limits<T>::min_exponent`</a>/
-  * <a href="http://en.cppreference.com/w/cpp/types/numeric_limits/max_exponent">std::numeric_limits<T>::max_exponent</a>.
+  * <a href="http://en.cppreference.com/w/cpp/types/numeric_limits/max_exponent">`std::numeric_limits<T>::max_exponent`</a>.
-  * \li infinity() function returning a representation of positive infinity, if available.
+  * \li `infinity()` function returning a representation of positive infinity, if available.
-  * \li quiet_NaN function returning a non-signaling "not-a-number", if available.
+  * \li `quiet_NaN` function returning a non-signaling "not-a-number", if available.
   */
+// clang-format on
 
 template<typename T> struct GenericNumTraits
 {
@@ -245,12 +248,25 @@ template<> struct NumTraits<double> : GenericNumTraits<double>
   static inline double dummy_precision() { return 1e-12; }
 };
 
+// GPU devices treat `long double` as `double`.
+#ifndef EIGEN_GPU_COMPILE_PHASE
 template<> struct NumTraits<long double>
   : GenericNumTraits<long double>
 {
-  EIGEN_CONSTEXPR
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
-  static inline long double dummy_precision() { return 1e-15l; }
+  static inline long double dummy_precision() { return static_cast<long double>(1e-15l); }
 
+#if defined(EIGEN_ARCH_PPC) && (__LDBL_MANT_DIG__ == 106)
+  // PowerPC double double causes issues with some values
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static inline long double epsilon()
+  {
+    // 2^(-(__LDBL_MANT_DIG__)+1)
+    return static_cast<long double>(2.4651903288156618919116517665087e-32l);
+  }
+#endif
 };
+#endif
 
 template<typename _Real> struct NumTraits<std::complex<_Real> >
   : GenericNumTraits<std::complex<_Real> >
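As a companion to the NumTraits documentation updated above, a minimal sketch of a user-side specialization. It is not part of the commit; the scalar type Fixed24 and the cost values are invented for illustration, and arithmetic operators would still need to be defined before using the type inside a Matrix.

#include <Eigen/Core>

struct Fixed24 { int raw; };  // hypothetical user-defined scalar type

namespace Eigen {
template<> struct NumTraits<Fixed24> : GenericNumTraits<Fixed24> {
  typedef Fixed24 Real;        // the type is already real
  typedef float   NonInteger;  // type to use for quotients, square roots, ...
  typedef Fixed24 Nested;
  enum {
    IsComplex = 0,
    IsInteger = 0,
    IsSigned = 1,
    RequireInitialization = 0,
    ReadCost = 1,
    AddCost = 1,
    MulCost = 2
  };
  static inline Fixed24 epsilon()         { return Fixed24{1}; }  // smallest representable step
  static inline Fixed24 dummy_precision() { return Fixed24{4}; }  // slack used by fuzzy comparisons
};
}  // namespace Eigen

int main() { return 0; }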
@@ -54,12 +54,17 @@ struct packetwise_redux_traits
 /* Value to be returned when size==0 , by default let's return 0 */
 template<typename PacketType,typename Func>
 EIGEN_DEVICE_FUNC
-PacketType packetwise_redux_empty_value(const Func& ) { return pset1<PacketType>(0); }
+PacketType packetwise_redux_empty_value(const Func& ) {
+  const typename unpacket_traits<PacketType>::type zero(0);
+  return pset1<PacketType>(zero);
+}
 
 /* For products the default is 1 */
 template<typename PacketType,typename Scalar>
 EIGEN_DEVICE_FUNC
-PacketType packetwise_redux_empty_value(const scalar_product_op<Scalar,Scalar>& ) { return pset1<PacketType>(1); }
+PacketType packetwise_redux_empty_value(const scalar_product_op<Scalar,Scalar>& ) {
+  return pset1<PacketType>(Scalar(1));
+}
 
 /* Perform the actual reduction */
 template<typename Func, typename Evaluator,
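For context (not part of the commit): the value returned here is the identity element of the reduction functor, which is what makes partial reductions over an empty dimension well defined. A quick user-level check, assuming Eigen 3.4 headers:

#include <Eigen/Dense>
#include <iostream>

int main() {
  Eigen::MatrixXf m(3, 0);                               // three rows, zero columns
  std::cout << m.rowwise().sum().transpose()  << "\n";   // identity of '+'  -> 0 0 0
  std::cout << m.rowwise().prod().transpose() << "\n";   // identity of '*'  -> 1 1 1
}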
@@ -312,8 +312,8 @@ template<typename PlainObjectType, int Options, typename StrideType> class Ref
     inline Ref(DenseBase<Derived>& expr)
     #endif
     {
-      EIGEN_STATIC_ASSERT(bool(internal::is_lvalue<Derived>::value), THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY);
+      EIGEN_STATIC_ASSERT((static_cast<bool>(internal::is_lvalue<Derived>::value)), THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY);
-      EIGEN_STATIC_ASSERT(bool(Traits::template match<Derived>::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
+      EIGEN_STATIC_ASSERT((static_cast<bool>(Traits::template match<Derived>::MatchAtCompileTime)), STORAGE_LAYOUT_DOES_NOT_MATCH);
       EIGEN_STATIC_ASSERT(!Derived::IsPlainObjectBase,THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY);
       // Construction must pass since we will not create temporary storage in the non-const case.
       const bool success = Base::construct(expr.const_cast_derived());
@@ -250,7 +250,7 @@ class ReshapedImpl_dense<XprType, Rows, Cols, Order, true>
     EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
     inline Index outerStride() const
     {
-      return ((Flags&RowMajorBit)==RowMajorBit) ? this->cols() : this->rows();
+      return (((Flags&RowMajorBit)==RowMajorBit) ? this->cols() : this->rows()) * m_xpr.innerStride();
     }
 
   protected:
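A small sketch of the situation the outerStride() fix addresses (not part of the commit): reshaping a view whose underlying storage has a non-unit inner stride, so the outer stride of the reshaped expression must be scaled by it.

#include <Eigen/Dense>
#include <iostream>

int main() {
  float data[12];
  for (int i = 0; i < 12; ++i) data[i] = static_cast<float>(i);
  // Every other element of data (0 2 4 6 8 10) viewed as a length-6 vector.
  Eigen::Map<Eigen::VectorXf, 0, Eigen::InnerStride<2> > v(data, 6);
  // The reshaped view's outerStride() now correctly includes the inner stride of v.
  std::cout << v.reshaped(2, 3) << "\n";
}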
@@ -110,7 +110,7 @@ class SolverBase : public EigenBase<Derived>
     }
 
     /** \internal the return type of transpose() */
-    typedef typename internal::add_const<Transpose<const Derived> >::type ConstTransposeReturnType;
+    typedef Transpose<const Derived> ConstTransposeReturnType;
     /** \returns an expression of the transposed of the factored matrix.
       *
       * A typical usage is to solve for the transposed problem A^T x = b:
@@ -118,15 +118,15 @@ class SolverBase : public EigenBase<Derived>
       *
       * \sa adjoint(), solve()
       */
-    inline ConstTransposeReturnType transpose() const
+    inline const ConstTransposeReturnType transpose() const
     {
       return ConstTransposeReturnType(derived());
     }
 
     /** \internal the return type of adjoint() */
     typedef typename internal::conditional<NumTraits<Scalar>::IsComplex,
-                    CwiseUnaryOp<internal::scalar_conjugate_op<Scalar>, ConstTransposeReturnType>,
+                    CwiseUnaryOp<internal::scalar_conjugate_op<Scalar>, const ConstTransposeReturnType>,
-                    ConstTransposeReturnType
+                    const ConstTransposeReturnType
             >::type AdjointReturnType;
     /** \returns an expression of the adjoint of the factored matrix
       *
@@ -137,7 +137,7 @@ class SolverBase : public EigenBase<Derived>
       *
       * \sa transpose(), solve()
       */
-    inline AdjointReturnType adjoint() const
+    inline const AdjointReturnType adjoint() const
     {
       return AdjointReturnType(derived().transpose());
     }
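Usage sketch for the transpose()/adjoint() members documented above (not part of the commit), shown through PartialPivLU, one of the decompositions deriving from SolverBase:

#include <Eigen/Dense>
#include <iostream>

int main() {
  Eigen::Matrix3d A = Eigen::Matrix3d::Random() + 4.0 * Eigen::Matrix3d::Identity();
  Eigen::Vector3d b(1.0, 2.0, 3.0);
  Eigen::PartialPivLU<Eigen::Matrix3d> lu(A);
  Eigen::Vector3d x = lu.transpose().solve(b);  // solves A^T x = b without refactoring
  Eigen::Vector3d y = lu.adjoint().solve(b);    // same as transpose() for a real matrix
  std::cout << (A.transpose() * x - b).norm() << " "
            << (A.transpose() * y - b).norm() << "\n";  // both residuals ~ 0
}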
@@ -38,10 +38,14 @@ namespace Eigen {
   * \include Map_general_stride.cpp
   * Output: \verbinclude Map_general_stride.out
   *
-  * Both strides can be negative, however, a negative stride of -1 cannot be specified at compiletime
+  * Both strides can be negative. However, a negative stride of -1 cannot be specified at compile time
   * because of the ambiguity with Dynamic which is defined to -1 (historically, negative strides were
   * not allowed).
   *
+  * Note that for compile-time vectors (ColsAtCompileTime==1 or RowsAtCompile==1),
+  * the inner stride is the pointer increment between two consecutive elements,
+  * regardless of storage layout.
+  *
   * \sa class InnerStride, class OuterStride, \ref TopicStorageOrders
   */
 template<int _OuterStrideAtCompileTime, int _InnerStrideAtCompileTime>
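Illustration of the note added above (not part of the commit): for a compile-time vector the inner stride is simply the pointer increment between two consecutive coefficients.

#include <Eigen/Dense>
#include <iostream>

int main() {
  int data[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  // A fixed-size vector mapping every other entry of data: 0 2 4 6.
  Eigen::Map<Eigen::Vector4i, 0, Eigen::InnerStride<2> > v(data);
  std::cout << v.transpose() << "\n";
}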
@@ -178,7 +178,7 @@ template<typename MatrixType> class TransposeImpl<MatrixType,Dense>
   * \sa transposeInPlace(), adjoint() */
 template<typename Derived>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-Transpose<Derived>
+typename DenseBase<Derived>::TransposeReturnType
 DenseBase<Derived>::transpose()
 {
   return TransposeReturnType(derived());
@@ -191,7 +191,7 @@ DenseBase<Derived>::transpose()
   * \sa transposeInPlace(), adjoint() */
 template<typename Derived>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-typename DenseBase<Derived>::ConstTransposeReturnType
+const typename DenseBase<Derived>::ConstTransposeReturnType
 DenseBase<Derived>::transpose() const
 {
   return ConstTransposeReturnType(derived());
@@ -100,12 +100,10 @@ template<typename Derived> class TriangularBase : public EigenBase<Derived>
       return coeffRef(row,col);
     }
 
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
     EIGEN_DEVICE_FUNC
     inline const Derived& derived() const { return *static_cast<const Derived*>(this); }
     EIGEN_DEVICE_FUNC
     inline Derived& derived() { return *static_cast<Derived*>(this); }
-    #endif // not EIGEN_PARSED_BY_DOXYGEN
 
     template<typename DenseDerived>
     EIGEN_DEVICE_FUNC
@@ -442,7 +440,6 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
     EIGEN_DEVICE_FUNC
     TriangularViewType& operator=(const MatrixBase<OtherDerived>& other);
 
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
     EIGEN_DEVICE_FUNC
     TriangularViewType& operator=(const TriangularViewImpl& other)
     { return *this = other.derived().nestedExpression(); }
@@ -456,7 +453,6 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
     /** \deprecated */
     EIGEN_DEPRECATED EIGEN_DEVICE_FUNC
     void lazyAssign(const MatrixBase<OtherDerived>& other);
-    #endif
 
     /** Efficient triangular matrix times vector/matrix product */
     template<typename OtherDerived>
@@ -524,11 +520,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
     /** Swaps the coefficients of the common triangular parts of two matrices */
     template<typename OtherDerived>
     EIGEN_DEVICE_FUNC
-    #ifdef EIGEN_PARSED_BY_DOXYGEN
-    void swap(TriangularBase<OtherDerived> &other)
-    #else
     void swap(TriangularBase<OtherDerived> const & other)
-    #endif
     {
       EIGEN_STATIC_ASSERT_LVALUE(OtherDerived);
       call_assignment(derived(), other.const_cast_derived(), internal::swap_assign_op<Scalar>());
@@ -552,9 +544,10 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
       this->solveInPlace(dst);
     }
 
-    template<typename ProductType>
+    template <typename ProductType>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TriangularViewType& _assignProduct(const ProductType& prod, const Scalar& alpha, bool beta);
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TriangularViewType& _assignProduct(const ProductType& prod, const Scalar& alpha,
+                                                                             bool beta);
+
   protected:
     EIGEN_DEFAULT_COPY_CONSTRUCTOR(TriangularViewImpl)
     EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(TriangularViewImpl)
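Usage sketch for the swap() member kept above (not part of the commit): only the coefficients of the common triangular part are exchanged; the opposite triangle of each matrix is left untouched.

#include <Eigen/Dense>
#include <iostream>

int main() {
  Eigen::Matrix3d A = Eigen::Matrix3d::Constant(1.0);
  Eigen::Matrix3d B = Eigen::Matrix3d::Constant(2.0);
  A.triangularView<Eigen::Upper>().swap(B.triangularView<Eigen::Upper>());
  std::cout << A << "\n\n" << B << "\n";  // upper parts exchanged, strictly lower parts unchanged
}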
@@ -99,7 +99,9 @@ template<> EIGEN_STRONG_INLINE Packet4cf ploadu<Packet4cf>(const std::complex<fl
 
 template<> EIGEN_STRONG_INLINE Packet4cf pset1<Packet4cf>(const std::complex<float>& from)
 {
-  return Packet4cf(_mm256_castpd_ps(_mm256_broadcast_sd((const double*)(const void*)&from)));
+  const float re = std::real(from);
+  const float im = std::imag(from);
+  return Packet4cf(_mm256_set_ps(im, re, im, re, im, re, im, re));
 }
 
 template<> EIGEN_STRONG_INLINE Packet4cf ploaddup<Packet4cf>(const std::complex<float>* from)
@@ -167,15 +169,12 @@ template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet4cf>(const P
                                Packet2cf(_mm256_extractf128_ps(a.v, 1))));
 }
 
 
 EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet4cf,Packet8f)
 
 template<> EIGEN_STRONG_INLINE Packet4cf pdiv<Packet4cf>(const Packet4cf& a, const Packet4cf& b)
 {
-  Packet4cf num = pmul(a, pconj(b));
-  __m256 tmp = _mm256_mul_ps(b.v, b.v);
-  __m256 tmp2 = _mm256_shuffle_ps(tmp,tmp,0xB1);
-  __m256 denom = _mm256_add_ps(tmp, tmp2);
-  return Packet4cf(_mm256_div_ps(num.v, denom));
+  return pdiv_complex(a, b);
 }
 
 template<> EIGEN_STRONG_INLINE Packet4cf pcplxflip<Packet4cf>(const Packet4cf& x)
@@ -321,10 +320,7 @@ EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cd,Packet4d)
 
 template<> EIGEN_STRONG_INLINE Packet2cd pdiv<Packet2cd>(const Packet2cd& a, const Packet2cd& b)
 {
-  Packet2cd num = pmul(a, pconj(b));
-  __m256d tmp = _mm256_mul_pd(b.v, b.v);
-  __m256d denom = _mm256_hadd_pd(tmp, tmp);
-  return Packet2cd(_mm256_div_pd(num.v, denom));
+  return pdiv_complex(a, b);
 }
 
 template<> EIGEN_STRONG_INLINE Packet2cd pcplxflip<Packet2cd>(const Packet2cd& x)
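For context (not part of the commit): both the removed hand-written kernels and the shared pdiv_complex helper they now delegate to implement the textbook identity a / b = a * conj(b) / |b|^2. A scalar sketch of that computation:

#include <complex>
#include <iostream>

// a / b computed via the conjugate of the denominator, one lane of what pdiv_complex vectorizes.
std::complex<float> div_by_conjugate(std::complex<float> a, std::complex<float> b) {
  const std::complex<float> num = a * std::conj(b);
  const float denom = b.real() * b.real() + b.imag() * b.imag();
  return std::complex<float>(num.real() / denom, num.imag() / denom);
}

int main() {
  std::cout << div_by_conjugate({1.f, 2.f}, {3.f, -4.f}) << "\n";  // prints (-0.2,0.4)
}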
@@ -285,11 +285,13 @@ template<> EIGEN_STRONG_INLINE Packet8i psub<Packet8i>(const Packet8i& a, const
 
 template<> EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a)
 {
-  return _mm256_sub_ps(_mm256_set1_ps(0.0),a);
+  const Packet8f mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
+  return _mm256_xor_ps(a, mask);
 }
 template<> EIGEN_STRONG_INLINE Packet4d pnegate(const Packet4d& a)
 {
-  return _mm256_sub_pd(_mm256_set1_pd(0.0),a);
+  const Packet4d mask = _mm256_castsi256_pd(_mm256_set1_epi64x(0x8000000000000000ULL));
+  return _mm256_xor_pd(a, mask);
 }
 
 template<> EIGEN_STRONG_INLINE Packet8f pconj(const Packet8f& a) { return a; }
@@ -628,11 +630,23 @@ template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet4d&
 template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet8i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); }
 
 template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet8f& from, uint8_t umask) {
+#ifdef EIGEN_VECTORIZE_AVX512
+  __mmask16 mask = static_cast<__mmask16>(umask & 0x00FF);
+  EIGEN_DEBUG_UNALIGNED_STORE _mm512_mask_storeu_ps(to, mask, _mm512_castps256_ps512(from));
+#else
   Packet8i mask = _mm256_set1_epi8(static_cast<char>(umask));
-  const Packet8i bit_mask = _mm256_set_epi32(0xffffff7f, 0xffffffbf, 0xffffffdf, 0xffffffef, 0xfffffff7, 0xfffffffb, 0xfffffffd, 0xfffffffe);
+  const Packet8i bit_mask = _mm256_set_epi32(0x7f7f7f7f, 0xbfbfbfbf, 0xdfdfdfdf, 0xefefefef, 0xf7f7f7f7, 0xfbfbfbfb, 0xfdfdfdfd, 0xfefefefe);
   mask = por<Packet8i>(mask, bit_mask);
   mask = pcmp_eq<Packet8i>(mask, _mm256_set1_epi32(0xffffffff));
-  EIGEN_DEBUG_UNALIGNED_STORE return _mm256_maskstore_ps(to, mask, from);
+#if EIGEN_COMP_MSVC
+  // MSVC sometimes seems to use a bogus mask with maskstore.
+  const __m256i ifrom = _mm256_castps_si256(from);
+  EIGEN_DEBUG_UNALIGNED_STORE _mm_maskmoveu_si128(_mm256_extractf128_si256(ifrom, 0), _mm256_extractf128_si256(mask, 0), reinterpret_cast<char*>(to));
+  EIGEN_DEBUG_UNALIGNED_STORE _mm_maskmoveu_si128(_mm256_extractf128_si256(ifrom, 1), _mm256_extractf128_si256(mask, 1), reinterpret_cast<char*>(to + 4));
+#else
+  EIGEN_DEBUG_UNALIGNED_STORE _mm256_maskstore_ps(to, mask, from);
+#endif
+#endif
 }
 
 // NOTE: leverage _mm256_i32gather_ps and _mm256_i32gather_pd if AVX2 instructions are available
@@ -1006,7 +1020,7 @@ EIGEN_STRONG_INLINE Packet8f half2float(const Packet8h& a) {
 
 EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) {
 #ifdef EIGEN_HAS_FP16_C
-  return _mm256_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
+  return _mm256_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT);
 #else
   EIGEN_ALIGN32 float aux[8];
   pstore(aux, a);
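Scalar illustration of the new pnegate strategy (not part of the commit): negation by XOR-ing the IEEE-754 sign bit, which is what _mm256_xor_ps does on eight floats at once and which, unlike 0 - x, also turns +0.0 into -0.0.

#include <cmath>
#include <cstdint>
#include <cstring>
#include <iostream>

float negate_by_sign_bit(float x) {
  std::uint32_t bits;
  std::memcpy(&bits, &x, sizeof(bits));  // bit-level view of the float
  bits ^= 0x80000000u;                   // flip only the sign bit
  std::memcpy(&x, &bits, sizeof(x));
  return x;
}

int main() {
  std::cout << negate_by_sign_bit(1.5f) << " "
            << std::signbit(negate_by_sign_bit(0.0f)) << "\n";  // prints: -1.5 1
}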
@@ -37,7 +37,7 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits
     HasMul = 1,
     HasDiv = 1,
     HasNegate = 1,
-    HasSqrt = 1,
+    HasSqrt = EIGEN_HAS_AVX512_MATH,
     HasAbs = 0,
     HasAbs2 = 0,
     HasMin = 0,
@@ -97,7 +97,9 @@ template<> EIGEN_STRONG_INLINE Packet8cf ploadu<Packet8cf>(const std::complex<fl
 
 template<> EIGEN_STRONG_INLINE Packet8cf pset1<Packet8cf>(const std::complex<float>& from)
 {
-  return Packet8cf(_mm512_castpd_ps(pload1<Packet8d>((const double*)(const void*)&from)));
+  const float re = std::real(from);
+  const float im = std::imag(from);
+  return Packet8cf(_mm512_set_ps(im, re, im, re, im, re, im, re, im, re, im, re, im, re, im, re));
 }
 
 template<> EIGEN_STRONG_INLINE Packet8cf ploaddup<Packet8cf>(const std::complex<float>* from)
@@ -157,11 +159,7 @@ EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet8cf,Packet16f)
 
 template<> EIGEN_STRONG_INLINE Packet8cf pdiv<Packet8cf>(const Packet8cf& a, const Packet8cf& b)
 {
-  Packet8cf num = pmul(a, pconj(b));
-  __m512 tmp = _mm512_mul_ps(b.v, b.v);
-  __m512 tmp2 = _mm512_shuffle_ps(tmp,tmp,0xB1);
-  __m512 denom = _mm512_add_ps(tmp, tmp2);
-  return Packet8cf(_mm512_div_ps(num.v, denom));
+  return pdiv_complex(a, b);
 }
 
 template<> EIGEN_STRONG_INLINE Packet8cf pcplxflip<Packet8cf>(const Packet8cf& x)
@@ -192,7 +190,7 @@ template<> struct packet_traits<std::complex<double> > : default_packet_traits
     HasMul = 1,
     HasDiv = 1,
     HasNegate = 1,
-    HasSqrt = 1,
+    HasSqrt = EIGEN_HAS_AVX512_MATH,
     HasAbs = 0,
     HasAbs2 = 0,
     HasMin = 0,
@@ -253,11 +251,7 @@ template<> EIGEN_STRONG_INLINE Packet4cd ploadu<Packet4cd>(const std::complex<do
 
 template<> EIGEN_STRONG_INLINE Packet4cd pset1<Packet4cd>(const std::complex<double>& from)
 {
-#ifdef EIGEN_VECTORIZE_AVX512DQ
-  return Packet4cd(_mm512_broadcast_f64x2(pset1<Packet1cd>(from).v));
-#else
   return Packet4cd(_mm512_castps_pd(_mm512_broadcast_f32x4( _mm_castpd_ps(pset1<Packet1cd>(from).v))));
-#endif
 }
 
 template<> EIGEN_STRONG_INLINE Packet4cd ploaddup<Packet4cd>(const std::complex<double>* from) {
@@ -309,47 +303,11 @@ template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet4cd>(const
                                Packet2cd(_mm512_extractf64x4_pd(a.v,1))));
 }
 
-template<> struct conj_helper<Packet4cd, Packet4cd, false,true>
-{
-  EIGEN_STRONG_INLINE Packet4cd pmadd(const Packet4cd& x, const Packet4cd& y, const Packet4cd& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet4cd pmul(const Packet4cd& a, const Packet4cd& b) const
-  {
-    return internal::pmul(a, pconj(b));
-  }
-};
-
-template<> struct conj_helper<Packet4cd, Packet4cd, true,false>
-{
-  EIGEN_STRONG_INLINE Packet4cd pmadd(const Packet4cd& x, const Packet4cd& y, const Packet4cd& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet4cd pmul(const Packet4cd& a, const Packet4cd& b) const
-  {
-    return internal::pmul(pconj(a), b);
-  }
-};
-
-template<> struct conj_helper<Packet4cd, Packet4cd, true,true>
-{
-  EIGEN_STRONG_INLINE Packet4cd pmadd(const Packet4cd& x, const Packet4cd& y, const Packet4cd& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet4cd pmul(const Packet4cd& a, const Packet4cd& b) const
-  {
-    return pconj(internal::pmul(a, b));
-  }
-};
-
 EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet4cd,Packet8d)
 
 template<> EIGEN_STRONG_INLINE Packet4cd pdiv<Packet4cd>(const Packet4cd& a, const Packet4cd& b)
 {
-  Packet4cd num = pmul(a, pconj(b));
-  __m512d tmp = _mm512_mul_pd(b.v, b.v);
-  __m512d denom = padd(_mm512_permute_pd(tmp,0x55), tmp);
-  return Packet4cd(_mm512_div_pd(num.v, denom));
+  return pdiv_complex(a, b);
 }
 
 template<> EIGEN_STRONG_INLINE Packet4cd pcplxflip<Packet4cd>(const Packet4cd& x)
@@ -408,6 +366,8 @@ ptranspose(PacketBlock<Packet4cd,4>& kernel) {
   kernel.packet[0] = Packet4cd(_mm512_shuffle_f64x2(T0, T2, (shuffle_mask<0,2,0,2>::mask))); // [a0 b0 c0 d0]
 }
 
+#if EIGEN_HAS_AVX512_MATH
+
 template<> EIGEN_STRONG_INLINE Packet4cd psqrt<Packet4cd>(const Packet4cd& a) {
   return psqrt_complex<Packet4cd>(a);
 }
@@ -416,6 +376,8 @@ template<> EIGEN_STRONG_INLINE Packet8cf psqrt<Packet8cf>(const Packet8cf& a) {
   return psqrt_complex<Packet8cf>(a);
 }
 
+#endif
+
 } // end namespace internal
 } // end namespace Eigen
 
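For reference (not part of the commit), the three conjugation variants that the removed Packet4cd conj_helper specializations encoded, written out on scalars; the generic helpers now cover these cases:

#include <complex>
#include <iostream>

template <bool ConjugateLhs, bool ConjugateRhs>
std::complex<double> conj_mul(std::complex<double> a, std::complex<double> b) {
  if (ConjugateLhs) a = std::conj(a);
  if (ConjugateRhs) b = std::conj(b);
  return a * b;
}

int main() {
  const std::complex<double> a(1, 2), b(3, 4);
  std::cout << conj_mul<false, true>(a, b) << " "    // a * conj(b)
            << conj_mul<true, false>(a, b) << " "    // conj(a) * b
            << conj_mul<true, true>(a, b)  << "\n";  // conj(a) * conj(b) == conj(a * b)
}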
@@ -14,8 +14,7 @@ namespace Eigen {
 
 namespace internal {
 
-// Disable the code for older versions of gcc that don't support many of the required avx512 instrinsics.
-#if EIGEN_GNUC_AT_LEAST(5, 3) || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC >= 1923
+#if EIGEN_HAS_AVX512_MATH
 
 #define _EIGEN_DECLARE_CONST_Packet16f(NAME, X) \
   const Packet16f p16f_##NAME = pset1<Packet16f>(X)
@@ -326,7 +325,7 @@ Packet16f pexpm1<Packet16f>(const Packet16f& _x) {
 F16_PACKET_FUNCTION(Packet16f, Packet16h, pexpm1)
 BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pexpm1)
 
-#endif
+#endif // EIGEN_HAS_AVX512_MATH
 
 
 template <>
@@ -28,6 +28,13 @@ namespace internal {
 #endif
 #endif
 
+// Disable the code for older versions of gcc that don't support many of the required avx512 math instrinsics.
+#if EIGEN_GNUC_AT_LEAST(5, 3) || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC >= 1923 || EIGEN_COMP_ICC >= 1900
+#define EIGEN_HAS_AVX512_MATH 1
+#else
+#define EIGEN_HAS_AVX512_MATH 0
+#endif
+
 typedef __m512 Packet16f;
 typedef __m512i Packet16i;
 typedef __m512d Packet8d;
@@ -72,12 +79,14 @@ struct packet_traits<half> : default_packet_traits {
     HasMax = 1,
     HasConj = 1,
     HasSetLinear = 0,
-    HasLog = 1,
-    HasLog1p = 1,
-    HasExpm1 = 1,
-    HasExp = 1,
-    HasSqrt = 1,
-    HasRsqrt = 1,
+    HasLog = EIGEN_HAS_AVX512_MATH,
+    HasLog1p = EIGEN_HAS_AVX512_MATH,
+    HasExp = EIGEN_HAS_AVX512_MATH,
+    HasExpm1 = EIGEN_HAS_AVX512_MATH,
+    HasSqrt = EIGEN_HAS_AVX512_MATH,
+    HasRsqrt = EIGEN_HAS_AVX512_MATH,
+    HasBessel = EIGEN_HAS_AVX512_MATH,
+    HasNdtri = EIGEN_HAS_AVX512_MATH,
     HasSin = EIGEN_FAST_MATH,
     HasCos = EIGEN_FAST_MATH,
     HasTanh = EIGEN_FAST_MATH,
@@ -86,9 +95,7 @@ struct packet_traits<half> : default_packet_traits {
     HasRound = 1,
    HasFloor = 1,
     HasCeil = 1,
-    HasRint = 1,
-    HasBessel = 1,
-    HasNdtri = 1
+    HasRint = 1
   };
 };
 
@@ -109,7 +116,7 @@ template<> struct packet_traits<float> : default_packet_traits
     HasBlend = 0,
     HasSin = EIGEN_FAST_MATH,
     HasCos = EIGEN_FAST_MATH,
-#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT)
+#if EIGEN_HAS_AVX512_MATH
     HasLog = 1,
     HasLog1p = 1,
     HasExpm1 = 1,
@@ -138,7 +145,7 @@ template<> struct packet_traits<double> : default_packet_traits
     AlignedOnScalar = 1,
     size = 8,
     HasHalfPacket = 1,
-#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT)
+#if EIGEN_HAS_AVX512_MATH
     HasLog = 1,
     HasExp = 1,
     HasSqrt = EIGEN_FAST_MATH,
@@ -289,11 +296,20 @@ EIGEN_STRONG_INLINE Packet16i psub<Packet16i>(const Packet16i& a,
 
 template <>
 EIGEN_STRONG_INLINE Packet16f pnegate(const Packet16f& a) {
-  return _mm512_sub_ps(_mm512_set1_ps(0.0), a);
+  // NOTE: MSVC seems to struggle with _mm512_set1_epi32, leading to random results.
+  // The intel docs give it a relatively high latency as well, so we're probably
+  // better off with using _mm512_set_epi32 directly anyways.
+  const __m512i mask = _mm512_set_epi32(0x80000000,0x80000000,0x80000000,0x80000000,
+                                        0x80000000,0x80000000,0x80000000,0x80000000,
+                                        0x80000000,0x80000000,0x80000000,0x80000000,
+                                        0x80000000,0x80000000,0x80000000,0x80000000);
+  return _mm512_castsi512_ps(_mm512_xor_epi32(_mm512_castps_si512(a), mask));
 }
 template <>
 EIGEN_STRONG_INLINE Packet8d pnegate(const Packet8d& a) {
-  return _mm512_sub_pd(_mm512_set1_pd(0.0), a);
+  const __m512i mask = _mm512_set_epi64(0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL,
+                                        0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL);
+  return _mm512_castsi512_pd(_mm512_xor_epi64(_mm512_castpd_si512(a), mask));
 }
 
 template <>
@@ -1426,60 +1442,11 @@ ploadquad(const Eigen::half* from) {
 }
 
 EIGEN_STRONG_INLINE Packet16f half2float(const Packet16h& a) {
-#ifdef EIGEN_HAS_FP16_C
   return _mm512_cvtph_ps(a);
-#else
-  EIGEN_ALIGN64 half aux[16];
-  pstore(aux, a);
-  float f0(aux[0]);
-  float f1(aux[1]);
-  float f2(aux[2]);
-  float f3(aux[3]);
-  float f4(aux[4]);
-  float f5(aux[5]);
-  float f6(aux[6]);
-  float f7(aux[7]);
-  float f8(aux[8]);
-  float f9(aux[9]);
-  float fa(aux[10]);
-  float fb(aux[11]);
-  float fc(aux[12]);
-  float fd(aux[13]);
-  float fe(aux[14]);
-  float ff(aux[15]);
-
-  return _mm512_set_ps(
-      ff, fe, fd, fc, fb, fa, f9, f8, f7, f6, f5, f4, f3, f2, f1, f0);
-#endif
 }
 
 EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) {
-#ifdef EIGEN_HAS_FP16_C
   return _mm512_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
-#else
-  EIGEN_ALIGN64 float aux[16];
-  pstore(aux, a);
-  half h0(aux[0]);
-  half h1(aux[1]);
-  half h2(aux[2]);
-  half h3(aux[3]);
-  half h4(aux[4]);
-  half h5(aux[5]);
-  half h6(aux[6]);
-  half h7(aux[7]);
-  half h8(aux[8]);
-  half h9(aux[9]);
-  half ha(aux[10]);
-  half hb(aux[11]);
-  half hc(aux[12]);
-  half hd(aux[13]);
-  half he(aux[14]);
-  half hf(aux[15]);
-
-  return _mm256_set_epi16(
-      hf.x, he.x, hd.x, hc.x, hb.x, ha.x, h9.x, h8.x,
-      h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x);
-#endif
 }
 
 template<> EIGEN_STRONG_INLINE Packet16h ptrue(const Packet16h& a) {
@@ -1852,7 +1819,7 @@ struct packet_traits<bfloat16> : default_packet_traits {
     HasInsert = 1,
     HasSin = EIGEN_FAST_MATH,
     HasCos = EIGEN_FAST_MATH,
-#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT)
+#if EIGEN_HAS_AVX512_MATH
 #ifdef EIGEN_VECTORIZE_AVX512DQ
     HasLog = 1, // Currently fails test with bad accuracy.
     HasLog1p = 1,
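A reduced sketch of the pattern introduced here (not part of the commit; the MY_ names are invented): one compiler-capability macro is computed once, mirroring EIGEN_HAS_AVX512_MATH, and then reused to gate every math-related packet trait.

#include <iostream>

// Only reasonably recent GCC/Clang/MSVC get the fast math paths; everything else falls back.
#if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 5 || (__GNUC__ == 5 && __GNUC_MINOR__ >= 3))) || (defined(_MSC_VER) && _MSC_VER >= 1923)
#define MY_HAS_FAST_MATH 1
#else
#define MY_HAS_FAST_MATH 0
#endif

struct my_packet_traits {
  enum { HasSqrt = MY_HAS_FAST_MATH, HasLog = MY_HAS_FAST_MATH, HasExp = MY_HAS_FAST_MATH };
};

int main() { std::cout << my_packet_traits::HasSqrt << "\n"; }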
@@ -15,8 +15,10 @@ namespace Eigen {
 
 namespace internal {
 
-static Packet4ui p4ui_CONJ_XOR = vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_MZERO);//{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
-#ifdef __VSX__
+inline Packet4ui p4ui_CONJ_XOR() {
+  return vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_MZERO);//{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
+}
+#ifdef EIGEN_VECTORIZE_VSX
 #if defined(_BIG_ENDIAN)
 static Packet2ul p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2d_MZERO, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
 static Packet2ul p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO, (Packet4ui) p2d_MZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
@@ -44,7 +46,7 @@ struct Packet2cf
     v1 = vec_madd(v1, b.v, p4f_ZERO);
     // multiply a_im * b and get the conjugate result
     v2 = vec_madd(v2, b.v, p4f_ZERO);
-    v2 = reinterpret_cast<Packet4f>(pxor(v2, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR)));
+    v2 = reinterpret_cast<Packet4f>(pxor(v2, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR())));
     // permute back to a proper order
     v2 = vec_perm(v2, v2, p16uc_COMPLEX32_REV);
 
@@ -100,7 +102,8 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits
     HasAbs2 = 0,
     HasMin = 0,
     HasMax = 0,
-#ifdef __VSX__
+    HasSqrt = 1,
+#ifdef EIGEN_VECTORIZE_VSX
     HasBlend = 1,
 #endif
     HasSetLinear = 0
@@ -127,20 +130,20 @@ template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<
 template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { pstore((float*)to, from.v); }
 template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { pstoreu((float*)to, from.v); }
 
-EIGEN_STRONG_INLINE Packet2cf pload2(const std::complex<float>* from0, const std::complex<float>* from1)
+EIGEN_STRONG_INLINE Packet2cf pload2(const std::complex<float>& from0, const std::complex<float>& from1)
 {
   Packet4f res0, res1;
-#ifdef __VSX__
+#ifdef EIGEN_VECTORIZE_VSX
-  __asm__ ("lxsdx %x0,%y1" : "=wa" (res0) : "Z" (*from0));
+  __asm__ ("lxsdx %x0,%y1" : "=wa" (res0) : "Z" (from0));
-  __asm__ ("lxsdx %x0,%y1" : "=wa" (res1) : "Z" (*from1));
+  __asm__ ("lxsdx %x0,%y1" : "=wa" (res1) : "Z" (from1));
 #ifdef _BIG_ENDIAN
   __asm__ ("xxpermdi %x0, %x1, %x2, 0" : "=wa" (res0) : "wa" (res0), "wa" (res1));
 #else
   __asm__ ("xxpermdi %x0, %x2, %x1, 0" : "=wa" (res0) : "wa" (res0), "wa" (res1));
 #endif
 #else
-  *reinterpret_cast<std::complex<float> *>(&res0) = *from0;
+  *reinterpret_cast<std::complex<float> *>(&res0) = from0;
-  *reinterpret_cast<std::complex<float> *>(&res1) = *from1;
+  *reinterpret_cast<std::complex<float> *>(&res1) = from1;
   res0 = vec_perm(res0, res1, p16uc_TRANSPOSE64_HI);
 #endif
   return Packet2cf(res0);
@@ -164,7 +167,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf
 template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(a.v + b.v); }
 template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(a.v - b.v); }
 template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(a.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { return Packet2cf(pxor<Packet4f>(a.v, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR))); }
+template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { return Packet2cf(pxor<Packet4f>(a.v, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR()))); }
 
 template<> EIGEN_STRONG_INLINE Packet2cf pand <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pand<Packet4f>(a.v, b.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cf por <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(por<Packet4f>(a.v, b.v)); }
@@ -210,10 +213,7 @@ EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
 
 template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
 {
-  // TODO optimize it for AltiVec
-  Packet2cf res = pmul(a, pconj(b));
-  Packet4f s = pmul<Packet4f>(b.v, b.v);
-  return Packet2cf(pdiv(res.v, padd<Packet4f>(s, vec_perm(s, s, p16uc_COMPLEX32_REV))));
+  return pdiv_complex(a, b);
 }
 
 template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& x)
@@ -233,21 +233,21 @@ template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packe
   return Packet2cf(vec_and(eq, vec_perm(eq, eq, p16uc_COMPLEX32_REV)));
 }
 
-#ifdef __VSX__
+#ifdef EIGEN_VECTORIZE_VSX
 template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) {
   Packet2cf result;
   result.v = reinterpret_cast<Packet4f>(pblend<Packet2d>(ifPacket, reinterpret_cast<Packet2d>(thenPacket.v), reinterpret_cast<Packet2d>(elsePacket.v)));
   return result;
 }
-#endif
 
 template<> EIGEN_STRONG_INLINE Packet2cf psqrt<Packet2cf>(const Packet2cf& a)
 {
   return psqrt_complex<Packet2cf>(a);
 }
+#endif
 
 //---------- double ----------
-#ifdef __VSX__
+#ifdef EIGEN_VECTORIZE_VSX
 struct Packet1cd
 {
   EIGEN_STRONG_INLINE Packet1cd() {}
@@ -320,6 +320,7 @@ template<> struct packet_traits<std::complex<double> > : default_packet_traits
     HasAbs2 = 0,
     HasMin = 0,
     HasMax = 0,
+    HasSqrt = 1,
     HasSetLinear = 0
   };
 };
@@ -375,10 +376,7 @@ EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)
 
 template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
 {
-  // TODO optimize it for AltiVec
-  Packet1cd res = pmul(a,pconj(b));
-  Packet2d s = pmul<Packet2d>(b.v, b.v);
-  return Packet1cd(pdiv(res.v, padd<Packet2d>(s, vec_perm(s, s, p16uc_REVERSE64))));
+  return pdiv_complex(a, b);
 }
 
 EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
@@ -409,7 +407,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd psqrt<Packet1cd>(const Packet1cd& a)
   return psqrt_complex<Packet1cd>(a);
 }
 
-#endif // __VSX__
+#endif // EIGEN_VECTORIZE_VSX
 } // end namespace internal
 
 } // end namespace Eigen
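Scalar sketch of what the p4ui_CONJ_XOR mask, now produced by an inline function instead of a static initializer, does to each complex lane (not part of the commit): flip only the sign bit of the imaginary part.

#include <complex>
#include <cstdint>
#include <cstring>
#include <iostream>

std::complex<float> conj_by_xor(std::complex<float> z) {
  float im = z.imag();
  std::uint32_t bits;
  std::memcpy(&bits, &im, sizeof(bits));
  bits ^= 0x80000000u;                    // the 0x80000000 lanes of the CONJ_XOR mask
  std::memcpy(&im, &bits, sizeof(im));
  return std::complex<float>(z.real(), im);
}

int main() {
  std::cout << conj_by_xor({1.f, 2.f}) << "\n";  // prints: (1,-2)
}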
@@ -40,16 +40,14 @@ Packet4f pcos<Packet4f>(const Packet4f& _x)
   return pcos_float(_x);
 }
 
+#ifdef EIGEN_VECTORIZE_VSX
 #ifndef EIGEN_COMP_CLANG
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet4f prsqrt<Packet4f>(const Packet4f& x)
 {
   return vec_rsqrt(x);
 }
-#endif
 
-#ifdef __VSX__
-#ifndef EIGEN_COMP_CLANG
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet2d prsqrt<Packet2d>(const Packet2d& x)
 {
@@ -57,7 +55,7 @@ Packet2d prsqrt<Packet2d>(const Packet2d& x)
 }
 #endif
 
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet4f psqrt<Packet4f>(const Packet4f& x)
 {
   return vec_sqrt(x);
@@ -69,12 +67,43 @@ Packet2d psqrt<Packet2d>(const Packet2d& x)
   return vec_sqrt(x);
 }
 
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+#if !EIGEN_COMP_CLANG
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet4f prsqrt<Packet4f>(const Packet4f& x)
+{
+  return pset1<Packet4f>(1.0f) / psqrt<Packet4f>(x);
+  // vec_rsqrt returns different results from the generic version
+  // return vec_rsqrt(x);
+}
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet2d prsqrt<Packet2d>(const Packet2d& x)
+{
+  return pset1<Packet2d>(1.0) / psqrt<Packet2d>(x);
+  // vec_rsqrt returns different results from the generic version
+  // return vec_rsqrt(x);
+}
+#endif
+
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet2d pexp<Packet2d>(const Packet2d& _x)
 {
   return pexp_double(_x);
 }
-#endif
+
+template<> EIGEN_STRONG_INLINE Packet8bf psqrt<Packet8bf> (const Packet8bf& a){
+  BF16_TO_F32_UNARY_OP_WRAPPER(vec_sqrt, a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf prsqrt<Packet8bf> (const Packet8bf& a){
+  BF16_TO_F32_UNARY_OP_WRAPPER(prsqrt<Packet4f>, a);
+}
+
+template<> EIGEN_STRONG_INLINE Packet8bf pexp<Packet8bf> (const Packet8bf& a){
+  BF16_TO_F32_UNARY_OP_WRAPPER(pexp_float, a);
+}
+
+#endif // EIGEN_VECTORIZE_VSX
 
 // Hyperbolic Tangent function.
 template <>
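Sketch of the behaviour the patch standardizes on (not part of the commit): prsqrt now computes an exact 1 / sqrt(x) per lane instead of the vec_rsqrt hardware estimate, so the AltiVec path agrees with the generic packet math.

#include <cmath>
#include <iostream>

float exact_rsqrt(float x) { return 1.0f / std::sqrt(x); }  // what the patched kernel computes per lane

int main() {
  std::cout << exact_rsqrt(4.0f) << " " << exact_rsqrt(2.0f) << "\n";  // 0.5 0.707107
}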
File diff suppressed because it is too large
@@ -9,22 +9,8 @@ namespace Eigen {
 
 namespace internal {
 
-template<typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accRows>
-EIGEN_STRONG_INLINE void gemm_extra_col(
-  const DataMapper& res,
-  const Scalar* lhs_base,
-  const Scalar* rhs_base,
-  Index depth,
-  Index strideA,
-  Index offsetA,
-  Index row,
-  Index col,
-  Index remaining_rows,
-  Index remaining_cols,
-  const Packet& pAlpha);
-
 template<typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accRows, const Index accCols>
-EIGEN_STRONG_INLINE void gemm_extra_row(
+EIGEN_ALWAYS_INLINE void gemm_extra_row(
   const DataMapper& res,
   const Scalar* lhs_base,
   const Scalar* rhs_base,
@@ -39,41 +25,28 @@ EIGEN_STRONG_INLINE void gemm_extra_row(
   const Packet& pAlpha,
   const Packet& pMask);
 
-template<typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accCols>
-EIGEN_STRONG_INLINE void gemm_unrolled_col(
+template<typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_STRONG_INLINE void gemm_extra_cols(
   const DataMapper& res,
-  const Scalar* lhs_base,
-  const Scalar* rhs_base,
+  const Scalar* blockA,
+  const Scalar* blockB,
   Index depth,
   Index strideA,
   Index offsetA,
-  Index& row,
-  Index rows,
+  Index strideB,
+  Index offsetB,
   Index col,
-  Index remaining_cols,
-  const Packet& pAlpha);
+  Index rows,
+  Index cols,
+  Index remaining_rows,
+  const Packet& pAlpha,
+  const Packet& pMask);
 
 template<typename Packet>
 EIGEN_ALWAYS_INLINE Packet bmask(const int remaining_rows);
 
 template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, typename Index, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
-EIGEN_STRONG_INLINE void gemm_complex_extra_col(
-  const DataMapper& res,
-  const Scalar* lhs_base,
-  const Scalar* rhs_base,
-  Index depth,
-  Index strideA,
-  Index offsetA,
-  Index strideB,
-  Index row,
-  Index col,
-  Index remaining_rows,
-  Index remaining_cols,
-  const Packet& pAlphaReal,
-  const Packet& pAlphaImag);
-
-template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, typename Index, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
-EIGEN_STRONG_INLINE void gemm_complex_extra_row(
+EIGEN_ALWAYS_INLINE void gemm_complex_extra_row(
   const DataMapper& res,
   const Scalar* lhs_base,
   const Scalar* rhs_base,
@@ -91,123 +64,88 @@ EIGEN_STRONG_INLINE void gemm_complex_extra_row(
   const Packet& pMask);
 
 template<typename Scalar, typename Packet, typename Packetc, typename DataMapper, typename Index, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
-EIGEN_STRONG_INLINE void gemm_complex_unrolled_col(
+EIGEN_STRONG_INLINE void gemm_complex_extra_cols(
   const DataMapper& res,
-  const Scalar* lhs_base,
-  const Scalar* rhs_base,
+  const Scalar* blockA,
+  const Scalar* blockB,
   Index depth,
   Index strideA,
   Index offsetA,
   Index strideB,
-  Index& row,
-  Index rows,
+  Index offsetB,
   Index col,
-  Index remaining_cols,
+  Index rows,
+  Index cols,
+  Index remaining_rows,
   const Packet& pAlphaReal,
-  const Packet& pAlphaImag);
+  const Packet& pAlphaImag,
+  const Packet& pMask);
 
 template<typename Scalar, typename Packet>
 EIGEN_ALWAYS_INLINE Packet ploadLhs(const Scalar* lhs);
 
-template<typename DataMapper, typename Packet, typename Index, const Index accCols, int N, int StorageOrder>
-EIGEN_ALWAYS_INLINE void bload(PacketBlock<Packet,4>& acc, const DataMapper& res, Index row, Index col);
+template<typename DataMapper, typename Packet, typename Index, const Index accCols, int StorageOrder, bool Complex, int N>
+EIGEN_ALWAYS_INLINE void bload(PacketBlock<Packet,N>& acc, const DataMapper& res, Index row, Index col);
 
-template<typename DataMapper, typename Packet, typename Index, const Index accCols, int N, int StorageOrder>
-EIGEN_ALWAYS_INLINE void bload(PacketBlock<Packet,8>& acc, const DataMapper& res, Index row, Index col);
+template<typename Packet, int N>
+EIGEN_ALWAYS_INLINE void bscale(PacketBlock<Packet,N>& acc, PacketBlock<Packet,N>& accZ, const Packet& pAlpha);
 
-template<typename Packet>
-EIGEN_ALWAYS_INLINE void bscale(PacketBlock<Packet,4>& acc, PacketBlock<Packet,4>& accZ, const Packet& pAlpha);
|
|
||||||
|
|
||||||
template<typename Packet, int N>
|
template<typename Packet, int N>
|
||||||
EIGEN_ALWAYS_INLINE void bscalec(PacketBlock<Packet,N>& aReal, PacketBlock<Packet,N>& aImag, const Packet& bReal, const Packet& bImag, PacketBlock<Packet,N>& cReal, PacketBlock<Packet,N>& cImag);
|
EIGEN_ALWAYS_INLINE void bscalec(PacketBlock<Packet,N>& aReal, PacketBlock<Packet,N>& aImag, const Packet& bReal, const Packet& bImag, PacketBlock<Packet,N>& cReal, PacketBlock<Packet,N>& cImag);
|
||||||
|
|
||||||
const static Packet16uc p16uc_SETCOMPLEX32_FIRST = { 0, 1, 2, 3,
|
|
||||||
16, 17, 18, 19,
|
|
||||||
4, 5, 6, 7,
|
|
||||||
20, 21, 22, 23};
|
|
||||||
|
|
||||||
const static Packet16uc p16uc_SETCOMPLEX32_SECOND = { 8, 9, 10, 11,
|
|
||||||
24, 25, 26, 27,
|
|
||||||
12, 13, 14, 15,
|
|
||||||
28, 29, 30, 31};
|
|
||||||
//[a,b],[ai,bi] = [a,ai] - This is equivalent to p16uc_GETREAL64
|
|
||||||
const static Packet16uc p16uc_SETCOMPLEX64_FIRST = { 0, 1, 2, 3, 4, 5, 6, 7,
|
|
||||||
16, 17, 18, 19, 20, 21, 22, 23};
|
|
||||||
|
|
||||||
//[a,b],[ai,bi] = [b,bi] - This is equivalent to p16uc_GETIMAG64
|
|
||||||
const static Packet16uc p16uc_SETCOMPLEX64_SECOND = { 8, 9, 10, 11, 12, 13, 14, 15,
|
|
||||||
24, 25, 26, 27, 28, 29, 30, 31};
|
|
||||||
|
|
||||||
|
|
||||||
// Grab two decoupled real/imaginary PacketBlocks and return two coupled (real/imaginary pairs) PacketBlocks.
|
// Grab two decoupled real/imaginary PacketBlocks and return two coupled (real/imaginary pairs) PacketBlocks.
|
||||||
template<typename Packet, typename Packetc>
|
template<typename Packet, typename Packetc, int N>
|
||||||
EIGEN_ALWAYS_INLINE void bcouple_common(PacketBlock<Packet,4>& taccReal, PacketBlock<Packet,4>& taccImag, PacketBlock<Packetc, 4>& acc1, PacketBlock<Packetc, 4>& acc2)
|
EIGEN_ALWAYS_INLINE void bcouple_common(PacketBlock<Packet,N>& taccReal, PacketBlock<Packet,N>& taccImag, PacketBlock<Packetc, N>& acc1, PacketBlock<Packetc, N>& acc2)
|
||||||
{
|
{
|
||||||
acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX32_FIRST);
|
acc1.packet[0].v = vec_mergeh(taccReal.packet[0], taccImag.packet[0]);
|
||||||
acc1.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX32_FIRST);
|
if (N > 1) {
|
||||||
acc1.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], p16uc_SETCOMPLEX32_FIRST);
|
acc1.packet[1].v = vec_mergeh(taccReal.packet[1], taccImag.packet[1]);
|
||||||
acc1.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], p16uc_SETCOMPLEX32_FIRST);
|
}
|
||||||
|
if (N > 2) {
|
||||||
|
acc1.packet[2].v = vec_mergeh(taccReal.packet[2], taccImag.packet[2]);
|
||||||
|
}
|
||||||
|
if (N > 3) {
|
||||||
|
acc1.packet[3].v = vec_mergeh(taccReal.packet[3], taccImag.packet[3]);
|
||||||
|
}
|
||||||
|
|
||||||
acc2.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX32_SECOND);
|
acc2.packet[0].v = vec_mergel(taccReal.packet[0], taccImag.packet[0]);
|
||||||
acc2.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX32_SECOND);
|
if (N > 1) {
|
||||||
acc2.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], p16uc_SETCOMPLEX32_SECOND);
|
acc2.packet[1].v = vec_mergel(taccReal.packet[1], taccImag.packet[1]);
|
||||||
acc2.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], p16uc_SETCOMPLEX32_SECOND);
|
}
|
||||||
|
if (N > 2) {
|
||||||
|
acc2.packet[2].v = vec_mergel(taccReal.packet[2], taccImag.packet[2]);
|
||||||
|
}
|
||||||
|
if (N > 3) {
|
||||||
|
acc2.packet[3].v = vec_mergel(taccReal.packet[3], taccImag.packet[3]);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
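A scalar sketch of the coupling step described in the comment above: vec_mergeh interleaves the low halves of the real and imaginary packets and vec_mergel the high halves, turning separate real/imag lanes into (real, imag) pairs. Plain float arrays stand in for the vector registers; this illustrates the lane shuffle only and is not Eigen code:

void couple4(const float re[4], const float im[4], float acc1[4], float acc2[4]) {
  // vec_mergeh(re, im) -> { r0, i0, r1, i1 }
  acc1[0] = re[0]; acc1[1] = im[0]; acc1[2] = re[1]; acc1[3] = im[1];
  // vec_mergel(re, im) -> { r2, i2, r3, i3 }
  acc2[0] = re[2]; acc2[1] = im[2]; acc2[2] = re[3]; acc2[3] = im[3];
}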
|
|
||||||
template<typename Packet, typename Packetc>
|
template<typename Packet, typename Packetc, int N>
|
||||||
EIGEN_ALWAYS_INLINE void bcouple(PacketBlock<Packet,4>& taccReal, PacketBlock<Packet,4>& taccImag, PacketBlock<Packetc,8>& tRes, PacketBlock<Packetc, 4>& acc1, PacketBlock<Packetc, 4>& acc2)
|
EIGEN_ALWAYS_INLINE void bcouple(PacketBlock<Packet,N>& taccReal, PacketBlock<Packet,N>& taccImag, PacketBlock<Packetc,N*2>& tRes, PacketBlock<Packetc, N>& acc1, PacketBlock<Packetc, N>& acc2)
|
||||||
{
|
{
|
||||||
bcouple_common<Packet, Packetc>(taccReal, taccImag, acc1, acc2);
|
bcouple_common<Packet, Packetc, N>(taccReal, taccImag, acc1, acc2);
|
||||||
|
|
||||||
acc1.packet[0] = padd<Packetc>(tRes.packet[0], acc1.packet[0]);
|
acc1.packet[0] = padd<Packetc>(tRes.packet[0], acc1.packet[0]);
|
||||||
|
if (N > 1) {
|
||||||
acc1.packet[1] = padd<Packetc>(tRes.packet[1], acc1.packet[1]);
|
acc1.packet[1] = padd<Packetc>(tRes.packet[1], acc1.packet[1]);
|
||||||
|
}
|
||||||
|
if (N > 2) {
|
||||||
acc1.packet[2] = padd<Packetc>(tRes.packet[2], acc1.packet[2]);
|
acc1.packet[2] = padd<Packetc>(tRes.packet[2], acc1.packet[2]);
|
||||||
|
}
|
||||||
|
if (N > 3) {
|
||||||
acc1.packet[3] = padd<Packetc>(tRes.packet[3], acc1.packet[3]);
|
acc1.packet[3] = padd<Packetc>(tRes.packet[3], acc1.packet[3]);
|
||||||
|
}
|
||||||
|
|
||||||
acc2.packet[0] = padd<Packetc>(tRes.packet[4], acc2.packet[0]);
|
acc2.packet[0] = padd<Packetc>(tRes.packet[0+N], acc2.packet[0]);
|
||||||
acc2.packet[1] = padd<Packetc>(tRes.packet[5], acc2.packet[1]);
|
if (N > 1) {
|
||||||
acc2.packet[2] = padd<Packetc>(tRes.packet[6], acc2.packet[2]);
|
acc2.packet[1] = padd<Packetc>(tRes.packet[1+N], acc2.packet[1]);
|
||||||
acc2.packet[3] = padd<Packetc>(tRes.packet[7], acc2.packet[3]);
|
}
|
||||||
}
|
if (N > 2) {
|
||||||
|
acc2.packet[2] = padd<Packetc>(tRes.packet[2+N], acc2.packet[2]);
|
||||||
template<typename Packet, typename Packetc>
|
}
|
||||||
EIGEN_ALWAYS_INLINE void bcouple_common(PacketBlock<Packet,1>& taccReal, PacketBlock<Packet,1>& taccImag, PacketBlock<Packetc, 1>& acc1, PacketBlock<Packetc, 1>& acc2)
|
if (N > 3) {
|
||||||
{
|
acc2.packet[3] = padd<Packetc>(tRes.packet[3+N], acc2.packet[3]);
|
||||||
acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX32_FIRST);
|
}
|
||||||
|
|
||||||
acc2.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX32_SECOND);
|
|
||||||
}
|
|
||||||
|
|
||||||
template<typename Packet, typename Packetc>
|
|
||||||
EIGEN_ALWAYS_INLINE void bcouple(PacketBlock<Packet,1>& taccReal, PacketBlock<Packet,1>& taccImag, PacketBlock<Packetc,2>& tRes, PacketBlock<Packetc, 1>& acc1, PacketBlock<Packetc, 1>& acc2)
|
|
||||||
{
|
|
||||||
bcouple_common<Packet, Packetc>(taccReal, taccImag, acc1, acc2);
|
|
||||||
|
|
||||||
acc1.packet[0] = padd<Packetc>(tRes.packet[0], acc1.packet[0]);
|
|
||||||
|
|
||||||
acc2.packet[0] = padd<Packetc>(tRes.packet[1], acc2.packet[0]);
|
|
||||||
}
|
|
||||||
|
|
||||||
template<>
|
|
||||||
EIGEN_ALWAYS_INLINE void bcouple_common<Packet2d, Packet1cd>(PacketBlock<Packet2d,4>& taccReal, PacketBlock<Packet2d,4>& taccImag, PacketBlock<Packet1cd, 4>& acc1, PacketBlock<Packet1cd, 4>& acc2)
|
|
||||||
{
|
|
||||||
acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX64_FIRST);
|
|
||||||
acc1.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX64_FIRST);
|
|
||||||
acc1.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], p16uc_SETCOMPLEX64_FIRST);
|
|
||||||
acc1.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], p16uc_SETCOMPLEX64_FIRST);
|
|
||||||
|
|
||||||
acc2.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX64_SECOND);
|
|
||||||
acc2.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX64_SECOND);
|
|
||||||
acc2.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], p16uc_SETCOMPLEX64_SECOND);
|
|
||||||
acc2.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], p16uc_SETCOMPLEX64_SECOND);
|
|
||||||
}
|
|
||||||
|
|
||||||
template<>
|
|
||||||
EIGEN_ALWAYS_INLINE void bcouple_common<Packet2d, Packet1cd>(PacketBlock<Packet2d,1>& taccReal, PacketBlock<Packet2d,1>& taccImag, PacketBlock<Packet1cd, 1>& acc1, PacketBlock<Packet1cd, 1>& acc2)
|
|
||||||
{
|
|
||||||
acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX64_FIRST);
|
|
||||||
|
|
||||||
acc2.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX64_SECOND);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// This is necessary because ploadRhs for double returns a pair of vectors when MMA is enabled.
|
// This is necessary because ploadRhs for double returns a pair of vectors when MMA is enabled.
|
||||||
|
|||||||
@@ -11,7 +11,11 @@
|
|||||||
#ifndef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
|
#ifndef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
|
||||||
#define EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
|
#define EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
|
||||||
|
|
||||||
#pragma GCC target("cpu=power10")
|
// If using dynamic dispatch, set the CPU target.
|
||||||
|
#if defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH)
|
||||||
|
#pragma GCC push_options
|
||||||
|
#pragma GCC target("cpu=power10,htm")
|
||||||
|
#endif
|
||||||
|
|
||||||
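The push_options/pop_options pair introduced above scopes the power10 code-generation target to this header when EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH is defined, instead of forcing it unconditionally as the old pragma did. A minimal sketch of that GCC scoping pattern; the function is illustrative and the target string only makes sense on PowerPC compilers:

#pragma GCC push_options
#pragma GCC target("cpu=power10")
// Code between push_options and pop_options is compiled for power10.
static inline int built_for_power10(int x) { return x + 1; }
#pragma GCC pop_options
// Code after pop_options reverts to the translation unit's baseline target.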
#ifdef __has_builtin
|
#ifdef __has_builtin
|
||||||
#if !__has_builtin(__builtin_vsx_assemble_pair)
|
#if !__has_builtin(__builtin_vsx_assemble_pair)
|
||||||
@@ -30,37 +34,37 @@ EIGEN_ALWAYS_INLINE void bsetzeroMMA(__vector_quad* acc)
|
|||||||
}
|
}
|
||||||
|
|
||||||
template<typename DataMapper, typename Index, typename Packet, const Index accCols>
|
template<typename DataMapper, typename Index, typename Packet, const Index accCols>
|
||||||
EIGEN_ALWAYS_INLINE void storeAccumulator(Index i, Index j, const DataMapper& data, const Packet& alpha, __vector_quad* acc)
|
EIGEN_ALWAYS_INLINE void storeAccumulator(Index i, const DataMapper& data, const Packet& alpha, __vector_quad* acc)
|
||||||
{
|
{
|
||||||
PacketBlock<Packet, 4> result;
|
PacketBlock<Packet, 4> result;
|
||||||
__builtin_mma_disassemble_acc(&result.packet, acc);
|
__builtin_mma_disassemble_acc(&result.packet, acc);
|
||||||
|
|
||||||
PacketBlock<Packet, 4> tRes;
|
PacketBlock<Packet, 4> tRes;
|
||||||
bload<DataMapper, Packet, Index, accCols, 0, ColMajor>(tRes, data, i, j);
|
bload<DataMapper, Packet, Index, accCols, ColMajor, false, 4>(tRes, data, i, 0);
|
||||||
|
|
||||||
bscale<Packet>(tRes, result, alpha);
|
bscale<Packet, 4>(tRes, result, alpha);
|
||||||
|
|
||||||
data.template storePacketBlock<Packet, 4>(i, j, tRes);
|
data.template storePacketBlock<Packet, 4>(i, 0, tRes);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename DataMapper, typename Index, typename Packet, typename Packetc, const Index accColsC, int N>
|
template<typename DataMapper, typename Index, typename Packet, typename Packetc, const Index accColsC>
|
||||||
EIGEN_ALWAYS_INLINE void storeComplexAccumulator(Index i, Index j, const DataMapper& data, const Packet& alphaReal, const Packet& alphaImag, __vector_quad* accReal, __vector_quad* accImag)
|
EIGEN_ALWAYS_INLINE void storeComplexAccumulator(Index i, const DataMapper& data, const Packet& alphaReal, const Packet& alphaImag, __vector_quad* accReal, __vector_quad* accImag)
|
||||||
{
|
{
|
||||||
PacketBlock<Packet, 4> resultReal, resultImag;
|
PacketBlock<Packet, 4> resultReal, resultImag;
|
||||||
__builtin_mma_disassemble_acc(&resultReal.packet, accReal);
|
__builtin_mma_disassemble_acc(&resultReal.packet, accReal);
|
||||||
__builtin_mma_disassemble_acc(&resultImag.packet, accImag);
|
__builtin_mma_disassemble_acc(&resultImag.packet, accImag);
|
||||||
|
|
||||||
PacketBlock<Packetc, 8> tRes;
|
PacketBlock<Packetc, 8> tRes;
|
||||||
bload<DataMapper, Packetc, Index, accColsC, N, ColMajor>(tRes, data, i, j);
|
bload<DataMapper, Packetc, Index, accColsC, ColMajor, true, 4>(tRes, data, i, 0);
|
||||||
|
|
||||||
PacketBlock<Packet,4> taccReal, taccImag;
|
PacketBlock<Packet,4> taccReal, taccImag;
|
||||||
bscalec<Packet,4>(resultReal, resultImag, alphaReal, alphaImag, taccReal, taccImag);
|
bscalec<Packet,4>(resultReal, resultImag, alphaReal, alphaImag, taccReal, taccImag);
|
||||||
|
|
||||||
PacketBlock<Packetc, 4> acc1, acc2;
|
PacketBlock<Packetc, 4> acc1, acc2;
|
||||||
bcouple<Packet, Packetc>(taccReal, taccImag, tRes, acc1, acc2);
|
bcouple<Packet, Packetc, 4>(taccReal, taccImag, tRes, acc1, acc2);
|
||||||
|
|
||||||
data.template storePacketBlock<Packetc, 4>(i + N*accColsC, j, acc1);
|
data.template storePacketBlock<Packetc, 4>(i, 0, acc1);
|
||||||
data.template storePacketBlock<Packetc, 4>(i + (N+1)*accColsC, j, acc2);
|
data.template storePacketBlock<Packetc, 4>(i + accColsC, 0, acc2);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Defaults to float32, since Eigen still supports C++03, we can't use default template arguments
|
// Defaults to float32, since Eigen still supports C++03, we can't use default template arguments
|
||||||
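The comment above refers to a C++03 restriction: default template arguments are not allowed on function templates, so a kernel that "defaults to float32" has to be spelled differently. A hedged sketch of the usual workaround, with both function names invented for illustration:

#include <cstdio>

template <typename Scalar>
void kernel() { std::printf("scalar of %lu bytes\n", static_cast<unsigned long>(sizeof(Scalar))); }

// C++11 and later could write: template <typename Scalar = float> void kernel();
// The C++03-compatible spelling of the float32 default is a forwarding overload:
inline void kernel_default() { kernel<float>(); }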
@@ -125,7 +129,7 @@ EIGEN_ALWAYS_INLINE void pgercMMA(__vector_quad* accReal, __vector_quad* accImag
|
|||||||
template<typename Scalar, typename Packet>
|
template<typename Scalar, typename Packet>
|
||||||
EIGEN_ALWAYS_INLINE void ploadRhsMMA(const Scalar* rhs, Packet& rhsV)
|
EIGEN_ALWAYS_INLINE void ploadRhsMMA(const Scalar* rhs, Packet& rhsV)
|
||||||
{
|
{
|
||||||
rhsV = ploadRhs<Scalar, Packet>((const Scalar*)(rhs));
|
rhsV = ploadRhs<Scalar, Packet>(rhs);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template<>
|
||||||
@@ -184,12 +188,11 @@ EIGEN_ALWAYS_INLINE void ploadRhsMMA(const float*, __vector_pair&)
|
|||||||
}
|
}
|
||||||
|
|
||||||
#define MICRO_MMA_UNROLL_TYPE_PEEL(func, func2, type) \
|
#define MICRO_MMA_UNROLL_TYPE_PEEL(func, func2, type) \
|
||||||
type rhsV0, rhsV1, rhsV2, rhsV3, rhsV4, rhsV5, rhsV6, rhsV7, rhsV8, rhsV9; \
|
type rhsV0, rhsV1, rhsV2, rhsV3, rhsV4, rhsV5, rhsV6, rhsV7; \
|
||||||
MICRO_MMA_TYPE_PEEL(func,func2,type,0); MICRO_MMA_TYPE_PEEL(func,func2,type,1); \
|
MICRO_MMA_TYPE_PEEL(func,func2,type,0); MICRO_MMA_TYPE_PEEL(func,func2,type,1); \
|
||||||
MICRO_MMA_TYPE_PEEL(func,func2,type,2); MICRO_MMA_TYPE_PEEL(func,func2,type,3); \
|
MICRO_MMA_TYPE_PEEL(func,func2,type,2); MICRO_MMA_TYPE_PEEL(func,func2,type,3); \
|
||||||
MICRO_MMA_TYPE_PEEL(func,func2,type,4); MICRO_MMA_TYPE_PEEL(func,func2,type,5); \
|
MICRO_MMA_TYPE_PEEL(func,func2,type,4); MICRO_MMA_TYPE_PEEL(func,func2,type,5); \
|
||||||
MICRO_MMA_TYPE_PEEL(func,func2,type,6); MICRO_MMA_TYPE_PEEL(func,func2,type,7); \
|
MICRO_MMA_TYPE_PEEL(func,func2,type,6); MICRO_MMA_TYPE_PEEL(func,func2,type,7);
|
||||||
MICRO_MMA_TYPE_PEEL(func,func2,type,8); MICRO_MMA_TYPE_PEEL(func,func2,type,9);
|
|
||||||
|
|
||||||
#define MICRO_MMA_UNROLL_TYPE_ONE(func, func2, type) \
|
#define MICRO_MMA_UNROLL_TYPE_ONE(func, func2, type) \
|
||||||
type rhsV0; \
|
type rhsV0; \
|
||||||
@@ -222,7 +225,7 @@ EIGEN_ALWAYS_INLINE void ploadRhsMMA(const float*, __vector_pair&)
|
|||||||
|
|
||||||
#define MICRO_MMA_SRC_PTR_ONE(iter) \
|
#define MICRO_MMA_SRC_PTR_ONE(iter) \
|
||||||
if (unroll_factor > iter) { \
|
if (unroll_factor > iter) { \
|
||||||
lhs_ptr##iter = lhs_base + ( (row/accCols) + iter )*strideA*accCols + accCols*offsetA; \
|
lhs_ptr##iter = lhs_base + ( (row/accCols) + iter )*strideA*accCols; \
|
||||||
} else { \
|
} else { \
|
||||||
EIGEN_UNUSED_VARIABLE(lhs_ptr##iter); \
|
EIGEN_UNUSED_VARIABLE(lhs_ptr##iter); \
|
||||||
}
|
}
|
||||||
@@ -238,21 +241,19 @@ EIGEN_ALWAYS_INLINE void ploadRhsMMA(const float*, __vector_pair&)
|
|||||||
|
|
||||||
#define MICRO_MMA_STORE_ONE(iter) \
|
#define MICRO_MMA_STORE_ONE(iter) \
|
||||||
if (unroll_factor > iter) { \
|
if (unroll_factor > iter) { \
|
||||||
storeAccumulator<DataMapper, Index, Packet, accCols>(row + iter*accCols, col, res, pAlpha, &accZero##iter); \
|
storeAccumulator<DataMapper, Index, Packet, accCols>(row + iter*accCols, res, pAlpha, &accZero##iter); \
|
||||||
}
|
}
|
||||||
|
|
||||||
#define MICRO_MMA_STORE MICRO_MMA_UNROLL(MICRO_MMA_STORE_ONE)
|
#define MICRO_MMA_STORE MICRO_MMA_UNROLL(MICRO_MMA_STORE_ONE)
|
||||||
|
|
||||||
template<int unroll_factor, typename Scalar, typename Packet, typename RhsPacket, typename DataMapper, typename Index, const Index accRows, const Index accCols>
|
template<int unroll_factor, typename Scalar, typename Packet, typename RhsPacket, typename DataMapper, typename Index, const Index accRows, const Index accCols>
|
||||||
EIGEN_STRONG_INLINE void gemm_unrolled_MMA_iteration(
|
EIGEN_ALWAYS_INLINE void gemm_unrolled_MMA_iteration(
|
||||||
const DataMapper& res,
|
const DataMapper& res,
|
||||||
const Scalar* lhs_base,
|
const Scalar* lhs_base,
|
||||||
const Scalar* rhs_base,
|
const Scalar* rhs_base,
|
||||||
Index depth,
|
Index depth,
|
||||||
Index strideA,
|
Index strideA,
|
||||||
Index offsetA,
|
|
||||||
Index& row,
|
Index& row,
|
||||||
Index col,
|
|
||||||
const Packet& pAlpha)
|
const Packet& pAlpha)
|
||||||
{
|
{
|
||||||
const Scalar* rhs_ptr = rhs_base;
|
const Scalar* rhs_ptr = rhs_base;
|
||||||
@@ -278,11 +279,84 @@ EIGEN_STRONG_INLINE void gemm_unrolled_MMA_iteration(
|
|||||||
row += unroll_factor*accCols;
|
row += unroll_factor*accCols;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<typename Scalar, typename Packet, typename RhsPacket, typename DataMapper, typename Index, const Index accRows, const Index accCols>
|
||||||
|
EIGEN_ALWAYS_INLINE void gemmMMA_cols(
|
||||||
|
const DataMapper& res,
|
||||||
|
const Scalar* blockA,
|
||||||
|
const Scalar* blockB,
|
||||||
|
Index depth,
|
||||||
|
Index strideA,
|
||||||
|
Index offsetA,
|
||||||
|
Index strideB,
|
||||||
|
Index offsetB,
|
||||||
|
Index col,
|
||||||
|
Index rows,
|
||||||
|
Index cols,
|
||||||
|
Index remaining_rows,
|
||||||
|
const Packet& pAlpha,
|
||||||
|
const Packet& pMask)
|
||||||
|
{
|
||||||
|
const DataMapper res3 = res.getSubMapper(0, col);
|
||||||
|
|
||||||
|
const Scalar* rhs_base = blockB + col*strideB + accRows*offsetB;
|
||||||
|
const Scalar* lhs_base = blockA + accCols*offsetA;
|
||||||
|
Index row = 0;
|
||||||
|
|
||||||
|
#define MAX_MMA_UNROLL 7
|
||||||
|
while(row + MAX_MMA_UNROLL*accCols <= rows) {
|
||||||
|
gemm_unrolled_MMA_iteration<MAX_MMA_UNROLL, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha);
|
||||||
|
}
|
||||||
|
switch( (rows-row)/accCols ) {
|
||||||
|
#if MAX_MMA_UNROLL > 7
|
||||||
|
case 7:
|
||||||
|
gemm_unrolled_MMA_iteration<7, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha);
|
||||||
|
break;
|
||||||
|
#endif
|
||||||
|
#if MAX_MMA_UNROLL > 6
|
||||||
|
case 6:
|
||||||
|
gemm_unrolled_MMA_iteration<6, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha);
|
||||||
|
break;
|
||||||
|
#endif
|
||||||
|
#if MAX_MMA_UNROLL > 5
|
||||||
|
case 5:
|
||||||
|
gemm_unrolled_MMA_iteration<5, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha);
|
||||||
|
break;
|
||||||
|
#endif
|
||||||
|
#if MAX_MMA_UNROLL > 4
|
||||||
|
case 4:
|
||||||
|
gemm_unrolled_MMA_iteration<4, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha);
|
||||||
|
break;
|
||||||
|
#endif
|
||||||
|
#if MAX_MMA_UNROLL > 3
|
||||||
|
case 3:
|
||||||
|
gemm_unrolled_MMA_iteration<3, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha);
|
||||||
|
break;
|
||||||
|
#endif
|
||||||
|
#if MAX_MMA_UNROLL > 2
|
||||||
|
case 2:
|
||||||
|
gemm_unrolled_MMA_iteration<2, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha);
|
||||||
|
break;
|
||||||
|
#endif
|
||||||
|
#if MAX_MMA_UNROLL > 1
|
||||||
|
case 1:
|
||||||
|
gemm_unrolled_MMA_iteration<1, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha);
|
||||||
|
break;
|
||||||
|
#endif
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
#undef MAX_MMA_UNROLL
|
||||||
|
|
||||||
|
if(remaining_rows > 0)
|
||||||
|
{
|
||||||
|
gemm_extra_row<Scalar, Packet, DataMapper, Index, accRows, accCols>(res3, blockA, rhs_base, depth, strideA, offsetA, row, col, rows, cols, remaining_rows, pAlpha, pMask);
|
||||||
|
}
|
||||||
|
}
|
||||||
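gemmMMA_cols above uses a common dispatch shape: consume rows in blocks of MAX_MMA_UNROLL*accCols, then jump once into a switch that handles whatever whole multiple of accCols remains, leaving only the sub-accCols remainder to gemm_extra_row. A stripped-down sketch of that pattern, with process<N>() standing in for gemm_unrolled_MMA_iteration:

template <int N>
void process(long& row, long accCols) { /* one unrolled iteration */ row += N * accCols; }

void dispatch_rows(long rows, long accCols) {
  const int MAX_UNROLL = 7;                       // mirrors MAX_MMA_UNROLL above
  long row = 0;
  while (row + MAX_UNROLL * accCols <= rows) process<MAX_UNROLL>(row, accCols);
  switch ((rows - row) / accCols) {               // remaining whole blocks of accCols
    case 6: process<6>(row, accCols); break;
    case 5: process<5>(row, accCols); break;
    case 4: process<4>(row, accCols); break;
    case 3: process<3>(row, accCols); break;
    case 2: process<2>(row, accCols); break;
    case 1: process<1>(row, accCols); break;
    default: break;                               // rows % accCols is handled by the extra-row path
  }
}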
|
|
||||||
template<typename Scalar, typename Index, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols>
|
template<typename Scalar, typename Index, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols>
|
||||||
void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index rows, Index depth, Index cols, Scalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
|
void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index rows, Index depth, Index cols, Scalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
|
||||||
{
|
{
|
||||||
const Index remaining_rows = rows % accCols;
|
const Index remaining_rows = rows % accCols;
|
||||||
const Index remaining_cols = cols % accRows;
|
|
||||||
|
|
||||||
if( strideA == -1 ) strideA = depth;
|
if( strideA == -1 ) strideA = depth;
|
||||||
if( strideB == -1 ) strideB = depth;
|
if( strideB == -1 ) strideB = depth;
|
||||||
@@ -293,79 +367,10 @@ void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB,
|
|||||||
Index col = 0;
|
Index col = 0;
|
||||||
for(; col + accRows <= cols; col += accRows)
|
for(; col + accRows <= cols; col += accRows)
|
||||||
{
|
{
|
||||||
const Scalar* rhs_base = blockB + col*strideB + accRows*offsetB;
|
gemmMMA_cols<Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlpha, pMask);
|
||||||
const Scalar* lhs_base = blockA;
|
|
||||||
|
|
||||||
Index row = 0;
|
|
||||||
#define MAX_MMA_UNROLL 7
|
|
||||||
while(row + MAX_MMA_UNROLL*accCols <= rows) {
|
|
||||||
gemm_unrolled_MMA_iteration<MAX_MMA_UNROLL, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
|
|
||||||
}
|
|
||||||
switch( (rows-row)/accCols ) {
|
|
||||||
#if MAX_MMA_UNROLL > 7
|
|
||||||
case 7:
|
|
||||||
gemm_unrolled_MMA_iteration<7, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
|
|
||||||
break;
|
|
||||||
#endif
|
|
||||||
#if MAX_MMA_UNROLL > 6
|
|
||||||
case 6:
|
|
||||||
gemm_unrolled_MMA_iteration<6, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
|
|
||||||
break;
|
|
||||||
#endif
|
|
||||||
#if MAX_MMA_UNROLL > 5
|
|
||||||
case 5:
|
|
||||||
gemm_unrolled_MMA_iteration<5, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
|
|
||||||
break;
|
|
||||||
#endif
|
|
||||||
#if MAX_MMA_UNROLL > 4
|
|
||||||
case 4:
|
|
||||||
gemm_unrolled_MMA_iteration<4, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
|
|
||||||
break;
|
|
||||||
#endif
|
|
||||||
#if MAX_MMA_UNROLL > 3
|
|
||||||
case 3:
|
|
||||||
gemm_unrolled_MMA_iteration<3, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
|
|
||||||
break;
|
|
||||||
#endif
|
|
||||||
#if MAX_MMA_UNROLL > 2
|
|
||||||
case 2:
|
|
||||||
gemm_unrolled_MMA_iteration<2, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
|
|
||||||
break;
|
|
||||||
#endif
|
|
||||||
#if MAX_MMA_UNROLL > 1
|
|
||||||
case 1:
|
|
||||||
gemm_unrolled_MMA_iteration<1, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
|
|
||||||
break;
|
|
||||||
#endif
|
|
||||||
default:
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
#undef MAX_MMA_UNROLL
|
|
||||||
|
|
||||||
if(remaining_rows > 0)
|
|
||||||
{
|
|
||||||
gemm_extra_row<Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, rows, cols, remaining_rows, pAlpha, pMask);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if(remaining_cols > 0)
|
gemm_extra_cols<Scalar, Packet, DataMapper, Index, accCols>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlpha, pMask);
|
||||||
{
|
|
||||||
const Scalar* rhs_base = blockB + col*strideB + remaining_cols*offsetB;
|
|
||||||
const Scalar* lhs_base = blockA;
|
|
||||||
|
|
||||||
for(; col < cols; col++)
|
|
||||||
{
|
|
||||||
Index row = 0;
|
|
||||||
|
|
||||||
gemm_unrolled_col<Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, rows, col, remaining_cols, pAlpha);
|
|
||||||
|
|
||||||
if (remaining_rows > 0)
|
|
||||||
{
|
|
||||||
gemm_extra_col<Scalar, Packet, DataMapper, Index, accRows>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_rows, remaining_cols, pAlpha);
|
|
||||||
}
|
|
||||||
rhs_base++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#define accColsC (accCols / 2)
|
#define accColsC (accCols / 2)
|
||||||
@@ -373,21 +378,20 @@ void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB,
|
|||||||
#define advanceCols ((RhsIsReal) ? 1 : 2)
|
#define advanceCols ((RhsIsReal) ? 1 : 2)
|
||||||
|
|
||||||
// PEEL_COMPLEX_MMA loop factor.
|
// PEEL_COMPLEX_MMA loop factor.
|
||||||
#define PEEL_COMPLEX_MMA 7
|
#define PEEL_COMPLEX_MMA 3
|
||||||
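PEEL_COMPLEX_MMA is the peeling factor for the depth loop: the peel macros below expand that many unrolled updates per trip, and the remaining depth iterations are handled one at a time. A generic sketch of that peeled-loop shape, with step() standing in for one MMA update:

inline void step(long k) { (void)k; /* one rank-1 MMA update at depth index k */ }

void peeled_depth_loop(long depth) {
  const long PEEL = 3;                        // mirrors the new PEEL_COMPLEX_MMA value
  long k = 0;
  for (; k + PEEL <= depth; k += PEEL) {      // peeled body: PEEL unrolled updates per trip
    step(k + 0); step(k + 1); step(k + 2);
  }
  for (; k < depth; ++k) step(k);             // remainder iterations, one at a time
}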
|
|
||||||
#define MICRO_COMPLEX_MMA_UNROLL(func) \
|
#define MICRO_COMPLEX_MMA_UNROLL(func) \
|
||||||
func(0) func(1) func(2) func(3) func(4)
|
func(0) func(1) func(2) func(3)
|
||||||
|
|
||||||
#define MICRO_COMPLEX_MMA_LOAD_ONE(iter) \
|
#define MICRO_COMPLEX_MMA_LOAD_ONE(iter) \
|
||||||
if (unroll_factor > iter) { \
|
if (unroll_factor > iter) { \
|
||||||
lhsV##iter = ploadLhs<Scalar, Packet>(lhs_ptr_real##iter); \
|
lhsV##iter = ploadLhs<Scalar, Packet>(lhs_ptr_real##iter); \
|
||||||
lhs_ptr_real##iter += accCols; \
|
|
||||||
if(!LhsIsReal) { \
|
if(!LhsIsReal) { \
|
||||||
lhsVi##iter = ploadLhs<Scalar, Packet>(lhs_ptr_imag##iter); \
|
lhsVi##iter = ploadLhs<Scalar, Packet>(lhs_ptr_real##iter + imag_delta); \
|
||||||
lhs_ptr_imag##iter += accCols; \
|
|
||||||
} else { \
|
} else { \
|
||||||
EIGEN_UNUSED_VARIABLE(lhsVi##iter); \
|
EIGEN_UNUSED_VARIABLE(lhsVi##iter); \
|
||||||
} \
|
} \
|
||||||
|
lhs_ptr_real##iter += accCols; \
|
||||||
} else { \
|
} else { \
|
||||||
EIGEN_UNUSED_VARIABLE(lhsV##iter); \
|
EIGEN_UNUSED_VARIABLE(lhsV##iter); \
|
||||||
EIGEN_UNUSED_VARIABLE(lhsVi##iter); \
|
EIGEN_UNUSED_VARIABLE(lhsVi##iter); \
|
||||||
@@ -400,8 +404,8 @@ void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB,
|
|||||||
|
|
||||||
#define MICRO_COMPLEX_MMA_TYPE_PEEL(func, func2, type, peel) \
|
#define MICRO_COMPLEX_MMA_TYPE_PEEL(func, func2, type, peel) \
|
||||||
if (PEEL_COMPLEX_MMA > peel) { \
|
if (PEEL_COMPLEX_MMA > peel) { \
|
||||||
Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4; \
|
Packet lhsV0, lhsV1, lhsV2, lhsV3; \
|
||||||
Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3, lhsVi4; \
|
Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3; \
|
||||||
ploadRhsMMA<Scalar, type>(rhs_ptr_real + (accRows * peel), rhsV##peel); \
|
ploadRhsMMA<Scalar, type>(rhs_ptr_real + (accRows * peel), rhsV##peel); \
|
||||||
if(!RhsIsReal) { \
|
if(!RhsIsReal) { \
|
||||||
ploadRhsMMA<Scalar, type>(rhs_ptr_imag + (accRows * peel), rhsVi##peel); \
|
ploadRhsMMA<Scalar, type>(rhs_ptr_imag + (accRows * peel), rhsVi##peel); \
|
||||||
@@ -409,20 +413,17 @@ void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB,
|
|||||||
EIGEN_UNUSED_VARIABLE(rhsVi##peel); \
|
EIGEN_UNUSED_VARIABLE(rhsVi##peel); \
|
||||||
} \
|
} \
|
||||||
MICRO_COMPLEX_MMA_UNROLL(func2); \
|
MICRO_COMPLEX_MMA_UNROLL(func2); \
|
||||||
func(0,type,peel) func(1,type,peel) func(2,type,peel) func(3,type,peel) func(4,type,peel) \
|
func(0,type,peel) func(1,type,peel) func(2,type,peel) func(3,type,peel) \
|
||||||
} else { \
|
} else { \
|
||||||
EIGEN_UNUSED_VARIABLE(rhsV##peel); \
|
EIGEN_UNUSED_VARIABLE(rhsV##peel); \
|
||||||
EIGEN_UNUSED_VARIABLE(rhsVi##peel); \
|
EIGEN_UNUSED_VARIABLE(rhsVi##peel); \
|
||||||
}
|
}
|
||||||
|
|
||||||
#define MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL(func, func2, type) \
|
#define MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL(func, func2, type) \
|
||||||
type rhsV0, rhsV1, rhsV2, rhsV3, rhsV4, rhsV5, rhsV6, rhsV7, rhsV8, rhsV9; \
|
type rhsV0, rhsV1, rhsV2, rhsV3; \
|
||||||
type rhsVi0, rhsVi1, rhsVi2, rhsVi3, rhsVi4, rhsVi5, rhsVi6, rhsVi7, rhsVi8, rhsVi9; \
|
type rhsVi0, rhsVi1, rhsVi2, rhsVi3; \
|
||||||
MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,0); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,1); \
|
MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,0); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,1); \
|
||||||
MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,2); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,3); \
|
MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,2); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,3);
|
||||||
MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,4); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,5); \
|
|
||||||
MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,6); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,7); \
|
|
||||||
MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,8); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,9);
|
|
||||||
|
|
||||||
#define MICRO_COMPLEX_MMA_UNROLL_TYPE_ONE(func, func2, type) \
|
#define MICRO_COMPLEX_MMA_UNROLL_TYPE_ONE(func, func2, type) \
|
||||||
type rhsV0, rhsVi0; \
|
type rhsV0, rhsVi0; \
|
||||||
@@ -459,15 +460,9 @@ void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB,
|
|||||||
|
|
||||||
#define MICRO_COMPLEX_MMA_SRC_PTR_ONE(iter) \
|
#define MICRO_COMPLEX_MMA_SRC_PTR_ONE(iter) \
|
||||||
if (unroll_factor > iter) { \
|
if (unroll_factor > iter) { \
|
||||||
lhs_ptr_real##iter = lhs_base + ( ((advanceRows*row)/accCols) + iter*advanceRows )*strideA*accCols + accCols*offsetA; \
|
lhs_ptr_real##iter = lhs_base + ( ((advanceRows*row)/accCols) + iter*advanceRows )*strideA*accCols; \
|
||||||
if(!LhsIsReal) { \
|
|
||||||
lhs_ptr_imag##iter = lhs_ptr_real##iter + accCols*strideA; \
|
|
||||||
} else { \
|
|
||||||
EIGEN_UNUSED_VARIABLE(lhs_ptr_imag##iter); \
|
|
||||||
} \
|
|
||||||
} else { \
|
} else { \
|
||||||
EIGEN_UNUSED_VARIABLE(lhs_ptr_real##iter); \
|
EIGEN_UNUSED_VARIABLE(lhs_ptr_real##iter); \
|
||||||
EIGEN_UNUSED_VARIABLE(lhs_ptr_imag##iter); \
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#define MICRO_COMPLEX_MMA_SRC_PTR MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_SRC_PTR_ONE)
|
#define MICRO_COMPLEX_MMA_SRC_PTR MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_SRC_PTR_ONE)
|
||||||
@@ -475,45 +470,40 @@ void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB,
|
|||||||
#define MICRO_COMPLEX_MMA_PREFETCH_ONE(iter) \
|
#define MICRO_COMPLEX_MMA_PREFETCH_ONE(iter) \
|
||||||
if (unroll_factor > iter) { \
|
if (unroll_factor > iter) { \
|
||||||
EIGEN_POWER_PREFETCH(lhs_ptr_real##iter); \
|
EIGEN_POWER_PREFETCH(lhs_ptr_real##iter); \
|
||||||
if(!LhsIsReal) { \
|
|
||||||
EIGEN_POWER_PREFETCH(lhs_ptr_imag##iter); \
|
|
||||||
} \
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#define MICRO_COMPLEX_MMA_PREFETCH MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_PREFETCH_ONE)
|
#define MICRO_COMPLEX_MMA_PREFETCH MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_PREFETCH_ONE)
|
||||||
|
|
||||||
#define MICRO_COMPLEX_MMA_STORE_ONE(iter) \
|
#define MICRO_COMPLEX_MMA_STORE_ONE(iter) \
|
||||||
if (unroll_factor > iter) { \
|
if (unroll_factor > iter) { \
|
||||||
storeComplexAccumulator<DataMapper, Index, Packet, Packetc, accColsC, 0>(row + iter*accCols, col, res, pAlphaReal, pAlphaImag, &accReal##iter, &accImag##iter); \
|
storeComplexAccumulator<DataMapper, Index, Packet, Packetc, accColsC>(row + iter*accCols, res, pAlphaReal, pAlphaImag, &accReal##iter, &accImag##iter); \
|
||||||
}
|
}
|
||||||
|
|
||||||
#define MICRO_COMPLEX_MMA_STORE MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_STORE_ONE)
|
#define MICRO_COMPLEX_MMA_STORE MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_STORE_ONE)
|
||||||
|
|
||||||
template<int unroll_factor, typename Scalar, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, typename Index, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
|
template<int unroll_factor, typename Scalar, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, typename Index, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
|
||||||
EIGEN_STRONG_INLINE void gemm_complex_unrolled_MMA_iteration(
|
EIGEN_ALWAYS_INLINE void gemm_complex_unrolled_MMA_iteration(
|
||||||
const DataMapper& res,
|
const DataMapper& res,
|
||||||
const Scalar* lhs_base,
|
const Scalar* lhs_base,
|
||||||
const Scalar* rhs_base,
|
const Scalar* rhs_base,
|
||||||
Index depth,
|
Index depth,
|
||||||
Index strideA,
|
Index strideA,
|
||||||
Index offsetA,
|
|
||||||
Index strideB,
|
Index strideB,
|
||||||
Index& row,
|
Index& row,
|
||||||
Index col,
|
|
||||||
const Packet& pAlphaReal,
|
const Packet& pAlphaReal,
|
||||||
const Packet& pAlphaImag)
|
const Packet& pAlphaImag)
|
||||||
{
|
{
|
||||||
const Scalar* rhs_ptr_real = rhs_base;
|
const Scalar* rhs_ptr_real = rhs_base;
|
||||||
const Scalar* rhs_ptr_imag;
|
const Scalar* rhs_ptr_imag = NULL;
|
||||||
|
const Index imag_delta = accCols*strideA;
|
||||||
if(!RhsIsReal) {
|
if(!RhsIsReal) {
|
||||||
rhs_ptr_imag = rhs_base + accRows*strideB;
|
rhs_ptr_imag = rhs_base + accRows*strideB;
|
||||||
} else {
|
} else {
|
||||||
EIGEN_UNUSED_VARIABLE(rhs_ptr_imag);
|
EIGEN_UNUSED_VARIABLE(rhs_ptr_imag);
|
||||||
}
|
}
|
||||||
const Scalar* lhs_ptr_real0 = NULL, * lhs_ptr_imag0 = NULL, * lhs_ptr_real1 = NULL, * lhs_ptr_imag1 = NULL;
|
const Scalar* lhs_ptr_real0 = NULL, * lhs_ptr_real1 = NULL;
|
||||||
const Scalar* lhs_ptr_real2 = NULL, * lhs_ptr_imag2 = NULL, * lhs_ptr_real3 = NULL, * lhs_ptr_imag3 = NULL;
|
const Scalar* lhs_ptr_real2 = NULL, * lhs_ptr_real3 = NULL;
|
||||||
const Scalar* lhs_ptr_real4 = NULL, * lhs_ptr_imag4 = NULL;
|
__vector_quad accReal0, accImag0, accReal1, accImag1, accReal2, accImag2, accReal3, accImag3;
|
||||||
__vector_quad accReal0, accImag0, accReal1, accImag1, accReal2, accImag2, accReal3, accImag3, accReal4, accImag4;
|
|
||||||
|
|
||||||
MICRO_COMPLEX_MMA_SRC_PTR
|
MICRO_COMPLEX_MMA_SRC_PTR
|
||||||
MICRO_COMPLEX_MMA_DST_PTR
|
MICRO_COMPLEX_MMA_DST_PTR
|
||||||
@@ -537,11 +527,70 @@ EIGEN_STRONG_INLINE void gemm_complex_unrolled_MMA_iteration(
|
|||||||
row += unroll_factor*accCols;
|
row += unroll_factor*accCols;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<typename Scalar, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, typename Index, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
|
||||||
|
EIGEN_ALWAYS_INLINE void gemmMMA_complex_cols(
|
||||||
|
const DataMapper& res,
|
||||||
|
const Scalar* blockA,
|
||||||
|
const Scalar* blockB,
|
||||||
|
Index depth,
|
||||||
|
Index strideA,
|
||||||
|
Index offsetA,
|
||||||
|
Index strideB,
|
||||||
|
Index offsetB,
|
||||||
|
Index col,
|
||||||
|
Index rows,
|
||||||
|
Index cols,
|
||||||
|
Index remaining_rows,
|
||||||
|
const Packet& pAlphaReal,
|
||||||
|
const Packet& pAlphaImag,
|
||||||
|
const Packet& pMask)
|
||||||
|
{
|
||||||
|
const DataMapper res3 = res.getSubMapper(0, col);
|
||||||
|
|
||||||
|
const Scalar* rhs_base = blockB + advanceCols*col*strideB + accRows*offsetB;
|
||||||
|
const Scalar* lhs_base = blockA + accCols*offsetA;
|
||||||
|
Index row = 0;
|
||||||
|
|
||||||
|
#define MAX_COMPLEX_MMA_UNROLL 4
|
||||||
|
while(row + MAX_COMPLEX_MMA_UNROLL*accCols <= rows) {
|
||||||
|
gemm_complex_unrolled_MMA_iteration<MAX_COMPLEX_MMA_UNROLL, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag);
|
||||||
|
}
|
||||||
|
switch( (rows-row)/accCols ) {
|
||||||
|
#if MAX_COMPLEX_MMA_UNROLL > 4
|
||||||
|
case 4:
|
||||||
|
gemm_complex_unrolled_MMA_iteration<4, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag);
|
||||||
|
break;
|
||||||
|
#endif
|
||||||
|
#if MAX_COMPLEX_MMA_UNROLL > 3
|
||||||
|
case 3:
|
||||||
|
gemm_complex_unrolled_MMA_iteration<3, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag);
|
||||||
|
break;
|
||||||
|
#endif
|
||||||
|
#if MAX_COMPLEX_MMA_UNROLL > 2
|
||||||
|
case 2:
|
||||||
|
gemm_complex_unrolled_MMA_iteration<2, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag);
|
||||||
|
break;
|
||||||
|
#endif
|
||||||
|
#if MAX_COMPLEX_MMA_UNROLL > 1
|
||||||
|
case 1:
|
||||||
|
gemm_complex_unrolled_MMA_iteration<1, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag);
|
||||||
|
break;
|
||||||
|
#endif
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
#undef MAX_COMPLEX_MMA_UNROLL
|
||||||
|
|
||||||
|
if(remaining_rows > 0)
|
||||||
|
{
|
||||||
|
gemm_complex_extra_row<Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, blockA, rhs_base, depth, strideA, offsetA, strideB, row, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
template<typename LhsScalar, typename RhsScalar, typename Scalarc, typename Scalar, typename Index, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
|
template<typename LhsScalar, typename RhsScalar, typename Scalarc, typename Scalar, typename Index, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
|
||||||
void gemm_complexMMA(const DataMapper& res, const LhsScalar* blockAc, const RhsScalar* blockBc, Index rows, Index depth, Index cols, Scalarc alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
|
void gemm_complexMMA(const DataMapper& res, const LhsScalar* blockAc, const RhsScalar* blockBc, Index rows, Index depth, Index cols, Scalarc alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
|
||||||
{
|
{
|
||||||
const Index remaining_rows = rows % accCols;
|
const Index remaining_rows = rows % accCols;
|
||||||
const Index remaining_cols = cols % accRows;
|
|
||||||
|
|
||||||
if( strideA == -1 ) strideA = depth;
|
if( strideA == -1 ) strideA = depth;
|
||||||
if( strideB == -1 ) strideB = depth;
|
if( strideB == -1 ) strideB = depth;
|
||||||
@@ -556,74 +605,23 @@ void gemm_complexMMA(const DataMapper& res, const LhsScalar* blockAc, const RhsS
|
|||||||
Index col = 0;
|
Index col = 0;
|
||||||
for(; col + accRows <= cols; col += accRows)
|
for(; col + accRows <= cols; col += accRows)
|
||||||
{
|
{
|
||||||
const Scalar* rhs_base = blockB + advanceCols*col*strideB + accRows*offsetB;
|
gemmMMA_complex_cols<Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask);
|
||||||
const Scalar* lhs_base = blockA;
|
|
||||||
Index row = 0;
|
|
||||||
|
|
||||||
#define MAX_COMPLEX_MMA_UNROLL 4
|
|
||||||
while(row + MAX_COMPLEX_MMA_UNROLL*accCols <= rows) {
|
|
||||||
gemm_complex_unrolled_MMA_iteration<MAX_COMPLEX_MMA_UNROLL, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag);
|
|
||||||
}
|
|
||||||
switch( (rows-row)/accCols ) {
|
|
||||||
#if MAX_COMPLEX_MMA_UNROLL > 4
|
|
||||||
case 4:
|
|
||||||
gemm_complex_unrolled_MMA_iteration<4, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag);
|
|
||||||
break;
|
|
||||||
#endif
|
|
||||||
#if MAX_COMPLEX_MMA_UNROLL > 3
|
|
||||||
case 3:
|
|
||||||
gemm_complex_unrolled_MMA_iteration<3, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag);
|
|
||||||
break;
|
|
||||||
#endif
|
|
||||||
#if MAX_COMPLEX_MMA_UNROLL > 2
|
|
||||||
case 2:
|
|
||||||
gemm_complex_unrolled_MMA_iteration<2, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag);
|
|
||||||
break;
|
|
||||||
#endif
|
|
||||||
#if MAX_COMPLEX_MMA_UNROLL > 1
|
|
||||||
case 1:
|
|
||||||
gemm_complex_unrolled_MMA_iteration<1, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag);
|
|
||||||
break;
|
|
||||||
#endif
|
|
||||||
default:
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
#undef MAX_COMPLEX_MMA_UNROLL
|
|
||||||
|
|
||||||
if(remaining_rows > 0)
|
|
||||||
{
|
|
||||||
gemm_complex_extra_row<Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if(remaining_cols > 0)
|
gemm_complex_extra_cols<Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask);
|
||||||
{
|
|
||||||
const Scalar* rhs_base = blockB + advanceCols*col*strideB + remaining_cols*offsetB;
|
|
||||||
const Scalar* lhs_base = blockA;
|
|
||||||
|
|
||||||
for(; col < cols; col++)
|
|
||||||
{
|
|
||||||
Index row = 0;
|
|
||||||
|
|
||||||
gemm_complex_unrolled_col<Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, rows, col, remaining_cols, pAlphaReal, pAlphaImag);
|
|
||||||
|
|
||||||
if (remaining_rows > 0)
|
|
||||||
{
|
|
||||||
gemm_complex_extra_col<Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_rows, remaining_cols, pAlphaReal, pAlphaImag);
|
|
||||||
}
|
|
||||||
rhs_base++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#undef accColsC
|
#undef accColsC
|
||||||
#undef advanceRows
|
#undef advanceRows
|
||||||
#undef advanceCols
|
#undef advanceCols
|
||||||
|
|
||||||
#pragma GCC reset_options
|
|
||||||
} // end namespace internal
|
} // end namespace internal
|
||||||
|
|
||||||
} // end namespace Eigen
|
} // end namespace Eigen
|
||||||
|
|
||||||
|
#if defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH)
|
||||||
|
#pragma GCC pop_options
|
||||||
|
#endif
|
||||||
|
|
||||||
#endif // EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
|
#endif // EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
|
||||||
|
|
||||||
|
|||||||
File diff suppressed because it is too large
@@ -84,7 +84,7 @@ static _EIGEN_DECLARE_CONST_FAST_Packet4ui(PREV0DOT5, 0x3EFFFFFFu);
|
|||||||
static _EIGEN_DECLARE_CONST_FAST_Packet8us(ONE,1); //{ 1, 1, 1, 1, 1, 1, 1, 1}
|
static _EIGEN_DECLARE_CONST_FAST_Packet8us(ONE,1); //{ 1, 1, 1, 1, 1, 1, 1, 1}
|
||||||
static _EIGEN_DECLARE_CONST_FAST_Packet16uc(ONE,1);
|
static _EIGEN_DECLARE_CONST_FAST_Packet16uc(ONE,1);
|
||||||
static Packet4f p4f_MZERO = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1); //{ 0x80000000, 0x80000000, 0x80000000, 0x80000000}
|
static Packet4f p4f_MZERO = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1); //{ 0x80000000, 0x80000000, 0x80000000, 0x80000000}
|
||||||
#ifndef __VSX__
|
#ifndef EIGEN_VECTORIZE_VSX
|
||||||
static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); //{ 1.0, 1.0, 1.0, 1.0}
|
static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); //{ 1.0, 1.0, 1.0, 1.0}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@@ -114,7 +114,7 @@ static Packet16uc p16uc_QUADRUPLICATE16_HI = { 0,1,0,1,0,1,0,1, 2,3,2,3,2,3,2,3
|
|||||||
// Define global static constants:
|
// Define global static constants:
|
||||||
#ifdef _BIG_ENDIAN
|
#ifdef _BIG_ENDIAN
|
||||||
static Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0);
|
static Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0);
|
||||||
#ifdef __VSX__
|
#ifdef EIGEN_VECTORIZE_VSX
|
||||||
static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
|
static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
|
||||||
#endif
|
#endif
|
||||||
static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
|
static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
|
||||||
@@ -168,13 +168,16 @@ struct packet_traits<float> : default_packet_traits {
|
|||||||
HasCos = EIGEN_FAST_MATH,
|
HasCos = EIGEN_FAST_MATH,
|
||||||
HasLog = 1,
|
HasLog = 1,
|
||||||
HasExp = 1,
|
HasExp = 1,
|
||||||
#ifdef __VSX__
|
#ifdef EIGEN_VECTORIZE_VSX
|
||||||
HasSqrt = 1,
|
HasSqrt = 1,
|
||||||
#if !EIGEN_COMP_CLANG
|
#if !EIGEN_COMP_CLANG
|
||||||
HasRsqrt = 1,
|
HasRsqrt = 1,
|
||||||
#else
|
#else
|
||||||
HasRsqrt = 0,
|
HasRsqrt = 0,
|
||||||
#endif
|
#endif
|
||||||
|
HasTanh = EIGEN_FAST_MATH,
|
||||||
|
HasErf = EIGEN_FAST_MATH,
|
||||||
|
HasRint = 1,
|
||||||
#else
|
#else
|
||||||
HasSqrt = 0,
|
HasSqrt = 0,
|
||||||
HasRsqrt = 0,
|
HasRsqrt = 0,
|
||||||
@@ -184,7 +187,6 @@ struct packet_traits<float> : default_packet_traits {
|
|||||||
HasRound = 1,
|
HasRound = 1,
|
||||||
HasFloor = 1,
|
HasFloor = 1,
|
||||||
HasCeil = 1,
|
HasCeil = 1,
|
||||||
HasRint = 1,
|
|
||||||
HasNegate = 1,
|
HasNegate = 1,
|
||||||
HasBlend = 1
|
HasBlend = 1
|
||||||
};
|
};
|
||||||
@@ -210,23 +212,24 @@ struct packet_traits<bfloat16> : default_packet_traits {
|
|||||||
HasCos = EIGEN_FAST_MATH,
|
HasCos = EIGEN_FAST_MATH,
|
||||||
HasLog = 1,
|
HasLog = 1,
|
||||||
HasExp = 1,
|
HasExp = 1,
|
||||||
#ifdef __VSX__
|
#ifdef EIGEN_VECTORIZE_VSX
|
||||||
HasSqrt = 1,
|
HasSqrt = 1,
|
||||||
#if !EIGEN_COMP_CLANG
|
#if !EIGEN_COMP_CLANG
|
||||||
HasRsqrt = 1,
|
HasRsqrt = 1,
|
||||||
#else
|
#else
|
||||||
HasRsqrt = 0,
|
HasRsqrt = 0,
|
||||||
#endif
|
#endif
|
||||||
|
HasRint = 1,
|
||||||
#else
|
#else
|
||||||
HasSqrt = 0,
|
HasSqrt = 0,
|
||||||
HasRsqrt = 0,
|
HasRsqrt = 0,
|
||||||
HasTanh = EIGEN_FAST_MATH,
|
HasRint = 0,
|
||||||
HasErf = EIGEN_FAST_MATH,
|
|
||||||
#endif
|
#endif
|
||||||
|
HasTanh = 0,
|
||||||
|
HasErf = 0,
|
||||||
HasRound = 1,
|
HasRound = 1,
|
||||||
HasFloor = 1,
|
HasFloor = 1,
|
||||||
HasCeil = 1,
|
HasCeil = 1,
|
||||||
HasRint = 1,
|
|
||||||
HasNegate = 1,
|
HasNegate = 1,
|
||||||
HasBlend = 1
|
HasBlend = 1
|
||||||
};
|
};
|
||||||
@@ -432,7 +435,7 @@ EIGEN_STRONG_INLINE Packet pload_common(const __UNPACK_TYPE__(Packet)* from)
|
|||||||
// ignoring these warnings for now.
|
// ignoring these warnings for now.
|
||||||
EIGEN_UNUSED_VARIABLE(from);
|
EIGEN_UNUSED_VARIABLE(from);
|
||||||
EIGEN_DEBUG_ALIGNED_LOAD
|
EIGEN_DEBUG_ALIGNED_LOAD
|
||||||
#ifdef __VSX__
|
#ifdef EIGEN_VECTORIZE_VSX
|
||||||
return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
|
return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
|
||||||
#else
|
#else
|
||||||
return vec_ld(0, from);
|
return vec_ld(0, from);
|
||||||
@@ -481,7 +484,7 @@ EIGEN_STRONG_INLINE void pstore_common(__UNPACK_TYPE__(Packet)* to, const Packet
|
|||||||
// ignoring these warnings for now.
|
// ignoring these warnings for now.
|
||||||
EIGEN_UNUSED_VARIABLE(to);
|
EIGEN_UNUSED_VARIABLE(to);
|
||||||
EIGEN_DEBUG_ALIGNED_STORE
|
EIGEN_DEBUG_ALIGNED_STORE
|
||||||
#ifdef __VSX__
|
#ifdef EIGEN_VECTORIZE_VSX
|
||||||
vec_xst(from, 0, to);
|
vec_xst(from, 0, to);
|
||||||
#else
|
#else
|
||||||
vec_st(from, 0, to);
|
vec_st(from, 0, to);
|
||||||
@@ -786,8 +789,22 @@ template<> EIGEN_STRONG_INLINE Packet8us psub<Packet8us> (const Packet8us& a,
|
|||||||
template<> EIGEN_STRONG_INLINE Packet16c psub<Packet16c> (const Packet16c& a, const Packet16c& b) { return a - b; }
|
template<> EIGEN_STRONG_INLINE Packet16c psub<Packet16c> (const Packet16c& a, const Packet16c& b) { return a - b; }
|
||||||
template<> EIGEN_STRONG_INLINE Packet16uc psub<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return a - b; }
|
template<> EIGEN_STRONG_INLINE Packet16uc psub<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return a - b; }
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return p4f_ZERO - a; }
|
template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a)
|
||||||
template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return p4i_ZERO - a; }
|
{
|
||||||
|
#ifdef __POWER8_VECTOR__
|
||||||
|
return vec_neg(a);
|
||||||
|
#else
|
||||||
|
return vec_xor(a, p4f_MZERO);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a)
|
||||||
|
{
|
||||||
|
#ifdef __POWER8_VECTOR__
|
||||||
|
return vec_neg(a);
|
||||||
|
#else
|
||||||
|
return p4i_ZERO - a;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
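The two pnegate branches above pick between a single vec_neg on POWER8 and later, and the portable fallbacks: XOR-ing the float sign bit (vec_xor with the p4f_MZERO mask of 0x80000000 lanes) and subtracting from zero for integers. A scalar sketch of those two fallback tricks; the helper names are illustrative:

#include <cstdint>
#include <cstring>

inline float negate_by_sign_flip(float a) {
  std::uint32_t u;
  std::memcpy(&u, &a, sizeof(u));
  u ^= 0x80000000u;                  // flip the IEEE-754 sign bit, like vec_xor with p4f_MZERO
  std::memcpy(&a, &u, sizeof(a));
  return a;
}

inline int negate_by_subtraction(int a) { return 0 - a; }  // the p4i_ZERO - a path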
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
|
template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
|
||||||
template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
|
template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
|
||||||
@@ -802,7 +819,7 @@ template<> EIGEN_STRONG_INLINE Packet16uc pmul<Packet16uc>(const Packet16uc& a,
|
|||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
|
template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
|
||||||
{
|
{
|
||||||
#ifndef __VSX__ // VSX actually provides a div instruction
|
#ifndef EIGEN_VECTORIZE_VSX // VSX actually provides a div instruction
|
||||||
Packet4f t, y_0, y_1;
|
Packet4f t, y_0, y_1;
|
||||||
|
|
||||||
// Altivec does not offer a divide instruction, we have to do a reciprocal approximation
|
// Altivec does not offer a divide instruction, we have to do a reciprocal approximation
|
||||||
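The comment above refers to the classic reciprocal trick: without a hardware divide, 1/b is seeded from an estimate instruction and refined with a Newton-Raphson step y1 = y0*(2 - b*y0) before multiplying by a. A scalar sketch of that idea, using an ordinary division as a stand-in for the hardware estimate:

inline float estimate_recip(float b) { return 1.0f / b; }   // placeholder for the vec_re-style estimate

inline float div_via_reciprocal(float a, float b) {
  float y0 = estimate_recip(b);
  float y1 = y0 * (2.0f - b * y0);   // one Newton-Raphson step roughly doubles the correct bits
  return a * y1;
}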
@@ -831,7 +848,7 @@ template<> EIGEN_STRONG_INLINE Packet8us pmadd(const Packet8us& a, const Packet8
|
|||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b)
|
template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b)
|
||||||
{
|
{
|
||||||
#ifdef __VSX__
|
#ifdef EIGEN_VECTORIZE_VSX
|
||||||
// NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN
|
// NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN
|
||||||
Packet4f ret;
|
Packet4f ret;
|
||||||
__asm__ ("xvcmpgesp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
|
__asm__ ("xvcmpgesp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
|
||||||
@@ -849,7 +866,7 @@ template<> EIGEN_STRONG_INLINE Packet16uc pmin<Packet16uc>(const Packet16uc& a,
|
|||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b)
|
template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b)
|
||||||
{
|
{
|
||||||
#ifdef __VSX__
|
#ifdef EIGEN_VECTORIZE_VSX
|
||||||
// NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN
|
// NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN
|
||||||
Packet4f ret;
|
Packet4f ret;
|
||||||
__asm__ ("xvcmpgtsp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
|
__asm__ ("xvcmpgtsp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
|
||||||
@@ -865,26 +882,39 @@ template<> EIGEN_STRONG_INLINE Packet16c pmax<Packet16c>(const Packet16c& a, con
|
|||||||
template<> EIGEN_STRONG_INLINE Packet16uc pmax<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vec_max(a, b); }
|
template<> EIGEN_STRONG_INLINE Packet16uc pmax<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vec_max(a, b); }
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmple(a,b)); }
|
template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmple(a,b)); }
|
||||||
|
// To fix bug with vec_cmplt on older versions
|
||||||
|
#if defined(__POWER8_VECTOR__) || EIGEN_COMP_LLVM
|
||||||
template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmplt(a,b)); }
|
template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmplt(a,b)); }
|
||||||
|
#endif
|
||||||
template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmpeq(a,b)); }
|
template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmpeq(a,b)); }
|
||||||
template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) {
|
template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) {
|
||||||
Packet4f c = reinterpret_cast<Packet4f>(vec_cmpge(a,b));
|
Packet4f c = reinterpret_cast<Packet4f>(vec_cmpge(a,b));
|
||||||
return vec_nor(c,c);
|
return vec_nor(c,c);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef __VSX__
|
||||||
template<> EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) { return reinterpret_cast<Packet4i>(vec_cmple(a,b)); }
|
template<> EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) { return reinterpret_cast<Packet4i>(vec_cmple(a,b)); }
|
||||||
|
#endif
|
||||||
template<> EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) { return reinterpret_cast<Packet4i>(vec_cmplt(a,b)); }
|
template<> EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) { return reinterpret_cast<Packet4i>(vec_cmplt(a,b)); }
|
||||||
template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return reinterpret_cast<Packet4i>(vec_cmpeq(a,b)); }
|
template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return reinterpret_cast<Packet4i>(vec_cmpeq(a,b)); }
|
||||||
|
#ifdef __VSX__
|
||||||
template<> EIGEN_STRONG_INLINE Packet8s pcmp_le(const Packet8s& a, const Packet8s& b) { return reinterpret_cast<Packet8s>(vec_cmple(a,b)); }
|
template<> EIGEN_STRONG_INLINE Packet8s pcmp_le(const Packet8s& a, const Packet8s& b) { return reinterpret_cast<Packet8s>(vec_cmple(a,b)); }
|
||||||
|
#endif
|
||||||
template<> EIGEN_STRONG_INLINE Packet8s pcmp_lt(const Packet8s& a, const Packet8s& b) { return reinterpret_cast<Packet8s>(vec_cmplt(a,b)); }
|
template<> EIGEN_STRONG_INLINE Packet8s pcmp_lt(const Packet8s& a, const Packet8s& b) { return reinterpret_cast<Packet8s>(vec_cmplt(a,b)); }
|
||||||
template<> EIGEN_STRONG_INLINE Packet8s pcmp_eq(const Packet8s& a, const Packet8s& b) { return reinterpret_cast<Packet8s>(vec_cmpeq(a,b)); }
|
template<> EIGEN_STRONG_INLINE Packet8s pcmp_eq(const Packet8s& a, const Packet8s& b) { return reinterpret_cast<Packet8s>(vec_cmpeq(a,b)); }
|
||||||
|
#ifdef __VSX__
|
||||||
template<> EIGEN_STRONG_INLINE Packet8us pcmp_le(const Packet8us& a, const Packet8us& b) { return reinterpret_cast<Packet8us>(vec_cmple(a,b)); }
|
template<> EIGEN_STRONG_INLINE Packet8us pcmp_le(const Packet8us& a, const Packet8us& b) { return reinterpret_cast<Packet8us>(vec_cmple(a,b)); }
|
||||||
|
#endif
|
||||||
template<> EIGEN_STRONG_INLINE Packet8us pcmp_lt(const Packet8us& a, const Packet8us& b) { return reinterpret_cast<Packet8us>(vec_cmplt(a,b)); }
|
template<> EIGEN_STRONG_INLINE Packet8us pcmp_lt(const Packet8us& a, const Packet8us& b) { return reinterpret_cast<Packet8us>(vec_cmplt(a,b)); }
|
||||||
template<> EIGEN_STRONG_INLINE Packet8us pcmp_eq(const Packet8us& a, const Packet8us& b) { return reinterpret_cast<Packet8us>(vec_cmpeq(a,b)); }
|
template<> EIGEN_STRONG_INLINE Packet8us pcmp_eq(const Packet8us& a, const Packet8us& b) { return reinterpret_cast<Packet8us>(vec_cmpeq(a,b)); }
|
||||||
|
#ifdef __VSX__
|
||||||
template<> EIGEN_STRONG_INLINE Packet16c pcmp_le(const Packet16c& a, const Packet16c& b) { return reinterpret_cast<Packet16c>(vec_cmple(a,b)); }
|
template<> EIGEN_STRONG_INLINE Packet16c pcmp_le(const Packet16c& a, const Packet16c& b) { return reinterpret_cast<Packet16c>(vec_cmple(a,b)); }
|
||||||
|
#endif
|
||||||
template<> EIGEN_STRONG_INLINE Packet16c pcmp_lt(const Packet16c& a, const Packet16c& b) { return reinterpret_cast<Packet16c>(vec_cmplt(a,b)); }
|
template<> EIGEN_STRONG_INLINE Packet16c pcmp_lt(const Packet16c& a, const Packet16c& b) { return reinterpret_cast<Packet16c>(vec_cmplt(a,b)); }
|
||||||
template<> EIGEN_STRONG_INLINE Packet16c pcmp_eq(const Packet16c& a, const Packet16c& b) { return reinterpret_cast<Packet16c>(vec_cmpeq(a,b)); }
|
template<> EIGEN_STRONG_INLINE Packet16c pcmp_eq(const Packet16c& a, const Packet16c& b) { return reinterpret_cast<Packet16c>(vec_cmpeq(a,b)); }
|
||||||
|
#ifdef __VSX__
|
||||||
template<> EIGEN_STRONG_INLINE Packet16uc pcmp_le(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast<Packet16uc>(vec_cmple(a,b)); }
|
template<> EIGEN_STRONG_INLINE Packet16uc pcmp_le(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast<Packet16uc>(vec_cmple(a,b)); }
|
||||||
|
#endif
|
||||||
template<> EIGEN_STRONG_INLINE Packet16uc pcmp_lt(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast<Packet16uc>(vec_cmplt(a,b)); }
|
template<> EIGEN_STRONG_INLINE Packet16uc pcmp_lt(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast<Packet16uc>(vec_cmplt(a,b)); }
|
||||||
template<> EIGEN_STRONG_INLINE Packet16uc pcmp_eq(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast<Packet16uc>(vec_cmpeq(a,b)); }
|
template<> EIGEN_STRONG_INLINE Packet16uc pcmp_eq(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast<Packet16uc>(vec_cmpeq(a,b)); }
|
||||||
|
|
||||||
@@ -923,7 +953,7 @@ template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a)
|
|||||||
Packet4f t = vec_add(reinterpret_cast<Packet4f>(vec_or(vec_and(reinterpret_cast<Packet4ui>(a), p4ui_SIGN), p4ui_PREV0DOT5)), a);
|
Packet4f t = vec_add(reinterpret_cast<Packet4f>(vec_or(vec_and(reinterpret_cast<Packet4ui>(a), p4ui_SIGN), p4ui_PREV0DOT5)), a);
|
||||||
Packet4f res;
|
Packet4f res;
|
||||||
|
|
||||||
#ifdef __VSX__
|
#ifdef EIGEN_VECTORIZE_VSX
|
||||||
__asm__("xvrspiz %x0, %x1\n\t"
|
__asm__("xvrspiz %x0, %x1\n\t"
|
||||||
: "=&wa" (res)
|
: "=&wa" (res)
|
||||||
: "wa" (t));
|
: "wa" (t));
|
||||||
@@ -937,6 +967,7 @@ template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a)
|
|||||||
}
|
}
|
||||||
template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) { return vec_ceil(a); }
|
template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) { return vec_ceil(a); }
|
||||||
template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return vec_floor(a); }
|
template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return vec_floor(a); }
|
||||||
|
#ifdef __VSX__
|
||||||
template<> EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a)
|
template<> EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a)
|
||||||
{
|
{
|
||||||
Packet4f res;
|
Packet4f res;
|
||||||
@@ -947,21 +978,19 @@ template<> EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a)
|
|||||||
|
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
template<typename Packet> EIGEN_STRONG_INLINE Packet ploadu_common(const __UNPACK_TYPE__(Packet)* from)
|
template<typename Packet> EIGEN_STRONG_INLINE Packet ploadu_common(const __UNPACK_TYPE__(Packet)* from)
|
||||||
{
|
{
|
||||||
EIGEN_DEBUG_ALIGNED_LOAD
|
EIGEN_DEBUG_UNALIGNED_LOAD
|
||||||
#ifdef _BIG_ENDIAN
|
#ifdef EIGEN_VECTORIZE_VSX
|
||||||
Packet16uc MSQ, LSQ;
|
return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
|
||||||
Packet16uc mask;
|
#else
|
||||||
MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword
|
Packet16uc mask = vec_lvsl(0, from); // create the permute mask
|
||||||
LSQ = vec_ld(15, (unsigned char *)from); // least significant quadword
|
Packet16uc MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword
|
||||||
mask = vec_lvsl(0, from); // create the permute mask
|
Packet16uc LSQ = vec_ld(15, (unsigned char *)from); // least significant quadword
|
||||||
//TODO: Add static_cast here
|
//TODO: Add static_cast here
|
||||||
return (Packet) vec_perm(MSQ, LSQ, mask); // align the data
|
return (Packet) vec_perm(MSQ, LSQ, mask); // align the data
|
||||||
#else
|
|
||||||
EIGEN_DEBUG_UNALIGNED_LOAD
|
|
||||||
return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
|
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
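For readers unfamiliar with the classic AltiVec idiom kept in the big-endian branch above: two aligned quadword loads straddle the address, vec_lvsl builds a permute mask from the misalignment, and vec_perm splices out the 16 wanted bytes. A portable scalar model of the same idea (buffer layout and helper name are assumptions for illustration only):

#include <cstring>
#include <cstdint>
#include <cstdio>

// Scalar model of the permute-based unaligned load: read the two aligned
// 16-byte quadwords covering the address, then copy out the 16 bytes that
// start at the (possibly unaligned) pointer.
static void load_unaligned16(const unsigned char* from, unsigned char out[16]) {
  std::uintptr_t addr = reinterpret_cast<std::uintptr_t>(from);
  const unsigned char* msq =
      reinterpret_cast<const unsigned char*>(addr & ~std::uintptr_t(15));
  unsigned offset = static_cast<unsigned>(addr & 15);   // what vec_lvsl encodes
  unsigned char tmp[32];
  std::memcpy(tmp, msq, 32);            // the two aligned quadwords (MSQ, LSQ)
  std::memcpy(out, tmp + offset, 16);   // the vec_perm step
}

int main() {
  alignas(16) unsigned char buf[48];
  for (int i = 0; i < 48; ++i) buf[i] = static_cast<unsigned char>(i);
  unsigned char out[16];
  load_unaligned16(buf + 5, out);       // misaligned by 5 bytes
  std::printf("out[0..3] = %d %d %d %d\n", out[0], out[1], out[2], out[3]);  // 5 6 7 8
  return 0;
}

On VSX the whole dance is replaced by a single vec_xl, which is what the new branch does.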
@@ -1066,7 +1095,9 @@ template<> EIGEN_STRONG_INLINE Packet16uc ploaddup<Packet16uc>(const unsigned ch
|
|||||||
template<typename Packet> EIGEN_STRONG_INLINE void pstoreu_common(__UNPACK_TYPE__(Packet)* to, const Packet& from)
|
template<typename Packet> EIGEN_STRONG_INLINE void pstoreu_common(__UNPACK_TYPE__(Packet)* to, const Packet& from)
|
||||||
{
|
{
|
||||||
EIGEN_DEBUG_UNALIGNED_STORE
|
EIGEN_DEBUG_UNALIGNED_STORE
|
||||||
#ifdef _BIG_ENDIAN
|
#ifdef EIGEN_VECTORIZE_VSX
|
||||||
|
vec_xst(from, 0, to);
|
||||||
|
#else
|
||||||
// Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
|
// Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
|
||||||
// Warning: not thread safe!
|
// Warning: not thread safe!
|
||||||
Packet16uc MSQ, LSQ, edges;
|
Packet16uc MSQ, LSQ, edges;
|
||||||
@@ -1081,8 +1112,6 @@ template<typename Packet> EIGEN_STRONG_INLINE void pstoreu_common(__UNPACK_TYPE_
|
|||||||
LSQ = vec_perm((Packet16uc)from,edges,align); // misalign the data (LSQ)
|
LSQ = vec_perm((Packet16uc)from,edges,align); // misalign the data (LSQ)
|
||||||
vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ part first
|
vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ part first
|
||||||
vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part second
|
vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part second
|
||||||
#else
|
|
||||||
vec_xst(from, 0, to);
|
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from)
|
template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from)
|
||||||
@@ -1341,16 +1370,6 @@ template<> EIGEN_STRONG_INLINE Packet8bf psub<Packet8bf>(const Packet8bf& a, con
|
|||||||
BF16_TO_F32_BINARY_OP_WRAPPER(psub<Packet4f>, a, b);
|
BF16_TO_F32_BINARY_OP_WRAPPER(psub<Packet4f>, a, b);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet8bf psqrt<Packet8bf> (const Packet8bf& a){
|
|
||||||
BF16_TO_F32_UNARY_OP_WRAPPER(vec_sqrt, a);
|
|
||||||
}
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet8bf prsqrt<Packet8bf> (const Packet8bf& a){
|
|
||||||
BF16_TO_F32_UNARY_OP_WRAPPER(prsqrt<Packet4f>, a);
|
|
||||||
}
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet8bf pexp<Packet8bf> (const Packet8bf& a){
|
|
||||||
BF16_TO_F32_UNARY_OP_WRAPPER(pexp_float, a);
|
|
||||||
}
|
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
|
template<> EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
|
||||||
return pldexp_generic(a,exponent);
|
return pldexp_generic(a,exponent);
|
||||||
}
|
}
|
||||||
@@ -1390,9 +1409,11 @@ template<> EIGEN_STRONG_INLINE Packet8bf pceil<Packet8bf> (const Packet8bf& a){
|
|||||||
template<> EIGEN_STRONG_INLINE Packet8bf pround<Packet8bf> (const Packet8bf& a){
|
template<> EIGEN_STRONG_INLINE Packet8bf pround<Packet8bf> (const Packet8bf& a){
|
||||||
BF16_TO_F32_UNARY_OP_WRAPPER(pround<Packet4f>, a);
|
BF16_TO_F32_UNARY_OP_WRAPPER(pround<Packet4f>, a);
|
||||||
}
|
}
|
||||||
|
#ifdef __VSX__
|
||||||
template<> EIGEN_STRONG_INLINE Packet8bf print<Packet8bf> (const Packet8bf& a){
|
template<> EIGEN_STRONG_INLINE Packet8bf print<Packet8bf> (const Packet8bf& a){
|
||||||
BF16_TO_F32_UNARY_OP_WRAPPER(print<Packet4f>, a);
|
BF16_TO_F32_UNARY_OP_WRAPPER(print<Packet4f>, a);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
template<> EIGEN_STRONG_INLINE Packet8bf pmadd(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
|
template<> EIGEN_STRONG_INLINE Packet8bf pmadd(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
|
||||||
Packet4f a_even = Bf16ToF32Even(a);
|
Packet4f a_even = Bf16ToF32Even(a);
|
||||||
Packet4f a_odd = Bf16ToF32Odd(a);
|
Packet4f a_odd = Bf16ToF32Odd(a);
|
||||||
@@ -2252,7 +2273,7 @@ template<> EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f,Packet4i>(const Pa
|
|||||||
|
|
||||||
|
|
||||||
//---------- double ----------
|
//---------- double ----------
|
||||||
#ifdef __VSX__
|
#ifdef EIGEN_VECTORIZE_VSX
|
||||||
typedef __vector double Packet2d;
|
typedef __vector double Packet2d;
|
||||||
typedef __vector unsigned long long Packet2ul;
|
typedef __vector unsigned long long Packet2ul;
|
||||||
typedef __vector long long Packet2l;
|
typedef __vector long long Packet2l;
|
||||||
@@ -2304,7 +2325,11 @@ template<> struct packet_traits<double> : default_packet_traits
|
|||||||
HasLog = 0,
|
HasLog = 0,
|
||||||
HasExp = 1,
|
HasExp = 1,
|
||||||
HasSqrt = 1,
|
HasSqrt = 1,
|
||||||
|
#if !EIGEN_COMP_CLANG
|
||||||
HasRsqrt = 1,
|
HasRsqrt = 1,
|
||||||
|
#else
|
||||||
|
HasRsqrt = 0,
|
||||||
|
#endif
|
||||||
HasRound = 1,
|
HasRound = 1,
|
||||||
HasFloor = 1,
|
HasFloor = 1,
|
||||||
HasCeil = 1,
|
HasCeil = 1,
|
||||||
@@ -2393,7 +2418,14 @@ template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const
|
|||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return a - b; }
|
template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return a - b; }
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return p2d_ZERO - a; }
|
template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a)
|
||||||
|
{
|
||||||
|
#ifdef __POWER8_VECTOR__
|
||||||
|
return vec_neg(a);
|
||||||
|
#else
|
||||||
|
return vec_xor(a, p2d_MZERO);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
|
template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
|
||||||
|
|
||||||
@@ -2703,7 +2735,7 @@ template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, cons
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
#endif // __VSX__
|
#endif // EIGEN_VECTORIZE_VSX
|
||||||
} // end namespace internal
|
} // end namespace internal
|
||||||
|
|
||||||
} // end namespace Eigen
|
} // end namespace Eigen
|
||||||
|
|||||||
@@ -11,13 +11,24 @@
|
|||||||
#ifndef EIGEN_COMPLEX_CUDA_H
|
#ifndef EIGEN_COMPLEX_CUDA_H
|
||||||
#define EIGEN_COMPLEX_CUDA_H
|
#define EIGEN_COMPLEX_CUDA_H
|
||||||
|
|
||||||
// clang-format off
|
|
||||||
// Many std::complex methods such as operator+, operator-, operator* and
|
// Many std::complex methods such as operator+, operator-, operator* and
|
||||||
// operator/ are not constexpr. Due to this, GCC and older versions of clang do
|
// operator/ are not constexpr. Due to this, GCC and older versions of clang do
|
||||||
// not treat them as device functions and thus Eigen functors making use of
|
// not treat them as device functions and thus Eigen functors making use of
|
||||||
// these operators fail to compile. Here, we manually specialize these
|
// these operators fail to compile. Here, we manually specialize these
|
||||||
// operators and functors for complex types when building for CUDA to enable
|
// operators and functors for complex types when building for CUDA to enable
|
||||||
// their use on-device.
|
// their use on-device.
|
||||||
|
//
|
||||||
|
// NOTES:
|
||||||
|
// - Compound assignment operators +=,-=,*=,/=(Scalar) will not work on device,
|
||||||
|
// since they are already specialized in the standard. Using them will result
|
||||||
|
// in silent kernel failures.
|
||||||
|
// - Compiling with MSVC and using +=,-=,*=,/=(std::complex<Scalar>) will lead
|
||||||
|
// to duplicate definition errors, since these are already specialized in
|
||||||
|
// Visual Studio's <complex> header (contrary to the standard). This is
|
||||||
|
// preferable to removing such definitions, which will lead to silent kernel
|
||||||
|
// failures.
|
||||||
|
// - Compiling with ICC requires defining _USE_COMPLEX_SPECIALIZATION_ prior
|
||||||
|
// to the first inclusion of <complex>.
|
||||||
|
|
||||||
#if defined(EIGEN_CUDACC) && defined(EIGEN_GPU_COMPILE_PHASE)
|
#if defined(EIGEN_CUDACC) && defined(EIGEN_GPU_COMPILE_PHASE)
|
||||||
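The note above is the rationale for this header: the std::complex arithmetic operators are not constexpr, so for device code Eigen supplies its own overloads. A hypothetical sketch of the shape such a specialization takes, writing the product out on real/imag parts so no non-constexpr std::complex operator is involved; in the real header it would additionally carry EIGEN_DEVICE_FUNC and live under the EIGEN_CUDACC guard shown above:

#include <complex>
#include <cstdio>

// Illustrative device-friendly complex product (name is mine, not Eigen's).
inline std::complex<float> device_friendly_mul(const std::complex<float>& a,
                                               const std::complex<float>& b) {
  return std::complex<float>(a.real() * b.real() - a.imag() * b.imag(),
                             a.real() * b.imag() + a.imag() * b.real());
}

int main() {
  std::complex<float> p = device_friendly_mul({1.f, 2.f}, {3.f, 4.f});
  std::printf("%g%+gi\n", p.real(), p.imag());  // -5+10i
  return 0;
}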
|
|
||||||
|
|||||||
@@ -251,12 +251,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw truncate_to_bfloat16(const
|
|||||||
output.value = std::signbit(v) ? 0xFFC0: 0x7FC0;
|
output.value = std::signbit(v) ? 0xFFC0: 0x7FC0;
|
||||||
return output;
|
return output;
|
||||||
}
|
}
|
||||||
const uint16_t* p = reinterpret_cast<const uint16_t*>(&v);
|
output.value = static_cast<numext::uint16_t>(numext::bit_cast<numext::uint32_t>(v) >> 16);
|
||||||
#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
|
|
||||||
output.value = p[0];
|
|
||||||
#else
|
|
||||||
output.value = p[1];
|
|
||||||
#endif
|
|
||||||
return output;
|
return output;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -462,14 +457,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne<true
|
|||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float bfloat16_to_float(__bfloat16_raw h) {
|
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float bfloat16_to_float(__bfloat16_raw h) {
|
||||||
float result = 0;
|
return numext::bit_cast<float>(static_cast<numext::uint32_t>(h.value) << 16);
|
||||||
unsigned short* q = reinterpret_cast<unsigned short*>(&result);
|
|
||||||
#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
|
|
||||||
q[0] = h.value;
|
|
||||||
#else
|
|
||||||
q[1] = h.value;
|
|
||||||
#endif
|
|
||||||
return result;
|
|
||||||
}
|
}
|
||||||
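Both bfloat16 hunks above replace endian-dependent pointer punning with shifts on the float's bit pattern: truncation keeps the top 16 bits, widening restores them with zeroed low bits. A standalone sketch of the same conversions, using memcpy where the library uses numext::bit_cast:

#include <cstdint>
#include <cstring>
#include <cstdio>

// Truncate a float to bfloat16 by keeping its top 16 bits (round toward zero).
static std::uint16_t float_to_bf16_trunc(float f) {
  std::uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  return static_cast<std::uint16_t>(bits >> 16);
}

// Widen a bfloat16 back to float by restoring it as the top 16 bits.
static float bf16_to_float(std::uint16_t h) {
  std::uint32_t bits = static_cast<std::uint32_t>(h) << 16;
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}

int main() {
  float x = 1.5f;  // exactly representable in bfloat16
  std::uint16_t h = float_to_bf16_trunc(x);
  std::printf("%g -> 0x%04x -> %g\n", x, h, bf16_to_float(h));
  return 0;
}

Working on the integer bit pattern is what makes the new code byte-order independent, so the __BYTE_ORDER__ branches can go.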
// --- standard functions ---
|
// --- standard functions ---
|
||||||
|
|
||||||
|
|||||||
@@ -642,10 +642,10 @@ Packet psincos_float(const Packet& _x)
|
|||||||
PacketI y_int = preinterpret<PacketI>(y_round); // last 23 digits represent integer (if abs(x)<2^24)
|
PacketI y_int = preinterpret<PacketI>(y_round); // last 23 digits represent integer (if abs(x)<2^24)
|
||||||
y = psub(y_round, cst_rounding_magic); // nearest integer to x*4/pi
|
y = psub(y_round, cst_rounding_magic); // nearest integer to x*4/pi
|
||||||
|
|
||||||
// Reduce x by y octants to get: -Pi/4 <= x <= +Pi/4
|
// Subtract y * Pi/2 to reduce x to the interval -Pi/4 <= x <= +Pi/4
|
||||||
// using "Extended precision modular arithmetic"
|
// using "Extended precision modular arithmetic"
|
||||||
#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD)
|
#if defined(EIGEN_VECTORIZE_FMA)
|
||||||
// This version requires true FMA for high accuracy
|
// This version requires true FMA for high accuracy.
|
||||||
// It provides a max error of 1 ULP up to the thresholds below (with absolute_error < 5.9605e-08):
|
// It provides a max error of 1 ULP up to the thresholds below (with absolute_error < 5.9605e-08):
|
||||||

const float huge_th = ComputeSine ? 117435.992f : 71476.0625f;
|
const float huge_th = ComputeSine ? 117435.992f : 71476.0625f;
|
||||||
x = pmadd(y, pset1<Packet>(-1.57079601287841796875f), x);
|
x = pmadd(y, pset1<Packet>(-1.57079601287841796875f), x);
|
||||||
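The reduction step above subtracts y * pi/2 in pieces so that the subtraction stays (nearly) exact, and a true FMA keeps the intermediate products unrounded. A scalar sketch of that style of two-constant reduction; the constants are the common fdlibm single-precision split of pi/2 and the scaling is illustrative, not Eigen's exact kernel:

#include <cmath>
#include <cstdio>

// Two-constant (Cody-Waite style) reduction: x - n*pi/2 computed as
// fma(-n, lo, fma(-n, hi, x)), where hi + lo ~= pi/2 and hi has enough
// trailing zero bits that x - n*hi is exact for moderate n.
static float reduce_quarter_pi(float x, int& quadrant) {
  const float inv_pio2 = 0.63661977236758134308f;   // 2/pi
  const float pio2_hi  = 1.5707962513e+00f;         // high part of pi/2
  const float pio2_lo  = 7.5497894159e-08f;         // low part of pi/2
  float n = std::nearbyint(x * inv_pio2);           // nearest multiple of pi/2
  quadrant = static_cast<int>(n) & 3;
  float r = std::fma(-n, pio2_hi, x);               // first, nearly exact subtraction
  r = std::fma(-n, pio2_lo, r);                     // fold in the low part
  return r;                                          // roughly in [-pi/4, pi/4]
}

int main() {
  int q;
  float r = reduce_quarter_pi(10.0f, q);
  std::printf("x = 10: quadrant %d, reduced %.7f\n", q, r);
  return 0;
}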
@@ -757,6 +757,26 @@ Packet pcos_float(const Packet& x)
|
|||||||
return psincos_float<false>(x);
|
return psincos_float<false>(x);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<typename Packet>
|
||||||
|
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||||
|
EIGEN_UNUSED Packet pdiv_complex(const Packet& x, const Packet& y) {
|
||||||
|
typedef typename unpacket_traits<Packet>::as_real RealPacket;
|
||||||
|
// In the following we annotate the code for the case where the inputs
|
||||||
|
  // are a pair of length-2 SIMD vectors representing a single pair of complex
|
||||||
|
// numbers x = a + i*b, y = c + i*d.
|
||||||
|
const RealPacket y_abs = pabs(y.v); // |c|, |d|
|
||||||
|
const RealPacket y_abs_flip = pcplxflip(Packet(y_abs)).v; // |d|, |c|
|
||||||
|
const RealPacket y_max = pmax(y_abs, y_abs_flip); // max(|c|, |d|), max(|c|, |d|)
|
||||||
|
const RealPacket y_scaled = pdiv(y.v, y_max); // c / max(|c|, |d|), d / max(|c|, |d|)
|
||||||
|
// Compute scaled denominator.
|
||||||
|
const RealPacket y_scaled_sq = pmul(y_scaled, y_scaled); // c'**2, d'**2
|
||||||
|
const RealPacket denom = padd(y_scaled_sq, pcplxflip(Packet(y_scaled_sq)).v);
|
||||||
|
  Packet result_scaled = pmul(x, pconj(Packet(y_scaled))); // a * c' + b * d', b * c' - a * d'
|
||||||
|
// Divide elementwise by denom.
|
||||||
|
result_scaled = Packet(pdiv(result_scaled.v, denom));
|
||||||
|
// Rescale result
|
||||||
|
return Packet(pdiv(result_scaled.v, y_max));
|
||||||
|
}
|
||||||
|
|
||||||
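pdiv_complex above avoids overflow and underflow in |y|^2 by first dividing the denominator through by max(|c|, |d|). The same scaling in scalar form (a Smith-style division, with names of my choosing):

#include <complex>
#include <algorithm>
#include <cmath>
#include <cstdio>

// Scalar model of the packet code above: divide (a + i*b) / (c + i*d) after
// scaling the divisor by s = max(|c|, |d|), so c'^2 + d'^2 cannot overflow.
static std::complex<double> scaled_cdiv(std::complex<double> x, std::complex<double> y) {
  double a = x.real(), b = x.imag();
  double c = y.real(), d = y.imag();
  double s = std::max(std::abs(c), std::abs(d));
  double cs = c / s, ds = d / s;                  // scaled divisor
  double denom = cs * cs + ds * ds;               // |y|^2 / s^2
  // numerator times conj(scaled divisor), then undo the remaining 1/s factor
  double re = (a * cs + b * ds) / denom / s;
  double im = (b * cs - a * ds) / denom / s;
  return std::complex<double>(re, im);
}

int main() {
  std::complex<double> x(1.0, 2.0), y(3.0, -4.0);
  std::complex<double> q = scaled_cdiv(x, y);
  std::printf("(1+2i)/(3-4i) = %g%+gi (std: %g%+gi)\n",
              q.real(), q.imag(), (x / y).real(), (x / y).imag());  // -0.2+0.4i
  return 0;
}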
template<typename Packet>
|
template<typename Packet>
|
||||||
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||||
@@ -895,7 +915,7 @@ void fast_twosum(const Packet& x, const Packet& y, Packet& s_hi, Packet& s_lo) {
|
|||||||
s_lo = psub(y, t);
|
s_lo = psub(y, t);
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
|
#ifdef EIGEN_VECTORIZE_FMA
|
||||||
// This function implements the extended precision product of
|
// This function implements the extended precision product of
|
||||||
// a pair of floating point numbers. Given {x, y}, it computes the pair
|
// a pair of floating point numbers. Given {x, y}, it computes the pair
|
||||||
// {p_hi, p_lo} such that x * y = p_hi + p_lo holds exactly and
|
// {p_hi, p_lo} such that x * y = p_hi + p_lo holds exactly and
|
||||||
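With a true FMA, the error-free product described above is two operations: the high part is the rounded product, and the low part is fma(x, y, -p_hi). A scalar sketch:

#include <cmath>
#include <cstdio>

// Error-free transformation: p_hi + p_lo == x * y exactly (barring overflow),
// relying on fma evaluating x*y - p_hi with a single rounding.
static void twoprod(double x, double y, double& p_hi, double& p_lo) {
  p_hi = x * y;
  p_lo = std::fma(x, y, -p_hi);
}

int main() {
  // (2^27+1) * (2^27-1) = 2^54 - 1 is not representable, so p_lo captures
  // the rounding error of the plain product.
  double hi, lo;
  twoprod(134217729.0, 134217727.0, hi, lo);
  std::printf("hi = %.17g, lo = %.17g\n", hi, lo);  // lo = -1
  return 0;
}

Without FMA the function falls back to splitting each operand into high/low halves, which is why the whole block is now guarded by EIGEN_VECTORIZE_FMA instead of the old EIGEN_HAS_SINGLE_INSTRUCTION_MADD.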
@@ -946,7 +966,7 @@ void twoprod(const Packet& x, const Packet& y,
|
|||||||
p_lo = pmadd(x_lo, y_lo, p_lo);
|
p_lo = pmadd(x_lo, y_lo, p_lo);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif // EIGEN_HAS_SINGLE_INSTRUCTION_MADD
|
#endif // EIGEN_VECTORIZE_FMA
|
||||||
|
|
||||||
|
|
||||||
// This function implements Dekker's algorithm for the addition
|
// This function implements Dekker's algorithm for the addition
|
||||||
@@ -1443,21 +1463,22 @@ EIGEN_STRONG_INLINE Packet generic_pow_impl(const Packet& x, const Packet& y) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Generic implementation of pow(x,y).
|
// Generic implementation of pow(x,y).
|
||||||
template<typename Packet>
|
template <typename Packet>
|
||||||
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_pow(const Packet& x, const Packet& y) {
|
||||||
EIGEN_UNUSED
|
|
||||||
Packet generic_pow(const Packet& x, const Packet& y) {
|
|
||||||
typedef typename unpacket_traits<Packet>::type Scalar;
|
typedef typename unpacket_traits<Packet>::type Scalar;
|
||||||
|
|
||||||
const Packet cst_pos_inf = pset1<Packet>(NumTraits<Scalar>::infinity());
|
const Packet cst_pos_inf = pset1<Packet>(NumTraits<Scalar>::infinity());
|
||||||
|
const Packet cst_neg_inf = pset1<Packet>(-NumTraits<Scalar>::infinity());
|
||||||
const Packet cst_zero = pset1<Packet>(Scalar(0));
|
const Packet cst_zero = pset1<Packet>(Scalar(0));
|
||||||
const Packet cst_one = pset1<Packet>(Scalar(1));
|
const Packet cst_one = pset1<Packet>(Scalar(1));
|
||||||
const Packet cst_nan = pset1<Packet>(NumTraits<Scalar>::quiet_NaN());
|
const Packet cst_nan = pset1<Packet>(NumTraits<Scalar>::quiet_NaN());
|
||||||
|
|
||||||
const Packet abs_x = pabs(x);
|
const Packet abs_x = pabs(x);
|
||||||
// Predicates for sign and magnitude of x.
|
// Predicates for sign and magnitude of x.
|
||||||
const Packet x_is_zero = pcmp_eq(x, cst_zero);
|
const Packet abs_x_is_zero = pcmp_eq(abs_x, cst_zero);
|
||||||
const Packet x_is_neg = pcmp_lt(x, cst_zero);
|
const Packet x_has_signbit = pcmp_eq(por(pand(x, cst_neg_inf), cst_pos_inf), cst_neg_inf);
|
||||||
|
const Packet x_is_neg = pandnot(x_has_signbit, abs_x_is_zero);
|
||||||
|
const Packet x_is_neg_zero = pand(x_has_signbit, abs_x_is_zero);
|
||||||
const Packet abs_x_is_inf = pcmp_eq(abs_x, cst_pos_inf);
|
const Packet abs_x_is_inf = pcmp_eq(abs_x, cst_pos_inf);
|
||||||
const Packet abs_x_is_one = pcmp_eq(abs_x, cst_one);
|
const Packet abs_x_is_one = pcmp_eq(abs_x, cst_one);
|
||||||
const Packet abs_x_is_gt_one = pcmp_lt(cst_one, abs_x);
|
const Packet abs_x_is_gt_one = pcmp_lt(cst_one, abs_x);
|
||||||
@@ -1467,15 +1488,15 @@ Packet generic_pow(const Packet& x, const Packet& y) {
|
|||||||
const Packet x_is_nan = pandnot(ptrue(x), pcmp_eq(x, x));
|
const Packet x_is_nan = pandnot(ptrue(x), pcmp_eq(x, x));
|
||||||
|
|
||||||
// Predicates for sign and magnitude of y.
|
// Predicates for sign and magnitude of y.
|
||||||
|
const Packet abs_y = pabs(y);
|
||||||
const Packet y_is_one = pcmp_eq(y, cst_one);
|
const Packet y_is_one = pcmp_eq(y, cst_one);
|
||||||
const Packet y_is_zero = pcmp_eq(y, cst_zero);
|
const Packet abs_y_is_zero = pcmp_eq(abs_y, cst_zero);
|
||||||
const Packet y_is_neg = pcmp_lt(y, cst_zero);
|
const Packet y_is_neg = pcmp_lt(y, cst_zero);
|
||||||
const Packet y_is_pos = pandnot(ptrue(y), por(y_is_zero, y_is_neg));
|
const Packet y_is_pos = pandnot(ptrue(y), por(abs_y_is_zero, y_is_neg));
|
||||||
const Packet y_is_nan = pandnot(ptrue(y), pcmp_eq(y, y));
|
const Packet y_is_nan = pandnot(ptrue(y), pcmp_eq(y, y));
|
||||||
const Packet abs_y_is_inf = pcmp_eq(pabs(y), cst_pos_inf);
|
const Packet abs_y_is_inf = pcmp_eq(abs_y, cst_pos_inf);
|
||||||
EIGEN_CONSTEXPR Scalar huge_exponent =
|
EIGEN_CONSTEXPR Scalar huge_exponent =
|
||||||
(NumTraits<Scalar>::max_exponent() * Scalar(EIGEN_LN2)) /
|
(NumTraits<Scalar>::max_exponent() * Scalar(EIGEN_LN2)) / NumTraits<Scalar>::epsilon();
|
||||||
NumTraits<Scalar>::epsilon();
|
|
||||||
const Packet abs_y_is_huge = pcmp_le(pset1<Packet>(huge_exponent), pabs(y));
|
const Packet abs_y_is_huge = pcmp_le(pset1<Packet>(huge_exponent), pabs(y));
|
||||||
|
|
||||||
// Predicates for whether y is integer and/or even.
|
// Predicates for whether y is integer and/or even.
|
||||||
@@ -1484,39 +1505,31 @@ Packet generic_pow(const Packet& x, const Packet& y) {
|
|||||||
const Packet y_is_even = pcmp_eq(pround(y_div_2), y_div_2);
|
const Packet y_is_even = pcmp_eq(pround(y_div_2), y_div_2);
|
||||||
|
|
||||||
// Predicates encoding special cases for the value of pow(x,y)
|
// Predicates encoding special cases for the value of pow(x,y)
|
||||||
const Packet invalid_negative_x = pandnot(pandnot(pandnot(x_is_neg, abs_x_is_inf),
|
const Packet invalid_negative_x = pandnot(pandnot(pandnot(x_is_neg, abs_x_is_inf), y_is_int), abs_y_is_inf);
|
||||||
y_is_int),
|
|
||||||
abs_y_is_inf);
|
|
||||||
const Packet pow_is_one = por(por(x_is_one, y_is_zero),
|
|
||||||
pand(x_is_neg_one,
|
|
||||||
por(abs_y_is_inf, pandnot(y_is_even, invalid_negative_x))));
|
|
||||||
const Packet pow_is_nan = por(invalid_negative_x, por(x_is_nan, y_is_nan));
|
const Packet pow_is_nan = por(invalid_negative_x, por(x_is_nan, y_is_nan));
|
||||||
const Packet pow_is_zero = por(por(por(pand(x_is_zero, y_is_pos),
|
const Packet pow_is_one =
|
||||||
pand(abs_x_is_inf, y_is_neg)),
|
por(por(x_is_one, abs_y_is_zero), pand(x_is_neg_one, por(abs_y_is_inf, pandnot(y_is_even, invalid_negative_x))));
|
||||||
pand(pand(abs_x_is_lt_one, abs_y_is_huge),
|
const Packet pow_is_zero = por(por(por(pand(abs_x_is_zero, y_is_pos), pand(abs_x_is_inf, y_is_neg)),
|
||||||
y_is_pos)),
|
pand(pand(abs_x_is_lt_one, abs_y_is_huge), y_is_pos)),
|
||||||
pand(pand(abs_x_is_gt_one, abs_y_is_huge),
|
pand(pand(abs_x_is_gt_one, abs_y_is_huge), y_is_neg));
|
||||||
y_is_neg));
|
const Packet pow_is_inf = por(por(por(pand(abs_x_is_zero, y_is_neg), pand(abs_x_is_inf, y_is_pos)),
|
||||||
const Packet pow_is_inf = por(por(por(pand(x_is_zero, y_is_neg),
|
pand(pand(abs_x_is_lt_one, abs_y_is_huge), y_is_neg)),
|
||||||
pand(abs_x_is_inf, y_is_pos)),
|
pand(pand(abs_x_is_gt_one, abs_y_is_huge), y_is_pos));
|
||||||
pand(pand(abs_x_is_lt_one, abs_y_is_huge),
|
const Packet inf_val =
|
||||||
y_is_neg)),
|
pselect(pandnot(pand(por(pand(abs_x_is_inf, x_is_neg), pand(x_is_neg_zero, y_is_neg)), y_is_int), y_is_even),
|
||||||
pand(pand(abs_x_is_gt_one, abs_y_is_huge),
|
cst_neg_inf, cst_pos_inf);
|
||||||
y_is_pos));
|
|
||||||
|
|
||||||
// General computation of pow(x,y) for positive x or negative x and integer y.
|
// General computation of pow(x,y) for positive x or negative x and integer y.
|
||||||
const Packet negate_pow_abs = pandnot(x_is_neg, y_is_even);
|
const Packet negate_pow_abs = pandnot(x_is_neg, y_is_even);
|
||||||
const Packet pow_abs = generic_pow_impl(abs_x, y);
|
const Packet pow_abs = generic_pow_impl(abs_x, y);
|
||||||
return pselect(y_is_one, x,
|
return pselect(
|
||||||
|
y_is_one, x,
|
||||||
pselect(pow_is_one, cst_one,
|
pselect(pow_is_one, cst_one,
|
||||||
pselect(pow_is_nan, cst_nan,
|
pselect(pow_is_nan, cst_nan,
|
||||||
pselect(pow_is_inf, cst_pos_inf,
|
pselect(pow_is_inf, inf_val,
|
||||||
pselect(pow_is_zero, cst_zero,
|
pselect(pow_is_zero, cst_zero, pselect(negate_pow_abs, pnegate(pow_abs), pow_abs))))));
|
||||||
pselect(negate_pow_abs, pnegate(pow_abs), pow_abs))))));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
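The predicates above are a branch-free encoding of the C99/IEEE pow special cases, and as I read the reworked selects, inf_val now returns -inf rather than +inf when the base is -0 or -inf and the exponent is an odd integer. A scalar reference for a few of the cases the packet path must reproduce:

#include <cmath>
#include <cstdio>

int main() {
  // A handful of the special cases the branch-free predicates above encode.
  std::printf("pow(-1, inf)  = %g\n", std::pow(-1.0, INFINITY));  // 1
  std::printf("pow(0, -3)    = %g\n", std::pow(0.0, -3.0));       // +inf
  std::printf("pow(-0, -3)   = %g\n", std::pow(-0.0, -3.0));      // -inf (odd integer exponent)
  std::printf("pow(-2, 0.5)  = %g\n", std::pow(-2.0, 0.5));       // nan  (negative base, non-integer y)
  std::printf("pow(nan, 0)   = %g\n", std::pow(NAN, 0.0));        // 1
  std::printf("pow(0.5, inf) = %g\n", std::pow(0.5, INFINITY));   // 0
  return 0;
}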
/* polevl (modified for Eigen)
|
/* polevl (modified for Eigen)
|
||||||
*
|
*
|
||||||
* Evaluate polynomial
|
* Evaluate polynomial
|
||||||
|
|||||||
@@ -101,6 +101,12 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
|||||||
EIGEN_UNUSED
|
EIGEN_UNUSED
|
||||||
Packet psqrt_complex(const Packet& a);
|
Packet psqrt_complex(const Packet& a);
|
||||||
|
|
||||||
|
/** \internal \returns x / y for complex types */
|
||||||
|
template<typename Packet>
|
||||||
|
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
|
||||||
|
EIGEN_UNUSED
|
||||||
|
Packet pdiv_complex(const Packet& x, const Packet& y);
|
||||||
|
|
||||||
template <typename Packet, int N> struct ppolevl;
|
template <typename Packet, int N> struct ppolevl;
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -36,8 +36,6 @@
|
|||||||
#ifndef EIGEN_HALF_H
|
#ifndef EIGEN_HALF_H
|
||||||
#define EIGEN_HALF_H
|
#define EIGEN_HALF_H
|
||||||
|
|
||||||
#include <sstream>
|
|
||||||
|
|
||||||
#if defined(EIGEN_HAS_GPU_FP16) || defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
|
#if defined(EIGEN_HAS_GPU_FP16) || defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
|
||||||
// When compiling with GPU support, the "__half_raw" base class as well as
|
// When compiling with GPU support, the "__half_raw" base class as well as
|
||||||
// some other routines are defined in the GPU compiler header files
|
// some other routines are defined in the GPU compiler header files
|
||||||
@@ -334,7 +332,7 @@ EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) {
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
|
#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) && !defined(EIGEN_GPU_COMPILE_PHASE)
|
||||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) {
|
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) {
|
||||||
return half(vaddh_f16(a.x, b.x));
|
return half(vaddh_f16(a.x, b.x));
|
||||||
}
|
}
|
||||||
@@ -534,7 +532,12 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) {
|
|||||||
|
|
||||||
#elif defined(EIGEN_HAS_FP16_C)
|
#elif defined(EIGEN_HAS_FP16_C)
|
||||||
__half_raw h;
|
__half_raw h;
|
||||||
|
#if EIGEN_COMP_MSVC
|
||||||
|
// MSVC does not have scalar instructions.
|
||||||
|
h.x =_mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(ff), 0), 0);
|
||||||
|
#else
|
||||||
h.x = _cvtss_sh(ff, 0);
|
h.x = _cvtss_sh(ff, 0);
|
||||||
|
#endif
|
||||||
return h;
|
return h;
|
||||||

|
|
||||||
#elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
|
#elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
|
||||||
@@ -595,7 +598,12 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) {
|
|||||||
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
|
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
|
||||||
return __half2float(h);
|
return __half2float(h);
|
||||||
#elif defined(EIGEN_HAS_FP16_C)
|
#elif defined(EIGEN_HAS_FP16_C)
|
||||||
|
#if EIGEN_COMP_MSVC
|
||||||
|
// MSVC does not have scalar instructions.
|
||||||
|
return _mm_cvtss_f32(_mm_cvtph_ps(_mm_set1_epi16(h.x)));
|
||||||
|
#else
|
||||||
return _cvtsh_ss(h.x);
|
return _cvtsh_ss(h.x);
|
||||||
|
#endif
|
||||||
#elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
|
#elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
|
||||||
return static_cast<float>(h.x);
|
return static_cast<float>(h.x);
|
||||||
#else
|
#else
|
||||||
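Both conversion hunks above work around MSVC's missing scalar F16C intrinsics (_cvtss_sh/_cvtsh_ss) by going through the packed forms and extracting lane 0. A standalone sketch of that round trip (x86 with F16C only; the function names are illustrative):

#include <immintrin.h>
#include <cstdint>
#include <cstdio>

// Packed-intrinsic fallback: place the scalar in an SSE register, convert
// the whole vector, then pull out element 0.
static std::uint16_t f32_to_f16_bits(float f) {
  // imm8 = 0 selects round-to-nearest, matching the diff above.
  return static_cast<std::uint16_t>(_mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(f), 0), 0));
}

static float f16_bits_to_f32(std::uint16_t h) {
  return _mm_cvtss_f32(_mm_cvtph_ps(_mm_set1_epi16(static_cast<short>(h))));
}

int main() {
  std::uint16_t h = f32_to_f16_bits(0.5f);
  std::printf("0.5f -> 0x%04x -> %g\n", h, f16_bits_to_f32(h));
  return 0;
}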
|
|||||||
@@ -121,7 +121,6 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1<double2>(const do
|
|||||||
// invoked by NVCC’ (e.g. on MacOS). The former needs to see both host and device implementation
|
// invoked by NVCC’ (e.g. on MacOS). The former needs to see both host and device implementation
|
||||||
// of the functions, while the latter can only deal with one of them.
|
// of the functions, while the latter can only deal with one of them.
|
||||||
#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
|
#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
|
||||||
namespace {
|
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_and(const float& a,
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_and(const float& a,
|
||||||
const float& b) {
|
const float& b) {
|
||||||
@@ -180,8 +179,6 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double lt_mask(const double& a,
|
|||||||
return __longlong_as_double(a < b ? 0xffffffffffffffffull : 0ull);
|
return __longlong_as_double(a < b ? 0xffffffffffffffffull : 0ull);
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace
|
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pand<float4>(const float4& a,
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pand<float4>(const float4& a,
|
||||||
const float4& b) {
|
const float4& b) {
|
||||||
@@ -493,9 +490,10 @@ ptranspose(PacketBlock<double2,2>& kernel) {
|
|||||||
|
|
||||||
#endif // defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
|
#endif // defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
|
||||||
|
|
||||||
// Packet4h2 must be defined in the macro without EIGEN_CUDA_ARCH, meaning
|
// Half-packet functions are not available on the host for CUDA 9.0-9.2, only
|
||||||
// its corresponding packet_traits<Eigen::half> must be visible on host.
|
// on device. There is no benefit to using them on the host anyway, since they are
|
||||||
#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
|
// emulated.
|
||||||
|
#if (defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)) && defined(EIGEN_GPU_COMPILE_PHASE)
|
||||||
|
|
||||||
typedef ulonglong2 Packet4h2;
|
typedef ulonglong2 Packet4h2;
|
||||||
template<> struct unpacket_traits<Packet4h2> { typedef Eigen::half type; enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4h2 half; };
|
template<> struct unpacket_traits<Packet4h2> { typedef Eigen::half type; enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4h2 half; };
|
||||||
@@ -526,42 +524,9 @@ template<> struct packet_traits<Eigen::half> : default_packet_traits
|
|||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
namespace {
|
|
||||||
// This is equivalent to make_half2, which is undocumented and doesn't seem to always exist.
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 combine_half(const __half& a, const __half& b) {
|
|
||||||
#if defined(EIGEN_GPU_COMPILE_PHASE)
|
|
||||||
return __halves2half2(a, b);
|
|
||||||
#else
|
|
||||||
// Round-about way since __halves2half2 is a __device__ function.
|
|
||||||
return __floats2half2_rn(__half2float(a), __half2float(b));
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE __half get_half2_low(const half2& a) {
|
|
||||||
#if defined(EIGEN_GPU_COMPILE_PHASE)
|
|
||||||
return __low2half(a);
|
|
||||||
#else
|
|
||||||
return __float2half(__low2float(a));
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE __half get_half2_high(const half2& a) {
|
|
||||||
#if defined(EIGEN_GPU_COMPILE_PHASE)
|
|
||||||
return __high2half(a);
|
|
||||||
#else
|
|
||||||
return __float2half(__high2float(a));
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
} // namespace
|
|
||||||
|
|
||||||
template<>
|
template<>
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
|
||||||
#if defined(EIGEN_GPU_COMPILE_PHASE)
|
|
||||||
return __half2half2(from);
|
return __half2half2(from);
|
||||||
#else
|
|
||||||
const float f = __half2float(from);
|
|
||||||
return __floats2half2_rn(f, f);
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
@@ -576,8 +541,6 @@ pset1<Packet4h2>(const Eigen::half& from) {
|
|||||||
return r;
|
return r;
|
||||||
}
|
}
|
||||||
|
|
||||||
// We now need this visible on both host and device.
|
|
||||||
// #if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
|
|
||||||
namespace {
|
namespace {
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload(const Eigen::half* from) {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload(const Eigen::half* from) {
|
||||||
@@ -585,11 +548,11 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload(const Eigen::half* from) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu(const Eigen::half* from) {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu(const Eigen::half* from) {
|
||||||
return combine_half(from[0], from[1]);
|
return __halves2half2(from[0], from[1]);
|
||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploaddup(const Eigen::half* from) {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploaddup(const Eigen::half* from) {
|
||||||
return combine_half(from[0], from[0]);
|
return __halves2half2(from[0], from[0]);
|
||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(Eigen::half* to,
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(Eigen::half* to,
|
||||||
@@ -599,8 +562,8 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(Eigen::half* to,
|
|||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to,
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to,
|
||||||
const half2& from) {
|
const half2& from) {
|
||||||
to[0] = get_half2_low(from);
|
to[0] = __low2half(from);
|
||||||
to[1] = get_half2_high(from);
|
to[1] = __high2half(from);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -610,7 +573,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_aligned(
|
|||||||
// Input is guaranteed to be properly aligned.
|
// Input is guaranteed to be properly aligned.
|
||||||
return __ldg(reinterpret_cast<const half2*>(from));
|
return __ldg(reinterpret_cast<const half2*>(from));
|
||||||
#else
|
#else
|
||||||
return combine_half(*(from+0), *(from+1));
|
return __halves2half2(*(from+0), *(from+1));
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -619,31 +582,31 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_unaligned(
|
|||||||
#if defined(EIGEN_GPU_HAS_LDG)
|
#if defined(EIGEN_GPU_HAS_LDG)
|
||||||
return __halves2half2(__ldg(from+0), __ldg(from+1));
|
return __halves2half2(__ldg(from+0), __ldg(from+1));
|
||||||
#else
|
#else
|
||||||
return combine_half(*(from+0), *(from+1));
|
return __halves2half2(*(from+0), *(from+1));
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pgather(const Eigen::half* from,
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pgather(const Eigen::half* from,
|
||||||
Index stride) {
|
Index stride) {
|
||||||
return combine_half(from[0*stride], from[1*stride]);
|
return __halves2half2(from[0*stride], from[1*stride]);
|
||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(
|
||||||
Eigen::half* to, const half2& from, Index stride) {
|
Eigen::half* to, const half2& from, Index stride) {
|
||||||
to[stride*0] = get_half2_low(from);
|
to[stride*0] = __low2half(from);
|
||||||
to[stride*1] = get_half2_high(from);
|
to[stride*1] = __high2half(from);
|
||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst(const half2& a) {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst(const half2& a) {
|
||||||
return get_half2_low(a);
|
return __low2half(a);
|
||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs(const half2& a) {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs(const half2& a) {
|
||||||
half a1 = get_half2_low(a);
|
half a1 = __low2half(a);
|
||||||
half a2 = get_half2_high(a);
|
half a2 = __high2half(a);
|
||||||
half result1 = half_impl::raw_uint16_to_half(a1.x & 0x7FFF);
|
half result1 = half_impl::raw_uint16_to_half(a1.x & 0x7FFF);
|
||||||
half result2 = half_impl::raw_uint16_to_half(a2.x & 0x7FFF);
|
half result2 = half_impl::raw_uint16_to_half(a2.x & 0x7FFF);
|
||||||
return combine_half(result1, result2);
|
return __halves2half2(result1, result2);
|
||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ptrue(const half2& /*a*/) {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ptrue(const half2& /*a*/) {
|
||||||
@@ -658,12 +621,12 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pzero(const half2& /*a*/) {
|
|||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
|
||||||
ptranspose(PacketBlock<half2,2>& kernel) {
|
ptranspose(PacketBlock<half2,2>& kernel) {
|
||||||
__half a1 = get_half2_low(kernel.packet[0]);
|
__half a1 = __low2half(kernel.packet[0]);
|
||||||
__half a2 = get_half2_high(kernel.packet[0]);
|
__half a2 = __high2half(kernel.packet[0]);
|
||||||
__half b1 = get_half2_low(kernel.packet[1]);
|
__half b1 = __low2half(kernel.packet[1]);
|
||||||
__half b2 = get_half2_high(kernel.packet[1]);
|
__half b2 = __high2half(kernel.packet[1]);
|
||||||
kernel.packet[0] = combine_half(a1, b1);
|
kernel.packet[0] = __halves2half2(a1, b1);
|
||||||
kernel.packet[1] = combine_half(a2, b2);
|
kernel.packet[1] = __halves2half2(a2, b2);
|
||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) {
|
||||||
@@ -671,88 +634,88 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) {
|
|||||||
return __halves2half2(a, __hadd(a, __float2half(1.0f)));
|
return __halves2half2(a, __hadd(a, __float2half(1.0f)));
|
||||||
#else
|
#else
|
||||||
float f = __half2float(a) + 1.0f;
|
float f = __half2float(a) + 1.0f;
|
||||||
return combine_half(a, __float2half(f));
|
return __halves2half2(a, __float2half(f));
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pselect(const half2& mask,
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pselect(const half2& mask,
|
||||||
const half2& a,
|
const half2& a,
|
||||||
const half2& b) {
|
const half2& b) {
|
||||||
half mask_low = get_half2_low(mask);
|
half mask_low = __low2half(mask);
|
||||||
half mask_high = get_half2_high(mask);
|
half mask_high = __high2half(mask);
|
||||||
half result_low = mask_low == half(0) ? get_half2_low(b) : get_half2_low(a);
|
half result_low = mask_low == half(0) ? __low2half(b) : __low2half(a);
|
||||||
half result_high = mask_high == half(0) ? get_half2_high(b) : get_half2_high(a);
|
half result_high = mask_high == half(0) ? __high2half(b) : __high2half(a);
|
||||||
return combine_half(result_low, result_high);
|
return __halves2half2(result_low, result_high);
|
||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_eq(const half2& a,
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_eq(const half2& a,
|
||||||
const half2& b) {
|
const half2& b) {
|
||||||
half true_half = half_impl::raw_uint16_to_half(0xffffu);
|
half true_half = half_impl::raw_uint16_to_half(0xffffu);
|
||||||
half false_half = half_impl::raw_uint16_to_half(0x0000u);
|
half false_half = half_impl::raw_uint16_to_half(0x0000u);
|
||||||
half a1 = get_half2_low(a);
|
half a1 = __low2half(a);
|
||||||
half a2 = get_half2_high(a);
|
half a2 = __high2half(a);
|
||||||
half b1 = get_half2_low(b);
|
half b1 = __low2half(b);
|
||||||
half b2 = get_half2_high(b);
|
half b2 = __high2half(b);
|
||||||
half eq1 = __half2float(a1) == __half2float(b1) ? true_half : false_half;
|
half eq1 = __half2float(a1) == __half2float(b1) ? true_half : false_half;
|
||||||
half eq2 = __half2float(a2) == __half2float(b2) ? true_half : false_half;
|
half eq2 = __half2float(a2) == __half2float(b2) ? true_half : false_half;
|
||||||
return combine_half(eq1, eq2);
|
return __halves2half2(eq1, eq2);
|
||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_lt(const half2& a,
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_lt(const half2& a,
|
||||||
const half2& b) {
|
const half2& b) {
|
||||||
half true_half = half_impl::raw_uint16_to_half(0xffffu);
|
half true_half = half_impl::raw_uint16_to_half(0xffffu);
|
||||||
half false_half = half_impl::raw_uint16_to_half(0x0000u);
|
half false_half = half_impl::raw_uint16_to_half(0x0000u);
|
||||||
half a1 = get_half2_low(a);
|
half a1 = __low2half(a);
|
||||||
half a2 = get_half2_high(a);
|
half a2 = __high2half(a);
|
||||||
half b1 = get_half2_low(b);
|
half b1 = __low2half(b);
|
||||||
half b2 = get_half2_high(b);
|
half b2 = __high2half(b);
|
||||||
half eq1 = __half2float(a1) < __half2float(b1) ? true_half : false_half;
|
half eq1 = __half2float(a1) < __half2float(b1) ? true_half : false_half;
|
||||||
half eq2 = __half2float(a2) < __half2float(b2) ? true_half : false_half;
|
half eq2 = __half2float(a2) < __half2float(b2) ? true_half : false_half;
|
||||||
return combine_half(eq1, eq2);
|
return __halves2half2(eq1, eq2);
|
||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pand(const half2& a,
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pand(const half2& a,
|
||||||
const half2& b) {
|
const half2& b) {
|
||||||
half a1 = get_half2_low(a);
|
half a1 = __low2half(a);
|
||||||
half a2 = get_half2_high(a);
|
half a2 = __high2half(a);
|
||||||
half b1 = get_half2_low(b);
|
half b1 = __low2half(b);
|
||||||
half b2 = get_half2_high(b);
|
half b2 = __high2half(b);
|
||||||
half result1 = half_impl::raw_uint16_to_half(a1.x & b1.x);
|
half result1 = half_impl::raw_uint16_to_half(a1.x & b1.x);
|
||||||
half result2 = half_impl::raw_uint16_to_half(a2.x & b2.x);
|
half result2 = half_impl::raw_uint16_to_half(a2.x & b2.x);
|
||||||
return combine_half(result1, result2);
|
return __halves2half2(result1, result2);
|
||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 por(const half2& a,
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 por(const half2& a,
|
||||||
const half2& b) {
|
const half2& b) {
|
||||||
half a1 = get_half2_low(a);
|
half a1 = __low2half(a);
|
||||||
half a2 = get_half2_high(a);
|
half a2 = __high2half(a);
|
||||||
half b1 = get_half2_low(b);
|
half b1 = __low2half(b);
|
||||||
half b2 = get_half2_high(b);
|
half b2 = __high2half(b);
|
||||||
half result1 = half_impl::raw_uint16_to_half(a1.x | b1.x);
|
half result1 = half_impl::raw_uint16_to_half(a1.x | b1.x);
|
||||||
half result2 = half_impl::raw_uint16_to_half(a2.x | b2.x);
|
half result2 = half_impl::raw_uint16_to_half(a2.x | b2.x);
|
||||||
return combine_half(result1, result2);
|
return __halves2half2(result1, result2);
|
||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pxor(const half2& a,
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pxor(const half2& a,
|
||||||
const half2& b) {
|
const half2& b) {
|
||||||
half a1 = get_half2_low(a);
|
half a1 = __low2half(a);
|
||||||
half a2 = get_half2_high(a);
|
half a2 = __high2half(a);
|
||||||
half b1 = get_half2_low(b);
|
half b1 = __low2half(b);
|
||||||
half b2 = get_half2_high(b);
|
half b2 = __high2half(b);
|
||||||
half result1 = half_impl::raw_uint16_to_half(a1.x ^ b1.x);
|
half result1 = half_impl::raw_uint16_to_half(a1.x ^ b1.x);
|
||||||
half result2 = half_impl::raw_uint16_to_half(a2.x ^ b2.x);
|
half result2 = half_impl::raw_uint16_to_half(a2.x ^ b2.x);
|
||||||
return combine_half(result1, result2);
|
return __halves2half2(result1, result2);
|
||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pandnot(const half2& a,
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pandnot(const half2& a,
|
||||||
const half2& b) {
|
const half2& b) {
|
||||||
half a1 = get_half2_low(a);
|
half a1 = __low2half(a);
|
||||||
half a2 = get_half2_high(a);
|
half a2 = __high2half(a);
|
||||||
half b1 = get_half2_low(b);
|
half b1 = __low2half(b);
|
||||||
half b2 = get_half2_high(b);
|
half b2 = __high2half(b);
|
||||||
half result1 = half_impl::raw_uint16_to_half(a1.x & ~b1.x);
|
half result1 = half_impl::raw_uint16_to_half(a1.x & ~b1.x);
|
||||||
half result2 = half_impl::raw_uint16_to_half(a2.x & ~b2.x);
|
half result2 = half_impl::raw_uint16_to_half(a2.x & ~b2.x);
|
||||||
return combine_half(result1, result2);
|
return __halves2half2(result1, result2);
|
||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a,
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a,
|
||||||
@@ -851,9 +814,9 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2& a,
|
|||||||
float a2 = __high2float(a);
|
float a2 = __high2float(a);
|
||||||
float b1 = __low2float(b);
|
float b1 = __low2float(b);
|
||||||
float b2 = __high2float(b);
|
float b2 = __high2float(b);
|
||||||
__half r1 = a1 < b1 ? get_half2_low(a) : get_half2_low(b);
|
__half r1 = a1 < b1 ? __low2half(a) : __low2half(b);
|
||||||
__half r2 = a2 < b2 ? get_half2_high(a) : get_half2_high(b);
|
__half r2 = a2 < b2 ? __high2half(a) : __high2half(b);
|
||||||
return combine_half(r1, r2);
|
return __halves2half2(r1, r2);
|
||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a,
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a,
|
||||||
@@ -862,9 +825,9 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a,
|
|||||||
float a2 = __high2float(a);
|
float a2 = __high2float(a);
|
||||||
float b1 = __low2float(b);
|
float b1 = __low2float(b);
|
||||||
float b2 = __high2float(b);
|
float b2 = __high2float(b);
|
||||||
__half r1 = a1 > b1 ? get_half2_low(a) : get_half2_low(b);
|
__half r1 = a1 > b1 ? __low2half(a) : __low2half(b);
|
||||||
__half r2 = a2 > b2 ? get_half2_high(a) : get_half2_high(b);
|
__half r2 = a2 > b2 ? __high2half(a) : __high2half(b);
|
||||||
return combine_half(r1, r2);
|
return __halves2half2(r1, r2);
|
||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux(const half2& a) {
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux(const half2& a) {
|
||||||
@@ -885,7 +848,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max(const half2& a) {
|
|||||||
#else
|
#else
|
||||||
float a1 = __low2float(a);
|
float a1 = __low2float(a);
|
||||||
float a2 = __high2float(a);
|
float a2 = __high2float(a);
|
||||||
return a1 > a2 ? get_half2_low(a) : get_half2_high(a);
|
return a1 > a2 ? __low2half(a) : __high2half(a);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -897,7 +860,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min(const half2& a) {
|
|||||||
#else
|
#else
|
||||||
float a1 = __low2float(a);
|
float a1 = __low2float(a);
|
||||||
float a2 = __high2float(a);
|
float a2 = __high2float(a);
|
||||||
return a1 < a2 ? get_half2_low(a) : get_half2_high(a);
|
return a1 < a2 ? __low2half(a) : __high2half(a);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1068,10 +1031,10 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
|
|||||||
pgather<Eigen::half, Packet4h2>(const Eigen::half* from, Index stride) {
|
pgather<Eigen::half, Packet4h2>(const Eigen::half* from, Index stride) {
|
||||||
Packet4h2 r;
|
Packet4h2 r;
|
||||||
half2* p_alias = reinterpret_cast<half2*>(&r);
|
half2* p_alias = reinterpret_cast<half2*>(&r);
|
||||||
p_alias[0] = combine_half(from[0 * stride], from[1 * stride]);
|
p_alias[0] = __halves2half2(from[0 * stride], from[1 * stride]);
|
||||||
p_alias[1] = combine_half(from[2 * stride], from[3 * stride]);
|
p_alias[1] = __halves2half2(from[2 * stride], from[3 * stride]);
|
||||||
p_alias[2] = combine_half(from[4 * stride], from[5 * stride]);
|
p_alias[2] = __halves2half2(from[4 * stride], from[5 * stride]);
|
||||||
p_alias[3] = combine_half(from[6 * stride], from[7 * stride]);
|
p_alias[3] = __halves2half2(from[6 * stride], from[7 * stride]);
|
||||||
return r;
|
return r;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1152,12 +1115,12 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_half2(
|
|||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
|
||||||
ptranspose_half(half2& f0, half2& f1) {
|
ptranspose_half(half2& f0, half2& f1) {
|
||||||
__half a1 = get_half2_low(f0);
|
__half a1 = __low2half(f0);
|
||||||
__half a2 = get_half2_high(f0);
|
__half a2 = __high2half(f0);
|
||||||
__half b1 = get_half2_low(f1);
|
__half b1 = __low2half(f1);
|
||||||
__half b2 = get_half2_high(f1);
|
__half b2 = __high2half(f1);
|
||||||
f0 = combine_half(a1, b1);
|
f0 = __halves2half2(a1, b1);
|
||||||
f1 = combine_half(a2, b2);
|
f1 = __halves2half2(a2, b2);
|
||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
|
||||||
@@ -1254,10 +1217,10 @@ plset<Packet4h2>(const Eigen::half& a) {
|
|||||||
float f = __half2float(a);
|
float f = __half2float(a);
|
||||||
Packet4h2 r;
|
Packet4h2 r;
|
||||||
half2* p_alias = reinterpret_cast<half2*>(&r);
|
half2* p_alias = reinterpret_cast<half2*>(&r);
|
||||||
p_alias[0] = combine_half(a, __float2half(f + 1.0f));
|
p_alias[0] = __halves2half2(a, __float2half(f + 1.0f));
|
||||||
p_alias[1] = combine_half(__float2half(f + 2.0f), __float2half(f + 3.0f));
|
p_alias[1] = __halves2half2(__float2half(f + 2.0f), __float2half(f + 3.0f));
|
||||||
p_alias[2] = combine_half(__float2half(f + 4.0f), __float2half(f + 5.0f));
|
p_alias[2] = __halves2half2(__float2half(f + 4.0f), __float2half(f + 5.0f));
|
||||||
p_alias[3] = combine_half(__float2half(f + 6.0f), __float2half(f + 7.0f));
|
p_alias[3] = __halves2half2(__float2half(f + 6.0f), __float2half(f + 7.0f));
|
||||||
return r;
|
return r;
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
@@ -1477,9 +1440,9 @@ template <>
|
|||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max<Packet4h2>(
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max<Packet4h2>(
|
||||||
const Packet4h2& a) {
|
const Packet4h2& a) {
|
||||||
const half2* a_alias = reinterpret_cast<const half2*>(&a);
|
const half2* a_alias = reinterpret_cast<const half2*>(&a);
|
||||||
half2 m0 = combine_half(predux_max(a_alias[0]),
|
half2 m0 = __halves2half2(predux_max(a_alias[0]),
|
||||||
predux_max(a_alias[1]));
|
predux_max(a_alias[1]));
|
||||||
half2 m1 = combine_half(predux_max(a_alias[2]),
|
half2 m1 = __halves2half2(predux_max(a_alias[2]),
|
||||||
predux_max(a_alias[3]));
|
predux_max(a_alias[3]));
|
||||||
__half first = predux_max(m0);
|
__half first = predux_max(m0);
|
||||||
__half second = predux_max(m1);
|
__half second = predux_max(m1);
|
||||||
@@ -1496,9 +1459,9 @@ template <>
|
|||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min<Packet4h2>(
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min<Packet4h2>(
|
||||||
const Packet4h2& a) {
|
const Packet4h2& a) {
|
||||||
const half2* a_alias = reinterpret_cast<const half2*>(&a);
|
const half2* a_alias = reinterpret_cast<const half2*>(&a);
|
||||||
half2 m0 = combine_half(predux_min(a_alias[0]),
|
half2 m0 = __halves2half2(predux_min(a_alias[0]),
|
||||||
predux_min(a_alias[1]));
|
predux_min(a_alias[1]));
|
||||||
half2 m1 = combine_half(predux_min(a_alias[2]),
|
half2 m1 = __halves2half2(predux_min(a_alias[2]),
|
||||||
predux_min(a_alias[3]));
|
predux_min(a_alias[3]));
|
||||||
__half first = predux_min(m0);
|
__half first = predux_min(m0);
|
||||||
__half second = predux_min(m1);
|
__half second = predux_min(m1);
|
||||||
@@ -1652,9 +1615,9 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a,
|
|||||||
float a2 = __high2float(a);
|
float a2 = __high2float(a);
|
||||||
float b1 = __low2float(b);
|
float b1 = __low2float(b);
|
||||||
float b2 = __high2float(b);
|
float b2 = __high2float(b);
|
||||||
__half r1 = a1 < b1 ? get_half2_low(a) : get_half2_low(b);
|
__half r1 = a1 < b1 ? __low2half(a) : __low2half(b);
|
||||||
__half r2 = a2 < b2 ? get_half2_high(a) : get_half2_high(b);
|
__half r2 = a2 < b2 ? __high2half(a) : __high2half(b);
|
||||||
return combine_half(r1, r2);
|
return __halves2half2(r1, r2);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template<>
|
||||||
@@ -1664,14 +1627,12 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a,
|
|||||||
float a2 = __high2float(a);
|
float a2 = __high2float(a);
|
||||||
float b1 = __low2float(b);
|
float b1 = __low2float(b);
|
||||||
float b2 = __high2float(b);
|
float b2 = __high2float(b);
|
||||||
__half r1 = a1 > b1 ? get_half2_low(a) : get_half2_low(b);
|
__half r1 = a1 > b1 ? __low2half(a) : __low2half(b);
|
||||||
__half r2 = a2 > b2 ? get_half2_high(a) : get_half2_high(b);
|
__half r2 = a2 > b2 ? __high2half(a) : __high2half(b);
|
||||||
return combine_half(r1, r2);
|
return __halves2half2(r1, r2);
|
||||||
}
|
}
|
||||||
|
|
||||||
// #endif // defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
|
#endif // (defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)) && defined(EIGEN_GPU_COMPILE_PHASE)
|
||||||
|
|
||||||
#endif // defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
|
|
||||||
|
|
||||||
#undef EIGEN_GPU_HAS_LDG
|
#undef EIGEN_GPU_HAS_LDG
|
||||||
#undef EIGEN_CUDA_HAS_FP16_ARITHMETIC
|
#undef EIGEN_CUDA_HAS_FP16_ARITHMETIC
|
||||||
|
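The hunks above replace Eigen's local half2 helpers (get_half2_low, get_half2_high, combine_half) with the vendor intrinsics __low2half, __high2half and __halves2half2 from cuda_fp16.h. A minimal device-side sketch of the same lane handling, assuming a CUDA toolchain; the function names here are illustrative, not Eigen's:

#include <cuda_fp16.h>

// Sketch: pick the larger lane of a half2, mirroring the predux_max hunk above.
__device__ __half max_lane(const __half2 v) {
  // Compare in float so the sketch does not require native half arithmetic.
  const float lo = __low2float(v);
  const float hi = __high2float(v);
  return lo > hi ? __low2half(v) : __high2half(v);
}

// Sketch: rebuild a half2 from two scalars, as the pgather/ptranspose hunks now do.
__device__ __half2 pack_halves(const __half a, const __half b) {
  return __halves2half2(a, b);
}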
|||||||
@@ -17,7 +17,6 @@ namespace internal {
|
|||||||
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
|
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
|
||||||
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
|
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
|
||||||
|
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
struct type_casting_traits<Eigen::half, float> {
|
struct type_casting_traits<Eigen::half, float> {
|
||||||
enum {
|
enum {
|
||||||
|
|||||||
@@ -75,15 +75,12 @@ struct Packet2cf {
|
|||||||
EIGEN_STRONG_INLINE Packet2cf operator-(const Packet2cf& b) const {
|
EIGEN_STRONG_INLINE Packet2cf operator-(const Packet2cf& b) const {
|
||||||
return Packet2cf(*this) -= b;
|
return Packet2cf(*this) -= b;
|
||||||
}
|
}
|
||||||
EIGEN_STRONG_INLINE Packet2cf& operator/=(const Packet2cf& b) {
|
|
||||||
*this *= b.conjugate();
|
|
||||||
Packet4f s = pmul<Packet4f>(b.v, b.v);
|
|
||||||
s = padd(s, (Packet4f)__builtin_msa_shf_w((v4i32)s, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
|
|
||||||
v = pdiv(v, s);
|
|
||||||
return *this;
|
|
||||||
}
|
|
||||||
EIGEN_STRONG_INLINE Packet2cf operator/(const Packet2cf& b) const {
|
EIGEN_STRONG_INLINE Packet2cf operator/(const Packet2cf& b) const {
|
||||||
return Packet2cf(*this) /= b;
|
return pdiv_complex(Packet2cf(*this), b);
|
||||||
|
}
|
||||||
|
EIGEN_STRONG_INLINE Packet2cf& operator/=(const Packet2cf& b) {
|
||||||
|
*this = Packet2cf(*this) / b;
|
||||||
|
return *this;
|
||||||
}
|
}
|
||||||
EIGEN_STRONG_INLINE Packet2cf operator-(void) const {
|
EIGEN_STRONG_INLINE Packet2cf operator-(void) const {
|
||||||
return Packet2cf(pnegate(v));
|
return Packet2cf(pnegate(v));
|
||||||
|
|||||||
@@ -129,12 +129,12 @@ template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Pa
|
|||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet1cf pconj(const Packet1cf& a)
|
template<> EIGEN_STRONG_INLINE Packet1cf pconj(const Packet1cf& a)
|
||||||
{
|
{
|
||||||
const Packet2ui b = vreinterpret_u32_f32(a.v);
|
const Packet2ui b = Packet2ui(vreinterpret_u32_f32(a.v));
|
||||||
return Packet1cf(vreinterpret_f32_u32(veor_u32(b, p2ui_CONJ_XOR())));
|
return Packet1cf(vreinterpret_f32_u32(veor_u32(b, p2ui_CONJ_XOR())));
|
||||||
}
|
}
|
||||||
template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a)
|
template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a)
|
||||||
{
|
{
|
||||||
const Packet4ui b = vreinterpretq_u32_f32(a.v);
|
const Packet4ui b = Packet4ui(vreinterpretq_u32_f32(a.v));
|
||||||
return Packet2cf(vreinterpretq_f32_u32(veorq_u32(b, p4ui_CONJ_XOR())));
|
return Packet2cf(vreinterpretq_f32_u32(veorq_u32(b, p4ui_CONJ_XOR())));
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -347,27 +347,11 @@ EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
|
|||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet1cf pdiv<Packet1cf>(const Packet1cf& a, const Packet1cf& b)
|
template<> EIGEN_STRONG_INLINE Packet1cf pdiv<Packet1cf>(const Packet1cf& a, const Packet1cf& b)
|
||||||
{
|
{
|
||||||
// TODO optimize it for NEON
|
return pdiv_complex(a, b);
|
||||||
Packet1cf res = pmul(a, pconj(b));
|
|
||||||
Packet2f s, rev_s;
|
|
||||||
|
|
||||||
// this computes the norm
|
|
||||||
s = vmul_f32(b.v, b.v);
|
|
||||||
rev_s = vrev64_f32(s);
|
|
||||||
|
|
||||||
return Packet1cf(pdiv<Packet2f>(res.v, vadd_f32(s, rev_s)));
|
|
||||||
}
|
}
|
||||||
template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
|
template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
|
||||||
{
|
{
|
||||||
// TODO optimize it for NEON
|
return pdiv_complex(a, b);
|
||||||
Packet2cf res = pmul(a,pconj(b));
|
|
||||||
Packet4f s, rev_s;
|
|
||||||
|
|
||||||
// this computes the norm
|
|
||||||
s = vmulq_f32(b.v, b.v);
|
|
||||||
rev_s = vrev64q_f32(s);
|
|
||||||
|
|
||||||
return Packet2cf(pdiv<Packet4f>(res.v, vaddq_f32(s, rev_s)));
|
|
||||||
}
|
}
|
||||||
|
|
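The hand-written NEON divisions removed in this hunk and the shared pdiv_complex that replaces them both rely on the same identity: a / b = a * conj(b) / |b|^2, with the squared modulus broadcast to the real and imaginary lanes. A scalar sketch of that identity, illustrative only and not Eigen's packet code:

#include <complex>

// Sketch: complex division via the conjugate, the algebra the removed
// vmul/vrev64 sequences vectorized.
std::complex<float> div_by_conjugate(std::complex<float> a, std::complex<float> b) {
  const std::complex<float> num = a * std::conj(b);               // a * conj(b)
  const float denom = b.real() * b.real() + b.imag() * b.imag();  // |b|^2
  return std::complex<float>(num.real() / denom, num.imag() / denom);
}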
||||||
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet1cf, 1>& /*kernel*/) {}
|
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet1cf, 1>& /*kernel*/) {}
|
||||||
@@ -389,13 +373,10 @@ template<> EIGEN_STRONG_INLINE Packet2cf psqrt<Packet2cf>(const Packet2cf& a) {
|
|||||||
//---------- double ----------
|
//---------- double ----------
|
||||||
#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
|
#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
|
||||||
|
|
||||||
// See bug 1325, clang fails to call vld1q_u64.
|
inline uint64x2_t p2ul_CONJ_XOR() {
|
||||||
#if EIGEN_COMP_CLANG || EIGEN_COMP_CASTXML
|
static const uint64_t p2ul_conj_XOR_DATA[] = {0x0, 0x8000000000000000};
|
||||||
static uint64x2_t p2ul_CONJ_XOR = {0x0, 0x8000000000000000};
|
return vld1q_u64(p2ul_conj_XOR_DATA);
|
||||||
#else
|
}
|
||||||
const uint64_t p2ul_conj_XOR_DATA[] = { 0x0, 0x8000000000000000 };
|
|
||||||
static uint64x2_t p2ul_CONJ_XOR = vld1q_u64( p2ul_conj_XOR_DATA );
|
|
||||||
#endif
|
|
||||||
|
|
||||||
struct Packet1cd
|
struct Packet1cd
|
||||||
{
|
{
|
||||||
@@ -465,7 +446,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a)
|
|||||||
{ return Packet1cd(pnegate<Packet2d>(a.v)); }
|
{ return Packet1cd(pnegate<Packet2d>(a.v)); }
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a)
|
template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a)
|
||||||
{ return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v), p2ul_CONJ_XOR))); }
|
{ return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v), p2ul_CONJ_XOR()))); }
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
|
template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
|
||||||
{
|
{
|
||||||
@@ -480,7 +461,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, con
|
|||||||
// Multiply the imag a with b
|
// Multiply the imag a with b
|
||||||
v2 = vmulq_f64(v2, b.v);
|
v2 = vmulq_f64(v2, b.v);
|
||||||
// Conjugate v2
|
// Conjugate v2
|
||||||
v2 = vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(v2), p2ul_CONJ_XOR));
|
v2 = vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(v2), p2ul_CONJ_XOR()));
|
||||||
// Swap real/imag elements in v2.
|
// Swap real/imag elements in v2.
|
||||||
v2 = preverse<Packet2d>(v2);
|
v2 = preverse<Packet2d>(v2);
|
||||||
// Add and return the result
|
// Add and return the result
|
||||||
@@ -553,12 +534,7 @@ EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)
|
|||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
|
template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
|
||||||
{
|
{
|
||||||
// TODO optimize it for NEON
|
return pdiv_complex(a, b);
|
||||||
Packet1cd res = pmul(a,pconj(b));
|
|
||||||
Packet2d s = pmul<Packet2d>(b.v, b.v);
|
|
||||||
Packet2d rev_s = preverse<Packet2d>(s);
|
|
||||||
|
|
||||||
return Packet1cd(pdiv(res.v, padd<Packet2d>(s,rev_s)));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
|
EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
|
||||||
|
|||||||
@@ -24,7 +24,7 @@ struct gebp_traits <float,float,false,false,Architecture::NEON,GEBPPacketFull>
|
|||||||
|
|
||||||
template <typename LaneIdType>
|
template <typename LaneIdType>
|
||||||
EIGEN_STRONG_INLINE void madd(const Packet4f& a, const Packet4f& b,
|
EIGEN_STRONG_INLINE void madd(const Packet4f& a, const Packet4f& b,
|
||||||
Packet4f& c, Packet4f& tmp,
|
Packet4f& c, Packet4f&,
|
||||||
const LaneIdType&) const {
|
const LaneIdType&) const {
|
||||||
acc(a, b, c);
|
acc(a, b, c);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -57,6 +57,16 @@ typedef eigen_packet_wrapper<uint32x4_t ,15> Packet4ui;
|
|||||||
typedef eigen_packet_wrapper<int64x2_t ,16> Packet2l;
|
typedef eigen_packet_wrapper<int64x2_t ,16> Packet2l;
|
||||||
typedef eigen_packet_wrapper<uint64x2_t ,17> Packet2ul;
|
typedef eigen_packet_wrapper<uint64x2_t ,17> Packet2ul;
|
||||||
|
|
||||||
|
EIGEN_ALWAYS_INLINE Packet4f make_packet4f(float a, float b, float c, float d) {
|
||||||
|
float from[4] = {a, b, c, d};
|
||||||
|
return vld1q_f32(from);
|
||||||
|
}
|
||||||
|
|
||||||
|
EIGEN_ALWAYS_INLINE Packet2f make_packet2f(float a, float b) {
|
||||||
|
float from[2] = {a, b};
|
||||||
|
return vld1_f32(from);
|
||||||
|
}
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
typedef float32x2_t Packet2f;
|
typedef float32x2_t Packet2f;
|
||||||
@@ -78,11 +88,22 @@ typedef uint32x4_t Packet4ui;
|
|||||||
typedef int64x2_t Packet2l;
|
typedef int64x2_t Packet2l;
|
||||||
typedef uint64x2_t Packet2ul;
|
typedef uint64x2_t Packet2ul;
|
||||||
|
|
||||||
|
EIGEN_ALWAYS_INLINE Packet4f make_packet4f(float a, float b, float c, float d) {
|
||||||
|
const Packet2f low = {a, b};
|
||||||
|
const Packet2f high = {c, d};
|
||||||
|
return vcombine_f32(low, high);
|
||||||
|
}
|
||||||
|
|
||||||
|
EIGEN_ALWAYS_INLINE Packet2f make_packet2f(float a, float b) {
|
||||||
|
const Packet2f result = {a, b};
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
#endif // EIGEN_COMP_MSVC_STRICT
|
#endif // EIGEN_COMP_MSVC_STRICT
|
||||||
|
|
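Under EIGEN_COMP_MSVC_STRICT the NEON packet types become eigen_packet_wrapper objects, so the brace initializers used previously no longer compile; the new make_packet4f/make_packet2f helpers build the vectors through a load or vcombine_f32 instead, and the shuffle functions below now go through them. A small stand-alone NEON sketch of the load-based construction, assuming a target with arm_neon.h; the helper name is illustrative:

#include <arm_neon.h>

// Sketch: construct a float32x4_t from four scalars without brace-initializing
// the vector type, matching the MSVC-safe make_packet4f above.
float32x4_t make4(float a, float b, float c, float d) {
  const float from[4] = {a, b, c, d};
  return vld1q_f32(from);
}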
||||||
EIGEN_STRONG_INLINE Packet4f shuffle1(const Packet4f& m, int mask){
|
EIGEN_STRONG_INLINE Packet4f shuffle1(const Packet4f& m, int mask){
|
||||||
const float* a = reinterpret_cast<const float*>(&m);
|
const float* a = reinterpret_cast<const float*>(&m);
|
||||||
Packet4f res = {*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3 )), *(a + ((mask >> 6) & 3))};
|
Packet4f res = make_packet4f(*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3 )), *(a + ((mask >> 6) & 3)));
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -95,7 +116,7 @@ EIGEN_STRONG_INLINE Packet4f shuffle2(const Packet4f &m, const Packet4f &n, int
|
|||||||
{
|
{
|
||||||
const float* a = reinterpret_cast<const float*>(&m);
|
const float* a = reinterpret_cast<const float*>(&m);
|
||||||
const float* b = reinterpret_cast<const float*>(&n);
|
const float* b = reinterpret_cast<const float*>(&n);
|
||||||
Packet4f res = {*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(b + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3))};
|
Packet4f res = make_packet4f(*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(b + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3)));
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -104,7 +125,7 @@ EIGEN_STRONG_INLINE Packet4f shuffle2<true>(const Packet4f &m, const Packet4f &n
|
|||||||
{
|
{
|
||||||
const float* a = reinterpret_cast<const float*>(&m);
|
const float* a = reinterpret_cast<const float*>(&m);
|
||||||
const float* b = reinterpret_cast<const float*>(&n);
|
const float* b = reinterpret_cast<const float*>(&n);
|
||||||
Packet4f res = {*(a + (mask & 3)), *(b + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3))};
|
Packet4f res = make_packet4f(*(a + (mask & 3)), *(b + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3)));
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -135,7 +156,7 @@ EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b
|
|||||||
return shuffle2<true>(a,b,eigen_neon_shuffle_mask(2, 2, 3, 3));
|
return shuffle2<true>(a,b,eigen_neon_shuffle_mask(2, 2, 3, 3));
|
||||||
}
|
}
|
||||||
#define vec4f_duplane(a, p) \
|
#define vec4f_duplane(a, p) \
|
||||||
vdupq_lane_f32(vget_low_f32(a), p)
|
Packet4f(vdupq_lane_f32(vget_low_f32(a), p))
|
||||||
|
|
||||||
#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
|
#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
|
||||||
const Packet4f p4f_##NAME = pset1<Packet4f>(X)
|
const Packet4f p4f_##NAME = pset1<Packet4f>(X)
|
||||||
@@ -146,7 +167,7 @@ EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b
|
|||||||
#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
|
#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
|
||||||
const Packet4i p4i_##NAME = pset1<Packet4i>(X)
|
const Packet4i p4i_##NAME = pset1<Packet4i>(X)
|
||||||
|
|
||||||
#if EIGEN_ARCH_ARM64
|
#if EIGEN_ARCH_ARM64 && EIGEN_COMP_GNUC
|
||||||
// __builtin_prefetch tends to do nothing on ARM64 compilers because the
|
// __builtin_prefetch tends to do nothing on ARM64 compilers because the
|
||||||
// prefetch instructions there are too detailed for __builtin_prefetch to map
|
// prefetch instructions there are too detailed for __builtin_prefetch to map
|
||||||
// meaningfully to them.
|
// meaningfully to them.
|
||||||
@@ -155,7 +176,7 @@ EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b
|
|||||||
#define EIGEN_ARM_PREFETCH(ADDR) __builtin_prefetch(ADDR);
|
#define EIGEN_ARM_PREFETCH(ADDR) __builtin_prefetch(ADDR);
|
||||||
#elif defined __pld
|
#elif defined __pld
|
||||||
#define EIGEN_ARM_PREFETCH(ADDR) __pld(ADDR)
|
#define EIGEN_ARM_PREFETCH(ADDR) __pld(ADDR)
|
||||||
#elif EIGEN_ARCH_ARM32
|
#elif EIGEN_ARCH_ARM
|
||||||
#define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ("pld [%[addr]]\n" :: [addr] "r" (ADDR) : );
|
#define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ("pld [%[addr]]\n" :: [addr] "r" (ADDR) : );
|
||||||
#else
|
#else
|
||||||
// by default no explicit prefetching
|
// by default no explicit prefetching
|
||||||
@@ -862,12 +883,12 @@ template<> EIGEN_STRONG_INLINE Packet2ul psub<Packet2ul>(const Packet2ul& a, con
|
|||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet2f pxor<Packet2f>(const Packet2f& a, const Packet2f& b);
|
template<> EIGEN_STRONG_INLINE Packet2f pxor<Packet2f>(const Packet2f& a, const Packet2f& b);
|
||||||
template<> EIGEN_STRONG_INLINE Packet2f paddsub<Packet2f>(const Packet2f& a, const Packet2f & b) {
|
template<> EIGEN_STRONG_INLINE Packet2f paddsub<Packet2f>(const Packet2f& a, const Packet2f & b) {
|
||||||
Packet2f mask = {numext::bit_cast<float>(0x80000000u), 0.0f};
|
Packet2f mask = make_packet2f(numext::bit_cast<float>(0x80000000u), 0.0f);
|
||||||
return padd(a, pxor(mask, b));
|
return padd(a, pxor(mask, b));
|
||||||
}
|
}
|
||||||
template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b);
|
template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b);
|
||||||
template<> EIGEN_STRONG_INLINE Packet4f paddsub<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
template<> EIGEN_STRONG_INLINE Packet4f paddsub<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
||||||
Packet4f mask = {numext::bit_cast<float>(0x80000000u), 0.0f, numext::bit_cast<float>(0x80000000u), 0.0f};
|
Packet4f mask = make_packet4f(numext::bit_cast<float>(0x80000000u), 0.0f, numext::bit_cast<float>(0x80000000u), 0.0f);
|
||||||
return padd(a, pxor(mask, b));
|
return padd(a, pxor(mask, b));
|
||||||
}
|
}
|
||||||
|
|
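In the paddsub hunk above, the mask built with make_packet2f/make_packet4f has only the sign bit set in the even lanes, so pxor(mask, b) negates b in those lanes and padd yields {a0 - b0, a1 + b1, ...}. A scalar sketch of the sign-bit trick, illustrative only:

#include <cstdint>
#include <cstring>

// Sketch: negate a float by toggling its IEEE-754 sign bit, which is what
// pxor(mask, b) does lane-wise when mask carries 0x80000000 in that lane.
float flip_sign(float x) {
  uint32_t bits;
  std::memcpy(&bits, &x, sizeof(bits));
  bits ^= 0x80000000u;
  std::memcpy(&x, &bits, sizeof(bits));
  return x;
}
// paddsub({a0, a1}, {b0, b1}) then evaluates to {a0 + flip_sign(b0), a1 + b1}.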
||||||
@@ -947,57 +968,6 @@ template<> EIGEN_STRONG_INLINE Packet2ul pmul<Packet2ul>(const Packet2ul& a, con
|
|||||||
vdup_n_u64(vgetq_lane_u64(a, 1)*vgetq_lane_u64(b, 1)));
|
vdup_n_u64(vgetq_lane_u64(a, 1)*vgetq_lane_u64(b, 1)));
|
||||||
}
|
}
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet2f pdiv<Packet2f>(const Packet2f& a, const Packet2f& b)
|
|
||||||
{
|
|
||||||
#if EIGEN_ARCH_ARM64
|
|
||||||
return vdiv_f32(a,b);
|
|
||||||
#else
|
|
||||||
Packet2f inv, restep, div;
|
|
||||||
|
|
||||||
// NEON does not offer a divide instruction, we have to do a reciprocal approximation
|
|
||||||
// However NEON in contrast to other SIMD engines (AltiVec/SSE), offers
|
|
||||||
// a reciprocal estimate AND a reciprocal step -which saves a few instructions
|
|
||||||
// vrecpeq_f32() returns an estimate to 1/b, which we will finetune with
|
|
||||||
// Newton-Raphson and vrecpsq_f32()
|
|
||||||
inv = vrecpe_f32(b);
|
|
||||||
|
|
||||||
// This returns a differential, by which we will have to multiply inv to get a better
|
|
||||||
// approximation of 1/b.
|
|
||||||
restep = vrecps_f32(b, inv);
|
|
||||||
inv = vmul_f32(restep, inv);
|
|
||||||
|
|
||||||
// Finally, multiply a by 1/b and get the wanted result of the division.
|
|
||||||
div = vmul_f32(a, inv);
|
|
||||||
|
|
||||||
return div;
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
|
|
||||||
{
|
|
||||||
#if EIGEN_ARCH_ARM64
|
|
||||||
return vdivq_f32(a,b);
|
|
||||||
#else
|
|
||||||
Packet4f inv, restep, div;
|
|
||||||
|
|
||||||
// NEON does not offer a divide instruction, we have to do a reciprocal approximation
|
|
||||||
// However NEON in contrast to other SIMD engines (AltiVec/SSE), offers
|
|
||||||
// a reciprocal estimate AND a reciprocal step -which saves a few instructions
|
|
||||||
// vrecpeq_f32() returns an estimate to 1/b, which we will finetune with
|
|
||||||
// Newton-Raphson and vrecpsq_f32()
|
|
||||||
inv = vrecpeq_f32(b);
|
|
||||||
|
|
||||||
// This returns a differential, by which we will have to multiply inv to get a better
|
|
||||||
// approximation of 1/b.
|
|
||||||
restep = vrecpsq_f32(b, inv);
|
|
||||||
inv = vmulq_f32(restep, inv);
|
|
||||||
|
|
||||||
// Finally, multiply a by 1/b and get the wanted result of the division.
|
|
||||||
div = vmulq_f32(a, inv);
|
|
||||||
|
|
||||||
return div;
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet4c pdiv<Packet4c>(const Packet4c& /*a*/, const Packet4c& /*b*/)
|
template<> EIGEN_STRONG_INLINE Packet4c pdiv<Packet4c>(const Packet4c& /*a*/, const Packet4c& /*b*/)
|
||||||
{
|
{
|
||||||
eigen_assert(false && "packet integer division is not supported by NEON");
|
eigen_assert(false && "packet integer division is not supported by NEON");
|
||||||
@@ -1079,12 +1049,15 @@ template<> EIGEN_STRONG_INLINE Packet2ul pdiv<Packet2ul>(const Packet2ul& /*a*/,
|
|||||||
return pset1<Packet2ul>(0ULL);
|
return pset1<Packet2ul>(0ULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef EIGEN_VECTORIZE_FMA
|
||||||
#ifdef __ARM_FEATURE_FMA
|
template <>
|
||||||
template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
|
EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
|
||||||
{ return vfmaq_f32(c,a,b); }
|
return vfmaq_f32(c, a, b);
|
||||||
template<> EIGEN_STRONG_INLINE Packet2f pmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c)
|
}
|
||||||
{ return vfma_f32(c,a,b); }
|
template <>
|
||||||
|
EIGEN_STRONG_INLINE Packet2f pmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c) {
|
||||||
|
return vfma_f32(c, a, b);
|
||||||
|
}
|
||||||
#else
|
#else
|
||||||
template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
|
template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
|
||||||
{
|
{
|
||||||
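The FMA hunk above only changes the guard (EIGEN_VECTORIZE_FMA instead of the raw __ARM_FEATURE_FMA macro) and the layout; the intrinsic itself is unchanged, and vfmaq_f32(c, a, b) computes a * b + c as a single fused operation. A minimal NEON sketch, assuming an FMA-capable target:

#include <arm_neon.h>

// Sketch: fused multiply-add per lane; note the accumulator is the first argument.
float32x4_t fma4(float32x4_t a, float32x4_t b, float32x4_t c) {
  return vfmaq_f32(c, a, b);  // a * b + c
}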
@@ -2499,7 +2472,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us predux_half_dowto4(co
|
|||||||
template<> EIGEN_STRONG_INLINE float predux_mul<Packet2f>(const Packet2f& a)
|
template<> EIGEN_STRONG_INLINE float predux_mul<Packet2f>(const Packet2f& a)
|
||||||
{ return vget_lane_f32(a, 0) * vget_lane_f32(a, 1); }
|
{ return vget_lane_f32(a, 0) * vget_lane_f32(a, 1); }
|
||||||
template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
|
template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
|
||||||
{ return predux_mul(vmul_f32(vget_low_f32(a), vget_high_f32(a))); }
|
{ return predux_mul<Packet2f>(vmul_f32(vget_low_f32(a), vget_high_f32(a))); }
|
||||||
template<> EIGEN_STRONG_INLINE int8_t predux_mul<Packet4c>(const Packet4c& a)
|
template<> EIGEN_STRONG_INLINE int8_t predux_mul<Packet4c>(const Packet4c& a)
|
||||||
{
|
{
|
||||||
int8x8_t prod = vreinterpret_s8_s32(vdup_n_s32(a));
|
int8x8_t prod = vreinterpret_s8_s32(vdup_n_s32(a));
|
||||||
@@ -2513,7 +2486,7 @@ template<> EIGEN_STRONG_INLINE int8_t predux_mul<Packet8c>(const Packet8c& a)
|
|||||||
return vget_lane_s8(prod, 0) * vget_lane_s8(prod, 4);
|
return vget_lane_s8(prod, 0) * vget_lane_s8(prod, 4);
|
||||||
}
|
}
|
||||||
template<> EIGEN_STRONG_INLINE int8_t predux_mul<Packet16c>(const Packet16c& a)
|
template<> EIGEN_STRONG_INLINE int8_t predux_mul<Packet16c>(const Packet16c& a)
|
||||||
{ return predux_mul(vmul_s8(vget_low_s8(a), vget_high_s8(a))); }
|
{ return predux_mul<Packet8c>(vmul_s8(vget_low_s8(a), vget_high_s8(a))); }
|
||||||
template<> EIGEN_STRONG_INLINE uint8_t predux_mul<Packet4uc>(const Packet4uc& a)
|
template<> EIGEN_STRONG_INLINE uint8_t predux_mul<Packet4uc>(const Packet4uc& a)
|
||||||
{
|
{
|
||||||
uint8x8_t prod = vreinterpret_u8_u32(vdup_n_u32(a));
|
uint8x8_t prod = vreinterpret_u8_u32(vdup_n_u32(a));
|
||||||
@@ -2527,7 +2500,7 @@ template<> EIGEN_STRONG_INLINE uint8_t predux_mul<Packet8uc>(const Packet8uc& a)
|
|||||||
return vget_lane_u8(prod, 0) * vget_lane_u8(prod, 4);
|
return vget_lane_u8(prod, 0) * vget_lane_u8(prod, 4);
|
||||||
}
|
}
|
||||||
template<> EIGEN_STRONG_INLINE uint8_t predux_mul<Packet16uc>(const Packet16uc& a)
|
template<> EIGEN_STRONG_INLINE uint8_t predux_mul<Packet16uc>(const Packet16uc& a)
|
||||||
{ return predux_mul(vmul_u8(vget_low_u8(a), vget_high_u8(a))); }
|
{ return predux_mul<Packet8uc>(vmul_u8(vget_low_u8(a), vget_high_u8(a))); }
|
||||||
template<> EIGEN_STRONG_INLINE int16_t predux_mul<Packet4s>(const Packet4s& a)
|
template<> EIGEN_STRONG_INLINE int16_t predux_mul<Packet4s>(const Packet4s& a)
|
||||||
{
|
{
|
||||||
const int16x4_t prod = vmul_s16(a, vrev32_s16(a));
|
const int16x4_t prod = vmul_s16(a, vrev32_s16(a));
|
||||||
@@ -2563,11 +2536,11 @@ template<> EIGEN_STRONG_INLINE uint16_t predux_mul<Packet8us>(const Packet8us& a
|
|||||||
template<> EIGEN_STRONG_INLINE int32_t predux_mul<Packet2i>(const Packet2i& a)
|
template<> EIGEN_STRONG_INLINE int32_t predux_mul<Packet2i>(const Packet2i& a)
|
||||||
{ return vget_lane_s32(a, 0) * vget_lane_s32(a, 1); }
|
{ return vget_lane_s32(a, 0) * vget_lane_s32(a, 1); }
|
||||||
template<> EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(const Packet4i& a)
|
template<> EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(const Packet4i& a)
|
||||||
{ return predux_mul(vmul_s32(vget_low_s32(a), vget_high_s32(a))); }
|
{ return predux_mul<Packet2i>(vmul_s32(vget_low_s32(a), vget_high_s32(a))); }
|
||||||
template<> EIGEN_STRONG_INLINE uint32_t predux_mul<Packet2ui>(const Packet2ui& a)
|
template<> EIGEN_STRONG_INLINE uint32_t predux_mul<Packet2ui>(const Packet2ui& a)
|
||||||
{ return vget_lane_u32(a, 0) * vget_lane_u32(a, 1); }
|
{ return vget_lane_u32(a, 0) * vget_lane_u32(a, 1); }
|
||||||
template<> EIGEN_STRONG_INLINE uint32_t predux_mul<Packet4ui>(const Packet4ui& a)
|
template<> EIGEN_STRONG_INLINE uint32_t predux_mul<Packet4ui>(const Packet4ui& a)
|
||||||
{ return predux_mul(vmul_u32(vget_low_u32(a), vget_high_u32(a))); }
|
{ return predux_mul<Packet2ui>(vmul_u32(vget_low_u32(a), vget_high_u32(a))); }
|
||||||
template<> EIGEN_STRONG_INLINE int64_t predux_mul<Packet2l>(const Packet2l& a)
|
template<> EIGEN_STRONG_INLINE int64_t predux_mul<Packet2l>(const Packet2l& a)
|
||||||
{ return vgetq_lane_s64(a, 0) * vgetq_lane_s64(a, 1); }
|
{ return vgetq_lane_s64(a, 0) * vgetq_lane_s64(a, 1); }
|
||||||
template<> EIGEN_STRONG_INLINE uint64_t predux_mul<Packet2ul>(const Packet2ul& a)
|
template<> EIGEN_STRONG_INLINE uint64_t predux_mul<Packet2ul>(const Packet2ul& a)
|
||||||
@@ -3180,7 +3153,7 @@ template<> EIGEN_STRONG_INLINE Packet2f pceil<Packet2f>(const Packet2f& a)
|
|||||||
return padd(tmp, mask);
|
return padd(tmp, mask);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif // EIGEN_ARCH_ARMV8
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Computes the integer square root
|
* Computes the integer square root
|
||||||
@@ -3273,40 +3246,115 @@ template<> EIGEN_STRONG_INLINE Packet4ui psqrt(const Packet4ui& a) {
|
|||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet4f prsqrt(const Packet4f& a) {
|
EIGEN_STRONG_INLINE Packet4f prsqrt_float_unsafe(const Packet4f& a) {
|
||||||
// Compute approximate reciprocal sqrt.
|
// Compute approximate reciprocal sqrt.
|
||||||
Packet4f x = vrsqrteq_f32(a);
|
// Does not correctly handle +/- 0 or +inf
|
||||||
// Do Newton iterations for 1/sqrt(x).
|
float32x4_t result = vrsqrteq_f32(a);
|
||||||
x = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, x), x), x);
|
result = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, result), result), result);
|
||||||
x = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, x), x), x);
|
result = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, result), result), result);
|
||||||
const Packet4f infinity = pset1<Packet4f>(NumTraits<float>::infinity());
|
return result;
|
||||||
return pselect(pcmp_eq(a, pzero(a)), infinity, x);
|
}
|
||||||
|
|
||||||
|
EIGEN_STRONG_INLINE Packet2f prsqrt_float_unsafe(const Packet2f& a) {
|
||||||
|
// Compute approximate reciprocal sqrt.
|
||||||
|
// Does not correctly handle +/- 0 or +inf
|
||||||
|
float32x2_t result = vrsqrte_f32(a);
|
||||||
|
result = vmul_f32(vrsqrts_f32(vmul_f32(a, result), result), result);
|
||||||
|
result = vmul_f32(vrsqrts_f32(vmul_f32(a, result), result), result);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename Packet> Packet prsqrt_float_common(const Packet& a) {
|
||||||
|
const Packet cst_zero = pzero(a);
|
||||||
|
const Packet cst_inf = pset1<Packet>(NumTraits<float>::infinity());
|
||||||
|
Packet return_zero = pcmp_eq(a, cst_inf);
|
||||||
|
Packet return_inf = pcmp_eq(a, cst_zero);
|
||||||
|
Packet result = prsqrt_float_unsafe(a);
|
||||||
|
result = pselect(return_inf, por(cst_inf, a), result);
|
||||||
|
result = pandnot(result, return_zero);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet4f prsqrt(const Packet4f& a) {
|
||||||
|
return prsqrt_float_common(a);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet2f prsqrt(const Packet2f& a) {
|
template<> EIGEN_STRONG_INLINE Packet2f prsqrt(const Packet2f& a) {
|
||||||
// Compute approximate reciprocal sqrt.
|
return prsqrt_float_common(a);
|
||||||
Packet2f x = vrsqrte_f32(a);
|
}
|
||||||
// Do Newton iterations for 1/sqrt(x).
|
|
||||||
x = vmul_f32(vrsqrts_f32(vmul_f32(a, x), x), x);
|
EIGEN_STRONG_INLINE Packet4f preciprocal(const Packet4f& a)
|
||||||
x = vmul_f32(vrsqrts_f32(vmul_f32(a, x), x), x);
|
{
|
||||||
const Packet2f infinity = pset1<Packet2f>(NumTraits<float>::infinity());
|
// Compute approximate reciprocal.
|
||||||
return pselect(pcmp_eq(a, pzero(a)), infinity, x);
|
float32x4_t result = vrecpeq_f32(a);
|
||||||
|
result = vmulq_f32(vrecpsq_f32(a, result), result);
|
||||||
|
result = vmulq_f32(vrecpsq_f32(a, result), result);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
EIGEN_STRONG_INLINE Packet2f preciprocal(const Packet2f& a)
|
||||||
|
{
|
||||||
|
// Compute approximate reciprocal.
|
||||||
|
float32x2_t result = vrecpe_f32(a);
|
||||||
|
result = vmul_f32(vrecps_f32(a, result), result);
|
||||||
|
result = vmul_f32(vrecps_f32(a, result), result);
|
||||||
|
return result;
|
||||||
}
|
}
|
||||||
|
|
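The unsafe reciprocal-square-root helpers and the preciprocal functions above refine a hardware estimate with two Newton-Raphson steps: vrsqrts(x, y) returns (3 - x*y)/2, so each refinement line computes x_{n+1} = x_n * (3 - a * x_n^2) / 2, while vrecps(a, x) returns 2 - a*x for the plain reciprocal. A scalar sketch of both iterations, illustrative only:

#include <cmath>

// Sketch: two Newton-Raphson refinements of an initial 1/sqrt(a) estimate,
// the scalar analogue of vrsqrte followed by two vrsqrts/vmul pairs.
float rsqrt_newton(float a, float estimate) {
  float x = estimate;
  x = x * (3.0f - a * x * x) * 0.5f;
  x = x * (3.0f - a * x * x) * 0.5f;
  return x;  // like the "unsafe" helpers, no special-casing of 0 or inf
}

// Sketch: the matching refinement of an initial 1/a estimate used by preciprocal.
float recip_newton(float a, float estimate) {
  float x = estimate;
  x = x * (2.0f - a * x);
  x = x * (2.0f - a * x);
  return x;
}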
||||||
// Unfortunately vsqrt_f32 is only available for A64.
|
// Unfortunately vsqrt_f32 is only available for A64.
|
||||||
#if EIGEN_ARCH_ARM64
|
#if EIGEN_ARCH_ARM64
|
||||||
template<> EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& _x){return vsqrtq_f32(_x);}
|
template<> EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) { return vsqrtq_f32(a); }
|
||||||
template<> EIGEN_STRONG_INLINE Packet2f psqrt(const Packet2f& _x){return vsqrt_f32(_x); }
|
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet2f psqrt(const Packet2f& a) { return vsqrt_f32(a); }
|
||||||
|
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) { return vdivq_f32(a, b); }
|
||||||
|
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet2f pdiv(const Packet2f& a, const Packet2f& b) { return vdiv_f32(a, b); }
|
||||||
#else
|
#else
|
||||||
template<> EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) {
|
template<typename Packet>
|
||||||
const Packet4f infinity = pset1<Packet4f>(NumTraits<float>::infinity());
|
EIGEN_STRONG_INLINE Packet psqrt_float_common(const Packet& a) {
|
||||||
const Packet4f is_zero_or_inf = por(pcmp_eq(a, pzero(a)), pcmp_eq(a, infinity));
|
const Packet cst_zero = pzero(a);
|
||||||
return pselect(is_zero_or_inf, a, pmul(a, prsqrt(a)));
|
const Packet cst_inf = pset1<Packet>(NumTraits<float>::infinity());
|
||||||
|
|
||||||
|
Packet result = pmul(a, prsqrt_float_unsafe(a));
|
||||||
|
Packet a_is_zero = pcmp_eq(a, cst_zero);
|
||||||
|
Packet a_is_inf = pcmp_eq(a, cst_inf);
|
||||||
|
Packet return_a = por(a_is_zero, a_is_inf);
|
||||||
|
|
||||||
|
result = pselect(return_a, a, result);
|
||||||
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) {
|
||||||
|
return psqrt_float_common(a);
|
||||||
|
}
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet2f psqrt(const Packet2f& a) {
|
template<> EIGEN_STRONG_INLINE Packet2f psqrt(const Packet2f& a) {
|
||||||
const Packet2f infinity = pset1<Packet2f>(NumTraits<float>::infinity());
|
return psqrt_float_common(a);
|
||||||
const Packet2f is_zero_or_inf = por(pcmp_eq(a, pzero(a)), pcmp_eq(a, infinity));
|
}
|
||||||
return pselect(is_zero_or_inf, a, pmul(a, prsqrt(a)));
|
|
||||||
|
template<typename Packet>
|
||||||
|
EIGEN_STRONG_INLINE Packet pdiv_float_common(const Packet& a, const Packet& b) {
|
||||||
|
// if b is large, NEON intrinsics will flush preciprocal(b) to zero
|
||||||
|
// avoid underflow with the following manipulation:
|
||||||
|
// a / b = f * (a * reciprocal(f * b))
|
||||||
|
|
||||||
|
const Packet cst_one = pset1<Packet>(1.0f);
|
||||||
|
const Packet cst_quarter = pset1<Packet>(0.25f);
|
||||||
|
const Packet cst_thresh = pset1<Packet>(NumTraits<float>::highest() / 4.0f);
|
||||||
|
|
||||||
|
Packet b_will_underflow = pcmp_le(cst_thresh, pabs(b));
|
||||||
|
Packet f = pselect(b_will_underflow, cst_quarter, cst_one);
|
||||||
|
Packet result = pmul(f, pmul(a, preciprocal(pmul(b, f))));
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
|
||||||
|
return pdiv_float_common(a, b);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet2f pdiv<Packet2f>(const Packet2f& a, const Packet2f& b) {
|
||||||
|
return pdiv_float_common(a, b);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
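On 32-bit ARM, where vdivq_f32 is unavailable, the new pdiv_float_common divides through preciprocal; for |b| close to FLT_MAX the reciprocal estimate would flush to zero, so both the reciprocal argument and the result are scaled by f = 0.25, which is exact in binary floating point and leaves a / b = f * (a * (1 / (f * b))) unchanged. A scalar sketch of the rescaling, using the same threshold as the code above:

#include <cmath>
#include <limits>

// Sketch: rescale a huge divisor before taking its reciprocal, then undo the scale.
float div_via_reciprocal(float a, float b) {
  const float thresh = std::numeric_limits<float>::max() / 4.0f;
  const float f = (std::fabs(b) >= thresh) ? 0.25f : 1.0f;
  return f * (a * (1.0f / (f * b)));  // algebraically equal to a / b
}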
||||||
@@ -3388,7 +3436,7 @@ EIGEN_STRONG_INLINE Packet4bf F32ToBf16(const Packet4f& p)
|
|||||||
{
|
{
|
||||||
// See the scalar implementation in BFloat16.h for a comprehensible explanation
|
// See the scalar implementation in BFloat16.h for a comprehensible explanation
|
||||||
// of this fast rounding algorithm
|
// of this fast rounding algorithm
|
||||||
Packet4ui input = reinterpret_cast<Packet4ui>(p);
|
Packet4ui input = Packet4ui(vreinterpretq_u32_f32(p));
|
||||||
|
|
||||||
// lsb = (input >> 16) & 1
|
// lsb = (input >> 16) & 1
|
||||||
Packet4ui lsb = vandq_u32(vshrq_n_u32(input, 16), vdupq_n_u32(1));
|
Packet4ui lsb = vandq_u32(vshrq_n_u32(input, 16), vdupq_n_u32(1));
|
||||||
@@ -3413,7 +3461,7 @@ EIGEN_STRONG_INLINE Packet4bf F32ToBf16(const Packet4f& p)
|
|||||||
|
|
||||||
EIGEN_STRONG_INLINE Packet4f Bf16ToF32(const Packet4bf& p)
|
EIGEN_STRONG_INLINE Packet4f Bf16ToF32(const Packet4bf& p)
|
||||||
{
|
{
|
||||||
return reinterpret_cast<Packet4f>(vshlq_n_u32(vmovl_u16(p), 16));
|
return Packet4f(vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(p), 16)));
|
||||||
}
|
}
|
||||||
|
|
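F32ToBf16 above vectorizes the usual round-to-nearest-even truncation: keep the upper 16 bits of the float after adding 0x7FFF plus the lowest surviving mantissa bit, so that exact ties round to an even result. A scalar sketch of the same arithmetic, assuming a finite, non-NaN input (NaN needs separate handling):

#include <cstdint>
#include <cstring>

// Sketch: float -> bfloat16 bits with round-to-nearest-even,
// mirroring the lsb / rounding-bias computation in F32ToBf16.
uint16_t float_to_bf16_rne(float value) {
  uint32_t input;
  std::memcpy(&input, &value, sizeof(input));
  const uint32_t lsb = (input >> 16) & 1u;  // lowest bit kept after truncation
  input += 0x7FFFu + lsb;                   // bias; carries implement ties-to-even
  return static_cast<uint16_t>(input >> 16);
}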
||||||
EIGEN_STRONG_INLINE Packet4bf F32MaskToBf16Mask(const Packet4f& p) {
|
EIGEN_STRONG_INLINE Packet4bf F32MaskToBf16Mask(const Packet4f& p) {
|
||||||
@@ -3421,21 +3469,21 @@ EIGEN_STRONG_INLINE Packet4bf F32MaskToBf16Mask(const Packet4f& p) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet4bf pset1<Packet4bf>(const bfloat16& from) {
|
template<> EIGEN_STRONG_INLINE Packet4bf pset1<Packet4bf>(const bfloat16& from) {
|
||||||
return pset1<Packet4us>(from.value);
|
return Packet4bf(pset1<Packet4us>(from.value));
|
||||||
}
|
}
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE bfloat16 pfirst<Packet4bf>(const Packet4bf& from) {
|
template<> EIGEN_STRONG_INLINE bfloat16 pfirst<Packet4bf>(const Packet4bf& from) {
|
||||||
return bfloat16_impl::raw_uint16_to_bfloat16(static_cast<uint16_t>(pfirst<Packet4us>(from)));
|
return bfloat16_impl::raw_uint16_to_bfloat16(static_cast<uint16_t>(pfirst<Packet4us>(Packet4us(from))));
|
||||||
}
|
}
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet4bf pload<Packet4bf>(const bfloat16* from)
|
template<> EIGEN_STRONG_INLINE Packet4bf pload<Packet4bf>(const bfloat16* from)
|
||||||
{
|
{
|
||||||
return pload<Packet4us>(reinterpret_cast<const uint16_t*>(from));
|
return Packet4bf(pload<Packet4us>(reinterpret_cast<const uint16_t*>(from)));
|
||||||
}
|
}
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet4bf ploadu<Packet4bf>(const bfloat16* from)
|
template<> EIGEN_STRONG_INLINE Packet4bf ploadu<Packet4bf>(const bfloat16* from)
|
||||||
{
|
{
|
||||||
return ploadu<Packet4us>(reinterpret_cast<const uint16_t*>(from));
|
return Packet4bf(ploadu<Packet4us>(reinterpret_cast<const uint16_t*>(from)));
|
||||||
}
|
}
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet4bf& from)
|
template<> EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet4bf& from)
|
||||||
@@ -3450,7 +3498,7 @@ template<> EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to, const Packet
|
|||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet4bf ploaddup<Packet4bf>(const bfloat16* from)
|
template<> EIGEN_STRONG_INLINE Packet4bf ploaddup<Packet4bf>(const bfloat16* from)
|
||||||
{
|
{
|
||||||
return ploaddup<Packet4us>(reinterpret_cast<const uint16_t*>(from));
|
return Packet4bf(ploaddup<Packet4us>(reinterpret_cast<const uint16_t*>(from)));
|
||||||
}
|
}
|
||||||
|
|
||||||
template <> EIGEN_STRONG_INLINE Packet4bf pabs(const Packet4bf& a) {
|
template <> EIGEN_STRONG_INLINE Packet4bf pabs(const Packet4bf& a) {
|
||||||
@@ -3497,25 +3545,25 @@ template<> EIGEN_STRONG_INLINE Packet4bf plset<Packet4bf>(const bfloat16& a)
|
|||||||
}
|
}
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet4bf por(const Packet4bf& a,const Packet4bf& b) {
|
template<> EIGEN_STRONG_INLINE Packet4bf por(const Packet4bf& a,const Packet4bf& b) {
|
||||||
return por<Packet4us>(a, b);
|
return Packet4bf(por<Packet4us>(Packet4us(a), Packet4us(b)));
|
||||||
}
|
}
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet4bf pxor(const Packet4bf& a,const Packet4bf& b) {
|
template<> EIGEN_STRONG_INLINE Packet4bf pxor(const Packet4bf& a,const Packet4bf& b) {
|
||||||
return pxor<Packet4us>(a, b);
|
return Packet4bf(pxor<Packet4us>(Packet4us(a), Packet4us(b)));
|
||||||
}
|
}
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet4bf pand(const Packet4bf& a,const Packet4bf& b) {
|
template<> EIGEN_STRONG_INLINE Packet4bf pand(const Packet4bf& a,const Packet4bf& b) {
|
||||||
return pand<Packet4us>(a, b);
|
return Packet4bf(pand<Packet4us>(Packet4us(a), Packet4us(b)));
|
||||||
}
|
}
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet4bf pandnot(const Packet4bf& a,const Packet4bf& b) {
|
template<> EIGEN_STRONG_INLINE Packet4bf pandnot(const Packet4bf& a,const Packet4bf& b) {
|
||||||
return pandnot<Packet4us>(a, b);
|
return Packet4bf(pandnot<Packet4us>(Packet4us(a), Packet4us(b)));
|
||||||
}
|
}
|
||||||
|
|
||||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4bf pselect(const Packet4bf& mask, const Packet4bf& a,
|
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4bf pselect(const Packet4bf& mask, const Packet4bf& a,
|
||||||
const Packet4bf& b)
|
const Packet4bf& b)
|
||||||
{
|
{
|
||||||
return pselect<Packet4us>(mask, a, b);
|
return Packet4bf(pselect<Packet4us>(Packet4us(mask), Packet4us(a), Packet4us(b)));
|
||||||
}
|
}
|
||||||
|
|
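Packet4bf is no longer interchangeable with Packet4us in this version, so the bitwise and select kernels above reuse the uint16 implementations through explicit conversions in both directions. A stand-alone sketch of the same wrapper pattern, with an illustrative type that is not Eigen's eigen_packet_wrapper:

#include <arm_neon.h>

// Sketch: a strong wrapper over uint16x4_t that only converts explicitly,
// which is why the hunks above spell out Packet4us(a) and Packet4bf(...).
struct Bf16Vec {
  explicit Bf16Vec(uint16x4_t raw) : value(raw) {}
  explicit operator uint16x4_t() const { return value; }
  uint16x4_t value;
};

Bf16Vec bitwise_or(const Bf16Vec& a, const Bf16Vec& b) {
  return Bf16Vec(vorr_u16(static_cast<uint16x4_t>(a), static_cast<uint16x4_t>(b)));
}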
||||||
template<> EIGEN_STRONG_INLINE Packet4bf print<Packet4bf>(const Packet4bf& a)
|
template<> EIGEN_STRONG_INLINE Packet4bf print<Packet4bf>(const Packet4bf& a)
|
||||||
@@ -3554,13 +3602,13 @@ template<> EIGEN_STRONG_INLINE Packet4bf pdiv<Packet4bf>(const Packet4bf& a, con
|
|||||||
template<>
|
template<>
|
||||||
EIGEN_STRONG_INLINE Packet4bf pgather<bfloat16, Packet4bf>(const bfloat16* from, Index stride)
|
EIGEN_STRONG_INLINE Packet4bf pgather<bfloat16, Packet4bf>(const bfloat16* from, Index stride)
|
||||||
{
|
{
|
||||||
return pgather<uint16_t, Packet4us>(reinterpret_cast<const uint16_t*>(from), stride);
|
return Packet4bf(pgather<uint16_t, Packet4us>(reinterpret_cast<const uint16_t*>(from), stride));
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template<>
|
||||||
EIGEN_STRONG_INLINE void pscatter<bfloat16, Packet4bf>(bfloat16* to, const Packet4bf& from, Index stride)
|
EIGEN_STRONG_INLINE void pscatter<bfloat16, Packet4bf>(bfloat16* to, const Packet4bf& from, Index stride)
|
||||||
{
|
{
|
||||||
pscatter<uint16_t, Packet4us>(reinterpret_cast<uint16_t*>(to), from, stride);
|
pscatter<uint16_t, Packet4us>(reinterpret_cast<uint16_t*>(to), Packet4us(from), stride);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE bfloat16 predux<Packet4bf>(const Packet4bf& a)
|
template<> EIGEN_STRONG_INLINE bfloat16 predux<Packet4bf>(const Packet4bf& a)
|
||||||
@@ -3585,7 +3633,7 @@ template<> EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet4bf>(const Packet4bf& a
|
|||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet4bf preverse<Packet4bf>(const Packet4bf& a)
|
template<> EIGEN_STRONG_INLINE Packet4bf preverse<Packet4bf>(const Packet4bf& a)
|
||||||
{
|
{
|
||||||
return preverse<Packet4us>(a);
|
return Packet4bf(preverse<Packet4us>(Packet4us(a)));
|
||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4bf, 4>& kernel)
|
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4bf, 4>& kernel)
|
||||||
@@ -3620,7 +3668,7 @@ template<> EIGEN_STRONG_INLINE Packet4bf pcmp_le<Packet4bf>(const Packet4bf& a,
|
|||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet4bf pnegate<Packet4bf>(const Packet4bf& a)
|
template<> EIGEN_STRONG_INLINE Packet4bf pnegate<Packet4bf>(const Packet4bf& a)
|
||||||
{
|
{
|
||||||
return pxor<Packet4us>(a, pset1<Packet4us>(static_cast<uint16_t>(0x8000)));
|
return Packet4bf(pxor<Packet4us>(Packet4us(a), pset1<Packet4us>(static_cast<uint16_t>(0x8000))));
|
||||||
}
|
}
|
||||||
|
|
||||||
//---------- double ----------
|
//---------- double ----------
|
||||||
@@ -3638,17 +3686,35 @@ template<> EIGEN_STRONG_INLINE Packet4bf pnegate<Packet4bf>(const Packet4bf& a)
|
|||||||
|
|
||||||
#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
|
#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
|
||||||
|
|
||||||
|
#if EIGEN_COMP_GNUC
|
||||||
// Bug 907: workaround missing declarations of the following two functions in the ADK
|
// Bug 907: workaround missing declarations of the following two functions in the ADK
|
||||||
// Defining these functions as templates ensures that if these intrinsics are
|
// Defining these functions as templates ensures that if these intrinsics are
|
||||||
// already defined in arm_neon.h, then our workaround doesn't cause a conflict
|
// already defined in arm_neon.h, then our workaround doesn't cause a conflict
|
||||||
// and has lower priority in overload resolution.
|
// and has lower priority in overload resolution.
|
||||||
|
// This doesn't work with MSVC though, since the function names are macros.
|
||||||
template <typename T> uint64x2_t vreinterpretq_u64_f64(T a) { return (uint64x2_t) a; }
|
template <typename T> uint64x2_t vreinterpretq_u64_f64(T a) { return (uint64x2_t) a; }
|
||||||
|
|
||||||
template <typename T> float64x2_t vreinterpretq_f64_u64(T a) { return (float64x2_t) a; }
|
template <typename T> float64x2_t vreinterpretq_f64_u64(T a) { return (float64x2_t) a; }
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if EIGEN_COMP_MSVC_STRICT
|
||||||
|
typedef eigen_packet_wrapper<float64x2_t, 18> Packet2d;
|
||||||
|
typedef eigen_packet_wrapper<float64x1_t, 19> Packet1d;
|
||||||
|
|
||||||
|
EIGEN_ALWAYS_INLINE Packet2d make_packet2d(double a, double b) {
|
||||||
|
double from[2] = {a, b};
|
||||||
|
return vld1q_f64(from);
|
||||||
|
}
|
||||||
|
|
||||||
|
#else
|
||||||
typedef float64x2_t Packet2d;
|
typedef float64x2_t Packet2d;
|
||||||
typedef float64x1_t Packet1d;
|
typedef float64x1_t Packet1d;
|
||||||
|
|
||||||
|
EIGEN_ALWAYS_INLINE Packet2d make_packet2d(double a, double b) {
|
||||||
|
double from[2] = {a, b};
|
||||||
|
return vld1q_f64(from);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// functionally equivalent to _mm_shuffle_pd in SSE (i.e. shuffle(m, n, mask) equals _mm_shuffle_pd(m,n,mask))
|
// functionally equivalent to _mm_shuffle_pd in SSE (i.e. shuffle(m, n, mask) equals _mm_shuffle_pd(m,n,mask))
|
||||||
// Currently used in LU/arch/InverseSize4.h to enable a shared implementation
|
// Currently used in LU/arch/InverseSize4.h to enable a shared implementation
|
||||||
// for fast inversion of matrices of size 4.
|
// for fast inversion of matrices of size 4.
|
||||||
@@ -3656,7 +3722,7 @@ EIGEN_STRONG_INLINE Packet2d shuffle(const Packet2d& m, const Packet2d& n, int m
|
|||||||
{
|
{
|
||||||
const double* a = reinterpret_cast<const double*>(&m);
|
const double* a = reinterpret_cast<const double*>(&m);
|
||||||
const double* b = reinterpret_cast<const double*>(&n);
|
const double* b = reinterpret_cast<const double*>(&n);
|
||||||
Packet2d res = {*(a + (mask & 1)), *(b + ((mask >> 1) & 1))};
|
Packet2d res = make_packet2d(*(a + (mask & 1)), *(b + ((mask >> 1) & 1)));
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -3673,7 +3739,7 @@ EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(const Packet2d& a,const Packet2d& b)
|
|||||||
return shuffle(a, b, 3);
|
return shuffle(a, b, 3);
|
||||||
}
|
}
|
||||||
#define vec2d_duplane(a, p) \
|
#define vec2d_duplane(a, p) \
|
||||||
vdupq_laneq_f64(a, p)
|
Packet2d(vdupq_laneq_f64(a, p))
|
||||||
|
|
||||||
template<> struct packet_traits<double> : default_packet_traits
|
template<> struct packet_traits<double> : default_packet_traits
|
||||||
{
|
{
|
||||||
@@ -3747,7 +3813,7 @@ template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const
|
|||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& , const Packet2d& );
|
template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& , const Packet2d& );
|
||||||
template<> EIGEN_STRONG_INLINE Packet2d paddsub<Packet2d>(const Packet2d& a, const Packet2d& b){
|
template<> EIGEN_STRONG_INLINE Packet2d paddsub<Packet2d>(const Packet2d& a, const Packet2d& b){
|
||||||
const Packet2d mask = {numext::bit_cast<double>(0x8000000000000000ull),0.0};
|
const Packet2d mask = make_packet2d(numext::bit_cast<double>(0x8000000000000000ull), 0.0);
|
||||||
return padd(a, pxor(mask, b));
|
return padd(a, pxor(mask, b));
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -3759,7 +3825,7 @@ template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const
|
|||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vdivq_f64(a,b); }
|
template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vdivq_f64(a,b); }
|
||||||
|
|
||||||
#ifdef __ARM_FEATURE_FMA
|
#ifdef EIGEN_VECTORIZE_FMA
|
||||||
// See bug 936. See above comment about FMA for float.
|
// See bug 936. See above comment about FMA for float.
|
||||||
template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c)
|
template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c)
|
||||||
{ return vfmaq_f64(c,a,b); }
|
{ return vfmaq_f64(c,a,b); }
|
||||||
@@ -3862,7 +3928,7 @@ template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
|
|||||||
{ return (vget_low_f64(a) * vget_high_f64(a))[0]; }
|
{ return (vget_low_f64(a) * vget_high_f64(a))[0]; }
|
||||||
#else
|
#else
|
||||||
template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
|
template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
|
||||||
{ return vget_lane_f64(vget_low_f64(a) * vget_high_f64(a), 0); }
|
{ return vget_lane_f64(vmul_f64(vget_low_f64(a), vget_high_f64(a)), 0); }
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// min
|
// min
|
||||||
@@ -3918,7 +3984,7 @@ template<> EIGEN_STRONG_INLINE Packet2d prsqrt(const Packet2d& a) {
|
|||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& _x){ return vsqrtq_f64(_x); }
|
template<> EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& _x){ return vsqrtq_f64(_x); }
|
||||||
|
|
||||||
#endif // EIGEN_ARCH_ARM64
|
#endif // EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
|
||||||
|
|
||||||
// Do we have fp16 types and supporting Neon intrinsics?
|
// Do we have fp16 types and supporting Neon intrinsics?
|
||||||
#if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
|
#if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
|
||||||
|
|||||||
@@ -15,6 +15,113 @@ namespace Eigen {
|
|||||||
|
|
||||||
namespace internal {
|
namespace internal {
|
||||||
|
|
||||||
|
//==============================================================================
|
||||||
|
// preinterpret
|
||||||
|
//==============================================================================
|
||||||
|
template <>
|
||||||
|
EIGEN_STRONG_INLINE Packet2f preinterpret<Packet2f, Packet2i>(const Packet2i& a) {
|
||||||
|
return Packet2f(vreinterpret_f32_s32(a));
|
||||||
|
}
|
||||||
|
template <>
|
||||||
|
EIGEN_STRONG_INLINE Packet2f preinterpret<Packet2f, Packet2ui>(const Packet2ui& a) {
|
||||||
|
return Packet2f(vreinterpret_f32_u32(a));
|
||||||
|
}
|
||||||
|
template <>
|
||||||
|
EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f, Packet4i>(const Packet4i& a) {
|
||||||
|
return Packet4f(vreinterpretq_f32_s32(a));
|
||||||
|
}
|
||||||
|
template <>
|
||||||
|
EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f, Packet4ui>(const Packet4ui& a) {
|
||||||
|
return Packet4f(vreinterpretq_f32_u32(a));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <>
|
||||||
|
EIGEN_STRONG_INLINE Packet4c preinterpret<Packet4c, Packet4uc>(const Packet4uc& a) {
|
||||||
|
return static_cast<Packet4c>(a);
|
||||||
|
}
|
||||||
|
template <>
|
||||||
|
EIGEN_STRONG_INLINE Packet8c preinterpret<Packet8c, Packet8uc>(const Packet8uc& a) {
|
||||||
|
return Packet8c(vreinterpret_s8_u8(a));
|
||||||
|
}
|
||||||
|
template <>
|
||||||
|
EIGEN_STRONG_INLINE Packet16c preinterpret<Packet16c, Packet16uc>(const Packet16uc& a) {
|
||||||
|
return Packet16c(vreinterpretq_s8_u8(a));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <>
|
||||||
|
EIGEN_STRONG_INLINE Packet4uc preinterpret<Packet4uc, Packet4c>(const Packet4c& a) {
|
||||||
|
return static_cast<Packet4uc>(a);
|
||||||
|
}
|
||||||
|
template <>
|
||||||
|
EIGEN_STRONG_INLINE Packet8uc preinterpret<Packet8uc, Packet8c>(const Packet8c& a) {
|
||||||
|
return Packet8uc(vreinterpret_u8_s8(a));
|
||||||
|
}
|
||||||
|
template <>
|
||||||
|
EIGEN_STRONG_INLINE Packet16uc preinterpret<Packet16uc, Packet16c>(const Packet16c& a) {
|
||||||
|
return Packet16uc(vreinterpretq_u8_s8(a));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <>
|
||||||
|
EIGEN_STRONG_INLINE Packet4s preinterpret<Packet4s, Packet4us>(const Packet4us& a) {
|
||||||
|
return Packet4s(vreinterpret_s16_u16(a));
|
||||||
|
}
|
||||||
|
template <>
|
||||||
|
EIGEN_STRONG_INLINE Packet8s preinterpret<Packet8s, Packet8us>(const Packet8us& a) {
|
||||||
|
return Packet8s(vreinterpretq_s16_u16(a));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <>
|
||||||
|
EIGEN_STRONG_INLINE Packet4us preinterpret<Packet4us, Packet4s>(const Packet4s& a) {
|
||||||
|
return Packet4us(vreinterpret_u16_s16(a));
|
||||||
|
}
|
||||||
|
template <>
|
||||||
|
EIGEN_STRONG_INLINE Packet8us preinterpret<Packet8us, Packet8s>(const Packet8s& a) {
|
||||||
|
return Packet8us(vreinterpretq_u16_s16(a));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <>
|
||||||
|
EIGEN_STRONG_INLINE Packet2i preinterpret<Packet2i, Packet2f>(const Packet2f& a) {
|
||||||
|
return Packet2i(vreinterpret_s32_f32(a));
|
||||||
|
}
|
||||||
|
template <>
|
||||||
|
EIGEN_STRONG_INLINE Packet2i preinterpret<Packet2i, Packet2ui>(const Packet2ui& a) {
|
||||||
|
return Packet2i(vreinterpret_s32_u32(a));
|
||||||
|
}
|
||||||
|
template <>
|
||||||
|
EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet4f>(const Packet4f& a) {
|
||||||
|
return Packet4i(vreinterpretq_s32_f32(a));
|
||||||
|
}
|
||||||
|
template <>
|
||||||
|
EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet4ui>(const Packet4ui& a) {
|
||||||
|
return Packet4i(vreinterpretq_s32_u32(a));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <>
|
||||||
|
EIGEN_STRONG_INLINE Packet2ui preinterpret<Packet2ui, Packet2f>(const Packet2f& a) {
|
||||||
|
return Packet2ui(vreinterpret_u32_f32(a));
|
||||||
|
}
|
||||||
|
template <>
|
||||||
|
EIGEN_STRONG_INLINE Packet2ui preinterpret<Packet2ui, Packet2i>(const Packet2i& a) {
|
||||||
|
return Packet2ui(vreinterpret_u32_s32(a));
|
||||||
|
}
|
||||||
|
template <>
|
||||||
|
EIGEN_STRONG_INLINE Packet4ui preinterpret<Packet4ui, Packet4f>(const Packet4f& a) {
|
||||||
|
return Packet4ui(vreinterpretq_u32_f32(a));
|
||||||
|
}
|
||||||
|
template <>
|
||||||
|
EIGEN_STRONG_INLINE Packet4ui preinterpret<Packet4ui, Packet4i>(const Packet4i& a) {
|
||||||
|
return Packet4ui(vreinterpretq_u32_s32(a));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <>
|
||||||
|
EIGEN_STRONG_INLINE Packet2l preinterpret<Packet2l, Packet2ul>(const Packet2ul& a) {
|
||||||
|
return Packet2l(vreinterpretq_s64_u64(a));
|
||||||
|
}
|
||||||
|
template <>
|
||||||
|
EIGEN_STRONG_INLINE Packet2ul preinterpret<Packet2ul, Packet2l>(const Packet2l& a) {
|
||||||
|
return Packet2ul(vreinterpretq_u64_s64(a));
|
||||||
|
}
|
||||||
|
|
||||||
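The preinterpret specializations added above are bit-level reinterpretations: they change the packet type without touching the stored bits, as thin wrappers over the matching vreinterpret intrinsics, and the pcast overloads later in this file now route through them instead of calling vreinterpret directly. A short NEON sketch of the distinction between reinterpreting and casting, illustrative only:

#include <arm_neon.h>

// Sketch: reinterpretation preserves the bit pattern, a cast converts the values.
void reinterpret_vs_cast(float32x4_t f) {
  uint32x4_t same_bits  = vreinterpretq_u32_f32(f);  // view the 128 bits as uint32 lanes
  uint32x4_t new_values = vcvtq_u32_f32(f);          // numeric float -> uint32 conversion
  (void)same_bits;
  (void)new_values;
}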
//==============================================================================
|
//==============================================================================
|
||||||
// pcast, SrcType = float
|
// pcast, SrcType = float
|
||||||
//==============================================================================
|
//==============================================================================
|
||||||
@@ -188,7 +295,7 @@ struct type_casting_traits<numext::int8_t, numext::uint64_t> {
|
|||||||
};
|
};
|
||||||
template <>
|
template <>
|
||||||
EIGEN_STRONG_INLINE Packet2ul pcast<Packet16c, Packet2ul>(const Packet16c& a) {
|
EIGEN_STRONG_INLINE Packet2ul pcast<Packet16c, Packet2ul>(const Packet16c& a) {
|
||||||
return vreinterpretq_u64_s64(pcast<Packet16c, Packet2l>(a));
|
return preinterpret<Packet2ul>(pcast<Packet16c, Packet2l>(a));
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
@@ -212,11 +319,11 @@ struct type_casting_traits<numext::int8_t, numext::uint32_t> {
|
|||||||
};
|
};
|
||||||
template <>
|
template <>
|
||||||
EIGEN_STRONG_INLINE Packet4ui pcast<Packet16c, Packet4ui>(const Packet16c& a) {
|
EIGEN_STRONG_INLINE Packet4ui pcast<Packet16c, Packet4ui>(const Packet16c& a) {
|
||||||
return vreinterpretq_u32_s32(pcast<Packet16c, Packet4i>(a));
|
return preinterpret<Packet4ui>(pcast<Packet16c, Packet4i>(a));
|
||||||
}
|
}
|
||||||
template <>
|
template <>
|
||||||
EIGEN_STRONG_INLINE Packet2ui pcast<Packet8c, Packet2ui>(const Packet8c& a) {
|
EIGEN_STRONG_INLINE Packet2ui pcast<Packet8c, Packet2ui>(const Packet8c& a) {
|
||||||
return vreinterpret_u32_s32(pcast<Packet8c, Packet2i>(a));
|
return preinterpret<Packet2ui>(pcast<Packet8c, Packet2i>(a));
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
@@ -240,11 +347,11 @@ struct type_casting_traits<numext::int8_t, numext::uint16_t> {
|
|||||||
};
|
};
|
||||||
template <>
|
template <>
|
||||||
EIGEN_STRONG_INLINE Packet8us pcast<Packet16c, Packet8us>(const Packet16c& a) {
|
EIGEN_STRONG_INLINE Packet8us pcast<Packet16c, Packet8us>(const Packet16c& a) {
|
||||||
return vreinterpretq_u16_s16(pcast<Packet16c, Packet8s>(a));
|
return preinterpret<Packet8us>(pcast<Packet16c, Packet8s>(a));
|
||||||
}
|
}
|
||||||
template <>
|
template <>
|
||||||
EIGEN_STRONG_INLINE Packet4us pcast<Packet8c, Packet4us>(const Packet8c& a) {
|
EIGEN_STRONG_INLINE Packet4us pcast<Packet8c, Packet4us>(const Packet8c& a) {
|
||||||
return vreinterpret_u16_s16(pcast<Packet8c, Packet4s>(a));
|
return preinterpret<Packet4us>(pcast<Packet8c, Packet4s>(a));
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
@@ -270,11 +377,11 @@ struct type_casting_traits<numext::int8_t, numext::uint8_t> {
|
|||||||
};
|
};
|
||||||
template <>
|
template <>
|
||||||
EIGEN_STRONG_INLINE Packet16uc pcast<Packet16c, Packet16uc>(const Packet16c& a) {
|
EIGEN_STRONG_INLINE Packet16uc pcast<Packet16c, Packet16uc>(const Packet16c& a) {
|
||||||
return vreinterpretq_u8_s8(a);
|
return preinterpret<Packet16uc>(a);
|
||||||
}
|
}
|
||||||
template <>
|
template <>
|
||||||
EIGEN_STRONG_INLINE Packet8uc pcast<Packet8c, Packet8uc>(const Packet8c& a) {
|
EIGEN_STRONG_INLINE Packet8uc pcast<Packet8c, Packet8uc>(const Packet8c& a) {
|
||||||
return vreinterpret_u8_s8(a);
|
return preinterpret<Packet8uc>(a);
|
||||||
}
|
}
|
||||||
template <>
|
template <>
|
||||||
EIGEN_STRONG_INLINE Packet4uc pcast<Packet4c, Packet4uc>(const Packet4c& a) {
|
EIGEN_STRONG_INLINE Packet4uc pcast<Packet4c, Packet4uc>(const Packet4c& a) {
|
||||||
@@ -315,7 +422,7 @@ struct type_casting_traits<numext::uint8_t, numext::int64_t> {
|
|||||||
};
|
};
|
||||||
template <>
|
template <>
|
||||||
EIGEN_STRONG_INLINE Packet2l pcast<Packet16uc, Packet2l>(const Packet16uc& a) {
|
EIGEN_STRONG_INLINE Packet2l pcast<Packet16uc, Packet2l>(const Packet16uc& a) {
|
||||||
return vreinterpretq_s64_u64(pcast<Packet16uc, Packet2ul>(a));
|
return preinterpret<Packet2l>(pcast<Packet16uc, Packet2ul>(a));
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
@@ -339,11 +446,11 @@ struct type_casting_traits<numext::uint8_t, numext::int32_t> {
|
|||||||
};
|
};
|
||||||
template <>
|
template <>
|
||||||
EIGEN_STRONG_INLINE Packet4i pcast<Packet16uc, Packet4i>(const Packet16uc& a) {
|
EIGEN_STRONG_INLINE Packet4i pcast<Packet16uc, Packet4i>(const Packet16uc& a) {
|
||||||
return vreinterpretq_s32_u32(pcast<Packet16uc, Packet4ui>(a));
|
return preinterpret<Packet4i>(pcast<Packet16uc, Packet4ui>(a));
|
||||||
}
|
}
|
||||||
template <>
|
template <>
|
||||||
EIGEN_STRONG_INLINE Packet2i pcast<Packet8uc, Packet2i>(const Packet8uc& a) {
|
EIGEN_STRONG_INLINE Packet2i pcast<Packet8uc, Packet2i>(const Packet8uc& a) {
|
||||||
return vreinterpret_s32_u32(pcast<Packet8uc, Packet2ui>(a));
|
return preinterpret<Packet2i>(pcast<Packet8uc, Packet2ui>(a));
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
@@ -367,11 +474,11 @@ struct type_casting_traits<numext::uint8_t, numext::int16_t> {
|
|||||||
};
|
};
|
||||||
template <>
|
template <>
|
||||||
EIGEN_STRONG_INLINE Packet8s pcast<Packet16uc, Packet8s>(const Packet16uc& a) {
|
EIGEN_STRONG_INLINE Packet8s pcast<Packet16uc, Packet8s>(const Packet16uc& a) {
|
||||||
return vreinterpretq_s16_u16(pcast<Packet16uc, Packet8us>(a));
|
return preinterpret<Packet8s>(pcast<Packet16uc, Packet8us>(a));
|
||||||
}
|
}
|
||||||
template <>
|
template <>
|
||||||
EIGEN_STRONG_INLINE Packet4s pcast<Packet8uc, Packet4s>(const Packet8uc& a) {
|
EIGEN_STRONG_INLINE Packet4s pcast<Packet8uc, Packet4s>(const Packet8uc& a) {
|
||||||
return vreinterpret_s16_u16(pcast<Packet8uc, Packet4us>(a));
|
return preinterpret<Packet4s>(pcast<Packet8uc, Packet4us>(a));
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
@@ -397,11 +504,11 @@ struct type_casting_traits<numext::uint8_t, numext::int8_t> {
|
|||||||
};
|
};
|
||||||
template <>
|
template <>
|
||||||
EIGEN_STRONG_INLINE Packet16c pcast<Packet16uc, Packet16c>(const Packet16uc& a) {
|
EIGEN_STRONG_INLINE Packet16c pcast<Packet16uc, Packet16c>(const Packet16uc& a) {
|
||||||
return vreinterpretq_s8_u8(a);
|
return preinterpret<Packet16c>(a);
|
||||||
}
|
}
|
||||||
template <>
|
template <>
|
||||||
EIGEN_STRONG_INLINE Packet8c pcast<Packet8uc, Packet8c>(const Packet8uc& a) {
|
EIGEN_STRONG_INLINE Packet8c pcast<Packet8uc, Packet8c>(const Packet8uc& a) {
|
||||||
return vreinterpret_s8_u8(a);
|
return preinterpret<Packet8c>(a);
|
||||||
}
|
}
|
||||||
template <>
|
template <>
|
||||||
EIGEN_STRONG_INLINE Packet4c pcast<Packet4uc, Packet4c>(const Packet4uc& a) {
|
EIGEN_STRONG_INLINE Packet4c pcast<Packet4uc, Packet4c>(const Packet4uc& a) {
|
||||||
@@ -442,7 +549,7 @@ struct type_casting_traits<numext::int16_t, numext::uint64_t> {
|
|||||||
};
|
};
|
||||||
template <>
|
template <>
|
||||||
EIGEN_STRONG_INLINE Packet2ul pcast<Packet8s, Packet2ul>(const Packet8s& a) {
|
EIGEN_STRONG_INLINE Packet2ul pcast<Packet8s, Packet2ul>(const Packet8s& a) {
|
||||||
return vreinterpretq_u64_s64(pcast<Packet8s, Packet2l>(a));
|
return preinterpret<Packet2ul>(pcast<Packet8s, Packet2l>(a));
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
@@ -466,11 +573,11 @@ struct type_casting_traits<numext::int16_t, numext::uint32_t> {
|
|||||||
};
|
};
|
||||||
template <>
|
template <>
|
||||||
EIGEN_STRONG_INLINE Packet4ui pcast<Packet8s, Packet4ui>(const Packet8s& a) {
|
EIGEN_STRONG_INLINE Packet4ui pcast<Packet8s, Packet4ui>(const Packet8s& a) {
|
||||||
return vreinterpretq_u32_s32(pcast<Packet8s, Packet4i>(a));
|
return preinterpret<Packet4ui>(pcast<Packet8s, Packet4i>(a));
|
||||||
}
|
}
|
||||||
template <>
|
template <>
|
||||||
EIGEN_STRONG_INLINE Packet2ui pcast<Packet4s, Packet2ui>(const Packet4s& a) {
|
EIGEN_STRONG_INLINE Packet2ui pcast<Packet4s, Packet2ui>(const Packet4s& a) {
|
||||||
return vreinterpret_u32_s32(pcast<Packet4s, Packet2i>(a));
|
return preinterpret<Packet2ui>(pcast<Packet4s, Packet2i>(a));
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
@@ -492,11 +599,11 @@ struct type_casting_traits<numext::int16_t, numext::uint16_t> {
|
|||||||
};
|
};
|
||||||
template <>
|
template <>
|
||||||
EIGEN_STRONG_INLINE Packet8us pcast<Packet8s, Packet8us>(const Packet8s& a) {
|
EIGEN_STRONG_INLINE Packet8us pcast<Packet8s, Packet8us>(const Packet8s& a) {
|
||||||
return vreinterpretq_u16_s16(a);
|
return preinterpret<Packet8us>(a);
|
||||||
}
|
}
|
||||||
template <>
|
template <>
|
||||||
EIGEN_STRONG_INLINE Packet4us pcast<Packet4s, Packet4us>(const Packet4s& a) {
|
EIGEN_STRONG_INLINE Packet4us pcast<Packet4s, Packet4us>(const Packet4s& a) {
|
||||||
return vreinterpret_u16_s16(a);
|
return preinterpret<Packet4us>(a);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
@@ -559,7 +666,7 @@ struct type_casting_traits<numext::uint16_t, numext::int64_t> {
|
|||||||
};
|
};
|
||||||
template <>
|
template <>
|
||||||
EIGEN_STRONG_INLINE Packet2l pcast<Packet8us, Packet2l>(const Packet8us& a) {
|
EIGEN_STRONG_INLINE Packet2l pcast<Packet8us, Packet2l>(const Packet8us& a) {
|
||||||
return vreinterpretq_s64_u64(pcast<Packet8us, Packet2ul>(a));
|
return preinterpret<Packet2l>(pcast<Packet8us, Packet2ul>(a));
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
@@ -583,11 +690,11 @@ struct type_casting_traits<numext::uint16_t, numext::int32_t> {
|
|||||||
};
|
};
|
||||||
template <>
|
template <>
|
||||||
EIGEN_STRONG_INLINE Packet4i pcast<Packet8us, Packet4i>(const Packet8us& a) {
|
EIGEN_STRONG_INLINE Packet4i pcast<Packet8us, Packet4i>(const Packet8us& a) {
|
||||||
return vreinterpretq_s32_u32(pcast<Packet8us, Packet4ui>(a));
|
return preinterpret<Packet4i>(pcast<Packet8us, Packet4ui>(a));
|
||||||
}
|
}
|
||||||
template <>
|
template <>
|
||||||
EIGEN_STRONG_INLINE Packet2i pcast<Packet4us, Packet2i>(const Packet4us& a) {
|
EIGEN_STRONG_INLINE Packet2i pcast<Packet4us, Packet2i>(const Packet4us& a) {
|
||||||
return vreinterpret_s32_u32(pcast<Packet4us, Packet2ui>(a));
|
return preinterpret<Packet2i>(pcast<Packet4us, Packet2ui>(a));
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
@@ -609,11 +716,11 @@ struct type_casting_traits<numext::uint16_t, numext::int16_t> {
|
|||||||
};
|
};
|
||||||
template <>
|
template <>
|
||||||
EIGEN_STRONG_INLINE Packet8s pcast<Packet8us, Packet8s>(const Packet8us& a) {
|
EIGEN_STRONG_INLINE Packet8s pcast<Packet8us, Packet8s>(const Packet8us& a) {
|
||||||
return vreinterpretq_s16_u16(a);
|
return preinterpret<Packet8s>(a);
|
||||||
}
|
}
|
||||||
template <>
|
template <>
|
||||||
EIGEN_STRONG_INLINE Packet4s pcast<Packet4us, Packet4s>(const Packet4us& a) {
|
EIGEN_STRONG_INLINE Packet4s pcast<Packet4us, Packet4s>(const Packet4us& a) {
|
||||||
return vreinterpret_s16_u16(a);
|
return preinterpret<Packet4s>(a);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
@@ -635,11 +742,11 @@ struct type_casting_traits<numext::uint16_t, numext::int8_t> {
|
|||||||
};
|
};
|
||||||
template <>
|
template <>
|
||||||
EIGEN_STRONG_INLINE Packet16c pcast<Packet8us, Packet16c>(const Packet8us& a, const Packet8us& b) {
|
EIGEN_STRONG_INLINE Packet16c pcast<Packet8us, Packet16c>(const Packet8us& a, const Packet8us& b) {
|
||||||
return vreinterpretq_s8_u8(pcast<Packet8us, Packet16uc>(a, b));
|
return preinterpret<Packet16c>(pcast<Packet8us, Packet16uc>(a, b));
|
||||||
}
|
}
|
||||||
template <>
|
template <>
|
||||||
EIGEN_STRONG_INLINE Packet8c pcast<Packet4us, Packet8c>(const Packet4us& a, const Packet4us& b) {
|
EIGEN_STRONG_INLINE Packet8c pcast<Packet4us, Packet8c>(const Packet4us& a, const Packet4us& b) {
|
||||||
return vreinterpret_s8_u8(pcast<Packet4us, Packet8uc>(a, b));
|
return preinterpret<Packet8c>(pcast<Packet4us, Packet8uc>(a, b));
|
||||||
}
|
}
|
||||||
|
|
||||||
//==============================================================================
|
//==============================================================================
|
||||||
@@ -674,7 +781,7 @@ struct type_casting_traits<numext::int32_t, numext::uint64_t> {
|
|||||||
};
|
};
|
||||||
template <>
|
template <>
|
||||||
EIGEN_STRONG_INLINE Packet2ul pcast<Packet4i, Packet2ul>(const Packet4i& a) {
|
EIGEN_STRONG_INLINE Packet2ul pcast<Packet4i, Packet2ul>(const Packet4i& a) {
|
||||||
return vreinterpretq_u64_s64(pcast<Packet4i, Packet2l>(a));
|
return preinterpret<Packet2ul>(pcast<Packet4i, Packet2l>(a));
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
@@ -696,11 +803,11 @@ struct type_casting_traits<numext::int32_t, numext::uint32_t> {
|
|||||||
};
|
};
|
||||||
template <>
|
template <>
|
||||||
EIGEN_STRONG_INLINE Packet4ui pcast<Packet4i, Packet4ui>(const Packet4i& a) {
|
EIGEN_STRONG_INLINE Packet4ui pcast<Packet4i, Packet4ui>(const Packet4i& a) {
|
||||||
return vreinterpretq_u32_s32(a);
|
return preinterpret<Packet4ui>(a);
|
||||||
}
|
}
|
||||||
template <>
|
template <>
|
||||||
EIGEN_STRONG_INLINE Packet2ui pcast<Packet2i, Packet2ui>(const Packet2i& a) {
|
EIGEN_STRONG_INLINE Packet2ui pcast<Packet2i, Packet2ui>(const Packet2i& a) {
|
||||||
return vreinterpret_u32_s32(a);
|
return preinterpret<Packet2ui>(a);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
@@ -799,7 +906,7 @@ struct type_casting_traits<numext::uint32_t, numext::int64_t> {
|
|||||||
};
|
};
|
||||||
template <>
|
template <>
|
||||||
EIGEN_STRONG_INLINE Packet2l pcast<Packet4ui, Packet2l>(const Packet4ui& a) {
|
EIGEN_STRONG_INLINE Packet2l pcast<Packet4ui, Packet2l>(const Packet4ui& a) {
|
||||||
return vreinterpretq_s64_u64(pcast<Packet4ui, Packet2ul>(a));
|
return preinterpret<Packet2l>(pcast<Packet4ui, Packet2ul>(a));
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
@@ -821,11 +928,11 @@ struct type_casting_traits<numext::uint32_t, numext::int32_t> {
|
|||||||
};
|
};
|
||||||
template <>
|
template <>
|
||||||
EIGEN_STRONG_INLINE Packet4i pcast<Packet4ui, Packet4i>(const Packet4ui& a) {
|
EIGEN_STRONG_INLINE Packet4i pcast<Packet4ui, Packet4i>(const Packet4ui& a) {
|
||||||
return vreinterpretq_s32_u32(a);
|
return preinterpret<Packet4i>(a);
|
||||||
}
|
}
|
||||||
template <>
|
template <>
|
||||||
EIGEN_STRONG_INLINE Packet2i pcast<Packet2ui, Packet2i>(const Packet2ui& a) {
|
EIGEN_STRONG_INLINE Packet2i pcast<Packet2ui, Packet2i>(const Packet2ui& a) {
|
||||||
return vreinterpret_s32_u32(a);
|
return preinterpret<Packet2i>(a);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
@@ -847,11 +954,11 @@ struct type_casting_traits<numext::uint32_t, numext::int16_t> {
|
|||||||
};
|
};
|
||||||
template <>
|
template <>
|
||||||
EIGEN_STRONG_INLINE Packet8s pcast<Packet4ui, Packet8s>(const Packet4ui& a, const Packet4ui& b) {
|
EIGEN_STRONG_INLINE Packet8s pcast<Packet4ui, Packet8s>(const Packet4ui& a, const Packet4ui& b) {
|
||||||
return vreinterpretq_s16_u16(pcast<Packet4ui, Packet8us>(a, b));
|
return preinterpret<Packet8s>(pcast<Packet4ui, Packet8us>(a, b));
|
||||||
}
|
}
|
||||||
template <>
|
template <>
|
||||||
EIGEN_STRONG_INLINE Packet4s pcast<Packet2ui, Packet4s>(const Packet2ui& a, const Packet2ui& b) {
|
EIGEN_STRONG_INLINE Packet4s pcast<Packet2ui, Packet4s>(const Packet2ui& a, const Packet2ui& b) {
|
||||||
return vreinterpret_s16_u16(pcast<Packet2ui, Packet4us>(a, b));
|
return preinterpret<Packet4s>(pcast<Packet2ui, Packet4us>(a, b));
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
@@ -880,12 +987,12 @@ struct type_casting_traits<numext::uint32_t, numext::int8_t> {
|
|||||||
template <>
|
template <>
|
||||||
EIGEN_STRONG_INLINE Packet16c pcast<Packet4ui, Packet16c>(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c,
|
EIGEN_STRONG_INLINE Packet16c pcast<Packet4ui, Packet16c>(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c,
|
||||||
const Packet4ui& d) {
|
const Packet4ui& d) {
|
||||||
return vreinterpretq_s8_u8(pcast<Packet4ui, Packet16uc>(a, b, c, d));
|
return preinterpret<Packet16c>(pcast<Packet4ui, Packet16uc>(a, b, c, d));
|
||||||
}
|
}
|
||||||
template <>
|
template <>
|
||||||
EIGEN_STRONG_INLINE Packet8c pcast<Packet2ui, Packet8c>(const Packet2ui& a, const Packet2ui& b, const Packet2ui& c,
|
EIGEN_STRONG_INLINE Packet8c pcast<Packet2ui, Packet8c>(const Packet2ui& a, const Packet2ui& b, const Packet2ui& c,
|
||||||
const Packet2ui& d) {
|
const Packet2ui& d) {
|
||||||
return vreinterpret_s8_u8(pcast<Packet2ui, Packet8uc>(a, b, c, d));
|
return preinterpret<Packet8c>(pcast<Packet2ui, Packet8uc>(a, b, c, d));
|
||||||
}
|
}
|
||||||
|
|
||||||
//==============================================================================
|
//==============================================================================
|
||||||
@@ -915,7 +1022,7 @@ struct type_casting_traits<numext::int64_t, numext::uint64_t> {
|
|||||||
};
|
};
|
||||||
template <>
|
template <>
|
||||||
EIGEN_STRONG_INLINE Packet2ul pcast<Packet2l, Packet2ul>(const Packet2l& a) {
|
EIGEN_STRONG_INLINE Packet2ul pcast<Packet2l, Packet2ul>(const Packet2l& a) {
|
||||||
return vreinterpretq_u64_s64(a);
|
return preinterpret<Packet2ul>(a);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
@@ -1013,7 +1120,7 @@ struct type_casting_traits<numext::uint64_t, numext::int64_t> {
|
|||||||
};
|
};
|
||||||
template <>
|
template <>
|
||||||
EIGEN_STRONG_INLINE Packet2l pcast<Packet2ul, Packet2l>(const Packet2ul& a) {
|
EIGEN_STRONG_INLINE Packet2l pcast<Packet2ul, Packet2l>(const Packet2ul& a) {
|
||||||
return vreinterpretq_s64_u64(a);
|
return preinterpret<Packet2l>(a);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
@@ -1031,7 +1138,7 @@ struct type_casting_traits<numext::uint64_t, numext::int32_t> {
|
|||||||
};
|
};
|
||||||
template <>
|
template <>
|
||||||
EIGEN_STRONG_INLINE Packet4i pcast<Packet2ul, Packet4i>(const Packet2ul& a, const Packet2ul& b) {
|
EIGEN_STRONG_INLINE Packet4i pcast<Packet2ul, Packet4i>(const Packet2ul& a, const Packet2ul& b) {
|
||||||
return vreinterpretq_s32_u32(pcast<Packet2ul, Packet4ui>(a, b));
|
return preinterpret<Packet4i>(pcast<Packet2ul, Packet4ui>(a, b));
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
@@ -1053,7 +1160,7 @@ struct type_casting_traits<numext::uint64_t, numext::int16_t> {
|
|||||||
template <>
|
template <>
|
||||||
EIGEN_STRONG_INLINE Packet8s pcast<Packet2ul, Packet8s>(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c,
|
EIGEN_STRONG_INLINE Packet8s pcast<Packet2ul, Packet8s>(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c,
|
||||||
const Packet2ul& d) {
|
const Packet2ul& d) {
|
||||||
return vreinterpretq_s16_u16(pcast<Packet2ul, Packet8us>(a, b, c, d));
|
return preinterpret<Packet8s>(pcast<Packet2ul, Packet8us>(a, b, c, d));
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
@@ -1077,114 +1184,7 @@ template <>
|
|||||||
EIGEN_STRONG_INLINE Packet16c pcast<Packet2ul, Packet16c>(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c,
|
EIGEN_STRONG_INLINE Packet16c pcast<Packet2ul, Packet16c>(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c,
|
||||||
const Packet2ul& d, const Packet2ul& e, const Packet2ul& f,
|
const Packet2ul& d, const Packet2ul& e, const Packet2ul& f,
|
||||||
const Packet2ul& g, const Packet2ul& h) {
|
const Packet2ul& g, const Packet2ul& h) {
|
||||||
return vreinterpretq_s8_u8(pcast<Packet2ul, Packet16uc>(a, b, c, d, e, f, g, h));
|
return preinterpret<Packet16c>(pcast<Packet2ul, Packet16uc>(a, b, c, d, e, f, g, h));
|
||||||
}
|
|
||||||
|
|
||||||
//==============================================================================
|
|
||||||
// preinterpret
|
|
||||||
//==============================================================================
|
|
||||||
template <>
|
|
||||||
EIGEN_STRONG_INLINE Packet2f preinterpret<Packet2f, Packet2i>(const Packet2i& a) {
|
|
||||||
return vreinterpret_f32_s32(a);
|
|
||||||
}
|
|
||||||
template <>
|
|
||||||
EIGEN_STRONG_INLINE Packet2f preinterpret<Packet2f, Packet2ui>(const Packet2ui& a) {
|
|
||||||
return vreinterpret_f32_u32(a);
|
|
||||||
}
|
|
||||||
template <>
|
|
||||||
EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f, Packet4i>(const Packet4i& a) {
|
|
||||||
return vreinterpretq_f32_s32(a);
|
|
||||||
}
|
|
||||||
template <>
|
|
||||||
EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f, Packet4ui>(const Packet4ui& a) {
|
|
||||||
return vreinterpretq_f32_u32(a);
|
|
||||||
}
|
|
||||||
|
|
||||||
template <>
|
|
||||||
EIGEN_STRONG_INLINE Packet4c preinterpret<Packet4c, Packet4uc>(const Packet4uc& a) {
|
|
||||||
return static_cast<Packet4c>(a);
|
|
||||||
}
|
|
||||||
template <>
|
|
||||||
EIGEN_STRONG_INLINE Packet8c preinterpret<Packet8c, Packet8uc>(const Packet8uc& a) {
|
|
||||||
return vreinterpret_s8_u8(a);
|
|
||||||
}
|
|
||||||
template <>
|
|
||||||
EIGEN_STRONG_INLINE Packet16c preinterpret<Packet16c, Packet16uc>(const Packet16uc& a) {
|
|
||||||
return vreinterpretq_s8_u8(a);
|
|
||||||
}
|
|
||||||
|
|
||||||
template <>
|
|
||||||
EIGEN_STRONG_INLINE Packet4uc preinterpret<Packet4uc, Packet4c>(const Packet4c& a) {
|
|
||||||
return static_cast<Packet4uc>(a);
|
|
||||||
}
|
|
||||||
template <>
|
|
||||||
EIGEN_STRONG_INLINE Packet8uc preinterpret<Packet8uc, Packet8c>(const Packet8c& a) {
|
|
||||||
return vreinterpret_u8_s8(a);
|
|
||||||
}
|
|
||||||
template <>
|
|
||||||
EIGEN_STRONG_INLINE Packet16uc preinterpret<Packet16uc, Packet16c>(const Packet16c& a) {
|
|
||||||
return vreinterpretq_u8_s8(a);
|
|
||||||
}
|
|
||||||
|
|
||||||
template <>
|
|
||||||
EIGEN_STRONG_INLINE Packet4s preinterpret<Packet4s, Packet4us>(const Packet4us& a) {
|
|
||||||
return vreinterpret_s16_u16(a);
|
|
||||||
}
|
|
||||||
template <>
|
|
||||||
EIGEN_STRONG_INLINE Packet8s preinterpret<Packet8s, Packet8us>(const Packet8us& a) {
|
|
||||||
return vreinterpretq_s16_u16(a);
|
|
||||||
}
|
|
||||||
|
|
||||||
template <>
|
|
||||||
EIGEN_STRONG_INLINE Packet4us preinterpret<Packet4us, Packet4s>(const Packet4s& a) {
|
|
||||||
return vreinterpret_u16_s16(a);
|
|
||||||
}
|
|
||||||
template <>
|
|
||||||
EIGEN_STRONG_INLINE Packet8us preinterpret<Packet8us, Packet8s>(const Packet8s& a) {
|
|
||||||
return vreinterpretq_u16_s16(a);
|
|
||||||
}
|
|
||||||
|
|
||||||
template <>
|
|
||||||
EIGEN_STRONG_INLINE Packet2i preinterpret<Packet2i, Packet2f>(const Packet2f& a) {
|
|
||||||
return vreinterpret_s32_f32(a);
|
|
||||||
}
|
|
||||||
template <>
|
|
||||||
EIGEN_STRONG_INLINE Packet2i preinterpret<Packet2i, Packet2ui>(const Packet2ui& a) {
|
|
||||||
return vreinterpret_s32_u32(a);
|
|
||||||
}
|
|
||||||
template <>
|
|
||||||
EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet4f>(const Packet4f& a) {
|
|
||||||
return vreinterpretq_s32_f32(a);
|
|
||||||
}
|
|
||||||
template <>
|
|
||||||
EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet4ui>(const Packet4ui& a) {
|
|
||||||
return vreinterpretq_s32_u32(a);
|
|
||||||
}
|
|
||||||
|
|
||||||
template <>
|
|
||||||
EIGEN_STRONG_INLINE Packet2ui preinterpret<Packet2ui, Packet2f>(const Packet2f& a) {
|
|
||||||
return vreinterpret_u32_f32(a);
|
|
||||||
}
|
|
||||||
template <>
|
|
||||||
EIGEN_STRONG_INLINE Packet2ui preinterpret<Packet2ui, Packet2i>(const Packet2i& a) {
|
|
||||||
return vreinterpret_u32_s32(a);
|
|
||||||
}
|
|
||||||
template <>
|
|
||||||
EIGEN_STRONG_INLINE Packet4ui preinterpret<Packet4ui, Packet4f>(const Packet4f& a) {
|
|
||||||
return vreinterpretq_u32_f32(a);
|
|
||||||
}
|
|
||||||
template <>
|
|
||||||
EIGEN_STRONG_INLINE Packet4ui preinterpret<Packet4ui, Packet4i>(const Packet4i& a) {
|
|
||||||
return vreinterpretq_u32_s32(a);
|
|
||||||
}
|
|
||||||
|
|
||||||
template <>
|
|
||||||
EIGEN_STRONG_INLINE Packet2l preinterpret<Packet2l, Packet2ul>(const Packet2ul& a) {
|
|
||||||
return vreinterpretq_s64_u64(a);
|
|
||||||
}
|
|
||||||
template <>
|
|
||||||
EIGEN_STRONG_INLINE Packet2ul preinterpret<Packet2ul, Packet2l>(const Packet2l& a) {
|
|
||||||
return vreinterpretq_u64_s64(a);
|
|
||||||
}
|
}
|
 
 #if EIGEN_ARCH_ARM64
@@ -1193,6 +1193,31 @@ EIGEN_STRONG_INLINE Packet2ul preinterpret<Packet2ul, Packet2l>(const Packet2l&
 // pcast/preinterpret, Double
 //==============================================================================
 
+template <>
+EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet2l>(const Packet2l& a) {
+  return Packet2d(vreinterpretq_f64_s64(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet2ul>(const Packet2ul& a) {
+  return Packet2d(vreinterpretq_f64_u64(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l preinterpret<Packet2l, Packet2d>(const Packet2d& a) {
+  return Packet2l(vreinterpretq_s64_f64(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul preinterpret<Packet2ul, Packet2d>(const Packet2d& a) {
+  return Packet2ul(vreinterpretq_u64_f64(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet4i>(const Packet4i& a) {
+  return Packet2d(vreinterpretq_f64_s32(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet2d>(const Packet2d& a) {
+  return Packet4i(vreinterpretq_s32_f64(a));
+}
+
 template <>
 struct type_casting_traits<double, double> {
   enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
@@ -1314,7 +1339,9 @@ struct type_casting_traits<numext::int8_t, double> {
 template <>
 EIGEN_STRONG_INLINE Packet2d pcast<Packet16c, Packet2d>(const Packet16c& a) {
   // Discard all but first two values.
-  return vcvt_f64_f32(pcast<Packet8c, Packet2f>(vget_low_s8(a)));
+  // MSVC defines most intrinsics as macros, so we need to do this in two lines for portability.
+  Packet2f tmp = pcast<Packet8c, Packet2f>(vget_low_s8(a));
+  return vcvt_f64_f32(tmp);
 }
 
 template <>
@@ -1324,7 +1351,8 @@ struct type_casting_traits<numext::uint8_t, double> {
 template <>
 EIGEN_STRONG_INLINE Packet2d pcast<Packet16uc, Packet2d>(const Packet16uc& a) {
   // Discard all but first two values.
-  return vcvt_f64_f32(pcast<Packet8uc, Packet2f>(vget_low_u8(a)));
+  Packet2f tmp = pcast<Packet8uc, Packet2f>(vget_low_u8(a));
+  return vcvt_f64_f32(tmp);
 }
 
 template <>
@@ -1334,7 +1362,8 @@ struct type_casting_traits<numext::int16_t, double> {
 template <>
 EIGEN_STRONG_INLINE Packet2d pcast<Packet8s, Packet2d>(const Packet8s& a) {
   // Discard all but first two values.
-  return vcvt_f64_f32(pcast<Packet4s, Packet2f>(vget_low_s16(a)));
+  Packet2f tmp = pcast<Packet4s, Packet2f>(vget_low_s16(a));
+  return vcvt_f64_f32(tmp);
 }
 
 template <>
@@ -1344,7 +1373,8 @@ struct type_casting_traits<numext::uint16_t, double> {
 template <>
 EIGEN_STRONG_INLINE Packet2d pcast<Packet8us, Packet2d>(const Packet8us& a) {
   // Discard all but first two values.
-  return vcvt_f64_f32(pcast<Packet4us, Packet2f>(vget_low_u16(a)));
+  Packet2f tmp = pcast<Packet4us, Packet2f>(vget_low_u16(a));
+  return vcvt_f64_f32(tmp);
 }
 
 template <>
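Aside (not from the patch): the reason for the two-line form above is that a function-like macro, which is how MSVC exposes many intrinsics, splits its argument at the comma inside a template argument list. A minimal, compiler-agnostic sketch of the failure mode, where WRAP and cast_pair are made-up stand-ins:

// Illustration only: nesting "foo<A, B>(x)" inside a single-argument macro.
#define WRAP(x) (x)  // stand-in for an intrinsic that MSVC defines as a macro

template <typename A, typename B>
int cast_pair(int v) { return v; }

int main() {
  // int r = WRAP(cast_pair<int, long>(1));  // preprocessor sees two macro arguments
  int tmp = cast_pair<int, long>(1);         // two-line form used by the patch
  int r = WRAP(tmp);
  return r == 1 ? 0 : 1;
}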
@@ -1385,31 +1415,6 @@ EIGEN_STRONG_INLINE Packet2d pcast<Packet2ul, Packet2d>(const Packet2ul& a) {
   return vcvtq_f64_u64(a);
 }
-
-template <>
-EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet2l>(const Packet2l& a) {
-  return vreinterpretq_f64_s64(a);
-}
-template <>
-EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet2ul>(const Packet2ul& a) {
-  return vreinterpretq_f64_u64(a);
-}
-template <>
-EIGEN_STRONG_INLINE Packet2l preinterpret<Packet2l, Packet2d>(const Packet2d& a) {
-  return vreinterpretq_s64_f64(a);
-}
-template <>
-EIGEN_STRONG_INLINE Packet2ul preinterpret<Packet2ul, Packet2d>(const Packet2d& a) {
-  return vreinterpretq_u64_f64(a);
-}
-template <>
-EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet4i>(const Packet4i& a) {
-  return vreinterpretq_f64_s32(a);
-}
-template <>
-EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet2d>(const Packet2d& a) {
-  return vreinterpretq_s32_f64(a);
-}
 
 #endif // EIGEN_ARCH_ARM64
 
 } // end namespace internal
@@ -106,14 +106,9 @@ template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<fl
 
 template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from)
 {
-  Packet2cf res;
-#ifdef EIGEN_VECTORIZE_SSE3
-  res.v = _mm_castpd_ps(_mm_loaddup_pd(reinterpret_cast<double const*>(&from)));
-#else
-  res.v = _mm_castpd_ps(_mm_load_sd(reinterpret_cast<double const*>(&from)));
-  res.v = _mm_movelh_ps(res.v, res.v);
-#endif
-  return res;
+  const float re = std::real(from);
+  const float im = std::imag(from);
+  return Packet2cf(_mm_set_ps(im, re, im, re));
 }
 
 template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) { return pset1<Packet2cf>(*from); }
@@ -174,14 +169,9 @@ EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
 
 template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
 {
-  // TODO optimize it for SSE3 and 4
-  Packet2cf res = pmul(a, pconj(b));
-  __m128 s = _mm_mul_ps(b.v,b.v);
-  return Packet2cf(_mm_div_ps(res.v,_mm_add_ps(s,vec4f_swizzle1(s, 1, 0, 3, 2))));
+  return pdiv_complex(a, b);
 }
 
 
 //---------- double ----------
 struct Packet1cd
 {
@@ -299,10 +289,7 @@ EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)
 
 template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
 {
-  // TODO optimize it for SSE3 and 4
-  Packet1cd res = pmul(a,pconj(b));
-  __m128d s = _mm_mul_pd(b.v,b.v);
-  return Packet1cd(_mm_div_pd(res.v, _mm_add_pd(s,_mm_shuffle_pd(s, s, 0x1))));
+  return pdiv_complex(a, b);
 }
 
 EIGEN_STRONG_INLINE Packet1cd pcplxflip/* <Packet1cd> */(const Packet1cd& x)
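Aside (not the Eigen implementation): the SSE bodies removed above, and presumably the shared pdiv_complex() helper that replaces them, both compute complex division through the conjugate identity a / b = (a * conj(b)) / (re(b)^2 + im(b)^2). A scalar sketch of that identity:

// Illustration only: complex division via the conjugate of the denominator.
#include <cassert>
#include <cmath>
#include <complex>

std::complex<float> div_via_conj(std::complex<float> a, std::complex<float> b) {
  std::complex<float> num = a * std::conj(b);
  float denom = b.real() * b.real() + b.imag() * b.imag();
  return { num.real() / denom, num.imag() / denom };
}

int main() {
  std::complex<float> a(1.f, 2.f), b(3.f, -4.f);
  assert(std::abs(div_via_conj(a, b) - a / b) < 1e-6f);
  return 0;
}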
@@ -444,7 +444,7 @@ template<> EIGEN_STRONG_INLINE Packet16b pcmp_eq(const Packet16b& a, const Packe
 template<> EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) { return por(pcmp_lt(a,b), pcmp_eq(a,b)); }
 
 template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
-#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
+#if EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC < 63
   // There appears to be a bug in GCC, by which the optimizer may
   // flip the argument order in calls to _mm_min_ps, so we have to
   // resort to inline ASM here. This is supposed to be fixed in gcc6.3,
@@ -463,7 +463,7 @@ template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const
 #endif
 }
 template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
-#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
+#if EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC < 63
   // There appears to be a bug in GCC, by which the optimizer may
   // flip the argument order in calls to _mm_min_pd, so we have to
   // resort to inline ASM here. This is supposed to be fixed in gcc6.3,
@@ -494,7 +494,7 @@ template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const
 
 
 template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
-#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
+#if EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC < 63
   // There appears to be a bug in GCC, by which the optimizer may
   // flip the argument order in calls to _mm_max_ps, so we have to
   // resort to inline ASM here. This is supposed to be fixed in gcc6.3,
@@ -513,7 +513,7 @@ template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const
 #endif
 }
 template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
-#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
+#if EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC < 63
   // There appears to be a bug in GCC, by which the optimizer may
   // flip the argument order in calls to _mm_max_pd, so we have to
   // resort to inline ASM here. This is supposed to be fixed in gcc6.3,
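Aside (an assumption, not stated in the patch): __GNUC__ is also defined by compilers that imitate GCC, such as clang and ICC, so the "strict" macro presumably restricts the inline-asm workaround above to real GCC. A rough sketch of that distinction, with MY_COMP_* as made-up stand-ins for Eigen's macros:

// Illustration only: separating "defines __GNUC__" from "actually is GCC".
#include <cstdio>

#if defined(__GNUC__)
#define MY_COMP_GNUC 1
#else
#define MY_COMP_GNUC 0
#endif

#if MY_COMP_GNUC && !defined(__clang__) && !defined(__INTEL_COMPILER)
#define MY_COMP_GNUC_STRICT 1
#else
#define MY_COMP_GNUC_STRICT 0
#endif

int main() {
  std::printf("gnuc=%d strict=%d\n", MY_COMP_GNUC, MY_COMP_GNUC_STRICT);
  return 0;
}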
@@ -16,7 +16,9 @@ namespace Eigen {
 namespace internal {
 
 #if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
-static Packet4ui p4ui_CONJ_XOR = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 }; //vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_MZERO);
+inline Packet4ui p4ui_CONJ_XOR() {
+  return { 0x00000000, 0x80000000, 0x00000000, 0x80000000 }; //vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_MZERO);
+}
 #endif
 
 static Packet2ul p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2d_ZERO_, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
@@ -91,8 +93,18 @@ template<> struct packet_traits<std::complex<double> > : default_packet_traits
 };
 };
 
-template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2cf half; };
-template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet1cd half; };
+template<> struct unpacket_traits<Packet2cf> {
+  typedef std::complex<float> type;
+  enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+  typedef Packet2cf half;
+  typedef Packet4f as_real;
+};
+template<> struct unpacket_traits<Packet1cd> {
+  typedef std::complex<double> type;
+  enum {size=1, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
+  typedef Packet1cd half;
+  typedef Packet2d as_real;
+};
 
 /* Forward declaration */
 EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel);
|
|||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a)
|
template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a)
|
||||||
{
|
{
|
||||||
std::complex<double> EIGEN_ALIGN16 res;
|
EIGEN_ALIGN16 std::complex<double> res;
|
||||||
pstore<std::complex<double> >(&res, a);
|
pstore<std::complex<double> >(&res, a);
|
||||||
|
|
||||||
return res;
|
return res;
|
||||||
@@ -169,10 +181,7 @@ EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)
|
|||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
|
template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
|
||||||
{
|
{
|
||||||
// TODO optimize it for AltiVec
|
return pdiv_complex(a, b);
|
||||||
Packet1cd res = pmul(a,pconj(b));
|
|
||||||
Packet2d s = vec_madd(b.v, b.v, p2d_ZERO_);
|
|
||||||
return Packet1cd(pdiv(res.v, s + vec_perm(s, s, p16uc_REVERSE64)));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
|
EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
|
||||||
@@ -195,7 +204,7 @@ template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<f
|
|||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a)
|
template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a)
|
||||||
{
|
{
|
||||||
std::complex<float> EIGEN_ALIGN16 res[2];
|
EIGEN_ALIGN16 std::complex<float> res[2];
|
||||||
pstore<std::complex<float> >(res, a);
|
pstore<std::complex<float> >(res, a);
|
||||||
|
|
||||||
return res[0];
|
return res[0];
|
||||||
@@ -225,14 +234,14 @@ template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<flo
|
|||||||
|
|
||||||
template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
|
template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
|
||||||
{
|
{
|
||||||
std::complex<float> EIGEN_ALIGN16 af[2];
|
EIGEN_ALIGN16 std::complex<float> af[2];
|
||||||
af[0] = from[0*stride];
|
af[0] = from[0*stride];
|
||||||
af[1] = from[1*stride];
|
af[1] = from[1*stride];
|
||||||
return pload<Packet2cf>(af);
|
return pload<Packet2cf>(af);
|
||||||
}
|
}
|
||||||
template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
|
template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
|
||||||
{
|
{
|
||||||
std::complex<float> EIGEN_ALIGN16 af[2];
|
EIGEN_ALIGN16 std::complex<float> af[2];
|
||||||
pstore<std::complex<float> >((std::complex<float> *) af, from);
|
pstore<std::complex<float> >((std::complex<float> *) af, from);
|
||||||
to[0*stride] = af[0];
|
to[0*stride] = af[0];
|
||||||
to[1*stride] = af[1];
|
to[1*stride] = af[1];
|
||||||
@@ -308,11 +317,7 @@ EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
|
|||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
|
template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
|
||||||
{
|
{
|
||||||
// TODO optimize it for AltiVec
|
return pdiv_complex(a, b);
|
||||||
Packet2cf res;
|
|
||||||
res.cd[0] = pdiv<Packet1cd>(a.cd[0], b.cd[0]);
|
|
||||||
res.cd[1] = pdiv<Packet1cd>(a.cd[1], b.cd[1]);
|
|
||||||
return res;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
EIGEN_STRONG_INLINE Packet2cf pcplxflip/*<Packet2cf>*/(const Packet2cf& x)
|
EIGEN_STRONG_INLINE Packet2cf pcplxflip/*<Packet2cf>*/(const Packet2cf& x)
|
||||||
@@ -342,7 +347,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packe
|
|||||||
Packet4f tmp = { eq[1], eq[0], eq[3], eq[2] };
|
Packet4f tmp = { eq[1], eq[0], eq[3], eq[2] };
|
||||||
return (Packet2cf)pand<Packet4f>(eq, tmp);
|
return (Packet2cf)pand<Packet4f>(eq, tmp);
|
||||||
}
|
}
|
||||||
template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { return Packet2cf(pxor<Packet4f>(a.v, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR))); }
|
template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { return Packet2cf(pxor<Packet4f>(a.v, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR()))); }
|
||||||
template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
|
template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
|
||||||
{
|
{
|
||||||
Packet4f a_re, a_im, prod, prod_im;
|
Packet4f a_re, a_im, prod, prod_im;
|
||||||
@@ -355,7 +360,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, con
|
|||||||
|
|
||||||
// multiply a_im * b and get the conjugate result
|
// multiply a_im * b and get the conjugate result
|
||||||
prod_im = a_im * b.v;
|
prod_im = a_im * b.v;
|
||||||
prod_im = pxor<Packet4f>(prod_im, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR));
|
prod_im = pxor<Packet4f>(prod_im, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR()));
|
||||||
// permute back to a proper order
|
// permute back to a proper order
|
||||||
prod_im = vec_perm(prod_im, prod_im, p16uc_COMPLEX32_REV);
|
prod_im = vec_perm(prod_im, prod_im, p16uc_COMPLEX32_REV);
|
||||||
|
|
||||||
@@ -394,10 +399,7 @@ EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
|
|||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
|
template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
|
||||||
{
|
{
|
||||||
// TODO optimize it for AltiVec
|
return pdiv_complex(a, b);
|
||||||
Packet2cf res = pmul(a, pconj(b));
|
|
||||||
Packet4f s = pmul<Packet4f>(b.v, b.v);
|
|
||||||
return Packet2cf(pdiv(res.v, padd<Packet4f>(s, vec_perm(s, s, p16uc_COMPLEX32_REV))));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& x)
|
template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& x)
|
||||||
|
|||||||
@@ -91,8 +91,8 @@ static _EIGEN_DECLARE_CONST_FAST_Packet2l(ZERO, 0);
 static _EIGEN_DECLARE_CONST_FAST_Packet2l(ONE, 1);
 
 static Packet2d p2d_ONE = { 1.0, 1.0 };
-static Packet2d p2d_ZERO_ = { numext::bit_cast<double>0x8000000000000000ull),
-                              numext::bit_cast<double>0x8000000000000000ull) };
+static Packet2d p2d_ZERO_ = { numext::bit_cast<double>(0x8000000000000000ull),
+                              numext::bit_cast<double>(0x8000000000000000ull) };
 
 #if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
 #define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \
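Aside (not from the patch): the constant built above is the IEEE-754 sign-bit mask, i.e. the bit pattern of -0.0, obtained by bit-casting the 64-bit integer. A portable sketch of the same idea, using memcpy as a stand-in for numext::bit_cast:

// Illustration only: producing -0.0 from its bit pattern.
#include <cstdint>
#include <cstring>
#include <cstdio>

double bit_cast_double(std::uint64_t bits) {
  double d;
  std::memcpy(&d, &bits, sizeof(d));  // well-defined replacement for a type pun
  return d;
}

int main() {
  std::printf("%g\n", bit_cast_double(0x8000000000000000ull));  // prints -0
  return 0;
}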
@@ -358,7 +358,7 @@ pbroadcast4<Packet2d>(const double *a,
|
|||||||
|
|
||||||
template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
|
template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
|
||||||
{
|
{
|
||||||
int EIGEN_ALIGN16 ai[4];
|
EIGEN_ALIGN16 int ai[4];
|
||||||
ai[0] = from[0*stride];
|
ai[0] = from[0*stride];
|
||||||
ai[1] = from[1*stride];
|
ai[1] = from[1*stride];
|
||||||
ai[2] = from[2*stride];
|
ai[2] = from[2*stride];
|
||||||
@@ -368,7 +368,7 @@ template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* f
|
|||||||
|
|
||||||
template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
|
template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
|
||||||
{
|
{
|
||||||
double EIGEN_ALIGN16 af[2];
|
EIGEN_ALIGN16 double af[2];
|
||||||
af[0] = from[0*stride];
|
af[0] = from[0*stride];
|
||||||
af[1] = from[1*stride];
|
af[1] = from[1*stride];
|
||||||
return pload<Packet2d>(af);
|
return pload<Packet2d>(af);
|
||||||
@@ -376,7 +376,7 @@ template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const dou
|
|||||||
|
|
||||||
template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
|
template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
|
||||||
{
|
{
|
||||||
int EIGEN_ALIGN16 ai[4];
|
EIGEN_ALIGN16 int ai[4];
|
||||||
pstore<int>((int *)ai, from);
|
pstore<int>((int *)ai, from);
|
||||||
to[0*stride] = ai[0];
|
to[0*stride] = ai[0];
|
||||||
to[1*stride] = ai[1];
|
to[1*stride] = ai[1];
|
||||||
@@ -386,7 +386,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const
|
|||||||
|
|
||||||
template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
|
template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
|
||||||
{
|
{
|
||||||
double EIGEN_ALIGN16 af[2];
|
EIGEN_ALIGN16 double af[2];
|
||||||
pstore<double>(af, from);
|
pstore<double>(af, from);
|
||||||
to[0*stride] = af[0];
|
to[0*stride] = af[0];
|
||||||
to[1*stride] = af[1];
|
to[1*stride] = af[1];
|
||||||
@@ -460,8 +460,8 @@ template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d&
|
|||||||
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
|
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
|
||||||
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
|
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; pstore(x, a); return x[0]; }
|
template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { EIGEN_ALIGN16 int x[4]; pstore(x, a); return x[0]; }
|
||||||
template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore(x, a); return x[0]; }
|
template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { EIGEN_ALIGN16 double x[2]; pstore(x, a); return x[0]; }
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
|
template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
|
||||||
{
|
{
|
||||||
@@ -639,7 +639,7 @@ pbroadcast4<Packet4f>(const float *a,
|
|||||||
|
|
||||||
template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
|
template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
|
||||||
{
|
{
|
||||||
float EIGEN_ALIGN16 ai[4];
|
EIGEN_ALIGN16 float ai[4];
|
||||||
ai[0] = from[0*stride];
|
ai[0] = from[0*stride];
|
||||||
ai[1] = from[1*stride];
|
ai[1] = from[1*stride];
|
||||||
ai[2] = from[2*stride];
|
ai[2] = from[2*stride];
|
||||||
@@ -649,7 +649,7 @@ template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const floa
|
|||||||
|
|
||||||
template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
|
template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
|
||||||
{
|
{
|
||||||
float EIGEN_ALIGN16 ai[4];
|
EIGEN_ALIGN16 float ai[4];
|
||||||
pstore<float>((float *)ai, from);
|
pstore<float>((float *)ai, from);
|
||||||
to[0*stride] = ai[0];
|
to[0*stride] = ai[0];
|
||||||
to[1*stride] = ai[1];
|
to[1*stride] = ai[1];
|
||||||
@@ -785,7 +785,7 @@ template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
|
|||||||
return p;
|
return p;
|
||||||
}
|
}
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[2]; vec_st2f(a.v4f[0], &x[0]); return x[0]; }
|
template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { EIGEN_ALIGN16 float x[2]; vec_st2f(a.v4f[0], &x[0]); return x[0]; }
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
|
template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
|
||||||
{
|
{
|
||||||
@@ -943,7 +943,7 @@ pbroadcast4<Packet4f>(const float *a,
|
|||||||
|
|
||||||
template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
|
template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
|
||||||
{
|
{
|
||||||
float EIGEN_ALIGN16 af[4];
|
EIGEN_ALIGN16 float af[4];
|
||||||
af[0] = from[0*stride];
|
af[0] = from[0*stride];
|
||||||
af[1] = from[1*stride];
|
af[1] = from[1*stride];
|
||||||
af[2] = from[2*stride];
|
af[2] = from[2*stride];
|
||||||
@@ -953,7 +953,7 @@ template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const floa
|
|||||||
|
|
||||||
template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
|
template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
|
||||||
{
|
{
|
||||||
float EIGEN_ALIGN16 af[4];
|
EIGEN_ALIGN16 float af[4];
|
||||||
pstore<float>((float*)af, from);
|
pstore<float>((float*)af, from);
|
||||||
to[0*stride] = af[0];
|
to[0*stride] = af[0];
|
||||||
to[1*stride] = af[1];
|
to[1*stride] = af[1];
|
||||||
@@ -978,7 +978,7 @@ template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f> (const Packet4f& a) { r
|
|||||||
template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f> (const Packet4f& a) { return vec_ceil(a); }
|
template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f> (const Packet4f& a) { return vec_ceil(a); }
|
||||||
template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f> (const Packet4f& a) { return vec_floor(a); }
|
template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f> (const Packet4f& a) { return vec_floor(a); }
|
||||||
template<> EIGEN_STRONG_INLINE Packet4f pabs<Packet4f> (const Packet4f& a) { return vec_abs(a); }
|
template<> EIGEN_STRONG_INLINE Packet4f pabs<Packet4f> (const Packet4f& a) { return vec_abs(a); }
|
||||||
template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; pstore(x, a); return x[0]; }
|
template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { EIGEN_ALIGN16 float x[4]; pstore(x, a); return x[0]; }
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
|
template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -2269,8 +2269,8 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Pa
   bool gone_half = false, gone_quarter = false, gone_last = false;
 
   Index i = 0;
-  int pack = Pack1;
-  int psize = PacketSize;
+  Index pack = Pack1;
+  Index psize = PacketSize;
   while(pack>0)
   {
     Index remaining_rows = rows-i;
@@ -2290,21 +2290,21 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Pa
       {
         if (psize == PacketSize) {
           PacketBlock<Packet> kernel;
-          for (int p = 0; p < psize; ++p) kernel.packet[p] = lhs.template loadPacket<Packet>(i+p+m, k);
+          for (Index p = 0; p < psize; ++p) kernel.packet[p] = lhs.template loadPacket<Packet>(i+p+m, k);
           ptranspose(kernel);
-          for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p]));
+          for (Index p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p]));
         } else if (HasHalf && psize == HalfPacketSize) {
           gone_half = true;
           PacketBlock<HalfPacket> kernel_half;
-          for (int p = 0; p < psize; ++p) kernel_half.packet[p] = lhs.template loadPacket<HalfPacket>(i+p+m, k);
+          for (Index p = 0; p < psize; ++p) kernel_half.packet[p] = lhs.template loadPacket<HalfPacket>(i+p+m, k);
           ptranspose(kernel_half);
-          for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_half.packet[p]));
+          for (Index p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_half.packet[p]));
         } else if (HasQuarter && psize == QuarterPacketSize) {
           gone_quarter = true;
           PacketBlock<QuarterPacket> kernel_quarter;
-          for (int p = 0; p < psize; ++p) kernel_quarter.packet[p] = lhs.template loadPacket<QuarterPacket>(i+p+m, k);
+          for (Index p = 0; p < psize; ++p) kernel_quarter.packet[p] = lhs.template loadPacket<QuarterPacket>(i+p+m, k);
           ptranspose(kernel_quarter);
-          for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_quarter.packet[p]));
+          for (Index p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_quarter.packet[p]));
         }
       }
       count += psize*pack;
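Aside (an assumption about the motivation, not stated in the patch): switching the packing counters from int to Index keeps every operand in the same signed 64-bit type (Eigen's Index defaults to std::ptrdiff_t), so expressions such as count + psize*pack no longer mix widths or trigger narrowing warnings on large problems. A minimal sketch:

// Illustration only: uniform Index arithmetic in the packing loop.
#include <cstddef>

using Index = std::ptrdiff_t;  // Eigen's default Index type

Index packed_size(Index pack, Index psize, Index count) {
  return count + psize * pack;  // every operand has the same 64-bit type
}

int main() { return packed_size(4, 4, 0) == 16 ? 0 : 1; }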
|||||||
@@ -59,9 +59,9 @@ typedef gebp_traits<LhsScalar,RhsScalar> Traits;
|
|||||||
|
|
||||||
typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
|
typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
|
||||||
static void run(Index rows, Index cols, Index depth,
|
static void run(Index rows, Index cols, Index depth,
|
||||||
const LhsScalar* _lhs, Index lhsStride,
|
const LhsScalar* lhs_, Index lhsStride,
|
||||||
const RhsScalar* _rhs, Index rhsStride,
|
const RhsScalar* rhs_, Index rhsStride,
|
||||||
ResScalar* _res, Index resIncr, Index resStride,
|
ResScalar* res_, Index resIncr, Index resStride,
|
||||||
ResScalar alpha,
|
ResScalar alpha,
|
||||||
level3_blocking<LhsScalar,RhsScalar>& blocking,
|
level3_blocking<LhsScalar,RhsScalar>& blocking,
|
||||||
GemmParallelInfo<Index>* info = 0)
|
GemmParallelInfo<Index>* info = 0)
|
||||||
@@ -69,9 +69,9 @@ static void run(Index rows, Index cols, Index depth,
|
|||||||
typedef const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> LhsMapper;
|
typedef const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> LhsMapper;
|
||||||
typedef const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> RhsMapper;
|
typedef const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> RhsMapper;
|
||||||
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor,Unaligned,ResInnerStride> ResMapper;
|
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor,Unaligned,ResInnerStride> ResMapper;
|
||||||
LhsMapper lhs(_lhs, lhsStride);
|
LhsMapper lhs(lhs_, lhsStride);
|
||||||
RhsMapper rhs(_rhs, rhsStride);
|
RhsMapper rhs(rhs_, rhsStride);
|
||||||
ResMapper res(_res, resStride, resIncr);
|
ResMapper res(res_, resStride, resIncr);
|
||||||
|
|
||||||
Index kc = blocking.kc(); // cache block size along the K direction
|
Index kc = blocking.kc(); // cache block size along the K direction
|
||||||
Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
|
Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
|
||||||
|
|||||||
@@ -60,9 +60,9 @@ template <typename Index, typename LhsScalar, int LhsStorageOrder, bool Conjugat
 struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride,UpLo,Version>
 {
 typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
-static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* _lhs, Index lhsStride,
-const RhsScalar* _rhs, Index rhsStride,
-ResScalar* _res, Index resIncr, Index resStride,
+static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* lhs_, Index lhsStride,
+const RhsScalar* rhs_, Index rhsStride,
+ResScalar* res_, Index resIncr, Index resStride,
 const ResScalar& alpha, level3_blocking<LhsScalar,RhsScalar>& blocking)
 {
 typedef gebp_traits<LhsScalar,RhsScalar> Traits;
@@ -70,9 +70,9 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
 typedef const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> LhsMapper;
 typedef const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> RhsMapper;
 typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
-LhsMapper lhs(_lhs,lhsStride);
-RhsMapper rhs(_rhs,rhsStride);
-ResMapper res(_res, resStride, resIncr);
+LhsMapper lhs(lhs_,lhsStride);
+RhsMapper rhs(rhs_,rhsStride);
+ResMapper res(res_, resStride, resIncr);

 Index kc = blocking.kc();
 Index mc = (std::min)(size,blocking.mc());
@@ -113,7 +113,7 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
 gebp(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc,
 (std::min)(size,i2), alpha, -1, -1, 0, 0);

-sybb(_res+resStride*i2 + resIncr*i2, resIncr, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha);
+sybb(res_+resStride*i2 + resIncr*i2, resIncr, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha);

 if (UpLo==Upper)
 {
@@ -144,11 +144,11 @@ struct tribb_kernel
 enum {
 BlockSize = meta_least_common_multiple<EIGEN_PLAIN_ENUM_MAX(mr,nr),EIGEN_PLAIN_ENUM_MIN(mr,nr)>::ret
 };
-void operator()(ResScalar* _res, Index resIncr, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha)
+void operator()(ResScalar* res_, Index resIncr, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha)
 {
 typedef blas_data_mapper<ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
 typedef blas_data_mapper<ResScalar, Index, ColMajor, Unaligned> BufferMapper;
-ResMapper res(_res, resStride, resIncr);
+ResMapper res(res_, resStride, resIncr);
 gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, mr, nr, ConjLhs, ConjRhs> gebp_kernel1;
 gebp_kernel<LhsScalar, RhsScalar, Index, BufferMapper, mr, nr, ConjLhs, ConjRhs> gebp_kernel2;

@@ -300,14 +300,19 @@ struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,false>
 }
 };

-template<typename MatrixType, unsigned int UpLo>
-template<typename ProductType>
-EIGEN_DEVICE_FUNC TriangularView<MatrixType,UpLo>& TriangularViewImpl<MatrixType,UpLo,Dense>::_assignProduct(const ProductType& prod, const Scalar& alpha, bool beta)
-{
-EIGEN_STATIC_ASSERT((UpLo&UnitDiag)==0, WRITING_TO_TRIANGULAR_PART_WITH_UNIT_DIAGONAL_IS_NOT_SUPPORTED);
+template <typename _MatrixType, unsigned int _Mode>
+template <typename ProductType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename TriangularViewImpl<_MatrixType, _Mode, Dense>::TriangularViewType&
+TriangularViewImpl<_MatrixType, _Mode, Dense>::_assignProduct(
+const ProductType& prod, const typename TriangularViewImpl<_MatrixType, _Mode, Dense>::Scalar& alpha, bool beta) {
+EIGEN_STATIC_ASSERT((_Mode & UnitDiag) == 0, WRITING_TO_TRIANGULAR_PART_WITH_UNIT_DIAGONAL_IS_NOT_SUPPORTED);
 eigen_assert(derived().nestedExpression().rows() == prod.rows() && derived().cols() == prod.cols());

-general_product_to_triangular_selector<MatrixType, ProductType, UpLo, internal::traits<ProductType>::InnerSize==1>::run(derived().nestedExpression().const_cast_derived(), prod, alpha, beta);
+general_product_to_triangular_selector<_MatrixType, ProductType, _Mode,
+internal::traits<ProductType>::InnerSize == 1>::run(derived()
+.nestedExpression()
+.const_cast_derived(),
+prod, alpha, beta);

 return derived();
 }
@@ -359,6 +359,11 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,Lhs
 HasQuarter = (int)ResPacketSizeQuarter < (int)ResPacketSizeHalf
 };

+typedef typename make_unsigned<Index>::type UnsignedIndex;
+const Index fullColBlockEnd = LhsPacketSize * (UnsignedIndex(cols) / LhsPacketSize);
+const Index halfColBlockEnd = LhsPacketSizeHalf * (UnsignedIndex(cols) / LhsPacketSizeHalf);
+const Index quarterColBlockEnd = LhsPacketSizeQuarter * (UnsignedIndex(cols) / LhsPacketSizeQuarter);

 Index i=0;
 for(; i<n8; i+=8)
 {
@@ -371,8 +376,7 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,Lhs
 c6 = pset1<ResPacket>(ResScalar(0)),
 c7 = pset1<ResPacket>(ResScalar(0));

-Index j=0;
-for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
+for (Index j = 0; j < fullColBlockEnd; j += LhsPacketSize)
 {
 RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j,0);

@@ -393,7 +397,8 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,Lhs
 ResScalar cc5 = predux(c5);
 ResScalar cc6 = predux(c6);
 ResScalar cc7 = predux(c7);
-for(; j<cols; ++j)
+for (Index j = fullColBlockEnd; j < cols; ++j)
 {
 RhsScalar b0 = rhs(j,0);

@@ -422,8 +427,7 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,Lhs
 c2 = pset1<ResPacket>(ResScalar(0)),
 c3 = pset1<ResPacket>(ResScalar(0));

-Index j=0;
-for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
+for (Index j = 0; j < fullColBlockEnd; j += LhsPacketSize)
 {
 RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j,0);

@@ -436,7 +440,8 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,Lhs
 ResScalar cc1 = predux(c1);
 ResScalar cc2 = predux(c2);
 ResScalar cc3 = predux(c3);
-for(; j<cols; ++j)
+for(Index j = fullColBlockEnd; j < cols; ++j)
 {
 RhsScalar b0 = rhs(j,0);

@@ -455,8 +460,7 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,Lhs
 ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
 c1 = pset1<ResPacket>(ResScalar(0));

-Index j=0;
-for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
+for (Index j = 0; j < fullColBlockEnd; j += LhsPacketSize)
 {
 RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j,0);

@@ -465,7 +469,8 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,Lhs
 }
 ResScalar cc0 = predux(c0);
 ResScalar cc1 = predux(c1);
-for(; j<cols; ++j)
+for(Index j = fullColBlockEnd; j < cols; ++j)
 {
 RhsScalar b0 = rhs(j,0);

@@ -480,15 +485,15 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,Lhs
 ResPacket c0 = pset1<ResPacket>(ResScalar(0));
 ResPacketHalf c0_h = pset1<ResPacketHalf>(ResScalar(0));
 ResPacketQuarter c0_q = pset1<ResPacketQuarter>(ResScalar(0));
-Index j=0;
-for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
+for (Index j = 0; j < fullColBlockEnd; j += LhsPacketSize)
 {
 RhsPacket b0 = rhs.template load<RhsPacket,Unaligned>(j,0);
 c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i,j),b0,c0);
 }
 ResScalar cc0 = predux(c0);
 if (HasHalf) {
-for(; j+LhsPacketSizeHalf<=cols; j+=LhsPacketSizeHalf)
+for (Index j = fullColBlockEnd; j < halfColBlockEnd; j += LhsPacketSizeHalf)
 {
 RhsPacketHalf b0 = rhs.template load<RhsPacketHalf,Unaligned>(j,0);
 c0_h = pcj_half.pmadd(lhs.template load<LhsPacketHalf,LhsAlignment>(i,j),b0,c0_h);
@@ -496,14 +501,14 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,Lhs
 cc0 += predux(c0_h);
 }
 if (HasQuarter) {
-for(; j+LhsPacketSizeQuarter<=cols; j+=LhsPacketSizeQuarter)
+for (Index j = halfColBlockEnd; j < quarterColBlockEnd; j += LhsPacketSizeQuarter)
 {
 RhsPacketQuarter b0 = rhs.template load<RhsPacketQuarter,Unaligned>(j,0);
 c0_q = pcj_quarter.pmadd(lhs.template load<LhsPacketQuarter,LhsAlignment>(i,j),b0,c0_q);
 }
 cc0 += predux(c0_q);
 }
-for(; j<cols; ++j)
+for (Index j = quarterColBlockEnd; j < cols; ++j)
 {
 cc0 += cj.pmul(lhs(i,j), rhs(j,0));
 }
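The `fullColBlockEnd` / `halfColBlockEnd` / `quarterColBlockEnd` hunks above all apply the same column-blocking idiom: each block end is the largest multiple of the corresponding packet width that does not exceed `cols`, so every loop can start where the previous, wider one stopped instead of carrying a shared `j` counter across loops. A minimal scalar sketch of that idiom follows; all names are hypothetical and plain arithmetic stands in for the packet loads, so this is an illustration of the pattern, not Eigen's kernel.

```cpp
#include <cstddef>

// Scalar stand-in for the packet/half-packet/quarter-packet loops above.
// Each *End is the largest multiple of its width that fits in n, and each
// loop resumes exactly where the previous, wider one stopped.
double blocked_dot(const double* a, const double* b, std::ptrdiff_t n) {
  const std::ptrdiff_t fullW = 8, halfW = 4, quarterW = 2;  // stand-ins for packet sizes
  const std::ptrdiff_t fullEnd    = fullW    * (n / fullW);
  const std::ptrdiff_t halfEnd    = halfW    * (n / halfW);
  const std::ptrdiff_t quarterEnd = quarterW * (n / quarterW);
  double acc = 0;
  for (std::ptrdiff_t j = 0; j < fullEnd; j += fullW)              // "full packet" loop
    for (std::ptrdiff_t k = 0; k < fullW; ++k) acc += a[j + k] * b[j + k];
  for (std::ptrdiff_t j = fullEnd; j < halfEnd; j += halfW)        // "half packet" tail
    for (std::ptrdiff_t k = 0; k < halfW; ++k) acc += a[j + k] * b[j + k];
  for (std::ptrdiff_t j = halfEnd; j < quarterEnd; j += quarterW)  // "quarter packet" tail
    for (std::ptrdiff_t k = 0; k < quarterW; ++k) acc += a[j + k] * b[j + k];
  for (std::ptrdiff_t j = quarterEnd; j < n; ++j)                  // scalar remainder
    acc += a[j] * b[j];
  return acc;
}
```

Because each narrower width divides the wider one, the block ends are non-decreasing and the loops partition `[0, n)` without gaps or overlaps.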
@@ -43,7 +43,7 @@ struct symm_pack_lhs
 for(Index w=0; w<BlockRows; w++)
 blockA[count++] = numext::conj(lhs(k, i+w)); // transposed
 }
-void operator()(Scalar* blockA, const Scalar* _lhs, Index lhsStride, Index cols, Index rows)
+void operator()(Scalar* blockA, const Scalar* lhs_, Index lhsStride, Index cols, Index rows)
 {
 typedef typename unpacket_traits<typename packet_traits<Scalar>::type>::half HalfPacket;
 typedef typename unpacket_traits<typename unpacket_traits<typename packet_traits<Scalar>::type>::half>::half QuarterPacket;
@@ -53,7 +53,7 @@ struct symm_pack_lhs
 HasHalf = (int)HalfPacketSize < (int)PacketSize,
 HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize};

-const_blas_data_mapper<Scalar,Index,StorageOrder> lhs(_lhs,lhsStride);
+const_blas_data_mapper<Scalar,Index,StorageOrder> lhs(lhs_,lhsStride);
 Index count = 0;
 //Index peeled_mc3 = (rows/Pack1)*Pack1;

@@ -101,11 +101,11 @@ template<typename Scalar, typename Index, int nr, int StorageOrder>
 struct symm_pack_rhs
 {
 enum { PacketSize = packet_traits<Scalar>::size };
-void operator()(Scalar* blockB, const Scalar* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
+void operator()(Scalar* blockB, const Scalar* rhs_, Index rhsStride, Index rows, Index cols, Index k2)
 {
 Index end_k = k2 + rows;
 Index count = 0;
-const_blas_data_mapper<Scalar,Index,StorageOrder> rhs(_rhs,rhsStride);
+const_blas_data_mapper<Scalar,Index,StorageOrder> rhs(rhs_,rhsStride);
 Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0;
 Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;

@@ -330,8 +330,8 @@ struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,ConjugateLhs

 static EIGEN_DONT_INLINE void run(
 Index rows, Index cols,
-const Scalar* _lhs, Index lhsStride,
-const Scalar* _rhs, Index rhsStride,
+const Scalar* lhs_, Index lhsStride,
+const Scalar* rhs_, Index rhsStride,
 Scalar* res, Index resIncr, Index resStride,
 const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking);
 };
@@ -342,9 +342,9 @@ template <typename Scalar, typename Index,
 int ResInnerStride>
 EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,ConjugateLhs, RhsStorageOrder,false,ConjugateRhs,ColMajor,ResInnerStride>::run(
 Index rows, Index cols,
-const Scalar* _lhs, Index lhsStride,
-const Scalar* _rhs, Index rhsStride,
-Scalar* _res, Index resIncr, Index resStride,
+const Scalar* lhs_, Index lhsStride,
+const Scalar* rhs_, Index rhsStride,
+Scalar* res_, Index resIncr, Index resStride,
 const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
 {
 Index size = rows;
@@ -355,10 +355,10 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
 typedef const_blas_data_mapper<Scalar, Index, (LhsStorageOrder == RowMajor) ? ColMajor : RowMajor> LhsTransposeMapper;
 typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
 typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
-LhsMapper lhs(_lhs,lhsStride);
-LhsTransposeMapper lhs_transpose(_lhs,lhsStride);
-RhsMapper rhs(_rhs,rhsStride);
-ResMapper res(_res, resStride, resIncr);
+LhsMapper lhs(lhs_,lhsStride);
+LhsTransposeMapper lhs_transpose(lhs_,lhsStride);
+RhsMapper rhs(rhs_,rhsStride);
+ResMapper res(res_, resStride, resIncr);

 Index kc = blocking.kc(); // cache block size along the K direction
 Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
@@ -425,8 +425,8 @@ struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,ConjugateLh

 static EIGEN_DONT_INLINE void run(
 Index rows, Index cols,
-const Scalar* _lhs, Index lhsStride,
-const Scalar* _rhs, Index rhsStride,
+const Scalar* lhs_, Index lhsStride,
+const Scalar* rhs_, Index rhsStride,
 Scalar* res, Index resIncr, Index resStride,
 const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking);
 };
@@ -437,9 +437,9 @@ template <typename Scalar, typename Index,
 int ResInnerStride>
 EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,ConjugateLhs, RhsStorageOrder,true,ConjugateRhs,ColMajor,ResInnerStride>::run(
 Index rows, Index cols,
-const Scalar* _lhs, Index lhsStride,
-const Scalar* _rhs, Index rhsStride,
-Scalar* _res, Index resIncr, Index resStride,
+const Scalar* lhs_, Index lhsStride,
+const Scalar* rhs_, Index rhsStride,
+Scalar* res_, Index resIncr, Index resStride,
 const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
 {
 Index size = cols;
@@ -448,8 +448,8 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f

 typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
 typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
-LhsMapper lhs(_lhs,lhsStride);
-ResMapper res(_res,resStride, resIncr);
+LhsMapper lhs(lhs_,lhsStride);
+ResMapper res(res_,resStride, resIncr);

 Index kc = blocking.kc(); // cache block size along the K direction
 Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
@@ -466,7 +466,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f
 {
 const Index actual_kc = (std::min)(k2+kc,size)-k2;

-pack_rhs(blockB, _rhs, rhsStride, actual_kc, cols, k2);
+pack_rhs(blockB, rhs_, rhsStride, actual_kc, cols, k2);

 // => GEPP
 for(Index i2=0; i2<rows; i2+=mc)
@@ -18,10 +18,10 @@ namespace internal {
 // struct gemm_pack_lhs_triangular
 // {
 // Matrix<Scalar,mr,mr,
-// void operator()(Scalar* blockA, const EIGEN_RESTRICT Scalar* _lhs, int lhsStride, int depth, int rows)
+// void operator()(Scalar* blockA, const EIGEN_RESTRICT Scalar* lhs_, int lhsStride, int depth, int rows)
 // {
 // conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
-// const_blas_data_mapper<Scalar, StorageOrder> lhs(_lhs,lhsStride);
+// const_blas_data_mapper<Scalar, StorageOrder> lhs(lhs_,lhsStride);
 // int count = 0;
 // const int peeled_mc = (rows/mr)*mr;
 // for(int i=0; i<peeled_mc; i+=mr)
@@ -96,8 +96,8 @@ struct product_triangular_matrix_matrix<Scalar,Index,Mode,true,

 static EIGEN_DONT_INLINE void run(
 Index _rows, Index _cols, Index _depth,
-const Scalar* _lhs, Index lhsStride,
-const Scalar* _rhs, Index rhsStride,
+const Scalar* lhs_, Index lhsStride,
+const Scalar* rhs_, Index rhsStride,
 Scalar* res, Index resIncr, Index resStride,
 const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking);
 };
@@ -110,9 +110,9 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
 LhsStorageOrder,ConjugateLhs,
 RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride,Version>::run(
 Index _rows, Index _cols, Index _depth,
-const Scalar* _lhs, Index lhsStride,
-const Scalar* _rhs, Index rhsStride,
-Scalar* _res, Index resIncr, Index resStride,
+const Scalar* lhs_, Index lhsStride,
+const Scalar* rhs_, Index rhsStride,
+Scalar* res_, Index resIncr, Index resStride,
 const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
 {
 // strip zeros
@@ -124,9 +124,9 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
 typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
 typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
 typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
-LhsMapper lhs(_lhs,lhsStride);
-RhsMapper rhs(_rhs,rhsStride);
-ResMapper res(_res, resStride, resIncr);
+LhsMapper lhs(lhs_,lhsStride);
+RhsMapper rhs(rhs_,rhsStride);
+ResMapper res(res_, resStride, resIncr);

 Index kc = blocking.kc(); // cache block size along the K direction
 Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
@@ -254,8 +254,8 @@ struct product_triangular_matrix_matrix<Scalar,Index,Mode,false,

 static EIGEN_DONT_INLINE void run(
 Index _rows, Index _cols, Index _depth,
-const Scalar* _lhs, Index lhsStride,
-const Scalar* _rhs, Index rhsStride,
+const Scalar* lhs_, Index lhsStride,
+const Scalar* rhs_, Index rhsStride,
 Scalar* res, Index resIncr, Index resStride,
 const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking);
 };
@@ -268,9 +268,9 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
 LhsStorageOrder,ConjugateLhs,
 RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride,Version>::run(
 Index _rows, Index _cols, Index _depth,
-const Scalar* _lhs, Index lhsStride,
-const Scalar* _rhs, Index rhsStride,
-Scalar* _res, Index resIncr, Index resStride,
+const Scalar* lhs_, Index lhsStride,
+const Scalar* rhs_, Index rhsStride,
+Scalar* res_, Index resIncr, Index resStride,
 const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
 {
 const Index PacketBytes = packet_traits<Scalar>::size*sizeof(Scalar);
@@ -283,9 +283,9 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
 typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
 typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
 typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
-LhsMapper lhs(_lhs,lhsStride);
-RhsMapper rhs(_rhs,rhsStride);
-ResMapper res(_res, resStride, resIncr);
+LhsMapper lhs(lhs_,lhsStride);
+RhsMapper rhs(rhs_,rhsStride);
+ResMapper res(res_, resStride, resIncr);

 Index kc = blocking.kc(); // cache block size along the K direction
 Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
@@ -26,30 +26,30 @@ struct triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,C
 HasUnitDiag = (Mode & UnitDiag)==UnitDiag,
 HasZeroDiag = (Mode & ZeroDiag)==ZeroDiag
 };
-static EIGEN_DONT_INLINE void run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride,
-const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const RhsScalar& alpha);
+static EIGEN_DONT_INLINE void run(Index rows_, Index cols_, const LhsScalar* lhs_, Index lhsStride,
+const RhsScalar* rhs_, Index rhsIncr, ResScalar* res_, Index resIncr, const RhsScalar& alpha);
 };

 template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs, int Version>
 EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,ColMajor,Version>
-::run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride,
-const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const RhsScalar& alpha)
+::run(Index rows_, Index cols_, const LhsScalar* lhs_, Index lhsStride,
+const RhsScalar* rhs_, Index rhsIncr, ResScalar* res_, Index resIncr, const RhsScalar& alpha)
 {
 static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH;
-Index size = (std::min)(_rows,_cols);
-Index rows = IsLower ? _rows : (std::min)(_rows,_cols);
-Index cols = IsLower ? (std::min)(_rows,_cols) : _cols;
+Index size = (std::min)(rows_,cols_);
+Index rows = IsLower ? rows_ : (std::min)(rows_,cols_);
+Index cols = IsLower ? (std::min)(rows_,cols_) : cols_;

 typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,ColMajor>, 0, OuterStride<> > LhsMap;
-const LhsMap lhs(_lhs,rows,cols,OuterStride<>(lhsStride));
+const LhsMap lhs(lhs_,rows,cols,OuterStride<>(lhsStride));
 typename conj_expr_if<ConjLhs,LhsMap>::type cjLhs(lhs);

 typedef Map<const Matrix<RhsScalar,Dynamic,1>, 0, InnerStride<> > RhsMap;
-const RhsMap rhs(_rhs,cols,InnerStride<>(rhsIncr));
+const RhsMap rhs(rhs_,cols,InnerStride<>(rhsIncr));
 typename conj_expr_if<ConjRhs,RhsMap>::type cjRhs(rhs);

 typedef Map<Matrix<ResScalar,Dynamic,1> > ResMap;
-ResMap res(_res,rows);
+ResMap res(res_,rows);

 typedef const_blas_data_mapper<LhsScalar,Index,ColMajor> LhsMapper;
 typedef const_blas_data_mapper<RhsScalar,Index,RowMajor> RhsMapper;
@@ -84,7 +84,7 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,Con
 rows, cols-size,
 LhsMapper(&lhs.coeffRef(0,size), lhsStride),
 RhsMapper(&rhs.coeffRef(size), rhsIncr),
-_res, resIncr, alpha);
+res_, resIncr, alpha);
 }
 }

@@ -97,30 +97,30 @@ struct triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,C
 HasUnitDiag = (Mode & UnitDiag)==UnitDiag,
 HasZeroDiag = (Mode & ZeroDiag)==ZeroDiag
 };
-static EIGEN_DONT_INLINE void run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride,
-const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const ResScalar& alpha);
+static EIGEN_DONT_INLINE void run(Index rows_, Index cols_, const LhsScalar* lhs_, Index lhsStride,
+const RhsScalar* rhs_, Index rhsIncr, ResScalar* res_, Index resIncr, const ResScalar& alpha);
 };

 template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs,int Version>
 EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,RowMajor,Version>
-::run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride,
-const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const ResScalar& alpha)
+::run(Index rows_, Index cols_, const LhsScalar* lhs_, Index lhsStride,
+const RhsScalar* rhs_, Index rhsIncr, ResScalar* res_, Index resIncr, const ResScalar& alpha)
 {
 static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH;
-Index diagSize = (std::min)(_rows,_cols);
-Index rows = IsLower ? _rows : diagSize;
-Index cols = IsLower ? diagSize : _cols;
+Index diagSize = (std::min)(rows_,cols_);
+Index rows = IsLower ? rows_ : diagSize;
+Index cols = IsLower ? diagSize : cols_;

 typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,RowMajor>, 0, OuterStride<> > LhsMap;
-const LhsMap lhs(_lhs,rows,cols,OuterStride<>(lhsStride));
+const LhsMap lhs(lhs_,rows,cols,OuterStride<>(lhsStride));
 typename conj_expr_if<ConjLhs,LhsMap>::type cjLhs(lhs);

 typedef Map<const Matrix<RhsScalar,Dynamic,1> > RhsMap;
-const RhsMap rhs(_rhs,cols);
+const RhsMap rhs(rhs_,cols);
 typename conj_expr_if<ConjRhs,RhsMap>::type cjRhs(rhs);

 typedef Map<Matrix<ResScalar,Dynamic,1>, 0, InnerStride<> > ResMap;
-ResMap res(_res,rows,InnerStride<>(resIncr));
+ResMap res(res_,rows,InnerStride<>(resIncr));

 typedef const_blas_data_mapper<LhsScalar,Index,RowMajor> LhsMapper;
 typedef const_blas_data_mapper<RhsScalar,Index,RowMajor> RhsMapper;
@@ -50,18 +50,18 @@ struct triangular_matrix_vector_product_trmv :
 #define EIGEN_BLAS_TRMV_SPECIALIZE(Scalar) \
 template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \
 struct triangular_matrix_vector_product<Index,Mode,Scalar,ConjLhs,Scalar,ConjRhs,ColMajor,Specialized> { \
-static void run(Index _rows, Index _cols, const Scalar* _lhs, Index lhsStride, \
-const Scalar* _rhs, Index rhsIncr, Scalar* _res, Index resIncr, Scalar alpha) { \
+static void run(Index rows_, Index cols_, const Scalar* lhs_, Index lhsStride, \
+const Scalar* rhs_, Index rhsIncr, Scalar* res_, Index resIncr, Scalar alpha) { \
 triangular_matrix_vector_product_trmv<Index,Mode,Scalar,ConjLhs,Scalar,ConjRhs,ColMajor>::run( \
-_rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \
+rows_, cols_, lhs_, lhsStride, rhs_, rhsIncr, res_, resIncr, alpha); \
 } \
 }; \
 template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \
 struct triangular_matrix_vector_product<Index,Mode,Scalar,ConjLhs,Scalar,ConjRhs,RowMajor,Specialized> { \
-static void run(Index _rows, Index _cols, const Scalar* _lhs, Index lhsStride, \
-const Scalar* _rhs, Index rhsIncr, Scalar* _res, Index resIncr, Scalar alpha) { \
+static void run(Index rows_, Index cols_, const Scalar* lhs_, Index lhsStride, \
+const Scalar* rhs_, Index rhsIncr, Scalar* res_, Index resIncr, Scalar alpha) { \
 triangular_matrix_vector_product_trmv<Index,Mode,Scalar,ConjLhs,Scalar,ConjRhs,RowMajor>::run( \
-_rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \
+rows_, cols_, lhs_, lhsStride, rhs_, rhsIncr, res_, resIncr, alpha); \
 } \
 };

@@ -81,23 +81,23 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,
 IsZeroDiag = (Mode&ZeroDiag) ? 1 : 0, \
 LowUp = IsLower ? Lower : Upper \
 }; \
-static void run(Index _rows, Index _cols, const EIGTYPE* _lhs, Index lhsStride, \
-const EIGTYPE* _rhs, Index rhsIncr, EIGTYPE* _res, Index resIncr, EIGTYPE alpha) \
+static void run(Index rows_, Index cols_, const EIGTYPE* lhs_, Index lhsStride, \
+const EIGTYPE* rhs_, Index rhsIncr, EIGTYPE* res_, Index resIncr, EIGTYPE alpha) \
 { \
 if (ConjLhs || IsZeroDiag) { \
 triangular_matrix_vector_product<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,ColMajor,BuiltIn>::run( \
-_rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \
+rows_, cols_, lhs_, lhsStride, rhs_, rhsIncr, res_, resIncr, alpha); \
 return; \
 }\
-Index size = (std::min)(_rows,_cols); \
-Index rows = IsLower ? _rows : size; \
-Index cols = IsLower ? size : _cols; \
+Index size = (std::min)(rows_,cols_); \
+Index rows = IsLower ? rows_ : size; \
+Index cols = IsLower ? size : cols_; \
 \
 typedef VectorX##EIGPREFIX VectorRhs; \
 EIGTYPE *x, *y;\
 \
 /* Set x*/ \
-Map<const VectorRhs, 0, InnerStride<> > rhs(_rhs,cols,InnerStride<>(rhsIncr)); \
+Map<const VectorRhs, 0, InnerStride<> > rhs(rhs_,cols,InnerStride<>(rhsIncr)); \
 VectorRhs x_tmp; \
 if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
 x = x_tmp.data(); \
@@ -121,24 +121,24 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,
 diag = IsUnitDiag ? 'U' : 'N'; \
 \
 /* call ?TRMV*/ \
-BLASPREFIX##trmv##BLASPOSTFIX(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \
+BLASPREFIX##trmv##BLASPOSTFIX(&uplo, &trans, &diag, &n, (const BLASTYPE*)lhs_, &lda, (BLASTYPE*)x, &incx); \
 \
 /* Add op(a_tr)rhs into res*/ \
-BLASPREFIX##axpy##BLASPOSTFIX(&n, (const BLASTYPE*)&numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \
+BLASPREFIX##axpy##BLASPOSTFIX(&n, (const BLASTYPE*)&numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)res_, &incy); \
 /* Non-square case - doesn't fit to BLAS ?TRMV. Fall to default triangular product*/ \
 if (size<(std::max)(rows,cols)) { \
 if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
 x = x_tmp.data(); \
 if (size<rows) { \
-y = _res + size*resIncr; \
-a = _lhs + size; \
+y = res_ + size*resIncr; \
+a = lhs_ + size; \
 m = convert_index<BlasIndex>(rows-size); \
 n = convert_index<BlasIndex>(size); \
 } \
 else { \
 x += size; \
-y = _res; \
-a = _lhs + size*lda; \
+y = res_; \
+a = lhs_ + size*lda; \
 m = convert_index<BlasIndex>(size); \
 n = convert_index<BlasIndex>(cols-size); \
 } \
@@ -170,23 +170,23 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,
 IsZeroDiag = (Mode&ZeroDiag) ? 1 : 0, \
 LowUp = IsLower ? Lower : Upper \
 }; \
-static void run(Index _rows, Index _cols, const EIGTYPE* _lhs, Index lhsStride, \
-const EIGTYPE* _rhs, Index rhsIncr, EIGTYPE* _res, Index resIncr, EIGTYPE alpha) \
+static void run(Index rows_, Index cols_, const EIGTYPE* lhs_, Index lhsStride, \
+const EIGTYPE* rhs_, Index rhsIncr, EIGTYPE* res_, Index resIncr, EIGTYPE alpha) \
 { \
 if (IsZeroDiag) { \
 triangular_matrix_vector_product<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,RowMajor,BuiltIn>::run( \
-_rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \
+rows_, cols_, lhs_, lhsStride, rhs_, rhsIncr, res_, resIncr, alpha); \
 return; \
 }\
-Index size = (std::min)(_rows,_cols); \
-Index rows = IsLower ? _rows : size; \
-Index cols = IsLower ? size : _cols; \
+Index size = (std::min)(rows_,cols_); \
+Index rows = IsLower ? rows_ : size; \
+Index cols = IsLower ? size : cols_; \
 \
 typedef VectorX##EIGPREFIX VectorRhs; \
 EIGTYPE *x, *y;\
 \
 /* Set x*/ \
-Map<const VectorRhs, 0, InnerStride<> > rhs(_rhs,cols,InnerStride<>(rhsIncr)); \
+Map<const VectorRhs, 0, InnerStride<> > rhs(rhs_,cols,InnerStride<>(rhsIncr)); \
 VectorRhs x_tmp; \
 if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
 x = x_tmp.data(); \
@@ -210,24 +210,24 @@ struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,
 diag = IsUnitDiag ? 'U' : 'N'; \
 \
 /* call ?TRMV*/ \
-BLASPREFIX##trmv##BLASPOSTFIX(&uplo, &trans, &diag, &n, (const BLASTYPE*)_lhs, &lda, (BLASTYPE*)x, &incx); \
+BLASPREFIX##trmv##BLASPOSTFIX(&uplo, &trans, &diag, &n, (const BLASTYPE*)lhs_, &lda, (BLASTYPE*)x, &incx); \
 \
 /* Add op(a_tr)rhs into res*/ \
-BLASPREFIX##axpy##BLASPOSTFIX(&n, (const BLASTYPE*)&numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)_res, &incy); \
+BLASPREFIX##axpy##BLASPOSTFIX(&n, (const BLASTYPE*)&numext::real_ref(alpha),(const BLASTYPE*)x, &incx, (BLASTYPE*)res_, &incy); \
 /* Non-square case - doesn't fit to BLAS ?TRMV. Fall to default triangular product*/ \
 if (size<(std::max)(rows,cols)) { \
 if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
 x = x_tmp.data(); \
 if (size<rows) { \
-y = _res + size*resIncr; \
-a = _lhs + size*lda; \
+y = res_ + size*resIncr; \
+a = lhs_ + size*lda; \
 m = convert_index<BlasIndex>(rows-size); \
 n = convert_index<BlasIndex>(size); \
 } \
 else { \
 x += size; \
-y = _res; \
-a = _lhs + size; \
+y = res_; \
+a = lhs_ + size; \
 m = convert_index<BlasIndex>(size); \
 n = convert_index<BlasIndex>(cols-size); \
 } \
@@ -339,7 +339,7 @@
 extern "C" {
 // In theory we should only include immintrin.h and not the other *mmintrin.h header files directly.
 // Doing so triggers some issues with ICC. However old gcc versions seems to not have this file, thus:
-#if EIGEN_COMP_ICC >= 1110
+#if EIGEN_COMP_ICC >= 1110 || EIGEN_COMP_EMSCRIPTEN
 #include <immintrin.h>
 #else
 #include <mmintrin.h>
@@ -363,10 +363,11 @@
 #endif
 } // end extern "C"

-#elif defined __VSX__
+#elif defined(__VSX__) && !defined(__APPLE__)

 #define EIGEN_VECTORIZE
-#define EIGEN_VECTORIZE_VSX
+#define EIGEN_VECTORIZE_VSX 1
+#define EIGEN_VECTORIZE_FMA
 #include <altivec.h>
 // We need to #undef all these ugly tokens defined in <altivec.h>
 // => use __vector instead of vector
@@ -378,6 +379,7 @@

 #define EIGEN_VECTORIZE
 #define EIGEN_VECTORIZE_ALTIVEC
+#define EIGEN_VECTORIZE_FMA
 #include <altivec.h>
 // We need to #undef all these ugly tokens defined in <altivec.h>
 // => use __vector instead of vector
@@ -438,13 +440,20 @@
 #include <arm_fp16.h>
 #endif

-#if defined(__F16C__) && (!defined(EIGEN_GPUCC) && (!defined(EIGEN_COMP_CLANG) || EIGEN_COMP_CLANG>=380))
+// Enable FMA for ARM.
+#if defined(__ARM_FEATURE_FMA)
+#define EIGEN_VECTORIZE_FMA
+#endif
+
+#if defined(__F16C__) && !defined(EIGEN_GPUCC) && (!EIGEN_COMP_CLANG_STRICT || EIGEN_COMP_CLANG>=380)
 // We can use the optimized fp16 to float and float to fp16 conversion routines
 #define EIGEN_HAS_FP16_C

-#if defined(EIGEN_COMP_CLANG)
-// Workaround for clang: The FP16C intrinsics for clang are included by
-// immintrin.h, as opposed to emmintrin.h as suggested by Intel:
+#if EIGEN_COMP_GNUC
+// Make sure immintrin.h is included, even if e.g. vectorization is
+// explicitly disabled (see also issue #2395).
+// Note that FP16C intrinsics for gcc and clang are included by immintrin.h,
+// as opposed to emmintrin.h as suggested by Intel:
 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#othertechs=FP16C&expand=1711
 #include <immintrin.h>
 #endif
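The vectorization guards above all follow the usual compile-time feature-detection pattern: a compiler-defined macro (`__VSX__`, `__ARM_FEATURE_FMA`, `__F16C__`, ...) is tested by the preprocessor and translated into an internal `EIGEN_VECTORIZE_*` switch. A small self-contained illustration of the same pattern is sketched below; the `DEMO_*` names are hypothetical and only the compiler-defined macros are real.

```cpp
#include <cstdio>

// Illustration only: translate compiler-defined feature macros into our own
// switch, mirroring the shape of the EIGEN_VECTORIZE_* guards above.
#if defined(__ARM_FEATURE_FMA) || defined(__FMA__)
#define DEMO_HAS_FMA 1
#else
#define DEMO_HAS_FMA 0
#endif

int main() {
#if DEMO_HAS_FMA
  std::puts("fused multiply-add instructions are available on this target");
#else
  std::puts("no FMA instructions; the compiler will emit separate mul/add");
#endif
  return 0;
}
```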
@@ -134,7 +134,7 @@ const unsigned int LinearAccessBit = 0x10;
 * Means the expression has a coeffRef() method, i.e. is writable as its individual coefficients are directly addressable.
 * This rules out read-only expressions.
 *
-* Note that DirectAccessBit and LvalueBit are mutually orthogonal, as there are examples of expression having one but note
+* Note that DirectAccessBit and LvalueBit are mutually orthogonal, as there are examples of expression having one but not
 * the other:
 * \li writable expressions that don't have a very simple memory layout as a strided array, have LvalueBit but not DirectAccessBit
 * \li Map-to-const expressions, for example Map<const Matrix>, have DirectAccessBit but not LvalueBit
@@ -1,9 +1,10 @@
|
|||||||
#ifndef EIGEN_WARNINGS_DISABLED
|
#ifndef EIGEN_WARNINGS_DISABLED
|
||||||
#define EIGEN_WARNINGS_DISABLED
|
#define EIGEN_WARNINGS_DISABLED
|
||||||
|
|
||||||
#ifdef _MSC_VER
|
#if defined(_MSC_VER)
|
||||||
// 4100 - unreferenced formal parameter (occurred e.g. in aligned_allocator::destroy(pointer p))
|
// 4100 - unreferenced formal parameter (occurred e.g. in aligned_allocator::destroy(pointer p))
|
||||||
// 4101 - unreferenced local variable
|
// 4101 - unreferenced local variable
|
||||||
|
// 4127 - conditional expression is constant
|
||||||
// 4181 - qualifier applied to reference type ignored
|
// 4181 - qualifier applied to reference type ignored
|
||||||
// 4211 - nonstandard extension used : redefined extern to static
|
// 4211 - nonstandard extension used : redefined extern to static
|
||||||
// 4244 - 'argument' : conversion from 'type1' to 'type2', possible loss of data
|
// 4244 - 'argument' : conversion from 'type1' to 'type2', possible loss of data
|
||||||
@@ -19,7 +20,7 @@
|
|||||||
#ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
|
#ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
|
||||||
#pragma warning( push )
|
#pragma warning( push )
|
||||||
#endif
|
#endif
|
||||||
#pragma warning( disable : 4100 4101 4181 4211 4244 4273 4324 4503 4512 4522 4700 4714 4717 4800)
|
#pragma warning( disable : 4100 4101 4127 4181 4211 4244 4273 4324 4503 4512 4522 4700 4714 4717 4800)
|
||||||
|
|
||||||
#elif defined __INTEL_COMPILER
|
#elif defined __INTEL_COMPILER
|
||||||
// 2196 - routine is both "inline" and "noinline" ("noinline" assumed)
|
// 2196 - routine is both "inline" and "noinline" ("noinline" assumed)
|
||||||
@@ -35,25 +36,28 @@
|
|||||||
#pragma warning disable 2196 279 1684 2259
|
#pragma warning disable 2196 279 1684 2259
|
||||||
|
|
||||||
#elif defined __clang__
|
#elif defined __clang__
|
||||||
// -Wconstant-logical-operand - warning: use of logical && with constant operand; switch to bitwise & or remove constant
|
|
||||||
// this is really a stupid warning as it warns on compile-time expressions involving enums
|
|
||||||
#ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
|
#ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
|
||||||
#pragma clang diagnostic push
|
#pragma clang diagnostic push
|
||||||
#endif
|
#endif
|
||||||
|
#if defined(__has_warning)
|
||||||
|
// -Wconstant-logical-operand - warning: use of logical && with constant operand; switch to bitwise & or remove constant
|
||||||
|
// this is really a stupid warning as it warns on compile-time expressions involving enums
|
||||||
|
#if __has_warning("-Wconstant-logical-operand")
|
||||||
#pragma clang diagnostic ignored "-Wconstant-logical-operand"
|
#pragma clang diagnostic ignored "-Wconstant-logical-operand"
|
||||||
#if __clang_major__ >= 3 && __clang_minor__ >= 5
|
|
||||||
#pragma clang diagnostic ignored "-Wabsolute-value"
|
|
||||||
#endif
|
#endif
|
||||||
#if __clang_major__ >= 10
|
#if __has_warning("-Wimplicit-int-float-conversion")
|
||||||
#pragma clang diagnostic ignored "-Wimplicit-int-float-conversion"
|
#pragma clang diagnostic ignored "-Wimplicit-int-float-conversion"
|
||||||
#endif
|
#endif
|
||||||
#if ( defined(__ALTIVEC__) || defined(__VSX__) ) && __cplusplus < 201103L
|
#if ( defined(__ALTIVEC__) || defined(__VSX__) ) && __cplusplus < 201103L
|
||||||
// warning: generic selections are a C11-specific feature
|
// warning: generic selections are a C11-specific feature
|
||||||
// ignoring warnings thrown at vec_ctf in Altivec/PacketMath.h
|
// ignoring warnings thrown at vec_ctf in Altivec/PacketMath.h
|
||||||
|
#if __has_warning("-Wc11-extensions")
|
||||||
#pragma clang diagnostic ignored "-Wc11-extensions"
|
#pragma clang diagnostic ignored "-Wc11-extensions"
|
||||||
#endif
|
#endif
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
#elif defined __GNUC__
|
#elif defined __GNUC__ && !defined(__FUJITSU)
|
||||||
|
|
||||||
#if (!defined(EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS)) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6))
|
#if (!defined(EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS)) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6))
|
||||||
#pragma GCC diagnostic push
|
#pragma GCC diagnostic push
|
||||||
@@ -74,25 +78,53 @@
 #endif

 #if defined __NVCC__
-#pragma diag_suppress boolean_controlling_expr_is_constant
+// MSVC 14.16 (required by CUDA 9.*) does not support the _Pragma keyword, so
+// we instead use Microsoft's __pragma extension.
+#if defined _MSC_VER
+#define EIGEN_MAKE_PRAGMA(X) __pragma(#X)
+#else
+#define EIGEN_MAKE_PRAGMA(X) _Pragma(#X)
+#endif
+#if defined __NVCC_DIAG_PRAGMA_SUPPORT__
+#define EIGEN_NV_DIAG_SUPPRESS(X) EIGEN_MAKE_PRAGMA(nv_diag_suppress X)
+#else
+#define EIGEN_NV_DIAG_SUPPRESS(X) EIGEN_MAKE_PRAGMA(diag_suppress X)
+#endif
+
+EIGEN_NV_DIAG_SUPPRESS(boolean_controlling_expr_is_constant)
 // Disable the "statement is unreachable" message
-#pragma diag_suppress code_is_unreachable
+EIGEN_NV_DIAG_SUPPRESS(code_is_unreachable)
 // Disable the "dynamic initialization in unreachable code" message
-#pragma diag_suppress initialization_not_reachable
+EIGEN_NV_DIAG_SUPPRESS(initialization_not_reachable)
 // Disable the "invalid error number" message that we get with older versions of nvcc
-#pragma diag_suppress 1222
+EIGEN_NV_DIAG_SUPPRESS(1222)
 // Disable the "calling a __host__ function from a __host__ __device__ function is not allowed" messages (yes, there are many of them and they seem to change with every version of the compiler)
-#pragma diag_suppress 2527
+EIGEN_NV_DIAG_SUPPRESS(2527)
-#pragma diag_suppress 2529
+EIGEN_NV_DIAG_SUPPRESS(2529)
-#pragma diag_suppress 2651
+EIGEN_NV_DIAG_SUPPRESS(2651)
-#pragma diag_suppress 2653
+EIGEN_NV_DIAG_SUPPRESS(2653)
-#pragma diag_suppress 2668
+EIGEN_NV_DIAG_SUPPRESS(2668)
-#pragma diag_suppress 2669
+EIGEN_NV_DIAG_SUPPRESS(2669)
-#pragma diag_suppress 2670
+EIGEN_NV_DIAG_SUPPRESS(2670)
-#pragma diag_suppress 2671
+EIGEN_NV_DIAG_SUPPRESS(2671)
-#pragma diag_suppress 2735
+EIGEN_NV_DIAG_SUPPRESS(2735)
-#pragma diag_suppress 2737
+EIGEN_NV_DIAG_SUPPRESS(2737)
-#pragma diag_suppress 2739
+EIGEN_NV_DIAG_SUPPRESS(2739)
+EIGEN_NV_DIAG_SUPPRESS(2885)
+EIGEN_NV_DIAG_SUPPRESS(2888)
+EIGEN_NV_DIAG_SUPPRESS(2976)
+EIGEN_NV_DIAG_SUPPRESS(2979)
+EIGEN_NV_DIAG_SUPPRESS(20011)
+EIGEN_NV_DIAG_SUPPRESS(20014)
+// Disable the "// __device__ annotation is ignored on a function(...) that is
+// explicitly defaulted on its first declaration" message.
+// The __device__ annotation seems to actually be needed in some cases,
+// otherwise resulting in kernel runtime errors.
+EIGEN_NV_DIAG_SUPPRESS(2886)
+EIGEN_NV_DIAG_SUPPRESS(2977)
+EIGEN_NV_DIAG_SUPPRESS(20012)
+#undef EIGEN_NV_DIAG_SUPPRESS
+#undef EIGEN_MAKE_PRAGMA
 #endif

 #else
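Because the MSVC versions bundled with older CUDA toolchains do not accept the C99 _Pragma operator, and newer nvcc spells the pragma nv_diag_suppress instead of diag_suppress, the suppressions above are now emitted through a small macro layer. A minimal sketch of the mechanism, using hypothetical stand-in names rather than Eigen's internal macros:

    // _Pragma takes a string literal, so the helper stringizes its argument;
    // MSVC spells the same operator __pragma and takes unquoted tokens instead.
    #if defined(_MSC_VER)
      #define MAKE_PRAGMA(X) __pragma(X)
    #else
      #define MAKE_PRAGMA(X) _Pragma(#X)
    #endif
    #define NV_SUPPRESS(X) MAKE_PRAGMA(nv_diag_suppress X)

    NV_SUPPRESS(20011)   // preprocesses to the same thing as: #pragma nv_diag_suppress 20011

This keeps a single spelling at every call site while the __NVCC_DIAG_PRAGMA_SUPPORT__ test above selects the old or new pragma name.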
@@ -168,7 +168,7 @@ template<int Size> struct get_compile_time_incr<AllRange<Size> > {
 * \ingroup Core_Module
 * Can be used as a parameter to DenseBase::operator()(const RowIndices&, const ColIndices&) to index all rows or columns
 */
-static const Eigen::internal::all_t all; // PLEASE use Eigen::all instead of Eigen::placeholders::all
+static const Eigen::internal::all_t all;


 namespace placeholders {
@@ -138,7 +138,7 @@ template<int N,int Default> struct get_fixed_value<FixedInt<N>,Default> {
   static const int value = N;
 };

-#if !EIGEN_HAS_CXX14
+#if !EIGEN_HAS_CXX14_VARIABLE_TEMPLATES
 template<int N,int Default> struct get_fixed_value<FixedInt<N> (*)(),Default> {
   static const int value = N;
 };
@@ -154,7 +154,7 @@ struct get_fixed_value<variable_if_dynamic<T,N>,Default> {
 };

 template<typename T> EIGEN_DEVICE_FUNC Index get_runtime_value(const T &x) { return x; }
-#if !EIGEN_HAS_CXX14
+#if !EIGEN_HAS_CXX14_VARIABLE_TEMPLATES
 template<int N> EIGEN_DEVICE_FUNC Index get_runtime_value(FixedInt<N> (*)()) { return N; }
 #endif

@@ -166,7 +166,7 @@ template<typename T, int DynamicKey=Dynamic, typename EnableIf=void> struct clea
 // Convert any integral type (e.g., short, int, unsigned int, etc.) to Eigen::Index
 template<typename T, int DynamicKey> struct cleanup_index_type<T,DynamicKey,typename internal::enable_if<internal::is_integral<T>::value>::type> { typedef Index type; };

-#if !EIGEN_HAS_CXX14
+#if !EIGEN_HAS_CXX14_VARIABLE_TEMPLATES
 // In c++98/c++11, fix<N> is a pointer to function that we better cleanup to a true FixedInt<N>:
 template<int N, int DynamicKey> struct cleanup_index_type<FixedInt<N> (*)(), DynamicKey> { typedef FixedInt<N> type; };
 #endif
@@ -17,7 +17,7 @@

 #define EIGEN_WORLD_VERSION 3
 #define EIGEN_MAJOR_VERSION 4
-#define EIGEN_MINOR_VERSION 0
+#define EIGEN_MINOR_VERSION 1

 #define EIGEN_VERSION_AT_LEAST(x,y,z) (EIGEN_WORLD_VERSION>x || (EIGEN_WORLD_VERSION>=x && \
 (EIGEN_MAJOR_VERSION>y || (EIGEN_MAJOR_VERSION>=y && \
@@ -179,6 +179,13 @@
 #define EIGEN_COMP_PGI 0
 #endif

+/// \internal EIGEN_COMP_NVHPC set to NVHPC version if the compiler is nvc++
+#if defined(__NVCOMPILER)
+#define EIGEN_COMP_NVHPC (__NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__)
+#else
+#define EIGEN_COMP_NVHPC 0
+#endif
+
 /// \internal EIGEN_COMP_ARM set to 1 if the compiler is ARM Compiler
 #if defined(__CC_ARM) || defined(__ARMCC_VERSION)
 #define EIGEN_COMP_ARM 1
@@ -275,7 +282,7 @@

 /// \internal EIGEN_HAS_ARM64_FP16 set to 1 if the architecture provides an IEEE
 /// compliant Arm fp16 type
-#if EIGEN_ARCH_ARM64
+#if EIGEN_ARCH_ARM_OR_ARM64
 #ifndef EIGEN_HAS_ARM64_FP16
 #if defined(__ARM_FP16_FORMAT_IEEE)
 #define EIGEN_HAS_ARM64_FP16 1
@@ -287,7 +294,7 @@

 /// \internal EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC set to 1 if the architecture
 /// supports Neon vector intrinsics for fp16.
-#if EIGEN_ARCH_ARM64
+#if EIGEN_ARCH_ARM_OR_ARM64
 #ifndef EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
 #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
 #define EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC 1
@@ -299,7 +306,7 @@

 /// \internal EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC set to 1 if the architecture
 /// supports Neon scalar intrinsics for fp16.
-#if EIGEN_ARCH_ARM64
+#if EIGEN_ARCH_ARM_OR_ARM64
 #ifndef EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC
 #if defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC)
 #define EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC 1
@@ -329,7 +336,7 @@
 #endif

 /// \internal EIGEN_ARCH_PPC set to 1 if the architecture is PowerPC
-#if defined(__powerpc__) || defined(__ppc__) || defined(_M_PPC)
+#if defined(__powerpc__) || defined(__ppc__) || defined(_M_PPC) || defined(__POWERPC__)
 #define EIGEN_ARCH_PPC 1
 #else
 #define EIGEN_ARCH_PPC 0
@@ -565,6 +572,32 @@
 //
 #endif

+/// \internal EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC set to 1 if the architecture
+/// supports Neon vector intrinsics for fp16.
+#if EIGEN_ARCH_ARM64
+#ifndef EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
+// Clang only supports FP16 on aarch64, and not all intrinsics are available
+// on A32 anyways even in GCC (e.g. vdiv_f16, vsqrt_f16).
+#if EIGEN_ARCH_ARM64 && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(EIGEN_GPU_COMPILE_PHASE)
+#define EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC 1
+#else
+#define EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC 0
+#endif
+#endif
+#endif
+
+/// \internal EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC set to 1 if the architecture
+/// supports Neon scalar intrinsics for fp16.
+#if EIGEN_ARCH_ARM64
+#ifndef EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC
+// Clang only supports FP16 on aarch64, and not all intrinsics are available
+// on A32 anyways, even in GCC (e.g. vceqh_f16).
+#if EIGEN_ARCH_ARM64 && defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC) && !defined(EIGEN_GPU_COMPILE_PHASE)
+#define EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC 1
+#endif
+#endif
+#endif
+
 #if defined(EIGEN_USE_SYCL) && defined(__SYCL_DEVICE_ONLY__)
 // EIGEN_USE_SYCL is a user-defined macro while __SYCL_DEVICE_ONLY__ is a compiler-defined macro.
 // In most cases we want to check if both macros are defined which can be done using the define below.
@@ -1127,11 +1160,25 @@ namespace Eigen {
 // This seems to be broken on clang. Packet4f is loaded into a single
 // register rather than a vector, zeroing out some entries. Integer
 // types also generate a compile error.
-// General, Altivec, VSX.
+#if EIGEN_OS_MAC
+// General, Altivec for Apple (VSX were added in ISA v2.06):
+#define EIGEN_OPTIMIZATION_BARRIER(X) __asm__ ("" : "+r,v" (X));
+#else
+// General, Altivec, VSX otherwise:
 #define EIGEN_OPTIMIZATION_BARRIER(X) __asm__ ("" : "+r,v,wa" (X));
+#endif
 #elif EIGEN_ARCH_ARM_OR_ARM64
 // General, NEON.
+// Clang doesn't like "r",
+// error: non-trivial scalar-to-vector conversion, possible invalid
+// constraint for vector type
+// GCC < 5 doesn't like "g",
+// error: 'asm' operand requires impossible reload
+#if EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_AT_MOST(5, 0)
+#define EIGEN_OPTIMIZATION_BARRIER(X) __asm__ ("" : "+r,w" (X));
+#else
 #define EIGEN_OPTIMIZATION_BARRIER(X) __asm__ ("" : "+g,w" (X));
+#endif
 #elif EIGEN_ARCH_i386_OR_x86_64
 // General, SSE.
 #define EIGEN_OPTIMIZATION_BARRIER(X) __asm__ ("" : "+g,x" (X));
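EIGEN_OPTIMIZATION_BARRIER feeds a value through an empty inline-asm statement whose read-write constraint names the matching register class, which keeps the compiler from reordering or contracting arithmetic across that point. A rough standalone illustration of the idea (GCC or clang extended asm on x86, not the Eigen macro itself):

    // The empty asm emits no instructions, but the "+x" (SSE register) constraint tells the
    // compiler that `t` is read and written here, so the value must be materialized at this point.
    static inline float barrier_demo(float a, float b) {
      float t = a * b;
      __asm__("" : "+x"(t));
      return t + a;   // with the barrier in place this is less likely to be fused into a single fma
    }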
@@ -1185,7 +1232,7 @@ namespace Eigen {
 #define EIGEN_USING_STD(FUNC) using std::FUNC;
 #endif

-#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || (EIGEN_COMP_MSVC == 1900 && EIGEN_COMP_NVCC))
+#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1916 || (EIGEN_COMP_MSVC == 1916 && EIGEN_COMP_NVCC))
 // For older MSVC versions, as well as 1900 && CUDA 8, using the base operator is necessary,
 // otherwise we get duplicate definition errors
 // For later MSVC versions, we require explicit operator= definition, otherwise we get
@@ -1216,7 +1263,7 @@ namespace Eigen {
 * This is necessary, because the implicit definition is deprecated if the copy-assignment is overridden.
 */
 #if EIGEN_HAS_CXX11
-#define EIGEN_DEFAULT_COPY_CONSTRUCTOR(CLASS) CLASS(const CLASS&) = default;
+#define EIGEN_DEFAULT_COPY_CONSTRUCTOR(CLASS) EIGEN_DEVICE_FUNC CLASS(const CLASS&) = default;
 #else
 #define EIGEN_DEFAULT_COPY_CONSTRUCTOR(CLASS)
 #endif
@@ -1241,12 +1288,12 @@ namespace Eigen {
 */
 #if EIGEN_HAS_CXX11
 #define EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(Derived) \
-Derived() = default; \
+EIGEN_DEVICE_FUNC Derived() = default; \
-~Derived() = default;
+EIGEN_DEVICE_FUNC ~Derived() = default;
 #else
 #define EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(Derived) \
-Derived() {}; \
+EIGEN_DEVICE_FUNC Derived() {}; \
-/* ~Derived() {}; */
+/* EIGEN_DEVICE_FUNC ~Derived() {}; */
 #endif


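EIGEN_DEVICE_FUNC expands to __host__ __device__ under CUDA and HIP, so spelling it on the defaulted constructors and destructors keeps types that use these macros callable from device code without tripping host-only-call diagnostics. A rough sketch of the intent (assuming nvcc; Dummy is a hypothetical type, not from Eigen):

    struct Dummy {
      EIGEN_DEVICE_FUNC Dummy() = default;    // host and device under nvcc, plain C++ otherwise
      EIGEN_DEVICE_FUNC ~Dummy() = default;
    };
    // __global__ void kernel() { Dummy d; (void)d; }   // relies on the device annotation above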
@@ -292,20 +292,59 @@ template<typename T> EIGEN_DEVICE_FUNC inline void destruct_elements_of_array(T
 /** \internal Constructs the elements of an array.
 * The \a size parameter tells on how many objects to call the constructor of T.
 */
-template<typename T> EIGEN_DEVICE_FUNC inline T* construct_elements_of_array(T *ptr, std::size_t size)
+template<typename T> EIGEN_DEVICE_FUNC inline T* default_construct_elements_of_array(T *ptr, std::size_t size)
 {
-  std::size_t i;
+  std::size_t i=0;
   EIGEN_TRY
   {
     for (i = 0; i < size; ++i) ::new (ptr + i) T;
-    return ptr;
   }
   EIGEN_CATCH(...)
   {
     destruct_elements_of_array(ptr, i);
     EIGEN_THROW;
   }
-  return NULL;
+  return ptr;
+}
+
+/** \internal Copy-constructs the elements of an array.
+* The \a size parameter tells on how many objects to copy.
+*/
+template<typename T> EIGEN_DEVICE_FUNC inline T* copy_construct_elements_of_array(T *ptr, const T* src, std::size_t size)
+{
+  std::size_t i=0;
+  EIGEN_TRY
+  {
+    for (i = 0; i < size; ++i) ::new (ptr + i) T(*(src + i));
+  }
+  EIGEN_CATCH(...)
+  {
+    destruct_elements_of_array(ptr, i);
+    EIGEN_THROW;
+  }
+  return ptr;
+}
+
+/** \internal Move-constructs the elements of an array.
+* The \a size parameter tells on how many objects to move.
+*/
+template<typename T> EIGEN_DEVICE_FUNC inline T* move_construct_elements_of_array(T *ptr, T* src, std::size_t size)
+{
+  std::size_t i=0;
+  EIGEN_TRY
+  {
+#if EIGEN_HAS_RVALUE_REFERENCES
+    for (i = 0; i < size; ++i) ::new (ptr + i) T(std::move(*(src + i)));
+#else
+    for (i = 0; i < size; ++i) ::new (ptr + i) T(*(src + i));
+#endif
+  }
+  EIGEN_CATCH(...)
+  {
+    destruct_elements_of_array(ptr, i);
+    EIGEN_THROW;
+  }
+  return ptr;
 }

 /*****************************************************************************
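All three construction helpers initialize the loop counter before EIGEN_TRY and only return after the try/catch block, so a throwing constructor unwinds exactly the elements already built. The same pattern in plain C++, outside Eigen (hypothetical helper name):

    #include <cstddef>
    #include <new>

    template <typename T>
    T* default_construct_n(T* ptr, std::size_t size) {
      std::size_t i = 0;                       // must be initialized: the catch block reads it
      try {
        for (i = 0; i < size; ++i) ::new (ptr + i) T;
      } catch (...) {
        while (i > 0) (ptr + --i)->~T();       // destroy only what was actually constructed
        throw;
      }
      return ptr;
    }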
@@ -326,10 +365,10 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void check_size_for_overflow(std::size_t s
 template<typename T> EIGEN_DEVICE_FUNC inline T* aligned_new(std::size_t size)
 {
   check_size_for_overflow<T>(size);
-  T *result = reinterpret_cast<T*>(aligned_malloc(sizeof(T)*size));
+  T *result = static_cast<T*>(aligned_malloc(sizeof(T)*size));
   EIGEN_TRY
   {
-    return construct_elements_of_array(result, size);
+    return default_construct_elements_of_array(result, size);
   }
   EIGEN_CATCH(...)
   {
@@ -342,10 +381,10 @@ template<typename T> EIGEN_DEVICE_FUNC inline T* aligned_new(std::size_t size)
 template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned_new(std::size_t size)
 {
   check_size_for_overflow<T>(size);
-  T *result = reinterpret_cast<T*>(conditional_aligned_malloc<Align>(sizeof(T)*size));
+  T *result = static_cast<T*>(conditional_aligned_malloc<Align>(sizeof(T)*size));
   EIGEN_TRY
   {
-    return construct_elements_of_array(result, size);
+    return default_construct_elements_of_array(result, size);
   }
   EIGEN_CATCH(...)
   {
@@ -377,21 +416,32 @@ template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned
 {
   check_size_for_overflow<T>(new_size);
   check_size_for_overflow<T>(old_size);
-  if(new_size < old_size)
-    destruct_elements_of_array(pts+new_size, old_size-new_size);
-  T *result = reinterpret_cast<T*>(conditional_aligned_realloc<Align>(reinterpret_cast<void*>(pts), sizeof(T)*new_size, sizeof(T)*old_size));
-  if(new_size > old_size)
-  {
+  // If elements need to be explicitly initialized, we cannot simply realloc
+  // (or memcpy) the memory block - each element needs to be reconstructed.
+  // Otherwise, objects that contain internal pointers like mpfr or
+  // AnnoyingScalar can be pointing to the wrong thing.
+  T* result = static_cast<T*>(conditional_aligned_malloc<Align>(sizeof(T)*new_size));
   EIGEN_TRY
   {
-    construct_elements_of_array(result+old_size, new_size-old_size);
+    // Move-construct initial elements.
+    std::size_t copy_size = (std::min)(old_size, new_size);
+    move_construct_elements_of_array(result, pts, copy_size);
+
+    // Default-construct remaining elements.
+    if (new_size > old_size) {
+      default_construct_elements_of_array(result + copy_size, new_size - old_size);
+    }
+
+    // Delete old elements.
+    conditional_aligned_delete<T, Align>(pts, old_size);
   }
   EIGEN_CATCH(...)
   {
     conditional_aligned_free<Align>(result);
     EIGEN_THROW;
   }
-  }
   return result;
 }

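conditional_aligned_realloc_new no longer reallocs the raw block when T requires initialization: it allocates a fresh buffer, move-constructs the surviving elements, default-constructs any new tail, and only then releases the old storage, so objects holding internal pointers (the mpfr example mentioned in the comment) stay valid. The same shape in standalone C++ (simplified sketch; operator new/delete stand in for Eigen's aligned allocators, and error handling is omitted):

    #include <algorithm>
    #include <cstddef>
    #include <new>
    #include <utility>

    template <typename T>
    T* grow_by_move(T* old_ptr, std::size_t old_size, std::size_t new_size) {
      T* result = static_cast<T*>(::operator new(sizeof(T) * new_size));
      std::size_t copy_size = std::min(old_size, new_size);
      for (std::size_t i = 0; i < copy_size; ++i) ::new (result + i) T(std::move(old_ptr[i]));
      for (std::size_t i = copy_size; i < new_size; ++i) ::new (result + i) T();
      for (std::size_t i = old_size; i > 0; --i) old_ptr[i - 1].~T();
      ::operator delete(old_ptr);
      return result;
    }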
@@ -401,12 +451,12 @@ template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned
   if(size==0)
     return 0; // short-cut. Also fixes Bug 884
   check_size_for_overflow<T>(size);
-  T *result = reinterpret_cast<T*>(conditional_aligned_malloc<Align>(sizeof(T)*size));
+  T *result = static_cast<T*>(conditional_aligned_malloc<Align>(sizeof(T)*size));
   if(NumTraits<T>::RequireInitialization)
   {
     EIGEN_TRY
     {
-      construct_elements_of_array(result, size);
+      default_construct_elements_of_array(result, size);
     }
     EIGEN_CATCH(...)
     {
@@ -419,24 +469,13 @@ template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned

 template<typename T, bool Align> inline T* conditional_aligned_realloc_new_auto(T* pts, std::size_t new_size, std::size_t old_size)
 {
+  if (NumTraits<T>::RequireInitialization) {
+    return conditional_aligned_realloc_new<T, Align>(pts, new_size, old_size);
+  }
+
   check_size_for_overflow<T>(new_size);
   check_size_for_overflow<T>(old_size);
-  if(NumTraits<T>::RequireInitialization && (new_size < old_size))
-    destruct_elements_of_array(pts+new_size, old_size-new_size);
-  T *result = reinterpret_cast<T*>(conditional_aligned_realloc<Align>(reinterpret_cast<void*>(pts), sizeof(T)*new_size, sizeof(T)*old_size));
-  if(NumTraits<T>::RequireInitialization && (new_size > old_size))
-  {
-    EIGEN_TRY
-    {
-      construct_elements_of_array(result+old_size, new_size-old_size);
-    }
-    EIGEN_CATCH(...)
-    {
-      conditional_aligned_free<Align>(result);
-      EIGEN_THROW;
-    }
-  }
-  return result;
+  return static_cast<T*>(conditional_aligned_realloc<Align>(static_cast<void*>(pts), sizeof(T)*new_size, sizeof(T)*old_size));
 }

 template<typename T, bool Align> EIGEN_DEVICE_FUNC inline void conditional_aligned_delete_auto(T *ptr, std::size_t size)
@@ -617,7 +656,7 @@ template<typename T> class aligned_stack_memory_handler : noncopyable
   : m_ptr(ptr), m_size(size), m_deallocate(dealloc)
 {
   if(NumTraits<T>::RequireInitialization && m_ptr)
-    Eigen::internal::construct_elements_of_array(m_ptr, size);
+    Eigen::internal::default_construct_elements_of_array(m_ptr, size);
 }
 EIGEN_DEVICE_FUNC
 ~aligned_stack_memory_handler()
@@ -668,7 +707,7 @@ struct local_nested_eval_wrapper<Xpr,NbEvaluations,true>
   m_deallocate(ptr==0)
 {
   if(NumTraits<Scalar>::RequireInitialization && object.data())
-    Eigen::internal::construct_elements_of_array(object.data(), object.size());
+    Eigen::internal::default_construct_elements_of_array(object.data(), object.size());
   object = xpr;
 }

@@ -133,7 +133,10 @@ template<typename T> struct remove_all<T*> { typedef typename remove_all<
 template<typename T> struct is_arithmetic { enum { value = false }; };
 template<> struct is_arithmetic<float> { enum { value = true }; };
 template<> struct is_arithmetic<double> { enum { value = true }; };
+// GPU devices treat `long double` as `double`.
+#ifndef EIGEN_GPU_COMPILE_PHASE
 template<> struct is_arithmetic<long double> { enum { value = true }; };
+#endif
 template<> struct is_arithmetic<bool> { enum { value = true }; };
 template<> struct is_arithmetic<char> { enum { value = true }; };
 template<> struct is_arithmetic<signed char> { enum { value = true }; };
@@ -167,10 +170,8 @@ template<> struct is_integral<signed int> { enum { value = true }; }
 template<> struct is_integral<unsigned int> { enum { value = true }; };
 template<> struct is_integral<signed long> { enum { value = true }; };
 template<> struct is_integral<unsigned long> { enum { value = true }; };
-#if EIGEN_COMP_MSVC
-template<> struct is_integral<signed __int64> { enum { value = true }; };
-template<> struct is_integral<unsigned __int64> { enum { value = true }; };
-#endif
+template<> struct is_integral<signed long long> { enum { value = true }; };
+template<> struct is_integral<unsigned long long> { enum { value = true }; };
 #endif

 #if EIGEN_HAS_CXX11
@@ -189,21 +190,9 @@ template<> struct make_unsigned<signed int> { typedef unsigned int type; }
 template<> struct make_unsigned<unsigned int> { typedef unsigned int type; };
 template<> struct make_unsigned<signed long> { typedef unsigned long type; };
 template<> struct make_unsigned<unsigned long> { typedef unsigned long type; };
-#if EIGEN_COMP_MSVC
-template<> struct make_unsigned<signed __int64> { typedef unsigned __int64 type; };
-template<> struct make_unsigned<unsigned __int64> { typedef unsigned __int64 type; };
-#endif
-
-// Some platforms define int64_t as `long long` even for C++03, where
-// `long long` is not guaranteed by the standard. In this case we are missing
-// the definition for make_unsigned. If we just define it, we run into issues
-// where `long long` doesn't exist in some compilers for C++03. We therefore add
-// the specialization for these platforms only.
-#if EIGEN_OS_MAC || EIGEN_COMP_MINGW
 template<> struct make_unsigned<unsigned long long> { typedef unsigned long long type; };
 template<> struct make_unsigned<long long> { typedef unsigned long long type; };
 #endif
-#endif

 template <typename T> struct add_const { typedef const T type; };
 template <typename T> struct add_const<T&> { typedef T& type; };
@@ -466,20 +455,32 @@ template<typename T, std::size_t N> struct array_size<std::array<T,N> > {
 };
 #endif


 /** \internal
-* Analogue of the std::size free function.
-* It returns the size of the container or view \a x of type \c T
+* Analogue of the std::ssize free function.
+* It returns the signed size of the container or view \a x of type \c T
 *
 * It currently supports:
 * - any types T defining a member T::size() const
 * - plain C arrays as T[N]
 *
+* For C++20, this function just forwards to `std::ssize`, or any ADL discoverable `ssize` function.
 */
-template<typename T>
-EIGEN_CONSTEXPR Index size(const T& x) { return x.size(); }
+#if EIGEN_COMP_CXXVER < 20
+template <typename T>
+EIGEN_CONSTEXPR std::ptrdiff_t index_list_size(const T& x) {
+  return static_cast<std::ptrdiff_t>(x.size());
+}

-template<typename T,std::size_t N>
-EIGEN_CONSTEXPR Index size(const T (&) [N]) { return N; }
+template<typename T, std::ptrdiff_t N>
+EIGEN_CONSTEXPR std::ptrdiff_t index_list_size(const T (&)[N]) { return N; }
+#else
+template <typename T>
+EIGEN_CONSTEXPR auto index_list_size(T&& x) {
+  using std::ssize;
+  return ssize(std::forward<T>(x));
+}
+#endif // EIGEN_COMP_CXXVER

 /** \internal
 * Convenient struct to get the result type of a nullary, unary, binary, or
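The internal size() helper becomes index_list_size() and now always reports a signed std::ptrdiff_t; under C++20 it simply forwards to std::ssize (or an ADL-found ssize). A small standalone sketch of the signed-size idea, not the Eigen-internal helper itself:

    #include <cstddef>
    #include <vector>

    template <typename Container>
    std::ptrdiff_t signed_size(const Container& c) {
      return static_cast<std::ptrdiff_t>(c.size());   // what the pre-C++20 branch boils down to
    }

    template <typename T, std::ptrdiff_t N>
    constexpr std::ptrdiff_t signed_size(const T (&)[N]) { return N; }

    // std::vector<int> v{1, 2, 3};  signed_size(v) == 3
    // int a[4] = {};                signed_size(a) == 4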
@@ -696,8 +697,7 @@ struct has_binary_operator
 template<int Y,
   int InfX = 0,
   int SupX = ((Y==1) ? 1 : Y/2),
-  bool Done = ((SupX-InfX)<=1 ? true : ((SupX*SupX <= Y) && ((SupX+1)*(SupX+1) > Y))) >
-  // use ?: instead of || just to shut up a stupid gcc 4.3 warning
+  bool Done = ((SupX - InfX) <= 1 || ((SupX * SupX <= Y) && ((SupX + 1) * (SupX + 1) > Y)))>
 class meta_sqrt
 {
   enum {
@@ -316,9 +316,8 @@ void ComplexEigenSolver<MatrixType>::doComputeEigenvectors(RealScalar matrixnorm
   // Compute V as V = U X; now A = U T U^* = U X D X^(-1) U^* = V D V^(-1)
   m_eivec.noalias() = m_schur.matrixU() * m_matX;
   // .. and normalize the eigenvectors
-  for(Index k=0 ; k<n ; k++)
-  {
-    m_eivec.col(k).normalize();
+  for (Index k = 0; k < n; k++) {
+    m_eivec.col(k).stableNormalize();
   }
 }

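stableNormalize() differs from normalize() in that it first rescales by the largest absolute coefficient, so eigenvectors with extremely large or tiny entries do not overflow or underflow when the squared norm is formed. A quick illustration with a plain vector:

    #include <Eigen/Dense>
    #include <iostream>

    int main() {
      Eigen::Vector2d x(1e308, 1e308);       // x.squaredNorm() overflows to +inf
      Eigen::Vector2d a = x.normalized();    // 1e308 / inf == 0, so this ends up as (0, 0)
      x.stableNormalize();                   // rescales first: roughly (0.7071, 0.7071)
      std::cout << a.transpose() << "\n" << x.transpose() << "\n";
    }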
@@ -119,8 +119,8 @@ template<typename _MatrixType> class GeneralizedEigenSolver
   : m_eivec(),
     m_alphas(),
     m_betas(),
-    m_valuesOkay(false),
-    m_vectorsOkay(false),
+    m_computeEigenvectors(false),
+    m_isInitialized(false),
     m_realQZ()
 {}

@@ -134,8 +134,8 @@ template<typename _MatrixType> class GeneralizedEigenSolver
   : m_eivec(size, size),
     m_alphas(size),
     m_betas(size),
-    m_valuesOkay(false),
-    m_vectorsOkay(false),
+    m_computeEigenvectors(false),
+    m_isInitialized(false),
     m_realQZ(size),
     m_tmp(size)
 {}
@@ -156,8 +156,8 @@ template<typename _MatrixType> class GeneralizedEigenSolver
   : m_eivec(A.rows(), A.cols()),
     m_alphas(A.cols()),
     m_betas(A.cols()),
-    m_valuesOkay(false),
-    m_vectorsOkay(false),
+    m_computeEigenvectors(false),
+    m_isInitialized(false),
     m_realQZ(A.cols()),
     m_tmp(A.cols())
 {
@@ -177,7 +177,8 @@ template<typename _MatrixType> class GeneralizedEigenSolver
 * \sa eigenvalues()
 */
 EigenvectorsType eigenvectors() const {
-  eigen_assert(m_vectorsOkay && "Eigenvectors for GeneralizedEigenSolver were not calculated.");
+  eigen_assert(info() == Success && "GeneralizedEigenSolver failed to compute eigenvectors");
+  eigen_assert(m_computeEigenvectors && "Eigenvectors for GeneralizedEigenSolver were not calculated");
   return m_eivec;
 }

@@ -201,7 +202,7 @@ template<typename _MatrixType> class GeneralizedEigenSolver
 */
 EigenvalueType eigenvalues() const
 {
-  eigen_assert(m_valuesOkay && "GeneralizedEigenSolver is not initialized.");
+  eigen_assert(info() == Success && "GeneralizedEigenSolver failed to compute eigenvalues.");
   return EigenvalueType(m_alphas,m_betas);
 }

@@ -212,7 +213,7 @@ template<typename _MatrixType> class GeneralizedEigenSolver
 * \sa betas(), eigenvalues() */
 ComplexVectorType alphas() const
 {
-  eigen_assert(m_valuesOkay && "GeneralizedEigenSolver is not initialized.");
+  eigen_assert(info() == Success && "GeneralizedEigenSolver failed to compute alphas.");
   return m_alphas;
 }

@@ -223,7 +224,7 @@ template<typename _MatrixType> class GeneralizedEigenSolver
 * \sa alphas(), eigenvalues() */
 VectorType betas() const
 {
-  eigen_assert(m_valuesOkay && "GeneralizedEigenSolver is not initialized.");
+  eigen_assert(info() == Success && "GeneralizedEigenSolver failed to compute betas.");
   return m_betas;
 }

@@ -254,7 +255,7 @@ template<typename _MatrixType> class GeneralizedEigenSolver

 ComputationInfo info() const
 {
-  eigen_assert(m_valuesOkay && "EigenSolver is not initialized.");
+  eigen_assert(m_isInitialized && "EigenSolver is not initialized.");
   return m_realQZ.info();
 }

@@ -277,7 +278,8 @@ template<typename _MatrixType> class GeneralizedEigenSolver
 EigenvectorsType m_eivec;
 ComplexVectorType m_alphas;
 VectorType m_betas;
-bool m_valuesOkay, m_vectorsOkay;
+bool m_computeEigenvectors;
+bool m_isInitialized;
 RealQZ<MatrixType> m_realQZ;
 ComplexVectorType m_tmp;
 };
@@ -292,8 +294,6 @@ GeneralizedEigenSolver<MatrixType>::compute(const MatrixType& A, const MatrixTyp
 using std::abs;
 eigen_assert(A.cols() == A.rows() && B.cols() == A.rows() && B.cols() == B.rows());
 Index size = A.cols();
-m_valuesOkay = false;
-m_vectorsOkay = false;
 // Reduce to generalized real Schur form:
 // A = Q S Z and B = Q T Z
 m_realQZ.compute(A, B, computeEigenvectors);
@@ -406,10 +406,9 @@ GeneralizedEigenSolver<MatrixType>::compute(const MatrixType& A, const MatrixTyp
       i += 2;
     }
   }

-  m_valuesOkay = true;
-  m_vectorsOkay = computeEigenvectors;
 }
+m_computeEigenvectors = computeEigenvectors;
+m_isInitialized = true;
 return *this;
 }

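The separate m_valuesOkay/m_vectorsOkay flags are folded into m_isInitialized plus m_computeEigenvectors, and the accessors now assert on info(). Typical usage after this change (sketch with random test matrices):

    #include <Eigen/Eigenvalues>
    #include <iostream>

    int main() {
      Eigen::MatrixXd A = Eigen::MatrixXd::Random(4, 4);
      Eigen::MatrixXd B = Eigen::MatrixXd::Identity(4, 4);
      Eigen::GeneralizedEigenSolver<Eigen::MatrixXd> ges(A, B, /*computeEigenvectors=*/true);
      if (ges.info() == Eigen::Success) {
        std::cout << ges.eigenvalues().transpose() << "\n";  // formed from alphas() and betas()
        std::cout << ges.eigenvectors() << "\n";             // asserts eigenvectors were requested
      }
    }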
@@ -435,9 +435,10 @@ inline void RealSchur<MatrixType>::computeShift(Index iu, Index iter, Scalar& ex
   shiftInfo.coeffRef(1) = m_matT.coeff(iu-1,iu-1);
   shiftInfo.coeffRef(2) = m_matT.coeff(iu,iu-1) * m_matT.coeff(iu-1,iu);

+  // Alternate exceptional shifting strategy every 16 iterations.
+  if (iter % 16 == 0) {
     // Wilkinson's original ad hoc shift
-    if (iter == 10)
-    {
+    if (iter % 32 != 0) {
       exshift += shiftInfo.coeff(0);
       for (Index i = 0; i <= iu; ++i)
         m_matT.coeffRef(i,i) -= shiftInfo.coeff(0);
@@ -445,11 +446,8 @@ inline void RealSchur<MatrixType>::computeShift(Index iu, Index iter, Scalar& ex
       shiftInfo.coeffRef(0) = Scalar(0.75) * s;
       shiftInfo.coeffRef(1) = Scalar(0.75) * s;
       shiftInfo.coeffRef(2) = Scalar(-0.4375) * s * s;
-    }
-
+    } else {
       // MATLAB's new ad hoc shift
-      if (iter == 30)
-      {
       Scalar s = (shiftInfo.coeff(1) - shiftInfo.coeff(0)) / Scalar(2.0);
       s = s * s + shiftInfo.coeff(2);
       if (s > Scalar(0))
@@ -465,6 +463,7 @@ inline void RealSchur<MatrixType>::computeShift(Index iu, Index iter, Scalar& ex
         shiftInfo.setConstant(Scalar(0.964));
       }
     }
+  }
 }

 /** \internal Compute index im at which Francis QR step starts and the first Householder vector. */
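The exceptional shifts are no longer tied to the fixed iteration counts 10 and 30; they now fire on every 16th iteration and alternate between the two strategies. A tiny helper expressing the new schedule (hypothetical, for illustration only):

    enum class ExceptionalShift { None, Wilkinson, Matlab };

    inline ExceptionalShift shift_for_iteration(int iter) {
      if (iter % 16 != 0) return ExceptionalShift::None;       // ordinary Francis steps in between
      return (iter % 32 != 0) ? ExceptionalShift::Wilkinson    // iterations 16, 48, 80, ...
                              : ExceptionalShift::Matlab;      // iterations 32, 64, 96, ...
    }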
@@ -440,9 +440,8 @@ void tridiagonalization_inplace(MatrixType& mat, DiagonalType& diag, SubDiagonal
 template<typename MatrixType, int Size, bool IsComplex>
 struct tridiagonalization_inplace_selector
 {
-  typedef typename Tridiagonalization<MatrixType>::CoeffVectorType CoeffVectorType;
   typedef typename Tridiagonalization<MatrixType>::HouseholderSequenceType HouseholderSequenceType;
-  template<typename DiagonalType, typename SubDiagonalType>
+  template<typename DiagonalType, typename SubDiagonalType, typename CoeffVectorType>
   static EIGEN_DEVICE_FUNC
   void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, CoeffVectorType& hCoeffs, bool extractQ)
   {
@@ -985,7 +985,10 @@ Transform<Scalar,Dim,Mode,Options>::preshear(const Scalar& sx, const Scalar& sy)
 {
   EIGEN_STATIC_ASSERT(int(Dim)==2, YOU_MADE_A_PROGRAMMING_MISTAKE)
   EIGEN_STATIC_ASSERT(Mode!=int(Isometry), THIS_METHOD_IS_ONLY_FOR_SPECIFIC_TRANSFORMATIONS)
-  m_matrix.template block<Dim,HDim>(0,0) = LinearMatrixType(1, sx, sy, 1) * m_matrix.template block<Dim,HDim>(0,0);
+  LinearMatrixType shear = LinearMatrixType::Identity(2, 2);
+  shear.coeffRef(0, 1) = sy;
+  shear.coeffRef(1, 0) = sx;
+  m_matrix.template block<Dim, HDim>(0, 0) = shear * m_matrix.template block<Dim, HDim>(0, 0);
   return *this;
 }

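preshear(sx, sy) now forms the 2x2 shear factor explicitly, identity plus sy at (0,1) and sx at (1,0), and applies it on the left of the transform's top rows. A usage sketch showing the equivalence (values chosen arbitrarily):

    #include <Eigen/Geometry>
    #include <iostream>

    int main() {
      Eigen::Affine2d t = Eigen::Affine2d::Identity();
      t.preshear(0.25, 0.5);                  // sx = 0.25, sy = 0.5
      Eigen::Matrix2d shear = Eigen::Matrix2d::Identity();
      shear(0, 1) = 0.5;                      // sy
      shear(1, 0) = 0.25;                     // sx
      std::cout << t.linear() << "\n\n" << shear << "\n";  // identical 2x2 blocks in this case
    }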
@@ -136,8 +136,10 @@ umeyama(const MatrixBase<Derived>& src, const MatrixBase<OtherDerived>& dst, boo
   // Eq. (39)
   VectorType S = VectorType::Ones(m);

-  if ( svd.matrixU().determinant() * svd.matrixV().determinant() < 0 )
-    S(m-1) = -1;
+  if ( svd.matrixU().determinant() * svd.matrixV().determinant() < 0 ) {
+    Index tmp = m - 1;
+    S(tmp) = -1;
+  }

   // Eq. (40) and (43)
   Rt.block(0,0,m,m).noalias() = svd.matrixU() * S.asDiagonal() * svd.matrixV().transpose();
@@ -69,7 +69,7 @@ void MatrixBase<Derived>::makeHouseholder(
   Scalar& tau,
   RealScalar& beta) const
 {
-  using std::sqrt;
+  using numext::sqrt;
   using numext::conj;

   EIGEN_STATIC_ASSERT_VECTOR_ONLY(EssentialPart)
@@ -14,7 +14,9 @@
 namespace Eigen {

 /** \ingroup Householder_Module
+*
 * \householder_module
+*
 * \class HouseholderSequence
 * \brief Sequence of Householder reflections acting on subspaces with decreasing size
 * \tparam VectorsType type of matrix containing the Householder vectors
@@ -518,7 +520,10 @@ typename internal::matrix_type_times_scalar_type<typename VectorsType::Scalar,Ot
   return res;
 }

-/** \ingroup Householder_Module \householder_module
+/** \ingroup Householder_Module
+*
+* \householder_module
+*
 * \brief Convenience function for constructing a Householder sequence.
 * \returns A HouseholderSequence constructed from the specified arguments.
 */
@@ -528,7 +533,10 @@ HouseholderSequence<VectorsType,CoeffsType> householderSequence(const VectorsTyp
   return HouseholderSequence<VectorsType,CoeffsType,OnTheLeft>(v, h);
 }

-/** \ingroup Householder_Module \householder_module
+/** \ingroup Householder_Module
+*
+* \householder_module
+*
 * \brief Convenience function for constructing a Householder sequence.
 * \returns A HouseholderSequence constructed from the specified arguments.
 * \details This function differs from householderSequence() in that the template argument \p OnTheSide of
@@ -49,9 +49,9 @@ bool bicgstab(const MatrixType& mat, const Rhs& rhs, Dest& x,
     x.setZero();
     return true;
   }
-  Scalar rho = 1;
-  Scalar alpha = 1;
-  Scalar w = 1;
+  Scalar rho (1);
+  Scalar alpha (1);
+  Scalar w (1);

   VectorType v = VectorType::Zero(n), p = VectorType::Zero(n);
   VectorType y(n), z(n);
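Direct-initialization such as Scalar rho(1) still compiles when Scalar is a custom type whose constructor from an integer is explicit, whereas Scalar rho = 1 requires an implicit conversion. A stripped-down illustration with a hypothetical scalar type:

    struct MyScalar {
      explicit MyScalar(int) {}   // no implicit conversion from int
    };

    void demo() {
      // MyScalar rho = 1;        // error: copy-initialization needs an implicit conversion
      MyScalar rho(1);            // fine: direct-initialization may use the explicit constructor
      (void)rho;
    }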
@@ -29,8 +29,6 @@ void conjugate_gradient(const MatrixType& mat, const Rhs& rhs, Dest& x,
   const Preconditioner& precond, Index& iters,
   typename Dest::RealScalar& tol_error)
 {
-  using std::sqrt;
-  using std::abs;
   typedef typename Dest::RealScalar RealScalar;
   typedef typename Dest::Scalar Scalar;
   typedef Matrix<Scalar,Dynamic,1> VectorType;
@@ -56,7 +54,7 @@ void conjugate_gradient(const MatrixType& mat, const Rhs& rhs, Dest& x,
   if (residualNorm2 < threshold)
   {
     iters = 0;
-    tol_error = sqrt(residualNorm2 / rhsNorm2);
+    tol_error = numext::sqrt(residualNorm2 / rhsNorm2);
     return;
   }

@@ -86,7 +84,7 @@ void conjugate_gradient(const MatrixType& mat, const Rhs& rhs, Dest& x,
   p = z + beta * p; // update search direction
   i++;
   }
-  tol_error = sqrt(residualNorm2 / rhsNorm2);
+  tol_error = numext::sqrt(residualNorm2 / rhsNorm2);
   iters = i;
 }

@@ -160,13 +160,13 @@ class IncompleteCholesky : public SparseSolverBase<IncompleteCholesky<Scalar,_Up
   }

   /** \returns the sparse lower triangular factor L */
-  const FactorType& matrixL() const { eigen_assert("m_factorizationIsOk"); return m_L; }
+  const FactorType& matrixL() const { eigen_assert(m_factorizationIsOk && "factorize() should be called first"); return m_L; }

   /** \returns a vector representing the scaling factor S */
-  const VectorRx& scalingS() const { eigen_assert("m_factorizationIsOk"); return m_scale; }
+  const VectorRx& scalingS() const { eigen_assert(m_factorizationIsOk && "factorize() should be called first"); return m_scale; }

   /** \returns the fill-in reducing permutation P (can be empty for a natural ordering) */
-  const PermutationType& permutationP() const { eigen_assert("m_analysisIsOk"); return m_perm; }
+  const PermutationType& permutationP() const { eigen_assert(m_analysisIsOk && "analyzePattern() should be called first"); return m_perm; }

 protected:
   FactorType m_L; // The lower part stored in CSC
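The old asserts tested a string literal, which decays to a non-null pointer and is therefore always true, so the guard never fired; the fix asserts the flag itself and keeps the text only as the message. The general pattern, in isolation:

    #include <cassert>

    void pattern(bool factorization_is_ok) {
      // assert("factorization_is_ok");                      // always passes: a literal is never null
      assert(factorization_is_ok && "factorize() first");    // actual check, literal only documents it
    }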
@@ -35,6 +35,13 @@
 #ifndef EIGEN_INVERSE_SIZE_4_H
 #define EIGEN_INVERSE_SIZE_4_H

+#if EIGEN_COMP_GNUC_STRICT
+// These routines requires bit manipulation of the sign, which is not compatible
+// with fastmath.
+#pragma GCC push_options
+#pragma GCC optimize ("no-fast-math")
+#endif
+
 namespace Eigen
 {
 namespace internal
@@ -143,8 +150,8 @@ struct compute_inverse_size4<Architecture::Target, float, MatrixType, ResultType
   iC = psub(iC, pmul(vec4f_swizzle2(A, A, 1, 0, 3, 2), vec4f_swizzle2(DC, DC, 2, 1, 2, 1)));
   iC = psub(pmul(B, vec4f_duplane(dC, 0)), iC);

-  const float sign_mask[4] = {0.0f, numext::bit_cast<float>(0x80000000u), numext::bit_cast<float>(0x80000000u), 0.0f};
-  const Packet4f p4f_sign_PNNP = ploadu<Packet4f>(sign_mask);
+  EIGEN_ALIGN_MAX const float sign_mask[4] = {0.0f, -0.0f, -0.0f, 0.0f};
+  const Packet4f p4f_sign_PNNP = pload<Packet4f>(sign_mask);
   rd = pxor(rd, p4f_sign_PNNP);
   iA = pmul(iA, rd);
   iB = pmul(iB, rd);
@@ -326,10 +333,10 @@ struct compute_inverse_size4<Architecture::Target, double, MatrixType, ResultTyp
   iC1 = psub(pmul(B1, dC), iC1);
   iC2 = psub(pmul(B2, dC), iC2);

-  const double sign_mask1[2] = {0.0, numext::bit_cast<double>(0x8000000000000000ull)};
-  const double sign_mask2[2] = {numext::bit_cast<double>(0x8000000000000000ull), 0.0};
-  const Packet2d sign_PN = ploadu<Packet2d>(sign_mask1);
-  const Packet2d sign_NP = ploadu<Packet2d>(sign_mask2);
+  EIGEN_ALIGN_MAX const double sign_mask1[2] = {0.0, -0.0};
+  EIGEN_ALIGN_MAX const double sign_mask2[2] = {-0.0, 0.0};
+  const Packet2d sign_PN = pload<Packet2d>(sign_mask1);
+  const Packet2d sign_NP = pload<Packet2d>(sign_mask2);
   d1 = pxor(rd, sign_PN);
   d2 = pxor(rd, sign_NP);

@@ -348,4 +355,9 @@ struct compute_inverse_size4<Architecture::Target, double, MatrixType, ResultTyp
 #endif
 } // namespace internal
 } // namespace Eigen

+#if EIGEN_COMP_GNUC_STRICT
+#pragma GCC pop_options
+#endif
+
 #endif
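Two details behind these hunks: the literal -0.0 carries only the IEEE sign bit, so the new initializers are bit-for-bit the masks the old code built with numext::bit_cast, and EIGEN_ALIGN_MAX makes the arrays safe for the aligned pload in place of ploadu. The surrounding no-fast-math guard exists because fast-math lets the compiler ignore the sign of zero, which would defeat the sign-flip trick. A quick check of the bit pattern:

    #include <cstdint>
    #include <cstring>

    bool negative_zero_is_sign_bit() {
      float nz = -0.0f;
      std::uint32_t bits = 0;
      std::memcpy(&bits, &nz, sizeof bits);
      return bits == 0x80000000u;   // same mask the old code obtained via numext::bit_cast
    }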
@@ -258,12 +258,12 @@ class SPQR : public SparseSolverBase<SPQR<_MatrixType> >
   int m_ordering; // Ordering method to use, see SPQR's manual
   int m_allow_tol; // Allow to use some tolerance during numerical factorization.
   RealScalar m_tolerance; // treat columns with 2-norm below this tolerance as zero
-  mutable cholmod_sparse *m_cR; // The sparse R factor in cholmod format
+  mutable cholmod_sparse *m_cR = nullptr; // The sparse R factor in cholmod format
   mutable MatrixType m_R; // The sparse matrix R in Eigen format
-  mutable StorageIndex *m_E; // The permutation applied to columns
+  mutable StorageIndex *m_E = nullptr; // The permutation applied to columns
-  mutable cholmod_sparse *m_H; //The householder vectors
+  mutable cholmod_sparse *m_H = nullptr; //The householder vectors
-  mutable StorageIndex *m_HPinv; // The row permutation of H
+  mutable StorageIndex *m_HPinv = nullptr; // The row permutation of H
-  mutable cholmod_dense *m_HTau; // The Householder coefficients
+  mutable cholmod_dense *m_HTau = nullptr; // The Householder coefficients
   mutable Index m_rank; // The rank of the matrix
   mutable cholmod_common m_cc; // Workspace and parameters
   bool m_useDefaultThreshold; // Use default threshold
@@ -27,6 +27,10 @@
|
|||||||
#define eigen_internal_assert(X) assert(X);
|
#define eigen_internal_assert(X) assert(X);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
|
||||||
|
#include <iostream>
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace Eigen {
|
namespace Eigen {
|
||||||
|
|
||||||
#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
|
#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
|
||||||
@@ -172,7 +176,7 @@ public:
|
|||||||
|
|
||||||
void setSwitchSize(int s)
|
void setSwitchSize(int s)
|
||||||
{
|
{
|
||||||
eigen_assert(s>3 && "BDCSVD the size of the algo switch has to be greater than 3");
|
eigen_assert(s>=3 && "BDCSVD the size of the algo switch has to be at least 3.");
|
||||||
m_algoswap = s;
|
m_algoswap = s;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -404,7 +408,7 @@ void BDCSVD<MatrixType>::structured_update(Block<MatrixXr,Dynamic,Dynamic> A, co
|
|||||||
//@param lastCol : The Index of the last column of the submatrix of m_computed and for m_naiveU;
|
//@param lastCol : The Index of the last column of the submatrix of m_computed and for m_naiveU;
|
||||||
// lastCol + 1 - firstCol is the size of the submatrix.
|
// lastCol + 1 - firstCol is the size of the submatrix.
|
||||||
//@param firstRowW : The Index of the first row of the matrix W that we are to change. (see the reference paper section 1 for more information on W)
|
//@param firstRowW : The Index of the first row of the matrix W that we are to change. (see the reference paper section 1 for more information on W)
|
||||||
//@param firstRowW : Same as firstRowW with the column.
|
//@param firstColW : Same as firstRowW with the column.
|
||||||
//@param shift : Each time one takes the left submatrix, one must add 1 to the shift. Why? Because! We actually want the last column of the U submatrix
|
//@param shift : Each time one takes the left submatrix, one must add 1 to the shift. Why? Because! We actually want the last column of the U submatrix
|
||||||
// to become the first column (*coeff) and to shift all the other columns to the right. There are more details on the reference paper.
|
// to become the first column (*coeff) and to shift all the other columns to the right. There are more details on the reference paper.
|
||||||
template<typename MatrixType>
|
template<typename MatrixType>
|
||||||
@@ -899,7 +903,7 @@ void BDCSVD<MatrixType>::computeSingVals(const ArrayRef& col0, const ArrayRef& d
|
|||||||
RealScalar fLeft = secularEq(leftShifted, col0, diag, perm, diagShifted, shift);
|
RealScalar fLeft = secularEq(leftShifted, col0, diag, perm, diagShifted, shift);
|
||||||
eigen_internal_assert(fLeft<Literal(0));
|
eigen_internal_assert(fLeft<Literal(0));
|
||||||
|
|
||||||
#if defined EIGEN_INTERNAL_DEBUGGING || defined EIGEN_BDCSVD_SANITY_CHECKS
|
#if defined EIGEN_BDCSVD_DEBUG_VERBOSE || defined EIGEN_BDCSVD_SANITY_CHECKS || defined EIGEN_INTERNAL_DEBUGGING
|
||||||
RealScalar fRight = secularEq(rightShifted, col0, diag, perm, diagShifted, shift);
|
RealScalar fRight = secularEq(rightShifted, col0, diag, perm, diagShifted, shift);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@@ -974,8 +978,8 @@ void BDCSVD<MatrixType>::computeSingVals(const ArrayRef& col0, const ArrayRef& d
|
|||||||
// perturb singular value slightly if it equals diagonal entry to avoid division by zero later
|
// perturb singular value slightly if it equals diagonal entry to avoid division by zero later
|
||||||
// (deflation is supposed to avoid this from happening)
|
// (deflation is supposed to avoid this from happening)
|
||||||
// - this does no seem to be necessary anymore -
|
// - this does no seem to be necessary anymore -
|
||||||
// if (singVals[k] == left) singVals[k] *= 1 + NumTraits<RealScalar>::epsilon();
|
// if (singVals[k] == left) singVals[k] *= 1 + NumTraits<RealScalar>::epsilon();
|
||||||
// if (singVals[k] == right) singVals[k] *= 1 - NumTraits<RealScalar>::epsilon();
|
// if (singVals[k] == right) singVals[k] *= 1 - NumTraits<RealScalar>::epsilon();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1029,7 +1033,14 @@ void BDCSVD<MatrixType>::perturbCol0
 std::cout << " " << "j=" << j << "\n";
 }
 #endif
-Index j = i<k ? i : perm(l-1);
+// Avoid index out of bounds.
+// Will end up setting zhat(k) = 0.
+if (i >= k && l == 0) {
+m_info = NumericalIssue;
+prod = 0;
+break;
+}
+Index j = i<k ? i : l > 0 ? perm(l-1) : i;
 #ifdef EIGEN_BDCSVD_SANITY_CHECKS
 if(!(dk!=Literal(0) || diag(i)!=Literal(0)))
 {
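The hunk above guards the perm lookup: when l == 0 there is no perm(l-1) to read, so the index falls back to i and the entry is flagged as a numerical issue. A minimal standalone sketch of the guarded lookup (the helper name and types are illustrative, not part of the patch):

    #include <Eigen/Dense>

    // Hypothetical helper mirroring the patched expression
    //   Index j = i<k ? i : l > 0 ? perm(l-1) : i;
    // so that l == 0 can no longer index perm out of bounds.
    Eigen::Index safeColumnIndex(Eigen::Index i, Eigen::Index k, Eigen::Index l,
                                 const Eigen::VectorXi& perm) {
      return i < k ? i : (l > 0 ? static_cast<Eigen::Index>(perm(l - 1)) : i);
    }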
@@ -1242,8 +1253,8 @@ void BDCSVD<MatrixType>::deflation(Eigen::Index firstCol, Eigen::Index lastCol,
 #endif
 {
 // Check for total deflation
-// If we have a total deflation, then we have to consider col0(0)==diag(0) as a singular value during sorting
-bool total_deflation = (col0.tail(length-1).array()<considerZero).all();
+// If we have a total deflation, then we have to consider col0(0)==diag(0) as a singular value during sorting.
+const bool total_deflation = (col0.tail(length-1).array().abs()<considerZero).all();

 // Sort the diagonal entries, since diag(1:k-1) and diag(k:length) are already sorted, let's do a sorted merge.
 // First, compute the respective permutation.
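The total-deflation test now compares magnitudes, so a large negative entry in col0 no longer counts as numerically zero. A self-contained sketch of the two idioms (values and threshold are made up for illustration, not taken from the patch):

    #include <Eigen/Dense>
    #include <iostream>

    int main() {
      Eigen::VectorXd col0(4);
      col0 << 1.0, -5.0, 1e-18, 0.0;           // tail has a large negative entry
      const double considerZero = 1e-17;        // illustrative threshold only
      bool signedTest = (col0.tail(3).array() < considerZero).all();        // true: -5 passes a signed "<"
      bool absTest    = (col0.tail(3).array().abs() < considerZero).all();  // false: |-5| is not small
      std::cout << signedTest << " " << absTest << "\n";
    }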
@@ -680,6 +680,7 @@ JacobiSVD<MatrixType, QRPreconditioner>::compute(const MatrixType& matrix, unsig
 if (!(numext::isfinite)(scale)) {
 m_isInitialized = true;
 m_info = InvalidInput;
+m_nonzeroSingularValues = 0;
 return *this;
 }
 if(scale==RealScalar(0)) scale = RealScalar(1);
@@ -161,9 +161,10 @@ void upperbidiagonalization_blocked_helper(MatrixType& A,
 typedef typename MatrixType::Scalar Scalar;
 typedef typename MatrixType::RealScalar RealScalar;
 typedef typename NumTraits<RealScalar>::Literal Literal;
-enum { StorageOrder = traits<MatrixType>::Flags & RowMajorBit };
-typedef InnerStride<int(StorageOrder) == int(ColMajor) ? 1 : Dynamic> ColInnerStride;
-typedef InnerStride<int(StorageOrder) == int(ColMajor) ? Dynamic : 1> RowInnerStride;
+static const int StorageOrder =
+(traits<MatrixType>::Flags & RowMajorBit) ? RowMajor : ColMajor;
+typedef InnerStride<StorageOrder == ColMajor ? 1 : Dynamic> ColInnerStride;
+typedef InnerStride<StorageOrder == ColMajor ? Dynamic : 1> RowInnerStride;
 typedef Ref<Matrix<Scalar, Dynamic, 1>, 0, ColInnerStride> SubColumnType;
 typedef Ref<Matrix<Scalar, 1, Dynamic>, 0, RowInnerStride> SubRowType;
 typedef Ref<Matrix<Scalar, Dynamic, Dynamic, StorageOrder > > SubMatType;
@@ -293,7 +294,7 @@ void upperbidiagonalization_inplace_blocked(MatrixType& A, BidiagType& bidiagona
 Index size = (std::min)(rows, cols);

 // X and Y are work space
-enum { StorageOrder = traits<MatrixType>::Flags & RowMajorBit };
+enum { StorageOrder = (traits<MatrixType>::Flags & RowMajorBit) ? RowMajor : ColMajor };
 Matrix<Scalar,
 MatrixType::RowsAtCompileTime,
 Dynamic,
@@ -429,12 +429,7 @@ struct unary_evaluator<Block<ArgType,BlockRows,BlockCols,InnerPanel>, IteratorBa

 enum {
 IsRowMajor = XprType::IsRowMajor,
-OuterVector = (BlockCols==1 && ArgType::IsRowMajor)
-| // FIXME | instead of || to please GCC 4.4.0 stupid warning "suggest parentheses around &&".
-// revert to || as soon as not needed anymore.
-(BlockRows==1 && !ArgType::IsRowMajor),
+OuterVector = (BlockCols == 1 && ArgType::IsRowMajor) || (BlockRows == 1 && !ArgType::IsRowMajor),

 CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
 Flags = XprType::Flags
 };
@@ -237,6 +237,7 @@ class Map<SparseMatrixType>
 /** Constructs a read-write Map to a sparse matrix of size \a rows x \a cols, containing \a nnz non-zero coefficients,
 * stored as a sparse format as defined by the pointers \a outerIndexPtr, \a innerIndexPtr, and \a valuePtr.
 * If the optional parameter \a innerNonZerosPtr is the null pointer, then a standard compressed format is assumed.
+* The inner indices must be sorted appropriately.
 *
 * This constructor is available only if \c SparseMatrixType is non-const.
 *
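The documented constructor maps user-owned compressed arrays without copying; the added sentence makes the sorted-inner-indices precondition explicit. A minimal standalone sketch of that constructor (the 3x3 matrix and its arrays are made up for illustration):

    #include <Eigen/Sparse>
    #include <iostream>

    int main() {
      // Column-major compressed storage for a 3x3 matrix with 4 non-zeros;
      // row (inner) indices are sorted within each column, as required.
      int    outer[4]  = {0, 2, 3, 4};          // column start offsets
      int    inner[4]  = {0, 2, 1, 2};          // sorted row indices per column
      double values[4] = {1.0, 3.0, 2.0, 4.0};
      Eigen::Map<Eigen::SparseMatrix<double> > m(3, 3, 4, outer, inner, values);
      std::cout << m << "\n";
    }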
@@ -781,18 +781,17 @@ class SparseMatrix
 return *this;
 }

-#ifndef EIGEN_PARSED_BY_DOXYGEN
 template<typename OtherDerived>
 inline SparseMatrix& operator=(const EigenBase<OtherDerived>& other)
 { return Base::operator=(other.derived()); }

 template<typename Lhs, typename Rhs>
 inline SparseMatrix& operator=(const Product<Lhs,Rhs,AliasFreeProduct>& other);
-#endif // EIGEN_PARSED_BY_DOXYGEN

 template<typename OtherDerived>
 EIGEN_DONT_INLINE SparseMatrix& operator=(const SparseMatrixBase<OtherDerived>& other);

+#ifndef EIGEN_NO_IO
 friend std::ostream & operator << (std::ostream & s, const SparseMatrix& m)
 {
 EIGEN_DBG_SPARSE(
@@ -837,6 +836,7 @@ class SparseMatrix
 s << static_cast<const SparseMatrixBase<SparseMatrix>&>(m);
 return s;
 }
+#endif

 /** Destructor */
 inline ~SparseMatrix()
@@ -113,7 +113,7 @@ template<typename Derived> class SparseMatrixBase
 Transpose<const Derived>
 >::type AdjointReturnType;
 typedef Transpose<Derived> TransposeReturnType;
-typedef typename internal::add_const<Transpose<const Derived> >::type ConstTransposeReturnType;
+typedef Transpose<const Derived> ConstTransposeReturnType;

 // FIXME storage order do not match evaluator storage order
 typedef SparseMatrix<Scalar, Flags&RowMajorBit ? RowMajor : ColMajor, StorageIndex> PlainObject;
@@ -214,7 +214,7 @@ template<typename Derived> class SparseMatrixBase
 inline void assignGeneric(const OtherDerived& other);

 public:
+#ifndef EIGEN_NO_IO
 friend std::ostream & operator << (std::ostream & s, const SparseMatrixBase& m)
 {
 typedef typename Derived::Nested Nested;
@@ -263,6 +263,7 @@ template<typename Derived> class SparseMatrixBase
 }
 return s;
 }
+#endif

 template<typename OtherDerived>
 Derived& operator+=(const SparseMatrixBase<OtherDerived>& other);
@@ -165,6 +165,7 @@ protected:
 } // end namespace internal

 // sparse matrix = sparse-product (can be sparse*sparse, sparse*perm, etc.)

 template<typename Scalar, int _Options, typename _StorageIndex>
 template<typename Lhs, typename Rhs>
 SparseMatrix<Scalar,_Options,_StorageIndex>& SparseMatrix<Scalar,_Options,_StorageIndex>::operator=(const Product<Lhs,Rhs,AliasFreeProduct>& src)
@@ -90,9 +90,9 @@ struct sparse_sparse_product_with_pruning_selector<Lhs,Rhs,ResultType,ColMajor,C

 static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance)
 {
-typename remove_all<ResultType>::type _res(res.rows(), res.cols());
-internal::sparse_sparse_product_with_pruning_impl<Lhs,Rhs,ResultType>(lhs, rhs, _res, tolerance);
-res.swap(_res);
+typename remove_all<ResultType>::type res_(res.rows(), res.cols());
+internal::sparse_sparse_product_with_pruning_impl<Lhs,Rhs,ResultType>(lhs, rhs, res_, tolerance);
+res.swap(res_);
 }
 };

@@ -104,9 +104,9 @@ struct sparse_sparse_product_with_pruning_selector<Lhs,Rhs,ResultType,ColMajor,C
 {
 // we need a col-major matrix to hold the result
 typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::StorageIndex> SparseTemporaryType;
-SparseTemporaryType _res(res.rows(), res.cols());
-internal::sparse_sparse_product_with_pruning_impl<Lhs,Rhs,SparseTemporaryType>(lhs, rhs, _res, tolerance);
-res = _res;
+SparseTemporaryType res_(res.rows(), res.cols());
+internal::sparse_sparse_product_with_pruning_impl<Lhs,Rhs,SparseTemporaryType>(lhs, rhs, res_, tolerance);
+res = res_;
 }
 };

@@ -117,9 +117,9 @@ struct sparse_sparse_product_with_pruning_selector<Lhs,Rhs,ResultType,RowMajor,R
 static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance)
 {
 // let's transpose the product to get a column x column product
-typename remove_all<ResultType>::type _res(res.rows(), res.cols());
-internal::sparse_sparse_product_with_pruning_impl<Rhs,Lhs,ResultType>(rhs, lhs, _res, tolerance);
-res.swap(_res);
+typename remove_all<ResultType>::type res_(res.rows(), res.cols());
+internal::sparse_sparse_product_with_pruning_impl<Rhs,Lhs,ResultType>(rhs, lhs, res_, tolerance);
+res.swap(res_);
 }
 };

@@ -137,9 +137,9 @@ struct sparse_sparse_product_with_pruning_selector<Lhs,Rhs,ResultType,RowMajor,R

 // let's transpose the product to get a column x column product
 // typedef SparseMatrix<typename ResultType::Scalar> SparseTemporaryType;
-// SparseTemporaryType _res(res.cols(), res.rows());
-// sparse_sparse_product_with_pruning_impl<Rhs,Lhs,SparseTemporaryType>(rhs, lhs, _res);
-// res = _res.transpose();
+// SparseTemporaryType res_(res.cols(), res.rows());
+// sparse_sparse_product_with_pruning_impl<Rhs,Lhs,SparseTemporaryType>(rhs, lhs, res_);
+// res = res_.transpose();
 }
 };

@@ -329,6 +329,7 @@ class SparseVector
 }
 #endif

+#ifndef EIGEN_NO_IO
 friend std::ostream & operator << (std::ostream & s, const SparseVector& m)
 {
 for (Index i=0; i<m.nonZeros(); ++i)
@@ -336,6 +337,7 @@ class SparseVector
 s << std::endl;
 return s;
 }
+#endif

 /** Destructor */
 inline ~SparseVector() {}
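These #ifndef EIGEN_NO_IO guards (here and in the SparseMatrix/SparseMatrixBase hunks above) let the stream operators and their <sstream>/<iosfwd> dependencies compile out entirely. A minimal sketch of a translation unit built with the macro defined (purely illustrative):

    // Defining EIGEN_NO_IO before including Eigen removes the guarded operator<< overloads.
    #define EIGEN_NO_IO
    #include <Eigen/SparseCore>

    int main() {
      Eigen::SparseVector<double> v(10);
      v.insert(3) = 1.5;
      return v.nonZeros() == 1 ? 0 : 1;   // no streaming here: operator<< is compiled out
    }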
@@ -270,11 +270,11 @@ struct sparse_solve_triangular_sparse_selector<Lhs,Rhs,Mode,UpLo,ColMajor>
 }


-Index count = 0;
+// Index count = 0;
 // FIXME compute a reference value to filter zeros
 for (typename AmbiVector<Scalar,StorageIndex>::Iterator it(tempVector/*,1e-12*/); it; ++it)
 {
-++ count;
+// ++ count;
 // std::cerr << "fill " << it.index() << ", " << col << "\n";
 // std::cout << it.value() << " ";
 // FIXME use insertBack
@@ -35,9 +35,10 @@ public:
 MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
 };

-SparseLUTransposeView() : m_sparseLU(NULL) {}
-SparseLUTransposeView(const SparseLUTransposeView& view) {
+SparseLUTransposeView() : APIBase(), m_sparseLU(NULL) {}
+SparseLUTransposeView(const SparseLUTransposeView& view) : APIBase() {
 this->m_sparseLU = view.m_sparseLU;
+this->m_isInitialized = view.m_isInitialized;
 }
 void setIsInitialized(const bool isInitialized) {this->m_isInitialized = isInitialized;}
 void setSparseLU(SparseLUType* sparseLU) {m_sparseLU = sparseLU;}
@@ -752,10 +753,13 @@ void SparseLU<MatrixType, OrderingType>::factorize(const MatrixType& matrix)
 info = Base::pivotL(jj, m_diagpivotthresh, m_perm_r.indices(), iperm_c.indices(), pivrow, m_glu);
 if ( info )
 {
-m_lastError = "THE MATRIX IS STRUCTURALLY SINGULAR ... ZERO COLUMN AT ";
+m_lastError = "THE MATRIX IS STRUCTURALLY SINGULAR";
+#ifndef EIGEN_NO_IO
 std::ostringstream returnInfo;
+returnInfo << " ... ZERO COLUMN AT ";
 returnInfo << info;
 m_lastError += returnInfo.str();
+#endif
 m_info = NumericalIssue;
 m_factorizationIsOk = false;
 return;
@@ -830,7 +834,6 @@ struct SparseLUMatrixUReturnType : internal::no_assignment_operator
 template<typename Dest> void solveInPlace(MatrixBase<Dest> &X) const
 {
 Index nrhs = X.cols();
-Index n = X.rows();
 // Backward solve with U
 for (Index k = m_mapL.nsuper(); k >= 0; k--)
 {
@@ -850,7 +853,7 @@ struct SparseLUMatrixUReturnType : internal::no_assignment_operator
 {
 // FIXME: the following lines should use Block expressions and not Map!
 Map<const Matrix<Scalar,Dynamic,Dynamic, ColMajor>, 0, OuterStride<> > A( &(m_mapL.valuePtr()[luptr]), nsupc, nsupc, OuterStride<>(lda) );
-Map< Matrix<Scalar,Dynamic,Dest::ColsAtCompileTime, ColMajor>, 0, OuterStride<> > U (&(X.coeffRef(fsupc,0)), nsupc, nrhs, OuterStride<>(n) );
+typename Dest::RowsBlockXpr U = X.derived().middleRows(fsupc, nsupc);
 U = A.template triangularView<Upper>().solve(U);
 }

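This hunk (and the matching ones below in the transposed and supernodal solves) swaps a hand-built strided Map over &X(fsupc,0) for X.middleRows(fsupc, nsupc), which denotes the same rows without assuming X's outer stride. A standalone sketch of the equivalence on a plain dense matrix (sizes are made up for illustration):

    #include <Eigen/Dense>
    #include <iostream>

    int main() {
      Eigen::MatrixXd X = Eigen::MatrixXd::Random(8, 3);
      const int fsupc = 2, nsupc = 4;

      // Raw-pointer view of rows [fsupc, fsupc+nsupc): only valid when X is
      // column-major and its outer stride equals X.rows(), as the old code assumed.
      Eigen::Map<Eigen::MatrixXd, 0, Eigen::OuterStride<> >
          viaMap(&X(fsupc, 0), nsupc, X.cols(), Eigen::OuterStride<>(X.rows()));

      // Block expression over the same rows: no stride bookkeeping required.
      std::cout << (viaMap - X.middleRows(fsupc, nsupc)).norm() << "\n";  // prints 0
    }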
@@ -873,7 +876,6 @@ struct SparseLUMatrixUReturnType : internal::no_assignment_operator
 {
 using numext::conj;
 Index nrhs = X.cols();
-Index n = X.rows();
 // Forward solve with U
 for (Index k = 0; k <= m_mapL.nsuper(); k++)
 {
@@ -904,7 +906,7 @@ struct SparseLUMatrixUReturnType : internal::no_assignment_operator
 else
 {
 Map<const Matrix<Scalar,Dynamic,Dynamic, ColMajor>, 0, OuterStride<> > A( &(m_mapL.valuePtr()[luptr]), nsupc, nsupc, OuterStride<>(lda) );
-Map< Matrix<Scalar,Dynamic,Dest::ColsAtCompileTime, ColMajor>, 0, OuterStride<> > U (&(X(fsupc,0)), nsupc, nrhs, OuterStride<>(n) );
+typename Dest::RowsBlockXpr U = X.derived().middleRows(fsupc, nsupc);
 if(Conjugate)
 U = A.adjoint().template triangularView<Lower>().solve(U);
 else
@@ -71,7 +71,7 @@
 namespace Eigen {
 namespace internal {

-typedef enum {LUSUP, UCOL, LSUB, USUB, LLVL, ULVL} MemType;
+enum MemType {LUSUP, UCOL, LSUB, USUB, LLVL, ULVL};

 template <typename IndexVector, typename ScalarVector>
 struct LU_GlobalLU_t {
@@ -274,9 +274,8 @@ void MappedSuperNodalMatrix<Scalar,Index_>::solveInPlace( MatrixBase<Dest>&X) co

 // Triangular solve
 Map<const Matrix<Scalar,Dynamic,Dynamic, ColMajor>, 0, OuterStride<> > A( &(Lval[luptr]), nsupc, nsupc, OuterStride<>(lda) );
-Map< Matrix<Scalar,Dynamic,Dest::ColsAtCompileTime, ColMajor>, 0, OuterStride<> > U (&(X(fsupc,0)), nsupc, nrhs, OuterStride<>(n) );
+typename Dest::RowsBlockXpr U = X.derived().middleRows(fsupc, nsupc);
 U = A.template triangularView<UnitLower>().solve(U);

 // Matrix-vector product
 new (&A) Map<const Matrix<Scalar,Dynamic,Dynamic, ColMajor>, 0, OuterStride<> > ( &(Lval[luptr+nsupc]), nrow, nsupc, OuterStride<>(lda) );
 work.topRows(nrow).noalias() = A * U;
@@ -349,7 +348,7 @@ void MappedSuperNodalMatrix<Scalar,Index_>::solveTransposedInPlace( MatrixBase<D

 // Matrix-vector product with transposed submatrix
 Map<const Matrix<Scalar,Dynamic,Dynamic, ColMajor>, 0, OuterStride<> > A( &(Lval[luptr+nsupc]), nrow, nsupc, OuterStride<>(lda) );
-Map< Matrix<Scalar,Dynamic,Dest::ColsAtCompileTime, ColMajor>, 0, OuterStride<> > U (&(X(fsupc,0)), nsupc, nrhs, OuterStride<>(n) );
+typename Dest::RowsBlockXpr U = X.derived().middleRows(fsupc, nsupc);
 if(Conjugate)
 U = U - A.adjoint() * work.topRows(nrow);
 else
@@ -1,280 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2012 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_SPARSELU_GEMM_KERNEL_H
-#define EIGEN_SPARSELU_GEMM_KERNEL_H
-
-namespace Eigen {
-
-namespace internal {
-
-
-/** \internal
-* A general matrix-matrix product kernel optimized for the SparseLU factorization.
-* - A, B, and C must be column major
-* - lda and ldc must be multiples of the respective packet size
-* - C must have the same alignment as A
-*/
-template<typename Scalar>
-EIGEN_DONT_INLINE
-void sparselu_gemm(Index m, Index n, Index d, const Scalar* A, Index lda, const Scalar* B, Index ldb, Scalar* C, Index ldc)
-{
-using namespace Eigen::internal;
-
-typedef typename packet_traits<Scalar>::type Packet;
-enum {
-NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
-PacketSize = packet_traits<Scalar>::size,
-PM = 8, // peeling in M
-RN = 2, // register blocking
-RK = NumberOfRegisters>=16 ? 4 : 2, // register blocking
-BM = 4096/sizeof(Scalar), // number of rows of A-C per chunk
-SM = PM*PacketSize // step along M
-};
-Index d_end = (d/RK)*RK; // number of columns of A (rows of B) suitable for full register blocking
-Index n_end = (n/RN)*RN; // number of columns of B-C suitable for processing RN columns at once
-Index i0 = internal::first_default_aligned(A,m);
-
-eigen_internal_assert(((lda%PacketSize)==0) && ((ldc%PacketSize)==0) && (i0==internal::first_default_aligned(C,m)));
-
-// handle the non aligned rows of A and C without any optimization:
-for(Index i=0; i<i0; ++i)
-{
-for(Index j=0; j<n; ++j)
-{
-Scalar c = C[i+j*ldc];
-for(Index k=0; k<d; ++k)
-c += B[k+j*ldb] * A[i+k*lda];
-C[i+j*ldc] = c;
-}
-}
-// process the remaining rows per chunk of BM rows
-for(Index ib=i0; ib<m; ib+=BM)
-{
-Index actual_b = std::min<Index>(BM, m-ib); // actual number of rows
-Index actual_b_end1 = (actual_b/SM)*SM; // actual number of rows suitable for peeling
-Index actual_b_end2 = (actual_b/PacketSize)*PacketSize; // actual number of rows suitable for vectorization
-
-// Let's process two columns of B-C at once
-for(Index j=0; j<n_end; j+=RN)
-{
-const Scalar* Bc0 = B+(j+0)*ldb;
-const Scalar* Bc1 = B+(j+1)*ldb;
-
-for(Index k=0; k<d_end; k+=RK)
-{
-
-// load and expand a RN x RK block of B
-Packet b00, b10, b20, b30, b01, b11, b21, b31;
-{ b00 = pset1<Packet>(Bc0[0]); }
-{ b10 = pset1<Packet>(Bc0[1]); }
-if(RK==4) { b20 = pset1<Packet>(Bc0[2]); }
-if(RK==4) { b30 = pset1<Packet>(Bc0[3]); }
-{ b01 = pset1<Packet>(Bc1[0]); }
-{ b11 = pset1<Packet>(Bc1[1]); }
-if(RK==4) { b21 = pset1<Packet>(Bc1[2]); }
-if(RK==4) { b31 = pset1<Packet>(Bc1[3]); }
-
-Packet a0, a1, a2, a3, c0, c1, t0, t1;
-
-const Scalar* A0 = A+ib+(k+0)*lda;
-const Scalar* A1 = A+ib+(k+1)*lda;
-const Scalar* A2 = A+ib+(k+2)*lda;
-const Scalar* A3 = A+ib+(k+3)*lda;
-
-Scalar* C0 = C+ib+(j+0)*ldc;
-Scalar* C1 = C+ib+(j+1)*ldc;
-
-a0 = pload<Packet>(A0);
-a1 = pload<Packet>(A1);
-if(RK==4)
-{
-a2 = pload<Packet>(A2);
-a3 = pload<Packet>(A3);
-}
-else
-{
-// workaround "may be used uninitialized in this function" warning
-a2 = a3 = a0;
-}
-
-#define KMADD(c, a, b, tmp) {tmp = b; tmp = pmul(a,tmp); c = padd(c,tmp);}
-#define WORK(I) \
-c0 = pload<Packet>(C0+i+(I)*PacketSize); \
-c1 = pload<Packet>(C1+i+(I)*PacketSize); \
-KMADD(c0, a0, b00, t0) \
-KMADD(c1, a0, b01, t1) \
-a0 = pload<Packet>(A0+i+(I+1)*PacketSize); \
-KMADD(c0, a1, b10, t0) \
-KMADD(c1, a1, b11, t1) \
-a1 = pload<Packet>(A1+i+(I+1)*PacketSize); \
-if(RK==4){ KMADD(c0, a2, b20, t0) }\
-if(RK==4){ KMADD(c1, a2, b21, t1) }\
-if(RK==4){ a2 = pload<Packet>(A2+i+(I+1)*PacketSize); }\
-if(RK==4){ KMADD(c0, a3, b30, t0) }\
-if(RK==4){ KMADD(c1, a3, b31, t1) }\
-if(RK==4){ a3 = pload<Packet>(A3+i+(I+1)*PacketSize); }\
-pstore(C0+i+(I)*PacketSize, c0); \
-pstore(C1+i+(I)*PacketSize, c1)
-
-// process rows of A' - C' with aggressive vectorization and peeling
-for(Index i=0; i<actual_b_end1; i+=PacketSize*8)
-{
-EIGEN_ASM_COMMENT("SPARSELU_GEMML_KERNEL1");
-prefetch((A0+i+(5)*PacketSize));
-prefetch((A1+i+(5)*PacketSize));
-if(RK==4) prefetch((A2+i+(5)*PacketSize));
-if(RK==4) prefetch((A3+i+(5)*PacketSize));
-
-WORK(0);
-WORK(1);
-WORK(2);
-WORK(3);
-WORK(4);
-WORK(5);
-WORK(6);
-WORK(7);
-}
-// process the remaining rows with vectorization only
-for(Index i=actual_b_end1; i<actual_b_end2; i+=PacketSize)
-{
-WORK(0);
-}
-#undef WORK
-// process the remaining rows without vectorization
-for(Index i=actual_b_end2; i<actual_b; ++i)
-{
-if(RK==4)
-{
-C0[i] += A0[i]*Bc0[0]+A1[i]*Bc0[1]+A2[i]*Bc0[2]+A3[i]*Bc0[3];
-C1[i] += A0[i]*Bc1[0]+A1[i]*Bc1[1]+A2[i]*Bc1[2]+A3[i]*Bc1[3];
-}
-else
-{
-C0[i] += A0[i]*Bc0[0]+A1[i]*Bc0[1];
-C1[i] += A0[i]*Bc1[0]+A1[i]*Bc1[1];
-}
-}
-
-Bc0 += RK;
-Bc1 += RK;
-} // peeled loop on k
-} // peeled loop on the columns j
-// process the last column (we now perform a matrix-vector product)
-if((n-n_end)>0)
-{
-const Scalar* Bc0 = B+(n-1)*ldb;
-
-for(Index k=0; k<d_end; k+=RK)
-{
-
-// load and expand a 1 x RK block of B
-Packet b00, b10, b20, b30;
-b00 = pset1<Packet>(Bc0[0]);
-b10 = pset1<Packet>(Bc0[1]);
-if(RK==4) b20 = pset1<Packet>(Bc0[2]);
-if(RK==4) b30 = pset1<Packet>(Bc0[3]);
-
-Packet a0, a1, a2, a3, c0, t0/*, t1*/;
-
-const Scalar* A0 = A+ib+(k+0)*lda;
-const Scalar* A1 = A+ib+(k+1)*lda;
-const Scalar* A2 = A+ib+(k+2)*lda;
-const Scalar* A3 = A+ib+(k+3)*lda;
-
-Scalar* C0 = C+ib+(n_end)*ldc;
-
-a0 = pload<Packet>(A0);
-a1 = pload<Packet>(A1);
-if(RK==4)
-{
-a2 = pload<Packet>(A2);
-a3 = pload<Packet>(A3);
-}
-else
-{
-// workaround "may be used uninitialized in this function" warning
-a2 = a3 = a0;
-}
-
-#define WORK(I) \
-c0 = pload<Packet>(C0+i+(I)*PacketSize); \
-KMADD(c0, a0, b00, t0) \
-a0 = pload<Packet>(A0+i+(I+1)*PacketSize); \
-KMADD(c0, a1, b10, t0) \
-a1 = pload<Packet>(A1+i+(I+1)*PacketSize); \
-if(RK==4){ KMADD(c0, a2, b20, t0) }\
-if(RK==4){ a2 = pload<Packet>(A2+i+(I+1)*PacketSize); }\
-if(RK==4){ KMADD(c0, a3, b30, t0) }\
-if(RK==4){ a3 = pload<Packet>(A3+i+(I+1)*PacketSize); }\
-pstore(C0+i+(I)*PacketSize, c0);
-
-// aggressive vectorization and peeling
-for(Index i=0; i<actual_b_end1; i+=PacketSize*8)
-{
-EIGEN_ASM_COMMENT("SPARSELU_GEMML_KERNEL2");
-WORK(0);
-WORK(1);
-WORK(2);
-WORK(3);
-WORK(4);
-WORK(5);
-WORK(6);
-WORK(7);
-}
-// vectorization only
-for(Index i=actual_b_end1; i<actual_b_end2; i+=PacketSize)
-{
-WORK(0);
-}
-// remaining scalars
-for(Index i=actual_b_end2; i<actual_b; ++i)
-{
-if(RK==4)
-C0[i] += A0[i]*Bc0[0]+A1[i]*Bc0[1]+A2[i]*Bc0[2]+A3[i]*Bc0[3];
-else
-C0[i] += A0[i]*Bc0[0]+A1[i]*Bc0[1];
-}
-
-Bc0 += RK;
-#undef WORK
-}
-}
-
-// process the last columns of A, corresponding to the last rows of B
-Index rd = d-d_end;
-if(rd>0)
-{
-for(Index j=0; j<n; ++j)
-{
-enum {
-Alignment = PacketSize>1 ? Aligned : 0
-};
-typedef Map<Matrix<Scalar,Dynamic,1>, Alignment > MapVector;
-typedef Map<const Matrix<Scalar,Dynamic,1>, Alignment > ConstMapVector;
-if(rd==1) MapVector(C+j*ldc+ib,actual_b) += B[0+d_end+j*ldb] * ConstMapVector(A+(d_end+0)*lda+ib, actual_b);
-
-else if(rd==2) MapVector(C+j*ldc+ib,actual_b) += B[0+d_end+j*ldb] * ConstMapVector(A+(d_end+0)*lda+ib, actual_b)
-+ B[1+d_end+j*ldb] * ConstMapVector(A+(d_end+1)*lda+ib, actual_b);
-
-else MapVector(C+j*ldc+ib,actual_b) += B[0+d_end+j*ldb] * ConstMapVector(A+(d_end+0)*lda+ib, actual_b)
-+ B[1+d_end+j*ldb] * ConstMapVector(A+(d_end+1)*lda+ib, actual_b)
-+ B[2+d_end+j*ldb] * ConstMapVector(A+(d_end+2)*lda+ib, actual_b);
-}
-}
-
-} // blocking on the rows of A and C
-}
-#undef KMADD
-
-} // namespace internal
-
-} // namespace Eigen
-
-#endif // EIGEN_SPARSELU_GEMM_KERNEL_H
@@ -75,8 +75,6 @@ void SparseLUImpl<Scalar,StorageIndex>::heap_relax_snode (const Index n, IndexVe
 // Identify the relaxed supernodes by postorder traversal of the etree
 Index snode_start; // beginning of a snode
 StorageIndex k;
-Index nsuper_et_post = 0; // Number of relaxed snodes in postordered etree
-Index nsuper_et = 0; // Number of relaxed snodes in the original etree
 StorageIndex l;
 for (j = 0; j < n; )
 {
@@ -88,7 +86,6 @@ void SparseLUImpl<Scalar,StorageIndex>::heap_relax_snode (const Index n, IndexVe
 parent = et(j);
 }
 // Found a supernode in postordered etree, j is the last column
-++nsuper_et_post;
 k = StorageIndex(n);
 for (Index i = snode_start; i <= j; ++i)
 k = (std::min)(k, inv_post(i));
@@ -97,7 +94,6 @@ void SparseLUImpl<Scalar,StorageIndex>::heap_relax_snode (const Index n, IndexVe
 {
 // This is also a supernode in the original etree
 relax_end(k) = l; // Record last column
-++nsuper_et;
 }
 else
 {
@@ -107,7 +103,6 @@ void SparseLUImpl<Scalar,StorageIndex>::heap_relax_snode (const Index n, IndexVe
 if (descendants(i) == 0)
 {
 relax_end(l) = l;
-++nsuper_et;
 }
 }
 }
@@ -69,8 +69,7 @@ EIGEN_DONT_INLINE void LU_kernel_bmod<SegSizeAtCompileTime>::run(const Index seg
 Index aligned_with_B_offset = (PacketSize-internal::first_default_aligned(B.data(), PacketSize))%PacketSize;
 Map<Matrix<Scalar,Dynamic,1>, 0, OuterStride<> > l(tempv.data()+segsize+aligned_offset+aligned_with_B_offset, nrow, OuterStride<>(ldl) );

-l.setZero();
-internal::sparselu_gemm<Scalar>(l.rows(), l.cols(), B.cols(), B.data(), B.outerStride(), u.data(), u.outerStride(), l.data(), l.outerStride());
+l.noalias() = B * u;

 // Scatter tempv[] into SPA dense[] as a temporary storage
 isub = lptr + no_zeros;
@@ -148,8 +148,7 @@ void SparseLUImpl<Scalar,StorageIndex>::panel_bmod(const Index m, const Index w,
 Index offset = (PacketSize-internal::first_default_aligned(B.data(), PacketSize)) % PacketSize;
 MappedMatrixBlock L(tempv.data()+w*ldu+offset, nrow, u_cols, OuterStride<>(ldl));

-L.setZero();
-internal::sparselu_gemm<Scalar>(L.rows(), L.cols(), B.cols(), B.data(), B.outerStride(), U.data(), U.outerStride(), L.data(), L.outerStride());
+L.noalias() = B * U;

 // scatter U and L
 u_col = 0;
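These two hunks (together with the removal of SparseLU_gemm_kernel.h above) replace the hand-written sparselu_gemm kernel with Eigen's own product path; .noalias() writes the product directly into the mapped destination without a zeroing pass or a temporary. A standalone sketch of the idiom on plain dense objects (sizes are made up for illustration):

    #include <Eigen/Dense>
    #include <iostream>

    int main() {
      Eigen::MatrixXd B = Eigen::MatrixXd::Random(6, 4);
      Eigen::MatrixXd U = Eigen::MatrixXd::Random(4, 2);
      Eigen::MatrixXd L(6, 2);
      // Equivalent of the patched lines: noalias() tells Eigen the destination
      // does not overlap the operands, so the product is evaluated in place.
      L.noalias() = B * U;
      std::cout << L.rows() << "x" << L.cols() << "\n";
    }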
@@ -69,7 +69,7 @@ namespace internal {
 * detailed in the following paper:
 * <i>
 * Tim Davis, "Algorithm 915, SuiteSparseQR: Multifrontal Multithreaded Rank-Revealing
-* Sparse QR Factorization, ACM Trans. on Math. Soft. 38(1), 2011.
+* Sparse QR Factorization", ACM Trans. on Math. Soft. 38(1), 2011.
 * </i>
 * Even though it is qualified as "rank-revealing", this strategy might fail for some
 * rank deficient problems. When this class is used to solve linear or least-square problems
Some files were not shown because too many files have changed in this diff.