Update Eigen to version 3.3.8.
Dario Sassi
2020-10-08 08:17:27 +00:00
parent 497a533959
commit d5b135998e
66 changed files with 1047 additions and 704 deletions
-19
@@ -1,19 +0,0 @@
include(RegexUtils)
test_escape_string_as_regex()
file(GLOB Eigen_directory_files "*")
escape_string_as_regex(ESCAPED_CMAKE_CURRENT_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}")
foreach(f ${Eigen_directory_files})
if(NOT f MATCHES "\\.txt" AND NOT f MATCHES "${ESCAPED_CMAKE_CURRENT_SOURCE_DIR}/[.].+" AND NOT f MATCHES "${ESCAPED_CMAKE_CURRENT_SOURCE_DIR}/src")
list(APPEND Eigen_directory_files_to_install ${f})
endif()
endforeach(f ${Eigen_directory_files})
install(FILES
${Eigen_directory_files_to_install}
DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen COMPONENT Devel
)
install(DIRECTORY src DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen COMPONENT Devel FILES_MATCHING PATTERN "*.h")
+6 -1
@@ -279,7 +279,10 @@
#include <cmath>
#include <cassert>
#include <functional>
#include <iosfwd>
#include <sstream>
#ifndef EIGEN_NO_IO
#include <iosfwd>
#endif
#include <cstring>
#include <string>
#include <limits>
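For context, EIGEN_NO_IO is a user-configurable macro: defining it before the first Eigen include now compiles out <iosfwd> together with Eigen's stream operators, which matters on freestanding or embedded targets. A minimal usage sketch:
#define EIGEN_NO_IO  // must precede the first Eigen include
#include <Eigen/Core>
int main() {
  Eigen::Matrix2f m = Eigen::Matrix2f::Identity();
  return static_cast<int>(m.trace());  // fine: no operator<< is ever instantiated
}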
@@ -375,7 +378,9 @@ using std::ptrdiff_t;
#if defined EIGEN_VECTORIZE_AVX512
#include "src/Core/arch/SSE/PacketMath.h"
#include "src/Core/arch/SSE/MathFunctions.h"
#include "src/Core/arch/AVX/PacketMath.h"
#include "src/Core/arch/AVX/MathFunctions.h"
#include "src/Core/arch/AVX512/PacketMath.h"
#include "src/Core/arch/AVX512/MathFunctions.h"
#elif defined EIGEN_VECTORIZE_AVX
+2 -2
@@ -10,14 +10,14 @@
#include "Core"
#include "src/Core/util/DisableStupidWarnings.h"
#include "Cholesky"
#include "Jacobi"
#include "Householder"
#include "LU"
#include "Geometry"
#include "src/Core/util/DisableStupidWarnings.h"
/** \defgroup Eigenvalues_Module Eigenvalues module
*
*
+2 -2
@@ -10,12 +10,12 @@
#include "Core"
#include "src/Core/util/DisableStupidWarnings.h"
#include "SVD"
#include "LU"
#include <limits>
#include "src/Core/util/DisableStupidWarnings.h"
/** \defgroup Geometry_Module Geometry module
*
* This module provides support for:
+2 -2
@@ -10,12 +10,12 @@
#include "Core"
#include "src/Core/util/DisableStupidWarnings.h"
#include "Cholesky"
#include "Jacobi"
#include "Householder"
#include "src/Core/util/DisableStupidWarnings.h"
/** \defgroup QR_Module QR module
*
*
-1
@@ -28,7 +28,6 @@
*
*/
#include "OrderingMethods"
#include "src/SparseCore/SparseColEtree.h"
#include "src/SparseQR/SparseQR.h"
+2 -2
@@ -153,8 +153,8 @@ template<typename Derived> class ArrayBase
// inline void evalTo(Dest& dst) const { dst = matrix(); }
protected:
EIGEN_DEVICE_FUNC
ArrayBase() : Base() {}
EIGEN_DEFAULT_COPY_CONSTRUCTOR(ArrayBase)
EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(ArrayBase)
private:
explicit ArrayBase(Index);
+2
@@ -121,6 +121,8 @@ class CwiseUnaryViewImpl<ViewOp,MatrixType,Dense>
{
return derived().nestedExpression().outerStride() * sizeof(typename internal::traits<MatrixType>::Scalar) / sizeof(Scalar);
}
protected:
EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(CwiseUnaryViewImpl)
};
} // end namespace Eigen
+4 -3
@@ -40,7 +40,7 @@ static inline void check_DenseIndex_is_signed() {
*/
template<typename Derived> class DenseBase
#ifndef EIGEN_PARSED_BY_DOXYGEN
: public DenseCoeffsBase<Derived>
: public DenseCoeffsBase<Derived, internal::accessors_level<Derived>::value>
#else
: public DenseCoeffsBase<Derived,DirectWriteAccessors>
#endif // not EIGEN_PARSED_BY_DOXYGEN
@@ -71,7 +71,7 @@ template<typename Derived> class DenseBase
typedef Scalar value_type;
typedef typename NumTraits<Scalar>::Real RealScalar;
typedef DenseCoeffsBase<Derived> Base;
typedef DenseCoeffsBase<Derived, internal::accessors_level<Derived>::value> Base;
using Base::derived;
using Base::const_cast_derived;
@@ -587,11 +587,12 @@ template<typename Derived> class DenseBase
}
protected:
EIGEN_DEFAULT_COPY_CONSTRUCTOR(DenseBase)
/** Default constructor. Do nothing. */
EIGEN_DEVICE_FUNC DenseBase()
{
/* Just checks for self-consistency of the flags.
* Only do it when debugging Eigen, as this borders on paranoiac and could slow compilation down
* Only do it when debugging Eigen, as this borders on paranoia and could slow compilation down
*/
#ifdef EIGEN_INTERNAL_DEBUGGING
EIGEN_STATIC_ASSERT((EIGEN_IMPLIES(MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1, int(IsRowMajor))
+3 -3
@@ -404,7 +404,7 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynam
if(size != m_rows*m_cols)
{
internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, m_rows*m_cols);
if (size)
if (size>0) // >0 and not simply !=0 to let the compiler know that size cannot be negative
m_data = internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size);
else
m_data = 0;
@@ -479,7 +479,7 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Ro
if(size != _Rows*m_cols)
{
internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Rows*m_cols);
if (size)
if (size>0) // >0 and not simply !=0 to let the compiler know that size cannot be negative
m_data = internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size);
else
m_data = 0;
@@ -553,7 +553,7 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dyn
if(size != m_rows*_Cols)
{
internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Cols*m_rows);
if (size)
if (size>0) // >0 and not simply !=0 to let the compiler know that size cannot be negative
m_data = internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size);
else
m_data = 0;
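The repeated `size>0` change is a value-range hint rather than a behavioral one: with a signed index type, `size != 0` still admits negative values, while `size > 0` lets the optimizer assume the count is non-negative along the allocation path. A standalone sketch of the same idea (hypothetical helper, not Eigen API):
#include <cstddef>
// With `n > 0` the compiler may assume the sign bit is clear when the
// signed count is converted to the unsigned size passed to new[].
double* alloc_if_nonempty(std::ptrdiff_t n) {
  if (n > 0)  // not n != 0: rules out negative n for the optimizer
    return new double[static_cast<std::size_t>(n)];
  return nullptr;  // n == 0 (or, defensively, n < 0)
}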
+3 -6
@@ -351,10 +351,7 @@ template<typename Packet> EIGEN_DEVICE_FUNC inline Packet preverse(const Packet&
/** \internal \returns \a a with real and imaginary part flipped (for complex type only) */
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pcplxflip(const Packet& a)
{
// FIXME: uncomment the following in case we drop the internal imag and real functions.
// using std::imag;
// using std::real;
return Packet(imag(a),real(a));
return Packet(a.imag(),a.real());
}
/**************************
@@ -524,10 +521,10 @@ inline void palign(PacketType& first, const PacketType& second)
#ifndef __CUDACC__
template<> inline std::complex<float> pmul(const std::complex<float>& a, const std::complex<float>& b)
{ return std::complex<float>(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); }
{ return std::complex<float>(a.real()*b.real() - a.imag()*b.imag(), a.imag()*b.real() + a.real()*b.imag()); }
template<> inline std::complex<double> pmul(const std::complex<double>& a, const std::complex<double>& b)
{ return std::complex<double>(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); }
{ return std::complex<double>(a.real()*b.real() - a.imag()*b.imag(), a.imag()*b.real() + a.real()*b.imag()); }
#endif
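These hunks consistently replace the free-function real()/imag() calls with the std::complex member functions, avoiding argument-dependent-lookup surprises. The arithmetic is the textbook complex product (a+bi)(c+di) = (ac-bd) + (ad+bc)i; a scalar reference of what the specialized pmul computes:
#include <complex>
// Same formula as the pmul specializations above, written out once.
std::complex<float> complex_mul(const std::complex<float>& a,
                                const std::complex<float>& b) {
  return std::complex<float>(a.real()*b.real() - a.imag()*b.imag(),
                             a.imag()*b.real() + a.real()*b.imag());
}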
+5
@@ -182,6 +182,8 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
#endif
protected:
EIGEN_DEFAULT_COPY_CONSTRUCTOR(MapBase)
EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(MapBase)
template<typename T>
EIGEN_DEVICE_FUNC
@@ -294,6 +296,9 @@ template<typename Derived> class MapBase<Derived, WriteAccessors>
// In theory we could simply refer to Base::Base::operator=, but MSVC does not like Base::Base,
// see bugs 821 and 920.
using ReadOnlyMapBase::Base::operator=;
protected:
EIGEN_DEFAULT_COPY_CONSTRUCTOR(MapBase)
EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(MapBase)
};
#undef EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS
+11 -5
@@ -287,7 +287,7 @@ struct abs2_impl_default<Scalar, true> // IsComplex
EIGEN_DEVICE_FUNC
static inline RealScalar run(const Scalar& x)
{
return real(x)*real(x) + imag(x)*imag(x);
return x.real()*x.real() + x.imag()*x.imag();
}
};
@@ -313,14 +313,17 @@ struct abs2_retval
****************************************************************************/
template<typename Scalar, bool IsComplex>
struct norm1_default_impl
struct norm1_default_impl;
template<typename Scalar>
struct norm1_default_impl<Scalar,true>
{
typedef typename NumTraits<Scalar>::Real RealScalar;
EIGEN_DEVICE_FUNC
static inline RealScalar run(const Scalar& x)
{
EIGEN_USING_STD_MATH(abs);
return abs(real(x)) + abs(imag(x));
return abs(x.real()) + abs(x.imag());
}
};
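For reference, abs2 computes the squared magnitude Re(x)^2 + Im(x)^2 and norm1 the 1-norm |Re(x)| + |Im(x)| of a complex scalar; a minimal scalar sketch of the two complex branches:
#include <cmath>
#include <complex>
// Mirrors abs2_impl_default<Scalar,true> and norm1_default_impl<Scalar,true>.
double abs2_ref(const std::complex<double>& x) {
  return x.real()*x.real() + x.imag()*x.imag();
}
double norm1_ref(const std::complex<double>& x) {
  return std::abs(x.real()) + std::abs(x.imag());
}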
@@ -662,8 +665,8 @@ struct random_default_impl<Scalar, true, false>
{
static inline Scalar run(const Scalar& x, const Scalar& y)
{
return Scalar(random(real(x), real(y)),
random(imag(x), imag(y)));
return Scalar(random(x.real(), y.real()),
random(x.imag(), y.imag()));
}
static inline Scalar run()
{
@@ -916,6 +919,9 @@ inline EIGEN_MATHFUNC_RETVAL(abs2, Scalar) abs2(const Scalar& x)
return EIGEN_MATHFUNC_IMPL(abs2, Scalar)::run(x);
}
EIGEN_DEVICE_FUNC
inline bool abs2(bool x) { return x; }
template<typename Scalar>
EIGEN_DEVICE_FUNC
inline EIGEN_MATHFUNC_RETVAL(norm1, Scalar) norm1(const Scalar& x)
+2 -1
@@ -464,7 +464,8 @@ template<typename Derived> class MatrixBase
EIGEN_MATRIX_FUNCTION_1(MatrixComplexPowerReturnValue, pow, power to \c p, const std::complex<RealScalar>& p)
protected:
EIGEN_DEVICE_FUNC MatrixBase() : Base() {}
EIGEN_DEFAULT_COPY_CONSTRUCTOR(MatrixBase)
EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(MatrixBase)
private:
EIGEN_DEVICE_FUNC explicit MatrixBase(int);
+7 -5
@@ -737,8 +737,10 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void _init2(Index rows, Index cols, typename internal::enable_if<Base::SizeAtCompileTime!=2,T0>::type* = 0)
{
EIGEN_STATIC_ASSERT(bool(NumTraits<T0>::IsInteger) &&
bool(NumTraits<T1>::IsInteger),
const bool t0_is_integer_alike = internal::is_valid_index_type<T0>::value;
const bool t1_is_integer_alike = internal::is_valid_index_type<T1>::value;
EIGEN_STATIC_ASSERT(t0_is_integer_alike &&
t1_is_integer_alike,
FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED)
resize(rows,cols);
}
@@ -773,9 +775,9 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
&& ((!internal::is_same<typename internal::traits<Derived>::XprKind,ArrayXpr>::value || Base::SizeAtCompileTime==Dynamic)),T>::type* = 0)
{
// NOTE MSVC 2008 complains if we directly put bool(NumTraits<T>::IsInteger) as the EIGEN_STATIC_ASSERT argument.
const bool is_integer = NumTraits<T>::IsInteger;
EIGEN_UNUSED_VARIABLE(is_integer);
EIGEN_STATIC_ASSERT(is_integer,
const bool is_integer_alike = internal::is_valid_index_type<T>::value;
EIGEN_UNUSED_VARIABLE(is_integer_alike);
EIGEN_STATIC_ASSERT(is_integer_alike,
FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED)
resize(size);
}
+27 -1
@@ -396,7 +396,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode>
// but easier on the compiler side
call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::assign_op<typename Dst::Scalar,Scalar>());
}
template<typename Dst>
static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
@@ -410,6 +410,32 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode>
// dst.noalias() -= lhs.lazyProduct(rhs);
call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::sub_assign_op<typename Dst::Scalar,Scalar>());
}
// Catch "dst {,+,-}= (s*A)*B" and evaluate it lazily by moving out the scalar factor:
// dst {,+,-}= s * (A.lazyProduct(B))
// This is a huge benefit for heap-allocated matrix types as it saves one costly allocation.
// For them, this strategy is also faster than simply bypassing the heap allocation through
// stack allocation.
// For fixed-size matrices this is less obvious: it is sometimes x2 faster, but sometimes x3 slower,
// and the behavior also depends a lot on the compiler... so let's be conservative and enable it for dynamic sizes only,
// that is, when coming from generic_product_impl<...,GemmProduct> in file GeneralMatrixMatrix.h
template<typename Dst, typename Scalar1, typename Scalar2, typename Plain1, typename Xpr2, typename Func>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void eval_dynamic(Dst& dst, const CwiseBinaryOp<internal::scalar_product_op<Scalar1,Scalar2>,
const CwiseNullaryOp<internal::scalar_constant_op<Scalar1>, Plain1>, Xpr2>& lhs, const Rhs& rhs, const Func &func)
{
call_assignment_no_alias(dst, lhs.lhs().functor().m_other * lhs.rhs().lazyProduct(rhs), func);
}
// Here, we always have LhsT==Lhs, but we need to make it a template type to make the above
// overload more specialized.
template<typename Dst, typename LhsT, typename Func>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void eval_dynamic(Dst& dst, const LhsT& lhs, const Rhs& rhs, const Func &func)
{
call_assignment_no_alias(dst, lhs.lazyProduct(rhs), func);
}
// template<typename Dst>
// static inline void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
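In practice the new eval_dynamic overload means an expression like the one below no longer materializes s*A into a temporary before calling the product kernel; a usage sketch, assuming dynamic-size matrices:
#include <Eigen/Dense>
void scaled_product(const Eigen::MatrixXd& A, const Eigen::MatrixXd& B,
                    Eigen::MatrixXd& C, double s) {
  // Caught by the specialized eval_dynamic above and evaluated as
  // C.noalias() += s * A.lazyProduct(B), saving one heap allocation.
  C.noalias() += (s * A) * B;
}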
+3 -2
@@ -28,12 +28,13 @@ struct traits<Ref<_PlainObjectType, _Options, _StrideType> >
template<typename Derived> struct match {
enum {
IsVectorAtCompileTime = PlainObjectType::IsVectorAtCompileTime || Derived::IsVectorAtCompileTime,
HasDirectAccess = internal::has_direct_access<Derived>::ret,
StorageOrderMatch = PlainObjectType::IsVectorAtCompileTime || Derived::IsVectorAtCompileTime || ((PlainObjectType::Flags&RowMajorBit)==(Derived::Flags&RowMajorBit)),
StorageOrderMatch = IsVectorAtCompileTime || ((PlainObjectType::Flags&RowMajorBit)==(Derived::Flags&RowMajorBit)),
InnerStrideMatch = int(StrideType::InnerStrideAtCompileTime)==int(Dynamic)
|| int(StrideType::InnerStrideAtCompileTime)==int(Derived::InnerStrideAtCompileTime)
|| (int(StrideType::InnerStrideAtCompileTime)==0 && int(Derived::InnerStrideAtCompileTime)==1),
OuterStrideMatch = Derived::IsVectorAtCompileTime
OuterStrideMatch = IsVectorAtCompileTime
|| int(StrideType::OuterStrideAtCompileTime)==int(Dynamic) || int(StrideType::OuterStrideAtCompileTime)==int(Derived::OuterStrideAtCompileTime),
// NOTE, this indirection of evaluator<Derived>::Alignment is needed
// to workaround a very strange bug in MSVC related to the instantiation
+3 -3
@@ -19,7 +19,7 @@ namespace internal {
template<typename LhsScalar, typename RhsScalar, typename Index, int Side, int Mode, bool Conjugate, int StorageOrder>
struct triangular_solve_vector;
template <typename Scalar, typename Index, int Side, int Mode, bool Conjugate, int TriStorageOrder, int OtherStorageOrder>
template <typename Scalar, typename Index, int Side, int Mode, bool Conjugate, int TriStorageOrder, int OtherStorageOrder, int OtherInnerStride>
struct triangular_solve_matrix;
// small helper struct extracting some traits on the underlying solver operation
@@ -98,8 +98,8 @@ struct triangular_solver_selector<Lhs,Rhs,Side,Mode,NoUnrolling,Dynamic>
BlockingType blocking(rhs.rows(), rhs.cols(), size, 1, false);
triangular_solve_matrix<Scalar,Index,Side,Mode,LhsProductTraits::NeedToConjugate,(int(Lhs::Flags) & RowMajorBit) ? RowMajor : ColMajor,
(Rhs::Flags&RowMajorBit) ? RowMajor : ColMajor>
::run(size, othersize, &actualLhs.coeffRef(0,0), actualLhs.outerStride(), &rhs.coeffRef(0,0), rhs.outerStride(), blocking);
(Rhs::Flags&RowMajorBit) ? RowMajor : ColMajor, Rhs::InnerStrideAtCompileTime>
::run(size, othersize, &actualLhs.coeffRef(0,0), actualLhs.outerStride(), &rhs.coeffRef(0,0), rhs.innerStride(), rhs.outerStride(), blocking);
}
};
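The new OtherInnerStride template parameter, together with the extra rhs.innerStride() argument to ::run, lets the matrix-level solver write into right-hand sides whose coefficients are not contiguous in memory. A usage sketch of the kind of destination this enables, assuming a Map with an inner stride:
#include <Eigen/Dense>
void solve_strided(const Eigen::MatrixXd& L, double* data, int n) {
  // Every other double in `data` forms the rhs/solution vector.
  Eigen::Map<Eigen::VectorXd, 0, Eigen::InnerStride<2> > b(data, n);
  L.triangularView<Eigen::Lower>().solveInPlace(b);
}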
+2
@@ -146,6 +146,8 @@ template<typename MatrixType> class TransposeImpl<MatrixType,Dense>
{
return derived().nestedExpression().coeffRef(index);
}
protected:
EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(TransposeImpl)
};
/** \returns an expression of the transpose of *this.
+5 -3
@@ -217,9 +217,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView
explicit inline TriangularView(MatrixType& matrix) : m_matrix(matrix)
{}
using Base::operator=;
TriangularView& operator=(const TriangularView &other)
{ return Base::operator=(other); }
EIGEN_INHERIT_ASSIGNMENT_OPERATORS(TriangularView)
/** \copydoc EigenBase::rows() */
EIGEN_DEVICE_FUNC
@@ -544,6 +542,10 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
template<typename ProductType>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE TriangularViewType& _assignProduct(const ProductType& prod, const Scalar& alpha, bool beta);
protected:
EIGEN_DEFAULT_COPY_CONSTRUCTOR(TriangularViewImpl)
EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(TriangularViewImpl)
};
/***************************************************************************
+34 -36
@@ -29,6 +29,7 @@ namespace internal {
#define _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(NAME, X) \
const Packet8d p8d_##NAME = _mm512_castsi512_pd(_mm512_set1_epi64(X))
// Natural logarithm
// Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2)
// and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can
@@ -47,6 +48,7 @@ plog<Packet16f>(const Packet16f& _x) {
// The smallest non denormalized float number.
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(min_norm_pos, 0x00800000);
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(minus_inf, 0xff800000);
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(pos_inf, 0x7f800000);
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(nan, 0x7fc00000);
// Polynomial coefficients.
@@ -64,11 +66,9 @@ plog<Packet16f>(const Packet16f& _x) {
_EIGEN_DECLARE_CONST_Packet16f(cephes_log_q2, 0.693359375f);
// invalid_mask is set to true when x is NaN
__mmask16 invalid_mask =
_mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_NGE_UQ);
__mmask16 iszero_mask =
_mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_EQ_UQ);
__mmask16 invalid_mask = _mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_NGE_UQ);
__mmask16 iszero_mask = _mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_EQ_OQ);
// Truncate input values to the minimum positive normal.
x = pmax(x, p16f_min_norm_pos);
@@ -118,11 +118,18 @@ plog<Packet16f>(const Packet16f& _x) {
x = padd(x, y);
x = padd(x, y2);
// Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF.
__mmask16 pos_inf_mask = _mm512_cmp_ps_mask(_x,p16f_pos_inf,_CMP_EQ_OQ);
// Filter out invalid inputs, i.e.:
// - negative arg will be NAN,
// - 0 will be -INF,
// - +INF will be +INF.
return _mm512_mask_blend_ps(iszero_mask,
_mm512_mask_blend_ps(invalid_mask, x, p16f_nan),
p16f_minus_inf);
_mm512_mask_blend_ps(invalid_mask,
_mm512_mask_blend_ps(pos_inf_mask,x,p16f_pos_inf),
p16f_nan),
p16f_minus_inf);
}
#endif
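The range reduction described in the comment above is the classic Cephes scheme: write x = 2^e * m with m in [sqrt(1/2), sqrt(2)), so that log(x) = e*log(2) + log(m) and the polynomial only has to cover a narrow interval. A scalar sketch of that reduction:
#include <cmath>
double log_via_reduction(double x) {
  int e;
  double m = std::frexp(x, &e);   // x = m * 2^e, with m in [0.5, 1)
  if (m < 0.7071067811865476) {   // move m into [sqrt(1/2), sqrt(2))
    m += m;                       // m *= 2 ...
    --e;                          // ... compensated by e -= 1
  }
  return e * 0.6931471805599453 + std::log(m);  // C*e + log(m), C = log(2)
}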
// Exponential function. Works by writing "x = m*log(2) + r" where
@@ -258,48 +265,39 @@ pexp<Packet8d>(const Packet8d& _x) {
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
psqrt<Packet16f>(const Packet16f& _x) {
_EIGEN_DECLARE_CONST_Packet16f(one_point_five, 1.5f);
_EIGEN_DECLARE_CONST_Packet16f(minus_half, -0.5f);
_EIGEN_DECLARE_CONST_Packet16f_FROM_INT(flt_min, 0x00800000);
Packet16f neg_half = pmul(_x, pset1<Packet16f>(-.5f));
__mmask16 denormal_mask = _mm512_kand(
_mm512_cmp_ps_mask(_x, pset1<Packet16f>((std::numeric_limits<float>::min)()),
_CMP_LT_OQ),
_mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_GE_OQ));
Packet16f neg_half = pmul(_x, p16f_minus_half);
// select only the inverse sqrt of positive normal inputs (denormals are
// flushed to zero and cause infs as well).
__mmask16 non_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_GE_OQ);
Packet16f x = _mm512_mask_blend_ps(non_zero_mask, _mm512_setzero_ps(), _mm512_rsqrt14_ps(_x));
Packet16f x = _mm512_rsqrt14_ps(_x);
// Do a single step of Newton's iteration.
x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five));
x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet16f>(1.5f)));
// Multiply the original _x by its reciprocal square root to extract the
// square root.
return pmul(_x, x);
// Flush results for denormals to zero.
return _mm512_mask_blend_ps(denormal_mask, pmul(_x,x), _mm512_setzero_ps());
}
template <>
EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
psqrt<Packet8d>(const Packet8d& _x) {
_EIGEN_DECLARE_CONST_Packet8d(one_point_five, 1.5);
_EIGEN_DECLARE_CONST_Packet8d(minus_half, -0.5);
_EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(dbl_min, 0x0010000000000000LL);
Packet8d neg_half = pmul(_x, pset1<Packet8d>(-.5));
__mmask16 denormal_mask = _mm512_kand(
_mm512_cmp_pd_mask(_x, pset1<Packet8d>((std::numeric_limits<double>::min)()),
_CMP_LT_OQ),
_mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_GE_OQ));
Packet8d neg_half = pmul(_x, p8d_minus_half);
Packet8d x = _mm512_rsqrt14_pd(_x);
// select only the inverse sqrt of positive normal inputs (denormals are
// flushed to zero and cause infs as well).
__mmask8 non_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_GE_OQ);
Packet8d x = _mm512_mask_blend_pd(non_zero_mask, _mm512_setzero_pd(), _mm512_rsqrt14_pd(_x));
// Do a first step of Newton's iteration.
x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
// Do a single step of Newton's iteration.
x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet8d>(1.5)));
// Do a second step of Newton's iteration.
x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet8d>(1.5)));
// Multiply the original _x by its reciprocal square root to extract the
// square root.
return pmul(_x, x);
return _mm512_mask_blend_pd(denormal_mask, pmul(_x,x), _mm512_setzero_pd());
}
#else
template <>
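Both rewritten psqrt kernels keep the same numerical recipe: start from the hardware rsqrt estimate and refine it with Newton's method, x <- x * (1.5 + neg_half * x * x) with neg_half = -0.5*a, then recover sqrt(a) = a * rsqrt(a); the new code additionally blends denormal (and zero) inputs to zero instead of masking them up front. A scalar sketch of one refinement step:
double sqrt_one_newton_step(double a, double rsqrt_estimate) {
  double x = rsqrt_estimate;           // coarse 1/sqrt(a)
  double neg_half = -0.5 * a;          // as in pmul(_x, pset1(-.5)) above
  x = x * (neg_half * (x * x) + 1.5);  // the pmadd-based Newton step
  return a * x;                        // sqrt(a) = a * (1/sqrt(a))
}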
+354 -365
@@ -19,10 +19,10 @@ namespace internal {
#endif
#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
#endif
#ifdef __FMA__
#ifdef EIGEN_VECTORIZE_FMA
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#endif
@@ -54,13 +54,14 @@ template<> struct packet_traits<float> : default_packet_traits
AlignedOnScalar = 1,
size = 16,
HasHalfPacket = 1,
#if EIGEN_GNUC_AT_LEAST(5, 3)
HasBlend = 0,
#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT)
#ifdef EIGEN_VECTORIZE_AVX512DQ
HasLog = 1,
#endif
HasExp = 1,
HasSqrt = 1,
HasRsqrt = 1,
HasSqrt = EIGEN_FAST_MATH,
HasRsqrt = EIGEN_FAST_MATH,
#endif
HasDiv = 1
};
@@ -74,8 +75,8 @@ template<> struct packet_traits<double> : default_packet_traits
AlignedOnScalar = 1,
size = 8,
HasHalfPacket = 1,
#if EIGEN_GNUC_AT_LEAST(5, 3)
HasSqrt = 1,
#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT)
HasSqrt = EIGEN_FAST_MATH,
HasRsqrt = EIGEN_FAST_MATH,
#endif
HasDiv = 1
@@ -98,6 +99,7 @@ template <>
struct unpacket_traits<Packet16f> {
typedef float type;
typedef Packet8f half;
typedef Packet16i integer_packet;
enum { size = 16, alignment=Aligned64 };
};
template <>
@@ -132,7 +134,7 @@ EIGEN_STRONG_INLINE Packet16f pload1<Packet16f>(const float* from) {
}
template <>
EIGEN_STRONG_INLINE Packet8d pload1<Packet8d>(const double* from) {
return _mm512_broadcastsd_pd(_mm_load_pd1(from));
return _mm512_set1_pd(*from);
}
template <>
@@ -158,6 +160,11 @@ EIGEN_STRONG_INLINE Packet8d padd<Packet8d>(const Packet8d& a,
const Packet8d& b) {
return _mm512_add_pd(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet16i padd<Packet16i>(const Packet16i& a,
const Packet16i& b) {
return _mm512_add_epi32(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet16f psub<Packet16f>(const Packet16f& a,
@@ -169,6 +176,11 @@ EIGEN_STRONG_INLINE Packet8d psub<Packet8d>(const Packet8d& a,
const Packet8d& b) {
return _mm512_sub_pd(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet16i psub<Packet16i>(const Packet16i& a,
const Packet16i& b) {
return _mm512_sub_epi32(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet16f pnegate(const Packet16f& a) {
@@ -202,6 +214,11 @@ EIGEN_STRONG_INLINE Packet8d pmul<Packet8d>(const Packet8d& a,
const Packet8d& b) {
return _mm512_mul_pd(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet16i pmul<Packet16i>(const Packet16i& a,
const Packet16i& b) {
return _mm512_mul_epi32(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet16f pdiv<Packet16f>(const Packet16f& a,
@@ -214,7 +231,7 @@ EIGEN_STRONG_INLINE Packet8d pdiv<Packet8d>(const Packet8d& a,
return _mm512_div_pd(a, b);
}
#ifdef __FMA__
#ifdef EIGEN_VECTORIZE_FMA
template <>
EIGEN_STRONG_INLINE Packet16f pmadd(const Packet16f& a, const Packet16f& b,
const Packet16f& c) {
@@ -230,23 +247,73 @@ EIGEN_STRONG_INLINE Packet8d pmadd(const Packet8d& a, const Packet8d& b,
template <>
EIGEN_STRONG_INLINE Packet16f pmin<Packet16f>(const Packet16f& a,
const Packet16f& b) {
return _mm512_min_ps(a, b);
// Arguments are reversed to match NaN propagation behavior of std::min.
return _mm512_min_ps(b, a);
}
template <>
EIGEN_STRONG_INLINE Packet8d pmin<Packet8d>(const Packet8d& a,
const Packet8d& b) {
return _mm512_min_pd(a, b);
// Arguments are reversed to match NaN propagation behavior of std::min.
return _mm512_min_pd(b, a);
}
template <>
EIGEN_STRONG_INLINE Packet16f pmax<Packet16f>(const Packet16f& a,
const Packet16f& b) {
return _mm512_max_ps(a, b);
// Arguments are reversed to match NaN propagation behavior of std::max.
return _mm512_max_ps(b, a);
}
template <>
EIGEN_STRONG_INLINE Packet8d pmax<Packet8d>(const Packet8d& a,
const Packet8d& b) {
return _mm512_max_pd(a, b);
// Arguments are reversed to match NaN propagation behavior of std::max.
return _mm512_max_pd(b, a);
}
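Reversing the operands exploits the x86 MINPS/MAXPS semantics: when either input is NaN the instruction returns its second source operand, so pmin(a,b) lowered as min_ps(b,a) yields a, exactly like std::min. A scalar statement of the contract these kernels now honor:
// What pmin(a, b) guarantees, matching std::min: the comparison is false
// on NaN, so `a` is returned whenever either operand is NaN.
float pmin_ref(float a, float b) { return (b < a) ? b : a; }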
#ifdef EIGEN_VECTORIZE_AVX512DQ
template<int I_> EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) { return _mm512_extractf32x8_ps(x,I_); }
template<int I_> EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x) { return _mm512_extractf64x2_pd(x,I_); }
EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) { return _mm512_insertf32x8(_mm512_castps256_ps512(a),b,1); }
#else
// AVX512F does not define _mm512_extractf32x8_ps to extract __m256 from __m512
template<int I_> EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) {
return _mm256_castsi256_ps(_mm512_extracti64x4_epi64( _mm512_castps_si512(x),I_));
}
// AVX512F does not define _mm512_extractf64x2_pd to extract __m128 from __m512
template<int I_> EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x) {
return _mm_castsi128_pd(_mm512_extracti32x4_epi32( _mm512_castpd_si512(x),I_));
}
EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) {
return _mm512_castsi512_ps(_mm512_inserti64x4(_mm512_castsi256_si512(_mm256_castps_si256(a)),
_mm256_castps_si256(b),1));
}
#endif
// Helper function for the bit packing snippet of the low precision comparison.
// It packs the flags from 32x16 to 16x16.
EIGEN_STRONG_INLINE __m256i Pack32To16(Packet16f rf) {
// Split data into small pieces and handle with AVX instructions
// to guarantee internal order of vector.
// Operation:
// dst[15:0] := Saturate16(rf[31:0])
// dst[31:16] := Saturate16(rf[63:32])
// ...
// dst[255:240] := Saturate16(rf[255:224])
__m256i lo = _mm256_castps_si256(extract256<0>(rf));
__m256i hi = _mm256_castps_si256(extract256<1>(rf));
__m128i result_lo = _mm_packs_epi32(_mm256_extractf128_si256(lo, 0),
_mm256_extractf128_si256(lo, 1));
__m128i result_hi = _mm_packs_epi32(_mm256_extractf128_si256(hi, 0),
_mm256_extractf128_si256(hi, 1));
return _mm256_insertf128_si256(_mm256_castsi128_si256(result_lo), result_hi, 1);
}
template <>
EIGEN_STRONG_INLINE Packet16i pand<Packet16i>(const Packet16i& a,
const Packet16i& b) {
return _mm512_and_si512(a,b);
}
template <>
@@ -255,24 +322,7 @@ EIGEN_STRONG_INLINE Packet16f pand<Packet16f>(const Packet16f& a,
#ifdef EIGEN_VECTORIZE_AVX512DQ
return _mm512_and_ps(a, b);
#else
Packet16f res = _mm512_undefined_ps();
Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0);
Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0);
res = _mm512_insertf32x4(res, _mm_and_ps(lane0_a, lane0_b), 0);
Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1);
Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1);
res = _mm512_insertf32x4(res, _mm_and_ps(lane1_a, lane1_b), 1);
Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2);
Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2);
res = _mm512_insertf32x4(res, _mm_and_ps(lane2_a, lane2_b), 2);
Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3);
Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3);
res = _mm512_insertf32x4(res, _mm_and_ps(lane3_a, lane3_b), 3);
return res;
return _mm512_castsi512_ps(pand(_mm512_castps_si512(a),_mm512_castps_si512(b)));
#endif
}
template <>
@@ -288,35 +338,21 @@ EIGEN_STRONG_INLINE Packet8d pand<Packet8d>(const Packet8d& a,
Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1);
Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1);
res = _mm512_insertf64x4(res, _mm256_and_pd(lane1_a, lane1_b), 1);
return res;
return _mm512_insertf64x4(res, _mm256_and_pd(lane1_a, lane1_b), 1);
#endif
}
template <>
EIGEN_STRONG_INLINE Packet16f por<Packet16f>(const Packet16f& a,
const Packet16f& b) {
EIGEN_STRONG_INLINE Packet16i por<Packet16i>(const Packet16i& a, const Packet16i& b) {
return _mm512_or_si512(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet16f por<Packet16f>(const Packet16f& a, const Packet16f& b) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
return _mm512_or_ps(a, b);
#else
Packet16f res = _mm512_undefined_ps();
Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0);
Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0);
res = _mm512_insertf32x4(res, _mm_or_ps(lane0_a, lane0_b), 0);
Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1);
Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1);
res = _mm512_insertf32x4(res, _mm_or_ps(lane1_a, lane1_b), 1);
Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2);
Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2);
res = _mm512_insertf32x4(res, _mm_or_ps(lane2_a, lane2_b), 2);
Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3);
Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3);
res = _mm512_insertf32x4(res, _mm_or_ps(lane3_a, lane3_b), 3);
return res;
return _mm512_castsi512_ps(por(_mm512_castps_si512(a),_mm512_castps_si512(b)));
#endif
}
@@ -326,109 +362,67 @@ EIGEN_STRONG_INLINE Packet8d por<Packet8d>(const Packet8d& a,
#ifdef EIGEN_VECTORIZE_AVX512DQ
return _mm512_or_pd(a, b);
#else
Packet8d res = _mm512_undefined_pd();
Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0);
Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0);
res = _mm512_insertf64x4(res, _mm256_or_pd(lane0_a, lane0_b), 0);
Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1);
Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1);
res = _mm512_insertf64x4(res, _mm256_or_pd(lane1_a, lane1_b), 1);
return res;
return _mm512_castsi512_pd(por(_mm512_castpd_si512(a),_mm512_castpd_si512(b)));
#endif
}
template <>
EIGEN_STRONG_INLINE Packet16f pxor<Packet16f>(const Packet16f& a,
const Packet16f& b) {
EIGEN_STRONG_INLINE Packet16i pxor<Packet16i>(const Packet16i& a, const Packet16i& b) {
return _mm512_xor_si512(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet16f pxor<Packet16f>(const Packet16f& a, const Packet16f& b) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
return _mm512_xor_ps(a, b);
#else
Packet16f res = _mm512_undefined_ps();
Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0);
Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0);
res = _mm512_insertf32x4(res, _mm_xor_ps(lane0_a, lane0_b), 0);
Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1);
Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1);
res = _mm512_insertf32x4(res, _mm_xor_ps(lane1_a, lane1_b), 1);
Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2);
Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2);
res = _mm512_insertf32x4(res, _mm_xor_ps(lane2_a, lane2_b), 2);
Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3);
Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3);
res = _mm512_insertf32x4(res, _mm_xor_ps(lane3_a, lane3_b), 3);
return res;
return _mm512_castsi512_ps(pxor(_mm512_castps_si512(a),_mm512_castps_si512(b)));
#endif
}
template <>
EIGEN_STRONG_INLINE Packet8d pxor<Packet8d>(const Packet8d& a,
const Packet8d& b) {
EIGEN_STRONG_INLINE Packet8d pxor<Packet8d>(const Packet8d& a, const Packet8d& b) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
return _mm512_xor_pd(a, b);
#else
Packet8d res = _mm512_undefined_pd();
Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0);
Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0);
res = _mm512_insertf64x4(res, _mm256_xor_pd(lane0_a, lane0_b), 0);
Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1);
Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1);
res = _mm512_insertf64x4(res, _mm256_xor_pd(lane1_a, lane1_b), 1);
return res;
return _mm512_castsi512_pd(pxor(_mm512_castpd_si512(a),_mm512_castpd_si512(b)));
#endif
}
template <>
EIGEN_STRONG_INLINE Packet16f pandnot<Packet16f>(const Packet16f& a,
const Packet16f& b) {
EIGEN_STRONG_INLINE Packet16i pandnot<Packet16i>(const Packet16i& a, const Packet16i& b) {
return _mm512_andnot_si512(b, a);
}
template <>
EIGEN_STRONG_INLINE Packet16f pandnot<Packet16f>(const Packet16f& a, const Packet16f& b) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
return _mm512_andnot_ps(a, b);
return _mm512_andnot_ps(b, a);
#else
Packet16f res = _mm512_undefined_ps();
Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0);
Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0);
res = _mm512_insertf32x4(res, _mm_andnot_ps(lane0_a, lane0_b), 0);
Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1);
Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1);
res = _mm512_insertf32x4(res, _mm_andnot_ps(lane1_a, lane1_b), 1);
Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2);
Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2);
res = _mm512_insertf32x4(res, _mm_andnot_ps(lane2_a, lane2_b), 2);
Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3);
Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3);
res = _mm512_insertf32x4(res, _mm_andnot_ps(lane3_a, lane3_b), 3);
return res;
return _mm512_castsi512_ps(pandnot(_mm512_castps_si512(a),_mm512_castps_si512(b)));
#endif
}
template <>
EIGEN_STRONG_INLINE Packet8d pandnot<Packet8d>(const Packet8d& a,
const Packet8d& b) {
EIGEN_STRONG_INLINE Packet8d pandnot<Packet8d>(const Packet8d& a,const Packet8d& b) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
return _mm512_andnot_pd(a, b);
return _mm512_andnot_pd(b, a);
#else
Packet8d res = _mm512_undefined_pd();
Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0);
Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0);
res = _mm512_insertf64x4(res, _mm256_andnot_pd(lane0_a, lane0_b), 0);
Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1);
Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1);
res = _mm512_insertf64x4(res, _mm256_andnot_pd(lane1_a, lane1_b), 1);
return res;
return _mm512_castsi512_pd(pandnot(_mm512_castpd_si512(a),_mm512_castpd_si512(b)));
#endif
}
template<int N> EIGEN_STRONG_INLINE Packet16i parithmetic_shift_right(Packet16i a) {
return _mm512_srai_epi32(a, N);
}
template<int N> EIGEN_STRONG_INLINE Packet16i plogical_shift_right(Packet16i a) {
return _mm512_srli_epi32(a, N);
}
template<int N> EIGEN_STRONG_INLINE Packet16i plogical_shift_left(Packet16i a) {
return _mm512_slli_epi32(a, N);
}
template <>
EIGEN_STRONG_INLINE Packet16f pload<Packet16f>(const float* from) {
EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_ps(from);
@@ -461,75 +455,55 @@ EIGEN_STRONG_INLINE Packet16i ploadu<Packet16i>(const int* from) {
// {a0, a0, a1, a1, a2, a2, a3, a3, a4, a4, a5, a5, a6, a6, a7, a7}
template <>
EIGEN_STRONG_INLINE Packet16f ploaddup<Packet16f>(const float* from) {
Packet8f lane0 = _mm256_broadcast_ps((const __m128*)(const void*)from);
// mimic an "inplace" permutation of the lower 128bits using a blend
lane0 = _mm256_blend_ps(
lane0, _mm256_castps128_ps256(_mm_permute_ps(
_mm256_castps256_ps128(lane0), _MM_SHUFFLE(1, 0, 1, 0))),
15);
// then we can perform a consistent permutation on the global register to get
// everything in shape:
lane0 = _mm256_permute_ps(lane0, _MM_SHUFFLE(3, 3, 2, 2));
Packet8f lane1 = _mm256_broadcast_ps((const __m128*)(const void*)(from + 4));
// mimic an "inplace" permutation of the lower 128bits using a blend
lane1 = _mm256_blend_ps(
lane1, _mm256_castps128_ps256(_mm_permute_ps(
_mm256_castps256_ps128(lane1), _MM_SHUFFLE(1, 0, 1, 0))),
15);
// then we can perform a consistent permutation on the global register to get
// everything in shape:
lane1 = _mm256_permute_ps(lane1, _MM_SHUFFLE(3, 3, 2, 2));
// an unaligned load is required here as there is no requirement
// on the alignment of input pointer 'from'
__m256i low_half = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
__m512 even_elements = _mm512_castsi512_ps(_mm512_cvtepu32_epi64(low_half));
__m512 pairs = _mm512_permute_ps(even_elements, _MM_SHUFFLE(2, 2, 0, 0));
return pairs;
}
#ifdef EIGEN_VECTORIZE_AVX512DQ
Packet16f res = _mm512_undefined_ps();
return _mm512_insertf32x8(res, lane0, 0);
return _mm512_insertf32x8(res, lane1, 1);
return res;
#else
Packet16f res = _mm512_undefined_ps();
res = _mm512_insertf32x4(res, _mm256_extractf128_ps(lane0, 0), 0);
res = _mm512_insertf32x4(res, _mm256_extractf128_ps(lane0, 1), 1);
res = _mm512_insertf32x4(res, _mm256_extractf128_ps(lane1, 0), 2);
res = _mm512_insertf32x4(res, _mm256_extractf128_ps(lane1, 1), 3);
return res;
#endif
}
// FIXME: this does not look optimal, better load a Packet4d and shuffle...
// Loads 4 doubles from memory and returns the packet {a0, a0, a1, a1, a2, a2,
// a3, a3}
template <>
EIGEN_STRONG_INLINE Packet8d ploaddup<Packet8d>(const double* from) {
Packet4d lane0 = _mm256_broadcast_pd((const __m128d*)(const void*)from);
lane0 = _mm256_permute_pd(lane0, 3 << 2);
Packet4d lane1 = _mm256_broadcast_pd((const __m128d*)(const void*)(from + 2));
lane1 = _mm256_permute_pd(lane1, 3 << 2);
Packet8d res = _mm512_undefined_pd();
res = _mm512_insertf64x4(res, lane0, 0);
return _mm512_insertf64x4(res, lane1, 1);
__m512d x = _mm512_setzero_pd();
x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[0]), 0);
x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[1]), 1);
x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[2]), 2);
x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[3]), 3);
return x;
}
#else
template <>
EIGEN_STRONG_INLINE Packet8d ploaddup<Packet8d>(const double* from) {
__m512d x = _mm512_setzero_pd();
x = _mm512_mask_broadcastsd_pd(x, 0x3<<0, _mm_load_sd(from+0));
x = _mm512_mask_broadcastsd_pd(x, 0x3<<2, _mm_load_sd(from+1));
x = _mm512_mask_broadcastsd_pd(x, 0x3<<4, _mm_load_sd(from+2));
x = _mm512_mask_broadcastsd_pd(x, 0x3<<6, _mm_load_sd(from+3));
return x;
}
#endif
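Both ploaddup<Packet8d> variants implement the same contract: each of the four source doubles is duplicated into a pair of adjacent lanes. A scalar reference of the expected result:
// Reference semantics: out = {a0, a0, a1, a1, a2, a2, a3, a3}
void ploaddup_ref(const double* from, double out[8]) {
  for (int i = 0; i < 4; ++i) {
    out[2*i]     = from[i];
    out[2*i + 1] = from[i];
  }
}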
// Loads 4 floats from memory and returns the packet
// {a0, a0, a0, a0, a1, a1, a1, a1, a2, a2, a2, a2, a3, a3, a3, a3}
template <>
EIGEN_STRONG_INLINE Packet16f ploadquad<Packet16f>(const float* from) {
Packet16f tmp = _mm512_undefined_ps();
tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from), 0);
tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 1), 1);
tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 2), 2);
tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 3), 3);
return tmp;
Packet16f tmp = _mm512_castps128_ps512(ploadu<Packet4f>(from));
const Packet16i scatter_mask = _mm512_set_epi32(3,3,3,3, 2,2,2,2, 1,1,1,1, 0,0,0,0);
return _mm512_permutexvar_ps(scatter_mask, tmp);
}
// Loads 2 doubles from memory and returns the packet
// {a0, a0, a0, a0, a1, a1, a1, a1}
template <>
EIGEN_STRONG_INLINE Packet8d ploadquad<Packet8d>(const double* from) {
Packet8d tmp = _mm512_undefined_pd();
Packet2d tmp0 = _mm_load_pd1(from);
Packet2d tmp1 = _mm_load_pd1(from + 1);
Packet4d lane0 = _mm256_broadcastsd_pd(tmp0);
Packet4d lane1 = _mm256_broadcastsd_pd(tmp1);
__m256d lane0 = _mm256_set1_pd(*from);
__m256d lane1 = _mm256_set1_pd(*(from+1));
__m512d tmp = _mm512_undefined_pd();
tmp = _mm512_insertf64x4(tmp, lane0, 0);
return _mm512_insertf64x4(tmp, lane1, 1);
}
@@ -565,7 +539,7 @@ EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet16i& from) {
template <>
EIGEN_DEVICE_FUNC inline Packet16f pgather<float, Packet16f>(const float* from,
Index stride) {
Packet16i stride_vector = _mm512_set1_epi32(stride);
Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
Packet16i stride_multiplier =
_mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
@@ -575,7 +549,7 @@ EIGEN_DEVICE_FUNC inline Packet16f pgather<float, Packet16f>(const float* from,
template <>
EIGEN_DEVICE_FUNC inline Packet8d pgather<double, Packet8d>(const double* from,
Index stride) {
Packet8i stride_vector = _mm256_set1_epi32(stride);
Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
@@ -586,7 +560,7 @@ template <>
EIGEN_DEVICE_FUNC inline void pscatter<float, Packet16f>(float* to,
const Packet16f& from,
Index stride) {
Packet16i stride_vector = _mm512_set1_epi32(stride);
Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
Packet16i stride_multiplier =
_mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
@@ -596,7 +570,7 @@ template <>
EIGEN_DEVICE_FUNC inline void pscatter<double, Packet8d>(double* to,
const Packet8d& from,
Index stride) {
Packet8i stride_vector = _mm256_set1_epi32(stride);
Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
_mm512_i32scatter_pd(to, indices, from, 8);
@@ -660,8 +634,8 @@ EIGEN_STRONG_INLINE Packet8d pabs(const Packet8d& a) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
// AVX512F does not define _mm512_extractf32x8_ps to extract __m256 from __m512
#define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT) \
__m256 OUTPUT##_0 = _mm512_extractf32x8_ps(INPUT, 0) __m256 OUTPUT##_1 = \
_mm512_extractf32x8_ps(INPUT, 1)
__m256 OUTPUT##_0 = _mm512_extractf32x8_ps(INPUT, 0); \
__m256 OUTPUT##_1 = _mm512_extractf32x8_ps(INPUT, 1)
#else
#define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT) \
__m256 OUTPUT##_0 = _mm256_insertf128_ps( \
@@ -674,17 +648,136 @@ EIGEN_STRONG_INLINE Packet8d pabs(const Packet8d& a) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
#define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \
OUTPUT = _mm512_insertf32x8(OUTPUT, INPUTA, 0); \
OUTPUT = _mm512_insertf32x8(OUTPUT, INPUTB, 1);
OUTPUT = _mm512_insertf32x8(_mm512_castps256_ps512(INPUTA), INPUTB, 1);
#else
#define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \
OUTPUT = _mm512_undefined_ps(); \
OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 0), 0); \
OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 1), 1); \
OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 0), 2); \
OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 1), 3);
#endif
template<> EIGEN_STRONG_INLINE Packet16f preduxp<Packet16f>(const Packet16f*
vecs)
template <>
EIGEN_STRONG_INLINE float predux<Packet16f>(const Packet16f& a) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
__m256 lane0 = _mm512_extractf32x8_ps(a, 0);
__m256 lane1 = _mm512_extractf32x8_ps(a, 1);
Packet8f x = _mm256_add_ps(lane0, lane1);
return predux<Packet8f>(x);
#else
__m128 lane0 = _mm512_extractf32x4_ps(a, 0);
__m128 lane1 = _mm512_extractf32x4_ps(a, 1);
__m128 lane2 = _mm512_extractf32x4_ps(a, 2);
__m128 lane3 = _mm512_extractf32x4_ps(a, 3);
__m128 sum = _mm_add_ps(_mm_add_ps(lane0, lane1), _mm_add_ps(lane2, lane3));
sum = _mm_hadd_ps(sum, sum);
sum = _mm_hadd_ps(sum, _mm_permute_ps(sum, 1));
return _mm_cvtss_f32(sum);
#endif
}
template <>
EIGEN_STRONG_INLINE double predux<Packet8d>(const Packet8d& a) {
__m256d lane0 = _mm512_extractf64x4_pd(a, 0);
__m256d lane1 = _mm512_extractf64x4_pd(a, 1);
__m256d sum = _mm256_add_pd(lane0, lane1);
__m256d tmp0 = _mm256_hadd_pd(sum, _mm256_permute2f128_pd(sum, sum, 1));
return _mm_cvtsd_f64(_mm256_castpd256_pd128(_mm256_hadd_pd(tmp0, tmp0)));
}
template <>
EIGEN_STRONG_INLINE Packet8f predux_downto4<Packet16f>(const Packet16f& a) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
return padd(lane0, lane1);
#else
Packet4f lane0 = _mm512_extractf32x4_ps(a, 0);
Packet4f lane1 = _mm512_extractf32x4_ps(a, 1);
Packet4f lane2 = _mm512_extractf32x4_ps(a, 2);
Packet4f lane3 = _mm512_extractf32x4_ps(a, 3);
Packet4f sum0 = padd(lane0, lane2);
Packet4f sum1 = padd(lane1, lane3);
return _mm256_insertf128_ps(_mm256_castps128_ps256(sum0), sum1, 1);
#endif
}
template <>
EIGEN_STRONG_INLINE Packet4d predux_downto4<Packet8d>(const Packet8d& a) {
Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
Packet4d res = padd(lane0, lane1);
return res;
}
template <>
EIGEN_STRONG_INLINE float predux_mul<Packet16f>(const Packet16f& a) {
//#ifdef EIGEN_VECTORIZE_AVX512DQ
#if 0
Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
Packet8f res = pmul(lane0, lane1);
res = pmul(res, _mm256_permute2f128_ps(res, res, 1));
res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
#else
__m128 lane0 = _mm512_extractf32x4_ps(a, 0);
__m128 lane1 = _mm512_extractf32x4_ps(a, 1);
__m128 lane2 = _mm512_extractf32x4_ps(a, 2);
__m128 lane3 = _mm512_extractf32x4_ps(a, 3);
__m128 res = pmul(pmul(lane0, lane1), pmul(lane2, lane3));
res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
#endif
}
template <>
EIGEN_STRONG_INLINE double predux_mul<Packet8d>(const Packet8d& a) {
__m256d lane0 = _mm512_extractf64x4_pd(a, 0);
__m256d lane1 = _mm512_extractf64x4_pd(a, 1);
__m256d res = pmul(lane0, lane1);
res = pmul(res, _mm256_permute2f128_pd(res, res, 1));
return pfirst(pmul(res, _mm256_shuffle_pd(res, res, 1)));
}
template <>
EIGEN_STRONG_INLINE float predux_min<Packet16f>(const Packet16f& a) {
__m128 lane0 = _mm512_extractf32x4_ps(a, 0);
__m128 lane1 = _mm512_extractf32x4_ps(a, 1);
__m128 lane2 = _mm512_extractf32x4_ps(a, 2);
__m128 lane3 = _mm512_extractf32x4_ps(a, 3);
__m128 res = _mm_min_ps(_mm_min_ps(lane0, lane1), _mm_min_ps(lane2, lane3));
res = _mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
return pfirst(_mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
}
template <>
EIGEN_STRONG_INLINE double predux_min<Packet8d>(const Packet8d& a) {
__m256d lane0 = _mm512_extractf64x4_pd(a, 0);
__m256d lane1 = _mm512_extractf64x4_pd(a, 1);
__m256d res = _mm256_min_pd(lane0, lane1);
res = _mm256_min_pd(res, _mm256_permute2f128_pd(res, res, 1));
return pfirst(_mm256_min_pd(res, _mm256_shuffle_pd(res, res, 1)));
}
template <>
EIGEN_STRONG_INLINE float predux_max<Packet16f>(const Packet16f& a) {
__m128 lane0 = _mm512_extractf32x4_ps(a, 0);
__m128 lane1 = _mm512_extractf32x4_ps(a, 1);
__m128 lane2 = _mm512_extractf32x4_ps(a, 2);
__m128 lane3 = _mm512_extractf32x4_ps(a, 3);
__m128 res = _mm_max_ps(_mm_max_ps(lane0, lane1), _mm_max_ps(lane2, lane3));
res = _mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
return pfirst(_mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
}
template <>
EIGEN_STRONG_INLINE double predux_max<Packet8d>(const Packet8d& a) {
__m256d lane0 = _mm512_extractf64x4_pd(a, 0);
__m256d lane1 = _mm512_extractf64x4_pd(a, 1);
__m256d res = _mm256_max_pd(lane0, lane1);
res = _mm256_max_pd(res, _mm256_permute2f128_pd(res, res, 1));
return pfirst(_mm256_max_pd(res, _mm256_shuffle_pd(res, res, 1)));
}
template<> EIGEN_STRONG_INLINE Packet16f preduxp<Packet16f>(const Packet16f* vecs)
{
EIGEN_EXTRACT_8f_FROM_16f(vecs[0], vecs0);
EIGEN_EXTRACT_8f_FROM_16f(vecs[1], vecs1);
@@ -873,174 +966,7 @@ template<> EIGEN_STRONG_INLINE Packet8d preduxp<Packet8d>(const Packet8d* vecs)
return _mm512_insertf64x4(final_output, final_1, 1);
}
template <>
EIGEN_STRONG_INLINE float predux<Packet16f>(const Packet16f& a) {
//#ifdef EIGEN_VECTORIZE_AVX512DQ
#if 0
Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
Packet8f sum = padd(lane0, lane1);
Packet8f tmp0 = _mm256_hadd_ps(sum, _mm256_permute2f128_ps(a, a, 1));
tmp0 = _mm256_hadd_ps(tmp0, tmp0);
return pfirst(_mm256_hadd_ps(tmp0, tmp0));
#else
Packet4f lane0 = _mm512_extractf32x4_ps(a, 0);
Packet4f lane1 = _mm512_extractf32x4_ps(a, 1);
Packet4f lane2 = _mm512_extractf32x4_ps(a, 2);
Packet4f lane3 = _mm512_extractf32x4_ps(a, 3);
Packet4f sum = padd(padd(lane0, lane1), padd(lane2, lane3));
sum = _mm_hadd_ps(sum, sum);
sum = _mm_hadd_ps(sum, _mm_permute_ps(sum, 1));
return pfirst(sum);
#endif
}
template <>
EIGEN_STRONG_INLINE double predux<Packet8d>(const Packet8d& a) {
Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
Packet4d sum = padd(lane0, lane1);
Packet4d tmp0 = _mm256_hadd_pd(sum, _mm256_permute2f128_pd(sum, sum, 1));
return pfirst(_mm256_hadd_pd(tmp0, tmp0));
}
template <>
EIGEN_STRONG_INLINE Packet8f predux_downto4<Packet16f>(const Packet16f& a) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
return padd(lane0, lane1);
#else
Packet4f lane0 = _mm512_extractf32x4_ps(a, 0);
Packet4f lane1 = _mm512_extractf32x4_ps(a, 1);
Packet4f lane2 = _mm512_extractf32x4_ps(a, 2);
Packet4f lane3 = _mm512_extractf32x4_ps(a, 3);
Packet4f sum0 = padd(lane0, lane2);
Packet4f sum1 = padd(lane1, lane3);
return _mm256_insertf128_ps(_mm256_castps128_ps256(sum0), sum1, 1);
#endif
}
template <>
EIGEN_STRONG_INLINE Packet4d predux_downto4<Packet8d>(const Packet8d& a) {
Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
Packet4d res = padd(lane0, lane1);
return res;
}
template <>
EIGEN_STRONG_INLINE float predux_mul<Packet16f>(const Packet16f& a) {
//#ifdef EIGEN_VECTORIZE_AVX512DQ
#if 0
Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
Packet8f res = pmul(lane0, lane1);
res = pmul(res, _mm256_permute2f128_ps(res, res, 1));
res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
#else
Packet4f lane0 = _mm512_extractf32x4_ps(a, 0);
Packet4f lane1 = _mm512_extractf32x4_ps(a, 1);
Packet4f lane2 = _mm512_extractf32x4_ps(a, 2);
Packet4f lane3 = _mm512_extractf32x4_ps(a, 3);
Packet4f res = pmul(pmul(lane0, lane1), pmul(lane2, lane3));
res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
#endif
}
template <>
EIGEN_STRONG_INLINE double predux_mul<Packet8d>(const Packet8d& a) {
Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
Packet4d res = pmul(lane0, lane1);
res = pmul(res, _mm256_permute2f128_pd(res, res, 1));
return pfirst(pmul(res, _mm256_shuffle_pd(res, res, 1)));
}
template <>
EIGEN_STRONG_INLINE float predux_min<Packet16f>(const Packet16f& a) {
Packet4f lane0 = _mm512_extractf32x4_ps(a, 0);
Packet4f lane1 = _mm512_extractf32x4_ps(a, 1);
Packet4f lane2 = _mm512_extractf32x4_ps(a, 2);
Packet4f lane3 = _mm512_extractf32x4_ps(a, 3);
Packet4f res = _mm_min_ps(_mm_min_ps(lane0, lane1), _mm_min_ps(lane2, lane3));
res = _mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
return pfirst(_mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
}
template <>
EIGEN_STRONG_INLINE double predux_min<Packet8d>(const Packet8d& a) {
Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
Packet4d res = _mm256_min_pd(lane0, lane1);
res = _mm256_min_pd(res, _mm256_permute2f128_pd(res, res, 1));
return pfirst(_mm256_min_pd(res, _mm256_shuffle_pd(res, res, 1)));
}
template <>
EIGEN_STRONG_INLINE float predux_max<Packet16f>(const Packet16f& a) {
Packet4f lane0 = _mm512_extractf32x4_ps(a, 0);
Packet4f lane1 = _mm512_extractf32x4_ps(a, 1);
Packet4f lane2 = _mm512_extractf32x4_ps(a, 2);
Packet4f lane3 = _mm512_extractf32x4_ps(a, 3);
Packet4f res = _mm_max_ps(_mm_max_ps(lane0, lane1), _mm_max_ps(lane2, lane3));
res = _mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2)));
return pfirst(_mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1))));
}
template <>
EIGEN_STRONG_INLINE double predux_max<Packet8d>(const Packet8d& a) {
Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
Packet4d res = _mm256_max_pd(lane0, lane1);
res = _mm256_max_pd(res, _mm256_permute2f128_pd(res, res, 1));
return pfirst(_mm256_max_pd(res, _mm256_shuffle_pd(res, res, 1)));
}
template <int Offset>
struct palign_impl<Offset, Packet16f> {
static EIGEN_STRONG_INLINE void run(Packet16f& first,
const Packet16f& second) {
if (Offset != 0) {
__m512i first_idx = _mm512_set_epi32(
Offset + 15, Offset + 14, Offset + 13, Offset + 12, Offset + 11,
Offset + 10, Offset + 9, Offset + 8, Offset + 7, Offset + 6,
Offset + 5, Offset + 4, Offset + 3, Offset + 2, Offset + 1, Offset);
__m512i second_idx =
_mm512_set_epi32(Offset - 1, Offset - 2, Offset - 3, Offset - 4,
Offset - 5, Offset - 6, Offset - 7, Offset - 8,
Offset - 9, Offset - 10, Offset - 11, Offset - 12,
Offset - 13, Offset - 14, Offset - 15, Offset - 16);
unsigned short mask = 0xFFFF;
mask <<= (16 - Offset);
first = _mm512_permutexvar_ps(first_idx, first);
Packet16f tmp = _mm512_permutexvar_ps(second_idx, second);
first = _mm512_mask_blend_ps(mask, first, tmp);
}
}
};
template <int Offset>
struct palign_impl<Offset, Packet8d> {
static EIGEN_STRONG_INLINE void run(Packet8d& first, const Packet8d& second) {
if (Offset != 0) {
__m512i first_idx = _mm512_set_epi32(
0, Offset + 7, 0, Offset + 6, 0, Offset + 5, 0, Offset + 4, 0,
Offset + 3, 0, Offset + 2, 0, Offset + 1, 0, Offset);
__m512i second_idx = _mm512_set_epi32(
0, Offset - 1, 0, Offset - 2, 0, Offset - 3, 0, Offset - 4, 0,
Offset - 5, 0, Offset - 6, 0, Offset - 7, 0, Offset - 8);
unsigned char mask = 0xFF;
mask <<= (8 - Offset);
first = _mm512_permutexvar_pd(first_idx, first);
Packet8d tmp = _mm512_permutexvar_pd(second_idx, second);
first = _mm512_mask_blend_pd(mask, first, tmp);
}
}
};
#define PACK_OUTPUT(OUTPUT, INPUT, INDEX, STRIDE) \
@@ -1302,13 +1228,76 @@ EIGEN_STRONG_INLINE Packet16f pblend(const Selector<16>& /*ifPacket*/,
return Packet16f();
}
template <>
EIGEN_STRONG_INLINE Packet8d pblend(const Selector<8>& /*ifPacket*/,
const Packet8d& /*thenPacket*/,
const Packet8d& /*elsePacket*/) {
assert(false && "To be implemented");
return Packet8d();
EIGEN_STRONG_INLINE Packet8d pblend(const Selector<8>& ifPacket,
const Packet8d& thenPacket,
const Packet8d& elsePacket) {
__mmask8 m = (ifPacket.select[0] )
| (ifPacket.select[1]<<1)
| (ifPacket.select[2]<<2)
| (ifPacket.select[3]<<3)
| (ifPacket.select[4]<<4)
| (ifPacket.select[5]<<5)
| (ifPacket.select[6]<<6)
| (ifPacket.select[7]<<7);
return _mm512_mask_blend_pd(m, elsePacket, thenPacket);
}
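The new pblend packs the eight selector flags into an __mmask8 (bit i set iff select[i] is true) and lets _mm512_mask_blend_pd pick the thenPacket lane wherever the bit is set. A scalar reference of the semantics:
// out[i] = select[i] ? thenPacket[i] : elsePacket[i]
void pblend_ref(const bool select[8], const double* thenP,
                const double* elseP, double out[8]) {
  for (int i = 0; i < 8; ++i)
    out[i] = select[i] ? thenP[i] : elseP[i];
}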
template<> EIGEN_STRONG_INLINE Packet16i pcast<Packet16f, Packet16i>(const Packet16f& a) {
return _mm512_cvttps_epi32(a);
}
template<> EIGEN_STRONG_INLINE Packet16f pcast<Packet16i, Packet16f>(const Packet16i& a) {
return _mm512_cvtepi32_ps(a);
}
template <int Offset>
struct palign_impl<Offset, Packet16f> {
static EIGEN_STRONG_INLINE void run(Packet16f& first,
const Packet16f& second) {
if (Offset != 0) {
__m512i first_idx = _mm512_set_epi32(
Offset + 15, Offset + 14, Offset + 13, Offset + 12, Offset + 11,
Offset + 10, Offset + 9, Offset + 8, Offset + 7, Offset + 6,
Offset + 5, Offset + 4, Offset + 3, Offset + 2, Offset + 1, Offset);
__m512i second_idx =
_mm512_set_epi32(Offset - 1, Offset - 2, Offset - 3, Offset - 4,
Offset - 5, Offset - 6, Offset - 7, Offset - 8,
Offset - 9, Offset - 10, Offset - 11, Offset - 12,
Offset - 13, Offset - 14, Offset - 15, Offset - 16);
unsigned short mask = 0xFFFF;
mask <<= (16 - Offset);
first = _mm512_permutexvar_ps(first_idx, first);
Packet16f tmp = _mm512_permutexvar_ps(second_idx, second);
first = _mm512_mask_blend_ps(mask, first, tmp);
}
}
};
template <int Offset>
struct palign_impl<Offset, Packet8d> {
static EIGEN_STRONG_INLINE void run(Packet8d& first, const Packet8d& second) {
if (Offset != 0) {
__m512i first_idx = _mm512_set_epi32(
0, Offset + 7, 0, Offset + 6, 0, Offset + 5, 0, Offset + 4, 0,
Offset + 3, 0, Offset + 2, 0, Offset + 1, 0, Offset);
__m512i second_idx = _mm512_set_epi32(
0, Offset - 1, 0, Offset - 2, 0, Offset - 3, 0, Offset - 4, 0,
Offset - 5, 0, Offset - 6, 0, Offset - 7, 0, Offset - 8);
unsigned char mask = 0xFF;
mask <<= (8 - Offset);
first = _mm512_permutexvar_pd(first_idx, first);
Packet8d tmp = _mm512_permutexvar_pd(second_idx, second);
first = _mm512_mask_blend_pd(mask, first, tmp);
}
}
};
} // end namespace internal
} // end namespace Eigen
+1
@@ -42,6 +42,7 @@
#define EIGEN_EXPLICIT_CAST(tgt_type) operator tgt_type()
#endif
#include <sstream>
namespace Eigen {
+2 -2
@@ -230,7 +230,7 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2&
#else
float a1 = __low2float(a);
float a2 = __high2float(a);
return Eigen::half(half_impl::raw_uint16_to_half(__float2half_rn(a1 + a2)));
return Eigen::half(__float2half_rn(a1 + a2));
#endif
}
@@ -264,7 +264,7 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const ha
#else
float a1 = __low2float(a);
float a2 = __high2float(a);
return Eigen::half(half_impl::raw_uint16_to_half(__float2half_rn(a1 * a2)));
return Eigen::half(__float2half_rn(a1 * a2));
#endif
}
+1 -1
@@ -768,7 +768,7 @@ struct scalar_sign_op<Scalar,true> {
if (aa==real_type(0))
return Scalar(0);
aa = real_type(1)/aa;
return Scalar(real(a)*aa, imag(a)*aa );
return Scalar(a.real()*aa, a.imag()*aa );
}
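The rewritten branch computes sign(a) = a/|a| through one reciprocal of the modulus, switching to member a.real()/a.imag() calls. A self-contained check of the formula (illustrative, using std::complex rather than Eigen's scalar machinery):

#include <complex>

// For a = 3+4i, aa = |a| = 5, so sign_model(a) = 0.6 + 0.8i, which has
// unit modulus; a == 0 maps to 0, matching the early return above.
std::complex<double> sign_model(const std::complex<double>& a) {
  double aa = std::abs(a);
  if (aa == 0.0) return std::complex<double>(0.0, 0.0);
  aa = 1.0 / aa;
  return std::complex<double>(a.real() * aa, a.imag() * aa);
}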
//TODO
//template <typename Packet>
@@ -115,7 +115,8 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
// registers. However once the latency is hidden there is no point in
// increasing the value of k, so we'll cap it at 320 (value determined
// experimentally).
const Index k_cache = (numext::mini<Index>)((l1-ksub)/kdiv, 320);
// To prevent k from vanishing, make k_cache at least as big as kr
const Index k_cache = numext::maxi<Index>(kr, (numext::mini<Index>)((l1-ksub)/kdiv, 320));
if (k_cache < k) {
k = k_cache - (k_cache % kr);
eigen_internal_assert(k > 0);
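The guard added above only changes the outcome when the L1-derived bound drops below kr; a hedged numeric model of the heuristic (cache parameters are illustrative, not taken from the commit):

#include <algorithm>

// With l1 = 32768, ksub = 0, kdiv = 64, kr = 8 the uncapped bound is
// 512, so the 320 cap wins; only when (l1 - ksub) / kdiv < kr does the
// new maxi(kr, ...) clamp keep k from vanishing.
long k_cache_model(long l1, long ksub, long kdiv, long kr) {
  return std::max(kr, std::min((l1 - ksub) / kdiv, 320L));
}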
@@ -648,8 +649,8 @@ public:
// Vectorized path
EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacketType& dest) const
{
dest.first = pset1<RealPacket>(real(*b));
dest.second = pset1<RealPacket>(imag(*b));
dest.first = pset1<RealPacket>(numext::real(*b));
dest.second = pset1<RealPacket>(numext::imag(*b));
}
EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const
+20 -17
@@ -20,8 +20,9 @@ template<typename _LhsScalar, typename _RhsScalar> class level3_blocking;
template<
typename Index,
typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs>
struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor>
typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs,
int ResInnerStride>
struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor,ResInnerStride>
{
typedef gebp_traits<RhsScalar,LhsScalar> Traits;
@@ -30,7 +31,7 @@ struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLh
Index rows, Index cols, Index depth,
const LhsScalar* lhs, Index lhsStride,
const RhsScalar* rhs, Index rhsStride,
ResScalar* res, Index resStride,
ResScalar* res, Index resIncr, Index resStride,
ResScalar alpha,
level3_blocking<RhsScalar,LhsScalar>& blocking,
GemmParallelInfo<Index>* info = 0)
@@ -39,8 +40,8 @@ struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLh
general_matrix_matrix_product<Index,
RhsScalar, RhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateRhs,
LhsScalar, LhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateLhs,
ColMajor>
::run(cols,rows,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha,blocking,info);
ColMajor,ResInnerStride>
::run(cols,rows,depth,rhs,rhsStride,lhs,lhsStride,res,resIncr,resStride,alpha,blocking,info);
}
};
@@ -49,8 +50,9 @@ struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLh
template<
typename Index,
typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs>
struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor>
typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs,
int ResInnerStride>
struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride>
{
typedef gebp_traits<LhsScalar,RhsScalar> Traits;
@@ -59,17 +61,17 @@ typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScala
static void run(Index rows, Index cols, Index depth,
const LhsScalar* _lhs, Index lhsStride,
const RhsScalar* _rhs, Index rhsStride,
ResScalar* _res, Index resStride,
ResScalar* _res, Index resIncr, Index resStride,
ResScalar alpha,
level3_blocking<LhsScalar,RhsScalar>& blocking,
GemmParallelInfo<Index>* info = 0)
{
typedef const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> LhsMapper;
typedef const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> RhsMapper;
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
LhsMapper lhs(_lhs,lhsStride);
RhsMapper rhs(_rhs,rhsStride);
ResMapper res(_res, resStride);
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor,Unaligned,ResInnerStride> ResMapper;
LhsMapper lhs(_lhs, lhsStride);
RhsMapper rhs(_rhs, rhsStride);
ResMapper res(_res, resStride, resIncr);
Index kc = blocking.kc(); // cache block size along the K direction
Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
@@ -226,7 +228,7 @@ struct gemm_functor
Gemm::run(rows, cols, m_lhs.cols(),
&m_lhs.coeffRef(row,0), m_lhs.outerStride(),
&m_rhs.coeffRef(0,col), m_rhs.outerStride(),
(Scalar*)&(m_dest.coeffRef(row,col)), m_dest.outerStride(),
(Scalar*)&(m_dest.coeffRef(row,col)), m_dest.innerStride(), m_dest.outerStride(),
m_actualAlpha, m_blocking, info);
}
@@ -428,7 +430,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
static void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0)
lazyproduct::evalTo(dst, lhs, rhs);
lazyproduct::eval_dynamic(dst, lhs, rhs, internal::assign_op<typename Dst::Scalar,Scalar>());
else
{
dst.setZero();
@@ -440,7 +442,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
static void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0)
lazyproduct::addTo(dst, lhs, rhs);
lazyproduct::eval_dynamic(dst, lhs, rhs, internal::add_assign_op<typename Dst::Scalar,Scalar>());
else
scaleAndAddTo(dst,lhs, rhs, Scalar(1));
}
@@ -449,7 +451,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0)
lazyproduct::subTo(dst, lhs, rhs);
lazyproduct::eval_dynamic(dst, lhs, rhs, internal::sub_assign_op<typename Dst::Scalar,Scalar>());
else
scaleAndAddTo(dst, lhs, rhs, Scalar(-1));
}
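Across the three dispatchers above, the test (rhs.rows()+dst.rows()+dst.cols()) < 20 is a cheap "tiny product" check: below that size the blocked-GEMM setup cost dominates, so the product is evaluated coefficient-wise via eval_dynamic with the matching assignment functor. An illustrative restatement:

// Hypothetical helper restating the dispatch condition used by
// evalTo/addTo/subTo above (the threshold 20 comes from the source).
bool use_lazy_product(long rhs_rows, long dst_rows, long dst_cols) {
  return rhs_rows + dst_rows + dst_cols < 20 && rhs_rows > 0;
}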
@@ -476,7 +478,8 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
Index,
LhsScalar, (ActualLhsTypeCleaned::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(LhsBlasTraits::NeedToConjugate),
RhsScalar, (ActualRhsTypeCleaned::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(RhsBlasTraits::NeedToConjugate),
(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor>,
(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,
Dest::InnerStrideAtCompileTime>,
ActualLhsTypeCleaned, ActualRhsTypeCleaned, Dest, BlockingType> GemmFunctor;
BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), 1, true);
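The recurring change across these GEMM hunks is the extra resIncr/ResInnerStride plumbing: the destination is no longer assumed dense in its inner direction. A minimal model of the addressing the new ResMapper performs (names illustrative):

// Col-major element (i, j) lives at data[i*incr + j*stride]; incr == 1
// recovers the old dense layout, so existing callers are unchanged.
template <typename Scalar>
struct ResMapperModel {
  Scalar* data;
  long stride;  // outer (column-to-column) stride
  long incr;    // inner stride, newly threaded through the kernels
  Scalar& operator()(long i, long j) const {
    return data[i * incr + j * stride];
  }
};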
@@ -25,51 +25,54 @@ namespace internal {
**********************************************************************/
// forward declarations (defined at the end of this file)
template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjLhs, bool ConjRhs, int UpLo>
template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjLhs, bool ConjRhs, int ResInnerStride, int UpLo>
struct tribb_kernel;
/* Optimized matrix-matrix product evaluating only one triangular half */
template <typename Index,
typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs,
int ResStorageOrder, int UpLo, int Version = Specialized>
int ResStorageOrder, int ResInnerStride, int UpLo, int Version = Specialized>
struct general_matrix_matrix_triangular_product;
// As usual, if the result is row-major we transpose the product
template <typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs, int UpLo, int Version>
struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor,UpLo,Version>
typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs,
int ResInnerStride, int UpLo, int Version>
struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor,ResInnerStride,UpLo,Version>
{
typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* lhs, Index lhsStride,
const RhsScalar* rhs, Index rhsStride, ResScalar* res, Index resStride,
const RhsScalar* rhs, Index rhsStride, ResScalar* res, Index resIncr, Index resStride,
const ResScalar& alpha, level3_blocking<RhsScalar,LhsScalar>& blocking)
{
general_matrix_matrix_triangular_product<Index,
RhsScalar, RhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateRhs,
LhsScalar, LhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateLhs,
ColMajor, UpLo==Lower?Upper:Lower>
::run(size,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha,blocking);
ColMajor, ResInnerStride, UpLo==Lower?Upper:Lower>
::run(size,depth,rhs,rhsStride,lhs,lhsStride,res,resIncr,resStride,alpha,blocking);
}
};
template <typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs, int UpLo, int Version>
struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor,UpLo,Version>
typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs,
int ResInnerStride, int UpLo, int Version>
struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride,UpLo,Version>
{
typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* _lhs, Index lhsStride,
const RhsScalar* _rhs, Index rhsStride, ResScalar* _res, Index resStride,
const RhsScalar* _rhs, Index rhsStride,
ResScalar* _res, Index resIncr, Index resStride,
const ResScalar& alpha, level3_blocking<LhsScalar,RhsScalar>& blocking)
{
typedef gebp_traits<LhsScalar,RhsScalar> Traits;
typedef const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> LhsMapper;
typedef const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> RhsMapper;
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
LhsMapper lhs(_lhs,lhsStride);
RhsMapper rhs(_rhs,rhsStride);
ResMapper res(_res, resStride);
ResMapper res(_res, resStride, resIncr);
Index kc = blocking.kc();
Index mc = (std::min)(size,blocking.mc());
@@ -87,7 +90,7 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
tribb_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs, UpLo> sybb;
tribb_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs, ResInnerStride, UpLo> sybb;
for(Index k2=0; k2<depth; k2+=kc)
{
@@ -110,8 +113,7 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
gebp(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc,
(std::min)(size,i2), alpha, -1, -1, 0, 0);
sybb(_res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha);
sybb(_res+resStride*i2 + resIncr*i2, resIncr, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha);
if (UpLo==Upper)
{
@@ -133,7 +135,7 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
// while the triangular block overlapping the diagonal is evaluated into a
// small temporary buffer which is then accumulated into the result using a
// triangular traversal.
template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjLhs, bool ConjRhs, int UpLo>
template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjLhs, bool ConjRhs, int ResInnerStride, int UpLo>
struct tribb_kernel
{
typedef gebp_traits<LhsScalar,RhsScalar,ConjLhs,ConjRhs> Traits;
@@ -142,11 +144,13 @@ struct tribb_kernel
enum {
BlockSize = meta_least_common_multiple<EIGEN_PLAIN_ENUM_MAX(mr,nr),EIGEN_PLAIN_ENUM_MIN(mr,nr)>::ret
};
void operator()(ResScalar* _res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha)
void operator()(ResScalar* _res, Index resIncr, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha)
{
typedef blas_data_mapper<ResScalar, Index, ColMajor> ResMapper;
ResMapper res(_res, resStride);
gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, mr, nr, ConjLhs, ConjRhs> gebp_kernel;
typedef blas_data_mapper<ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
typedef blas_data_mapper<ResScalar, Index, ColMajor, Unaligned> BufferMapper;
ResMapper res(_res, resStride, resIncr);
gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, mr, nr, ConjLhs, ConjRhs> gebp_kernel1;
gebp_kernel<LhsScalar, RhsScalar, Index, BufferMapper, mr, nr, ConjLhs, ConjRhs> gebp_kernel2;
Matrix<ResScalar,BlockSize,BlockSize,ColMajor> buffer((internal::constructor_without_unaligned_array_assert()));
@@ -158,31 +162,32 @@ struct tribb_kernel
const RhsScalar* actual_b = blockB+j*depth;
if(UpLo==Upper)
gebp_kernel(res.getSubMapper(0, j), blockA, actual_b, j, depth, actualBlockSize, alpha,
-1, -1, 0, 0);
gebp_kernel1(res.getSubMapper(0, j), blockA, actual_b, j, depth, actualBlockSize, alpha,
-1, -1, 0, 0);
// selfadjoint micro block
{
Index i = j;
buffer.setZero();
// 1 - apply the kernel on the temporary buffer
gebp_kernel(ResMapper(buffer.data(), BlockSize), blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha,
-1, -1, 0, 0);
gebp_kernel2(BufferMapper(buffer.data(), BlockSize), blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha,
-1, -1, 0, 0);
// 2 - triangular accumulation
for(Index j1=0; j1<actualBlockSize; ++j1)
{
ResScalar* r = &res(i, j + j1);
typename ResMapper::LinearMapper r = res.getLinearMapper(i,j+j1);
for(Index i1=UpLo==Lower ? j1 : 0;
UpLo==Lower ? i1<actualBlockSize : i1<=j1; ++i1)
r[i1] += buffer(i1,j1);
r(i1) += buffer(i1,j1);
}
}
if(UpLo==Lower)
{
Index i = j+actualBlockSize;
gebp_kernel(res.getSubMapper(i, j), blockA+depth*i, actual_b, size-i,
depth, actualBlockSize, alpha, -1, -1, 0, 0);
gebp_kernel1(res.getSubMapper(i, j), blockA+depth*i, actual_b, size-i,
depth, actualBlockSize, alpha, -1, -1, 0, 0);
}
}
}
@@ -286,11 +291,12 @@ struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,false>
internal::general_matrix_matrix_triangular_product<Index,
typename Lhs::Scalar, LhsIsRowMajor ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate,
typename Rhs::Scalar, RhsIsRowMajor ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate,
IsRowMajor ? RowMajor : ColMajor, UpLo&(Lower|Upper)>
IsRowMajor ? RowMajor : ColMajor, MatrixType::InnerStrideAtCompileTime, UpLo&(Lower|Upper)>
::run(size, depth,
&actualLhs.coeffRef(SkipDiag&&(UpLo&Lower)==Lower ? 1 : 0,0), actualLhs.outerStride(),
&actualRhs.coeffRef(0,SkipDiag&&(UpLo&Upper)==Upper ? 1 : 0), actualRhs.outerStride(),
mat.data() + (SkipDiag ? (bool(IsRowMajor) != ((UpLo&Lower)==Lower) ? 1 : mat.outerStride() ) : 0), mat.outerStride(), actualAlpha, blocking);
mat.data() + (SkipDiag ? (bool(IsRowMajor) != ((UpLo&Lower)==Lower) ? mat.innerStride() : mat.outerStride() ) : 0),
mat.innerStride(), mat.outerStride(), actualAlpha, blocking);
}
};
@@ -40,7 +40,7 @@ namespace internal {
template <typename Index, typename Scalar, int AStorageOrder, bool ConjugateA, int ResStorageOrder, int UpLo>
struct general_matrix_matrix_rankupdate :
general_matrix_matrix_triangular_product<
Index,Scalar,AStorageOrder,ConjugateA,Scalar,AStorageOrder,ConjugateA,ResStorageOrder,UpLo,BuiltIn> {};
Index,Scalar,AStorageOrder,ConjugateA,Scalar,AStorageOrder,ConjugateA,ResStorageOrder,1,UpLo,BuiltIn> {};
// try to go to BLAS specialization
@@ -48,9 +48,9 @@ struct general_matrix_matrix_rankupdate :
template <typename Index, int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs, int UpLo> \
struct general_matrix_matrix_triangular_product<Index,Scalar,LhsStorageOrder,ConjugateLhs, \
Scalar,RhsStorageOrder,ConjugateRhs,ColMajor,UpLo,Specialized> { \
Scalar,RhsStorageOrder,ConjugateRhs,ColMajor,1,UpLo,Specialized> { \
static EIGEN_STRONG_INLINE void run(Index size, Index depth,const Scalar* lhs, Index lhsStride, \
const Scalar* rhs, Index rhsStride, Scalar* res, Index resStride, Scalar alpha, level3_blocking<Scalar, Scalar>& blocking) \
const Scalar* rhs, Index rhsStride, Scalar* res, Index resIncr, Index resStride, Scalar alpha, level3_blocking<Scalar, Scalar>& blocking) \
{ \
if ( lhs==rhs && ((UpLo&(Lower|Upper))==UpLo) ) { \
general_matrix_matrix_rankupdate<Index,Scalar,LhsStorageOrder,ConjugateLhs,ColMajor,UpLo> \
@@ -59,8 +59,8 @@ struct general_matrix_matrix_triangular_product<Index,Scalar,LhsStorageOrder,Con
general_matrix_matrix_triangular_product<Index, \
Scalar, LhsStorageOrder, ConjugateLhs, \
Scalar, RhsStorageOrder, ConjugateRhs, \
ColMajor, UpLo, BuiltIn> \
::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha,blocking); \
ColMajor, 1, UpLo, BuiltIn> \
::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resIncr,resStride,alpha,blocking); \
} \
} \
};
@@ -51,20 +51,22 @@ template< \
typename Index, \
int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs> \
struct general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor> \
struct general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor,1> \
{ \
typedef gebp_traits<EIGTYPE,EIGTYPE> Traits; \
\
static void run(Index rows, Index cols, Index depth, \
const EIGTYPE* _lhs, Index lhsStride, \
const EIGTYPE* _rhs, Index rhsStride, \
EIGTYPE* res, Index resStride, \
EIGTYPE* res, Index resIncr, Index resStride, \
EIGTYPE alpha, \
level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/, \
GemmParallelInfo<Index>* /*info = 0*/) \
{ \
using std::conj; \
\
EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \
eigen_assert(resIncr == 1); \
char transa, transb; \
BlasIndex m, n, k, lda, ldb, ldc; \
const EIGTYPE *a, *b; \
+11 -4
@@ -17,7 +17,8 @@ namespace internal {
/** \internal */
inline void manage_multi_threading(Action action, int* v)
{
static EIGEN_UNUSED int m_maxThreads = -1;
static int m_maxThreads = -1;
EIGEN_UNUSED_VARIABLE(m_maxThreads);
if(action==SetAction)
{
@@ -131,7 +132,8 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth,
ei_declare_aligned_stack_constructed_variable(GemmParallelInfo<Index>,info,threads,0);
#pragma omp parallel num_threads(threads)
int errorCount = 0;
#pragma omp parallel num_threads(threads) reduction(+: errorCount)
{
Index i = omp_get_thread_num();
// Note that the actual number of threads might be lower than the number of requested ones.
@@ -150,9 +152,14 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth,
info[i].lhs_start = r0;
info[i].lhs_length = actualBlockRows;
if(transpose) func(c0, actualBlockCols, 0, rows, info);
else func(0, rows, c0, actualBlockCols, info);
EIGEN_TRY {
if(transpose) func(c0, actualBlockCols, 0, rows, info);
else func(0, rows, c0, actualBlockCols, info);
} EIGEN_CATCH(...) {
++errorCount;
}
}
if (errorCount) EIGEN_THROW_X(Eigen::eigen_assert_exception());
#endif
}
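The errorCount reduction exists because a C++ exception must not escape an OpenMP parallel region: each thread swallows failures into its counter, the counters are summed by the reduction, and a single exception is rethrown sequentially afterwards. A standalone sketch of the pattern (the per-thread work() is hypothetical):

#include <stdexcept>

void run_parallel_model(int threads) {
  int errorCount = 0;
  #pragma omp parallel num_threads(threads) reduction(+: errorCount)
  {
    try {
      // work();  // hypothetical per-thread workload
    } catch (...) {
      ++errorCount;  // swallow inside the region, count via reduction
    }
  }
  if (errorCount) throw std::runtime_error("parallel worker failed");
}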
@@ -277,20 +277,21 @@ struct symm_pack_rhs
template <typename Scalar, typename Index,
int LhsStorageOrder, bool LhsSelfAdjoint, bool ConjugateLhs,
int RhsStorageOrder, bool RhsSelfAdjoint, bool ConjugateRhs,
int ResStorageOrder>
int ResStorageOrder, int ResInnerStride>
struct product_selfadjoint_matrix;
template <typename Scalar, typename Index,
int LhsStorageOrder, bool LhsSelfAdjoint, bool ConjugateLhs,
int RhsStorageOrder, bool RhsSelfAdjoint, bool ConjugateRhs>
struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,LhsSelfAdjoint,ConjugateLhs, RhsStorageOrder,RhsSelfAdjoint,ConjugateRhs,RowMajor>
int RhsStorageOrder, bool RhsSelfAdjoint, bool ConjugateRhs,
int ResInnerStride>
struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,LhsSelfAdjoint,ConjugateLhs, RhsStorageOrder,RhsSelfAdjoint,ConjugateRhs,RowMajor,ResInnerStride>
{
static EIGEN_STRONG_INLINE void run(
Index rows, Index cols,
const Scalar* lhs, Index lhsStride,
const Scalar* rhs, Index rhsStride,
Scalar* res, Index resStride,
Scalar* res, Index resIncr, Index resStride,
const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
{
product_selfadjoint_matrix<Scalar, Index,
@@ -298,33 +299,35 @@ struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,LhsSelfAdjoint,Co
RhsSelfAdjoint, NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(RhsSelfAdjoint,ConjugateRhs),
EIGEN_LOGICAL_XOR(LhsSelfAdjoint,LhsStorageOrder==RowMajor) ? ColMajor : RowMajor,
LhsSelfAdjoint, NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(LhsSelfAdjoint,ConjugateLhs),
ColMajor>
::run(cols, rows, rhs, rhsStride, lhs, lhsStride, res, resStride, alpha, blocking);
ColMajor,ResInnerStride>
::run(cols, rows, rhs, rhsStride, lhs, lhsStride, res, resIncr, resStride, alpha, blocking);
}
};
template <typename Scalar, typename Index,
int LhsStorageOrder, bool ConjugateLhs,
int RhsStorageOrder, bool ConjugateRhs>
struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,ConjugateLhs, RhsStorageOrder,false,ConjugateRhs,ColMajor>
int RhsStorageOrder, bool ConjugateRhs,
int ResInnerStride>
struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,ConjugateLhs, RhsStorageOrder,false,ConjugateRhs,ColMajor,ResInnerStride>
{
static EIGEN_DONT_INLINE void run(
Index rows, Index cols,
const Scalar* _lhs, Index lhsStride,
const Scalar* _rhs, Index rhsStride,
Scalar* res, Index resStride,
Scalar* res, Index resIncr, Index resStride,
const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking);
};
template <typename Scalar, typename Index,
int LhsStorageOrder, bool ConjugateLhs,
int RhsStorageOrder, bool ConjugateRhs>
EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,ConjugateLhs, RhsStorageOrder,false,ConjugateRhs,ColMajor>::run(
int RhsStorageOrder, bool ConjugateRhs,
int ResInnerStride>
EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,ConjugateLhs, RhsStorageOrder,false,ConjugateRhs,ColMajor,ResInnerStride>::run(
Index rows, Index cols,
const Scalar* _lhs, Index lhsStride,
const Scalar* _rhs, Index rhsStride,
Scalar* _res, Index resStride,
Scalar* _res, Index resIncr, Index resStride,
const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
{
Index size = rows;
@@ -334,11 +337,11 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
typedef const_blas_data_mapper<Scalar, Index, (LhsStorageOrder == RowMajor) ? ColMajor : RowMajor> LhsTransposeMapper;
typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
LhsMapper lhs(_lhs,lhsStride);
LhsTransposeMapper lhs_transpose(_lhs,lhsStride);
RhsMapper rhs(_rhs,rhsStride);
ResMapper res(_res, resStride);
ResMapper res(_res, resStride, resIncr);
Index kc = blocking.kc(); // cache block size along the K direction
Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
@@ -398,26 +401,28 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
// matrix * selfadjoint product
template <typename Scalar, typename Index,
int LhsStorageOrder, bool ConjugateLhs,
int RhsStorageOrder, bool ConjugateRhs>
struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,ConjugateLhs, RhsStorageOrder,true,ConjugateRhs,ColMajor>
int RhsStorageOrder, bool ConjugateRhs,
int ResInnerStride>
struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,ConjugateLhs, RhsStorageOrder,true,ConjugateRhs,ColMajor,ResInnerStride>
{
static EIGEN_DONT_INLINE void run(
Index rows, Index cols,
const Scalar* _lhs, Index lhsStride,
const Scalar* _rhs, Index rhsStride,
Scalar* res, Index resStride,
Scalar* res, Index resIncr, Index resStride,
const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking);
};
template <typename Scalar, typename Index,
int LhsStorageOrder, bool ConjugateLhs,
int RhsStorageOrder, bool ConjugateRhs>
EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,ConjugateLhs, RhsStorageOrder,true,ConjugateRhs,ColMajor>::run(
int RhsStorageOrder, bool ConjugateRhs,
int ResInnerStride>
EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,ConjugateLhs, RhsStorageOrder,true,ConjugateRhs,ColMajor,ResInnerStride>::run(
Index rows, Index cols,
const Scalar* _lhs, Index lhsStride,
const Scalar* _rhs, Index rhsStride,
Scalar* _res, Index resStride,
Scalar* _res, Index resIncr, Index resStride,
const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
{
Index size = cols;
@@ -425,9 +430,9 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f
typedef gebp_traits<Scalar,Scalar> Traits;
typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
LhsMapper lhs(_lhs,lhsStride);
ResMapper res(_res,resStride);
ResMapper res(_res,resStride, resIncr);
Index kc = blocking.kc(); // cache block size along the K direction
Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
@@ -503,12 +508,13 @@ struct selfadjoint_product_impl<Lhs,LhsMode,false,Rhs,RhsMode,false>
NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(LhsIsUpper,bool(LhsBlasTraits::NeedToConjugate)),
EIGEN_LOGICAL_XOR(RhsIsUpper,internal::traits<Rhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, RhsIsSelfAdjoint,
NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(RhsIsUpper,bool(RhsBlasTraits::NeedToConjugate)),
internal::traits<Dest>::Flags&RowMajorBit ? RowMajor : ColMajor>
internal::traits<Dest>::Flags&RowMajorBit ? RowMajor : ColMajor,
Dest::InnerStrideAtCompileTime>
::run(
lhs.rows(), rhs.cols(), // sizes
&lhs.coeffRef(0,0), lhs.outerStride(), // lhs info
&rhs.coeffRef(0,0), rhs.outerStride(), // rhs info
&dst.coeffRef(0,0), dst.outerStride(), // result info
&dst.coeffRef(0,0), dst.innerStride(), dst.outerStride(), // result info
actualAlpha, blocking // alpha
);
}
@@ -44,16 +44,18 @@ namespace internal {
template <typename Index, \
int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs> \
struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLhs,RhsStorageOrder,false,ConjugateRhs,ColMajor> \
struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLhs,RhsStorageOrder,false,ConjugateRhs,ColMajor,1> \
{\
\
static void run( \
Index rows, Index cols, \
const EIGTYPE* _lhs, Index lhsStride, \
const EIGTYPE* _rhs, Index rhsStride, \
EIGTYPE* res, Index resStride, \
EIGTYPE* res, Index resIncr, Index resStride, \
EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \
{ \
EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \
eigen_assert(resIncr == 1); \
char side='L', uplo='L'; \
BlasIndex m, n, lda, ldb, ldc; \
const EIGTYPE *a, *b; \
@@ -91,15 +93,17 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLh
template <typename Index, \
int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs> \
struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLhs,RhsStorageOrder,false,ConjugateRhs,ColMajor> \
struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLhs,RhsStorageOrder,false,ConjugateRhs,ColMajor,1> \
{\
static void run( \
Index rows, Index cols, \
const EIGTYPE* _lhs, Index lhsStride, \
const EIGTYPE* _rhs, Index rhsStride, \
EIGTYPE* res, Index resStride, \
EIGTYPE* res, Index resIncr, Index resStride, \
EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \
{ \
EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \
eigen_assert(resIncr == 1); \
char side='L', uplo='L'; \
BlasIndex m, n, lda, ldb, ldc; \
const EIGTYPE *a, *b; \
@@ -167,16 +171,18 @@ EIGEN_BLAS_HEMM_L(scomplex, float, cf, chemm_)
template <typename Index, \
int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs> \
struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateLhs,RhsStorageOrder,true,ConjugateRhs,ColMajor> \
struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateLhs,RhsStorageOrder,true,ConjugateRhs,ColMajor,1> \
{\
\
static void run( \
Index rows, Index cols, \
const EIGTYPE* _lhs, Index lhsStride, \
const EIGTYPE* _rhs, Index rhsStride, \
EIGTYPE* res, Index resStride, \
EIGTYPE* res, Index resIncr, Index resStride, \
EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \
{ \
EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \
eigen_assert(resIncr == 1); \
char side='R', uplo='L'; \
BlasIndex m, n, lda, ldb, ldc; \
const EIGTYPE *a, *b; \
@@ -213,15 +219,17 @@ struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateL
template <typename Index, \
int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs> \
struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateLhs,RhsStorageOrder,true,ConjugateRhs,ColMajor> \
struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateLhs,RhsStorageOrder,true,ConjugateRhs,ColMajor,1> \
{\
static void run( \
Index rows, Index cols, \
const EIGTYPE* _lhs, Index lhsStride, \
const EIGTYPE* _rhs, Index rhsStride, \
EIGTYPE* res, Index resStride, \
EIGTYPE* res, Index resIncr, Index resStride, \
EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) \
{ \
EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \
eigen_assert(resIncr == 1); \
char side='R', uplo='L'; \
BlasIndex m, n, lda, ldb, ldc; \
const EIGTYPE *a, *b; \
+2 -2
@@ -109,10 +109,10 @@ struct selfadjoint_product_selector<MatrixType,OtherType,UpLo,false>
internal::general_matrix_matrix_triangular_product<Index,
Scalar, OtherIsRowMajor ? RowMajor : ColMajor, OtherBlasTraits::NeedToConjugate && NumTraits<Scalar>::IsComplex,
Scalar, OtherIsRowMajor ? ColMajor : RowMajor, (!OtherBlasTraits::NeedToConjugate) && NumTraits<Scalar>::IsComplex,
IsRowMajor ? RowMajor : ColMajor, UpLo>
IsRowMajor ? RowMajor : ColMajor, MatrixType::InnerStrideAtCompileTime, UpLo>
::run(size, depth,
&actualOther.coeffRef(0,0), actualOther.outerStride(), &actualOther.coeffRef(0,0), actualOther.outerStride(),
mat.data(), mat.outerStride(), actualAlpha, blocking);
mat.data(), mat.innerStride(), mat.outerStride(), actualAlpha, blocking);
}
};
@@ -45,22 +45,24 @@ template <typename Scalar, typename Index,
int Mode, bool LhsIsTriangular,
int LhsStorageOrder, bool ConjugateLhs,
int RhsStorageOrder, bool ConjugateRhs,
int ResStorageOrder, int Version = Specialized>
int ResStorageOrder, int ResInnerStride,
int Version = Specialized>
struct product_triangular_matrix_matrix;
template <typename Scalar, typename Index,
int Mode, bool LhsIsTriangular,
int LhsStorageOrder, bool ConjugateLhs,
int RhsStorageOrder, bool ConjugateRhs, int Version>
int RhsStorageOrder, bool ConjugateRhs,
int ResInnerStride, int Version>
struct product_triangular_matrix_matrix<Scalar,Index,Mode,LhsIsTriangular,
LhsStorageOrder,ConjugateLhs,
RhsStorageOrder,ConjugateRhs,RowMajor,Version>
RhsStorageOrder,ConjugateRhs,RowMajor,ResInnerStride,Version>
{
static EIGEN_STRONG_INLINE void run(
Index rows, Index cols, Index depth,
const Scalar* lhs, Index lhsStride,
const Scalar* rhs, Index rhsStride,
Scalar* res, Index resStride,
Scalar* res, Index resIncr, Index resStride,
const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
{
product_triangular_matrix_matrix<Scalar, Index,
@@ -70,18 +72,19 @@ struct product_triangular_matrix_matrix<Scalar,Index,Mode,LhsIsTriangular,
ConjugateRhs,
LhsStorageOrder==RowMajor ? ColMajor : RowMajor,
ConjugateLhs,
ColMajor>
::run(cols, rows, depth, rhs, rhsStride, lhs, lhsStride, res, resStride, alpha, blocking);
ColMajor, ResInnerStride>
::run(cols, rows, depth, rhs, rhsStride, lhs, lhsStride, res, resIncr, resStride, alpha, blocking);
}
};
// implements col-major += alpha * op(triangular) * op(general)
template <typename Scalar, typename Index, int Mode,
int LhsStorageOrder, bool ConjugateLhs,
int RhsStorageOrder, bool ConjugateRhs, int Version>
int RhsStorageOrder, bool ConjugateRhs,
int ResInnerStride, int Version>
struct product_triangular_matrix_matrix<Scalar,Index,Mode,true,
LhsStorageOrder,ConjugateLhs,
RhsStorageOrder,ConjugateRhs,ColMajor,Version>
RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride,Version>
{
typedef gebp_traits<Scalar,Scalar> Traits;
@@ -95,20 +98,21 @@ struct product_triangular_matrix_matrix<Scalar,Index,Mode,true,
Index _rows, Index _cols, Index _depth,
const Scalar* _lhs, Index lhsStride,
const Scalar* _rhs, Index rhsStride,
Scalar* res, Index resStride,
Scalar* res, Index resIncr, Index resStride,
const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking);
};
template <typename Scalar, typename Index, int Mode,
int LhsStorageOrder, bool ConjugateLhs,
int RhsStorageOrder, bool ConjugateRhs, int Version>
int RhsStorageOrder, bool ConjugateRhs,
int ResInnerStride, int Version>
EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
LhsStorageOrder,ConjugateLhs,
RhsStorageOrder,ConjugateRhs,ColMajor,Version>::run(
RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride,Version>::run(
Index _rows, Index _cols, Index _depth,
const Scalar* _lhs, Index lhsStride,
const Scalar* _rhs, Index rhsStride,
Scalar* _res, Index resStride,
Scalar* _res, Index resIncr, Index resStride,
const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
{
// strip zeros
@@ -119,10 +123,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
LhsMapper lhs(_lhs,lhsStride);
RhsMapper rhs(_rhs,rhsStride);
ResMapper res(_res, resStride);
ResMapper res(_res, resStride, resIncr);
Index kc = blocking.kc(); // cache block size along the K direction
Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
@@ -235,10 +239,11 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
// implements col-major += alpha * op(general) * op(triangular)
template <typename Scalar, typename Index, int Mode,
int LhsStorageOrder, bool ConjugateLhs,
int RhsStorageOrder, bool ConjugateRhs, int Version>
int RhsStorageOrder, bool ConjugateRhs,
int ResInnerStride, int Version>
struct product_triangular_matrix_matrix<Scalar,Index,Mode,false,
LhsStorageOrder,ConjugateLhs,
RhsStorageOrder,ConjugateRhs,ColMajor,Version>
RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride,Version>
{
typedef gebp_traits<Scalar,Scalar> Traits;
enum {
@@ -251,20 +256,21 @@ struct product_triangular_matrix_matrix<Scalar,Index,Mode,false,
Index _rows, Index _cols, Index _depth,
const Scalar* _lhs, Index lhsStride,
const Scalar* _rhs, Index rhsStride,
Scalar* res, Index resStride,
Scalar* res, Index resIncr, Index resStride,
const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking);
};
template <typename Scalar, typename Index, int Mode,
int LhsStorageOrder, bool ConjugateLhs,
int RhsStorageOrder, bool ConjugateRhs, int Version>
int RhsStorageOrder, bool ConjugateRhs,
int ResInnerStride, int Version>
EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
LhsStorageOrder,ConjugateLhs,
RhsStorageOrder,ConjugateRhs,ColMajor,Version>::run(
RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride,Version>::run(
Index _rows, Index _cols, Index _depth,
const Scalar* _lhs, Index lhsStride,
const Scalar* _rhs, Index rhsStride,
Scalar* _res, Index resStride,
Scalar* _res, Index resIncr, Index resStride,
const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
{
const Index PacketBytes = packet_traits<Scalar>::size*sizeof(Scalar);
@@ -276,10 +282,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor> ResMapper;
typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
LhsMapper lhs(_lhs,lhsStride);
RhsMapper rhs(_rhs,rhsStride);
ResMapper res(_res, resStride);
ResMapper res(_res, resStride, resIncr);
Index kc = blocking.kc(); // cache block size along the K direction
Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
@@ -433,12 +439,12 @@ struct triangular_product_impl<Mode,LhsIsTriangular,Lhs,false,Rhs,false>
Mode, LhsIsTriangular,
(internal::traits<ActualLhsTypeCleaned>::Flags&RowMajorBit) ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate,
(internal::traits<ActualRhsTypeCleaned>::Flags&RowMajorBit) ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate,
(internal::traits<Dest >::Flags&RowMajorBit) ? RowMajor : ColMajor>
(internal::traits<Dest >::Flags&RowMajorBit) ? RowMajor : ColMajor, Dest::InnerStrideAtCompileTime>
::run(
stripedRows, stripedCols, stripedDepth, // sizes
&lhs.coeffRef(0,0), lhs.outerStride(), // lhs info
&rhs.coeffRef(0,0), rhs.outerStride(), // rhs info
&dst.coeffRef(0,0), dst.outerStride(), // result info
&dst.coeffRef(0,0), dst.innerStride(), dst.outerStride(), // result info
actualAlpha, blocking
);
@@ -46,7 +46,7 @@ template <typename Scalar, typename Index,
struct product_triangular_matrix_matrix_trmm :
product_triangular_matrix_matrix<Scalar,Index,Mode,
LhsIsTriangular,LhsStorageOrder,ConjugateLhs,
RhsStorageOrder, ConjugateRhs, ResStorageOrder, BuiltIn> {};
RhsStorageOrder, ConjugateRhs, ResStorageOrder, 1, BuiltIn> {};
// try to go to BLAS specialization
@@ -55,13 +55,15 @@ template <typename Index, int Mode, \
int LhsStorageOrder, bool ConjugateLhs, \
int RhsStorageOrder, bool ConjugateRhs> \
struct product_triangular_matrix_matrix<Scalar,Index, Mode, LhsIsTriangular, \
LhsStorageOrder,ConjugateLhs, RhsStorageOrder,ConjugateRhs,ColMajor,Specialized> { \
LhsStorageOrder,ConjugateLhs, RhsStorageOrder,ConjugateRhs,ColMajor,1,Specialized> { \
static inline void run(Index _rows, Index _cols, Index _depth, const Scalar* _lhs, Index lhsStride,\
const Scalar* _rhs, Index rhsStride, Scalar* res, Index resStride, Scalar alpha, level3_blocking<Scalar,Scalar>& blocking) { \
const Scalar* _rhs, Index rhsStride, Scalar* res, Index resIncr, Index resStride, Scalar alpha, level3_blocking<Scalar,Scalar>& blocking) { \
EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \
eigen_assert(resIncr == 1); \
product_triangular_matrix_matrix_trmm<Scalar,Index,Mode, \
LhsIsTriangular,LhsStorageOrder,ConjugateLhs, \
RhsStorageOrder, ConjugateRhs, ColMajor>::run( \
_rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking); \
_rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking); \
} \
};
@@ -115,8 +117,8 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \
if (((nthr==1) && (((std::max)(rows,depth)-diagSize)/(double)diagSize < 0.5))) { \
/* Most likely no benefit to call TRMM or GEMM from BLAS */ \
product_triangular_matrix_matrix<EIGTYPE,Index,Mode,true, \
LhsStorageOrder,ConjugateLhs, RhsStorageOrder, ConjugateRhs, ColMajor, BuiltIn>::run( \
_rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking); \
LhsStorageOrder,ConjugateLhs, RhsStorageOrder, ConjugateRhs, ColMajor, 1, BuiltIn>::run( \
_rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, 1, resStride, alpha, blocking); \
/*std::cout << "TRMM_L: A is not square! Go to Eigen TRMM implementation!\n";*/ \
} else { \
/* Makes sense to call GEMM */ \
@@ -124,8 +126,8 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \
MatrixLhs aa_tmp=lhsMap.template triangularView<Mode>(); \
BlasIndex aStride = convert_index<BlasIndex>(aa_tmp.outerStride()); \
gemm_blocking_space<ColMajor,EIGTYPE,EIGTYPE,Dynamic,Dynamic,Dynamic> gemm_blocking(_rows,_cols,_depth, 1, true); \
general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor>::run( \
rows, cols, depth, aa_tmp.data(), aStride, _rhs, rhsStride, res, resStride, alpha, gemm_blocking, 0); \
general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor,1>::run( \
rows, cols, depth, aa_tmp.data(), aStride, _rhs, rhsStride, res, 1, resStride, alpha, gemm_blocking, 0); \
\
/*std::cout << "TRMM_L: A is not square! Go to BLAS GEMM implementation! " << nthr<<" \n";*/ \
} \
@@ -232,8 +234,8 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \
if ((nthr==1) && (((std::max)(cols,depth)-diagSize)/(double)diagSize < 0.5)) { \
/* Most likely no benefit to call TRMM or GEMM from BLAS*/ \
product_triangular_matrix_matrix<EIGTYPE,Index,Mode,false, \
LhsStorageOrder,ConjugateLhs, RhsStorageOrder, ConjugateRhs, ColMajor, BuiltIn>::run( \
_rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking); \
LhsStorageOrder,ConjugateLhs, RhsStorageOrder, ConjugateRhs, ColMajor, 1, BuiltIn>::run( \
_rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, 1, resStride, alpha, blocking); \
/*std::cout << "TRMM_R: A is not square! Go to Eigen TRMM implementation!\n";*/ \
} else { \
/* Makes sense to call GEMM */ \
@@ -241,8 +243,8 @@ struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \
MatrixRhs aa_tmp=rhsMap.template triangularView<Mode>(); \
BlasIndex aStride = convert_index<BlasIndex>(aa_tmp.outerStride()); \
gemm_blocking_space<ColMajor,EIGTYPE,EIGTYPE,Dynamic,Dynamic,Dynamic> gemm_blocking(_rows,_cols,_depth, 1, true); \
general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor>::run( \
rows, cols, depth, _lhs, lhsStride, aa_tmp.data(), aStride, res, resStride, alpha, gemm_blocking, 0); \
general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor,1>::run( \
rows, cols, depth, _lhs, lhsStride, aa_tmp.data(), aStride, res, 1, resStride, alpha, gemm_blocking, 0); \
\
/*std::cout << "TRMM_R: A is not square! Go to BLAS GEMM implementation! " << nthr<<" \n";*/ \
} \
@@ -15,48 +15,48 @@ namespace Eigen {
namespace internal {
// If the rhs is row-major, transpose the product
template <typename Scalar, typename Index, int Side, int Mode, bool Conjugate, int TriStorageOrder>
struct triangular_solve_matrix<Scalar,Index,Side,Mode,Conjugate,TriStorageOrder,RowMajor>
template <typename Scalar, typename Index, int Side, int Mode, bool Conjugate, int TriStorageOrder, int OtherInnerStride>
struct triangular_solve_matrix<Scalar,Index,Side,Mode,Conjugate,TriStorageOrder,RowMajor,OtherInnerStride>
{
static void run(
Index size, Index cols,
const Scalar* tri, Index triStride,
Scalar* _other, Index otherStride,
Scalar* _other, Index otherIncr, Index otherStride,
level3_blocking<Scalar,Scalar>& blocking)
{
triangular_solve_matrix<
Scalar, Index, Side==OnTheLeft?OnTheRight:OnTheLeft,
(Mode&UnitDiag) | ((Mode&Upper) ? Lower : Upper),
NumTraits<Scalar>::IsComplex && Conjugate,
TriStorageOrder==RowMajor ? ColMajor : RowMajor, ColMajor>
::run(size, cols, tri, triStride, _other, otherStride, blocking);
TriStorageOrder==RowMajor ? ColMajor : RowMajor, ColMajor, OtherInnerStride>
::run(size, cols, tri, triStride, _other, otherIncr, otherStride, blocking);
}
};
/* Optimized triangular solver with multiple right-hand sides and the triangular matrix on the left
*/
template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder>
struct triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor>
template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder,int OtherInnerStride>
struct triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor,OtherInnerStride>
{
static EIGEN_DONT_INLINE void run(
Index size, Index otherSize,
const Scalar* _tri, Index triStride,
Scalar* _other, Index otherStride,
Scalar* _other, Index otherIncr, Index otherStride,
level3_blocking<Scalar,Scalar>& blocking);
};
template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder>
EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor>::run(
template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder, int OtherInnerStride>
EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor,OtherInnerStride>::run(
Index size, Index otherSize,
const Scalar* _tri, Index triStride,
Scalar* _other, Index otherStride,
Scalar* _other, Index otherIncr, Index otherStride,
level3_blocking<Scalar,Scalar>& blocking)
{
Index cols = otherSize;
typedef const_blas_data_mapper<Scalar, Index, TriStorageOrder> TriMapper;
typedef blas_data_mapper<Scalar, Index, ColMajor> OtherMapper;
typedef blas_data_mapper<Scalar, Index, ColMajor, Unaligned, OtherInnerStride> OtherMapper;
TriMapper tri(_tri, triStride);
OtherMapper other(_other, otherStride);
OtherMapper other(_other, otherStride, otherIncr);
typedef gebp_traits<Scalar,Scalar> Traits;
@@ -128,19 +128,19 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
{
Scalar b(0);
const Scalar* l = &tri(i,s);
Scalar* r = &other(s,j);
typename OtherMapper::LinearMapper r = other.getLinearMapper(s,j);
for (Index i3=0; i3<k; ++i3)
b += conj(l[i3]) * r[i3];
b += conj(l[i3]) * r(i3);
other(i,j) = (other(i,j) - b)*a;
}
else
{
Scalar b = (other(i,j) *= a);
Scalar* r = &other(s,j);
const Scalar* l = &tri(s,i);
typename OtherMapper::LinearMapper r = other.getLinearMapper(s,j);
typename TriMapper::LinearMapper l = tri.getLinearMapper(s,i);
for (Index i3=0;i3<rs;++i3)
r[i3] -= b * conj(l[i3]);
r(i3) -= b * conj(l(i3));
}
}
}
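The substitution loops now go through LinearMapper instead of raw Scalar* so that a non-unit inner stride of the solved matrix is honored: r(i) resolves to data[i*incr] rather than r[i]. A model of that access (illustrative):

template <typename Scalar>
struct LinearMapperModel {
  Scalar* data;
  long incr;  // inner stride; 1 reproduces the old raw-pointer path
  Scalar& operator()(long i) const { return data[i * incr]; }
};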
@@ -185,28 +185,28 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
/* Optimized triangular solver with multiple left-hand sides and the triangular matrix on the right
*/
template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder>
struct triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor>
template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder, int OtherInnerStride>
struct triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor,OtherInnerStride>
{
static EIGEN_DONT_INLINE void run(
Index size, Index otherSize,
const Scalar* _tri, Index triStride,
Scalar* _other, Index otherStride,
Scalar* _other, Index otherIncr, Index otherStride,
level3_blocking<Scalar,Scalar>& blocking);
};
template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder>
EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor>::run(
template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder, int OtherInnerStride>
EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor,OtherInnerStride>::run(
Index size, Index otherSize,
const Scalar* _tri, Index triStride,
Scalar* _other, Index otherStride,
Scalar* _other, Index otherIncr, Index otherStride,
level3_blocking<Scalar,Scalar>& blocking)
{
Index rows = otherSize;
typedef typename NumTraits<Scalar>::Real RealScalar;
typedef blas_data_mapper<Scalar, Index, ColMajor> LhsMapper;
typedef blas_data_mapper<Scalar, Index, ColMajor, Unaligned, OtherInnerStride> LhsMapper;
typedef const_blas_data_mapper<Scalar, Index, TriStorageOrder> RhsMapper;
LhsMapper lhs(_other, otherStride);
LhsMapper lhs(_other, otherStride, otherIncr);
RhsMapper rhs(_tri, triStride);
typedef gebp_traits<Scalar,Scalar> Traits;
@@ -297,24 +297,24 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
{
Index j = IsLower ? absolute_j2+actualPanelWidth-k-1 : absolute_j2+k;
Scalar* r = &lhs(i2,j);
typename LhsMapper::LinearMapper r = lhs.getLinearMapper(i2,j);
for (Index k3=0; k3<k; ++k3)
{
Scalar b = conj(rhs(IsLower ? j+1+k3 : absolute_j2+k3,j));
Scalar* a = &lhs(i2,IsLower ? j+1+k3 : absolute_j2+k3);
typename LhsMapper::LinearMapper a = lhs.getLinearMapper(i2,IsLower ? j+1+k3 : absolute_j2+k3);
for (Index i=0; i<actual_mc; ++i)
r[i] -= a[i] * b;
r(i) -= a(i) * b;
}
if((Mode & UnitDiag)==0)
{
Scalar inv_rjj = RealScalar(1)/conj(rhs(j,j));
for (Index i=0; i<actual_mc; ++i)
r[i] *= inv_rjj;
r(i) *= inv_rjj;
}
}
// pack the just-computed part of lhs into A
pack_lhs_panel(blockA, LhsMapper(_other+absolute_j2*otherStride+i2, otherStride),
pack_lhs_panel(blockA, lhs.getSubMapper(i2,absolute_j2),
actualPanelWidth, actual_mc,
actual_kc, j2);
}
@@ -40,7 +40,7 @@ namespace internal {
// implements LeftSide op(triangular)^-1 * general
#define EIGEN_BLAS_TRSM_L(EIGTYPE, BLASTYPE, BLASFUNC) \
template <typename Index, int Mode, bool Conjugate, int TriStorageOrder> \
struct triangular_solve_matrix<EIGTYPE,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor> \
struct triangular_solve_matrix<EIGTYPE,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor,1> \
{ \
enum { \
IsLower = (Mode&Lower) == Lower, \
@@ -51,8 +51,10 @@ struct triangular_solve_matrix<EIGTYPE,Index,OnTheLeft,Mode,Conjugate,TriStorage
static void run( \
Index size, Index otherSize, \
const EIGTYPE* _tri, Index triStride, \
EIGTYPE* _other, Index otherStride, level3_blocking<EIGTYPE,EIGTYPE>& /*blocking*/) \
EIGTYPE* _other, Index otherIncr, Index otherStride, level3_blocking<EIGTYPE,EIGTYPE>& /*blocking*/) \
{ \
EIGEN_ONLY_USED_FOR_DEBUG(otherIncr); \
eigen_assert(otherIncr == 1); \
BlasIndex m = convert_index<BlasIndex>(size), n = convert_index<BlasIndex>(otherSize), lda, ldb; \
char side = 'L', uplo, diag='N', transa; \
/* Set alpha_ */ \
@@ -99,7 +101,7 @@ EIGEN_BLAS_TRSM_L(scomplex, float, ctrsm_)
// implements RightSide general * op(triangular)^-1
#define EIGEN_BLAS_TRSM_R(EIGTYPE, BLASTYPE, BLASFUNC) \
template <typename Index, int Mode, bool Conjugate, int TriStorageOrder> \
struct triangular_solve_matrix<EIGTYPE,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor> \
struct triangular_solve_matrix<EIGTYPE,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor,1> \
{ \
enum { \
IsLower = (Mode&Lower) == Lower, \
@@ -110,8 +112,10 @@ struct triangular_solve_matrix<EIGTYPE,Index,OnTheRight,Mode,Conjugate,TriStorag
static void run( \
Index size, Index otherSize, \
const EIGTYPE* _tri, Index triStride, \
EIGTYPE* _other, Index otherStride, level3_blocking<EIGTYPE,EIGTYPE>& /*blocking*/) \
EIGTYPE* _other, Index otherIncr, Index otherStride, level3_blocking<EIGTYPE,EIGTYPE>& /*blocking*/) \
{ \
EIGEN_ONLY_USED_FOR_DEBUG(otherIncr); \
eigen_assert(otherIncr == 1); \
BlasIndex m = convert_index<BlasIndex>(otherSize), n = convert_index<BlasIndex>(size), lda, ldb; \
char side = 'R', uplo, diag='N', transa; \
/* Set alpha_ */ \
+108 -7
@@ -31,7 +31,7 @@ template<
typename Index,
typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs,
int ResStorageOrder>
int ResStorageOrder, int ResInnerStride>
struct general_matrix_matrix_product;
template<typename Index,
@@ -155,13 +155,21 @@ class BlasVectorMapper {
Scalar* m_data;
};
template<typename Scalar, typename Index, int AlignmentType, int Incr=1>
class BlasLinearMapper;
template<typename Scalar, typename Index, int AlignmentType>
class BlasLinearMapper {
class BlasLinearMapper<Scalar,Index,AlignmentType,1> {
public:
typedef typename packet_traits<Scalar>::type Packet;
typedef typename packet_traits<Scalar>::half HalfPacket;
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data) : m_data(data) {}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data, Index incr=1)
: m_data(data)
{
EIGEN_ONLY_USED_FOR_DEBUG(incr);
eigen_assert(incr==1);
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void prefetch(int i) const {
internal::prefetch(&operator()(i));
@@ -188,16 +196,25 @@ class BlasLinearMapper {
};
// Lightweight helper class to access matrix coefficients.
template<typename Scalar, typename Index, int StorageOrder, int AlignmentType = Unaligned>
class blas_data_mapper {
public:
template<typename Scalar, typename Index, int StorageOrder, int AlignmentType = Unaligned, int Incr = 1>
class blas_data_mapper;
template<typename Scalar, typename Index, int StorageOrder, int AlignmentType>
class blas_data_mapper<Scalar,Index,StorageOrder,AlignmentType,1>
{
public:
typedef typename packet_traits<Scalar>::type Packet;
typedef typename packet_traits<Scalar>::half HalfPacket;
typedef BlasLinearMapper<Scalar, Index, AlignmentType> LinearMapper;
typedef BlasVectorMapper<Scalar, Index> VectorMapper;
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride) : m_data(data), m_stride(stride) {}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride, Index incr=1)
: m_data(data), m_stride(stride)
{
EIGEN_ONLY_USED_FOR_DEBUG(incr);
eigen_assert(incr==1);
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType>
getSubMapper(Index i, Index j) const {
@@ -251,6 +268,90 @@ class blas_data_mapper {
const Index m_stride;
};
// Implementation of non-natural increment (i.e. inner-stride != 1)
// The exposed API is not yet as complete as in the Incr==1 case
// because some features make less sense in this case.
template<typename Scalar, typename Index, int AlignmentType, int Incr>
class BlasLinearMapper
{
public:
typedef typename packet_traits<Scalar>::type Packet;
typedef typename packet_traits<Scalar>::half HalfPacket;
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data,Index incr) : m_data(data), m_incr(incr) {}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void prefetch(int i) const {
internal::prefetch(&operator()(i));
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar& operator()(Index i) const {
return m_data[i*m_incr.value()];
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const {
return pgather<Scalar,Packet>(m_data + i*m_incr.value(), m_incr.value());
}
template<typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const PacketType &p) const {
pscatter<Scalar, PacketType>(m_data + i*m_incr.value(), p, m_incr.value());
}
protected:
Scalar *m_data;
const internal::variable_if_dynamic<Index,Incr> m_incr;
};
template<typename Scalar, typename Index, int StorageOrder, int AlignmentType,int Incr>
class blas_data_mapper
{
public:
typedef typename packet_traits<Scalar>::type Packet;
typedef typename packet_traits<Scalar>::half HalfPacket;
typedef BlasLinearMapper<Scalar, Index, AlignmentType,Incr> LinearMapper;
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride, Index incr) : m_data(data), m_stride(stride), m_incr(incr) {}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE blas_data_mapper
getSubMapper(Index i, Index j) const {
return blas_data_mapper(&operator()(i, j), m_stride, m_incr.value());
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const {
return LinearMapper(&operator()(i, j), m_incr.value());
}
EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE Scalar& operator()(Index i, Index j) const {
return m_data[StorageOrder==RowMajor ? j*m_incr.value() + i*m_stride : i*m_incr.value() + j*m_stride];
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const {
return pgather<Scalar,Packet>(&operator()(i, j),m_incr.value());
}
template <typename PacketT, int AlignmentT>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i, Index j) const {
return pgather<Scalar,PacketT>(&operator()(i, j),m_incr.value());
}
template<typename SubPacket>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, const SubPacket &p) const {
pscatter<Scalar, SubPacket>(&operator()(i, j), p, m_stride);
}
template<typename SubPacket>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE SubPacket gatherPacket(Index i, Index j) const {
return pgather<Scalar, SubPacket>(&operator()(i, j), m_stride);
}
protected:
Scalar* EIGEN_RESTRICT m_data;
const Index m_stride;
const internal::variable_if_dynamic<Index,Incr> m_incr;
};
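
The 2D mapper composes the same inner increment with an outer stride; a tiny worked example of the column-major branch of operator() above (buffer layout assumed):

#include <cassert>

int main() {
  // Column-major with inner increment: coefficient (i,j) lives at i*incr + j*stride.
  const long incr = 2, rows = 3, stride = rows * incr;
  double buf[24] = {0};              // backs a 3x4 logical matrix with incr == 2
  buf[2 * incr + 1 * stride] = 42.0; // write logical coefficient (2,1)
  assert(buf[10] == 42.0);           // 2*2 + 1*6 == 10
  return 0;
}
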
// lightweight helper class to access matrix coefficients (const version)
template<typename Scalar, typename Index, int StorageOrder>
class const_blas_data_mapper : public blas_data_mapper<const Scalar, Index, StorageOrder> {
+12 -1
View File
@@ -57,7 +57,10 @@
#if __GNUC__>=6
#pragma GCC diagnostic ignored "-Wignored-attributes"
#endif
#if __GNUC__==7
// See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89325
#pragma GCC diagnostic ignored "-Wattributes"
#endif
#endif
#if defined __NVCC__
@@ -80,4 +83,12 @@
#pragma diag_suppress 2737
#endif
#else
// warnings already disabled:
# ifndef EIGEN_WARNINGS_DISABLED_2
# define EIGEN_WARNINGS_DISABLED_2
# elif defined(EIGEN_INTERNAL_DEBUGGING)
# error "Do not include \"DisableStupidWarnings.h\" recursively more than twice!"
# endif
#endif // not EIGEN_WARNINGS_DISABLED
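
Together with the ReenableStupidWarnings.h hunk further down, this forms a depth-two include guard. A reduced, compilable model of the pattern (GUARD_* names hypothetical):

// push (models DisableStupidWarnings.h):
#ifndef GUARD_DISABLED
  #define GUARD_DISABLED        // first inclusion: pragmas pushed, warnings off
#else
  #ifndef GUARD_DISABLED_2
    #define GUARD_DISABLED_2    // second, nested inclusion: remember the extra push
  #else
    #error "included recursively more than twice"
  #endif
#endif

// pop (models ReenableStupidWarnings.h):
#ifdef GUARD_DISABLED_2
  #undef GUARD_DISABLED_2       // inner pop: keep warnings disabled for the outer level
#else
  #ifdef GUARD_DISABLED
    #undef GUARD_DISABLED       // outermost pop: pragmas popped, warnings back on
  #endif
#endif

int main() { return 0; }
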
+1 -5
View File
@@ -47,11 +47,7 @@ template<typename T> struct NumTraits;
template<typename Derived> struct EigenBase;
template<typename Derived> class DenseBase;
template<typename Derived> class PlainObjectBase;
template<typename Derived,
int Level = internal::accessors_level<Derived>::value >
class DenseCoeffsBase;
template<typename Derived, int Level> class DenseCoeffsBase;
template<typename _Scalar, int _Rows, int _Cols,
int _Options = AutoAlign |
+55 -3
View File
@@ -13,7 +13,7 @@
#define EIGEN_WORLD_VERSION 3
#define EIGEN_MAJOR_VERSION 3
#define EIGEN_MINOR_VERSION 7
#define EIGEN_MINOR_VERSION 8
#define EIGEN_VERSION_AT_LEAST(x,y,z) (EIGEN_WORLD_VERSION>x || (EIGEN_WORLD_VERSION>=x && \
(EIGEN_MAJOR_VERSION>y || (EIGEN_MAJOR_VERSION>=y && \
@@ -380,7 +380,8 @@
#if EIGEN_MAX_CPP_VER>=11 && \
((defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901)) \
|| (defined(__GNUC__) && defined(_GLIBCXX_USE_C99)) \
|| (defined(_LIBCPP_VERSION) && !defined(_MSC_VER)))
|| (defined(_LIBCPP_VERSION) && !defined(_MSC_VER)) \
|| (EIGEN_COMP_MSVC >= 1900) )
#define EIGEN_HAS_C99_MATH 1
#else
#define EIGEN_HAS_C99_MATH 0
@@ -396,6 +397,20 @@
#endif
#endif
// Does the compiler support type_traits?
// - full support for type traits was only added in GCC 5.1.0.
// - 20150626 corresponds to the last release of 4.x libstdc++
#ifndef EIGEN_HAS_TYPE_TRAITS
#if EIGEN_MAX_CPP_VER>=11 && (EIGEN_HAS_CXX11 || EIGEN_COMP_MSVC >= 1700) \
&& ((!EIGEN_COMP_GNUC_STRICT) || EIGEN_GNUC_AT_LEAST(5, 1)) \
&& ((!defined(__GLIBCXX__)) || __GLIBCXX__ > 20150626)
#define EIGEN_HAS_TYPE_TRAITS 1
#define EIGEN_INCLUDE_TYPE_TRAITS
#else
#define EIGEN_HAS_TYPE_TRAITS 0
#endif
#endif
// Does the compiler support variadic templates?
#ifndef EIGEN_HAS_VARIADIC_TEMPLATES
#if EIGEN_MAX_CPP_VER>=11 && (__cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900) \
@@ -835,11 +850,48 @@ namespace Eigen {
#endif
/**
* \internal
* \brief Macro to explicitly define the default copy constructor.
* This is necessary because the implicit definition is deprecated if the copy-assignment is overridden.
*/
#if EIGEN_HAS_CXX11
#define EIGEN_DEFAULT_COPY_CONSTRUCTOR(CLASS) EIGEN_DEVICE_FUNC CLASS(const CLASS&) = default;
#else
#define EIGEN_DEFAULT_COPY_CONSTRUCTOR(CLASS)
#endif
/** \internal
* \brief Macro to manually inherit assignment operators.
* This is necessary, because the implicitly defined assignment operator gets deleted when a custom operator= is defined.
* With C++11 or later this also default-implements the copy-constructor
*/
#define EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Derived) EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived)
#define EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Derived) \
EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
EIGEN_DEFAULT_COPY_CONSTRUCTOR(Derived)
/** \internal
* \brief Macro to manually define default constructors and destructors.
* This is necessary when the copy constructor is re-defined.
* For empty helper classes this should usually be protected, to avoid accidentally creating empty objects.
*
* Hiding the default destructor leads to problems in C++03 mode together with boost::multiprecision
*/
#if EIGEN_HAS_CXX11
#define EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(Derived) \
EIGEN_DEVICE_FUNC Derived() = default; \
EIGEN_DEVICE_FUNC ~Derived() = default;
#else
#define EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(Derived) \
EIGEN_DEVICE_FUNC Derived() {}; \
/* EIGEN_DEVICE_FUNC ~Derived() {}; */
#endif
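
A standalone illustration of the C++11 rule these macros address (not Eigen code): a user-declared operator= deprecates the implicitly generated copy constructor, so -Wdeprecated-copy warns unless the copy constructor is explicitly defaulted:

struct WithAssign {
  WithAssign() = default;
  WithAssign(const WithAssign&) = default; // what EIGEN_DEFAULT_COPY_CONSTRUCTOR adds
  WithAssign& operator=(const WithAssign&) { return *this; }
};

int main() {
  WithAssign a;
  WithAssign b(a); // copy construction stays warning-free under -Wdeprecated-copy
  (void)b;
  return 0;
}
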
/**
* Just a side note. Commenting within defines works only by documenting
+34
View File
@@ -97,6 +97,9 @@ template<> struct is_arithmetic<unsigned int> { enum { value = true }; };
template<> struct is_arithmetic<signed long> { enum { value = true }; };
template<> struct is_arithmetic<unsigned long> { enum { value = true }; };
#if EIGEN_HAS_CXX11
using std::is_integral;
#else
template<typename T> struct is_integral { enum { value = false }; };
template<> struct is_integral<bool> { enum { value = true }; };
template<> struct is_integral<char> { enum { value = true }; };
@@ -108,6 +111,11 @@ template<> struct is_integral<signed int> { enum { value = true }; };
template<> struct is_integral<unsigned int> { enum { value = true }; };
template<> struct is_integral<signed long> { enum { value = true }; };
template<> struct is_integral<unsigned long> { enum { value = true }; };
#if EIGEN_COMP_MSVC
template<> struct is_integral<signed __int64> { enum { value = true }; };
template<> struct is_integral<unsigned __int64>{ enum { value = true }; };
#endif
#endif
#if EIGEN_HAS_CXX11
using std::make_unsigned;
@@ -531,4 +539,30 @@ bool not_equal_strict(const double& x,const double& y) { return std::not_equal_t
} // end namespace Eigen
// Define portable (u)int{32,64} types
#if EIGEN_HAS_CXX11
#include <cstdint>
namespace Eigen {
namespace numext {
typedef std::uint32_t uint32_t;
typedef std::int32_t int32_t;
typedef std::uint64_t uint64_t;
typedef std::int64_t int64_t;
}
}
#else
// Without C++11, all compilers able to compile Eigen also
// provide the C99 stdint.h header file.
#include <stdint.h>
namespace Eigen {
namespace numext {
typedef ::uint32_t uint32_t;
typedef ::int32_t int32_t;
typedef ::uint64_t uint64_t;
typedef ::int64_t int64_t;
}
}
#endif
#endif // EIGEN_META_H
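
Either branch lands on the same fixed-width aliases; a usage sketch against the 3.3.8 headers:

#include <Eigen/Core>

int main() {
  Eigen::numext::uint32_t mask = 0xFFFFFFFFu;                     // 32-bit everywhere
  Eigen::numext::int64_t  big  = Eigen::numext::int64_t(1) << 40; // 64-bit everywhere
  return (sizeof(mask) == 4 && sizeof(big) == 8) ? 0 : 1;
}
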
+5 -1
View File
@@ -1,4 +1,8 @@
#ifdef EIGEN_WARNINGS_DISABLED
#ifdef EIGEN_WARNINGS_DISABLED_2
// "DisableStupidWarnings.h" was included twice recursively: Do not reenable warnings yet!
# undef EIGEN_WARNINGS_DISABLED_2
#elif defined(EIGEN_WARNINGS_DISABLED)
#undef EIGEN_WARNINGS_DISABLED
#ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
+17
View File
@@ -34,6 +34,20 @@ inline IndexDest convert_index(const IndexSrc& idx) {
return IndexDest(idx);
}
// true if T can be considered as an integral index (i.e., an integral type or enum)
template<typename T> struct is_valid_index_type
{
enum { value =
#if EIGEN_HAS_TYPE_TRAITS
internal::is_integral<T>::value || std::is_enum<T>::value
#elif EIGEN_COMP_MSVC
internal::is_integral<T>::value || __is_enum(T)
#else
// without C++11, we use is_convertible to Index instead of is_integral in order to treat enums as Index.
internal::is_convertible<T,Index>::value && !internal::is_same<T,float>::value && !is_same<T,double>::value
#endif
};
};
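
A sketch of what the trait accepts, assuming a C++11 build so the first branch and static_assert are available; MyDim is a hypothetical user enum:

#include <Eigen/Core>

enum MyDim { Rows = 2 };
static_assert(Eigen::internal::is_valid_index_type<int>::value,    "integral types qualify");
static_assert(Eigen::internal::is_valid_index_type<MyDim>::value,  "enums qualify too");
static_assert(!Eigen::internal::is_valid_index_type<float>::value, "floating point does not");

int main() { return 0; }
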
// promote_scalar_arg is a helper used in operations between an expression and a scalar, like:
// expression * scalar
@@ -90,6 +104,9 @@ class no_assignment_operator
{
private:
no_assignment_operator& operator=(const no_assignment_operator&);
protected:
EIGEN_DEFAULT_COPY_CONSTRUCTOR(no_assignment_operator)
EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(no_assignment_operator)
};
/** \internal return the index type with the largest number of bits */
+6 -3
View File
@@ -300,10 +300,13 @@ typename ComplexSchur<MatrixType>::ComplexScalar ComplexSchur<MatrixType>::compu
ComplexScalar trace = t.coeff(0,0) + t.coeff(1,1);
ComplexScalar eival1 = (trace + disc) / RealScalar(2);
ComplexScalar eival2 = (trace - disc) / RealScalar(2);
if(numext::norm1(eival1) > numext::norm1(eival2))
RealScalar eival1_norm = numext::norm1(eival1);
RealScalar eival2_norm = numext::norm1(eival2);
// A division by zero can only occur if eival1==eival2==0.
// In this case, det==0, and all we have to do is check that eival2_norm!=0
if(eival1_norm > eival2_norm)
eival2 = det / eival1;
else
else if(eival2_norm!=RealScalar(0))
eival1 = det / eival2;
// choose the eigenvalue closest to the bottom entry of the diagonal
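
A worked numeric sketch of why the refinement matters: for a 2x2 block the eigenvalue product equals det, so the small root is recovered to full precision as det divided by the large root, while the guard skips the division when both roots vanish (values illustrative):

#include <complex>
#include <cstdio>

int main() {
  typedef std::complex<double> C;
  C t00(1e8, 0), t01(1, 0), t10(1, 0), t11(2e-8, 0);
  C det   = t00 * t11 - t01 * t10;          // == 1
  C trace = t00 + t11;
  C disc  = std::sqrt(trace * trace - 4.0 * det);
  C e1 = (trace + disc) / 2.0;
  C e2 = (trace - disc) / 2.0;              // cancellation: nearly all digits lost
  if (std::abs(e1) > std::abs(e2))  e2 = det / e1; // refined: ~1e-8 at full precision
  else if (std::abs(e2) != 0.0)     e1 = det / e2; // guard: both roots zero => skip
  std::printf("%.17g %.17g\n", e1.real(), e2.real());
  return 0;
}
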
+11 -4
View File
@@ -236,7 +236,7 @@ template<typename _MatrixType> class RealSchur
typedef Matrix<Scalar,3,1> Vector3s;
Scalar computeNormOfT();
Index findSmallSubdiagEntry(Index iu);
Index findSmallSubdiagEntry(Index iu, const Scalar& considerAsZero);
void splitOffTwoRows(Index iu, bool computeU, const Scalar& exshift);
void computeShift(Index iu, Index iter, Scalar& exshift, Vector3s& shiftInfo);
void initFrancisQRStep(Index il, Index iu, const Vector3s& shiftInfo, Index& im, Vector3s& firstHouseholderVector);
@@ -302,12 +302,16 @@ RealSchur<MatrixType>& RealSchur<MatrixType>::computeFromHessenberg(const HessMa
Index totalIter = 0; // iteration count for whole matrix
Scalar exshift(0); // sum of exceptional shifts
Scalar norm = computeNormOfT();
// sub-diagonal entries smaller than considerAsZero will be treated as zero.
// We use eps^2 to enable more precision in small eigenvalues.
Scalar considerAsZero = numext::maxi<Scalar>( norm * numext::abs2(NumTraits<Scalar>::epsilon()),
(std::numeric_limits<Scalar>::min)() );
if(norm!=Scalar(0))
{
while (iu >= 0)
{
Index il = findSmallSubdiagEntry(iu);
Index il = findSmallSubdiagEntry(iu,considerAsZero);
// Check for convergence
if (il == iu) // One root found
@@ -364,14 +368,17 @@ inline typename MatrixType::Scalar RealSchur<MatrixType>::computeNormOfT()
/** \internal Look for single small sub-diagonal element and returns its index */
template<typename MatrixType>
inline Index RealSchur<MatrixType>::findSmallSubdiagEntry(Index iu)
inline Index RealSchur<MatrixType>::findSmallSubdiagEntry(Index iu, const Scalar& considerAsZero)
{
using std::abs;
Index res = iu;
while (res > 0)
{
Scalar s = abs(m_matT.coeff(res-1,res-1)) + abs(m_matT.coeff(res,res));
if (abs(m_matT.coeff(res,res-1)) <= NumTraits<Scalar>::epsilon() * s)
s = numext::maxi<Scalar>(s * NumTraits<Scalar>::epsilon(), considerAsZero);
if (abs(m_matT.coeff(res,res-1)) <= s)
break;
res--;
}
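
A worked sketch of the new deflation test, assuming double precision (values illustrative):

#include <algorithm>
#include <cmath>
#include <limits>

int main() {
  const double eps  = std::numeric_limits<double>::epsilon();
  const double norm = 1e-3; // stands in for computeNormOfT()
  const double considerAsZero =
      std::max(norm * eps * eps, std::numeric_limits<double>::min());
  // A sub-diagonal entry t is deflated when |t| <= max(s*eps, considerAsZero),
  // with s = |T(res-1,res-1)| + |T(res,res)|:
  double s = 0.5, t = 1e-40;
  bool deflate = std::fabs(t) <= std::max(s * eps, considerAsZero);
  return deflate ? 0 : 1;
}
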
@@ -605,7 +605,8 @@ template<typename SolverType> struct direct_selfadjoint_eigenvalues<SolverType,3
EIGEN_DEVICE_FUNC
static inline bool extract_kernel(MatrixType& mat, Ref<VectorType> res, Ref<VectorType> representative)
{
using std::abs;
EIGEN_USING_STD_MATH(sqrt)
EIGEN_USING_STD_MATH(abs)
Index i0;
// Find non-zero column i0 (by construction, there must exist a non zero coefficient on the diagonal):
mat.diagonal().cwiseAbs().maxCoeff(&i0);
@@ -616,8 +617,8 @@ template<typename SolverType> struct direct_selfadjoint_eigenvalues<SolverType,3
VectorType c0, c1;
n0 = (c0 = representative.cross(mat.col((i0+1)%3))).squaredNorm();
n1 = (c1 = representative.cross(mat.col((i0+2)%3))).squaredNorm();
if(n0>n1) res = c0/std::sqrt(n0);
else res = c1/std::sqrt(n1);
if(n0>n1) res = c0/sqrt(n0);
else res = c1/sqrt(n1);
return true;
}
+20 -2
View File
@@ -169,20 +169,38 @@ class QuaternionBase : public RotationBase<Derived, 3>
/** return the result vector of \a v through the rotation*/
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Vector3 _transformVector(const Vector3& v) const;
#ifdef EIGEN_PARSED_BY_DOXYGEN
/** \returns \c *this with scalar type casted to \a NewScalarType
*
* Note that if \a NewScalarType is equal to the current scalar type of \c *this
* then this function smartly returns a const reference to \c *this.
*/
template<typename NewScalarType>
EIGEN_DEVICE_FUNC inline typename internal::cast_return_type<Derived,Quaternion<NewScalarType> >::type cast() const
EIGEN_DEVICE_FUNC inline typename internal::cast_return_type<Derived,Quaternion<NewScalarType> >::type cast() const;
#else
template<typename NewScalarType>
EIGEN_DEVICE_FUNC inline
typename internal::enable_if<internal::is_same<Scalar,NewScalarType>::value,const Derived&>::type cast() const
{
return typename internal::cast_return_type<Derived,Quaternion<NewScalarType> >::type(derived());
return derived();
}
template<typename NewScalarType>
EIGEN_DEVICE_FUNC inline
typename internal::enable_if<!internal::is_same<Scalar,NewScalarType>::value,Quaternion<NewScalarType> >::type cast() const
{
return Quaternion<NewScalarType>(coeffs().template cast<NewScalarType>());
}
#endif
#ifdef EIGEN_QUATERNIONBASE_PLUGIN
# include EIGEN_QUATERNIONBASE_PLUGIN
#endif
protected:
EIGEN_DEFAULT_COPY_CONSTRUCTOR(QuaternionBase)
EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(QuaternionBase)
};
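
Usage is unchanged; the split merely lets the same-scalar case return a const reference (standard Eigen API):

#include <Eigen/Geometry>

int main() {
  Eigen::Quaterniond qd(1, 0, 0, 0);
  Eigen::Quaternionf qf = qd.cast<float>();         // different scalar: converts
  const Eigen::Quaterniond& qr = qd.cast<double>(); // same scalar: const ref, no copy
  return (qf.w() == 1.0f && qr.w() == 1.0) ? 0 : 1;
}
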
/***************************************************************************
+2 -2
View File
@@ -252,11 +252,11 @@ protected:
public:
/** Default constructor without initialization of the meaningful coefficients.
* If Mode==Affine, then the last row is set to [0 ... 0 1] */
* If Mode==Affine or Mode==Isometry, then the last row is set to [0 ... 0 1] */
EIGEN_DEVICE_FUNC inline Transform()
{
check_template_params();
internal::transform_make_affine<(int(Mode)==Affine) ? Affine : AffineCompact>::run(m_matrix);
internal::transform_make_affine<(int(Mode)==Affine || int(Mode)==Isometry) ? Affine : AffineCompact>::run(m_matrix);
}
EIGEN_DEVICE_FUNC inline Transform(const Transform& other)
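
A usage sketch of the corner case this fixes: a default-constructed Isometry transform now gets the affine bottom row, as documented:

#include <Eigen/Geometry>

int main() {
  Eigen::Transform<double, 3, Eigen::Isometry> T; // linear part left uninitialized
  // With the fix, the bottom row is [0 0 0 1], as for Mode==Affine:
  return (T.matrix()(3, 0) == 0.0 && T.matrix()(3, 3) == 1.0) ? 0 : 1;
}
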
-6
View File
@@ -138,12 +138,6 @@ public:
/** \returns the inverse translation (opposite) */
Translation inverse() const { return Translation(-m_coeffs); }
Translation& operator=(const Translation& other)
{
m_coeffs = other.m_coeffs;
return *this;
}
static const Translation Identity() { return Translation(VectorType::Zero()); }
/** \returns \c *this with scalar type casted to \a NewScalarType
+1 -1
View File
@@ -87,7 +87,7 @@ struct umeyama_transform_matrix_type
* \f{align*}
* T = \begin{bmatrix} c\mathbf{R} & \mathbf{t} \\ \mathbf{0} & 1 \end{bmatrix}
* \f}
* minimizing the resudiual above. This transformation is always returned as an
* minimizing the residual above. This transformation is always returned as an
* Eigen::Matrix.
*/
template <typename Derived, typename OtherDerived>
+4 -1
View File
@@ -519,7 +519,10 @@ void PartialPivLU<MatrixType>::compute()
// the row permutation is stored as int indices, so just to be sure:
eigen_assert(m_lu.rows()<NumTraits<int>::highest());
m_l1_norm = m_lu.cwiseAbs().colwise().sum().maxCoeff();
if(m_lu.cols()>0)
m_l1_norm = m_lu.cwiseAbs().colwise().sum().maxCoeff();
else
m_l1_norm = RealScalar(0);
eigen_assert(m_lu.rows() == m_lu.cols() && "PartialPivLU is only for square (and moreover invertible) matrices");
const Index size = m_lu.rows();
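
The guard matters because colwise().sum().maxCoeff() asserts on an empty expression; a sketch of the now-tolerated degenerate input (assumed to hold for 3.3.8):

#include <Eigen/LU>

int main() {
  Eigen::MatrixXd A(0, 0);                    // degenerate but square
  Eigen::PartialPivLU<Eigen::MatrixXd> lu(A); // previously tripped on the empty maxCoeff()
  return 0;
}
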
+2 -2
View File
@@ -44,7 +44,7 @@ struct compute_inverse_size4<Architecture::SSE, float, MatrixType, ResultType>
static void run(const MatrixType& mat, ResultType& result)
{
ActualMatrixType matrix(mat);
EIGEN_ALIGN16 const unsigned int _Sign_PNNP[4] = { 0x00000000, 0x80000000, 0x80000000, 0x00000000 };
const Packet4f p4f_sign_PNNP = _mm_castsi128_ps(_mm_set_epi32(0x00000000, 0x80000000, 0x80000000, 0x00000000));
// Load the full matrix into registers
__m128 _L1 = matrix.template packet<MatrixAlignment>( 0);
@@ -139,7 +139,7 @@ struct compute_inverse_size4<Architecture::SSE, float, MatrixType, ResultType>
iC = _mm_sub_ps(iC, _mm_mul_ps(_mm_shuffle_ps(A,A,0xB1), _mm_shuffle_ps(DC,DC,0x66)));
rd = _mm_shuffle_ps(rd,rd,0);
rd = _mm_xor_ps(rd, _mm_load_ps((float*)_Sign_PNNP));
rd = _mm_xor_ps(rd, p4f_sign_PNNP);
// iB = C*|B| - D*B#*A
iB = _mm_sub_ps(_mm_mul_ps(C,_mm_shuffle_ps(dB,dB,0)), iB);
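
A reduced, runnable sketch of the register-built sign-mask idiom adopted here (SSE2 assumed; lane values illustrative):

#include <emmintrin.h>

int main() {
  // 0x80000000 in a lane flips that float's sign bit under XOR.
  const __m128 sign_PNNP = _mm_castsi128_ps(
      _mm_set_epi32(0x00000000, 0x80000000, 0x80000000, 0x00000000));
  __m128 v = _mm_set_ps(4.f, 3.f, 2.f, 1.f); // lanes 0..3: 1, 2, 3, 4
  v = _mm_xor_ps(v, sign_PNNP);              // lanes 0..3: 1, -2, -3, 4
  float out[4];
  _mm_storeu_ps(out, v);
  return (out[0] == 1.f && out[1] == -2.f && out[2] == -3.f && out[3] == 4.f) ? 0 : 1;
}
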
+2 -1
View File
@@ -192,7 +192,8 @@ class PardisoImpl : public SparseSolverBase<Derived>
void pardisoInit(int type)
{
m_type = type;
bool symmetric = std::abs(m_type) < 10;
EIGEN_USING_STD(abs);
bool symmetric = abs(m_type) < 10;
m_iparm[0] = 1; // No solver default
m_iparm[1] = 2; // use Metis for the ordering
m_iparm[2] = 0; // Reserved. Set to zero. (??Numbers of processors, value of OMP_NUM_THREADS??)
+49 -18
View File
@@ -768,6 +768,21 @@ void BDCSVD<MatrixType>::computeSingVals(const ArrayRef& col0, const ArrayRef& d
// measure everything relative to shift
Map<ArrayXr> diagShifted(m_workspace.data()+4*n, n);
diagShifted = diag - shift;
if(k!=actual_n-1)
{
// check that after the shift, f(mid) is still negative:
RealScalar midShifted = (right - left) / RealScalar(2);
if(shift==right)
midShifted = -midShifted;
RealScalar fMidShifted = secularEq(midShifted, col0, diag, perm, diagShifted, shift);
if(fMidShifted>0)
{
// fMid was erroneous, fix it:
shift = fMidShifted > Literal(0) ? left : right;
diagShifted = diag - shift;
}
}
// initial guess
RealScalar muPrev, muCur;
@@ -845,11 +860,13 @@ void BDCSVD<MatrixType>::computeSingVals(const ArrayRef& col0, const ArrayRef& d
}
RealScalar fLeft = secularEq(leftShifted, col0, diag, perm, diagShifted, shift);
eigen_internal_assert(fLeft<Literal(0));
#if defined EIGEN_INTERNAL_DEBUGGING || defined EIGEN_BDCSVD_DEBUG_VERBOSE
RealScalar fRight = secularEq(rightShifted, col0, diag, perm, diagShifted, shift);
#endif
#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
if(!(fLeft * fRight<0))
{
@@ -858,23 +875,37 @@ void BDCSVD<MatrixType>::computeSingVals(const ArrayRef& col0, const ArrayRef& d
}
#endif
eigen_internal_assert(fLeft * fRight < Literal(0));
while (rightShifted - leftShifted > Literal(2) * NumTraits<RealScalar>::epsilon() * numext::maxi<RealScalar>(abs(leftShifted), abs(rightShifted)))
{
RealScalar midShifted = (leftShifted + rightShifted) / Literal(2);
fMid = secularEq(midShifted, col0, diag, perm, diagShifted, shift);
if (fLeft * fMid < Literal(0))
{
rightShifted = midShifted;
}
else
{
leftShifted = midShifted;
fLeft = fMid;
}
}
muCur = (leftShifted + rightShifted) / Literal(2);
if(fLeft<Literal(0))
{
while (rightShifted - leftShifted > Literal(2) * NumTraits<RealScalar>::epsilon() * numext::maxi<RealScalar>(abs(leftShifted), abs(rightShifted)))
{
RealScalar midShifted = (leftShifted + rightShifted) / Literal(2);
fMid = secularEq(midShifted, col0, diag, perm, diagShifted, shift);
eigen_internal_assert((numext::isfinite)(fMid));
if (fLeft * fMid < Literal(0))
{
rightShifted = midShifted;
}
else
{
leftShifted = midShifted;
fLeft = fMid;
}
}
muCur = (leftShifted + rightShifted) / Literal(2);
}
else
{
// We have a problem: whichever end we shift to, the secular equation
// has the same sign at the middle of [left,right]...
// Instead of aborting or entering an infinite loop,
// let's just use the middle as the estimated zero-crossing:
muCur = (right - left) * RealScalar(0.5);
if(shift == right)
muCur = -muCur;
}
}
singVals[k] = shift + muCur;
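
A generic, standalone sketch of the safeguarded bisection adopted here: keep bisecting only while the bracket is valid, otherwise settle for the midpoint (f_demo is a stand-in for the secular equation):

#include <cmath>
#include <limits>

template <typename F>
double safeguarded_bisect(F f, double lo, double hi) {
  double fLo = f(lo);
  if (!(fLo < 0))                       // bracket invalid: fall back to the midpoint
    return 0.5 * (lo + hi);
  while (hi - lo > 2 * std::numeric_limits<double>::epsilon()
                     * std::fmax(std::fabs(lo), std::fabs(hi))) {
    double mid  = 0.5 * (lo + hi);
    double fMid = f(mid);
    if (fLo * fMid < 0) hi = mid;
    else { lo = mid; fLo = fMid; }
  }
  return 0.5 * (lo + hi);
}

double f_demo(double x) { return x * x * x - 2.0; }

int main() {
  double root = safeguarded_bisect(f_demo, 0.0, 2.0); // f(0) = -2 < 0: bracket usable
  return (std::fabs(root * root * root - 2.0) < 1e-9) ? 0 : 1;
}
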
@@ -924,7 +955,7 @@ void BDCSVD<MatrixType>::perturbCol0
Index j = i<k ? i : perm(l-1);
prod *= ((singVals(j)+dk) / ((diag(i)+dk))) * ((mus(j)+(shifts(j)-dk)) / ((diag(i)-dk)));
#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
if(i!=k && std::abs(((singVals(j)+dk)*(mus(j)+(shifts(j)-dk)))/((diag(i)+dk)*(diag(i)-dk)) - 1) > 0.9 )
if(i!=k && numext::abs(((singVals(j)+dk)*(mus(j)+(shifts(j)-dk)))/((diag(i)+dk)*(diag(i)-dk)) - 1) > 0.9 )
std::cout << " " << ((singVals(j)+dk)*(mus(j)+(shifts(j)-dk)))/((diag(i)+dk)*(diag(i)-dk)) << " == (" << (singVals(j)+dk) << " * " << (mus(j)+(shifts(j)-dk))
<< ") / (" << (diag(i)+dk) << " * " << (diag(i)-dk) << ")\n";
#endif
@@ -934,7 +965,7 @@ void BDCSVD<MatrixType>::perturbCol0
std::cout << "zhat(" << k << ") = sqrt( " << prod << ") ; " << (singVals(last) + dk) << " * " << mus(last) + shifts(last) << " - " << dk << "\n";
#endif
RealScalar tmp = sqrt(prod);
zhat(k) = col0(k) > Literal(0) ? tmp : -tmp;
zhat(k) = col0(k) > Literal(0) ? RealScalar(tmp) : RealScalar(-tmp);
}
}
}
+1 -1
View File
@@ -183,7 +183,7 @@ public:
// this temporary is needed to work around an MSVC issue
Index diagSize = (std::max<Index>)(1,m_diagSize);
return m_usePrescribedThreshold ? m_prescribedThreshold
: diagSize*NumTraits<Scalar>::epsilon();
: RealScalar(diagSize)*NumTraits<Scalar>::epsilon();
}
/** \returns true if \a U (full or thin) is asked for in this SVD decomposition */
@@ -608,7 +608,7 @@ public:
}
if(Base::m_diag.size()>0)
dest = Base::m_diag.asDiagonal().inverse() * dest;
dest = Base::m_diag.real().asDiagonal().inverse() * dest;
if (Base::m_matrix.nonZeros()>0) // otherwise I==I
{
@@ -156,7 +156,7 @@ void SimplicialCholeskyBase<Derived>::factorize_preordered(const CholMatrixType&
/* the nonzero entry L(k,i) */
Scalar l_ki;
if(DoLDLT)
l_ki = yi / m_diag[i];
l_ki = yi / numext::real(m_diag[i]);
else
yi = l_ki = yi / Lx[Lp[i]];
+3 -2
View File
@@ -28,7 +28,7 @@ class AmbiVector
typedef typename NumTraits<Scalar>::Real RealScalar;
explicit AmbiVector(Index size)
: m_buffer(0), m_zero(0), m_size(0), m_allocatedSize(0), m_allocatedElements(0), m_mode(-1)
: m_buffer(0), m_zero(0), m_size(0), m_end(0), m_allocatedSize(0), m_allocatedElements(0), m_mode(-1)
{
resize(size);
}
@@ -147,7 +147,8 @@ template<typename _Scalar,typename _StorageIndex>
void AmbiVector<_Scalar,_StorageIndex>::init(int mode)
{
m_mode = mode;
if (m_mode==IsSparse)
// This is only necessary in sparse mode, but we set these unconditionally to avoid some maybe-uninitialized warnings
// if (m_mode==IsSparse)
{
m_llSize = 0;
m_llStart = -1;
@@ -49,6 +49,7 @@ template<typename UnaryOp, typename ArgType>
class unary_evaluator<CwiseUnaryOp<UnaryOp,ArgType>, IteratorBased>::InnerIterator
: public unary_evaluator<CwiseUnaryOp<UnaryOp,ArgType>, IteratorBased>::EvalIterator
{
protected:
typedef typename XprType::Scalar Scalar;
typedef typename unary_evaluator<CwiseUnaryOp<UnaryOp,ArgType>, IteratorBased>::EvalIterator Base;
public:
@@ -99,6 +100,7 @@ template<typename ViewOp, typename ArgType>
class unary_evaluator<CwiseUnaryView<ViewOp,ArgType>, IteratorBased>::InnerIterator
: public unary_evaluator<CwiseUnaryView<ViewOp,ArgType>, IteratorBased>::EvalIterator
{
protected:
typedef typename XprType::Scalar Scalar;
typedef typename unary_evaluator<CwiseUnaryView<ViewOp,ArgType>, IteratorBased>::EvalIterator Base;
public:
+2 -1
View File
@@ -327,7 +327,8 @@ class SparseMatrix
m_outerIndex[j] = newOuterIndex[j];
m_innerNonZeros[j] = innerNNZ;
}
m_outerIndex[m_outerSize] = m_outerIndex[m_outerSize-1] + m_innerNonZeros[m_outerSize-1] + reserveSizes[m_outerSize-1];
if(m_outerSize>0)
m_outerIndex[m_outerSize] = m_outerIndex[m_outerSize-1] + m_innerNonZeros[m_outerSize-1] + reserveSizes[m_outerSize-1];
m_data.resize(m_outerIndex[m_outerSize]);
}
+2 -2
View File
@@ -453,7 +453,7 @@ void permute_symm_to_fullsymm(const MatrixType& mat, SparseMatrix<typename Matri
Index r = it.row();
Index c = it.col();
Index ip = perm ? perm[i] : i;
if(Mode==(Upper|Lower))
if(Mode==int(Upper|Lower))
count[StorageOrderMatch ? jp : ip]++;
else if(r==c)
count[ip]++;
@@ -486,7 +486,7 @@ void permute_symm_to_fullsymm(const MatrixType& mat, SparseMatrix<typename Matri
StorageIndex jp = perm ? perm[j] : j;
StorageIndex ip = perm ? perm[i] : i;
if(Mode==(Upper|Lower))
if(Mode==int(Upper|Lower))
{
Index k = count[StorageOrderMatch ? jp : ip]++;
dest.innerIndexPtr()[k] = StorageOrderMatch ? ip : jp;
+1
View File
@@ -90,6 +90,7 @@ struct unary_evaluator<SparseView<ArgType>, IteratorBased>
class InnerIterator : public EvalIterator
{
protected:
typedef typename XprType::Scalar Scalar;
public:
+2 -2
View File
@@ -43,8 +43,8 @@ template <typename MatrixLType, typename MatrixUType> struct SparseLUMatrixURetu
* Simple example with key steps
* \code
* VectorXd x(n), b(n);
* SparseMatrix<double, ColMajor> A;
* SparseLU<SparseMatrix<scalar, ColMajor>, COLAMDOrdering<Index> > solver;
* SparseMatrix<double> A;
* SparseLU<SparseMatrix<double>, COLAMDOrdering<int> > solver;
* // fill A and b;
* // Compute the ordering permutation vector from the structural pattern of A
* solver.analyzePattern(A);
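
Filling out the corrected doc snippet into a runnable sketch (toy diagonal system; assumed 3.3.8 API):

#include <Eigen/Sparse>

int main() {
  using namespace Eigen;
  const int n = 3;
  SparseMatrix<double> A(n, n);
  A.insert(0, 0) = 4; A.insert(1, 1) = 2; A.insert(2, 2) = 1; // trivially invertible
  A.makeCompressed();
  VectorXd b = VectorXd::Ones(n), x(n);
  SparseLU<SparseMatrix<double>, COLAMDOrdering<int> > solver;
  solver.analyzePattern(A); // ordering from the structural pattern of A
  solver.factorize(A);      // numerical factorization
  x = solver.solve(b);
  return solver.info() == Success ? 0 : 1;
}
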
+4 -2
View File
@@ -98,8 +98,10 @@ namespace std {
{ return deque_base::insert(position,x); }
void insert(const_iterator position, size_type new_size, const value_type& x)
{ deque_base::insert(position, new_size, x); }
#elif defined(_GLIBCXX_DEQUE) && EIGEN_GNUC_AT_LEAST(4,2)
#elif defined(_GLIBCXX_DEQUE) && EIGEN_GNUC_AT_LEAST(4,2) && !EIGEN_GNUC_AT_LEAST(10, 1)
// workaround for the GCC std::deque implementation;
// GCC 10.1 no longer lets us access _Deque_impl::_M_impl, so we have to
// fall back to the default case
void resize(size_type new_size, const value_type& x)
{
if (new_size < deque_base::size())
@@ -108,7 +110,7 @@ namespace std {
deque_base::insert(deque_base::end(), new_size - deque_base::size(), x);
}
#else
// either GCC 4.1 or non-GCC
// either non-GCC, or GCC before 4.2 or from 10.1 onwards
// default implementation which should always work.
void resize(size_type new_size, const value_type& x)
{